# fanfic_preprocessing.py
import seaborn as sns
import matplotlib.pyplot as plt
from cycler import cycler
import os
import pandas as pd
import statistics
import re
from nltk.probability import FreqDist
import numpy as np
# Plot styling: named hex colours plus the default colour cycle for all plots.
pink = "#d600a7"
light_green = "#55a480"
blue_grey = "#5d9c9c"
purple_grey = "#636273"  # defined here but not part of color_list
CB91_Blue = "#2CBDFE"
CB91_Green = "#47DBCD"
CB91_Pink = "#F3A0F2"
CB91_Purple = "#9D2EC5"
CB91_Violet = "#661D98"
CB91_Amber = "#F5B14C"

# Order matters: matplotlib hands these out one after another per plotted series.
color_list = [
    blue_grey,
    CB91_Amber,
    pink,
    light_green,
    CB91_Green,
    CB91_Pink,
    CB91_Blue,
    CB91_Purple,
    CB91_Violet,
]
plt.rcParams["axes.prop_cycle"] = cycler(color=color_list)

# Cubehelix colormaps kept around from experimenting with palettes.
cm = sns.cubehelix_palette(start=0.5, rot=-0.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=0.5, rot=-0.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)
# Other palettes that were tried:
# palette_1 = sns.color_palette("flare")
# palette_2 = sns.color_palette("mako_r", as_cmap=True)
# actual preprocessing code
#file header:
# work_id,title,author,rating,category,fandom,relationship,character,additional tags,language,published,status,status date,words,chapters,comments,kudos,bookmarks,hits,all_kudos,all_bookmarks,body
# 27852922,Dealing with Our Demons,['ravenyenn19'],Mature,F/M,"Six of Crows Series",Kaz Brekker/Inej Ghafa,"Kaz B","Romance,Kanej - Freeform, Eventual Smut",English,2020-12-03,Updated,2023-03-16,747673,162/?,8573,12204,1373,709212,"['ud4m', 'book_addict_1228', 'ephemeraldelights', 'bluedelilah25', 'sunshinecorsets', 'I_do_not_like_purple_glasses', 'beep_boop_00', 'schleswigholstein', 'moonandstars75', 'ewerythingoes', 'mindfighters', 'rosibunnis', 'Lizie06', 'ghostlatte', 'aguswolman', 'QueenofEnglan', 'JenBoyette04', 'gnitneb_reads', 'gloomysunshine', 'v1ofvs', 'BazzaKrekker', 'BookGeek', 'poppyflower19', 'Cassanibal', 'vanilla_chai_tea', 'Honorthyword', 'mariaarmengol', 'luc1inda', 'zarawrites', 'monmough', 'Guilty__Pleasures', 'Ilyann', 'folieadeux_0_0', 'dragonguard', 'Emeliemarx', 'angrydabee', 'slythxrclaw', 'samaram0215', 'letsgetthisbread69', 'Mintmew', 'biblichour', 'Katloupet', 'Miss_ginger', 'inejsquake', 'Arabella_7833', 'flossy_flo99', 'a_k123', 'hushedwanderer', 'siriuslymichele', 'AnnaAvinaVTDX']",[],"Dear Kaz,
# Both corpora are loaded as soon as this module is imported or executed.
# NOTE(review): the __main__ section passes tog_fanfics as an argument, so
# these names have to exist before it runs.
grisha_fanfics, tog_fanfics = (
    pd.read_csv(csv_path)
    for csv_path in (
        "grishaverse/data/fanfics/grishaverse_fics.csv",
        "throne_of_glass/data/fanfics/throne_of_glass_fics.csv",
    )
)
def read_csv_to_pd(file_path, name_of_file=None) -> pd.DataFrame:
    """Load a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file; handed unchanged to
            ``pd.read_csv``.
        name_of_file: Ignored. The value never influenced the result (it was
            overwritten before use), so it is now optional; it stays in the
            signature so existing keyword calls keep working.

    Returns:
        The DataFrame parsed from ``file_path``.
    """
    return pd.read_csv(file_path)
def calculate_cum_kudo_distribution(fanfic_pd):
    """Compute the relative and cumulative frequency of kudos counts.

    Args:
        fanfic_pd: DataFrame with a ``"kudos"`` column. Rows whose kudos value
            is missing are left out of the distribution.

    Returns:
        A tuple ``(new_dist, cum_dist)``:

        * ``new_dist`` -- float Series indexed by the distinct kudos values in
          ascending order; each entry is that value's share of all counted
          rows, rounded to three decimals.
        * ``cum_dist`` -- NumPy array with the running total of the shares, in
          the same order as ``new_dist``. It ends at 1.0 for non-empty input.
    """
    kudos = fanfic_pd["kudos"].dropna()

    # np.unique returns the distinct values already sorted, with their counts.
    distinct_kudos, occurrences = np.unique(kudos.to_numpy(), return_counts=True)

    # Work in floats from the start instead of overwriting integer counts
    # element by element, which depends on pandas' dtype-upcasting behaviour.
    shares = occurrences / occurrences.sum()

    # Accumulate the exact shares: rounding first turns every rare value
    # (share < 0.0005) into 0 and the curve would stop short of 1.
    cum_dist = np.cumsum(shares)

    new_dist = pd.Series(np.round(shares, 3), index=distinct_kudos)
    return new_dist, cum_dist
def plot_distribution(
    new_dist,
    cum_dist,
    plt_title,
    file_path_for_pic: str,
    x_label="Number of Kudos",
    y_label="Percentage of Occurence",
    scatter_plt=False,
    max_ticks=10,
):
    """Plot a cumulative distribution and save the figure to disk.

    Args:
        new_dist: Series whose index supplies the x values.
        cum_dist: Sequence of y values, aligned with ``new_dist.index``.
        plt_title: Title shown above the plot.
        file_path_for_pic: Destination path of the saved image.
        x_label: Label of the x axis.
        y_label: Label of the y axis.
        scatter_plt: Draw a scatter plot when truthy, a line plot otherwise.
        max_ticks: Accepted but not used by this function.
    """
    figure, axes = plt.subplots(figsize=(10, 10))
    axes.set_title(plt_title)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)

    # Same data either way; only the seaborn drawing function differs.
    draw = sns.scatterplot if scatter_plt else sns.lineplot
    draw(x=new_dist.index, y=cum_dist, ax=axes)

    figure.savefig(file_path_for_pic)
    # Close the figure so repeated calls do not pile up open figures.
    plt.close(figure)
def separate_fanfics_by_good_medium_bad(
    df,
    series,
    *,
    few_kudos=100,
    medium_kudos=1500,
    excluded_year=2023,
):
    """Split fanfic bodies into three kudos tiers and write one file per tier.

    Each row of ``df`` is sorted by its kudos count:

    * ``kudos <= few_kudos``     -> ``bad_fics.txt``
    * ``kudos <= medium_kudos``  -> ``medium_fics.txt``
    * anything above             -> ``good_fics.txt``

    Rows published in ``excluded_year`` are skipped. Rows whose kudos value is
    missing or cannot be read as a number are skipped too, and a message is
    printed for each of them.

    Args:
        df: DataFrame with ``"published"``, ``"kudos"`` and ``"body"`` columns.
        series: Directory prefix of the series; the files are written to
            ``{series}/data/split_txt_fanfics/``.
        few_kudos: Upper bound (inclusive) of the "bad" tier.
        medium_kudos: Upper bound (inclusive) of the "medium" tier.
        excluded_year: Publication year whose rows are left out.

    Returns:
        None. The result is the three text files; the bodies of one tier are
        concatenated without a separator.
    """
    tiers = {"good": [], "medium": [], "bad": []}

    for index, row in df.iterrows():
        if pd.to_datetime(row["published"]).year == excluded_year:
            continue

        # Coerce before the missing-check so unparseable values are reported
        # rather than silently failing every comparison below.
        kudos = pd.to_numeric(row["kudos"], errors="coerce")
        if pd.isna(kudos):
            print(f"Missing kudos value for row {index}")
            continue

        if kudos <= few_kudos:
            tier = "bad"
        elif kudos <= medium_kudos:
            tier = "medium"
        else:
            tier = "good"
        tiers[tier].append(row["body"])

    out_dir = f"{series}/data/split_txt_fanfics"
    # Create the target directory up front; otherwise a missing directory is
    # only discovered after the whole frame has been processed.
    os.makedirs(out_dir, exist_ok=True)

    for tier in ("good", "bad", "medium"):
        # Explicit UTF-8: the platform default encoding cannot represent every
        # character on all systems.
        with open(f"{out_dir}/{tier}_fics.txt", "w", encoding="utf-8") as f:
            # str() because a body may be a non-string value such as NaN.
            f.write("".join(map(str, tiers[tier])))
def main():
    """Plot the cumulative kudos distribution for the Throne of Glass fanfics."""
    # grishaverse (currently disabled)
    # grisha_fanfics = read_csv_to_pd(file_path="grishaverse/data/fanfics/grishaverse_fics.csv", name_of_file=grisha_fanfics)
    # new_dist, cum_dist = calculate_cum_kudo_distribution(grisha_fanfics)
    # plot_distribution(new_dist=new_dist, cum_dist=cum_dist, plt_title="Grishaverse Cumulative Frequency Distribution of All Kudos", file_path_for_pic="grishaverse/freq_distribution/fanfic_kudo_freq_dist.png", scatter_plt=True)

    # throne of glass
    fanfics = read_csv_to_pd(
        file_path="throne_of_glass/data/fanfics/throne_of_glass_fics.csv",
        name_of_file=tog_fanfics,
    )
    distribution, cumulative = calculate_cum_kudo_distribution(fanfics)
    plot_distribution(
        new_dist=distribution,
        cum_dist=cumulative,
        plt_title="Throne of Glass Cumulative Frequency Distribution of All Kudos",
        file_path_for_pic="throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png",
        scatter_plt=True,
    )

    # Splitting into kudos tiers is switched off for now:
    # separate_fanfics_by_good_medium_bad(grisha_fanfics, "grishaverse")
    # separate_fanfics_by_good_medium_bad(tog_fanfics, "throne_of_glass")


if __name__ == "__main__":
    main()