diff --git a/fanfic_preprocessing.py b/fanfic_preprocessing.py index 5dc2424c0a69a51a5d9ed1a6b236cc5957619109..48a311da8daabdbb52d2ae9f17b59a106f9a2ad2 100644 --- a/fanfic_preprocessing.py +++ b/fanfic_preprocessing.py @@ -6,6 +6,36 @@ import pandas as pd import statistics import re from nltk.probability import FreqDist +import numpy as np + +# code snippets for prettifying plots + +#colours +pink = '#d600a7' +light_green = '#55a480' +blue_grey = '#5d9c9c' +purple_grey = '#636273' +CB91_Blue = '#2CBDFE' +CB91_Green = '#47DBCD' +CB91_Pink = '#F3A0F2' +CB91_Purple = '#9D2EC5' +CB91_Violet = '#661D98' +CB91_Amber = '#F5B14C' + +color_list = [pink, light_green, purple_grey, blue_grey, CB91_Green, CB91_Pink, CB91_Blue, CB91_Amber, + CB91_Purple, CB91_Violet] +plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list) + +#some colour palette playing around + +cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True) +cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True) +cm2 = sns.cubehelix_palette(as_cmap=True) + + + +#palette_1 = sns.color_palette("flare") +#palette_2 = sns.color_palette("mako_r", as_cmap=True) #file header: # work_id,title,author,rating,category,fandom,relationship,character,additional tags,language,published,status,status date,words,chapters,comments,kudos,bookmarks,hits,all_kudos,all_bookmarks,body @@ -14,7 +44,10 @@ from nltk.probability import FreqDist grisha_fanfics = pd.read_csv("grishaverse/data/fanfics/grishaverse_fics.csv") tog_fanfics = pd.read_csv("throne_of_glass/data/fanfics/throne_of_glass_fics.csv") + """ +# plot distribution of kudos for Grishaverse Fanfics + grisha_kudos = grisha_fanfics["kudos"].values.tolist() grisha_kudos_freq_dist = FreqDist(grisha_kudos) @@ -22,6 +55,18 @@ grisha_kudos_freq_dist = FreqDist(grisha_kudos) dist_panda = pd.Series(dict(grisha_kudos_freq_dist)) #print(dist_panda) +# sort, normalise and round the panda series + +new_dist = dist_panda.sort_index() + +for i in range(0, len(new_dist.index)): +#for index in new_token_len_dist.index: + new_dist.iat[i] = round(new_dist.iat[i]/len(grisha_kudos), 3) #index-1 bc the index starts counting from zero, the word lengths not + #if float(new_token_len_dist.iat[i]) == 0.00: + # new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry + +#calculate cumulative distribution +cum_dist = np.cumsum(new_dist.values) # plot using matplotlib and seaborn @@ -31,16 +76,58 @@ fig, ax = plt.subplots(figsize=(10,10)) # call function for bar (value) labels #addlabels(x=new_sent_len_dist.index, y=new_sent_len_dist.values) -plt.title("Grishaverse Frequency Distribution of All Kudos") +plt.title("Grishaverse Cumulative Frequency Distribution of All Kudos") ax.set_xlabel("Number of Kudos") ax.set_ylabel("Percentage of Occurence") -sns.lineplot(x=dist_panda.index, y=dist_panda.values, ax=ax, palette="flare") +sns.lineplot(x=new_dist.index, y=cum_dist, ax=ax) #plt.xticks(rotation=30) !!! very useful for words -plt.savefig(f"grishaverse/freq_distribution/fanfic_kudo_freq_dist.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png" +fig.savefig(f"grishaverse/freq_distribution/fanfic_kudo_freq_dist.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png" + """ +# plot distribution of kudos for Throne of Glass Fanfics + +tog_kudos = tog_fanfics["kudos"].values.tolist() + +tog_kudos_freq_dist = FreqDist(tog_kudos) +# convert to FreqDist object to a pandas series for easier processing +dist_panda = pd.Series(dict(tog_kudos_freq_dist)) +#print(dist_panda) + +# sort, normalise and round the panda series + +new_dist = dist_panda.sort_index() +for i in range(0, len(new_dist.index)): +#for index in new_token_len_dist.index: + new_dist.iat[i] = round(new_dist.iat[i]/len(tog_kudos), 3) #index-1 bc the index starts counting from zero, the word lengths not + #if float(new_token_len_dist.iat[i]) == 0.00: + # new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry + +#calculate cumulative distribution +cum_dist = np.cumsum(new_dist.values) + +# plot using matplotlib and seaborn + +# set figure, ax into variables +fig, ax = plt.subplots(figsize=(10,10)) + +# call function for bar (value) labels +#addlabels(x=new_sent_len_dist.index, y=new_sent_len_dist.values) + +plt.title("Throne of Glass Cumulative Frequency Distribution of Kudos") +ax.set_xlabel("Number of Kudos") +ax.set_ylabel("Percentage of Occurence") + + +sns.lineplot(x=new_dist.index, y=cum_dist, ax=ax) +#plt.xticks(rotation=30) !!! very useful for words +fig.savefig(f"throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png" + + + +""" def preprocess_data(df, series): good_fics = [] medium_fics = [] @@ -78,3 +165,4 @@ def preprocess_data(df, series): preprocess_data(grisha_fanfics, "grishaverse") preprocess_data(tog_fanfics, "throne_of_glass") +""" \ No newline at end of file diff --git a/grishaverse/filler.md b/grishaverse/filler.md deleted file mode 100644 index 8da5a0ff488c5d479a93d33ef0a2dd9cfd686b42..0000000000000000000000000000000000000000 --- a/grishaverse/filler.md +++ /dev/null @@ -1 +0,0 @@ -I am a filler, namely a chubby pink dragon subsisting on mint drops to uphold the folder structure. Of course I could be replaced with a git ignore file, but where's the fun in that? Have a mint drop! \ No newline at end of file diff --git a/grishaverse/freq_distribution/fanfic_kudo_freq_dist.png b/grishaverse/freq_distribution/fanfic_kudo_freq_dist.png index c00f6b85cc71e0b171a57c1b9147ba9370432505..6af7e5e445e3b79b93be2d05cdbd138942633747 100644 Binary files a/grishaverse/freq_distribution/fanfic_kudo_freq_dist.png and b/grishaverse/freq_distribution/fanfic_kudo_freq_dist.png differ diff --git a/throne_of_glass/filler_13.md b/throne_of_glass/filler_13.md deleted file mode 100644 index 43f3a78645f383be41a3aeeeedf3f0f635c3da63..0000000000000000000000000000000000000000 --- a/throne_of_glass/filler_13.md +++ /dev/null @@ -1 +0,0 @@ -hello, I am a sentient cream puff instead of a professional git ignore file. I exist. (I like prime numbers) \ No newline at end of file diff --git a/throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png b/throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png new file mode 100644 index 0000000000000000000000000000000000000000..55f0c59b642d58388ef446ce42dbfd36c88bf07f Binary files /dev/null and b/throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png differ