Add cumulative kudo distribution of fanfics

eb2bfe91 · chrysanthopoulou · 32a5209c · eb2bfe91 · 32a5209c · 32a5209c
Commit eb2bfe91 authored 2 years ago by chrysanthopoulou
--- a/fanfic_preprocessing.py
+++ b/fanfic_preprocessing.py
@@ -6,6 +6,36 @@ import pandas as pd
 import statistics
 import re
 from nltk.probability import FreqDist
+import numpy as np
+# code snippets for prettifying plots
+#colours
+pink = '#d600a7'
+light_green = '#55a480'
+blue_grey = '#5d9c9c'
+purple_grey = '#636273'
+CB91_Blue = '#2CBDFE'
+CB91_Green = '#47DBCD'
+CB91_Pink = '#F3A0F2'
+CB91_Purple = '#9D2EC5'
+CB91_Violet = '#661D98'
+CB91_Amber = '#F5B14C'
+color_list = [pink, light_green, purple_grey, blue_grey, CB91_Green, CB91_Pink, CB91_Blue, CB91_Amber,
+              CB91_Purple, CB91_Violet]
+plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
+# experimenting with some colour palettes
+cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
+cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
+cm2 = sns.cubehelix_palette(as_cmap=True)
+#palette_1 = sns.color_palette("flare")
+#palette_2 = sns.color_palette("mako_r", as_cmap=True)
 #file header: 
 # work_id,title,author,rating,category,fandom,relationship,character,additional tags,language,published,status,status date,words,chapters,comments,kudos,bookmarks,hits,all_kudos,all_bookmarks,body
@@ -14,7 +44,10 @@ from nltk.probability import FreqDist
 grisha_fanfics = pd.read_csv("grishaverse/data/fanfics/grishaverse_fics.csv")
 tog_fanfics = pd.read_csv("throne_of_glass/data/fanfics/throne_of_glass_fics.csv")
 """
+# plot distribution of kudos for Grishaverse Fanfics
 grisha_kudos = grisha_fanfics["kudos"].values.tolist()
 grisha_kudos_freq_dist = FreqDist(grisha_kudos)
@@ -22,6 +55,18 @@ grisha_kudos_freq_dist = FreqDist(grisha_kudos)
 dist_panda = pd.Series(dict(grisha_kudos_freq_dist))
 #print(dist_panda)
+# sort, normalise and round the pandas Series
+new_dist = dist_panda.sort_index()
+for i in range(0, len(new_dist.index)):
+#for index in new_token_len_dist.index:
+    new_dist.iat[i] = round(new_dist.iat[i]/len(grisha_kudos), 3) # relative frequency: count for this kudos value divided by the number of entries in grisha_kudos, rounded to 3 decimals
+    #if float(new_token_len_dist.iat[i]) == 0.00:
+        #   new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
+#calculate cumulative distribution
+cum_dist = np.cumsum(new_dist.values)
 # plot using matplotlib and seaborn 
@@ -31,16 +76,58 @@ fig, ax = plt.subplots(figsize=(10,10))
 # call function for bar (value) labels 
 #addlabels(x=new_sent_len_dist.index, y=new_sent_len_dist.values)
-plt.title("Grishaverse Frequency Distribution of All Kudos")
+plt.title("Grishaverse Cumulative Frequency Distribution of All Kudos")
 ax.set_xlabel("Number of Kudos")
 ax.set_ylabel("Percentage of Occurence")
-sns.lineplot(x=dist_panda.index, y=dist_panda.values, ax=ax, palette="flare")
+sns.lineplot(x=new_dist.index, y=cum_dist, ax=ax)
 #plt.xticks(rotation=30) !!! very useful for words
-plt.savefig(f"grishaverse/freq_distribution/fanfic_kudo_freq_dist.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
+fig.savefig(f"grishaverse/freq_distribution/fanfic_kudo_freq_dist.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
 """
+# plot distribution of kudos for Throne of Glass Fanfics
+tog_kudos = tog_fanfics["kudos"].values.tolist()
+tog_kudos_freq_dist = FreqDist(tog_kudos)
+# convert the FreqDist object to a pandas Series for easier processing
+dist_panda = pd.Series(dict(tog_kudos_freq_dist))
+#print(dist_panda)
+# sort, normalise and round the pandas Series
+new_dist = dist_panda.sort_index()
+for i in range(0, len(new_dist.index)):
+#for index in new_token_len_dist.index:
+    new_dist.iat[i] = round(new_dist.iat[i]/len(tog_kudos), 3) # relative frequency: count for this kudos value divided by the number of entries in tog_kudos, rounded to 3 decimals
+    #if float(new_token_len_dist.iat[i]) == 0.00:
+        #   new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
+#calculate cumulative distribution
+cum_dist = np.cumsum(new_dist.values)
+# plot using matplotlib and seaborn 
+# set figure, ax into variables
+fig, ax = plt.subplots(figsize=(10,10))
+# call function for bar (value) labels 
+#addlabels(x=new_sent_len_dist.index, y=new_sent_len_dist.values)
+plt.title("Throne of Glass Cumulative Frequency Distribution of Kudos")
+ax.set_xlabel("Number of Kudos")
+ax.set_ylabel("Percentage of Occurence")
+sns.lineplot(x=new_dist.index, y=cum_dist, ax=ax)
+#plt.xticks(rotation=30) !!! very useful for words
+fig.savefig(f"throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
+"""
 def preprocess_data(df, series):
    good_fics = []
    medium_fics = []
@@ -78,3 +165,4 @@ def preprocess_data(df, series):
 preprocess_data(grisha_fanfics, "grishaverse")
 preprocess_data(tog_fanfics, "throne_of_glass")
+"""
\ No newline at end of file
--- a/grishaverse/filler.md
+++ b/grishaverse/filler.md
-I am a filler, namely a chubby pink dragon subsisting on mint drops to uphold the folder structure. Of course I could be replaced with a git ignore file, but where's the fun in that? Have a mint drop!
\ No newline at end of file
--- a/grishaverse/freq_distribution/fanfic_kudo_freq_dist.png
+++ b/grishaverse/freq_distribution/fanfic_kudo_freq_dist.png
--- a/throne_of_glass/filler_13.md
+++ b/throne_of_glass/filler_13.md
-hello, I am a sentient cream puff instead of a professional git ignore file. I exist. (I like prime numbers)
\ No newline at end of file
--- a/throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png
+++ b/throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png