Commit ba1c60ba authored by chrysanthopoulou

Add plots for the fanfictions

parent eb2bfe91
Showing with 325 additions and 3 deletions
data_overview/data_overview.csv

old version:
,mean_tokens,std_dev,type_token_ratio,mean_sent
throne_of_glass_canon,4.20580153308561,9.820105672393566,0.4612289416846652,14.468550677890269
grishaverse_canon,4.1116821403167725,9.42692548599567,0.4679412861136999,14.026379022147932

new version:
,mean_tokens,std_dev_tokens,type_token_ratio,mean_sent,std_dev_sent
throne_of_glass_canon,4.20580153308561,2.0348877670869365,0.4612289416846652,14.468550677890269,9.820105672393566
grishaverse_canon,4.1116821403167725,2.1047643402022285,0.4679412861136999,14.026379022147932,9.42692548599567
grishaverse_good_fics,4.128605681546294,2.12767094657917,0.44176648168701443,12.920361563144626,10.031898461069263
grishaverse_bad_fics,4.192839204109023,2.1961898296996827,0.4488349209373214,13.098263374311202,10.83490565859641
grishaverse_medium_fics,4.125989775260719,2.1266952539859654,0.4420552018160678,13.1788589173054,10.270865275375563
throne_of_glass_good_fics,4.197038090427363,2.0907564170382065,0.4495104669887279,13.376067824328105,9.013067041149515
throne_of_glass_bad_fics,4.123089252572971,2.075327500013793,0.43527116374871266,12.966996479535549,9.797982354809053
throne_of_glass_medium_fics,4.123495735120379,2.072193436253281,0.4337096917417227,12.511614522473558,8.912865289012412
data_overview/data_overview.png

107 KiB

data_overview/delta_scores_grouped_fanfics.png

46.6 KiB

data_overview/z_scores_all_data.png

105 KiB

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from cycler import cycler
import json
import dataframe_image as dfi
#make the plots a bit less ugly
CB91_Blue = '#2CBDFE'
CB91_Green = '#47DBCD'
CB91_Pink = '#F3A0F2'
CB91_Purple = '#9D2EC5'
CB91_Violet = '#661D98'
CB91_Amber = '#F5B14C'
color_list = [CB91_Pink, CB91_Blue, CB91_Green, CB91_Amber,
CB91_Purple, CB91_Violet]
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
#some colour palette playing around
cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)
#read data
data_overview = pd.DataFrame(pd.read_csv("data_overview/data_overview.csv"))
# pairplot initial features -- kinda useless in our case, but hey
"""
data_pairplot = sns.pairplot(ceramics_motives)
data_pairplot.savefig(r"project\pictures_general\data_pairplot.png")
"""
data_overview_styled = data_overview.style.background_gradient(cmap=cm)
dfi.export(data_overview_styled, "data_overview/data_overview.png", table_conversion = "matplotlib")
import matplotlib.pyplot as plt
import os
from nltk.probability import FreqDist
import pandas as pd
import statistics
import re
import dataframe_image as dfi
data_overview = pd.DataFrame(pd.read_csv("data_overview/data_overview.csv", index_col=0))
"""
data_overview = pd.DataFrame(
{"mean_tokens":mean_tokens,
"std_dev_tokens":std_dev_tokens,
"type_token_ratio":type_token_ratio,
"mean_sent":mean_sent,
"std_dev_tokens":std_dev_tokens},
index = index
)
data_overview.to_csv(f"data_overview/data_overview.csv")
"""
z_score_provider = data_overview.drop(["grishaverse_bad_fics", "grishaverse_good_fics", "grishaverse_medium_fics"], axis=0)
mean_std_dev_list = [[columnName, columnData.mean(), columnData.std()] for columnName, columnData in z_score_provider.items()]
# Create a new DataFrame with the same column names and index labels as data_overview
z_scores_all_data = pd.DataFrame(columns=data_overview.columns, index=data_overview.index)
# Iterate over each cell in the data_overview DataFrame and write the corresponding z-score into the z_scores_all_data DataFrame
for index, row in data_overview.iterrows():
    for column in data_overview.columns:
        mean, std_dev = [elem[1:] for elem in mean_std_dev_list if elem[0] == column][0]
        cell_value = data_overview.loc[index, column]
        z_score = (cell_value - mean) / std_dev
        z_scores_all_data.loc[index, column] = z_score
dfi.export(z_scores_all_data, "data_overview/z_scores_all_data.png", table_conversion="matplotlib")
print(z_scores_all_data)
delta_scores_grouped_fanfics = pd.DataFrame(columns=["throne_of_glass_canon", "grishaverse_canon", "throne_of_glass_bad_fics", "throne_of_glass_good_fics", "throne_of_glass_medium_fics"], index=["grishaverse_bad_fics", "grishaverse_good_fics", "grishaverse_medium_fics"])
for fic in ["grishaverse_bad_fics", "grishaverse_good_fics", "grishaverse_medium_fics"]:
    delta_scores = []
    for index, row in z_scores_all_data.iterrows():
        if index not in ["grishaverse_bad_fics", "grishaverse_good_fics", "grishaverse_medium_fics"]:
            # mean absolute z-score difference across all features
            delta_score = 0
            for column in z_scores_all_data.columns:
                delta_score += abs(row[column] - z_scores_all_data.loc[fic, column])
            delta_score /= len(z_scores_all_data.columns)
            delta_scores.append(delta_score)
            delta_scores_grouped_fanfics.loc[fic, index] = delta_score
    print(delta_scores)
dfi.export(delta_scores_grouped_fanfics, "data_overview/delta_scores_grouped_fanfics.png", table_conversion="matplotlib")
print(delta_scores_grouped_fanfics)
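# Optional cross-check sketch: each cell of delta_scores_grouped_fanfics is simply the mean
# absolute z-score difference between a grishaverse fanfic group and one of the comparison
# corpora, so the loop above can also be verified in a vectorised way (assuming the same
# z_scores_all_data frame as built above).
z_float = z_scores_all_data.astype(float)
for fic in ["grishaverse_bad_fics", "grishaverse_good_fics", "grishaverse_medium_fics"]:
    for other in delta_scores_grouped_fanfics.columns:
        vectorised_delta = (z_float.loc[fic] - z_float.loc[other]).abs().mean()
        assert abs(vectorised_delta - delta_scores_grouped_fanfics.loc[fic, other]) < 1e-9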
import os
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import pandas as pd
import statistics
import re
# you'll have to also download "punkt" from nltk
# by subdividing the text into segments of 1000 tokens, this calculates the type-token ratio for each segment and then averages over them
# this ensures comparability of the type-token ratios for texts of varying sizes
def standardised_type_token_ratio(tokens):
    ttrs = []
    segment_tokens = []
    segment = 0
    for token in tokens:
        if segment < 1000:
            segment_tokens.append(token)
            segment += 1
        elif segment == 1000:
            types = set(segment_tokens)
            ttr = len(types) / len(segment_tokens)
            ttrs.append(ttr)
            # start the next segment with the current token so it isn't dropped
            segment_tokens = [token]
            segment = 1
    if len(ttrs) <= 1:
        types = set(tokens)
        std_ttr = len(types) / len(tokens)
        print("Warning: Text was too short for segmentation!")
    else:
        std_ttr = statistics.mean(ttrs)
    return std_ttr
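# Usage sketch (hypothetical text file, not from the corpus): the function only needs a flat
# list of tokens, so e.g.
#   standardised_type_token_ratio(word_tokenize(open("some_text.txt").read()))
# returns the type-token ratio averaged over consecutive 1000-token segments.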
def tokenize_and_clean_text(text):
    tokens = word_tokenize(text)
    cleaned_tokens = [token for token in tokens if any(c.isalpha() for c in token)]
    # When looking at the results, there were some strange token lengths, because somewhere in the
    # data conversion hyphens had been added in the wrong places. Tokens with very large lengths were
    # printed and had formats like "everywhere—assassin" (counted as 19 characters) or even
    # "walking-as-fast-as-they-could-without-running" (45 characters).
    short_clean_tokens = []
    for token in cleaned_tokens:
        dehyphenated_token = []
        letter_present = 0
        dehyphenated = 0
        second_word_in_compound = 0
        for c in token:
            if c.isalpha():
                dehyphenated_token.append(c)
                letter_present = 1
                if dehyphenated == 1:
                    second_word_in_compound = 1
            elif not c.isalpha() and letter_present == 1:
                # Split on both dashes and hyphens: counting "red-blue" as a single 9-character token
                # would boost the share of long tokens significantly. All texts are preprocessed the
                # same way, so relatively speaking this shouldn't make a difference.
                dehyphenated_token_joined = ''.join(dehyphenated_token)
                #print(dehyphenated_token_joined)
                short_clean_tokens.append(dehyphenated_token_joined)
                dehyphenated_token = []
                letter_present = 0
                dehyphenated = 1
                second_word_in_compound = 0
        if letter_present == 1 and dehyphenated == 0:
            # token contained no special characters; append it unchanged (and only once)
            short_clean_tokens.append(token)
        elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
            # append the final part of a dehyphenated compound
            short_clean_tokens.append(''.join(dehyphenated_token))
    return short_clean_tokens
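# Behaviour sketch for the examples mentioned in the comments above: a token like
# "everywhere—assassin" comes out of the dehyphenation loop as two tokens,
# ["everywhere", "assassin"], and "walking-as-fast-as-they-could-without-running"
# is split into its individual alphabetic parts.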
# This function takes a corpus (string) as its input and computes the statistics underlying a
# Mendenhall curve, i.e. the frequency distribution of token lengths: it returns the standard
# deviation and mean of the token lengths together with the standardised type-token ratio.
def mendenhall_curve(corpus):
    short_clean_tokens = tokenize_and_clean_text(corpus)
    # create the distribution of token lengths / Mendenhall curve
    token_lengths = [len(token) for token in short_clean_tokens]
    # calculate the standard deviation, mean and type-token ratio
    standard_deviation = statistics.stdev(token_lengths)
    mean = statistics.mean(token_lengths)
    type_token_ratio = standardised_type_token_ratio(short_clean_tokens)
    return standard_deviation, mean, type_token_ratio
def sentence_metrics(corpus):
    sents = sent_tokenize(corpus)
    sent_lens = []
    for sent in sents:
        short_clean_tokens = tokenize_and_clean_text(sent)
        sent_lens.append(len(short_clean_tokens))
    # calculate the standard deviation and mean of the sentence lengths
    standard_deviation_sent = statistics.stdev(sent_lens)
    mean_sent = statistics.mean(sent_lens)
    return standard_deviation_sent, mean_sent
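# Usage sketch (hypothetical file path): both helpers take a plain string, e.g.
#   text = open("grishaverse/data/some_fic.txt", encoding="utf-8").read()
#   std_dev_tk, mean_tk, ttr = mendenhall_curve(text)
#   std_dev_st, mean_st = sentence_metrics(text)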
def run_functions(directory_path):
    good_mean_tks = []
    bad_mean_tks = []
    medium_mean_tks = []
    #idx = []
    good_std_dev_tks = []
    bad_std_dev_tks = []
    medium_std_dev_tks = []
    good_ttrs = []
    bad_ttrs = []
    medium_ttrs = []
    good_mean_sts = []
    bad_mean_sts = []
    medium_mean_sts = []
    good_std_dev_sts = []
    bad_std_dev_sts = []
    medium_std_dev_sts = []
    few_kudos = 100
    medium_kudos = 1500
    for index, body in grisha_fanfics["body"].items():
        published = pd.to_datetime(grisha_fanfics["published"][index])
        if published.year != 2023:
            if not pd.isna(grisha_fanfics["kudos"][index]):
                kudos = pd.to_numeric(grisha_fanfics["kudos"][index], errors="coerce")
                if kudos <= few_kudos:
                    std_dev_tk, mean_tk, ttr = mendenhall_curve(body)
                    std_dev_st, mean_st = sentence_metrics(body)
                    bad_mean_tks.append(mean_tk)
                    bad_std_dev_tks.append(std_dev_tk)
                    bad_ttrs.append(ttr)
                    bad_mean_sts.append(mean_st)
                    bad_std_dev_sts.append(std_dev_st)
                elif kudos <= medium_kudos:
                    std_dev_tk, mean_tk, ttr = mendenhall_curve(body)
                    std_dev_st, mean_st = sentence_metrics(body)
                    medium_mean_tks.append(mean_tk)
                    medium_std_dev_tks.append(std_dev_tk)
                    medium_ttrs.append(ttr)
                    medium_mean_sts.append(mean_st)
                    medium_std_dev_sts.append(std_dev_st)
                elif kudos > medium_kudos:
                    std_dev_tk, mean_tk, ttr = mendenhall_curve(body)
                    std_dev_st, mean_st = sentence_metrics(body)
                    good_mean_tks.append(mean_tk)
                    good_std_dev_tks.append(std_dev_tk)
                    good_ttrs.append(ttr)
                    good_mean_sts.append(mean_st)
                    good_std_dev_sts.append(std_dev_st)
            else:
                print(f"Missing kudos value for row {index}")
    # unfinished scaffolding for collecting the per-group results, kept commented out:
    #lists = []
    #for lst in lists:
    #    idx.append(grisha_fanfics["work_id"][index])
grisha_fanfics = pd.read_csv("grishaverse/data/fanfics/grishaverse_fics.csv")
#grishaverse/data/split_txt_fanfics
# create lists for each of the columns of the dataframe we'll create
mean_tokens = []
std_dev_tokens = []
type_token_ratio = []
mean_sent = []
std_dev_sent = []
index = []
# create a dataframe to store all the overview statistics in
# columns: mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
# mean_sent; std_dev_sent; freq_sent_len ...
# tag_frequencies
# tag_ngram_frequencies
# punctuation frequencies
# token/type ratio
data_overview = pd.DataFrame(
    {"mean_tokens": mean_tokens,
     "std_dev_tokens": std_dev_tokens,
     "type_token_ratio": type_token_ratio,
     "mean_sent": mean_sent,
     "std_dev_sent": std_dev_sent},
    index=index
)
data_overview.to_csv("data_overview/data_overview.csv")
grishaverse/freq_distribution/bad_fics_pos_tag_frequencies.png

55.2 KiB

grishaverse/freq_distribution/bad_fics_punctuation_frequencies.png

53.2 KiB

grishaverse/freq_distribution/bad_fics_sent_len_long.png

33.4 KiB

grishaverse/freq_distribution/bad_fics_sent_len_short.png

34.9 KiB

grishaverse/freq_distribution/bad_fics_token_len.png

38.8 KiB

grishaverse/freq_distribution/canon_pos_tag_frequencies.png

55 KiB → 55 KiB

grishaverse/freq_distribution/canon_punctuation_frequencies.png

49.8 KiB → 50.1 KiB

grishaverse/freq_distribution/canon_sent_len_short.png

34.8 KiB → 34.6 KiB
grishaverse/freq_distribution/good_fics_pos_tag_frequencies.png

57 KiB

grishaverse/freq_distribution/good_fics_punctuation_frequencies.png

54.2 KiB

grishaverse/freq_distribution/good_fics_sent_len_long.png

36.2 KiB

grishaverse/freq_distribution/good_fics_sent_len_short.png

34.8 KiB

grishaverse/freq_distribution/good_fics_token_len.png

36.9 KiB
