Commit 3488607f authored by chrysanthopoulou
parents a35c0eda a8cfdec0
# fanfic and stylometry :chipmunk:
:lizard: :blowfish: :chipmunk: :unicorn: :deer:
:books: :notebook: :ledger: :page_with_curl: :scroll: :page_facing_up: :newspaper: :rolled_up_newspaper: :old_key: :key: :dagger: :crossed_swords: :bow_and_arrow:
![](/general_pictures/cute_otter.jpg)
1. [Project Description :black_nib:](#project-description-✒️)
2. [Data :card_index_dividers:](#data-🗂️)
3. [Roadmap :telescope: :world_map:](#roadmap-🔭-🗺️)
4. [Support :dragon: & Author](#support-🐉--author)
## Project Description :black_nib:
This project aims to perform a stylometric analysis of a web-scraped corpus of fanfiction and to compare it to the original works it is based on.
## Data :card_index_dividers:
- [Grishaverse Series by Leigh Bardugo]()
## Roadmap :telescope: :world_map:
- [x] Upload Datasets
- [x] Reformat SCI
- [x] Reformat MNLI
- [x] Reformat Veridicality Dataset
- [x] Test Berti :penguin: <br>
imminent to-do stuff :exclamation:
***
- [x] Train Multi :ghost:
- [x] Train Verdi :crocodile:
- [x] Test Multi :ghost:
- [x] Test Verdi :crocodile:
- [x] Figure out Evaluation Metric
- [ ] Apply Evaluation Metric
- [ ] Make Interpretability Datasets
- [ ] Interpretability
## Support :dragon: & Author
Lea Kyveli Chrysanthopoulou: leakyveli.chrysanthopoulou@stud.uni-heidelberg.de<br>
import seaborn as sns
import matplotlib.pyplot as plt
from cycler import cycler
import os
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag
import pandas as pd
import statistics
import re
# create function for bar (value) labels
def addlabels(x,y):
for i in range(len(x)):
plt.text(i, y[i], y[i], ha = "center")
# function compiling all works in a directory into a single string. Input required:
# path of the directory containing the text files as a string,
# for example: "throne_of_glass/data/canon_works/"
def read_works_into_string(directory_path):
strings = []
works = os.listdir(directory_path)
for work in works:
with open(f"{directory_path}"+f"/{work}", "r", errors='ignore') as f: #ignores mostly unicode errors due to problematic encoding of text files
strings.append(f.read())
return "\n".join(strings)
# By subdividing the text into segments of 1,000 tokens, this calculates the type-token ratio for each segment and then averages over them.
# This makes the type-token ratios comparable across texts of varying sizes.
def standardised_type_token_ratio(tokens):
ttrs = []
segment_tokens = []
segment = 0
for token in tokens:
if segment < 1000:
segment_tokens.append(token)
segment += 1
elif segment == 1000:
types = set(segment_tokens)
ttr = len(types)/len(segment_tokens)
ttrs.append(ttr)
segment_tokens = [token] # start the next segment with the current token instead of silently dropping it
segment = 1
if len(ttrs) <= 1:
types = set(tokens)
std_ttr = len(types)/len(tokens)
print("Warning: Text was too short for segmentation!")
else:
std_ttr = statistics.mean(ttrs)
return std_ttr
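# Tiny worked example (hypothetical numbers): for a 2,000-token text whose two 1,000-token segments
# contain 430 and 410 distinct types respectively, the segment TTRs are 0.430 and 0.410,
# so standardised_type_token_ratio returns their mean, 0.420.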
def tokenize_and_clean_text(text):
tokens = word_tokenize(text)
cleaned_tokens = ([token for token in tokens if any(c.isalpha() for c in token)])
short_clean_tokens = [] # when looking at the results, there were some strange token lengths, because somewhere in the data conversion hyphens
# had been added in the wrong places. Printing the tokens with very large lengths showed this format, e.g. "everywhere—assassin",
# which was counted as a single 19-character token, with some reaching 45 characters: "walking-as-fast-as-they-could-without-running"
for token in cleaned_tokens:
dehyphenated_token = []
letter_present = 0
dehyphenated = 0
second_word_in_compound = 0
for c in token:
if c.isalpha() == True:
dehyphenated_token.append(c)
letter_present = 1
if dehyphenated == 1:
second_word_in_compound = 1
elif c.isalpha() == False and letter_present == 1: # here I am eliminating both dashes and hyphens,
# because it skews the word-length metric if "red-blue" is counted as one 9-character token, boosting the count of
# high-character-count tokens significantly. All texts are preprocessed the same way, so it shouldn't make a difference,
# relatively speaking
dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
#print(dehyphenated_token_joined)
short_clean_tokens.append(dehyphenated_token_joined)
dehyphenated_token = []
letter_present = 0
dehyphenated = 1
second_word_in_compound = 0
if letter_present == 1 and dehyphenated == 0:
short_clean_tokens.append(token) #catching the tokens that didn't have any special characters; but not the dehyphenated ones twice
elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
short_clean_tokens.append(''.join(map(str, dehyphenated_token)))
return short_clean_tokens
def calculate_freq_dist_as_clean_panda(list_of_items, most_common_limit=False):
if not most_common_limit:
freq_dist = FreqDist(list_of_items)
else:
freq_dist = FreqDist(list_of_items).most_common(most_common_limit)
# convert the FreqDist object to a pandas Series for easier processing
dist_panda = pd.Series(dict(freq_dist))
# sort the Series by its index, then normalise by the total number of items and round to three decimals
new_dist = (dist_panda.sort_index() / len(list_of_items)).round(3)
return new_dist
def mendenhall_token_metrics(tokens):
# create the distribution of token lengths / Mendenhall curve
token_lengths = [len(token) for token in tokens]
# Calculate the trimmed token lengths (trimming 0.5% of the values in total, 0.25% from each end). We need to remove the outliers, bc even despite preprocessing,
# there still are some very wrong lengths, which entirely skew the metrics and also ruin our p-values later on
trim_percent = 0.005
trim_len = int(len(token_lengths) * trim_percent / 2)
token_lengths = sorted(token_lengths)[trim_len:-trim_len]
new_token_len_dist = calculate_freq_dist_as_clean_panda(token_lengths, most_common_limit=15) # token len freq dist
standard_deviation = statistics.stdev(token_lengths)
mean = statistics.mean(token_lengths)
return new_token_len_dist, standard_deviation, mean
def plot_distribution(x, y, plt_title, file_path_for_pic:str, x_label="Number of Kudos", y_label="Percentage of Occurrence", palette="flare", plt_type="barplot", add_labels=True, rotate_ticks=True):
plt.figure(figsize=(10,10))
plt.title(plt_title)
plt.xlabel(x_label)
plt.ylabel(y_label)
if add_labels:
addlabels(x=x.index, y=y.values)
match plt_type:
case "scatterplot":
sns.scatterplot(x=x.index, y=y.values, palette=palette)
case "lineplot":
sns.lineplot(x=x.index, y=y.values, palette=palette)
case "barplot":
sns.barplot(x=x.index, y=y.values, palette=palette)
case "histplot":
sns.histplot(x=x.index, y=y.values, palette=palette)
case _:
print(f"{plt_type} is not a valid format for this function")
if rotate_ticks:
plt.xticks(rotation=30) # !!! very useful for words
plt.savefig(file_path_for_pic)
plt.close()
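# (note: the match statement in plot_distribution above requires Python 3.10 or newer)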
def pos_tag_freq(tokens):
#nltk.pos_tag(text) --> [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
#('completely', 'RB'), ('different', 'JJ')]
tag_token_tuples = pos_tag(tokens)
punctuation_regex = r"[^\w\s]+"
summarised_tags = []
punctuation_tags = []
modal_verbs = []
index = 0
for token, tag in tag_token_tuples:
if re.match(punctuation_regex, token):
summarised_tags.append("punctuation")
if re.match(r"[\"\'“”’‘]+", token):
punctuation_tags.append("quotation_marks")
elif re.match(r"[,;:.?!-]+", token):
try:
punctuation_tags.append("ellipsis" if token == "." and tag_token_tuples[index+1][1] == "." and tag_token_tuples[index+2][1] == "." else "full_stop" if token == "." else "question_mark" if token == "?" else "exclamation_mark" if token == "!" else "comma" if token == "," else "semicolon" if token == ";" else "dash" if token == "-" else "other_punct")
except:
punctuation_tags.append("full_stop" if token == "." else "question_mark" if token == "?" else "exclamation_mark" if token == "!" else "comma" if token == "," else "semicolon" if token == ";" else "dash" if token == "-" else "other_punct")
else:
if tag in ["MD"]:
summarised_tags.append("modal verb")
modal_verbs.append(token.lower())
elif tag in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
summarised_tags.append("verb")
elif tag in ["JJ", "JJR", "JJS"]:
summarised_tags.append("adjective")
elif tag in ["RB", "RBR", "RBS", "WRB"]:
summarised_tags.append("adverb")
elif tag in ["PRP", "PRP$", "WP", "WP$"]:
summarised_tags.append("pronoun")
elif tag in ["NNP", "NNPS"]:
summarised_tags.append("proper_noun")
elif tag in ["NN", "NNS"]:
summarised_tags.append("common_noun")
elif tag in ["DT", "PDT", "WDT"]:
summarised_tags.append("determiner")
elif tag == "CC":
summarised_tags.append("coordinating_conj")
elif tag == "IN":
summarised_tags.append("subordinating_conj")
elif tag in ["$", "CD", "EX", "LS", "POS", "SYM", "TO", "UH", "RP", "FW"]:
summarised_tags.append("other_tag")
index += 1
#pos tag freq dist
new_tag_freq_dist = calculate_freq_dist_as_clean_panda(summarised_tags)
#punctuation frequency distribution
new_punct_tag_freq_dist = calculate_freq_dist_as_clean_panda(punctuation_tags)
# modal verbs in more detail
new_md_freq_dist_panda = calculate_freq_dist_as_clean_panda(modal_verbs, most_common_limit=10)
return new_tag_freq_dist, new_punct_tag_freq_dist, new_md_freq_dist_panda
#f"throne_of_glass/data/canon_works"
def extract_info_from_directory_path(directory_path):
#for txt_fic in os.listdir(directory_path):
works = os.listdir(directory_path)
pattern = r"^[a-zA-Z_]+(?=/)" # get series from directory path
match = re.search(pattern, directory_path)
if match:
series = match.group(0)
for work in works:
with open(f"{directory_path}"+f"/{work}", "r") as f:
f = f.read()
std_dev_tk, mean_tk, ttr = mendenhall_curve(f, f"Mendenhall Curve for the {series.replace('_' , ' ').title()} {work[:-4].replace('_' , ' ').title()}", f"{series}/freq_distribution/{work[:-4]}_token_len.png")
mean_tokens.append(mean_tk)
def calculate_sent_len_dist(text):
sents = sent_tokenize(text)
sent_lens = []
for sent in sents:
short_clean_tokens = tokenize_and_clean_text(sent)
sent_lens.append(len(short_clean_tokens))
#if len(short_clean_tokens)>= 90:
#print(f"This sentence: \n {sent} \n is this long: {len(short_clean_tokens)}")
# Calculate the trimmed mean sentence length (with 5% trimming) We need to remove the outliers, bc even despite preprocessing,
# there still are some sentences that are 1200 tokens long, which entirely skews the metrics and also ruins our p-values later on
trim_percent = 0.05
trim_len = int(len(sent_lens) * trim_percent / 2)
sent_lens = sorted(sent_lens)[trim_len:-trim_len]
sent_len_dist = calculate_freq_dist_as_clean_panda(sent_lens) #new_sent_len_dist
# plot the 25 most frequent sentence lengths as a barplot for a more detailed insight
sent_len_dist_short = calculate_freq_dist_as_clean_panda(sent_lens, most_common_limit=25)
# calculate the standard deviation, mean
standard_deviation_sent = statistics.stdev(sent_lens)
mean_sent = statistics.mean(sent_lens)
return sent_len_dist, sent_len_dist_short, standard_deviation_sent, mean_sent
class StylometryMetrics:
def __init__(self, directory_path, name_of_work, quality="", fanfiction=True):
self.text = read_works_into_string(directory_path)
self.clean_tokens = tokenize_and_clean_text(self.text)
self.name = name_of_work
self.fanfiction = fanfiction
self.quality = quality # good medium bad
def determine_titles(self, plot_topic):
if self.fanfiction:
plt_title = f"{plot_topic} for the {self.name} {self.quality} Fanfiction"
else:
plt_title = f"{plot_topic} for the {self.name} Canon"
return plt_title
def calculate_standardised_ttr(self):
self.sttr = standardised_type_token_ratio(self.clean_tokens)
def calculate_mendenhall_token_metrics(self):
self.tk_len_dist, self.tk_len_std, self.tk_len_mean = mendenhall_token_metrics(self.clean_tokens)
def plot_token_metrics(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Token Frequency Distribution")
plot_distribution(x=self.tk_len_dist, y=self.tk_len_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Token Length", y_label="Percentage of Occurrence")
def calculate_pos_tag_distribution(self):
self.tag_freq_dist, self.punct_tag_freq_dist, self.md_freq_dist = pos_tag_freq(self.clean_tokens)
def calculate_sent_len_distribution(self):
self.sent_len_dist, self.sent_len_dist_short, self.sent_std_dev, self.sent_mean = calculate_sent_len_dist(self.text)
def plot_long_sent_len_dist(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Full Sentence Length Distribution")
plot_distribution(x=self.sent_len_dist, y=self.sent_len_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Sentence Lengths", y_label="Percentage of Occurrence", plt_type="lineplot")
def plot_short_sent_len_dist(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Short Sentence Length Distribution")
plot_distribution(x=self.sent_len_dist_short, y=self.sent_len_dist_short, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Sentence Lengths", y_label="Percentage of Occurrence")
def plot_pos_tag_freq(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="POS Tag Frequencies")
plot_distribution(x=self.tag_freq_dist, y=self.tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="POS Tags", y_label="Percentage of Occurrence")
def plot_md_freq(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Modal Verb Frequencies")
plot_distribution(x=self.md_freq_dist, y=self.md_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Modal Verbs", y_label="Percentage of Occurrence")
def plot_punct_freq(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Punctuation Frequencies")
plot_distribution(x=self.punct_tag_freq_dist, y=self.punct_tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Types of Punctuation", y_label="Percentage of Occurrence")
# overall pos_tag frequency distribution
# pos_tag ngrams; (maybe exclude stopwords?)
# tag collocates for specific tags --> adjectives most frequently with nouns
# most frequent words
# most frequent words for specific tags --> punctuation;
# most frequent adjectives
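# A minimal sketch of one of the planned analyses above (POS-tag bigram frequencies), reusing the
# helpers in this file; pos_tag_bigram_freq is a hypothetical name, not yet part of the pipeline.
def pos_tag_bigram_freq(tokens, most_common_limit=20):
    # tag the tokens, join each pair of adjacent tags into one label, then reuse the shared FreqDist helper
    tag_token_tuples = pos_tag(tokens)
    tag_bigrams = [f"{first[1]}_{second[1]}" for first, second in zip(tag_token_tuples, tag_token_tuples[1:])]
    return calculate_freq_dist_as_clean_panda(tag_bigrams, most_common_limit=most_common_limit)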
#create the Mendenhall Curve for the Throne of Glass Series
#std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
#create the Mendenhall Curve for the Grishaverse Books
#std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
# Mendenhall Curve Sentence Lengths for Throne of Glass Canon
#std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Throne of Glass Series", "throne_of_glass", "canon")
# Mendenhall Curve Sentence Lengths for Grishaverse Canon
#std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Grishaverse Books", "grishaverse", "canon")
# POS Tag frequencies for TOG
#pos_tag_frequencies(read_works_into_string(f"throne_of_glass/data/canon_works"), "throne_of_glass", "canon")
# POS Tag frequencies for Grishaverse
#pos_tag_frequencies(read_works_into_string(f"grishaverse/data/canon_works"), "grishaverse", "canon")
def run_functions(directory_path):
"""
mean_tks = []
idx = []
std_dev_tks = []
ttrs = []
mean_sts= []
std_dev_sts = []
"""
#for txt_fic in os.listdir(directory_path):
works = os.listdir(directory_path)
pattern = r"^[a-zA-Z_]+(?=/)" # get series from directory path
match = re.search(pattern, directory_path)
if match:
series = match.group(0)
for work in works:
with open(f"{directory_path}"+f"/{work}", "r") as f:
f = f.read()
std_dev_tk, mean_tk, ttr = mendenhall_curve(f, f"Mendenhall Curve for the {series.replace('_' , ' ').title()} {work[:-4].replace('_' , ' ').title()}", f"{series}/freq_distribution/{work[:-4]}_token_len.png")
mean_tokens.append(mean_tk)
std_dev_tokens.append(std_dev_tk)
type_token_ratio.append(ttr)
std_dev_st, mean_st = sentence_metrics(f, f"Mendenhall Curve for Sentence Lengths for the {series.replace('_' , ' ').title()} {work[:-4].replace('_' , ' ').title()}", series, work[:-4])
mean_sent.append(mean_st)
std_dev_sents.append(std_dev_st)
pos_tag_frequencies(f, series, work[:-4])
index.append(f"{series}_{work[:-4]}")
#grishaverse/data/split_txt_fanfics
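# NOTE: the *_canon variables used below are produced by the commented-out legacy calls further up;
# with those calls disabled this function would raise a NameError, and the DataFrame it builds is neither returned nor saved.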
def create_dataframe_with_overview_info():
#create lists for each of the columns of the dataframe we'll create
mean_tokens = [mean_tokens_tog_canon, mean_tokens_grishaverse_canon]
std_dev_tokens = [std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon]
type_token_ratio = [type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon]
mean_sent = [mean_sent_tog_canon, mean_sent_grishaverse_canon]
std_dev_sents = [std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon]
index = ["throne_of_glass_canon", "grishaverse_canon"]
# create a dataframe to store all the overview statistics in
# columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
# mean_sent; std_dev_sent; freq_sent_len ....
# tag_frequencies
# tag_ngram_frequencies
# punctuation frequencies
# token/type ratio
data_overview = pd.DataFrame(
{"mean_tokens":mean_tokens,
"std_dev_tokens":std_dev_tokens,
"type_token_ratio":type_token_ratio,
"mean_sent":mean_sent,
"std_dev_sent":std_dev_sents},
index = index
)
if __name__ == "__main__":
#run_functions("grishaverse/data/split_txt_fanfics")
#run_functions("throne_of_glass/data/split_txt_fanfics")
#data_overview.to_csv(f"data_overview/data_overview.csv")
GrishaverseCanon = StylometryMetrics(directory_path="grishaverse/data/canon_works", name_of_work="Grishaverse", fanfiction=False)
GrishaverseCanon.calculate_pos_tag_distribution()
GrishaverseCanon.plot_md_freq("grishaverse/plots/canon/md_freq.png")
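# A minimal usage sketch for the fanfiction side (the directory and output paths are assumptions; adjust to the actual folder layout):
# GrishaverseGoodFanfics = StylometryMetrics(directory_path="grishaverse/data/split_txt_fanfics/good", name_of_work="Grishaverse", quality="Good", fanfiction=True)
# GrishaverseGoodFanfics.calculate_standardised_ttr()
# GrishaverseGoodFanfics.calculate_mendenhall_token_metrics()
# GrishaverseGoodFanfics.plot_token_metrics("grishaverse/plots/good_fanfics/token_len.png")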
# code snippets for prettifying plots
#colours
CB91_Blue = '#2CBDFE'
CB91_Green = '#47DBCD'
CB91_Pink = '#F3A0F2'
CB91_Purple = '#9D2EC5'
CB91_Violet = '#661D98'
CB91_Amber = '#F5B14C'
color_list = [CB91_Pink, CB91_Blue, CB91_Green, CB91_Amber,
CB91_Purple, CB91_Violet]
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
#some colour palette playing around
cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)
CB91_Purple = '#9D2EC5'
CB91_Violet = '#661D98'
CB91_Amber = '#F5B14C'
color_list = [ blue_grey, CB91_Amber, pink, light_green, CB91_Green, CB91_Pink, CB91_Blue,
CB91_Purple, CB91_Violet]
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)
#palette_1 = sns.color_palette("flare")
#palette_2 = sns.color_palette("mako_r", as_cmap=True)
# actual preprocessing code
#file header:
# work_id,title,author,rating,category,fandom,relationship,character,additional tags,language,published,status,status date,words,chapters,comments,kudos,bookmarks,hits,all_kudos,all_bookmarks,body
# 27852922,Dealing with Our Demons,['ravenyenn19'],Mature,F/M,"Six of Crows Series",Kaz Brekker/Inej Ghafa,"Kaz B","Romance,Kanej - Freeform, Eventual Smut",English,2020-12-03,Updated,2023-03-16,747673,162/?,8573,12204,1373,709212,"['ud4m', 'book_addict_1228', 'ephemeraldelights', 'bluedelilah25', 'sunshinecorsets', 'I_do_not_like_purple_glasses', 'beep_boop_00', 'schleswigholstein', 'moonandstars75', 'ewerythingoes', 'mindfighters', 'rosibunnis', 'Lizie06', 'ghostlatte', 'aguswolman', 'QueenofEnglan', 'JenBoyette04', 'gnitneb_reads', 'gloomysunshine', 'v1ofvs', 'BazzaKrekker', 'BookGeek', 'poppyflower19', 'Cassanibal', 'vanilla_chai_tea', 'Honorthyword', 'mariaarmengol', 'luc1inda', 'zarawrites', 'monmough', 'Guilty__Pleasures', 'Ilyann', 'folieadeux_0_0', 'dragonguard', 'Emeliemarx', 'angrydabee', 'slythxrclaw', 'samaram0215', 'letsgetthisbread69', 'Mintmew', 'biblichour', 'Katloupet', 'Miss_ginger', 'inejsquake', 'Arabella_7833', 'flossy_flo99', 'a_k123', 'hushedwanderer', 'siriuslymichele', 'AnnaAvinaVTDX']",[],"Dear Kaz,
grisha_fanfics = pd.read_csv("grishaverse/data/fanfics/grishaverse_fics.csv")
tog_fanfics = pd.read_csv("throne_of_glass/data/fanfics/throne_of_glass_fics.csv")
""" def read_csv_to_pd(file_path, name_of_file) -> pd: #fix type hints
# plot distribution of kudos for Grishaverse Fanfics name_of_file = pd.read_csv(file_path)
return name_of_file
grisha_kudos = grisha_fanfics["kudos"].values.tolist()
grisha_kudos_freq_dist = FreqDist(grisha_kudos)
# convert to FreqDist object to a pandas series for easier processing
dist_panda = pd.Series(dict(grisha_kudos_freq_dist))
#print(dist_panda)
# sort, normalise and round the panda series
new_dist = dist_panda.sort_index()
for i in range(0, len(new_dist.index)):
#for index in new_token_len_dist.index:
new_dist.iat[i] = round(new_dist.iat[i]/len(grisha_kudos), 3) #index-1 bc the index starts counting from zero, the word lengths not
#if float(new_token_len_dist.iat[i]) == 0.00:
# new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
#calculate cumulative distribution
cum_dist = np.cumsum(new_dist.values)
# plot using matplotlib and seaborn
# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))
# call function for bar (value) labels
#addlabels(x=new_sent_len_dist.index, y=new_sent_len_dist.values)
plt.title("Grishaverse Cumulative Frequency Distribution of All Kudos")
ax.set_xlabel("Number of Kudos")
ax.set_ylabel("Percentage of Occurence")
def calculate_cum_kudo_distribution(fanfic_pd):
fanfic_kudos = fanfic_pd["kudos"].values.tolist()
fanfic_kudos_freq_dist = FreqDist(fanfic_kudos)
# convert to FreqDist object to a pandas series for easier processing
dist_panda = pd.Series(dict(fanfic_kudos_freq_dist))
sns.lineplot(x=new_dist.index, y=cum_dist, ax=ax) # sort, normalise and round the panda series
#plt.xticks(rotation=30) !!! very useful for words new_dist = dist_panda.sort_index()
fig.savefig(f"grishaverse/freq_distribution/fanfic_kudo_freq_dist.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
""" for i in range(0, len(new_dist.index)):
# plot distribution of kudos for Throne of Glass Fanfics #for index in new_token_len_dist.index:
new_dist.iat[i] = round(new_dist.iat[i]/len(fanfic_kudos), 3) #index-1 bc the index starts counting from zero, the word lengths not
#if float(new_token_len_dist.iat[i]) == 0.00:
# new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
tog_kudos = tog_fanfics["kudos"].values.tolist() #calculate cumulative distribution
cum_dist = np.cumsum(new_dist.values)
return new_dist, cum_dist
tog_kudos_freq_dist = FreqDist(tog_kudos)
# convert to FreqDist object to a pandas series for easier processing
dist_panda = pd.Series(dict(tog_kudos_freq_dist))
#print(dist_panda)
# sort, normalise and round the panda series def plot_distribution(new_dist, cum_dist, plt_title, file_path_for_pic:str, x_label="Number of Kudos", y_label="Percentage of Occurence", scatter_plt=False, max_ticks=10):
plt.figure(figsize=(10,10))
plt.title(plt_title)
plt.xlabel(x_label)
plt.ylabel(y_label)
if scatter_plt:
sns.scatterplot(x=new_dist.index, y=cum_dist)
#plt.xticks(new_dist.index[::100], new_dist.index[::100])
new_dist = dist_panda.sort_index() else:
sns.lineplot(x=new_dist.index, y=cum_dist)
for i in range(0, len(new_dist.index)): plt.savefig(file_path_for_pic)
#for index in new_token_len_dist.index: plt.close()
new_dist.iat[i] = round(new_dist.iat[i]/len(tog_kudos), 3) #index-1 bc the index starts counting from zero, the word lengths not
#if float(new_token_len_dist.iat[i]) == 0.00:
# new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
#calculate cumulative distribution def separate_fanfics_by_good_medium_bad(df, series):
cum_dist = np.cumsum(new_dist.values)
# plot using matplotlib and seaborn
# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))
# call function for bar (value) labels
#addlabels(x=new_sent_len_dist.index, y=new_sent_len_dist.values)
plt.title("Throne of Glass Cumulative Frequency Distribution of Kudos")
ax.set_xlabel("Number of Kudos")
ax.set_ylabel("Percentage of Occurence")
sns.lineplot(x=new_dist.index, y=cum_dist, ax=ax)
#plt.xticks(rotation=30) !!! very useful for words
fig.savefig(f"throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
"""
def preprocess_data(df, series):
good_fics = [] good_fics = []
medium_fics = [] medium_fics = []
bad_fics = [] bad_fics = []
...@@ -163,6 +122,16 @@ def preprocess_data(df, series): ...@@ -163,6 +122,16 @@ def preprocess_data(df, series):
f.write(medium_fics_joined) f.write(medium_fics_joined)
preprocess_data(grisha_fanfics, "grishaverse") if __name__ == "__main__":
preprocess_data(tog_fanfics, "throne_of_glass") #grishaverse
""" #grisha_fanfics = read_csv_to_pd(file_path="grishaverse/data/fanfics/grishaverse_fics.csv", name_of_file=grisha_fanfics)
\ No newline at end of file #new_dist, cum_dist = calculate_cum_kudo_distribution(grisha_fanfics)
#plot_distribution(new_dist=new_dist, cum_dist=cum_dist, plt_title="Grishaverse Cumulative Frequency Distribution of All Kudos", file_path_for_pic="grishaverse/freq_distribution/fanfic_kudo_freq_dist.png", scatter_plt=_plt=True)
#throne of glass
tog_fanfics = read_csv_to_pd(file_path="throne_of_glass/data/fanfics/throne_of_glass_fics.csv", name_of_file=tog_fanfics)
new_dist, cum_dist = calculate_cum_kudo_distribution(tog_fanfics)
plot_distribution(new_dist=new_dist, cum_dist=cum_dist, plt_title="Throne of Glass Cumulative Frequency Distribution of All Kudos", file_path_for_pic= "throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png", scatter_plt=True)
#separate_fanfics_by_good_medium_bad(grisha_fanfics, "grishaverse")
#separate_fanfics_by_good_medium_bad(tog_fanfics, "throne_of_glass")
[Image diff: grishaverse/freq_distribution/fanfic_kudo_freq_dist.png (33 KiB → 41.1 KiB)]
[Image: grishaverse/plots/canon/md_freq.png (43.6 KiB)]
[Image: grishaverse/plots/canon/pos_tag_freq.png (55.9 KiB)]
alki is stupid
me is good for filling stuffs
# you'll have to also download "punkt" from nltk
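# A minimal one-time setup sketch, assuming the standard NLTK resource names:
# word_tokenize / sent_tokenize need "punkt", and pos_tag needs the "averaged_perceptron_tagger" model.
import nltk
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")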
# code snippets for prettifying plots
#colours
CB91_Blue = '#2CBDFE'
CB91_Green = '#47DBCD'
CB91_Pink = '#F3A0F2'
CB91_Purple = '#9D2EC5'
CB91_Violet = '#661D98'
CB91_Amber = '#F5B14C'
color_list = [CB91_Pink, CB91_Blue, CB91_Green, CB91_Amber,
CB91_Purple, CB91_Violet]
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
#some colour palette playing around
cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)
# create function for bar (value) labels
def addlabels(x,y):
for i in range(len(x)):
"std_dev_sent":std_dev_sents}, "std_dev_sent":std_dev_sents},
index = index index = index
) )
if __name__ == "__main__":
run_functions("grishaverse/data/split_txt_fanfics")
run_functions("throne_of_glass/data/split_txt_fanfics")
data_overview.to_csv(f"data_overview/data_overview.csv")
[Image diff: throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png (33.2 KiB → 41.5 KiB)]