Commit 6f37468b authored by Lea Kyveli Chrysanthopoulou's avatar Lea Kyveli Chrysanthopoulou
Clean up code and put it into classes

parent 8dda779c
import seaborn as sns
import matplotlib.pyplot as plt
from cycler import cycler
import os
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag
import pandas as pd
import statistics
import re
# note: you'll also have to download "punkt" (and, for pos_tag, "averaged_perceptron_tagger") from nltk

# helper function for adding bar (value) labels to a plot
def addlabels(x, y):
    for i in range(len(x)):
        plt.text(i, y[i], y[i], ha="center")
# compiles the works in a directory into a single string.
# input: the path of the directory containing the .txt files as a string,
# e.g. "throne_of_glass/data/canon_works"
def read_works_into_string(directory_path):
    strings = []
    works = os.listdir(directory_path)
    for work in works:
        with open(f"{directory_path}/{work}", "r") as f:
            strings.append(f.read())
    return "\n".join(strings)
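# example (using the canon path from further below):
# canon_text = read_works_into_string("throne_of_glass/data/canon_works")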
# by subdividing the text into segments of 1000 tokens, this calculates the type-token ratio
# for each segment and then averages over the segments; this keeps the type-token ratio
# comparable across texts of varying length
def standardised_type_token_ratio(tokens):
    ttrs = []
    segment_tokens = []
    segment = 0
    for token in tokens:
        if segment < 1000:
            segment_tokens.append(token)
            segment += 1
        elif segment == 1000:
            types = set(segment_tokens)
            ttrs.append(len(types) / len(segment_tokens))
            # start the next segment with the current token instead of dropping it
            segment_tokens = [token]
            segment = 1
    if len(ttrs) <= 1:
        types = set(tokens)
        std_ttr = len(types) / len(tokens)
        print("Warning: Text was too short for segmentation!")
    else:
        std_ttr = statistics.mean(ttrs)
    return std_ttr
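# illustration of the segmentation: a 2,500-token text yields two full 1,000-token segments
# whose type-token ratios are averaged (the ~500-token remainder is ignored); texts of roughly
# 2,000 tokens or fewer fall back to the plain type-token ratio and print the warning above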
def tokenize_and_clean_text(text):
    tokens = word_tokenize(text)
    cleaned_tokens = [token for token in tokens if any(c.isalpha() for c in token)]
    # some tokens have implausible lengths because hyphens/dashes were inserted in the wrong
    # places during data conversion, e.g. "everywhere—assassin" (counted as one 19-character
    # token) or "walking-as-fast-as-they-could-without-running" (45 characters). To avoid
    # skewing the word-length metric, such compounds are split at every non-letter character;
    # all texts are preprocessed the same way, so relatively speaking this shouldn't make a
    # difference.
    short_clean_tokens = []
    for token in cleaned_tokens:
        dehyphenated_token = []
        letter_present = 0
        dehyphenated = 0
        second_word_in_compound = 0
        for c in token:
            if c.isalpha():
                dehyphenated_token.append(c)
                letter_present = 1
                if dehyphenated == 1:
                    second_word_in_compound = 1
            elif not c.isalpha() and letter_present == 1:
                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
                short_clean_tokens.append(dehyphenated_token_joined)
                dehyphenated_token = []
                letter_present = 0
                dehyphenated = 1
                second_word_in_compound = 0
        if letter_present == 1 and dehyphenated == 0:
            # token contained no special characters; append it as-is (but don't append the dehyphenated ones twice)
            short_clean_tokens.append(token)
        elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
            short_clean_tokens.append(''.join(map(str, dehyphenated_token)))
    return short_clean_tokens
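# e.g. the problematic token "everywhere—assassin" mentioned above comes out as two tokens,
# "everywhere" and "assassin", instead of one 19-character token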
def mendenhall_token_metrics(tokens):
    # create the distribution of token lengths / Mendenhall curve
    token_lengths = [len(token) for token in tokens]
    # trim the token lengths (0.5% of values, split between both ends) to remove outliers:
    # even after preprocessing some very wrong lengths remain, which would skew the metrics
    # and ruin the p-values later on
    trim_percent = 0.005
    trim_len = int(len(token_lengths) * trim_percent / 2)
    token_lengths = sorted(token_lengths)[trim_len:-trim_len]
    token_length_distribution = FreqDist(token_lengths).most_common(15)
    # convert the FreqDist object to a pandas series for easier processing
    token_len_dist_panda = pd.Series(dict(token_length_distribution))
    # sort, normalise and round the series
    new_token_len_dist = token_len_dist_panda.sort_index()
    for i in range(0, len(new_token_len_dist.index)):
        new_token_len_dist.iat[i] = round(new_token_len_dist.iat[i] / len(tokens), 3)
    standard_deviation = statistics.stdev(token_lengths)
    mean = statistics.mean(token_lengths)
    return new_token_len_dist, standard_deviation, mean
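# note: the 15 most common lengths are normalised by the total token count (not the trimmed
# count), so the returned shares will not sum exactly to 1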
def plot_distribution(x, y, plt_title, file_path_for_pic: str, x_label="Number of Kudos", y_label="Percentage of Occurrence", palette="flare", plt_type="barplot", add_labels=True):
    plt.figure(figsize=(10, 10))
    plt.title(plt_title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    if add_labels:
        addlabels(x=x.index, y=y.values)
    # choose the plot type
    if plt_type == "barplot":
        sns.barplot(x=x.index, y=y.values, palette=palette)
    elif plt_type == "scatterplot":
        sns.scatterplot(x=x.index, y=y.values, palette=palette)
        #plt.xticks(new_dist.index[::100], new_dist.index[::100])
    else:
        sns.lineplot(x=x.index, y=y.values, palette=palette)
    plt.savefig(file_path_for_pic)
    plt.close()


class StylometryMetrics:

    def __init__(self, directory_path):
        self.text = read_works_into_string(directory_path)
        self.clean_tokens = tokenize_and_clean_text(self.text)

    def calculate_standardised_ttr(self):
        self.sttr = standardised_type_token_ratio(self.clean_tokens)

    def calculate_mendenhall_token_metrics(self):
        self.tk_len_dist, self.tk_len_std, self.tk_len_mean = mendenhall_token_metrics(self.clean_tokens)

    def plot_token_length_distribution(self, plt_title, file_path_for_pic):
        # assumed completion of the unfinished plot method: plots the Mendenhall token-length
        # distribution via plot_distribution (calculate_mendenhall_token_metrics must run first)
        plot_distribution(self.tk_len_dist, self.tk_len_dist, plt_title, file_path_for_pic,
                          x_label="Word Length", y_label="Percentage of Occurrence")
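# usage sketch (paths mirror the canon directories used below):
# tog_canon = StylometryMetrics("throne_of_glass/data/canon_works")
# tog_canon.calculate_standardised_ttr()
# tog_canon.calculate_mendenhall_token_metrics()
# tog_canon.plot_token_length_distribution("Mendenhall Curve for the Throne of Glass Series",
#                                          "throne_of_glass/freq_distribution/all_canon_token_len.png")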
# this function takes a corpus as its input and produces a Mendenhall curve, i.e. a frequency
# distribution of token lengths, as its output
# inputs:
# corpus = string
# curve_title = string, the title of the plot that will be produced, e.g. "Mendenhall Curve for Throne of Glass Series"
# plot_destination = string, the (relative) path, including the file name and .png extension, of the plot produced,
# e.g. "throne_of_glass/freq_distribution/all_canon_token_len.png"
def mendenhall_curve(corpus, curve_title, plot_destination):
    short_clean_tokens = tokenize_and_clean_text(corpus)
    # create the distribution of token lengths / Mendenhall curve
    token_lengths = [len(token) for token in short_clean_tokens]
    # trim the token lengths (0.5% of values, split between both ends) to remove outliers:
    # even after preprocessing some very wrong lengths remain, which would skew the metrics
    # and ruin the p-values later on
    trim_percent = 0.005
    trim_len = int(len(token_lengths) * trim_percent / 2)
    token_lengths = sorted(token_lengths)[trim_len:-trim_len]
    token_length_distribution = FreqDist(token_lengths).most_common(15)
    # convert the FreqDist object to a pandas series for easier processing
    token_len_dist_panda = pd.Series(dict(token_length_distribution))
    # sort, normalise and round the series
    new_token_len_dist = token_len_dist_panda.sort_index()
    for i in range(0, len(new_token_len_dist.index)):
        new_token_len_dist.iat[i] = round(new_token_len_dist.iat[i] / len(short_clean_tokens), 3)
    # plot using matplotlib and seaborn
    fig, ax = plt.subplots(figsize=(10, 10))
    # add bar (value) labels
    addlabels(x=new_token_len_dist.index, y=new_token_len_dist.values)
    plt.title(curve_title)
    ax.set_xlabel("Word Length")
    ax.set_ylabel("Percentage of Occurrence")
    sns.barplot(x=new_token_len_dist.index, y=new_token_len_dist.values, ax=ax, palette="flare")
    #plt.xticks(rotation=30)  # very useful for word labels
    plt.savefig(plot_destination)
    # calculate the standard deviation, mean and standardised type-token ratio
    standard_deviation = statistics.stdev(token_lengths)
    mean = statistics.mean(token_lengths)
    type_token_ratio = standardised_type_token_ratio(short_clean_tokens)
    return standard_deviation, mean, type_token_ratio
def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
    sents = sent_tokenize(corpus)
    sent_lens = []
    for sent in sents:
        short_clean_tokens = tokenize_and_clean_text(sent)
        sent_lens.append(len(short_clean_tokens))
    # calculate the trimmed sentence lengths (5% trimming, split between both ends): we need to
    # remove the outliers, because even despite preprocessing there are still sentences that are
    # 1200 tokens long, which entirely skews the metrics and ruins the p-values later on
    trim_percent = 0.05
    trim_len = int(len(sent_lens) * trim_percent / 2)
    sent_lens = sorted(sent_lens)[trim_len:-trim_len]
    sent_len_dist = FreqDist(sent_lens)
    # convert the FreqDist object to a pandas series for easier processing
    sent_len_dist_panda = pd.Series(dict(sent_len_dist))
    # sort, normalise and round the series
    new_sent_len_dist = sent_len_dist_panda.sort_index()
    for i in range(0, len(new_sent_len_dist.index)):
        new_sent_len_dist.iat[i] = round(new_sent_len_dist.iat[i] / len(sent_lens), 2)
    # plot the full distribution as a line plot
    fig, ax = plt.subplots(figsize=(10, 10))
    plt.title(curve_title)
    ax.set_xlabel("Sentence Length")
    ax.set_ylabel("Percentage of Occurrence")
    sns.lineplot(x=new_sent_len_dist.index, y=new_sent_len_dist.values, ax=ax, palette="crest")
    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_sent_len_long.png")
    # plot the 25 most frequent sentence lengths as a bar plot for a more detailed insight
    sent_len_dist_short = FreqDist(sent_lens).most_common(25)
    sent_len_dist_short_panda = pd.Series(dict(sent_len_dist_short))
    new_sent_len_dist_short = sent_len_dist_short_panda.sort_index()
    for i in range(0, len(new_sent_len_dist_short.index)):
        new_sent_len_dist_short.iat[i] = round(new_sent_len_dist_short.iat[i] / len(sent_lens), 2)
    fig, ax = plt.subplots(figsize=(10, 10))
    # add bar (value) labels
    addlabels(x=new_sent_len_dist_short.index, y=new_sent_len_dist_short.values)
    plt.title(curve_title)
    ax.set_xlabel("Sentence Length")
    ax.set_ylabel("Percentage of Occurrence")
    sns.barplot(x=new_sent_len_dist_short.index, y=new_sent_len_dist_short.values, ax=ax, palette="YlGnBu")
    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_sent_len_short.png")
    # calculate the standard deviation and mean of the sentence lengths
    standard_deviation_sent = statistics.stdev(sent_lens)
    mean_sent = statistics.mean(sent_lens)
    return standard_deviation_sent, mean_sent
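# worked example of the trimming: with 10,000 sentences, trim_len = int(10000 * 0.05 / 2) = 250,
# so the 250 shortest and 250 longest sentences are dropped before the mean and standard
# deviation are calculated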
# overall pos_tag frequency distribution
# pos_tag ngrams; (maybe exclude stopwords?)
# tag collocates for specific tags --> adjectives most frequently with nouns
# most frequent words
# most frequent words for specific tags --> punctuation;
# most frequent adjectives
def pos_tag_frequencies(corpus, series, canon_or_fanfic):
    # nltk.pos_tag(text) --> [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
    # ('completely', 'RB'), ('different', 'JJ')]
    tokens = word_tokenize(corpus)
    """
    # disabled earlier version of the token cleaning that also kept the punctuation characters
    short_tokens = []
    for token in tokens:
        dehyphenated_token = []
        letter_present = 0
        dehyphenated = 0
        second_word_in_compound = 0
        for c in token:
            if c.isalpha() == True:
                dehyphenated_token.append(c)
                letter_present = 1
                if dehyphenated == 1:
                    second_word_in_compound = 1
            elif c.isalpha() == False and letter_present == 1:
                # eliminate both dashes and hyphens, because it skews the word metric if
                # red-blue is counted as a 9 character token; all texts are preprocessed the
                # same way, so relatively speaking it shouldn't make a difference
                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
                short_tokens.append(dehyphenated_token_joined)
                short_tokens.append(c)  # append the hyphen / other punctuation --> we're also interested in that
                dehyphenated_token = []
                letter_present = 0
                dehyphenated = 1
                second_word_in_compound = 0
        if letter_present == 1 and dehyphenated == 0:
            short_tokens.append(token)  # catch the tokens that didn't have any special characters
        elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
            short_tokens.append(''.join(map(str, dehyphenated_token)))
    """
    tag_token_tuples = pos_tag(tokens)
    punctuation_regex = r"[^\w\s]+"
    summarised_tags = []
    punctuation_tags = []
    index = 0
    for token, tag in tag_token_tuples:
        if re.match(punctuation_regex, token):
            summarised_tags.append("punctuation")
            if re.match(r"[\"\'“”’‘]+", token):
                punctuation_tags.append("quotation_marks")
            elif re.match(r"[,;:.?!-]+", token):
                if token == ".":
                    # look ahead to catch ellipses that were tokenized as three separate full stops
                    try:
                        is_ellipsis = (tag_token_tuples[index + 1][1] == "." and tag_token_tuples[index + 2][1] == ".")
                    except IndexError:
                        is_ellipsis = False
                    punctuation_tags.append("ellipsis" if is_ellipsis else "full_stop")
                elif token == "?":
                    punctuation_tags.append("question_mark")
                elif token == "!":
                    punctuation_tags.append("exclamation_mark")
                elif token == ",":
                    punctuation_tags.append("comma")
                elif token == ";":
                    punctuation_tags.append("semicolon")
                elif token == "-":
                    punctuation_tags.append("dash")
                else:
                    punctuation_tags.append("other_punct")
        else:
            if tag in ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
                summarised_tags.append("verb")
            elif tag in ["JJ", "JJR", "JJS"]:
                summarised_tags.append("adjective")
            elif tag in ["RB", "RBR", "RBS", "WRB"]:
                summarised_tags.append("adverb")
            elif tag in ["PRP", "PRP$", "WP", "WP$"]:
                summarised_tags.append("pronoun")
            elif tag in ["NNP", "NNPS"]:
                summarised_tags.append("proper_noun")
            elif tag in ["NN", "NNS"]:
                summarised_tags.append("common_noun")
            elif tag in ["DT", "PDT", "WDT"]:
                summarised_tags.append("determiner")
            elif tag == "CC":
                summarised_tags.append("coordinating_conj")
            elif tag == "IN":
                summarised_tags.append("subordinating_conj")
            elif tag in ["$", "CD", "EX", "LS", "POS", "SYM", "TO", "UH", "RP", "FW"]:
                summarised_tags.append("other_tag")
        index += 1
    # summarised POS tag frequency distribution
    tag_freq_dist = FreqDist(summarised_tags)
    # convert the FreqDist object to a pandas series for easier processing
    tag_freq_dist_panda = pd.Series(dict(tag_freq_dist))
    # sort, normalise and round the series
    new_tag_freq_dist = tag_freq_dist_panda.sort_index()
    for i in range(0, len(new_tag_freq_dist.index)):
        new_tag_freq_dist.iat[i] = round(new_tag_freq_dist.iat[i] / len(tag_token_tuples), 2)
    print(new_tag_freq_dist)
    fig, ax = plt.subplots(figsize=(10, 10))
    # add bar (value) labels
    addlabels(x=new_tag_freq_dist.index, y=new_tag_freq_dist.values)
    plt.title(f"POS Tag Frequencies for the {series.replace('_', ' ').title()} {canon_or_fanfic.replace('_', ' ').title()}")
    ax.set_xlabel("POS Tags")
    ax.set_ylabel("Percentage of Occurrence")
    sns.barplot(x=new_tag_freq_dist.index, y=new_tag_freq_dist.values, ax=ax, palette="RdPu")
    plt.xticks(rotation=30)  # very useful for word labels
    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_pos_tag_frequencies.png")
    # punctuation frequency distribution
    punct_tag_freq_dist = FreqDist(punctuation_tags)
    punct_tag_freq_dist_panda = pd.Series(dict(punct_tag_freq_dist))
    new_punct_tag_freq_dist = punct_tag_freq_dist_panda.sort_index()
    for i in range(0, len(new_punct_tag_freq_dist.index)):
        new_punct_tag_freq_dist.iat[i] = round(new_punct_tag_freq_dist.iat[i] / len(punctuation_tags), 3)
    fig, ax = plt.subplots(figsize=(10, 10))
    addlabels(x=new_punct_tag_freq_dist.index, y=new_punct_tag_freq_dist.values)
    plt.title(f"Punctuation Frequencies for the {series.replace('_', ' ').title()} {canon_or_fanfic.replace('_', ' ').title()}")
    ax.set_xlabel("Types of Punctuation")
    ax.set_ylabel("Percentage of Occurrence")
    sns.barplot(x=new_punct_tag_freq_dist.index, y=new_punct_tag_freq_dist.values, ax=ax, palette="OrRd")
    plt.xticks(rotation=30)
    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_punctuation_frequencies.png")
#create the Mendenhall Curve for the Throne of Glass Series
std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
#create the Mendenhall Curve for the Grishaverse Books
std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
# Mendenhall Curve Sentence Lengths for Throne of Glass Canon
std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Throne of Glass Series", "throne_of_glass", "canon")
# Mendenhall Curve Sentence Lengths for Grishaverse Canon
std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Grishaverse Books", "grishaverse", "canon")
# POS Tag frequencies for TOG
pos_tag_frequencies(read_works_into_string(f"throne_of_glass/data/canon_works"), "throne_of_glass", "canon")
# POS Tag frequencies for Grishaverse
pos_tag_frequencies(read_works_into_string(f"grishaverse/data/canon_works"), "grishaverse", "canon")
def run_functions(directory_path):
    """
    mean_tks = []
    idx = []
    std_dev_tks = []
    ttrs = []
    mean_sts = []
    std_dev_sts = []
    """
    #for txt_fic in os.listdir(directory_path):
    works = os.listdir(directory_path)
    pattern = r"^[a-zA-Z_]+(?=/)"  # get the series name from the directory path
    match = re.search(pattern, directory_path)
    if match:
        series = match.group(0)
    for work in works:
        with open(f"{directory_path}/{work}", "r") as f:
            text = f.read()
            std_dev_tk, mean_tk, ttr = mendenhall_curve(text, f"Mendenhall Curve for the {series.replace('_', ' ').title()} {work[:-4].replace('_', ' ').title()}", f"{series}/freq_distribution/{work[:-4]}_token_len.png")
            mean_tokens.append(mean_tk)
            std_dev_tokens.append(std_dev_tk)
            type_token_ratio.append(ttr)
            std_dev_st, mean_st = sentence_metrics(text, f"Mendenhall Curve for Sentence Lengths for the {series.replace('_', ' ').title()} {work[:-4].replace('_', ' ').title()}", series, work[:-4])
            mean_sent.append(mean_st)
            std_dev_sents.append(std_dev_st)
            pos_tag_frequencies(text, series, work[:-4])
            index.append(f"{series}_{work[:-4]}")
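# the series prefix is taken from the directory path, e.g.
# re.search(r"^[a-zA-Z_]+(?=/)", "grishaverse/data/split_txt_fanfics").group(0) == "grishaverse"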
# create lists for each of the columns of the dataframe we'll create,
# pre-filled with the canon values computed above
mean_tokens = [mean_tokens_tog_canon, mean_tokens_grishaverse_canon]
std_dev_tokens = [std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon]
type_token_ratio = [type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon]
mean_sent = [mean_sent_tog_canon, mean_sent_grishaverse_canon]
std_dev_sents = [std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon]
index = ["throne_of_glass_canon", "grishaverse_canon"]

if __name__ == "__main__":
    run_functions("grishaverse/data/split_txt_fanfics")
    run_functions("throne_of_glass/data/split_txt_fanfics")
    # create a dataframe to store all the overview statistics in
    # columns: mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
    # mean_sent; std_dev_sent; freq_sent_len ...
    # tag_frequencies
    # tag_ngram_frequencies
    # punctuation frequencies
    # token/type ratio
    # (built only after run_functions has appended the fanfic rows to the lists above,
    # otherwise the CSV would only contain the two canon rows)
    data_overview = pd.DataFrame(
        {"mean_tokens": mean_tokens,
         "std_dev_tokens": std_dev_tokens,
         "type_token_ratio": type_token_ratio,
         "mean_sent": mean_sent,
         "std_dev_sent": std_dev_sents},
        index=index
    )
    data_overview.to_csv("data_overview/data_overview.csv")
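# the resulting data_overview.csv then has one row per corpus, e.g.
# ,mean_tokens,std_dev_tokens,type_token_ratio,mean_sent,std_dev_sent
# throne_of_glass_canon,...
# grishaverse_canon,...
# grishaverse_<fanfic_file>,...   (one row per fanfic file appended by run_functions)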
# code snippets for prettifying plots
#colours
CB91_Blue = '#2CBDFE'
CB91_Green = '#47DBCD'
CB91_Pink = '#F3A0F2'
CB91_Purple = '#9D2EC5'
CB91_Violet = '#661D98'
CB91_Amber = '#F5B14C'
color_list = [CB91_Pink, CB91_Blue, CB91_Green, CB91_Amber,
CB91_Purple, CB91_Violet]
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
#some colour palette playing around
cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)
@@ -22,7 +22,7 @@ CB91_Purple = '#9D2EC5'
CB91_Violet = '#661D98'
CB91_Amber = '#F5B14C'
-color_list = [pink, light_green, purple_grey, blue_grey, CB91_Green, CB91_Pink, CB91_Blue, CB91_Amber,
+color_list = [blue_grey, CB91_Amber, pink, light_green, CB91_Green, CB91_Pink, CB91_Blue,
CB91_Purple, CB91_Violet]
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
@@ -32,11 +32,11 @@ cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)
#palette_1 = sns.color_palette("flare")
#palette_2 = sns.color_palette("mako_r", as_cmap=True)
# actual preprocessing code
#file header:
# work_id,title,author,rating,category,fandom,relationship,character,additional tags,language,published,status,status date,words,chapters,comments,kudos,bookmarks,hits,all_kudos,all_bookmarks,body
# 27852922,Dealing with Our Demons,['ravenyenn19'],Mature,F/M,"Six of Crows Series",Kaz Brekker/Inej Ghafa,"Kaz B","Romance,Kanej - Freeform, Eventual Smut",English,2020-12-03,Updated,2023-03-16,747673,162/?,8573,12204,1373,709212,"['ud4m', 'book_addict_1228', 'ephemeraldelights', 'bluedelilah25', 'sunshinecorsets', 'I_do_not_like_purple_glasses', 'beep_boop_00', 'schleswigholstein', 'moonandstars75', 'ewerythingoes', 'mindfighters', 'rosibunnis', 'Lizie06', 'ghostlatte', 'aguswolman', 'QueenofEnglan', 'JenBoyette04', 'gnitneb_reads', 'gloomysunshine', 'v1ofvs', 'BazzaKrekker', 'BookGeek', 'poppyflower19', 'Cassanibal', 'vanilla_chai_tea', 'Honorthyword', 'mariaarmengol', 'luc1inda', 'zarawrites', 'monmough', 'Guilty__Pleasures', 'Ilyann', 'folieadeux_0_0', 'dragonguard', 'Emeliemarx', 'angrydabee', 'slythxrclaw', 'samaram0215', 'letsgetthisbread69', 'Mintmew', 'biblichour', 'Katloupet', 'Miss_ginger', 'inejsquake', 'Arabella_7833', 'flossy_flo99', 'a_k123', 'hushedwanderer', 'siriuslymichele', 'AnnaAvinaVTDX']",[],"Dear Kaz,
@@ -45,90 +45,49 @@ cm2 = sns.cubehelix_palette(as_cmap=True)
grisha_fanfics = pd.read_csv("grishaverse/data/fanfics/grishaverse_fics.csv")
tog_fanfics = pd.read_csv("throne_of_glass/data/fanfics/throne_of_glass_fics.csv")

# functions added in this commit

def read_csv_to_pd(file_path, name_of_file) -> pd.DataFrame:
    name_of_file = pd.read_csv(file_path)
    return name_of_file

def calculate_cum_kudo_distribution(fanfic_pd):
    fanfic_kudos = fanfic_pd["kudos"].values.tolist()
    fanfic_kudos_freq_dist = FreqDist(fanfic_kudos)
    # convert the FreqDist object to a pandas series for easier processing
    dist_panda = pd.Series(dict(fanfic_kudos_freq_dist))
    # sort, normalise and round the series
    new_dist = dist_panda.sort_index()
    for i in range(0, len(new_dist.index)):
        new_dist.iat[i] = round(new_dist.iat[i] / len(fanfic_kudos), 3)
    # calculate the cumulative distribution
    cum_dist = np.cumsum(new_dist.values)
    return new_dist, cum_dist

def plot_distribution(new_dist, cum_dist, plt_title, file_path_for_pic: str, x_label="Number of Kudos", y_label="Percentage of Occurrence", scatter_plt=False, max_ticks=10):
    plt.figure(figsize=(10, 10))
    plt.title(plt_title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    if scatter_plt:
        sns.scatterplot(x=new_dist.index, y=cum_dist)
        #plt.xticks(new_dist.index[::100], new_dist.index[::100])
    else:
        sns.lineplot(x=new_dist.index, y=cum_dist)
    plt.savefig(file_path_for_pic)
    plt.close()

# module-level plotting code removed in this commit (its logic now lives in the functions above)
"""
# plot distribution of kudos for Grishaverse Fanfics
grisha_kudos = grisha_fanfics["kudos"].values.tolist()
grisha_kudos_freq_dist = FreqDist(grisha_kudos)
# convert the FreqDist object to a pandas series for easier processing
dist_panda = pd.Series(dict(grisha_kudos_freq_dist))
# sort, normalise and round the panda series
new_dist = dist_panda.sort_index()
for i in range(0, len(new_dist.index)):
    new_dist.iat[i] = round(new_dist.iat[i]/len(grisha_kudos), 3)
# calculate cumulative distribution
cum_dist = np.cumsum(new_dist.values)
# plot using matplotlib and seaborn
fig, ax = plt.subplots(figsize=(10,10))
plt.title("Grishaverse Cumulative Frequency Distribution of All Kudos")
ax.set_xlabel("Number of Kudos")
ax.set_ylabel("Percentage of Occurence")
sns.lineplot(x=new_dist.index, y=cum_dist, ax=ax)
fig.savefig("grishaverse/freq_distribution/fanfic_kudo_freq_dist.png")

# plot distribution of kudos for Throne of Glass Fanfics
tog_kudos = tog_fanfics["kudos"].values.tolist()
tog_kudos_freq_dist = FreqDist(tog_kudos)
dist_panda = pd.Series(dict(tog_kudos_freq_dist))
new_dist = dist_panda.sort_index()
for i in range(0, len(new_dist.index)):
    new_dist.iat[i] = round(new_dist.iat[i]/len(tog_kudos), 3)
cum_dist = np.cumsum(new_dist.values)
fig, ax = plt.subplots(figsize=(10,10))
plt.title("Throne of Glass Cumulative Frequency Distribution of Kudos")
ax.set_xlabel("Number of Kudos")
ax.set_ylabel("Percentage of Occurence")
sns.lineplot(x=new_dist.index, y=cum_dist, ax=ax)
fig.savefig("throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png")
"""

def separate_fanfics_by_good_medium_bad(df, series):  # renamed from preprocess_data(df, series)
    good_fics = []
    medium_fics = []
    bad_fics = []
@@ -163,6 +122,16 @@ def preprocess_data(df, series):
        f.write(medium_fics_joined)

# removed module-level calls (superseded by the __main__ block below)
"""
preprocess_data(grisha_fanfics, "grishaverse")
preprocess_data(tog_fanfics, "throne_of_glass")
"""

if __name__ == "__main__":
    #grishaverse
    #grisha_fanfics = read_csv_to_pd(file_path="grishaverse/data/fanfics/grishaverse_fics.csv", name_of_file=grisha_fanfics)
    #new_dist, cum_dist = calculate_cum_kudo_distribution(grisha_fanfics)
    #plot_distribution(new_dist=new_dist, cum_dist=cum_dist, plt_title="Grishaverse Cumulative Frequency Distribution of All Kudos", file_path_for_pic="grishaverse/freq_distribution/fanfic_kudo_freq_dist.png", scatter_plt=True)
    #throne of glass
    tog_fanfics = read_csv_to_pd(file_path="throne_of_glass/data/fanfics/throne_of_glass_fics.csv", name_of_file=tog_fanfics)
    new_dist, cum_dist = calculate_cum_kudo_distribution(tog_fanfics)
    plot_distribution(new_dist=new_dist, cum_dist=cum_dist, plt_title="Throne of Glass Cumulative Frequency Distribution of All Kudos", file_path_for_pic="throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png", scatter_plt=True)
    #separate_fanfics_by_good_medium_bad(grisha_fanfics, "grishaverse")
    #separate_fanfics_by_good_medium_bad(tog_fanfics, "throne_of_glass")
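# note: because new_dist is normalised and sorted by kudo count, cum_dist[i] is the share of
# fanfics that received at most new_dist.index[i] kudos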
[image diff: grishaverse/freq_distribution/fanfic_kudo_freq_dist.png updated (33 KiB → 41.1 KiB)]
@@ -13,27 +13,6 @@ import re
# you'll have to also download "punkt" from nltk
# code snippets for prettifying plots
#colours
CB91_Blue = '#2CBDFE'
CB91_Green = '#47DBCD'
CB91_Pink = '#F3A0F2'
CB91_Purple = '#9D2EC5'
CB91_Violet = '#661D98'
CB91_Amber = '#F5B14C'
color_list = [CB91_Pink, CB91_Blue, CB91_Green, CB91_Amber,
CB91_Purple, CB91_Violet]
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
#some colour palette playing around
cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)
# create function for bar (value) labels
def addlabels(x,y):
for i in range(len(x)):
@@ -482,10 +461,6 @@ mean_sent = [mean_sent_tog_canon, mean_sent_grishaverse_canon]
std_dev_sents = [std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon]
index = ["throne_of_glass_canon", "grishaverse_canon"]
run_functions("grishaverse/data/split_txt_fanfics")
run_functions("throne_of_glass/data/split_txt_fanfics")
# create a dataframe to store all the overview statistics in
# columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
# mean_sent; std_dev_sent; freq_sent_len ....
@@ -502,5 +477,9 @@ data_overview = pd.DataFrame(
"std_dev_sent":std_dev_sents},
index = index
)
data_overview.to_csv(f"data_overview/data_overview.csv")
if __name__ == "__main__":
    run_functions("grishaverse/data/split_txt_fanfics")
    run_functions("throne_of_glass/data/split_txt_fanfics")
    data_overview.to_csv(f"data_overview/data_overview.csv")
[image diff: throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png updated (33.2 KiB → 41.5 KiB)]