Commit a8cfdec0 authored by Lea Kyveli Chrysanthopoulou

Add modal verb distribution

parent b9aad95f
@@ -10,8 +10,6 @@ import pandas as pd
import statistics
import re
# you'll also have to download "punkt" from nltk
# create function for bar (value) labels
def addlabels(x,y):
for i in range(len(x)):
@@ -26,7 +24,7 @@ def read_works_into_string(directory_path):
strings = []
works = os.listdir(directory_path)
for work in works:
with open(f"{directory_path}"+f"/{work}", "r") as f:
with open(f"{directory_path}"+f"/{work}", "r", errors='ignore') as f: #ignores mostly unicode errors due to problematic encoding of text files
strings.append(f.read())
return "\n".join(strings)
@@ -91,6 +89,27 @@ def tokenize_and_clean_text(text):
short_clean_tokens.append(''.join(map(str, dehyphenated_token)))
return short_clean_tokens
def calculate_freq_dist_as_clean_panda(list_of_items, most_common_limit=False):
if most_common_limit is False:
freq_dist = FreqDist(list_of_items)
else:
freq_dist = FreqDist(list_of_items).most_common(most_common_limit)
# convert the FreqDist object to a pandas Series for easier processing
dist_panda = pd.Series(dict(freq_dist))
# sort the Series by its index, then normalise and round each count
new_dist = dist_panda.sort_index()
for i in range(len(new_dist.index)):
new_dist.iat[i] = round(new_dist.iat[i]/len(list_of_items), 3) # relative frequency, rounded to three decimals
return new_dist
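A quick usage sketch of the new helper (the sample data is made up; assumes FreqDist and pandas are imported as above):

# sketch: ten token lengths, turned into a sorted, normalised Series
lengths = [1, 2, 2, 3, 3, 3, 3, 3, 3, 3]
print(calculate_freq_dist_as_clean_panda(lengths))
# index 1 -> 0.1, 2 -> 0.2, 3 -> 0.7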
def mendenhall_token_metrics(tokens):
# create the distribution of token lengths / Mendenhall curve
@@ -101,23 +120,8 @@ def mendenhall_token_metrics(tokens):
trim_percent = 0.005
trim_len = int(len(token_lengths) * trim_percent / 2)
token_lengths = sorted(token_lengths)[trim_len:-trim_len]
new_token_len_dist = calculate_freq_dist_as_clean_panda(token_lengths, most_common_limit=15) # token len freq dist
token_length_distribution = FreqDist(token_lengths).most_common(15)
# convert the FreqDist object to a pandas Series for easier processing
token_len_dist_panda = pd.Series(dict(token_length_distribution))
# sort, normalise and round the pandas Series
new_token_len_dist = token_len_dist_panda.sort_index()
for i in range(len(new_token_len_dist.index)):
new_token_len_dist.iat[i] = round(new_token_len_dist.iat[i]/len(tokens), 3)
standard_deviation = statistics.stdev(token_lengths)
mean = statistics.mean(token_lengths)
@@ -157,6 +161,7 @@ def pos_tag_freq(tokens):
punctuation_regex = r"[^\w\s]+"
summarised_tags = []
punctuation_tags = []
modal_verbs = []
index = 0
for token, tag in tag_token_tuples:
if re.match(punctuation_regex, token):
@@ -170,7 +175,10 @@ def pos_tag_freq(tokens):
punctuation_map = {".": "full_stop", "?": "question_mark", "!": "exclamation_mark", ",": "comma", ";": "semicolon", "-": "dash"}
punctuation_tags.append(punctuation_map.get(token, "other_punct"))
else:
if tag in ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
if tag in ["MD"]:
summarised_tags.append("modal verb")
modal_verbs.append(token.lower())
elif tag in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
summarised_tags.append("verb")
elif tag in ["JJ", "JJR", "JJS"]:
summarised_tags.append("adjective")
@@ -192,33 +200,16 @@ def pos_tag_freq(tokens):
summarised_tags.append("other_tag")
index += 1
tag_freq_dist = FreqDist(summarised_tags)
# convert the FreqDist object to a pandas Series for easier processing
tag_freq_dist_panda = pd.Series(dict(tag_freq_dist))
# sort, normalise and round the pandas Series
new_tag_freq_dist = tag_freq_dist_panda.sort_index()
for i in range(len(new_tag_freq_dist.index)):
new_tag_freq_dist.iat[i] = round(new_tag_freq_dist.iat[i]/len(tag_token_tuples), 2)
# POS tag frequency distribution
new_tag_freq_dist = calculate_freq_dist_as_clean_panda(summarised_tags)
#punctuation frequency distribution
punct_tag_freq_dist = FreqDist(punctuation_tags)
new_punct_tag_freq_dist = calculate_freq_dist_as_clean_panda(punctuation_tags)
# convert the FreqDist object to a pandas Series for easier processing
punct_tag_freq_dist_panda = pd.Series(dict(punct_tag_freq_dist))
# sort, normalise and round the pandas Series
new_punct_tag_freq_dist = punct_tag_freq_dist_panda.sort_index()
for i in range(len(new_punct_tag_freq_dist.index)):
new_punct_tag_freq_dist.iat[i] = round(new_punct_tag_freq_dist.iat[i]/len(punctuation_tags), 3)
# modal verbs in more detail
new_md_freq_dist_panda = calculate_freq_dist_as_clean_panda(modal_verbs, most_common_limit=10)
return new_tag_freq_dist, new_punct_tag_freq_dist
return new_tag_freq_dist, new_punct_tag_freq_dist, new_md_freq_dist_panda
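For reference, NLTK's Penn Treebank tagset marks modal verbs as MD, which is what the new branch above collects; a minimal sketch (requires the averaged_perceptron_tagger model to be downloaded once):

import nltk
# nltk.download("averaged_perceptron_tagger")  # one-time setup
print(nltk.pos_tag(["she", "could", "have", "danced"]))
# e.g. [('she', 'PRP'), ('could', 'MD'), ('have', 'VB'), ('danced', 'VBN')]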
#f"throne_of_glass/data/canon_works"
def extract_info_from_directory_path(directory_path):
@@ -234,39 +225,9 @@ def extract_info_from_directory_path(directory_path):
std_dev_tk, mean_tk, ttr = mendenhall_curve(f, f"Mendenhall Curve for the {series.replace('_' , ' ').title()} {work[:-4].replace('_' , ' ').title()}", f"{series}/freq_distribution/{work[:-4]}_token_len.png")
mean_tokens.append(mean_tk)
class StylometryMetrics:
def calculate_sent_len_dist(text):
def __init__(self, directory_path, name_of_work, quality="", fanfiction=True):
self.text = read_works_into_string(directory_path)
self.clean_tokens = tokenize_and_clean_text(self.text)
self.name = name_of_work
self.fanfiction = fanfiction
self.quality = quality # good medium bad
def calculate_standardised_ttr(self):
self.sttr = standardised_type_token_ratio(self.clean_tokens)
def calculate_mendenhall_token_metrics(self):
self.tk_len_dist, self.tk_len_std, self.tk_len_mean = mendenhall_token_metrics(self.clean_tokens)
def plot_token_metrics(self, file_path_for_pic):
plt_title = self.name + " " + (self.quality + " ") if self.fanfiction else "" + "Fanfiction" if self.fanfiction else " Canon" + " Token Frequency Distribution"
plot_distribution(x=self.tk_len_dist, y=self.tk_len_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Token Length", y_label="Percentage of Occurrence")
def calculate_pos_tag_distribution(self):
self.tag_freq_dist, self.punct_tag_freq_dist = pos_tag_freq(self.clean_tokens)
def plot_pos_tag_freq(self, file_path_for_pic):
plt_title = "POS Tag Frequencies for the " + self.name + " " + (self.quality + " ") if self.fanfiction else "" + "Fanfiction" if self.fanfiction else " Canon"
plot_distribution(x=self.tag_freq_dist, y=self.tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="POS Tags", y_label="Percentage of Occurrence")
def plot_punct_freq(self, file_path_for_pic):
plt_title = "Punctuation Frequencies for the " + self.name + " " + (self.quality + " ") if self.fanfiction else "" + "Fanfiction" if self.fanfiction else " Canon"
plot_distribution(x=self.punct_tag_freq_dist, y=self.punct_tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Types of Punctuation", y_label="Percentage of Occurrence")
def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
sents = sent_tokenize(corpus)
sents = sent_tokenize(text)
sent_lens = []
for sent in sents:
short_clean_tokens = tokenize_and_clean_text(sent)
@@ -279,76 +240,68 @@ def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
trim_percent = 0.05
trim_len = int(len(sent_lens) * trim_percent / 2)
sent_lens = sorted(sent_lens)[trim_len:-trim_len]
sent_len_dist = FreqDist(sent_lens)
# convert the FreqDist object to a pandas Series for easier processing
sent_len_dist_panda = pd.Series(dict(sent_len_dist))
# sort, normalise and round the pandas Series
sent_len_dist = calculate_freq_dist_as_clean_panda(sent_lens)
new_sent_len_dist = sent_len_dist_panda.sort_index()
# plot the 25 most frequent sentence lengths as a barplot for a more detailed insight
sent_len_dist_short = calculate_freq_dist_as_clean_panda(sent_lens, most_common_limit=25)
for i in range(len(new_sent_len_dist.index)):
new_sent_len_dist.iat[i] = round(new_sent_len_dist.iat[i]/len(sent_lens), 2)
# plot using matplotlib and seaborn
# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))
# call function for bar (value) labels
#addlabels(x=new_sent_len_dist.index, y=new_sent_len_dist.values)
# calculate the standard deviation, mean
standard_deviation_sent = statistics.stdev(sent_lens)
mean_sent = statistics.mean(sent_lens)
plt.title(curve_title)
ax.set_xlabel("Sentence Length")
ax.set_ylabel("Percentage of Occurence")
sns.lineplot(x=new_sent_len_dist.index, y=new_sent_len_dist.values, ax=ax, palette="crest")
#plt.xticks(rotation=30) # very useful for word labels
plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_sent_len_long.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
return sent_len_dist, sent_len_dist_short, standard_deviation_sent, mean_sent
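The symmetric trimming used above deserves a standalone sketch; the explicit end index guards against the empty result that a [t:-t] slice yields when t happens to be 0:

# sketch: drop roughly 5% of the most extreme values (2.5% per tail)
def trim_outliers(values, trim_percent=0.05):
    trim_len = int(len(values) * trim_percent / 2)
    values = sorted(values)
    return values[trim_len:len(values) - trim_len]

print(len(trim_outliers(list(range(100)))))  # 96: two values cut from each tail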
# plot the 25 most frequent sentence lengths as a barplot for a more detailed insight
sent_len_dist_short = FreqDist(sent_lens).most_common(25)
class StylometryMetrics:
# convert the FreqDist object to a pandas Series for easier processing
sent_len_dist_short_panda = pd.Series(dict(sent_len_dist_short))
def __init__(self, directory_path, name_of_work, quality="", fanfiction=True):
self.text = read_works_into_string(directory_path)
self.clean_tokens = tokenize_and_clean_text(self.text)
self.name = name_of_work
self.fanfiction = fanfiction
self.quality = quality # one of "good", "medium", "bad"
# sort, normalise and round the pandas Series
def determine_titles(self, plot_topic):
if self.fanfiction:
plt_title = f"{plot_topic} for the {self.name} {self.quality} Fanfiction"
else:
plt_title = f"{plot_topic} for the {self.name} Canon"
return plt_title
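For illustration, the titles this helper produces (the attribute values are made up):

# with name="Grishaverse", quality="good", fanfiction=True:
#   "POS Tag Frequencies for the Grishaverse good Fanfiction"
# with fanfiction=False:
#   "POS Tag Frequencies for the Grishaverse Canon"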
new_sent_len_dist_short = sent_len_dist_short_panda.sort_index()
def calculate_standardised_ttr(self):
self.sttr = standardised_type_token_ratio(self.clean_tokens)
for i in range(len(new_sent_len_dist_short.index)):
new_sent_len_dist_short.iat[i] = round(new_sent_len_dist_short.iat[i]/len(sent_lens), 2)
# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))
def calculate_mendenhall_token_metrics(self):
self.tk_len_dist, self.tk_len_std, self.tk_len_mean = mendenhall_token_metrics(self.clean_tokens)
# call function for bar (value) labels
addlabels(x=new_sent_len_dist_short.index, y=new_sent_len_dist_short.values)
def plot_token_metrics(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Token Frequency Distribution")
plot_distribution(x=self.tk_len_dist, y=self.tk_len_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Token Length", y_label="Percentage of Occurrence")
plt.title(curve_title)
ax.set_xlabel("Sentence Length")
ax.set_ylabel("Percentage of Occurrence")
def calculate_pos_tag_distribution(self):
self.tag_freq_dist, self.punct_tag_freq_dist, self.md_freq_dist = pos_tag_freq(self.clean_tokens)
sns.barplot(x=new_sent_len_dist_short.index, y=new_sent_len_dist_short.values, ax=ax, palette="YlGnBu")
#plt.xticks(rotation=30) # very useful for word labels
plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_sent_len_short.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
def calculate_sent_len_distribution(self):
self.sent_len_dist, self.sent_len_dist_short, self.sent_std_dev, self.sent_mean = calculate_sent_len_dist(self.text)
# calculate the standard deviation, mean, token/type ratio
standard_deviation_sent = statistics.stdev(sent_lens)
mean_sent = statistics.mean(sent_lens)
def plot_long_sent_len_dist(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Full Sentence Length Distribution")
plot_distribution(x=self.sent_len_dist, y=self.sent_len_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Sentence Lengths", y_label="Percentage of Occurrence", plt_type="lineplot")
def plot_short_sent_len_dist(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Short Sentence Length Distribution")
plot_distribution(x=self.sent_len_dist_short, y=self.sent_len_dist_short, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Sentence Lengths", y_label="Percentage of Occurrence")
return standard_deviation_sent, mean_sent
def plot_pos_tag_freq(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="POS Tag Frequencies")
plot_distribution(x=self.tag_freq_dist, y=self.tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="POS Tags", y_label="Percentage of Occurrence")
def plot_md_freq(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Modal Verb Frequencies")
plot_distribution(x=self.md_freq_dist, y=self.md_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Modal Verbs", y_label="Percentage of Occurrence")
def plot_punct_freq(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Punctuation Frequencies")
plot_distribution(x=self.punct_tag_freq_dist, y=self.punct_tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Types of Punctuation", y_label="Percentage of Occurrence")
# overall pos_tag frequency distribution
# pos_tag ngrams; (maybe exclude stopwords?)
@@ -358,7 +311,6 @@ def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
# most frequent adjectives
#create the Mendenhall Curve for the Throne of Glass Series
#std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
@@ -410,35 +362,39 @@ def run_functions(directory_path):
#grishaverse/data/split_txt_fanfics
#create lists for each of the columns of the dataframe we'll create
mean_tokens = [mean_tokens_tog_canon, mean_tokens_grishaverse_canon]
std_dev_tokens = [std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon]
type_token_ratio = [type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon]
mean_sent = [mean_sent_tog_canon, mean_sent_grishaverse_canon]
std_dev_sents = [std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon]
index = ["throne_of_glass_canon", "grishaverse_canon"]
# create a dataframe to store all the overview statistics in
# columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
# mean_sent; std_dev_sent; freq_sent_len ....
# tag_frequencies
# tag_ngram_frequencies
# punctuation frequencies
# token/type ratio
data_overview = pd.DataFrame(
{"mean_tokens":mean_tokens,
"std_dev_tokens":std_dev_tokens,
"type_token_ratio":type_token_ratio,
"mean_sent":mean_sent,
"std_dev_sent":std_dev_sents},
index = index
)
def create_dataframe_with_overview_info():
#create lists for each of the columns of the dataframe we'll create
mean_tokens = [mean_tokens_tog_canon, mean_tokens_grishaverse_canon]
std_dev_tokens = [std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon]
type_token_ratio = [type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon]
mean_sent = [mean_sent_tog_canon, mean_sent_grishaverse_canon]
std_dev_sents = [std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon]
index = ["throne_of_glass_canon", "grishaverse_canon"]
# create a dataframe to store all the overview statistics in
# columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
# mean_sent; std_dev_sent; freq_sent_len ....
# tag_frequencies
# tag_ngram_frequencies
# punctuation frequencies
# token/type ratio
data_overview = pd.DataFrame(
{"mean_tokens":mean_tokens,
"std_dev_tokens":std_dev_tokens,
"type_token_ratio":type_token_ratio,
"mean_sent":mean_sent,
"std_dev_sent":std_dev_sents},
index = index
)
return data_overview
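A short usage sketch, assuming the per-series statistics referenced above are in scope and the helper returns the frame:

overview = create_dataframe_with_overview_info()
overview.to_csv("data_overview/data_overview.csv")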
if __name__ == "__main__":
#run_functions("grishaverse/data/split_txt_fanfics")
#run_functions("throne_of_glass/data/split_txt_fanfics")
#data_overview.to_csv(f"data_overview/data_overview.csv")
\ No newline at end of file
#data_overview.to_csv(f"data_overview/data_overview.csv")
GrishaverseCanon = StylometryMetrics(directory_path="grishaverse/data/canon_works", name_of_work="Grishaverse", fanfiction=False)
GrishaverseCanon.calculate_pos_tag_distribution()
GrishaverseCanon.plot_md_freq("grishaverse/plots/canon/md_freq.png")
\ No newline at end of file
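A sketch of how the remaining metrics slot into the same flow (the output paths are illustrative):

GrishaverseCanon.calculate_mendenhall_token_metrics()
GrishaverseCanon.plot_token_metrics("grishaverse/plots/canon/token_len.png")
GrishaverseCanon.calculate_standardised_ttr()
print(GrishaverseCanon.sttr)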
grishaverse/plots/canon/md_freq.png (43.6 KiB)
grishaverse/plots/canon/pos_tag_freq.png (55.9 KiB)
alki is stupid
\ No newline at end of file
me is good for filling stuffs
\ No newline at end of file