Commit 57a67ffc authored by chrysanthopoulou

Add sentence length metrics

parent ba349be4
grishaverse/freq_distribution/all_canon_sent_len.png

37.3 KiB

@@ -4,9 +4,11 @@ from cycler import cycler
import os
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize
import pandas as pd
import statistics
# you'll also have to download "punkt" from nltk
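# A hedged one-time setup sketch (not part of this commit): fetch the Punkt models that
# sent_tokenize and word_tokenize rely on, but only if they are not installed already.
import nltk
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")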
# code snippets for prettifying plots
@@ -49,10 +51,6 @@ def read_works_into_string(directory_path):
strings.append(f.read())
return "\n".join(strings)
# this function takes a corpus as its input and gives a Mendenhall curve, i.e. a frequency distribution of tokens as its output
# precise input: corpus = string ;
# curve_title = string, the title of the plot that will be produced, e.g., "Mendenhall Curve for Throne of Glass Series"
# plot_destination = string, the (relative) path, including the file name and .png tag of the plot produced, e.g. f"throne_of_glass/freq_distribution/all_canon_token_len.png"
# by subdividing the text into segments of 1000 tokens, it calculates the type token ratio for each segment and then averages over them
# this ensures comparability of the type token ratios for varying text sizes
@@ -78,7 +76,12 @@ def standardised_type_token_ratio(tokens):
std_ttr = statistics.mean(ttrs)
return std_ttr
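# A minimal sketch (not part of this commit) of the segmentation idea described above:
# split the token list into 1000-token segments, compute the type-token ratio of each
# full segment, and average the per-segment ratios. The function name and the fallback
# for texts shorter than one segment are illustrative assumptions, not repository code.
def sttr_sketch(tokens, segment_size=1000):
    ttrs = []
    for start in range(0, len(tokens) - segment_size + 1, segment_size):
        segment = tokens[start:start + segment_size]
        ttrs.append(len(set(segment)) / segment_size)
    # assumes a non-empty token list; short texts fall back to a plain type-token ratio
    return statistics.mean(ttrs) if ttrs else len(set(tokens)) / len(tokens)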
# this function takes a corpus as its input and gives a Mendenhall curve, i.e. a frequency distribution of tokens as its output
# precise input: corpus = string ;
# curve_title = string, the title of the plot that will be produced, e.g., "Mendenhall Curve for Throne of Glass Series"
# plot_destination = string, the (relative) path, including the file name and .png tag of the plot produced, e.g. f"throne_of_glass/freq_distribution/all_canon_token_len.png"
def mendenhall_curve(corpus, curve_title, plot_destination):
tokens = word_tokenize(corpus)
@@ -154,11 +157,88 @@ def mendenhall_curve(corpus, curve_title, plot_destination):
return standard_deviation, mean, type_token_ratio
def sentence_metrics(corpus, curve_title, plot_destination):
sents = sent_tokenize(corpus)
sent_lens = []
for sent in sents:
tokens = word_tokenize(sent)
#cleaned_tokens = ([token for token in tokens if any(c.isalpha() for c in token)])
"""
short_clean_tokens = [] # when looking at the results, there were some strange token lengths, because somewhere in the data conversion hyphens
# had been added in the wrong places. I had the tokens with very large lengths printed and they had this format, e.g. "everywhere—assassin",
# which was counted as 19 characters long, with some up to 45 characters long: "walking-as-fast-as-they-could-without-running" (a shorter regex-based sketch follows after this function)
for token in cleaned_tokens:
dehyphenated_token = []
letter_present = 0
for c in token:
if c.isalpha() == True:
dehyphenated_token.append(c)
letter_present = 1
elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
#bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
# high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
# relatively speaking
dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
#print(dehyphenated_token_joined)
short_clean_tokens.append(dehyphenated_token_joined)
dehyphenated_token = []
letter_present = 0
"""
sent_lens.append(len(tokens))
sent_len_dist = FreqDist(sent_lens).most_common(50)
# convert the FreqDist object to a pandas Series for easier processing
sent_len_dist_panda = pd.Series(dict(sent_len_dist))
# sort, normalise and round the pandas Series
new_sent_len_dist = sent_len_dist_panda.sort_index()
print(new_sent_len_dist)
for i in range(0, len(new_sent_len_dist.index)):
new_sent_len_dist.iat[i] = round(new_sent_len_dist.iat[i]/len(sent_lens), 2) # normalise each count to a relative frequency
# plot using matplotlib and seaborn
# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))
# call function for bar (value) labels
#addlabels(x=new_sent_len_dist.index, y=new_sent_len_dist.values)
plt.title(curve_title)
ax.set_xlabel("Sentence Length")
ax.set_ylabel("Percentage of Occurrence")
sns.lineplot(x=new_sent_len_dist.index, y=new_sent_len_dist.values, ax=ax, palette="flare")
#plt.xticks(rotation=30) !!! very useful for words
plt.savefig(plot_destination)
# calculate the standard deviation and mean of the sentence lengths
standard_deviation_sent = statistics.stdev(sent_lens)
mean_sent = statistics.mean(sent_lens)
return standard_deviation_sent, mean_sent
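# A compact regex-based sketch (not part of this commit; the function name is hypothetical)
# of the dehyphenation logic described in the commented-out block above: keep every
# alphabetic run of a token, so "everywhere—assassin" yields ["everywhere", "assassin"].
# Unlike str.isalpha(), the ASCII character class below ignores accented letters.
import re
def split_hyphenated_tokens(tokens):
    short_clean_tokens = []
    for token in tokens:
        short_clean_tokens.extend(re.findall(r"[A-Za-z]+", token))
    return short_clean_tokens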
#create the Mendenhall Curve for the Throne of Glass Series
std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
#std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
#create the Mendenhall Curve for the Grishaverse Books
std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
#std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
# Mendenhall Curve Sentence Lengths for Throne of Glass Canon
std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_sent_len.png")
# Mendenhall Curve Sentence Lengths for Grishaverse Canon
std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_sent_len.png")
# create a dataframe to store all the overview statistics in
# columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
@@ -167,5 +247,14 @@ std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_rati
# tag_ngram_frequencies
# punctuation frequencies
# token/type ratio
data_overview = pd.DataFrame({"mean_tokens":[mean_tokens_tog_canon, mean_tokens_grishaverse_canon], "std_dev":[std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon], "type_token_ratio":[type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon]}, index= ["throne_of_glass_canon", "grishaverse_canon"])
data_overview.to_csv(f"data_overview/data_overview.csv")
\ No newline at end of file
"""
data_overview = pd.DataFrame(
{"mean_tokens":[mean_tokens_tog_canon, mean_tokens_grishaverse_canon],
"std_dev":[std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon],
"type_token_ratio":[type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon],
"mean_sent":[mean_sent_tog_canon, mean_sent_grishaverse_canon],
"std_dev":[std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon]},
index= ["throne_of_glass_canon", "grishaverse_canon"]
)
"""
#data_overview.to_csv(f"data_overview/data_overview.csv")
\ No newline at end of file
throne_of_glass/freq_distribution/all_canon_sent_len.png

40.6 KiB
