Commit 4349e916 authored by chrysanthopoulou

Update diagrams

parent af1dbc08
grishaverse/freq_distribution/all_canon_token_len.png (32.8 KiB)
@@ -4,10 +4,14 @@ from cycler import cycler
import os
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import pandas as pd
import statistics
# you'll also have to download "punkt" from nltk
# make the plots a bit less ugly
# code snippets for prettifying plots
#colours
CB91_Blue = '#2CBDFE'
CB91_Green = '#47DBCD'
@@ -26,6 +30,11 @@ cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)
# create function for bar (value) labels
def addlabels(x, y):
    for i in range(len(x)):
        plt.text(i, y[i], y[i], ha="center")
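# a minimal usage sketch (made-up values, just to illustrate): addlabels is meant to be used
# together with a bar plot drawn from the same x/y data, so each bar gets its value printed on it, e.g.
#   sns.barplot(x=[1, 2, 3], y=[0.2, 0.5, 0.3])
#   addlabels(x=[1, 2, 3], y=[0.2, 0.5, 0.3])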
# function compiling the works given into a single string. Input required:
# general path of the files as string, for example: "/throne_of_glass/data/canon_works/"
@@ -40,77 +49,79 @@ def read_works_into_string(directory_path):
        strings.append(f.read())
    return "\n".join(strings)
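# usage sketch: pass one of the project's data directories and it returns every work in that
# folder joined into one big string, e.g.
#   corpus = read_works_into_string("throne_of_glass/data/canon_works")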
# this function takes a corpus as its input and gives a Mendenhall curve, i.e. a frequency distribution of token lengths, as its output
# precise input: corpus = string;
# curve_title = string, the title of the plot that will be produced, e.g. "Mendenhall Curve for the Throne of Glass Series"
# plot_destination = string, the (relative) path, including the file name and .png extension, of the plot produced, e.g. "throne_of_glass/freq_distribution/all_canon_token_len.png"
tokens = word_tokenize(read_works_into_string("throne_of_glass/data/canon_works"))
cleaned_tokens = [token for token in tokens if any(c.isalpha() for c in token)]
short_clean_tokens = [] # when looking at the results, there were some strange token lengths, because somewhere in the data conversion hyphens
# had been added in the wrong places. I had the tokens with very large lengths printed and they had this format, e.g. "everywhere—assassin",
# which was counted as 19 characters long; some were up to 45 characters long: "walking-as-fast-as-they-could-without-running"
"""
for token in cleaned_tokens:
    dehyphenated_token = []
    letter_present = 0
    if len(token) >= 19:
        for c in token:
            if c.isalpha() == True:
                dehyphenated_token.append(c)
                letter_present = 1
                #print(dehyphenated_token)
            elif c.isalpha() == False and (c == "-" or c == "—") and letter_present == 1: #here I am eliminating both dashes and hyphens,
                #bc the hyphens are used both correctly and incorrectly and it skews my distribution a lot
                #print(dehyphenated_token)
                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
                #print(dehyphenated_token_joined)
                short_clean_tokens.append(dehyphenated_token_joined)
                dehyphenated_token = []
                letter_present = 0
    elif len(token) >= 14:
def mendenhall_curve(corpus, curve_title, plot_destination):
    tokens = word_tokenize(corpus)
    cleaned_tokens = [token for token in tokens if any(c.isalpha() for c in token)]
    short_clean_tokens = [] # when looking at the results, there were some strange token lengths, because somewhere in the data conversion hyphens
    # had been added in the wrong places. I had the tokens with very large lengths printed and they had this format, e.g. "everywhere—assassin",
    # which was counted as 19 characters long; some were up to 45 characters long: "walking-as-fast-as-they-could-without-running"
    for token in cleaned_tokens:
        dehyphenated_token = []
        letter_present = 0
        for c in token:
            if c.isalpha() == True:
                dehyphenated_token.append(c)
                letter_present = 1
                #print(dehyphenated_token)
            elif c == "—" and letter_present == 1: #here I am eliminating only dashes "territory—thanks" but keeping hyphenated
                # words as one "cobbled-together"
                #print(dehyphenated_token)
            elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
                #bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
                # high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
                # relatively speaking
                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
                #print(dehyphenated_token_joined)
                short_clean_tokens.append(dehyphenated_token_joined)
                dehyphenated_token = []
                letter_present = 0
            else:
                short_clean_tokens.append(token)
    """
    for token in cleaned_tokens:
        dehyphenated_token = []
        letter_present = 0
        for c in token:
            if c.isalpha() == True:
                dehyphenated_token.append(c)
                letter_present = 1
            elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
                #bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
                # high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
                # relatively speaking
                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
                #print(dehyphenated_token_joined)
                short_clean_tokens.append(dehyphenated_token_joined)
                dehyphenated_token = []
                letter_present = 0
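    # nb: as the loop stands here, a chunk of letters is only appended when a non-letter follows it,
    # so a purely alphabetic token (and the part after the last dash) never reaches short_clean_tokens;
    # if that is not intended, a guarded append after the inner loop would keep it
    # (a sketch only, not part of the committed code):
    #     if dehyphenated_token:
    #         short_clean_tokens.append(''.join(dehyphenated_token))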
# distribution of token lengths / Mendenhall curve
token_lengths = [len(token) for token in short_clean_tokens]
token_length_distribution = FreqDist(token_lengths)
print(token_length_distribution.tabulate())
token_length_freq_dist_plot = token_length_distribution.plot(title="Token Length Frequency Distribution: Throne of Glass Series", percents=True)
fig_freq_dist = token_length_freq_dist_plot.get_figure()
fig_freq_dist.savefig("throne_of_glass/freq_distribution/all_canon_token_len.png")
for token in short_clean_tokens:
    if len(token) >= 14:
        print(f"this is the word: {token} and it's this long {len(token)}")
#print(read_works_into_string("throne_of_glass/data/canon_works"))
# transform corpus into a list of tokens
\ No newline at end of file
    # create the distribution of token lengths / Mendenhall curve
    token_lengths = [len(token) for token in short_clean_tokens]
    token_length_distribution = FreqDist(token_lengths)
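    # a made-up mini example, just to illustrate the data structure: if short_clean_tokens were
    # ["the", "raven", "cycle"], token_lengths would be [3, 5, 5] and FreqDist would give
    # {3: 1, 5: 2}, i.e. token length -> number of occurrences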
    # convert the FreqDist object to a pandas Series for easier processing
    token_len_dist_panda = pd.Series(dict(token_length_distribution))
    # sort, normalise and round the pandas Series
    new_token_len_dist = token_len_dist_panda.sort_index()
    for i in range(len(new_token_len_dist.index)):
        #for index in new_token_len_dist.index:
        new_token_len_dist.iat[i] = round(new_token_len_dist.iat[i]/len(short_clean_tokens), 2) # iat[i] counts positions from zero; the Series index holds the word lengths themselves
        #if float(new_token_len_dist.iat[i]) == 0.00:
        #    new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
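    # the loop above could equally be written as one vectorised line (same result, sketch only):
    #     new_token_len_dist = (token_len_dist_panda.sort_index() / len(short_clean_tokens)).round(2)
    # dividing the whole Series also returns floats directly, instead of writing floats back
    # into the integer counts one element at a time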
    # plot using matplotlib and seaborn
    # set figure, ax into variables
    fig, ax = plt.subplots(figsize=(10,10))
    # call function for bar (value) labels
    addlabels(x=new_token_len_dist.index, y=new_token_len_dist.values)
    plt.title(curve_title)
    ax.set_xlabel("Word Length")
    ax.set_ylabel("Percentage of Occurrence")
    sns.barplot(x=new_token_len_dist.index, y=new_token_len_dist.values, ax=ax, palette="flare")
    #plt.xticks(rotation=30) !!! very useful for words
    #plt.get_figure()
    plt.savefig(plot_destination)
    #print(new_token_len_dist.tabulate())
    #token_length_freq_dist_plot = token_length_distribution.plot(title=curve_title, percents=True)
    #fig_freq_dist = token_length_freq_dist_plot.get_figure()
    #fig_freq_dist.savefig(plot_destination)
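    # nb: since the function gets called once per series below, adding plt.close(fig) right after
    # plt.savefig(plot_destination) would keep the figures from accumulating in memory
    # (a small optional tweak, not part of the committed code)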
#create the Mendenhall Curve for the Throne of Glass Series
mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
#create the Mendenhall Curve for the Grishaverse Books
mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
throne_of_glass/freq_distribution/all_canon_token_len.png (33 KiB → 34.1 KiB)