Commit 8f71a165
authored 1 year ago by chrysanthopoulou
Remove the hyphens to unskew the Mendenhall curve
Parent: 1617381c
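The motivation is spelled out in the code comments in the diff below: when a stray hyphen or em dash glues two words together, the joined token is counted as one very long word, which inflates the upper tail of the token-length distribution (the Mendenhall curve). A quick check in any Python shell shows the effect for the example quoted in those comments:

len("everywhere—assassin")            # 19: counted as a single, unusually long "word"
len("everywhere"), len("assassin")    # (10, 8): the lengths the curve should actually reflect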
Showing 2 changed files with 59 additions and 1 deletion:
stylometry_code.py  (+59, −1)
throne_of_glass/freq_distribution/all_canon_token_len.png  (+0, −0)
stylometry_code.py  +59 −1
@@ -43,16 +43,74 @@ def read_works_into_string(directory_path):
tokens = word_tokenize(read_works_into_string("throne_of_glass/data/canon_works"))
cleaned_tokens = ([token for token in tokens if any(c.isalpha() for c in token)])
short_clean_tokens = []
# when looking at the results, there were some strange token lengths, because somewhere in the data conversion hyphens
# had been added in the wrong places. I printed the tokens with very large lengths and they had this format, e.g. "everywhere—assassin",
# which was counted as 19 characters long; some were up to 45 characters long: "walking-as-fast-as-they-could-without-running"
"""
for token in cleaned_tokens:
dehyphenated_token = []
letter_present = 0
if len(token) >= 19:
for c in token:
if c.isalpha() == True:
dehyphenated_token.append(c)
letter_present = 1
#print(dehyphenated_token)
elif c.isalpha() == False and (c ==
"
-
"
or c ==
"
—
"
) and letter_present == 1: #here I am eliminating both dashes and hyphens,
#bc the hyphens are used both correctly and incorrectly and it skews my distribution a lot
#print(dehyphenated_token)
dehyphenated_token_joined =
''
.join(map(str, dehyphenated_token))
#print(dehyphenated_token_joined)
short_clean_tokens.append(dehyphenated_token_joined)
dehyphenated_token = []
letter_present = 0
elif len(token) >= 14:
for c in token:
if c.isalpha() == True:
dehyphenated_token.append(c)
letter_present = 1
#print(dehyphenated_token)
elif c ==
"
—
"
and letter_present == 1: #here I am eliminating only dashes
"
territory—thanks
"
but keeping hyphenated
# words as one
"
cobbled-together
"
#print(dehyphenated_token)
dehyphenated_token_joined =
''
.join(map(str, dehyphenated_token))
#print(dehyphenated_token_joined)
short_clean_tokens.append(dehyphenated_token_joined)
dehyphenated_token = []
letter_present = 0
else:
short_clean_tokens.append(token)
"""
for token in cleaned_tokens:
    dehyphenated_token = []
    letter_present = 0
    for c in token:
        if c.isalpha() == True:
            dehyphenated_token.append(c)
            letter_present = 1
        elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
            #bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
            # high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
            # relatively speaking
            dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
            #print(dehyphenated_token_joined)
            short_clean_tokens.append(dehyphenated_token_joined)
            dehyphenated_token = []
            letter_present = 0
# distribution of token lengths / Mendenhall curve
-token_lengths = [len(token) for token in cleaned_tokens]
+token_lengths = [len(token) for token in short_clean_tokens]
token_length_distribution = FreqDist(token_lengths)
print(token_length_distribution.tabulate())
token_length_freq_dist_plot = token_length_distribution.plot(title="Token Length Frequency Distribution: Throne of Glass Series", percents=True)
fig_freq_dist = token_length_freq_dist_plot.get_figure()
fig_freq_dist.savefig("throne_of_glass/freq_distribution/all_canon_token_len.png")
for token in short_clean_tokens:
    if len(token) >= 14:
        print(f"this is the word: {token} and it's this long {len(token)}")
#print(read_works_into_string("throne_of_glass/data/canon_works"))
# transform corpus into a list of tokens
\ No newline at end of file
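One observation on the new character-level loop in this hunk: as reconstructed here, it appends a segment to short_clean_tokens only when a non-alphabetic character follows it, so the piece after the last separator, and any token consisting entirely of letters, is never flushed. The sketch below is an editorial illustration rather than part of the commit; it reuses the cleaned_tokens list from the hunk above, relies only on the standard library, and keeps every alphabetic run while splitting on exactly the characters that c.isalpha() rejects.

from itertools import groupby

# Editorial sketch, not the committed code: keep every run of alphabetic
# characters in each token, so "everywhere—assassin" contributes both
# "everywhere" and "assassin", and an unhyphenated word passes through intact.
short_clean_tokens = []
for token in cleaned_tokens:
    for is_alpha, run in groupby(token, key=str.isalpha):
        if is_alpha:
            short_clean_tokens.append("".join(run))

Downstream, the Mendenhall curve is computed exactly as in the hunk above; the only difference is which word pieces end up in short_clean_tokens.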
throne_of_glass/freq_distribution/all_canon_token_len.png  +0 −0
Replaced binary image: 35.3 KiB (previous version, 1617381c) → 33 KiB (this commit, 8f71a165)