diff --git a/stylometry_code.py b/stylometry_code.py
index 29ea41faf1883ba1db8b7b65ee0eaab86771bbab..50e07ce78a7536ff3b54f50e6e7d21247144d443 100644
--- a/stylometry_code.py
+++ b/stylometry_code.py
@@ -43,16 +43,74 @@ def read_works_into_string(directory_path):
 tokens = word_tokenize(read_works_into_string("throne_of_glass/data/canon_works"))
 cleaned_tokens = ([token for token in tokens if any(c.isalpha() for c in token)])
+short_clean_tokens = []
+# When looking at the results, there were some strange token lengths, because somewhere in the data
+# conversion hyphens had been added in the wrong places. Printing the very long tokens showed this
+# format, e.g. "everywhere—assassin", counted here as 19 characters, and up to 45 characters:
+# "walking-as-fast-as-they-could-without-running".
+"""
+for token in cleaned_tokens:
+    dehyphenated_token = []
+    letter_present = 0
+    if len(token) >= 19:
+        for c in token:
+            if c.isalpha() == True:
+                dehyphenated_token.append(c)
+                letter_present = 1
+                #print(dehyphenated_token)
+            elif c.isalpha() == False and (c == "-" or c == "—") and letter_present == 1:
+                # here I am eliminating both dashes and hyphens, because the hyphens are used both
+                # correctly and incorrectly and they skew my distribution a lot
+                #print(dehyphenated_token)
+                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
+                #print(dehyphenated_token_joined)
+                short_clean_tokens.append(dehyphenated_token_joined)
+                dehyphenated_token = []
+                letter_present = 0
+    elif len(token) >= 14:
+        for c in token:
+            if c.isalpha() == True:
+                dehyphenated_token.append(c)
+                letter_present = 1
+                #print(dehyphenated_token)
+            elif c == "—" and letter_present == 1:
+                # here I am eliminating only dashes ("territory—thanks") but keeping hyphenated
+                # words as one ("cobbled-together")
+                #print(dehyphenated_token)
+                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
+                #print(dehyphenated_token_joined)
+                short_clean_tokens.append(dehyphenated_token_joined)
+                dehyphenated_token = []
+                letter_present = 0
+    else:
+        short_clean_tokens.append(token)
+"""
+for token in cleaned_tokens:
+    dehyphenated_token = []
+    letter_present = 0
+    for c in token:
+        if c.isalpha():
+            dehyphenated_token.append(c)
+            letter_present = 1
+        elif letter_present == 1:
+            # c is not alphabetic here: eliminate both dashes and hyphens, because counting
+            # "red-blue" as one 9-character token boosts the count of high-character tokens
+            # significantly and skews the word-length metric. All texts will be preprocessed
+            # the same way, so, relatively speaking, it shouldn't make a difference.
+            dehyphenated_token_joined = ''.join(dehyphenated_token)
+            short_clean_tokens.append(dehyphenated_token_joined)
+            dehyphenated_token = []
+            letter_present = 0
+    if letter_present == 1:
+        # append the trailing (or only) alphabetic run, so plain unhyphenated words are kept too
+        short_clean_tokens.append(''.join(dehyphenated_token))
 
 # distribution of token lengths / Mendenhall curve
-token_lengths = [len(token) for token in cleaned_tokens]
+token_lengths = [len(token) for token in short_clean_tokens]
 token_length_distribution = FreqDist(token_lengths)
+token_length_distribution.tabulate()  # tabulate() prints the table itself and returns None
 
 token_length_freq_dist_plot = token_length_distribution.plot(title="Token Length Frequency Distribution: Throne of Glass Series", percents=True)
 fig_freq_dist = token_length_freq_dist_plot.get_figure()
 fig_freq_dist.savefig("throne_of_glass/freq_distribution/all_canon_token_len.png")
 
+for token in short_clean_tokens:
+    if len(token) >= 14:
+        print(f"this is the word: {token} and it is {len(token)} characters long")
+
 #print(read_works_into_string("throne_of_glass/data/canon_works")) # transform corpus into a list of tokens
\ No newline at end of file
diff --git a/throne_of_glass/freq_distribution/all_canon_token_len.png b/throne_of_glass/freq_distribution/all_canon_token_len.png
index 1adb975cf5429222189e46034e7ba76635c16ab6..75f317bdf53afca4289b1c029d38328ddb67f080 100644
Binary files a/throne_of_glass/freq_distribution/all_canon_token_len.png and b/throne_of_glass/freq_distribution/all_canon_token_len.png differ
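
Note on the cleaning step in the patch above: below is a minimal, standalone sketch of the behaviour the new loop implements, assuming NLTK is installed. The helper name split_on_nonalpha and the sample tokens are invented for illustration and are not part of stylometry_code.py; they only show how a token such as "everywhere—assassin" ends up as two short tokens before the Mendenhall length distribution is built.

from nltk import FreqDist

def split_on_nonalpha(token):
    """Break a token into its alphabetic runs, dropping hyphens, dashes, apostrophes, etc."""
    runs, current = [], []
    for c in token:
        if c.isalpha():
            current.append(c)
        elif current:
            runs.append(''.join(current))
            current = []
    if current:
        runs.append(''.join(current))
    return runs

sample_tokens = ["everywhere—assassin", "cobbled-together", "assassin", "n't"]
short_clean = [piece for tok in sample_tokens for piece in split_on_nonalpha(tok)]
print(short_clean)   # ['everywhere', 'assassin', 'cobbled', 'together', 'assassin', 'n', 't']

# Mendenhall-style token-length distribution on the toy data
FreqDist(len(tok) for tok in short_clean).tabulate()

Splitting on every non-alphabetic character also breaks contractions like "n't" into single letters, which is worth keeping in mind when reading the low end of the curve; since all texts in the comparison go through the same preprocessing, the relative shape of the distribution should be unaffected.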