diff --git a/stylometry_code.py b/stylometry_code.py
index 29ea41faf1883ba1db8b7b65ee0eaab86771bbab..50e07ce78a7536ff3b54f50e6e7d21247144d443 100644
--- a/stylometry_code.py
+++ b/stylometry_code.py
@@ -43,16 +43,74 @@ def read_works_into_string(directory_path):
 tokens = word_tokenize(read_works_into_string("throne_of_glass/data/canon_works"))
 cleaned_tokens = ([token for token in tokens if any(c.isalpha() for c in token)])
+short_clean_tokens = []
+# When looking at the results, there were some strange token lengths, because somewhere in the data
+# conversion hyphens had been added in the wrong places. Printing the very long tokens showed this
+# format, e.g. "everywhere—assassin", counted here as 19 characters, and up to 45 characters:
+# "walking-as-fast-as-they-could-without-running".
+"""
+for token in cleaned_tokens:
+    dehyphenated_token = []
+    letter_present = 0
+    if len(token) >= 19:
+        for c in token:
+            if c.isalpha() == True:
+                dehyphenated_token.append(c)
+                letter_present = 1
+                #print(dehyphenated_token)
+            elif c.isalpha() == False and (c == "-" or c == "—") and letter_present == 1:
+                # here I am eliminating both dashes and hyphens, because the hyphens are used both
+                # correctly and incorrectly and they skew my distribution a lot
+                #print(dehyphenated_token)
+                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
+                #print(dehyphenated_token_joined)
+                short_clean_tokens.append(dehyphenated_token_joined)
+                dehyphenated_token = []
+                letter_present = 0
+    elif len(token) >= 14:
+        for c in token:
+            if c.isalpha() == True:
+                dehyphenated_token.append(c)
+                letter_present = 1
+                #print(dehyphenated_token)
+            elif c == "—" and letter_present == 1:
+                # here I am eliminating only dashes ("territory—thanks") but keeping hyphenated
+                # words as one ("cobbled-together")
+                #print(dehyphenated_token)
+                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
+                #print(dehyphenated_token_joined)
+                short_clean_tokens.append(dehyphenated_token_joined)
+                dehyphenated_token = []
+                letter_present = 0
+    else:
+        short_clean_tokens.append(token)
+"""
+for token in cleaned_tokens:
+    dehyphenated_token = []
+    letter_present = 0
+    for c in token:
+        if c.isalpha():
+            dehyphenated_token.append(c)
+            letter_present = 1
+        elif letter_present == 1:
+            # c is not alphabetic here: eliminate both dashes and hyphens, because counting
+            # "red-blue" as one 9-character token boosts the count of high-character tokens
+            # significantly and skews the word-length metric. All texts will be preprocessed
+            # the same way, so, relatively speaking, it shouldn't make a difference.
+            dehyphenated_token_joined = ''.join(dehyphenated_token)
+            short_clean_tokens.append(dehyphenated_token_joined)
+            dehyphenated_token = []
+            letter_present = 0
+    if letter_present == 1:
+        # append the trailing (or only) alphabetic run, so plain unhyphenated words are kept too
+        short_clean_tokens.append(''.join(dehyphenated_token))
 
 # distribution of token lengths / Mendenhall curve
-token_lengths = [len(token) for token in cleaned_tokens]
+token_lengths = [len(token) for token in short_clean_tokens]
 token_length_distribution = FreqDist(token_lengths)
+token_length_distribution.tabulate()  # tabulate() prints the table itself and returns None
 
 token_length_freq_dist_plot = token_length_distribution.plot(title="Token Length Frequency Distribution: Throne of Glass Series", percents=True)
 fig_freq_dist = token_length_freq_dist_plot.get_figure()
 fig_freq_dist.savefig("throne_of_glass/freq_distribution/all_canon_token_len.png")
 
+for token in short_clean_tokens:
+    if len(token) >= 14:
+        print(f"this is the word: {token} and it is {len(token)} characters long")
+
 #print(read_works_into_string("throne_of_glass/data/canon_works")) # transform corpus into a list of tokens
\ No newline at end of file
diff --git a/throne_of_glass/freq_distribution/all_canon_token_len.png b/throne_of_glass/freq_distribution/all_canon_token_len.png
index 1adb975cf5429222189e46034e7ba76635c16ab6..75f317bdf53afca4289b1c029d38328ddb67f080 100644
Binary files a/throne_of_glass/freq_distribution/all_canon_token_len.png and b/throne_of_glass/freq_distribution/all_canon_token_len.png differ
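
Note on the cleaning step in the patch above: below is a minimal, standalone sketch of the behaviour the new loop implements, assuming NLTK is installed. The helper name split_on_nonalpha and the sample tokens are invented for illustration and are not part of stylometry_code.py; they only show how a token such as "everywhere—assassin" ends up as two short tokens before the Mendenhall length distribution is built.

from nltk import FreqDist

def split_on_nonalpha(token):
    """Break a token into its alphabetic runs, dropping hyphens, dashes, apostrophes, etc."""
    runs, current = [], []
    for c in token:
        if c.isalpha():
            current.append(c)
        elif current:
            runs.append(''.join(current))
            current = []
    if current:
        runs.append(''.join(current))
    return runs

sample_tokens = ["everywhere—assassin", "cobbled-together", "assassin", "n't"]
short_clean = [piece for tok in sample_tokens for piece in split_on_nonalpha(tok)]
print(short_clean)   # ['everywhere', 'assassin', 'cobbled', 'together', 'assassin', 'n', 't']

# Mendenhall-style token-length distribution on the toy data
FreqDist(len(tok) for tok in short_clean).tabulate()

Splitting on every non-alphabetic character also breaks contractions like "n't" into single letters, which is worth keeping in mind when reading the low end of the curve; since all texts in the comparison go through the same preprocessing, the relative shape of the distribution should be unaffected.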