Commit 8f71a165 authored by chrysanthopoulou's avatar chrysanthopoulou

Remove the hyphens to unskew the Mendenhall curve

parent 1617381c
@@ -43,16 +43,74 @@ def read_works_into_string(directory_path):
tokens = word_tokenize(read_works_into_string("throne_of_glass/data/canon_works"))
cleaned_tokens = [token for token in tokens if any(c.isalpha() for c in token)]
short_clean_tokens = [] # when looking at the results, there were some strange token lengths, because somewhere in the data conversion
# hyphens had been added in the wrong places. I printed the tokens with very large lengths and they had this format, e.g. "everywhere—assassin",
# counted in this instance as 19 characters long, and up to 45 characters long: "walking-as-fast-as-they-could-without-running"
"""
for token in cleaned_tokens:
dehyphenated_token = []
letter_present = 0
if len(token) >= 19:
for c in token:
if c.isalpha() == True:
dehyphenated_token.append(c)
letter_present = 1
#print(dehyphenated_token)
elif c.isalpha() == False and (c == "-" or c == "") and letter_present == 1: #here I am eliminating both dashes and hyphens,
#bc the hyphens are used both correctly and incorrectly and it skews my distribution a lot
#print(dehyphenated_token)
dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
#print(dehyphenated_token_joined)
short_clean_tokens.append(dehyphenated_token_joined)
dehyphenated_token = []
letter_present = 0
elif len(token) >= 14:
for c in token:
if c.isalpha() == True:
dehyphenated_token.append(c)
letter_present = 1
#print(dehyphenated_token)
elif c == "" and letter_present == 1: #here I am eliminating only dashes "territory—thanks" but keeping hyphenated
# words as one "cobbled-together"
#print(dehyphenated_token)
dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
#print(dehyphenated_token_joined)
short_clean_tokens.append(dehyphenated_token_joined)
dehyphenated_token = []
letter_present = 0
else:
short_clean_tokens.append(token)
"""
for token in cleaned_tokens:
    dehyphenated_token = []
    letter_present = 0
    for c in token:
        if c.isalpha():
            dehyphenated_token.append(c)
            letter_present = 1
        elif letter_present == 1: # here I am eliminating both dashes and hyphens,
            # bc it skews the word metric if red-blue is counted as one 8 character token, boosting the count of
            # high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't
            # make a difference, relatively speaking
            short_clean_tokens.append(''.join(dehyphenated_token))
            dehyphenated_token = []
            letter_present = 0
    if dehyphenated_token: # keep the final piece, otherwise tokens that end in a letter are dropped entirely
        short_clean_tokens.append(''.join(dehyphenated_token))
# distribution of token lengths / Mendenhall curve
-token_lengths = [len(token) for token in cleaned_tokens]
+token_lengths = [len(token) for token in short_clean_tokens]
token_length_distribution = FreqDist(token_lengths)
token_length_distribution.tabulate() # tabulate() prints the table itself and returns None
token_length_freq_dist_plot = token_length_distribution.plot(title="Token Length Frequency Distribution: Throne of Glass Series", percents=True)
fig_freq_dist = token_length_freq_dist_plot.get_figure()
fig_freq_dist.savefig("throne_of_glass/freq_distribution/all_canon_token_len.png")
for token in short_clean_tokens:
    if len(token) >= 14:
        print(f"this is the word: {token} and it is {len(token)} characters long")
#print(read_works_into_string("throne_of_glass/data/canon_works"))
# transform corpus into a list of tokens
\ No newline at end of file
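
The character-by-character walk above does the same job as splitting each token on runs of non-letter characters. As a point of comparison, here is a minimal sketch of that approach using the standard-library re module; the function name dehyphenate is made up for illustration and is not part of this commit:

import re

def dehyphenate(tokens):
    # [^\W\d_] matches, to a close approximation, the characters str.isalpha()
    # accepts, so hyphens, em dashes and any other joiners act as split points
    short_clean = []
    for token in tokens:
        short_clean.extend(re.findall(r"[^\W\d_]+", token))
    return short_clean

print(dehyphenate(["red-blue", "cat", "walking-as-fast"]))
# -> ['red', 'blue', 'cat', 'walking', 'as', 'fast']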
throne_of_glass/freq_distribution/all_canon_token_len.png (regenerated plot: 35.3 KiB before, 33 KiB after)
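
Why removing the hyphens unskews the Mendenhall curve: a joined token like "territory—thanks" contributes a single length of 16 to the distribution instead of lengths 9 and 6, so every wrongly hyphenated token inflates the right tail of the plot. A toy illustration with made-up tokens, assuming nltk is installed:

from nltk import FreqDist

raw = ["red-blue", "cat", "runs", "territory—thanks"]          # as tokenized
split = ["red", "blue", "cat", "runs", "territory", "thanks"]  # after dehyphenation

FreqDist(len(t) for t in raw).tabulate()    # lengths 3, 4, 8, 16: long right tail
FreqDist(len(t) for t in split).tabulate()  # lengths 3 to 9 only: tail gone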