Commit 4349e916 authored by chrysanthopoulou

Update diagrams

parent af1dbc08
grishaverse/freq_distribution/all_canon_token_len.png (32.8 KiB)
@@ -4,10 +4,14 @@ from cycler import cycler
import os
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import pandas as pd
import statistics
# you'll also have to download "punkt" from nltk
# make the plots a bit less ugly
# code snippets for prettifying plots
#colours
CB91_Blue = '#2CBDFE'
CB91_Green = '#47DBCD'
@@ -26,6 +30,11 @@ cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)
# create function for bar (value) labels
def addlabels(x, y):
    for i in range(len(x)):
        plt.text(i, y[i], y[i], ha="center")
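# a minimal usage sketch (made-up values, just to illustrate): addlabels is meant to be used
# together with a bar plot drawn from the same x/y data, so each bar gets its value printed on it, e.g.
#   sns.barplot(x=[1, 2, 3], y=[0.2, 0.5, 0.3])
#   addlabels(x=[1, 2, 3], y=[0.2, 0.5, 0.3])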
# function compiling the works given into a single string. Input required:
# general path of the files as string, for example: "/throne_of_glass/data/canon_works/"
@@ -40,77 +49,79 @@ def read_works_into_string(directory_path):
        strings.append(f.read())
    return "\n".join(strings)
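# usage sketch: pass one of the project's data directories and it returns every work in that
# folder joined into one big string, e.g.
#   corpus = read_works_into_string("throne_of_glass/data/canon_works")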
# this function takes a corpus as its input and gives a Mendenhall curve, i.e. a frequency distribution of token lengths, as its output
# precise input: corpus = string;
# curve_title = string, the title of the plot that will be produced, e.g. "Mendenhall Curve for the Throne of Glass Series"
# plot_destination = string, the (relative) path, including the file name and .png extension, of the plot produced, e.g. "throne_of_glass/freq_distribution/all_canon_token_len.png"
tokens = word_tokenize(read_works_into_string("throne_of_glass/data/canon_works"))
cleaned_tokens = [token for token in tokens if any(c.isalpha() for c in token)]
short_clean_tokens = [] # when looking at the results, there were some strange token lengths, because somewhere in the data conversion hyphens
# had been added in the wrong places. I had the tokens with very large lengths printed and they had this format, e.g. "everywhere—assassin",
# which was counted as 19 characters long; some were up to 45 characters long: "walking-as-fast-as-they-could-without-running"
"""
for token in cleaned_tokens:
    dehyphenated_token = []
    letter_present = 0
    if len(token) >= 19:
        for c in token:
            if c.isalpha() == True:
                dehyphenated_token.append(c)
                letter_present = 1
                #print(dehyphenated_token)
            elif c.isalpha() == False and (c == "-" or c == "—") and letter_present == 1: #here I am eliminating both dashes and hyphens,
                #bc the hyphens are used both correctly and incorrectly and it skews my distribution a lot
                #print(dehyphenated_token)
                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
                #print(dehyphenated_token_joined)
                short_clean_tokens.append(dehyphenated_token_joined)
                dehyphenated_token = []
                letter_present = 0
    elif len(token) >= 14:
def mendenhall_curve(corpus, curve_title, plot_destination):
    tokens = word_tokenize(corpus)
    cleaned_tokens = [token for token in tokens if any(c.isalpha() for c in token)]
    short_clean_tokens = [] # when looking at the results, there were some strange token lengths, because somewhere in the data conversion hyphens
    # had been added in the wrong places. I had the tokens with very large lengths printed and they had this format, e.g. "everywhere—assassin",
    # which was counted as 19 characters long; some were up to 45 characters long: "walking-as-fast-as-they-could-without-running"
    for token in cleaned_tokens:
        dehyphenated_token = []
        letter_present = 0
        for c in token:
            if c.isalpha() == True:
                dehyphenated_token.append(c)
                letter_present = 1
                #print(dehyphenated_token)
            elif c == "—" and letter_present == 1: #here I am eliminating only dashes "territory—thanks" but keeping hyphenated
                # words as one "cobbled-together"
                #print(dehyphenated_token)
            elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
                #bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
                # high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
                # relatively speaking
                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
                #print(dehyphenated_token_joined)
                short_clean_tokens.append(dehyphenated_token_joined)
                dehyphenated_token = []
                letter_present = 0
            else:
                short_clean_tokens.append(token)
    """
    for token in cleaned_tokens:
        dehyphenated_token = []
        letter_present = 0
        for c in token:
            if c.isalpha() == True:
                dehyphenated_token.append(c)
                letter_present = 1
            elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
                #bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
                # high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
                # relatively speaking
                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
                #print(dehyphenated_token_joined)
                short_clean_tokens.append(dehyphenated_token_joined)
                dehyphenated_token = []
                letter_present = 0
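    # nb: as the loop stands here, a chunk of letters is only appended when a non-letter follows it,
    # so a purely alphabetic token (and the part after the last dash) never reaches short_clean_tokens;
    # if that is not intended, a guarded append after the inner loop would keep it
    # (a sketch only, not part of the committed code):
    #     if dehyphenated_token:
    #         short_clean_tokens.append(''.join(dehyphenated_token))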
# distribution of token lengths / Mendenhall curve
token_lengths = [len(token) for token in short_clean_tokens]
token_length_distribution = FreqDist(token_lengths)
print(token_length_distribution.tabulate())
token_length_freq_dist_plot = token_length_distribution.plot(title="Token Length Frequency Distribution: Throne of Glass Series", percents=True)
fig_freq_dist = token_length_freq_dist_plot.get_figure()
fig_freq_dist.savefig("throne_of_glass/freq_distribution/all_canon_token_len.png")
for token in short_clean_tokens:
    if len(token) >= 14:
        print(f"this is the word: {token} and it's this long {len(token)}")
#print(read_works_into_string("throne_of_glass/data/canon_works"))
# transform corpus into a list of tokens
\ No newline at end of file
    # create the distribution of token lengths / Mendenhall curve
    token_lengths = [len(token) for token in short_clean_tokens]
    token_length_distribution = FreqDist(token_lengths)
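    # a made-up mini example, just to illustrate the data structure: if short_clean_tokens were
    # ["the", "raven", "cycle"], token_lengths would be [3, 5, 5] and FreqDist would give
    # {3: 1, 5: 2}, i.e. token length -> number of occurrences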
    # convert the FreqDist object to a pandas Series for easier processing
    token_len_dist_panda = pd.Series(dict(token_length_distribution))
    # sort, normalise and round the pandas Series
    new_token_len_dist = token_len_dist_panda.sort_index()
    for i in range(len(new_token_len_dist.index)):
        #for index in new_token_len_dist.index:
        new_token_len_dist.iat[i] = round(new_token_len_dist.iat[i]/len(short_clean_tokens), 2) # iat[i] counts positions from zero; the Series index holds the word lengths themselves
        #if float(new_token_len_dist.iat[i]) == 0.00:
        #    new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
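    # the loop above could equally be written as one vectorised line (same result, sketch only):
    #     new_token_len_dist = (token_len_dist_panda.sort_index() / len(short_clean_tokens)).round(2)
    # dividing the whole Series also returns floats directly, instead of writing floats back
    # into the integer counts one element at a time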
    # plot using matplotlib and seaborn
    # set figure, ax into variables
    fig, ax = plt.subplots(figsize=(10,10))
    # call function for bar (value) labels
    addlabels(x=new_token_len_dist.index, y=new_token_len_dist.values)
    plt.title(curve_title)
    ax.set_xlabel("Word Length")
    ax.set_ylabel("Percentage of Occurrence")
    sns.barplot(x=new_token_len_dist.index, y=new_token_len_dist.values, ax=ax, palette="flare")
    #plt.xticks(rotation=30) !!! very useful for words
    #plt.get_figure()
    plt.savefig(plot_destination)
    #print(new_token_len_dist.tabulate())
    #token_length_freq_dist_plot = token_length_distribution.plot(title=curve_title, percents=True)
    #fig_freq_dist = token_length_freq_dist_plot.get_figure()
    #fig_freq_dist.savefig(plot_destination)
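    # nb: since the function gets called once per series below, adding plt.close(fig) right after
    # plt.savefig(plot_destination) would keep the figures from accumulating in memory
    # (a small optional tweak, not part of the committed code)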
#create the Mendenhall Curve for the Throne of Glass Series
mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
#create the Mendenhall Curve for the Grishaverse Books
mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
throne_of_glass/freq_distribution/all_canon_token_len.png (33 KiB → 34.1 KiB)