Commit 4349e916 authored by chrysanthopoulou

Update diagrams

parent af1dbc08
grishaverse/freq_distribution/all_canon_token_len.png (updated diagram, 32.8 KiB)
@@ -4,10 +4,14 @@ from cycler import cycler
import os
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import pandas as pd
import statistics

# you'll also have to download "punkt" from nltk

# code snippets for prettifying plots
# colours
CB91_Blue = '#2CBDFE'
CB91_Green = '#47DBCD'
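The hunk does not show how these colour constants are actually applied to matplotlib. Purely as an illustration of the usual pattern with cycler (the colour list and the rcParams assignment below are assumptions, not necessarily what this script does further down):

import matplotlib.pyplot as plt
from cycler import cycler

CB91_Blue = '#2CBDFE'
CB91_Green = '#47DBCD'

# register the custom colours as matplotlib's default colour cycle
plt.rcParams['axes.prop_cycle'] = cycler(color=[CB91_Blue, CB91_Green])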
@@ -26,6 +30,11 @@ cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)

# create a function for bar (value) labels
def addlabels(x, y):
    for i in range(len(x)):
        plt.text(i, y[i], y[i], ha="center")

# function compiling the given works into a single string. Input required:
# general path of the files as a string, for example: "/throne_of_glass/data/canon_works/"
@@ -40,77 +49,79 @@ def read_works_into_string(directory_path):
            strings.append(f.read())
    return "\n".join(strings)
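Only the last two lines of read_works_into_string appear in this hunk; the rest is unchanged context. Purely as an illustrative sketch of a helper with this contract (the os.listdir loop, the sorted() call and the encoding argument are assumptions, not the repository's actual implementation):

def read_works_into_string(directory_path):
    # concatenate the text of every file in directory_path into one string
    strings = []
    for file_name in sorted(os.listdir(directory_path)):
        with open(os.path.join(directory_path, file_name), "r", encoding="utf-8") as f:
            strings.append(f.read())
    return "\n".join(strings)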
# this function takes a corpus as its input and gives a Mendenhall curve, i.e. a frequency distribution of token lengths, as its output
# precise input: corpus = string;
# curve_title = string, the title of the plot that will be produced, e.g. "Mendenhall Curve for the Throne of Glass Series"
# plot_destination = string, the (relative) path, including the file name and .png extension, of the plot produced, e.g. "throne_of_glass/freq_distribution/all_canon_token_len.png"

def mendenhall_curve(corpus, curve_title, plot_destination):
    tokens = word_tokenize(corpus)
    cleaned_tokens = [token for token in tokens if any(c.isalpha() for c in token)]

    # when looking at the results, there were some strange token lengths, because somewhere in the data conversion
    # hyphens had been added in the wrong places. I had the tokens with very large lengths printed and they had this
    # format, e.g. "everywhere—assassin", counted here as 19 characters long, with some tokens up to 45 characters
    # long: "walking-as-fast-as-they-could-without-running"
    short_clean_tokens = []

    for token in cleaned_tokens:
        dehyphenated_token = []
        letter_present = 0
        for c in token:
            if c.isalpha():
                dehyphenated_token.append(c)
                letter_present = 1
            elif letter_present == 1:
                # here I am eliminating both dashes and hyphens, because counting "red-blue" as a single 8-character
                # token skews the word-length metric by boosting the share of long tokens. All texts are preprocessed
                # the same way, so relatively speaking it shouldn't make a difference.
                # (a standalone illustration of this splitting pass follows after the function)
                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
                short_clean_tokens.append(dehyphenated_token_joined)
                dehyphenated_token = []
                letter_present = 0
        # flush whatever is left once the token ends in a letter; without this, tokens that
        # contain no trailing non-letter character would never be appended at all
        if letter_present == 1:
            short_clean_tokens.append(''.join(dehyphenated_token))

    # create the distribution of token lengths / Mendenhall curve
    token_lengths = [len(token) for token in short_clean_tokens]
    token_length_distribution = FreqDist(token_lengths)

    # convert the FreqDist object to a pandas Series for easier processing
    token_len_dist_panda = pd.Series(dict(token_length_distribution))

    # sort, normalise and round the Series
    new_token_len_dist = token_len_dist_panda.sort_index()
    for i in range(len(new_token_len_dist.index)):
        new_token_len_dist.iat[i] = round(new_token_len_dist.iat[i] / len(short_clean_tokens), 2)
        # if float(new_token_len_dist.iat[i]) == 0.00:
        #     new_token_len_dist.drop(index=i)  # the word length itself is used as the bar label, so we want the index, not i; bad work-around, I'm sorry

    # plot using matplotlib and seaborn
    fig, ax = plt.subplots(figsize=(10, 10))

    # call the function for bar (value) labels
    addlabels(x=new_token_len_dist.index, y=new_token_len_dist.values)

    plt.title(curve_title)
    ax.set_xlabel("Word Length")
    ax.set_ylabel("Percentage of Occurrence")

    sns.barplot(x=new_token_len_dist.index, y=new_token_len_dist.values, ax=ax, palette="flare")
    # plt.xticks(rotation=30)  !!! very useful for words
    plt.savefig(plot_destination)

    # earlier version, plotting straight from the FreqDist:
    # token_length_freq_dist_plot = token_length_distribution.plot(title=curve_title, percents=True)
    # fig_freq_dist = token_length_freq_dist_plot.get_figure()
    # fig_freq_dist.savefig(plot_destination)
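As a standalone illustration (not part of the committed script) of how the splitting pass inside mendenhall_curve treats the problem tokens mentioned in the comments, the same accumulate-and-flush loop breaks them into their alphabetic pieces:

def split_on_non_alpha(token):
    # same logic as the loop in mendenhall_curve: accumulate letters and
    # flush a piece whenever a non-letter character follows a letter
    pieces, current = [], []
    for c in token:
        if c.isalpha():
            current.append(c)
        elif current:
            pieces.append(''.join(current))
            current = []
    if current:
        pieces.append(''.join(current))
    return pieces

print(split_on_non_alpha("everywhere—assassin"))
# ['everywhere', 'assassin']
print(split_on_non_alpha("walking-as-fast-as-they-could-without-running"))
# ['walking', 'as', 'fast', 'as', 'they', 'could', 'without', 'running']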
# create the Mendenhall curve for the Throne of Glass series
mendenhall_curve(read_works_into_string("throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", "throne_of_glass/freq_distribution/all_canon_token_len.png")

# create the Mendenhall curve for the Grishaverse books
mendenhall_curve(read_works_into_string("grishaverse/data/canon_works"), "Mendenhall Curve for Grishaverse Books", "grishaverse/freq_distribution/all_canon_token_len.png")
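To make the sort/normalise/round step concrete, here is a tiny worked example with made-up counts: a FreqDist of token lengths is turned into a pandas Series whose values are the share of tokens of each length, which is what the bar plot displays.

from nltk.probability import FreqDist
import pandas as pd

token_lengths = [3, 4, 4, 5, 3, 3, 7]
dist = FreqDist(token_lengths)                    # {3: 3, 4: 2, 5: 1, 7: 1}
series = pd.Series(dict(dist)).sort_index()       # index = word length, value = count
series = (series / len(token_lengths)).round(2)   # normalise counts to proportions
print(series)
# 3    0.43
# 4    0.29
# 5    0.14
# 7    0.14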
throne_of_glass/freq_distribution/all_canon_token_len.png (updated diagram, 33 KiB → 34.1 KiB)