diff --git a/grishaverse/freq_distribution/all_canon_token_len.png b/grishaverse/freq_distribution/all_canon_token_len.png
new file mode 100644
index 0000000000000000000000000000000000000000..82160fbeee28c747bc22192876dc746d2d736d0f
Binary files /dev/null and b/grishaverse/freq_distribution/all_canon_token_len.png differ
diff --git a/stylometry_code.py b/stylometry_code.py
index 50e07ce78a7536ff3b54f50e6e7d21247144d443..1c8f2a186949b994b3cab04dc8e6691452d7d09d 100644
--- a/stylometry_code.py
+++ b/stylometry_code.py
@@ -4,10 +4,14 @@ from cycler import cycler
 import os
 from nltk.tokenize import word_tokenize
 from nltk.probability import FreqDist
+import pandas as pd
+import statistics
 
 # you'll have to also download "punkt" from nltk
 
-#make the plots a bit less ugly
+# code snippets for prettifying plots
+
+#colours
 
 CB91_Blue = '#2CBDFE'
 CB91_Green = '#47DBCD'
@@ -26,6 +30,11 @@ cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
 cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
 cm2 = sns.cubehelix_palette(as_cmap=True)
 
+# create function for bar (value) labels
+def addlabels(x,y):
+    for i in range(len(x)):
+        plt.text(i, y[i], y[i], ha = "center")
+
 # function compiling the works given into a single string. Input required:
 # general path of the files as string, for example: "/throne_of_glass/data/canon_works/"
 
@@ -40,77 +49,79 @@ def read_works_into_string(directory_path):
             strings.append(f.read())
     return "\n".join(strings)
 
+# this function takes a corpus as its input and gives a Mendenhall curve, i.e. a frequency distribution of token lengths, as its output
+# precise input: corpus = string;
+# curve_title = string, the title of the plot that will be produced, e.g., "Mendenhall Curve for Throne of Glass Series"
+# plot_destination = string, the (relative) path, including the file name and .png extension of the plot produced, e.g. f"throne_of_glass/freq_distribution/all_canon_token_len.png"
 
-tokens = word_tokenize(read_works_into_string("throne_of_glass/data/canon_works"))
-cleaned_tokens = ([token for token in tokens if any(c.isalpha() for c in token)])
-short_clean_tokens = [] # when looking at the results, there were some strange token lengths, because somewhere in the data conversion hyphens
-# had been added in the wrong places. I had the tokens with very large lengths printed and they had this format, e.g. "everywhere—assassin"
-# and where counted, in this instance as 19 characters long but up to 45 characters long: "walking-as-fast-as-they-could-without-running"
-"""
-for token in cleaned_tokens:
-    dehyphenated_token = []
-    letter_present = 0
-    if len(token) >= 19:
-        for c in token:
-            if c.isalpha() == True:
-                dehyphenated_token.append(c)
-                letter_present = 1
-                #print(dehyphenated_token)
-            elif c.isalpha() == False and (c == "-" or c == "—") and letter_present == 1: #here I am eliminating both dashes and hyphens,
-                #bc the hyphens are used both correctly and incorrectly and it skews my distribution a lot
-                #print(dehyphenated_token)
-                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
-                #print(dehyphenated_token_joined)
-                short_clean_tokens.append(dehyphenated_token_joined)
-                dehyphenated_token = []
-                letter_present = 0
-    elif len(token) >= 14:
+
+def mendenhall_curve(corpus, curve_title, plot_destination):
+    tokens = word_tokenize(corpus)
+    cleaned_tokens = ([token for token in tokens if any(c.isalpha() for c in token)])
+    short_clean_tokens = [] # when looking at the results, there were some strange token lengths, because somewhere in the data conversion hyphens
+    # had been added in the wrong places. I had the tokens with very large lengths printed and they had this format, e.g. "everywhere—assassin"
+    # and were counted, in this instance as 19 characters long, but up to 45 characters long: "walking-as-fast-as-they-could-without-running"
+
+    for token in cleaned_tokens:
+        dehyphenated_token = []
+        letter_present = 0
         for c in token:
             if c.isalpha() == True:
                 dehyphenated_token.append(c)
                 letter_present = 1
-                #print(dehyphenated_token)
-            elif c == "—" and letter_present == 1: #here I am eliminating only dashes "territory—thanks" but keeping hyphenated
-                # words as one "cobbled-together"
-                #print(dehyphenated_token)
+            elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
+                #bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
+                # high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
+                # relatively speaking
                 dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
                 #print(dehyphenated_token_joined)
                 short_clean_tokens.append(dehyphenated_token_joined)
                 dehyphenated_token = []
                 letter_present = 0
-    else:
-        short_clean_tokens.append(token)
-"""
-for token in cleaned_tokens:
-    dehyphenated_token = []
-    letter_present = 0
-    for c in token:
-        if c.isalpha() == True:
-            dehyphenated_token.append(c)
-            letter_present = 1
-        elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
-            #bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
-            # high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
-            # relatively speaking
-            dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
-            #print(dehyphenated_token_joined)
-            short_clean_tokens.append(dehyphenated_token_joined)
-            dehyphenated_token = []
-            letter_present = 0
-
-# distribution of token lengths / Mendenhall curve
-
-token_lengths = [len(token) for token in short_clean_tokens]
-token_length_distribution = FreqDist(token_lengths)
-print(token_length_distribution.tabulate())
-token_length_freq_dist_plot = token_length_distribution.plot(title="Token Length Frequency Distribution: Throne of Glass Series", percents=True)
-
-fig_freq_dist = token_length_freq_dist_plot.get_figure()
-fig_freq_dist.savefig("throne_of_glass/freq_distribution/all_canon_token_len.png")
-
-for token in short_clean_tokens:
-    if len(token)>= 14:
-        print(f"this is the word: {token} and it's this long {len(token)}")
-#print(read_works_into_string("throne_of_glass/data/canon_works"))
-
-# transform corpus into a list of tokens
\ No newline at end of file
+
+    # create the distribution of token lengths / Mendenhall curve
+
+    token_lengths = [len(token) for token in short_clean_tokens]
+    token_length_distribution = FreqDist(token_lengths)
+
+    # convert the FreqDist object to a pandas Series for easier processing
+    token_len_dist_panda = pd.Series(dict(token_length_distribution))
+
+    # sort, normalise and round the pandas Series
+
+    new_token_len_dist = token_len_dist_panda.sort_index()
+
+    for i in range(0, len(new_token_len_dist.index)):
+        #for index in new_token_len_dist.index:
+        new_token_len_dist.iat[i] = round(new_token_len_dist.iat[i]/len(short_clean_tokens), 2) # iat is positional and counts from zero, while the Series index holds the word lengths
+        #if float(new_token_len_dist.iat[i]) == 0.00:
+        #    new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
+
+
+    # plot using matplotlib and seaborn
+
+    # set figure, ax into variables
+    fig, ax = plt.subplots(figsize=(10,10))
+
+    # call function for bar (value) labels
+    addlabels(x=new_token_len_dist.index, y=new_token_len_dist.values)
+
+    plt.title(curve_title)
+    ax.set_xlabel("Word Length")
+    ax.set_ylabel("Percentage of Occurrence")
+
+    sns.barplot(x=new_token_len_dist.index, y=new_token_len_dist.values, ax=ax, palette="flare")
+    #plt.xticks(rotation=30) !!! very useful for words
+    #plt.get_figure()
+    plt.savefig(plot_destination)
+    #print(new_token_len_dist.tabulate())
+    #token_length_freq_dist_plot = token_length_distribution.plot(title=curve_title, percents=True)
+
+    #fig_freq_dist = token_length_freq_dist_plot.get_figure()
+    #fig_freq_dist.savefig(plot_destination)
+
+#create the Mendenhall Curve for the Throne of Glass Series
+mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
+
+#create the Mendenhall Curve for the Grishaverse Books
+mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
diff --git a/throne_of_glass/freq_distribution/all_canon_token_len.png b/throne_of_glass/freq_distribution/all_canon_token_len.png
index 75f317bdf53afca4289b1c029d38328ddb67f080..a9fd5ebf6f4271ce9854bc7e9a4b01963ebc92e0 100644
Binary files a/throne_of_glass/freq_distribution/all_canon_token_len.png and b/throne_of_glass/freq_distribution/all_canon_token_len.png differ
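A note on the normalisation step inside mendenhall_curve: the for-loop with iat divides each token-length count by the total number of tokens and rounds to two decimals, so the plotted values are proportions between 0 and 1 even though the y-axis is labelled as a percentage. The same result can be computed with one vectorised pandas expression. The sketch below is an illustration only and is not part of the patch above; the helper name relative_token_length_distribution is hypothetical, and it assumes the same pandas/nltk imports and a short_clean_tokens list like the one built inside mendenhall_curve.

    import pandas as pd
    from nltk.probability import FreqDist

    def relative_token_length_distribution(short_clean_tokens):
        # count how often each token length occurs, sort by length,
        # then turn the raw counts into rounded proportions
        length_counts = FreqDist(len(token) for token in short_clean_tokens)
        dist = pd.Series(dict(length_counts)).sort_index()
        return (dist / len(short_clean_tokens)).round(2)

    # tiny usage example: tokens of lengths 3, 8 and 6 each get a proportion of 0.33
    print(relative_token_length_distribution(["the", "assassin", "walked"]))

Multiplying these proportions by 100 (or relabelling the y-axis) would make the "Percentage of Occurrence" label match the plotted numbers.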