diff --git a/grishaverse/freq_distribution/all_canon_sent_len.png b/grishaverse/freq_distribution/all_canon_sent_len.png
new file mode 100644
index 0000000000000000000000000000000000000000..9d003763c78a0d013eb2a61b571b8a6f22c17c2c
Binary files /dev/null and b/grishaverse/freq_distribution/all_canon_sent_len.png differ
diff --git a/stylometry_code.py b/stylometry_code.py
index 061eb5462152a1b8a7faf47f7bcbd579013b45cc..30c2ec1f0e86b494c7e403d01927262ce040eee9 100644
--- a/stylometry_code.py
+++ b/stylometry_code.py
@@ -4,9 +4,11 @@
 from cycler import cycler
 import os
 from nltk.tokenize import word_tokenize
 from nltk.probability import FreqDist
+from nltk.tokenize import sent_tokenize
 import pandas as pd
 import statistics
+
 # you'll have to also download "punkt" from nltk
 
 # code snippets for prettifying plots
@@ -49,10 +51,6 @@ def read_works_into_string(directory_path):
             strings.append(f.read())
     return "\n".join(strings)
 
-# this function takes a corpus as its input and gives a Mendenhall curve, i.e. a frequency distribution of tokens as its output
-# precise input: corpus = string ;
-# curve_title = string, the title of the plot that will be produced, e.g., "Mendenhall Curve for Throne of Glass Series"
-# plot_destination = string, the (relative) path, including the file name and .png tag of the plot produced, e.g. f"throne_of_glass/freq_distribution/all_canon_token_len.png"
 
 # by subdiving the text into segments of 1000, it calculates the type token ratio for each segment and then averages over them
 # this ensures a comparability of the type token ratios for varying text sizes
@@ -78,7 +76,12 @@ def standardised_type_token_ratio(tokens):
         std_ttr = statistics.mean(ttrs)
     return std_ttr
 
-
+
+# this function takes a corpus as its input and gives a Mendenhall curve, i.e. a frequency distribution of tokens as its output
+# precise input: corpus = string ;
+# curve_title = string, the title of the plot that will be produced, e.g., "Mendenhall Curve for Throne of Glass Series"
+# plot_destination = string, the (relative) path, including the file name and .png tag of the plot produced, e.g. f"throne_of_glass/freq_distribution/all_canon_token_len.png"
+
 def mendenhall_curve(corpus, curve_title, plot_destination):
 
     tokens = word_tokenize(corpus)
@@ -154,11 +157,88 @@ def mendenhall_curve(corpus, curve_title, plot_destination):
 
     return standard_deviation, mean, type_token_ratio
 
+def sentence_metrics(corpus, curve_title, plot_destination):
+    sents = sent_tokenize(corpus)
+
+    sent_lens = []
+    for sent in sents:
+        tokens = word_tokenize(sent)
+
+        #cleaned_tokens = ([token for token in tokens if any(c.isalpha() for c in token)])
+        """
+        short_clean_tokens = [] # when looking at the results, there were some strange token lengths, because somewhere in the data conversion hyphens
+        # had been added in the wrong places. I had the tokens with very large lengths printed and they had this format, e.g. "everywhere—assassin"
+        # and were counted, in this instance as 19 characters long but up to 45 characters long: "walking-as-fast-as-they-could-without-running"
+        for token in cleaned_tokens:
+            dehyphenated_token = []
+            letter_present = 0
+            for c in token:
+                if c.isalpha() == True:
+                    dehyphenated_token.append(c)
+                    letter_present = 1
+                elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
+                    #bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
+                    # high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
+                    # relatively speaking
+                    dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
+                    #print(dehyphenated_token_joined)
+                    short_clean_tokens.append(dehyphenated_token_joined)
+                    dehyphenated_token = []
+                    letter_present = 0
+        """
+        sent_lens.append(len(tokens))
+
+
+    sent_len_dist = FreqDist(sent_lens).most_common(50)
+
+    # convert the FreqDist object to a pandas series for easier processing
+    sent_len_dist_panda = pd.Series(dict(sent_len_dist))
+
+    # sort, normalise and round the panda series
+
+    new_sent_len_dist = sent_len_dist_panda.sort_index()
+    print(new_sent_len_dist)
+
+    for i in range(0, len(new_sent_len_dist.index)):
+        #for index in new_token_len_dist.index:
+        new_sent_len_dist.iat[i] = round(new_sent_len_dist.iat[i]/len(sent_lens), 2) # normalise each sentence-length count by the total number of sentences
+
+    # plot using matplotlib and seaborn
+
+    # set figure, ax into variables
+    fig, ax = plt.subplots(figsize=(10,10))
+
+    # call function for bar (value) labels
+    #addlabels(x=new_sent_len_dist.index, y=new_sent_len_dist.values)
+
+    plt.title(curve_title)
+    ax.set_xlabel("Sentence Length")
+    ax.set_ylabel("Percentage of Occurrence")
+
+
+    sns.lineplot(x=new_sent_len_dist.index, y=new_sent_len_dist.values, ax=ax, palette="flare")
+    #plt.xticks(rotation=30) !!! very useful for words
+    plt.savefig(plot_destination)
+
+    # calculate the standard deviation and mean of the sentence lengths
+    standard_deviation_sent = statistics.stdev(sent_lens)
+    mean_sent = statistics.mean(sent_lens)
+
+    return standard_deviation_sent, mean_sent
+
+
 #create the Mendenhall Curve for the Throne of Glass Series
-std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
+#std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
 
 #create the Mendenhall Curve for the Grishaverse Books
-std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
+#std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
+
+# Mendenhall Curve Sentence Lengths for Throne of Glass Canon
+std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_sent_len.png")
+
+# Mendenhall Curve Sentence Lengths for Grishaverse Canon
+std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_sent_len.png")
+
 # create a dataframe to store all the overview statistics in
 # columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
@@ -167,5 +247,14 @@ std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_rati
 # tag_ngram_frequencies
 # punctuation frequencies
 # token/type ratio
-data_overview = pd.DataFrame({"mean_tokens":[mean_tokens_tog_canon, mean_tokens_grishaverse_canon], "std_dev":[std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon], "type_token_ratio":[type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon]}, index= ["throne_of_glass_canon", "grishaverse_canon"])
-data_overview.to_csv(f"data_overview/data_overview.csv")
\ No newline at end of file
+"""
+data_overview = pd.DataFrame(
+    {"mean_tokens":[mean_tokens_tog_canon, mean_tokens_grishaverse_canon],
+     "std_dev_tokens":[std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon],
+     "type_token_ratio":[type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon],
+     "mean_sent":[mean_sent_tog_canon, mean_sent_grishaverse_canon],
+     "std_dev_sent":[std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon]},
+    index= ["throne_of_glass_canon", "grishaverse_canon"]
+    )
+"""
+#data_overview.to_csv(f"data_overview/data_overview.csv")
\ No newline at end of file
diff --git a/throne_of_glass/freq_distribution/all_canon_sent_len.png b/throne_of_glass/freq_distribution/all_canon_sent_len.png
new file mode 100644
index 0000000000000000000000000000000000000000..13c68cc270455f7f12eba53092f86b88804cf899
Binary files /dev/null and b/throne_of_glass/freq_distribution/all_canon_sent_len.png differ
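
For illustration, here is a minimal, self-contained sketch of the sentence-length metric that the new sentence_metrics function computes: tokenise a text into sentences, measure each sentence's length in word tokens, normalise the resulting frequency distribution, and report the mean and standard deviation. The toy sample string and the names other than sent_lens are invented for this sketch; it assumes nltk with the "punkt" models downloaded and leaves out the seaborn/matplotlib plotting step.

import statistics

from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize, word_tokenize

# toy text standing in for read_works_into_string(...); any plain string works
sample = (
    "The ship creaked in the dark. "
    "She waited, counting every breath, until the watch changed at midnight. "
    "Then she moved."
)

# sentence lengths measured in word tokens, mirroring sentence_metrics above
sent_lens = [len(word_tokenize(sent)) for sent in sent_tokenize(sample)]

# frequency distribution of sentence lengths, normalised to proportions
sent_len_dist = FreqDist(sent_lens)
proportions = {length: round(count / len(sent_lens), 2)
               for length, count in sorted(sent_len_dist.items())}

print(proportions)                 # proportion of sentences at each token length
print(statistics.mean(sent_lens))  # mean sentence length
print(statistics.stdev(sent_lens)) # standard deviation of sentence lengths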