diff --git a/clean_stylometry.py b/clean_stylometry.py
new file mode 100644
index 0000000000000000000000000000000000000000..da317423a6874dd176cef546ab44ffb6ad8a48cd
--- /dev/null
+++ b/clean_stylometry.py
@@ -0,0 +1,579 @@
+import seaborn as sns
+import matplotlib.pyplot as plt
+from cycler import cycler
+import os
+from nltk.tokenize import word_tokenize
+from nltk.probability import FreqDist
+from nltk.tokenize import sent_tokenize
+from nltk.tag import pos_tag
+import pandas as pd
+import statistics
+import re
+
+# you'll also have to download "punkt" (and, for pos_tag, "averaged_perceptron_tagger") from nltk
+
+# create function for bar (value) labels
+def addlabels(x, y):
+    for i in range(len(x)):
+        plt.text(i, y[i], y[i], ha="center")
+
+# compiles the given works into a single string. Required input:
+# the path of the directory containing the works as a string,
+# for example: "throne_of_glass/data/canon_works/"
+def read_works_into_string(directory_path):
+    strings = []
+    works = os.listdir(directory_path)
+    for work in works:
+        with open(f"{directory_path}/{work}", "r") as f:
+            strings.append(f.read())
+    return "\n".join(strings)
+
+# by subdividing the text into segments of 1000 tokens, this calculates the type-token
+# ratio for each segment and then averages over them; this keeps the type-token ratio
+# comparable across texts of varying sizes
+def standardised_type_token_ratio(tokens):
+    ttrs = []
+    segment_tokens = []
+    segment = 0
+    for token in tokens:
+        if segment < 1000:
+            segment_tokens.append(token)
+            segment += 1
+        elif segment == 1000:
+            types = set(segment_tokens)
+            ttrs.append(len(types) / len(segment_tokens))
+            # start the next segment with the current token instead of dropping it
+            segment_tokens = [token]
+            segment = 1
+    if len(ttrs) <= 1:
+        types = set(tokens)
+        std_ttr = len(types) / len(tokens)
+        print("Warning: Text was too short for segmentation!")
+    else:
+        std_ttr = statistics.mean(ttrs)
+    return std_ttr
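+
+# A quick worked check of the standardisation (hypothetical tokens, not project
+# data): standardised_type_token_ratio(["the", "cat", "sat"] * 1000) closes two
+# full 1000-token segments, each with TTR 3/1000 = 0.003, and returns their mean
+# 0.003; a raw TTR over the whole text would instead shrink to 3/3000 = 0.001.
+# Raw TTR falls as texts grow, which is why the per-segment average is used.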
+
+def tokenize_and_clean_text(text):
+
+    tokens = word_tokenize(text)
+    cleaned_tokens = [token for token in tokens if any(c.isalpha() for c in token)]
+    # when looking at the results, there were some strange token lengths, because
+    # somewhere in the data conversion hyphens had been added in the wrong places.
+    # The tokens with very large lengths had this format, e.g. "everywhere—assassin",
+    # which was counted as 19 characters long, with tokens of up to 45 characters:
+    # "walking-as-fast-as-they-could-without-running"
+    short_clean_tokens = []
+
+    for token in cleaned_tokens:
+        dehyphenated_token = []
+        letter_present = 0
+        dehyphenated = 0
+        second_word_in_compound = 0
+        for c in token:
+            if c.isalpha():
+                dehyphenated_token.append(c)
+                letter_present = 1
+                if dehyphenated == 1:
+                    second_word_in_compound = 1
+            elif letter_present == 1:
+                # here I am eliminating both dashes and hyphens, bc it skews the word
+                # metric if red-blue is counted as a 9 character token, boosting the
+                # count of high-character tokens significantly. All texts will be
+                # preprocessed the same way, so it shouldn't make a difference,
+                # relatively speaking
+                dehyphenated_token_joined = ''.join(dehyphenated_token)
+                short_clean_tokens.append(dehyphenated_token_joined)
+                dehyphenated_token = []
+                letter_present = 0
+                dehyphenated = 1
+                second_word_in_compound = 0
+        if letter_present == 1 and dehyphenated == 0:
+            # catching the tokens that didn't have any special characters,
+            # without appending the dehyphenated ones twice
+            short_clean_tokens.append(token)
+        elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
+            short_clean_tokens.append(''.join(dehyphenated_token))
+    return short_clean_tokens
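+
+# Expected behaviour on a hypothetical input (not project data):
+# tokenize_and_clean_text("red-blue everywhere—assassin") should yield
+# ["red", "blue", "everywhere", "assassin"], i.e. compounds are split at the
+# hyphen/dash instead of being counted as one long token.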
+
+def mendenhall_token_metrics(tokens):
+    # create the distribution of token lengths / Mendenhall curve
+
+    token_lengths = [len(token) for token in tokens]
+
+    # calculate the trimmed token lengths (trimming 0.5%, i.e. 0.25% on each end).
+    # We need to remove the outliers, bc even despite preprocessing there still are
+    # some very wrong lengths, which entirely skews the metrics and also ruins our
+    # p-values later on
+    trim_percent = 0.005
+    trim_len = int(len(token_lengths) * trim_percent / 2)
+    token_lengths = sorted(token_lengths)[trim_len:-trim_len]
+
+    token_length_distribution = FreqDist(token_lengths).most_common(15)
+
+    # convert the FreqDist object to a pandas series for easier processing
+    token_len_dist_panda = pd.Series(dict(token_length_distribution))
+
+    # sort, normalise and round the panda series
+    new_token_len_dist = token_len_dist_panda.sort_index()
+
+    for i in range(len(new_token_len_dist.index)):
+        new_token_len_dist.iat[i] = round(new_token_len_dist.iat[i] / len(tokens), 3)
+
+    standard_deviation = statistics.stdev(token_lengths)
+    mean = statistics.mean(token_lengths)
+
+    return new_token_len_dist, standard_deviation, mean
+
+def plot_distribution(x, y, plt_title, file_path_for_pic: str, x_label="Number of Kudos", y_label="Percentage of Occurrence", palette="flare", plt_type="barplot", add_labels=True):
+
+    plt.figure(figsize=(10, 10))
+    plt.title(plt_title)
+    plt.xlabel(x_label)
+    plt.ylabel(y_label)
+
+    if add_labels:
+        addlabels(x=x.index, y=y.values)
+
+    # dispatch on the requested plot type (a plain if/elif chain; the earlier
+    # "match case:" sketch was not valid Python)
+    if plt_type == "barplot":
+        sns.barplot(x=x.index, y=y.values, palette=palette)
+    elif plt_type == "scatterplot":
+        sns.scatterplot(x=x.index, y=y.values, palette=palette)
+        #plt.xticks(x.index[::100], x.index[::100])
+    else:
+        sns.lineplot(x=x.index, y=y.values, palette=palette)
+
+    plt.savefig(file_path_for_pic)
+    plt.close()
+
+class StylometryMetrics:
+
+    def __init__(self, directory_path):
+        self.text = read_works_into_string(directory_path)
+        self.clean_tokens = tokenize_and_clean_text(self.text)
+
+    def calculate_standardised_ttr(self):
+        self.sttr = standardised_type_token_ratio(self.clean_tokens)
+
+    def calculate_mendenhall_token_metrics(self):
+        self.tk_len_dist, self.tk_len_std, self.tk_len_mean = mendenhall_token_metrics(self.clean_tokens)
+
+    def plot_token_metrics(self, plt_title, file_path_for_pic):
+        # the original stub ("def plot") was left unfinished; this is a sketch of
+        # the presumably intended hook: reuse plot_distribution on the token length
+        # distribution computed above (method name and signature are assumptions)
+        if not hasattr(self, "tk_len_dist"):
+            self.calculate_mendenhall_token_metrics()
+        plot_distribution(self.tk_len_dist, self.tk_len_dist, plt_title, file_path_for_pic,
+                          x_label="Word Length", y_label="Percentage of Occurrence")
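+
+# Hypothetical usage of the class-based interface (paths are placeholders):
+#     metrics = StylometryMetrics("throne_of_glass/data/canon_works")
+#     metrics.calculate_standardised_ttr()
+#     metrics.plot_token_metrics("Mendenhall Curve", "throne_of_glass/freq_distribution/all_canon_token_len.png")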
+
+# this function takes a corpus as its input and produces a Mendenhall curve,
+# i.e. a frequency distribution of token lengths, as its output. Precise input:
+# corpus = string;
+# curve_title = string, the title of the plot that will be produced, e.g. "Mendenhall Curve for Throne of Glass Series"
+# plot_destination = string, the (relative) path, including the file name and .png tag of the plot produced, e.g. "throne_of_glass/freq_distribution/all_canon_token_len.png"
+
+def mendenhall_curve(corpus, curve_title, plot_destination):
+
+    short_clean_tokens = tokenize_and_clean_text(corpus)
+
+    # create the distribution of token lengths / Mendenhall curve
+    token_lengths = [len(token) for token in short_clean_tokens]
+
+    # calculate the trimmed token lengths (trimming 0.5%, i.e. 0.25% on each end).
+    # We need to remove the outliers, bc even despite preprocessing there still are
+    # some very wrong lengths, which entirely skews the metrics and also ruins our
+    # p-values later on
+    trim_percent = 0.005
+    trim_len = int(len(token_lengths) * trim_percent / 2)
+    token_lengths = sorted(token_lengths)[trim_len:-trim_len]
+
+    token_length_distribution = FreqDist(token_lengths).most_common(15)
+
+    # convert the FreqDist object to a pandas series for easier processing
+    token_len_dist_panda = pd.Series(dict(token_length_distribution))
+
+    # sort, normalise and round the panda series
+    new_token_len_dist = token_len_dist_panda.sort_index()
+
+    for i in range(len(new_token_len_dist.index)):
+        new_token_len_dist.iat[i] = round(new_token_len_dist.iat[i] / len(short_clean_tokens), 3)
+
+    # plot using matplotlib and seaborn
+
+    # set figure, ax into variables
+    fig, ax = plt.subplots(figsize=(10, 10))
+
+    # call function for bar (value) labels
+    addlabels(x=new_token_len_dist.index, y=new_token_len_dist.values)
+
+    plt.title(curve_title)
+    ax.set_xlabel("Word Length")
+    ax.set_ylabel("Percentage of Occurrence")
+
+    sns.barplot(x=new_token_len_dist.index, y=new_token_len_dist.values, ax=ax, palette="flare")
+    #plt.xticks(rotation=30)  # very useful for rotating word labels
+    plt.savefig(plot_destination)
+    plt.close()
+
+    # calculate the standard deviation, mean, and standardised type-token ratio
+    standard_deviation = statistics.stdev(token_lengths)
+    mean = statistics.mean(token_lengths)
+
+    type_token_ratio = standardised_type_token_ratio(short_clean_tokens)
+
+    return standard_deviation, mean, type_token_ratio
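+
+# Worked check of the trimming above (illustrative numbers): with 200,000 token
+# lengths and trim_percent = 0.005, trim_len = int(200000 * 0.005 / 2) = 500, so
+# the 500 smallest and 500 largest lengths are dropped before the statistics are
+# computed; sentence_metrics below does the same with trim_percent = 0.05.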
+
+def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
+
+    sents = sent_tokenize(corpus)
+    sent_lens = []
+    for sent in sents:
+        short_clean_tokens = tokenize_and_clean_text(sent)
+        sent_lens.append(len(short_clean_tokens))
+        #if len(short_clean_tokens) >= 90:
+            #print(f"This sentence: \n {sent} \n is this long: {len(short_clean_tokens)}")
+
+    # calculate the trimmed mean sentence length (with 5% trimming). We need to
+    # remove the outliers, bc even despite preprocessing there still are some
+    # sentences that are 1200 tokens long, which entirely skews the metrics and
+    # also ruins our p-values later on
+    trim_percent = 0.05
+    trim_len = int(len(sent_lens) * trim_percent / 2)
+    sent_lens = sorted(sent_lens)[trim_len:-trim_len]
+
+    sent_len_dist = FreqDist(sent_lens)
+
+    # convert the FreqDist object to a pandas series for easier processing
+    sent_len_dist_panda = pd.Series(dict(sent_len_dist))
+
+    # sort, normalise and round the panda series
+    new_sent_len_dist = sent_len_dist_panda.sort_index()
+
+    for i in range(len(new_sent_len_dist.index)):
+        new_sent_len_dist.iat[i] = round(new_sent_len_dist.iat[i] / len(sent_lens), 2)
+
+    # plot using matplotlib and seaborn
+
+    # set figure, ax into variables
+    fig, ax = plt.subplots(figsize=(10, 10))
+
+    plt.title(curve_title)
+    ax.set_xlabel("Sentence Length")
+    ax.set_ylabel("Percentage of Occurrence")
+
+    sns.lineplot(x=new_sent_len_dist.index, y=new_sent_len_dist.values, ax=ax, palette="crest")
+    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_sent_len_long.png")
+    plt.close()
+
+    # plot the 25 most frequent sentence lengths as a barplot for a more detailed insight
+    sent_len_dist_short = FreqDist(sent_lens).most_common(25)
+
+    # convert the FreqDist object to a pandas series for easier processing
+    sent_len_dist_short_panda = pd.Series(dict(sent_len_dist_short))
+
+    # sort, normalise and round the panda series
+    new_sent_len_dist_short = sent_len_dist_short_panda.sort_index()
+
+    for i in range(len(new_sent_len_dist_short.index)):
+        new_sent_len_dist_short.iat[i] = round(new_sent_len_dist_short.iat[i] / len(sent_lens), 2)
+
+    # set figure, ax into variables
+    fig, ax = plt.subplots(figsize=(10, 10))
+
+    # call function for bar (value) labels
+    addlabels(x=new_sent_len_dist_short.index, y=new_sent_len_dist_short.values)
+
+    plt.title(curve_title)
+    ax.set_xlabel("Sentence Length")
+    ax.set_ylabel("Percentage of Occurrence")
+
+    sns.barplot(x=new_sent_len_dist_short.index, y=new_sent_len_dist_short.values, ax=ax, palette="YlGnBu")
+    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_sent_len_short.png")
+    plt.close()
+
+    # calculate the standard deviation and mean of the sentence lengths
+    standard_deviation_sent = statistics.stdev(sent_lens)
+    mean_sent = statistics.mean(sent_lens)
+
+    return standard_deviation_sent, mean_sent
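+
+# Normalisation example for the series above (illustrative numbers): a sentence
+# length that occurs 150 times among 3000 kept sentences is plotted as
+# round(150 / 3000, 2) = 0.05, i.e. the y-axis shows shares, not raw counts.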
+
+# overall pos_tag frequency distribution
+# pos_tag ngrams; (maybe exclude stopwords?)
+# tag collocates for specific tags --> adjectives most frequently with nouns
+# most frequent words
+# most frequent words for specific tags --> punctuation;
+# most frequent adjectives
+
+def pos_tag_frequencies(corpus, series, canon_or_fanfic):
+    # nltk.pos_tag(text) --> [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
+    # ('completely', 'RB'), ('different', 'JJ')]
+    tokens = word_tokenize(corpus)
+    """
+    # alternative tokenisation kept for reference: like tokenize_and_clean_text,
+    # but it also appends the hyphens/other punctuation as separate tokens,
+    # since we're interested in those here
+    short_tokens = []
+    for token in tokens:
+        dehyphenated_token = []
+        letter_present = 0
+        dehyphenated = 0
+        second_word_in_compound = 0
+        for c in token:
+            if c.isalpha():
+                dehyphenated_token.append(c)
+                letter_present = 1
+                if dehyphenated == 1:
+                    second_word_in_compound = 1
+            elif letter_present == 1:
+                short_tokens.append(''.join(dehyphenated_token))
+                short_tokens.append(c)  # append the hyphen/other punctuation
+                dehyphenated_token = []
+                letter_present = 0
+                dehyphenated = 1
+                second_word_in_compound = 0
+        if letter_present == 1 and dehyphenated == 0:
+            short_tokens.append(token)
+        elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
+            short_tokens.append(''.join(dehyphenated_token))
+    """
+    tag_token_tuples = pos_tag(tokens)
+    punctuation_regex = r"[^\w\s]+"
+    summarised_tags = []
+    punctuation_tags = []
+    index = 0
+    for token, tag in tag_token_tuples:
+        if re.match(punctuation_regex, token):
+            summarised_tags.append("punctuation")
+            if re.match(r"[\"'“”’‘]+", token):
+                punctuation_tags.append("quotation_marks")
+            elif re.match(r"[,;:.?!-]+", token):
+                if token == ".":
+                    # a full stop only counts as part of an ellipsis if the two
+                    # following tokens are full stops as well
+                    try:
+                        is_ellipsis = (tag_token_tuples[index + 1][1] == "."
+                                       and tag_token_tuples[index + 2][1] == ".")
+                    except IndexError:
+                        is_ellipsis = False
+                    punctuation_tags.append("ellipsis" if is_ellipsis else "full_stop")
+                elif token == "?":
+                    punctuation_tags.append("question_mark")
+                elif token == "!":
+                    punctuation_tags.append("exclamation_mark")
+                elif token == ",":
+                    punctuation_tags.append("comma")
+                elif token == ";":
+                    punctuation_tags.append("semicolon")
+                elif token == "-":
+                    punctuation_tags.append("dash")
+                else:
+                    punctuation_tags.append("other_punct")
+        else:
+            if tag in ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
+                summarised_tags.append("verb")
+            elif tag in ["JJ", "JJR", "JJS"]:
+                summarised_tags.append("adjective")
+            elif tag in ["RB", "RBR", "RBS", "WRB"]:
+                summarised_tags.append("adverb")
+            elif tag in ["PRP", "PRP$", "WP", "WP$"]:
+                summarised_tags.append("pronoun")
+            elif tag in ["NNP", "NNPS"]:
+                summarised_tags.append("proper_noun")
+            elif tag in ["NN", "NNS"]:
+                summarised_tags.append("common_noun")
+            elif tag in ["DT", "PDT", "WDT"]:
+                summarised_tags.append("determiner")
+            elif tag == "CC":
+                summarised_tags.append("coordinating_conj")
+            elif tag == "IN":
+                summarised_tags.append("subordinating_conj")
+            elif tag in ["$", "CD", "EX", "LS", "POS", "SYM", "TO", "UH", "RP", "FW"]:
+                summarised_tags.append("other_tag")
+        index += 1
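+
+    # e.g. the pos_tag doc example above, "And now for something completely
+    # different", is summarised (given the tags shown) as ["coordinating_conj",
+    # "adverb", "subordinating_conj", "common_noun", "adverb", "adjective"]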
Tags") + ax.set_ylabel("Percentage of Occurence") + + sns.barplot(x=new_tag_freq_dist.index, y=new_tag_freq_dist.values, ax=ax, palette="RdPu") + plt.xticks(rotation=30) # !!! very useful for words + plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_pos_tag_frequencies.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png" + + + #punctuation frequency distribution + + punct_tag_freq_dist = FreqDist(punctuation_tags) + #print(tag_freq_dist) + + # convert FreqDist object to a pandas series for easier processing + punct_tag_freq_dist_panda = pd.Series(dict(punct_tag_freq_dist)) + #print(punct_tag_freq_dist_panda) + + # sort, normalise and round the panda series + + new_punct_tag_freq_dist = punct_tag_freq_dist_panda.sort_index() + #print(new_sent_len_dist) + + for i in range(0, len(new_punct_tag_freq_dist.index)): + #for index in new_token_len_dist.index: + new_punct_tag_freq_dist.iat[i] = round(new_punct_tag_freq_dist.iat[i]/len(punctuation_tags), 3) #index-1 bc the index starts counting from zero, the word lengths not + + #print(new_punct_tag_freq_dist) + + # set figure, ax into variables + fig, ax = plt.subplots(figsize=(10,10)) + + # call function for bar (value) labels + addlabels(x=new_punct_tag_freq_dist.index, y=new_punct_tag_freq_dist.values) + + + plt.title(f"Punctuation Frequencies for the {series.replace('_' , ' ').title()} {canon_or_fanfic.replace('_' , ' ').title()}") + ax.set_xlabel("Types of Punctuation") + ax.set_ylabel("Percentage of Occurence") + + sns.barplot(x=new_punct_tag_freq_dist.index, y=new_punct_tag_freq_dist.values, ax=ax, palette="OrRd") + plt.xticks(rotation=30) # !!! very useful for words + plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_punctuation_frequencies.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png" + + +#create the Mendenhall Curve for the Throne of Glass Series +std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png") + +#create the Mendenhall Curve for the Grishaverse Books +std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png") + + +# Mendenhall Curve Sentence Lengths for Throne of Glass Canon +std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Throne of Glass Series", "throne_of_glass", "canon") + +# Mendenhall Curve Sentence Lenghts for Grishavers Canon +std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Grishaverse Books", "grishaverse", "canon") + +# POS Tag frequencies for TOG +pos_tag_frequencies(read_works_into_string(f"throne_of_glass/data/canon_works"), "throne_of_glass", "canon") + +# POS Tag frequencies for Grishaverse +pos_tag_frequencies(read_works_into_string(f"grishaverse/data/canon_works"), "grishaverse", "canon") + +def run_functions(directory_path): + """ + mean_tks = [] + idx = [] + std_dev_tks = [] + ttrs = [] + mean_sts= [] + std_dev_sts = [] + + """ + + #for txt_fic in os.listdir(directory_path): + 
+
+#grishaverse/data/split_txt_fanfics
+
+# create lists for each of the columns of the dataframe we'll create,
+# pre-filled with the canon values computed above
+mean_tokens = [mean_tokens_tog_canon, mean_tokens_grishaverse_canon]
+std_dev_tokens = [std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon]
+type_token_ratio = [type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon]
+mean_sent = [mean_sent_tog_canon, mean_sent_grishaverse_canon]
+std_dev_sents = [std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon]
+index = ["throne_of_glass_canon", "grishaverse_canon"]
+
+# create a dataframe to store all the overview statistics in
+# columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
+# mean_sent; std_dev_sent; freq_sent_len ...
+# tag_frequencies
+# tag_ngram_frequencies
+# punctuation frequencies
+# token/type ratio
+
+if __name__ == "__main__":
+
+    run_functions("grishaverse/data/split_txt_fanfics")
+    run_functions("throne_of_glass/data/split_txt_fanfics")
+
+    # build the dataframe only after run_functions has appended the fanfic rows;
+    # building it earlier would write a CSV containing the canon rows only
+    data_overview = pd.DataFrame(
+        {"mean_tokens": mean_tokens,
+         "std_dev_tokens": std_dev_tokens,
+         "type_token_ratio": type_token_ratio,
+         "mean_sent": mean_sent,
+         "std_dev_sent": std_dev_sents},
+        index=index
+    )
+    data_overview.to_csv("data_overview/data_overview.csv")
diff --git a/colour_code.py b/colour_code.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d5f8a3bded0d2d083fd3af240a8fb31d9d238d1
--- /dev/null
+++ b/colour_code.py
@@ -0,0 +1,21 @@
+
+# code snippets for prettifying plots
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+#colours
+
+CB91_Blue = '#2CBDFE'
+CB91_Green = '#47DBCD'
+CB91_Pink = '#F3A0F2'
+CB91_Purple = '#9D2EC5'
+CB91_Violet = '#661D98'
+CB91_Amber = '#F5B14C'
+
+color_list = [CB91_Pink, CB91_Blue, CB91_Green, CB91_Amber,
+              CB91_Purple, CB91_Violet]
+plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
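+
+# once the prop_cycle is set, subsequent plots pick colours from the list in
+# order, e.g. (hypothetical calls) plt.plot(xs1); plt.plot(xs2) would draw the
+# first line in CB91_Pink and the second in CB91_Blue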
+
+#some colour palette playing around
+
+cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
+cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
+cm2 = sns.cubehelix_palette(as_cmap=True)
\ No newline at end of file
diff --git a/fanfic_preprocessing.py b/fanfic_preprocessing.py
index 48a311da8daabdbb52d2ae9f17b59a106f9a2ad2..ad6ab816f92fedea75fa48cf9bb3cb6bfa0a7d9c 100644
--- a/fanfic_preprocessing.py
+++ b/fanfic_preprocessing.py
@@ -22,7 +22,7 @@ CB91_Purple = '#9D2EC5'
 CB91_Violet = '#661D98'
 CB91_Amber = '#F5B14C'
 
-color_list = [pink, light_green, purple_grey, blue_grey, CB91_Green, CB91_Pink, CB91_Blue, CB91_Amber,
+color_list = [blue_grey, CB91_Amber, pink, light_green, CB91_Green, CB91_Pink, CB91_Blue,
               CB91_Purple, CB91_Violet]
 plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
 
@@ -32,11 +32,11 @@ cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
 cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
 cm2 = sns.cubehelix_palette(as_cmap=True)
 
-
 #palette_1 = sns.color_palette("flare")
 #palette_2 = sns.color_palette("mako_r", as_cmap=True)
 
+# actual preprocessing code
+
 #file header:
 # work_id,title,author,rating,category,fandom,relationship,character,additional tags,language,published,status,status date,words,chapters,comments,kudos,bookmarks,hits,all_kudos,all_bookmarks,body
 # 27852922,Dealing with Our Demons,['ravenyenn19'],Mature,F/M,"Six of Crows Series",Kaz Brekker/Inej Ghafa,"Kaz B","Romance,Kanej - Freeform, Eventual Smut",English,2020-12-03,Updated,2023-03-16,747673,162/?,8573,12204,1373,709212,"['ud4m', 'book_addict_1228', 'ephemeraldelights', 'bluedelilah25', 'sunshinecorsets', 'I_do_not_like_purple_glasses', 'beep_boop_00', 'schleswigholstein', 'moonandstars75', 'ewerythingoes', 'mindfighters', 'rosibunnis', 'Lizie06', 'ghostlatte', 'aguswolman', 'QueenofEnglan', 'JenBoyette04', 'gnitneb_reads', 'gloomysunshine', 'v1ofvs', 'BazzaKrekker', 'BookGeek', 'poppyflower19', 'Cassanibal', 'vanilla_chai_tea', 'Honorthyword', 'mariaarmengol', 'luc1inda', 'zarawrites', 'monmough', 'Guilty__Pleasures', 'Ilyann', 'folieadeux_0_0', 'dragonguard', 'Emeliemarx', 'angrydabee', 'slythxrclaw', 'samaram0215', 'letsgetthisbread69', 'Mintmew', 'biblichour', 'Katloupet', 'Miss_ginger', 'inejsquake', 'Arabella_7833', 'flossy_flo99', 'a_k123', 'hushedwanderer', 'siriuslymichele', 'AnnaAvinaVTDX']",[],"Dear Kaz,
@@ -45,90 +45,49 @@ cm2 = sns.cubehelix_palette(as_cmap=True)
 grisha_fanfics = pd.read_csv("grishaverse/data/fanfics/grishaverse_fics.csv")
 tog_fanfics = pd.read_csv("throne_of_glass/data/fanfics/throne_of_glass_fics.csv")
 
-"""
-# plot distribution of kudos for Grishaverse Fanfics
-
-grisha_kudos = grisha_fanfics["kudos"].values.tolist()
-
-grisha_kudos_freq_dist = FreqDist(grisha_kudos)
-# convert to FreqDist object to a pandas series for easier processing
-dist_panda = pd.Series(dict(grisha_kudos_freq_dist))
-#print(dist_panda)
-
-# sort, normalise and round the panda series
-
-new_dist = dist_panda.sort_index()
-
-for i in range(0, len(new_dist.index)):
-#for index in new_token_len_dist.index:
-    new_dist.iat[i] = round(new_dist.iat[i]/len(grisha_kudos), 3) #index-1 bc the index starts counting from zero, the word lengths not
-    #if float(new_token_len_dist.iat[i]) == 0.00:
-    # new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
-
-#calculate cumulative distribution
-cum_dist = np.cumsum(new_dist.values)
-
-# plot using matplotlib and seaborn
-
-# set figure, ax into variables
-fig, ax = plt.subplots(figsize=(10,10))
-
-# call function for bar (value) labels
-#addlabels(x=new_sent_len_dist.index, y=new_sent_len_dist.values)
+def read_csv_to_pd(file_path: str) -> pd.DataFrame:
+    return pd.read_csv(file_path)
-plt.title("Grishaverse Cumulative Frequency Distribution of All Kudos")
-ax.set_xlabel("Number of Kudos")
-ax.set_ylabel("Percentage of Occurence")
+def calculate_cum_kudo_distribution(fanfic_pd):
+    fanfic_kudos = fanfic_pd["kudos"].values.tolist()
+    fanfic_kudos_freq_dist = FreqDist(fanfic_kudos)
+    # convert the FreqDist object to a pandas series for easier processing
+    dist_panda = pd.Series(dict(fanfic_kudos_freq_dist))
-sns.lineplot(x=new_dist.index, y=cum_dist, ax=ax)
-#plt.xticks(rotation=30) !!! very useful for words
-fig.savefig(f"grishaverse/freq_distribution/fanfic_kudo_freq_dist.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
+    # sort, normalise and round the panda series
+    new_dist = dist_panda.sort_index()
-"""
-# plot distribution of kudos for Throne of Glass Fanfics
+    for i in range(0, len(new_dist.index)):
+        new_dist.iat[i] = round(new_dist.iat[i]/len(fanfic_kudos), 3)
-tog_kudos = tog_fanfics["kudos"].values.tolist()
+    # calculate cumulative distribution
+    cum_dist = np.cumsum(new_dist.values)
+    return new_dist, cum_dist
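+
+# cumulative-distribution sanity check (illustrative numbers): if 40% of fics
+# have at most 100 kudos and another 25% have 101-500, np.cumsum over the
+# normalised, index-sorted series puts the curve at 0.65 by x = 500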
-tog_kudos_freq_dist = FreqDist(tog_kudos)
-# convert to FreqDist object to a pandas series for easier processing
-dist_panda = pd.Series(dict(tog_kudos_freq_dist))
-#print(dist_panda)
-# sort, normalise and round the panda series
+def plot_distribution(new_dist, cum_dist, plt_title, file_path_for_pic: str, x_label="Number of Kudos", y_label="Percentage of Occurrence", scatter_plt=False, max_ticks=10):
+
+    plt.figure(figsize=(10,10))
+    plt.title(plt_title)
+    plt.xlabel(x_label)
+    plt.ylabel(y_label)
+
+    if scatter_plt:
+        sns.scatterplot(x=new_dist.index, y=cum_dist)
+        #plt.xticks(new_dist.index[::100], new_dist.index[::100])
-new_dist = dist_panda.sort_index()
+    else:
+        sns.lineplot(x=new_dist.index, y=cum_dist)
-for i in range(0, len(new_dist.index)):
-#for index in new_token_len_dist.index:
-    new_dist.iat[i] = round(new_dist.iat[i]/len(tog_kudos), 3) #index-1 bc the index starts counting from zero, the word lengths not
-#calculate cumulative distribution
-cum_dist = np.cumsum(new_dist.values)
-
-# plot using matplotlib and seaborn
-
-# set figure, ax into variables
-fig, ax = plt.subplots(figsize=(10,10))
-
-# call function for bar (value) labels
-#addlabels(x=new_sent_len_dist.index, y=new_sent_len_dist.values)
-
-plt.title("Throne of Glass Cumulative Frequency Distribution of Kudos")
-ax.set_xlabel("Number of Kudos")
-ax.set_ylabel("Percentage of Occurence")
-
-
-sns.lineplot(x=new_dist.index, y=cum_dist, ax=ax)
-#plt.xticks(rotation=30) !!! very useful for words
-fig.savefig(f"throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
-
-
-
-"""
-def preprocess_data(df, series):
+    plt.savefig(file_path_for_pic)
+    plt.close()
+
+def separate_fanfics_by_good_medium_bad(df, series):
     good_fics = []
     medium_fics = []
     bad_fics = []
@@ -163,6 +122,16 @@ def preprocess_data(df, series):
     f.write(medium_fics_joined)
 
-preprocess_data(grisha_fanfics, "grishaverse")
-preprocess_data(tog_fanfics, "throne_of_glass")
-"""
\ No newline at end of file
+if __name__ == "__main__":
+    #grishaverse
+    #grisha_fanfics = read_csv_to_pd(file_path="grishaverse/data/fanfics/grishaverse_fics.csv")
+    #new_dist, cum_dist = calculate_cum_kudo_distribution(grisha_fanfics)
+    #plot_distribution(new_dist=new_dist, cum_dist=cum_dist, plt_title="Grishaverse Cumulative Frequency Distribution of All Kudos", file_path_for_pic="grishaverse/freq_distribution/fanfic_kudo_freq_dist.png", scatter_plt=True)
+
+    #throne of glass
+    tog_fanfics = read_csv_to_pd(file_path="throne_of_glass/data/fanfics/throne_of_glass_fics.csv")
+    new_dist, cum_dist = calculate_cum_kudo_distribution(tog_fanfics)
+    plot_distribution(new_dist=new_dist, cum_dist=cum_dist, plt_title="Throne of Glass Cumulative Frequency Distribution of All Kudos", file_path_for_pic="throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png", scatter_plt=True)
+
+    #separate_fanfics_by_good_medium_bad(grisha_fanfics, "grishaverse")
+    #separate_fanfics_by_good_medium_bad(tog_fanfics, "throne_of_glass")
diff --git a/grishaverse/freq_distribution/fanfic_kudo_freq_dist.png b/grishaverse/freq_distribution/fanfic_kudo_freq_dist.png
index 6af7e5e445e3b79b93be2d05cdbd138942633747..add2892530f510b7e6873d58ebfab435a4d0ea49 100644
Binary files a/grishaverse/freq_distribution/fanfic_kudo_freq_dist.png and b/grishaverse/freq_distribution/fanfic_kudo_freq_dist.png differ
diff --git a/stylometry_code.py b/stylometry_code.py
index 6afec30a9aa461ffe3511b8b73a80a92d367a29c..34166240d72edaaaa55eb640e52fd7f908c08760 100644
--- a/stylometry_code.py
+++ b/stylometry_code.py
@@ -13,27 +13,6 @@ import re
 
 # you'll have to also download "punkt" from nltk
 
-# code snippets for prettifying plots
-
-#colours
-
-CB91_Blue = '#2CBDFE'
-CB91_Green = '#47DBCD'
-CB91_Pink = '#F3A0F2'
-CB91_Purple = '#9D2EC5'
-CB91_Violet = '#661D98'
-CB91_Amber = '#F5B14C'
-
-color_list = [CB91_Pink, CB91_Blue, CB91_Green, CB91_Amber,
-              CB91_Purple, CB91_Violet]
-plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
-
-#some colour palette playing around
-
-cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
-cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
-cm2 = sns.cubehelix_palette(as_cmap=True)
-
 # create function for bar (value) labels
 def addlabels(x,y):
     for i in range(len(x)):
@@ -482,10 +461,6 @@ mean_sent = [mean_sent_tog_canon, mean_sent_grishaverse_canon]
 std_dev_sents = [std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon]
 index = ["throne_of_glass_canon", "grishaverse_canon"]
 
-
-run_functions("grishaverse/data/split_txt_fanfics")
-run_functions("throne_of_glass/data/split_txt_fanfics")
-
 # create a dataframe to store all the overview statistics in
 # columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
 # mean_sent; std_dev_sent; freq_sent_len ....
@@ -502,5 +477,9 @@ data_overview = pd.DataFrame(
     "std_dev_sent":std_dev_sents},
     index = index
 )
-
-data_overview.to_csv(f"data_overview/data_overview.csv")
+
+if __name__ == "__main__":
+
+    run_functions("grishaverse/data/split_txt_fanfics")
+    run_functions("throne_of_glass/data/split_txt_fanfics")
+    data_overview.to_csv(f"data_overview/data_overview.csv")
diff --git a/throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png b/throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png
index 55f0c59b642d58388ef446ce42dbfd36c88bf07f..dbfa21f3bd2356bdb55e33839eaa5650621645b9 100644
Binary files a/throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png and b/throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png differ