diff --git a/clean_stylometry.py b/clean_stylometry.py
index da317423a6874dd176cef546ab44ffb6ad8a48cd..15dbf92f809d59e775089d66185a9745650e8894 100644
--- a/clean_stylometry.py
+++ b/clean_stylometry.py
@@ -123,7 +123,7 @@ def mendenhall_token_metrics(tokens):
     return new_token_len_dist, standard_deviation, mean
 
 
-def plot_distribution(x, y, plt_title, file_path_for_pic:str, x_label="Number of Kudos", y_label="Percentage of Occurence",palette="flare", plt_type="barplot", add_labels=True):
+def plot_distribution(x, y, plt_title, file_path_for_pic:str, x_label="Number of Kudos", y_label="Percentage of Occurence",palette="flare", plt_type="barplot", add_labels=True, rotate_ticks=True):
 
     plt.figure(figsize=(10,10))
     plt.title(plt_title)
@@ -133,45 +133,115 @@ def plot_distribution(x, y, plt_title, file_path_for_pic:str, x_label="Number of
     if add_labels:
         addlabels(x=x.index, y=y.values)
 
-    match case:
-        sns.scatterplot(x=x.index, y=y.values, palette=palette)
-        #plt.xticks(new_dist.index[::100], new_dist.index[::100])
-
-    else:
-        sns.lineplot(x=x.index, y=y.values, palette=palette)
-
+    match plt_type:
+        case "scatterplot":
+            sns.scatterplot(x=x.index, y=y.values, palette=palette)
+        case "lineplot":
+            sns.lineplot(x=x.index, y=y.values, palette=palette)
+        case "barplot":
+            sns.barplot(x=x.index, y=y.values, palette=palette)
+        case "histplot":
+            sns.histplot(x=x.index, y=y.values, palette=palette)
+        case _:
+            print(f"{plt_type} is not a valid format for this function")
+
+    if rotate_ticks:
+        plt.xticks(rotation=30) # !!! very useful for words
 
     plt.savefig(file_path_for_pic)
     plt.close()
 
+
+def pos_tag_freq(tokens):
+    #nltk.pos_tag(text) --> [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
+    #('completely', 'RB'), ('different', 'JJ')]
+    tag_token_tuples = pos_tag(tokens)
+    punctuation_regex = r"[^\w\s]+"
+    summarised_tags = []
+    punctuation_tags = []
+    index = 0
+    for token, tag in tag_token_tuples:
+        if re.match(punctuation_regex, token):
+            summarised_tags.append("punctuation")
+            if re.match(r"[\"\'“”’‘]+", token):
+                punctuation_tags.append("quotation_marks")
+            elif re.match(r"[,;:.?!-]+", token):
+                try:
+                    punctuation_tags.append("ellipsis" if token == "." and tag_token_tuples[index+1][1] == "." and tag_token_tuples[index+2][1] == "." else "full_stop" if token == "." else "question_mark" if token == "?" else "exclamation_mark" if token == "!" else "comma" if token == "," else "semicolon" if token == ";" else "dash" if token == "-" else "other_punct")
+                except IndexError: # the last two tokens have no index+1/index+2 neighbour to check for an ellipsis
+                    punctuation_tags.append("full_stop" if token == "." else "question_mark" if token == "?" else "exclamation_mark" if token == "!" else "comma" if token == "," else "semicolon" if token == ";" else "dash" if token == "-" else "other_punct")
-# plot using matplotlib and seaborn
-
-    # set figure, ax into variables
-    fig, ax = plt.subplots(figsize=(10,10))
-
-    # call function for bar (value) labels
-    addlabels(x=new_token_len_dist.index, y=new_token_len_dist.values)
-
-    plt.title(curve_title)
-    ax.set_xlabel("Word Length")
-    ax.set_ylabel("Percentage of Occurence")
+        else:
+            if tag in ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
+                summarised_tags.append("verb")
+            elif tag in ["JJ", "JJR", "JJS"]:
+                summarised_tags.append("adjective")
+            elif tag in ["RB", "RBR", "RBS", "WRB"]:
+                summarised_tags.append("adverb")
+            elif tag in ["PRP", "PRP$", "WP", "WP$"]:
+                summarised_tags.append("pronoun")
+            elif tag in ["NNP", "NNPS"]:
+                summarised_tags.append("proper_noun")
+            elif tag in ["NN", "NNS"]:
+                summarised_tags.append("common_noun")
+            elif tag in ["DT", "PDT", "WDT"]:
+                summarised_tags.append("determiner")
+            elif tag == "CC":
+                summarised_tags.append("coordinating_conj")
+            elif tag == "IN":
+                summarised_tags.append("subordinating_conj")
+            elif tag in ["$", "CD", "EX", "LS", "POS", "SYM", "TO", "UH", "RP", "FW"]:
+                summarised_tags.append("other_tag")
+        index += 1
-    sns.barplot(x=new_token_len_dist.index, y=new_token_len_dist.values, ax=ax, palette="flare")
-    #plt.xticks(rotation=30) !!! very useful for words
-    #plt.get_figure()
-    plt.savefig(plot_destination)
-    #print(new_token_len_dist.tabulate())
-    #token_length_freq_dist_plot = token_length_distribution.plot(title=curve_title, percents=True)
-
-    #fig_freq_dist = token_length_freq_dist_plot.get_figure()
-    #fig_freq_dist.savefig(plot_destination)
+
+    tag_freq_dist = FreqDist(summarised_tags)
+
+    # convert FreqDist object to a pandas series for easier processing
+    tag_freq_dist_panda = pd.Series(dict(tag_freq_dist))
+
+    # sort, normalise and round the panda series
+    new_tag_freq_dist = tag_freq_dist_panda.sort_index()
+
+    for i in range(0, len(new_tag_freq_dist.index)):
+    #for index in new_token_len_dist.index:
+        new_tag_freq_dist.iat[i] = round(new_tag_freq_dist.iat[i]/len(tag_token_tuples), 2) #index-1 bc the index starts counting from zero, the word lengths not
+
+    #punctuation frequency distribution
+    punct_tag_freq_dist = FreqDist(punctuation_tags)
+
+    # convert FreqDist object to a pandas series for easier processing
+    punct_tag_freq_dist_panda = pd.Series(dict(punct_tag_freq_dist))
+
+    # sort, normalise and round the panda series
+    new_punct_tag_freq_dist = punct_tag_freq_dist_panda.sort_index()
+
+    for i in range(0, len(new_punct_tag_freq_dist.index)):
+    #for index in new_token_len_dist.index:
+        new_punct_tag_freq_dist.iat[i] = round(new_punct_tag_freq_dist.iat[i]/len(punctuation_tags), 3) #index-1 bc the index starts counting from zero, the word lengths not
+
+    return new_tag_freq_dist, new_punct_tag_freq_dist
+
+
+#f"throne_of_glass/data/canon_works"
+def extract_info_from_directory_path(directory_path):
+    #for txt_fic in os.listdir(directory_path):
+    works = os.listdir(directory_path)
+    pattern = r"^[a-zA-Z_]+(?=/)" # get series from directory path
+    match = re.search(pattern, directory_path)
+    if match:
+        series = match.group(0)
+    for work in works:
+        with open(f"{directory_path}"+f"/{work}", "r") as f:
+            f = f.read()
+            std_dev_tk, mean_tk, ttr = mendenhall_curve(f, f"Mendenhall Curve for the {series.replace('_' , ' ').title()} {work[:-4].replace('_' , ' ').title()}", f"{series}/freq_distribution/{work[:-4]}_token_len.png")
+            mean_tokens.append(mean_tk)
+
 
 class StylometryMetrics:
-    def __init__(self, directory_path):
+    def __init__(self, directory_path, name_of_work, quality="", fanfiction=True):
         self.text = read_works_into_string(directory_path)
         self.clean_tokens = tokenize_and_clean_text(self.text)
+        self.name = name_of_work
+        self.fanfiction = fanfiction
+        self.quality = quality # good medium bad
 
     def calculate_standardised_ttr(self):
         self.sttr = standardised_type_token_ratio(self.clean_tokens)
@@ -179,84 +249,20 @@ class StylometryMetrics:
     def calculate_mendenhall_token_metrics(self):
         self.tk_len_dist, self.tk_len_std, self.tk_len_mean = mendenhall_token_metrics(self.clean_tokens)
 
-    def plot
-
-
-
-
-
-
-
-
-
-
-# this function takes a corpus as its input and gives a Mendenhall curve, i.e. a frequency distribution of tokens as its output
-# precise input: corpus = string ;
-# curve_title = string, the title of the plot that will be produced, e.g., "Mendenhall Curve for Throne of Glass Series"
-# plot_destination = string, the (relative) path, including the file name and .png tag of the plot produced, e.g. f"throne_of_glass/freq_distribution/all_canon_token_len.png"
-
-
-def mendenhall_curve(corpus, curve_title, plot_destination):
-
-    short_clean_tokens = tokenize_and_clean_text(corpus)
-
-    # create the distribution of token lengths / Mendenhall curve
-
-    token_lengths = [len(token) for token in short_clean_tokens]
-
-    # Calculate the trimmed token length (with 5% trimming) We need to remove the outliers, bc even despite preprocessing,
-    # there still are some very wrong lengths, which entirely skews the metrics and also ruins our p-values later on
-    trim_percent = 0.005
-    trim_len = int(len(token_lengths) * trim_percent / 2)
-    token_lengths = sorted(token_lengths)[trim_len:-trim_len]
-
-
-    token_length_distribution = FreqDist(token_lengths).most_common(15)
-
-    # convert to FreqDist object to a pandas series for easier processing
-    token_len_dist_panda = pd.Series(dict(token_length_distribution))
-
-    # sort, normalise and round the panda series
+    def plot_token_metrics(self, file_path_for_pic):
+        # the conditionals need their own parentheses, otherwise they swallow the surrounding string concatenation
+        plt_title = self.name + " " + ((self.quality + " ") if self.fanfiction else "") + ("Fanfiction" if self.fanfiction else "Canon") + " Token Frequency Distribution"
+        plot_distribution(x=self.tk_len_dist, y=self.tk_len_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Token Length", y_label="Percentage of Occurence")
 
-    new_token_len_dist = token_len_dist_panda.sort_index()
+    def calculate_pos_tag_distribution(self):
+        self.tag_freq_dist, self.punct_tag_freq_dist = pos_tag_freq(self.clean_tokens)
 
-    for i in range(0, len(new_token_len_dist.index)):
-    #for index in new_token_len_dist.index:
-        new_token_len_dist.iat[i] = round(new_token_len_dist.iat[i]/len(short_clean_tokens), 3) #index-1 bc the index starts counting from zero, the word lengths not
-        #if float(new_token_len_dist.iat[i]) == 0.00:
-        #    new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
-
+    def plot_pos_tag_freq(self, file_path_for_pic):
+        plt_title = "POS Tag Frequencies for the " + self.name + " " + ((self.quality + " ") if self.fanfiction else "") + ("Fanfiction" if self.fanfiction else "Canon")
+        plot_distribution(x=self.tag_freq_dist, y=self.tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="POS Tags", y_label="Percentage of Occurence")
 
-    # plot using matplotlib and seaborn
-
-    # set figure, ax into variables
-    fig, ax = plt.subplots(figsize=(10,10))
-
-    # call function for bar (value) labels
-    addlabels(x=new_token_len_dist.index, y=new_token_len_dist.values)
-
-    plt.title(curve_title)
-    ax.set_xlabel("Word Length")
-    ax.set_ylabel("Percentage of Occurence")
-
-    sns.barplot(x=new_token_len_dist.index, y=new_token_len_dist.values, ax=ax, palette="flare")
-    #plt.xticks(rotation=30) !!! very useful for words
-    #plt.get_figure()
-    plt.savefig(plot_destination)
-    #print(new_token_len_dist.tabulate())
-    #token_length_freq_dist_plot = token_length_distribution.plot(title=curve_title, percents=True)
-
-    #fig_freq_dist = token_length_freq_dist_plot.get_figure()
-    #fig_freq_dist.savefig(plot_destination)
-
-    # calculate the standard deviation, mean, token/type ratio
-    standard_deviation = statistics.stdev(token_lengths)
-    mean = statistics.mean(token_lengths)
-
-    type_token_ratio = standardised_type_token_ratio(short_clean_tokens)
-
-    return standard_deviation, mean, type_token_ratio
-
+    def plot_punct_freq(self, file_path_for_pic):
+        plt_title = "Punctuation Frequencies for the " + self.name + " " + ((self.quality + " ") if self.fanfiction else "") + ("Fanfiction" if self.fanfiction else "Canon")
+        plot_distribution(x=self.punct_tag_freq_dist, y=self.punct_tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Types of Punctuation", y_label="Percentage of Occurence")
 
 def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
@@ -351,167 +357,26 @@ def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
 # most frequent words for specific tags --> punctuation;
 # most frequent adjectives
 
-def pos_tag_frequencies(corpus, series, canon_or_fanfic):
-    #nltk.pos_tag(text) --> [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
-    #('completely', 'RB'), ('different', 'JJ')]
-    tokens = word_tokenize(corpus)
-    """
-    short_tokens = []
-    for token in tokens:
-        dehyphenated_token = []
-        letter_present = 0
-        dehyphenated = 0
-        second_word_in_compound = 0
-        for c in token:
-            if c.isalpha() == True:
-                dehyphenated_token.append(c)
-                letter_present = 1
-                if dehyphenated == 1:
-                    second_word_in_compound = 1
-            elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
-                #bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
-                # high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
-                # relatively speaking
-                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
-                #print(dehyphenated_token_joined)
-                short_tokens.append(dehyphenated_token_joined)
-                short_tokens.append(c) #append the hyphen/ other punctuation --> we're also interested in that
-                dehyphenated_token = []
-                letter_present = 0
-                dehyphenated = 1
-                second_word_in_compound = 0
-        if letter_present == 1 and dehyphenated == 0:
-            short_tokens.append(token) #catching the tokens that didn't have any special characters; but not the dehyphenated ones twice
-        elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
-            short_tokens.append(''.join(map(str, dehyphenated_token)))
-    """
-    tag_token_tuples = pos_tag(tokens)
-    punctuation_regex = r"[^\w\s]+"
-    summarised_tags = []
-    punctuation_tags = []
-    index = 0
-    for token, tag in tag_token_tuples:
-        if re.match(punctuation_regex, token):
-            summarised_tags.append("punctuation")
-            if re.match(r"[\"\'“”’‘]+", token):
-                punctuation_tags.append("quotation_marks")
-            elif re.match(r"[,;:.?!-]+", token):
-                try:
-                    punctuation_tags.append("ellipsis" if token == "." and tag_token_tuples[index+1][1] == "." and tag_token_tuples[index+2][1] == "." else "full_stop" if token == "." else "question_mark" if token == "?" else "exclamation_mark" if token == "!" else "comma" if token == "," else "semicolon" if token == ";" else "dash" if token == "-" else "other_punct")
-                except:
-                    punctuation_tags.append("full_stop" if token == "." else "question_mark" if token == "?" else "exclamation_mark" if token == "!" else "comma" if token == "," else "semicolon" if token == ";" else "dash" if token == "-" else "other_punct")
-        else:
-            if tag in ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
-                summarised_tags.append("verb")
-            elif tag in ["JJ", "JJR", "JJS"]:
-                summarised_tags.append("adjective")
-            elif tag in ["RB", "RBR", "RBS", "WRB"]:
-                summarised_tags.append("adverb")
-            elif tag in ["PRP", "PRP$", "WP", "WP$"]:
-                summarised_tags.append("pronoun")
-            elif tag in ["NNP", "NNPS"]:
-                summarised_tags.append("proper_noun")
-            elif tag in ["NN", "NNS"]:
-                summarised_tags.append("common_noun")
-            elif tag in ["DT", "PDT", "WDT"]:
-                summarised_tags.append("determiner")
-            elif tag == "CC":
-                summarised_tags.append("coordinating_conj")
-            elif tag == "IN":
-                summarised_tags.append("subordinating_conj")
-            elif tag in ["$", "CD", "EX", "LS", "POS", "SYM", "TO", "UH", "RP", "FW"]:
-                summarised_tags.append("other_tag")
-        index += 1
-
-
-    tag_freq_dist = FreqDist(summarised_tags)
-    #print(tag_freq_dist)
-
-    # convert FreqDist object to a pandas series for easier processing
-    tag_freq_dist_panda = pd.Series(dict(tag_freq_dist))
-    #print(tag_freq_dist_panda)
-
-    # sort, normalise and round the panda series
-
-    new_tag_freq_dist = tag_freq_dist_panda.sort_index()
-    #print(new_sent_len_dist)
-
-    for i in range(0, len(new_tag_freq_dist.index)):
-    #for index in new_token_len_dist.index:
-        new_tag_freq_dist.iat[i] = round(new_tag_freq_dist.iat[i]/len(tag_token_tuples), 2) #index-1 bc the index starts counting from zero, the word lengths not
-
-    print(new_tag_freq_dist)
-
-    # set figure, ax into variables
-    fig, ax = plt.subplots(figsize=(10,10))
-
-    # call function for bar (value) labels
-    addlabels(x=new_tag_freq_dist.index, y=new_tag_freq_dist.values)
-
-    plt.title(f"POS Tag Frequencies for the {series.replace('_' , ' ').title()} {canon_or_fanfic.replace('_' , ' ').title()}")
-    ax.set_xlabel("POS Tags")
-    ax.set_ylabel("Percentage of Occurence")
-
-    sns.barplot(x=new_tag_freq_dist.index, y=new_tag_freq_dist.values, ax=ax, palette="RdPu")
-    plt.xticks(rotation=30) # !!! very useful for words
-    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_pos_tag_frequencies.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
-
-
-    #punctuation frequency distribution
-
-    punct_tag_freq_dist = FreqDist(punctuation_tags)
-    #print(tag_freq_dist)
-
-    # convert FreqDist object to a pandas series for easier processing
-    punct_tag_freq_dist_panda = pd.Series(dict(punct_tag_freq_dist))
-    #print(punct_tag_freq_dist_panda)
-
-    # sort, normalise and round the panda series
-
-    new_punct_tag_freq_dist = punct_tag_freq_dist_panda.sort_index()
-    #print(new_sent_len_dist)
-
-    for i in range(0, len(new_punct_tag_freq_dist.index)):
-    #for index in new_token_len_dist.index:
-        new_punct_tag_freq_dist.iat[i] = round(new_punct_tag_freq_dist.iat[i]/len(punctuation_tags), 3) #index-1 bc the index starts counting from zero, the word lengths not
-
-    #print(new_punct_tag_freq_dist)
-
-    # set figure, ax into variables
-    fig, ax = plt.subplots(figsize=(10,10))
-
-    # call function for bar (value) labels
-    addlabels(x=new_punct_tag_freq_dist.index, y=new_punct_tag_freq_dist.values)
-
-
-    plt.title(f"Punctuation Frequencies for the {series.replace('_' , ' ').title()} {canon_or_fanfic.replace('_' , ' ').title()}")
-    ax.set_xlabel("Types of Punctuation")
-    ax.set_ylabel("Percentage of Occurence")
-
-    sns.barplot(x=new_punct_tag_freq_dist.index, y=new_punct_tag_freq_dist.values, ax=ax, palette="OrRd")
-    plt.xticks(rotation=30) # !!! very useful for words
-    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_punctuation_frequencies.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
-
 #create the Mendenhall Curve for the Throne of Glass Series
-std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
+#std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
 
 #create the Mendenhall Curve for the Grishaverse Books
-std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
+#std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
 
 # Mendenhall Curve Sentence Lengths for Throne of Glass Canon
-std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Throne of Glass Series", "throne_of_glass", "canon")
+#std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Throne of Glass Series", "throne_of_glass", "canon")
 
 # Mendenhall Curve Sentence Lenghts for Grishavers Canon
-std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Grishaverse Books", "grishaverse", "canon")
+#std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Grishaverse Books", "grishaverse", "canon")
 
 # POS Tag frequencies for TOG
-pos_tag_frequencies(read_works_into_string(f"throne_of_glass/data/canon_works"), "throne_of_glass", "canon")
+#pos_tag_frequencies(read_works_into_string(f"throne_of_glass/data/canon_works"), "throne_of_glass", "canon")
 
 # POS Tag frequencies for Grishaverse
-pos_tag_frequencies(read_works_into_string(f"grishaverse/data/canon_works"), "grishaverse", "canon")
+#pos_tag_frequencies(read_works_into_string(f"grishaverse/data/canon_works"), "grishaverse", "canon")
 
 def run_functions(directory_path):
     """
@@ -574,6 +439,7 @@ data_overview = pd.DataFrame(
 
 
 if __name__ == "__main__":
-    run_functions("grishaverse/data/split_txt_fanfics")
-    run_functions("throne_of_glass/data/split_txt_fanfics")
-    data_overview.to_csv(f"data_overview/data_overview.csv")
+    #run_functions("grishaverse/data/split_txt_fanfics")
+    #run_functions("throne_of_glass/data/split_txt_fanfics")
+    #data_overview.to_csv(f"data_overview/data_overview.csv")
+    pass # keeps the if-block syntactically valid while the calls above stay commented out
\ No newline at end of file
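
For reference, a minimal usage sketch of the class-based StylometryMetrics API this patch introduces, assuming the module's definitions are in scope. The calculate_* methods must run before their plot_* counterparts because the plot methods read the attributes those calls set. The canon directory path matches the layout used elsewhere in the script; the output .png paths are illustrative only and not part of the patch.

    # hypothetical usage sketch -- not part of the patch itself
    metrics = StylometryMetrics("grishaverse/data/canon_works", name_of_work="Grishaverse", fanfiction=False)
    metrics.calculate_standardised_ttr()
    metrics.calculate_mendenhall_token_metrics()
    metrics.calculate_pos_tag_distribution()
    metrics.plot_token_metrics("grishaverse/freq_distribution/canon_token_len.png")
    metrics.plot_pos_tag_freq("grishaverse/freq_distribution/canon_pos_tag_frequencies.png")
    metrics.plot_punct_freq("grishaverse/freq_distribution/canon_punctuation_frequencies.png")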