diff --git a/clean_stylometry.py b/clean_stylometry.py
index 15dbf92f809d59e775089d66185a9745650e8894..c9f4a5e7c8713c76182f394299d09cdae520c4f5 100644
--- a/clean_stylometry.py
+++ b/clean_stylometry.py
@@ -10,8 +10,6 @@ import pandas as pd
 import statistics
 import re
 
-# you'll have to also download "punkt" from nltk
-
 # create function for bar (value) labels
 def addlabels(x,y):
     for i in range(len(x)):
@@ -26,7 +24,7 @@ def read_works_into_string(directory_path):
     strings = []
     works = os.listdir(directory_path)
     for work in works:
-        with open(f"{directory_path}"+f"/{work}", "r") as f:
+        with open(f"{directory_path}"+f"/{work}", "r", errors='ignore') as f: # ignore (mostly Unicode) decode errors caused by the problematic encoding of some text files
             strings.append(f.read())
     return "\n".join(strings)
 
@@ -91,6 +89,27 @@ def tokenize_and_clean_text(text):
             short_clean_tokens.append(''.join(map(str, dehyphenated_token)))
     return short_clean_tokens
 
+def calculate_freq_dist_as_clean_panda(list_of_items, most_common_limit=False):
+
+    if not most_common_limit:
+        freq_dist = FreqDist(list_of_items)
+    else:
+        freq_dist = FreqDist(list_of_items).most_common(most_common_limit)
+
+    # convert the FreqDist object (or list of (item, count) pairs) to a pandas Series for easier processing
+    dist_panda = pd.Series(dict(freq_dist))
+
+    # sort by index, then normalise the counts to relative frequencies and round them
+    new_dist = (dist_panda.sort_index() / len(list_of_items)).round(3)
+    return new_dist
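+
+# A rough usage sketch for the helper above (illustrative values only):
+#
+#   >>> calculate_freq_dist_as_clean_panda(["a", "b", "b", "c"])
+#   a    0.25
+#   b    0.50
+#   c    0.25
+#   dtype: float64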
+
+
 def mendenhall_token_metrics(tokens):
 
     # create the distribution of token lengths / Mendenhall curve
@@ -101,23 +120,8 @@ def mendenhall_token_metrics(tokens):
     trim_percent = 0.005
     trim_len = int(len(token_lengths) * trim_percent / 2)
     token_lengths = sorted(token_lengths)[trim_len:-trim_len]
+    new_token_len_dist = calculate_freq_dist_as_clean_panda(token_lengths, most_common_limit=15) # token length freq dist
 
-    token_length_distribution = FreqDist(token_lengths).most_common(15)
-
-    # convert to FreqDist object to a pandas series for easier processing
-    token_len_dist_panda = pd.Series(dict(token_length_distribution))
-
-    # sort, normalise and round the panda series
-
-    new_token_len_dist = token_len_dist_panda.sort_index()
-
-    for i in range(0, len(new_token_len_dist.index)):
-        #for index in new_token_len_dist.index:
-        new_token_len_dist.iat[i] = round(new_token_len_dist.iat[i]/len(tokens), 3) #index-1 bc the index starts counting from zero, the word lengths not
-        #if float(new_token_len_dist.iat[i]) == 0.00:
-        #    new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
-
     standard_deviation = statistics.stdev(token_lengths)
     mean = statistics.mean(token_lengths)
 
@@ -157,6 +161,7 @@ def pos_tag_freq(tokens):
     punctuation_regex = r"[^\w\s]+"
     summarised_tags = []
     punctuation_tags = []
+    modal_verbs = []
     index = 0
     for token, tag in tag_token_tuples:
         if re.match(punctuation_regex, token):
@@ -170,7 +175,10 @@ def pos_tag_freq(tokens):
             punctuation_tags.append("full_stop" if token == "." else "question_mark" if token == "?" else "exclamation_mark" if token == "!" else "comma" if token == "," else "semicolon" if token == ";" else "dash" if token == "-" else "other_punct")
         else:
-            if tag in ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
+            if tag == "MD":
+                summarised_tags.append("modal verb")
+                modal_verbs.append(token.lower())
+            elif tag in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
                 summarised_tags.append("verb")
             elif tag in ["JJ", "JJR", "JJS"]:
                 summarised_tags.append("adjective")
@@ -192,33 +200,16 @@ def pos_tag_freq(tokens):
                 summarised_tags.append("other_tag")
             index += 1
 
-
-    tag_freq_dist = FreqDist(summarised_tags)
-
-    # convert FreqDist object to a pandas series for easier processing
-    tag_freq_dist_panda = pd.Series(dict(tag_freq_dist))
-
-    # sort, normalise and round the panda series
-    new_tag_freq_dist = tag_freq_dist_panda.sort_index()
-
-    for i in range(0, len(new_tag_freq_dist.index)):
-        #for index in new_token_len_dist.index:
-        new_tag_freq_dist.iat[i] = round(new_tag_freq_dist.iat[i]/len(tag_token_tuples), 2) #index-1 bc the index starts counting from zero, the word lengths not
+    # POS tag frequency distribution
+    new_tag_freq_dist = calculate_freq_dist_as_clean_panda(summarised_tags)
 
     #punctuation frequency distribution
-    punct_tag_freq_dist = FreqDist(punctuation_tags)
+    new_punct_tag_freq_dist = calculate_freq_dist_as_clean_panda(punctuation_tags)
 
-    # convert FreqDist object to a pandas series for easier processing
-    punct_tag_freq_dist_panda = pd.Series(dict(punct_tag_freq_dist))
-
-    # sort, normalise and round the panda series
-    new_punct_tag_freq_dist = punct_tag_freq_dist_panda.sort_index()
-
-    for i in range(0, len(new_punct_tag_freq_dist.index)):
-        #for index in new_token_len_dist.index:
-        new_punct_tag_freq_dist.iat[i] = round(new_punct_tag_freq_dist.iat[i]/len(punctuation_tags), 3) #index-1 bc the index starts counting from zero, the word lengths not
+    # modal verbs in more detail
+    new_md_freq_dist_panda = calculate_freq_dist_as_clean_panda(modal_verbs, most_common_limit=10)
 
-    return new_tag_freq_dist, new_punct_tag_freq_dist
+    return new_tag_freq_dist, new_punct_tag_freq_dist, new_md_freq_dist_panda
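+
+# Rough usage sketch (illustrative, based on the return values above):
+#   tag_dist, punct_dist, md_dist = pos_tag_freq(tokens)
+#   tag_dist["modal verb"] -> relative frequency of modal verbs among the summarised tags
+#   md_dist                -> normalised frequencies of the 10 most common modal verbs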
 
 #f"throne_of_glass/data/canon_works"
 def extract_info_from_directory_path(directory_path):
@@ -234,39 +225,9 @@ def extract_info_from_directory_path(directory_path):
             std_dev_tk, mean_tk, ttr = mendenhall_curve(f, f"Mendenhall Curve for the {series.replace('_' , ' ').title()} {work[:-4].replace('_' , ' ').title()}", f"{series}/freq_distribution/{work[:-4]}_token_len.png")
             mean_tokens.append(mean_tk)
 
-class StylometryMetrics:
+def calculate_sent_len_dist(text):
 
-    def __init__(self, directory_path, name_of_work, quality="", fanfiction=True):
-        self.text = read_works_into_string(directory_path)
-        self.clean_tokens = tokenize_and_clean_text(self.text)
-        self.name = name_of_work
-        self.fanfiction = fanfiction
-        self.quality = quality # good medium bad
-
-    def calculate_standardised_ttr(self):
-        self.sttr = standardised_type_token_ratio(self.clean_tokens)
-
-    def calculate_mendenhall_token_metrics(self):
-        self.tk_len_dist, self.tk_len_std, self.tk_len_mean = mendenhall_token_metrics(self.clean_tokens)
-
-    def plot_token_metrics(self, file_path_for_pic):
-        plt_title = self.name + " " + (self.quality + " ") if self.fanfiction else "" + "Fanfiction" if self.fanfiction else " Canon" + " Token Frequency Distribution"
-        plot_distribution(x=self.tk_len_dist, y=self.tk_len_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Token Length", y_label="Percentage of Occurence")
-
-    def calculate_pos_tag_distribution(self):
-        self.tag_freq_dist, self.punct_tag_freq_dist = pos_tag_freq(self.clean_tokens)
-
-    def plot_pos_tag_freq(self, file_path_for_pic):
-        plt_title = "POS Tag Frequencies for the " + self.name + " " + (self.quality + " ") if self.fanfiction else "" + "Fanfiction" if self.fanfiction else " Canon"
-        plot_distribution(x=self.tag_freq_dist, y=self.tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="POS Tags", y_label="Percentage of Occurence")
-
-    def plot_punct_freq(self, file_path_for_pic):
-        plt_title = "Punctuation Frequencies for the " + self.name + " " + (self.quality + " ") if self.fanfiction else "" + "Fanfiction" if self.fanfiction else " Canon"
-        plot_distribution(x=self.punct_tag_freq_dist, y=self.punct_tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Types of Punctuation", y_label="Percentage of Occurence")
-
-def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
-
-    sents = sent_tokenize(corpus)
+    sents = sent_tokenize(text)
     sent_lens = []
     for sent in sents:
         short_clean_tokens = tokenize_and_clean_text(sent)
@@ -279,76 +240,68 @@ def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
     trim_percent = 0.05
     trim_len = int(len(sent_lens) * trim_percent / 2)
     sent_lens = sorted(sent_lens)[trim_len:-trim_len]
-
-
-    sent_len_dist = FreqDist(sent_lens)
-    #print(sent_len_dist)
-
-    # convert to FreqDist object to a pandas series for easier processing
-    sent_len_dist_panda = pd.Series(dict(sent_len_dist))
-
-    # sort, normalise and round the panda series
+    sent_len_dist = calculate_freq_dist_as_clean_panda(sent_lens) # full sentence length freq dist
 
-    new_sent_len_dist = sent_len_dist_panda.sort_index()
-    #print(new_sent_len_dist)
+    # the 25 most frequent sentence lengths, for a more detailed (barplot-friendly) insight
+    sent_len_dist_short = calculate_freq_dist_as_clean_panda(sent_lens, most_common_limit=25)
 
-    for i in range(0, len(new_sent_len_dist.index)):
-        #for index in new_token_len_dist.index:
-        new_sent_len_dist.iat[i] = round(new_sent_len_dist.iat[i]/len(sent_lens), 2) #index-1 bc the index starts counting from zero, the word lengths not
-
-    #print(new_sent_len_dist)
-    # plot using matplotlib and seaborn
-
-    # set figure, ax into variables
-    fig, ax = plt.subplots(figsize=(10,10))
-
-    # call function for bar (value) labels
-    #addlabels(x=new_sent_len_dist.index, y=new_sent_len_dist.values)
+    # calculate the standard deviation and mean
+    standard_deviation_sent = statistics.stdev(sent_lens)
+    mean_sent = statistics.mean(sent_lens)
 
-    plt.title(curve_title)
-    ax.set_xlabel("Sentence Length")
-    ax.set_ylabel("Percentage of Occurence")
-
-
-    sns.lineplot(x=new_sent_len_dist.index, y=new_sent_len_dist.values, ax=ax, palette="crest")
-    #plt.xticks(rotation=30) !!! very useful for words
-    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_sent_len_long.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
+    return sent_len_dist, sent_len_dist_short, standard_deviation_sent, mean_sent
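+
+# Rough usage sketch (illustrative; see calculate_sent_len_distribution below):
+#   sent_len_dist, sent_len_dist_short, sent_std, sent_mean = calculate_sent_len_dist(text)
+#   both distributions are pandas Series indexed by sentence length (in clean tokens),
+#   holding relative frequencies; the short one keeps only the 25 most common lengths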
 
-    # plot the 40 most frequent sentence lenghts as a barplot for a more detailed insight
-    sent_len_dist_short = FreqDist(sent_lens).most_common(25)
+class StylometryMetrics:
 
-    # convert to FreqDist object to a pandas series for easier processing
-    sent_len_dist_short_panda = pd.Series(dict(sent_len_dist_short))
+    def __init__(self, directory_path, name_of_work, quality="", fanfiction=True):
+        self.text = read_works_into_string(directory_path)
+        self.clean_tokens = tokenize_and_clean_text(self.text)
+        self.name = name_of_work
+        self.fanfiction = fanfiction
+        self.quality = quality # good medium bad
 
-    # sort, normalise and round the panda series
+    def determine_titles(self, plot_topic):
+        if self.fanfiction:
+            plt_title = f"{plot_topic} for the {self.name} {self.quality} Fanfiction"
+        else:
+            plt_title = f"{plot_topic} for the {self.name} Canon"
+        return plt_title
 
-    new_sent_len_dist_short = sent_len_dist_short_panda.sort_index()
-    #print(new_sent_len_dist)
+    def calculate_standardised_ttr(self):
+        self.sttr = standardised_type_token_ratio(self.clean_tokens)
 
-    for i in range(0, len(new_sent_len_dist_short.index)):
-        #for index in new_token_len_dist.index:
-        new_sent_len_dist_short.iat[i] = round(new_sent_len_dist_short.iat[i]/len(sent_lens), 2) #index-1 bc the index starts counting from zero, the word lengths not
-
-    # set figure, ax into variables
-    fig, ax = plt.subplots(figsize=(10,10))
+    def calculate_mendenhall_token_metrics(self):
+        self.tk_len_dist, self.tk_len_std, self.tk_len_mean = mendenhall_token_metrics(self.clean_tokens)
 
-    # call function for bar (value) labels
-    addlabels(x=new_sent_len_dist_short.index, y=new_sent_len_dist_short.values)
+    def plot_token_metrics(self, file_path_for_pic):
+        plt_title = self.determine_titles(plot_topic="Token Frequency Distribution")
+        plot_distribution(x=self.tk_len_dist, y=self.tk_len_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Token Length", y_label="Percentage of Occurrence")
 
-    plt.title(curve_title)
-    ax.set_xlabel("Sentence Length")
-    ax.set_ylabel("Percentage of Occurence")
+    def calculate_pos_tag_distribution(self):
+        self.tag_freq_dist, self.punct_tag_freq_dist, self.md_freq_dist = pos_tag_freq(self.clean_tokens)
 
-    sns.barplot(x=new_sent_len_dist_short.index, y=new_sent_len_dist_short.values, ax=ax, palette="YlGnBu")
-    #plt.xticks(rotation=30) !!! very useful for words
-    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_sent_len_short.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
+    def calculate_sent_len_distribution(self):
+        self.sent_len_dist, self.sent_len_dist_short, self.sent_std_dev, self.sent_mean = calculate_sent_len_dist(self.text)
 
-    # calculate the standard deviation, mean, token/type ratio
-    standard_deviation_sent = statistics.stdev(sent_lens)
-    mean_sent = statistics.mean(sent_lens)
+    def plot_long_sent_len_dist(self, file_path_for_pic):
+        plt_title = self.determine_titles(plot_topic="Full Sentence Length Distribution")
+        plot_distribution(x=self.sent_len_dist, y=self.sent_len_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Sentence Lengths", y_label="Percentage of Occurrence", plt_type="lineplot")
+
+    def plot_short_sent_len_dist(self, file_path_for_pic):
+        plt_title = self.determine_titles(plot_topic="Short Sentence Length Distribution")
+        plot_distribution(x=self.sent_len_dist_short, y=self.sent_len_dist_short, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Sentence Lengths", y_label="Percentage of Occurrence")
 
-    return standard_deviation_sent, mean_sent
+    def plot_pos_tag_freq(self, file_path_for_pic):
+        plt_title = self.determine_titles(plot_topic="POS Tag Frequencies")
+        plot_distribution(x=self.tag_freq_dist, y=self.tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="POS Tags", y_label="Percentage of Occurrence")
+
+    def plot_md_freq(self, file_path_for_pic):
+        plt_title = self.determine_titles(plot_topic="Modal Verb Frequencies")
+        plot_distribution(x=self.md_freq_dist, y=self.md_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Modal Verbs", y_label="Percentage of Occurrence")
 
     def plot_punct_freq(self, file_path_for_pic):
+        plt_title = self.determine_titles(plot_topic="Punctuation Frequencies")
+        plot_distribution(x=self.punct_tag_freq_dist, y=self.punct_tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Types of Punctuation", y_label="Percentage of Occurrence")
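+
+# Typical usage of the class (mirrors the __main__ block at the bottom of this file):
+#   metrics = StylometryMetrics("grishaverse/data/canon_works", "Grishaverse", fanfiction=False)
+#   metrics.calculate_pos_tag_distribution()
+#   metrics.plot_md_freq("grishaverse/plots/canon/md_freq.png")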
 
 # overall pos_tag frequency distribution
 # pos_tag ngrams; (maybe exclude stopwords?)
@@ -358,7 +311,6 @@ def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
 
 # most frequent adjectives
 
-
 #create the Mendenhall Curve for the Throne of Glass Series
 #std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
@@ -410,35 +362,39 @@ def run_functions(directory_path):
 
 #grishaverse/data/split_txt_fanfics
 
-
-#create lists for each of the columns of the dataframe we'll create
-
-mean_tokens = [mean_tokens_tog_canon, mean_tokens_grishaverse_canon]
-std_dev_tokens = [std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon]
-type_token_ratio = [type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon]
-mean_sent = [mean_sent_tog_canon, mean_sent_grishaverse_canon]
-std_dev_sents = [std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon]
-index = ["throne_of_glass_canon", "grishaverse_canon"]
-
-# create a dataframe to store all the overview statistics in
-# columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
-# mean_sent; std_dev_sent; freq_sent_len ....
-# tag_frequencies
-# tag_ngram_frequencies
-# punctuation frequencies
-# token/type ratio
-
-data_overview = pd.DataFrame(
-    {"mean_tokens":mean_tokens,
-    "std_dev_tokens":std_dev_tokens,
-    "type_token_ratio":type_token_ratio,
-    "mean_sent":mean_sent,
-    "std_dev_sent":std_dev_sents},
-    index = index
-)
+def create_dataframe_with_overview_info():
+    # create lists for each of the columns of the dataframe we'll create
+    # (assumes the *_canon summary variables have been computed at module level)
+    mean_tokens = [mean_tokens_tog_canon, mean_tokens_grishaverse_canon]
+    std_dev_tokens = [std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon]
+    type_token_ratio = [type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon]
+    mean_sent = [mean_sent_tog_canon, mean_sent_grishaverse_canon]
+    std_dev_sents = [std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon]
+    index = ["throne_of_glass_canon", "grishaverse_canon"]
+
+    # create a dataframe to store all the overview statistics in
+    # columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
+    # mean_sent; std_dev_sent; freq_sent_len ....
+    # tag_frequencies
+    # tag_ngram_frequencies
+    # punctuation frequencies
+    # token/type ratio
+    data_overview = pd.DataFrame(
+        {"mean_tokens":mean_tokens,
+        "std_dev_tokens":std_dev_tokens,
+        "type_token_ratio":type_token_ratio,
+        "mean_sent":mean_sent,
+        "std_dev_sent":std_dev_sents},
+        index = index
+    )
+    return data_overview
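+
+# Usage sketch (illustrative; the commented-out runs above would have to fill in the
+# module-level summary variables first):
+#   data_overview = create_dataframe_with_overview_info()
+#   data_overview.to_csv("data_overview/data_overview.csv")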
 
 if __name__ == "__main__":
     #run_functions("grishaverse/data/split_txt_fanfics")
     #run_functions("throne_of_glass/data/split_txt_fanfics")
-    #data_overview.to_csv(f"data_overview/data_overview.csv")
\ No newline at end of file
+    #data_overview.to_csv(f"data_overview/data_overview.csv")
+    GrishaverseCanon = StylometryMetrics(directory_path="grishaverse/data/canon_works", name_of_work="Grishaverse", fanfiction=False)
+    GrishaverseCanon.calculate_pos_tag_distribution()
+    GrishaverseCanon.plot_md_freq("grishaverse/plots/canon/md_freq.png")
\ No newline at end of file
diff --git a/grishaverse/plots/canon/md_freq.png b/grishaverse/plots/canon/md_freq.png
new file mode 100644
index 0000000000000000000000000000000000000000..d6b8a952e9ec3065183aa7cc57a9f9b2e27819a8
Binary files /dev/null and b/grishaverse/plots/canon/md_freq.png differ
diff --git a/grishaverse/plots/canon/pos_tag_freq.png b/grishaverse/plots/canon/pos_tag_freq.png
new file mode 100644
index 0000000000000000000000000000000000000000..6bc40eaa6d8bc2ceb02badb48baddba14740201e
Binary files /dev/null and b/grishaverse/plots/canon/pos_tag_freq.png differ
diff --git a/grishaverse/plots/filler.txt b/grishaverse/plots/filler.txt
new file mode 100644
index 0000000000000000000000000000000000000000..20281f31fb55ef758453dadef59db1b886b36f18
--- /dev/null
+++ b/grishaverse/plots/filler.txt
@@ -0,0 +1 @@
+alki is stupid
\ No newline at end of file
diff --git a/grishaverse/plots/medium/filler.txt b/grishaverse/plots/medium/filler.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9cefcd4aea6e7f0fe549d64d322177e97ca21513
--- /dev/null
+++ b/grishaverse/plots/medium/filler.txt
@@ -0,0 +1 @@
+me is good for filling stuffs
\ No newline at end of file