Commit b9aad95f authored by Lea Kyveli Chrysanthopoulou

Clean up POS Tags

parent d1028a35
@@ -123,7 +123,7 @@ def mendenhall_token_metrics(tokens):
 
     return new_token_len_dist, standard_deviation, mean
 
-def plot_distribution(x, y, plt_title, file_path_for_pic:str, x_label="Number of Kudos", y_label="Percentage of Occurence", palette="flare", plt_type="barplot", add_labels=True):
+def plot_distribution(x, y, plt_title, file_path_for_pic:str, x_label="Number of Kudos", y_label="Percentage of Occurence", palette="flare", plt_type="barplot", add_labels=True, rotate_ticks=True):
 
     plt.figure(figsize=(10,10))
     plt.title(plt_title)
@@ -133,45 +133,115 @@ def plot_distribution(x, y, plt_title, file_path_for_pic:str, x_label="Number of
 
     if add_labels:
         addlabels(x=x.index, y=y.values)
 
-    match case:
-        sns.scatterplot(x=x.index, y=y.values, palette=palette)
-        #plt.xticks(new_dist.index[::100], new_dist.index[::100])
-    else:
-        sns.lineplot(x=x.index, y=y.values, palette=palette)
+    match plt_type:
+        case "scatterplot":
+            sns.scatterplot(x=x.index, y=y.values, palette=palette)
+        case "lineplot":
+            sns.lineplot(x=x.index, y=y.values, palette=palette)
+        case "barplot":
+            sns.barplot(x=x.index, y=y.values, palette=palette)
+        case "histplot":
+            sns.histplot(x=x.index, y=y.values, palette=palette)
+        case _:
+            print(f"{plt_type} is not a valid format for this function")
+
+    if rotate_ticks:
+        plt.xticks(rotation=30) # !!! very useful for words
 
     plt.savefig(file_path_for_pic)
     plt.close()
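For orientation, a usage sketch of the reworked plotter (not part of the commit; the importable module name stylometry_metrics is an assumption). Callers in this file pass the same pandas Series as both x and y, since only x.index and y.values are read:

# Hedged usage sketch; assumes this script is importable as stylometry_metrics
# (hypothetical name) and that pandas/seaborn/matplotlib are installed.
import pandas as pd
from stylometry_metrics import plot_distribution  # hypothetical module name

toy_dist = pd.Series({1: 0.05, 2: 0.17, 3: 0.21, 4: 0.18, 5: 0.12})
plot_distribution(
    x=toy_dist, y=toy_dist,             # same Series twice: only x.index / y.values are used
    plt_title="Toy Token Length Distribution",
    file_path_for_pic="toy_token_len.png",  # illustrative output path
    x_label="Token Length",
    y_label="Percentage of Occurence",
    plt_type="barplot",
    rotate_ticks=False,                 # the flag this commit introduces
)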
 
-    # plot using matplotlib and seaborn
-    # set figure, ax into variables
-    fig, ax = plt.subplots(figsize=(10,10))
-
-    # call function for bar (value) labels
-    addlabels(x=new_token_len_dist.index, y=new_token_len_dist.values)
-
-    plt.title(curve_title)
-    ax.set_xlabel("Word Length")
-    ax.set_ylabel("Percentage of Occurence")
-
-    sns.barplot(x=new_token_len_dist.index, y=new_token_len_dist.values, ax=ax, palette="flare")
-    #plt.xticks(rotation=30) !!! very useful for words
-    plt.savefig(plot_destination)
-    #token_length_freq_dist_plot = token_length_distribution.plot(title=curve_title, percents=True)
-    #fig_freq_dist = token_length_freq_dist_plot.get_figure()
-    #fig_freq_dist.savefig(plot_destination)
+def pos_tag_freq(tokens):
+    #nltk.pos_tag(text) --> [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
+    #('completely', 'RB'), ('different', 'JJ')]
+    tag_token_tuples = pos_tag(tokens)
+    punctuation_regex = r"[^\w\s]+"
+    summarised_tags = []
+    punctuation_tags = []
+    index = 0
+    for token, tag in tag_token_tuples:
+        if re.match(punctuation_regex, token):
+            summarised_tags.append("punctuation")
+            if re.match(r"[\"\'“”’‘]+", token):
+                punctuation_tags.append("quotation_marks")
+            elif re.match(r"[,;:.?!-]+", token):
+                try:
+                    punctuation_tags.append(
+                        "ellipsis" if token == "." and tag_token_tuples[index+1][1] == "." and tag_token_tuples[index+2][1] == "."
+                        else "full_stop" if token == "."
+                        else "question_mark" if token == "?"
+                        else "exclamation_mark" if token == "!"
+                        else "comma" if token == ","
+                        else "semicolon" if token == ";"
+                        else "dash" if token == "-"
+                        else "other_punct")
+                except IndexError:  # lookahead ran past the last token: no ellipsis possible
+                    punctuation_tags.append(
+                        "full_stop" if token == "."
+                        else "question_mark" if token == "?"
+                        else "exclamation_mark" if token == "!"
+                        else "comma" if token == ","
+                        else "semicolon" if token == ";"
+                        else "dash" if token == "-"
+                        else "other_punct")
+        else:
+            if tag in ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
+                summarised_tags.append("verb")
+            elif tag in ["JJ", "JJR", "JJS"]:
+                summarised_tags.append("adjective")
+            elif tag in ["RB", "RBR", "RBS", "WRB"]:
+                summarised_tags.append("adverb")
+            elif tag in ["PRP", "PRP$", "WP", "WP$"]:
+                summarised_tags.append("pronoun")
+            elif tag in ["NNP", "NNPS"]:
+                summarised_tags.append("proper_noun")
+            elif tag in ["NN", "NNS"]:
+                summarised_tags.append("common_noun")
+            elif tag in ["DT", "PDT", "WDT"]:
+                summarised_tags.append("determiner")
+            elif tag == "CC":
+                summarised_tags.append("coordinating_conj")
+            elif tag == "IN":
+                summarised_tags.append("subordinating_conj")
+            elif tag in ["$", "CD", "EX", "LS", "POS", "SYM", "TO", "UH", "RP", "FW"]:
+                summarised_tags.append("other_tag")
+        index += 1
+
+    tag_freq_dist = FreqDist(summarised_tags)
+
+    # convert FreqDist object to a pandas series for easier processing
+    tag_freq_dist_panda = pd.Series(dict(tag_freq_dist))
+
+    # sort, normalise and round the panda series
+    new_tag_freq_dist = tag_freq_dist_panda.sort_index()
+    for i in range(0, len(new_tag_freq_dist.index)):
+        new_tag_freq_dist.iat[i] = round(new_tag_freq_dist.iat[i]/len(tag_token_tuples), 2)
+
+    # punctuation frequency distribution
+    punct_tag_freq_dist = FreqDist(punctuation_tags)
+
+    # convert FreqDist object to a pandas series for easier processing
+    punct_tag_freq_dist_panda = pd.Series(dict(punct_tag_freq_dist))
+
+    # sort, normalise and round the panda series
+    new_punct_tag_freq_dist = punct_tag_freq_dist_panda.sort_index()
+    for i in range(0, len(new_punct_tag_freq_dist.index)):
+        new_punct_tag_freq_dist.iat[i] = round(new_punct_tag_freq_dist.iat[i]/len(punctuation_tags), 3)
+
+    return new_tag_freq_dist, new_punct_tag_freq_dist
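A quick smoke test of the new helper (not part of the commit; assumes NLTK with the punkt and averaged_perceptron_tagger resources downloaded, and the hypothetical module name stylometry_metrics):

from nltk import word_tokenize
from stylometry_metrics import pos_tag_freq  # hypothetical module name

# ". . ." as three separate "." tokens exercises the ellipsis lookahead
tokens = word_tokenize('She ran quickly, and he followed . . . "Wait!" she said.')
tag_dist, punct_dist = pos_tag_freq(tokens)
print(tag_dist)    # pandas Series: verb/adverb/pronoun/... shares, rounded to 2 decimals
print(punct_dist)  # pandas Series: comma/ellipsis/exclamation_mark/quotation_marks shares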
#f"throne_of_glass/data/canon_works"
def extract_info_from_directory_path(directory_path):
#for txt_fic in os.listdir(directory_path):
works = os.listdir(directory_path)
pattern = r"^[a-zA-Z_]+(?=/)" # get series from directory path
match = re.search(pattern, directory_path)
if match:
series = match.group(0)
for work in works:
with open(f"{directory_path}"+f"/{work}", "r") as f:
f = f.read()
std_dev_tk, mean_tk, ttr = mendenhall_curve(f, f"Mendenhall Curve for the {series.replace('_' , ' ').title()} {work[:-4].replace('_' , ' ').title()}", f"{series}/freq_distribution/{work[:-4]}_token_len.png")
mean_tokens.append(mean_tk)
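The series-extraction regex only matches when the path begins with the series directory; a self-contained check of that behavior:

import re

# The lookahead (?=/) requires a slash immediately after the leading series segment.
pattern = r"^[a-zA-Z_]+(?=/)"
print(re.search(pattern, "throne_of_glass/data/canon_works").group(0))  # throne_of_glass
print(re.search(pattern, "grishaverse/data/canon_works").group(0))      # grishaverse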
 
 class StylometryMetrics:
 
-    def __init__(self, directory_path):
+    def __init__(self, directory_path, name_of_work, quality="", fanfiction=True):
         self.text = read_works_into_string(directory_path)
         self.clean_tokens = tokenize_and_clean_text(self.text)
+        self.name = name_of_work
+        self.fanfiction = fanfiction
+        self.quality = quality # good medium bad
 
     def calculate_standardised_ttr(self):
         self.sttr = standardised_type_token_ratio(self.clean_tokens)
@@ -179,84 +249,20 @@ class StylometryMetrics:
 
     def calculate_mendenhall_token_metrics(self):
         self.tk_len_dist, self.tk_len_std, self.tk_len_mean = mendenhall_token_metrics(self.clean_tokens)
 
-    def plot
-
-# this function takes a corpus as its input and gives a Mendenhall curve, i.e. a frequency distribution of tokens as its output
-# precise input: corpus = string ;
-# curve_title = string, the title of the plot that will be produced, e.g., "Mendenhall Curve for Throne of Glass Series"
-# plot_destination = string, the (relative) path, including the file name and .png tag of the plot produced, e.g. f"throne_of_glass/freq_distribution/all_canon_token_len.png"
-def mendenhall_curve(corpus, curve_title, plot_destination):
-    short_clean_tokens = tokenize_and_clean_text(corpus)
-
-    # create the distribution of token lengths / Mendenhall curve
-    token_lengths = [len(token) for token in short_clean_tokens]
-
-    # Calculate the trimmed token length (with 5% trimming). We need to remove the outliers, bc even despite preprocessing,
-    # there still are some very wrong lengths, which entirely skews the metrics and also ruins our p-values later on
-    trim_percent = 0.005
-    trim_len = int(len(token_lengths) * trim_percent / 2)
-    token_lengths = sorted(token_lengths)[trim_len:-trim_len]
-
-    token_length_distribution = FreqDist(token_lengths).most_common(15)
-
-    # convert the FreqDist object to a pandas series for easier processing
-    token_len_dist_panda = pd.Series(dict(token_length_distribution))
-
-    # sort, normalise and round the panda series
-    new_token_len_dist = token_len_dist_panda.sort_index()
-    for i in range(0, len(new_token_len_dist.index)):
-        new_token_len_dist.iat[i] = round(new_token_len_dist.iat[i]/len(short_clean_tokens), 3) #index-1 bc the index starts counting from zero, the word lengths not
-
-    # plot using matplotlib and seaborn
-    # set figure, ax into variables
-    fig, ax = plt.subplots(figsize=(10,10))
-
-    # call function for bar (value) labels
-    addlabels(x=new_token_len_dist.index, y=new_token_len_dist.values)
-
-    plt.title(curve_title)
-    ax.set_xlabel("Word Length")
-    ax.set_ylabel("Percentage of Occurence")
-
-    sns.barplot(x=new_token_len_dist.index, y=new_token_len_dist.values, ax=ax, palette="flare")
-    #plt.xticks(rotation=30) !!! very useful for words
-    plt.savefig(plot_destination)
-
-    # calculate the standard deviation, mean, token/type ratio
-    standard_deviation = statistics.stdev(token_lengths)
-    mean = statistics.mean(token_lengths)
-    type_token_ratio = standardised_type_token_ratio(short_clean_tokens)
-
-    return standard_deviation, mean, type_token_ratio
+    def plot_token_metrics(self, file_path_for_pic):
+        plt_title = self.name + " " + ((self.quality + " ") if self.fanfiction else "") + ("Fanfiction" if self.fanfiction else "Canon") + " Token Frequency Distribution"
+        plot_distribution(x=self.tk_len_dist, y=self.tk_len_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Token Length", y_label="Percentage of Occurence")
+
+    def calculate_pos_tag_distribution(self):
+        self.tag_freq_dist, self.punct_tag_freq_dist = pos_tag_freq(self.clean_tokens)
+
+    def plot_pos_tag_freq(self, file_path_for_pic):
+        plt_title = "POS Tag Frequencies for the " + self.name + " " + ((self.quality + " ") if self.fanfiction else "") + ("Fanfiction" if self.fanfiction else "Canon")
+        plot_distribution(x=self.tag_freq_dist, y=self.tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="POS Tags", y_label="Percentage of Occurence")
+
+    def plot_punct_freq(self, file_path_for_pic):
+        plt_title = "Punctuation Frequencies for the " + self.name + " " + ((self.quality + " ") if self.fanfiction else "") + ("Fanfiction" if self.fanfiction else "Canon")
+        plot_distribution(x=self.punct_tag_freq_dist, y=self.punct_tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Types of Punctuation", y_label="Percentage of Occurence")
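Taken together, the class now wires those pieces into a per-corpus workflow. A hedged end-to-end sketch (not part of the commit; the module name and the output file names are assumptions, while the input directory mirrors paths used elsewhere in this file):

from stylometry_metrics import StylometryMetrics  # hypothetical module name

m = StylometryMetrics("grishaverse/data/split_txt_fanfics", "Grishaverse", quality="good", fanfiction=True)
m.calculate_standardised_ttr()
m.calculate_mendenhall_token_metrics()
m.plot_token_metrics("grishaverse/freq_distribution/good_fanfic_token_len.png")   # illustrative path
m.calculate_pos_tag_distribution()
m.plot_pos_tag_freq("grishaverse/freq_distribution/good_fanfic_pos_tags.png")     # illustrative path
m.plot_punct_freq("grishaverse/freq_distribution/good_fanfic_punct.png")          # illustrative path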
 def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
@@ -351,167 +357,26 @@ def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
 
 # most frequent words for specific tags --> punctuation;
 # most frequent adjectives
 
-def pos_tag_frequencies(corpus, series, canon_or_fanfic):
-    #nltk.pos_tag(text) --> [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
-    #('completely', 'RB'), ('different', 'JJ')]
-    tokens = word_tokenize(corpus)
-    """
-    short_tokens = []
-    for token in tokens:
-        dehyphenated_token = []
-        letter_present = 0
-        dehyphenated = 0
-        second_word_in_compound = 0
-        for c in token:
-            if c.isalpha() == True:
-                dehyphenated_token.append(c)
-                letter_present = 1
-                if dehyphenated == 1:
-                    second_word_in_compound = 1
-            elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
-                #bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
-                #high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
-                #relatively speaking
-                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
-                short_tokens.append(dehyphenated_token_joined)
-                short_tokens.append(c) #append the hyphen/ other punctuation --> we're also interested in that
-                dehyphenated_token = []
-                letter_present = 0
-                dehyphenated = 1
-                second_word_in_compound = 0
-        if letter_present == 1 and dehyphenated == 0:
-            short_tokens.append(token) #catching the tokens that didn't have any special characters; but not the dehyphenated ones twice
-        elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
-            short_tokens.append(''.join(map(str, dehyphenated_token)))
-    """
-    tag_token_tuples = pos_tag(tokens)
-    punctuation_regex = r"[^\w\s]+"
-    summarised_tags = []
-    punctuation_tags = []
-    index = 0
-    for token, tag in tag_token_tuples:
-        if re.match(punctuation_regex, token):
-            summarised_tags.append("punctuation")
-            if re.match(r"[\"\'“”’‘]+", token):
-                punctuation_tags.append("quotation_marks")
-            elif re.match(r"[,;:.?!-]+", token):
-                try:
-                    punctuation_tags.append("ellipsis" if token == "." and tag_token_tuples[index+1][1] == "." and tag_token_tuples[index+2][1] == "." else "full_stop" if token == "." else "question_mark" if token == "?" else "exclamation_mark" if token == "!" else "comma" if token == "," else "semicolon" if token == ";" else "dash" if token == "-" else "other_punct")
-                except:
-                    punctuation_tags.append("full_stop" if token == "." else "question_mark" if token == "?" else "exclamation_mark" if token == "!" else "comma" if token == "," else "semicolon" if token == ";" else "dash" if token == "-" else "other_punct")
-        else:
-            if tag in ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
-                summarised_tags.append("verb")
-            elif tag in ["JJ", "JJR", "JJS"]:
-                summarised_tags.append("adjective")
-            elif tag in ["RB", "RBR", "RBS", "WRB"]:
-                summarised_tags.append("adverb")
-            elif tag in ["PRP", "PRP$", "WP", "WP$"]:
-                summarised_tags.append("pronoun")
-            elif tag in ["NNP", "NNPS"]:
-                summarised_tags.append("proper_noun")
-            elif tag in ["NN", "NNS"]:
-                summarised_tags.append("common_noun")
-            elif tag in ["DT", "PDT", "WDT"]:
-                summarised_tags.append("determiner")
-            elif tag == "CC":
-                summarised_tags.append("coordinating_conj")
-            elif tag == "IN":
-                summarised_tags.append("subordinating_conj")
-            elif tag in ["$", "CD", "EX", "LS", "POS", "SYM", "TO", "UH", "RP", "FW"]:
-                summarised_tags.append("other_tag")
-        index += 1
-
-    tag_freq_dist = FreqDist(summarised_tags)
-
-    # convert FreqDist object to a pandas series for easier processing
-    tag_freq_dist_panda = pd.Series(dict(tag_freq_dist))
-
-    # sort, normalise and round the panda series
-    new_tag_freq_dist = tag_freq_dist_panda.sort_index()
-    for i in range(0, len(new_tag_freq_dist.index)):
-        new_tag_freq_dist.iat[i] = round(new_tag_freq_dist.iat[i]/len(tag_token_tuples), 2)
-    print(new_tag_freq_dist)
-
-    # set figure, ax into variables
-    fig, ax = plt.subplots(figsize=(10,10))
-
-    # call function for bar (value) labels
-    addlabels(x=new_tag_freq_dist.index, y=new_tag_freq_dist.values)
-
-    plt.title(f"POS Tag Frequencies for the {series.replace('_' , ' ').title()} {canon_or_fanfic.replace('_' , ' ').title()}")
-    ax.set_xlabel("POS Tags")
-    ax.set_ylabel("Percentage of Occurence")
-
-    sns.barplot(x=new_tag_freq_dist.index, y=new_tag_freq_dist.values, ax=ax, palette="RdPu")
-    plt.xticks(rotation=30) # !!! very useful for words
-    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_pos_tag_frequencies.png")
-
-    # punctuation frequency distribution
-    punct_tag_freq_dist = FreqDist(punctuation_tags)
-
-    # convert FreqDist object to a pandas series for easier processing
-    punct_tag_freq_dist_panda = pd.Series(dict(punct_tag_freq_dist))
-
-    # sort, normalise and round the panda series
-    new_punct_tag_freq_dist = punct_tag_freq_dist_panda.sort_index()
-    for i in range(0, len(new_punct_tag_freq_dist.index)):
-        new_punct_tag_freq_dist.iat[i] = round(new_punct_tag_freq_dist.iat[i]/len(punctuation_tags), 3)
-
-    # set figure, ax into variables
-    fig, ax = plt.subplots(figsize=(10,10))
-
-    # call function for bar (value) labels
-    addlabels(x=new_punct_tag_freq_dist.index, y=new_punct_tag_freq_dist.values)
-
-    plt.title(f"Punctuation Frequencies for the {series.replace('_' , ' ').title()} {canon_or_fanfic.replace('_' , ' ').title()}")
-    ax.set_xlabel("Types of Punctuation")
-    ax.set_ylabel("Percentage of Occurence")
-
-    sns.barplot(x=new_punct_tag_freq_dist.index, y=new_punct_tag_freq_dist.values, ax=ax, palette="OrRd")
-    plt.xticks(rotation=30) # !!! very useful for words
-    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_punctuation_frequencies.png")
 
 #create the Mendenhall Curve for the Throne of Glass Series
-std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
+#std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
 
 #create the Mendenhall Curve for the Grishaverse Books
-std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
+#std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
 
 # Mendenhall Curve Sentence Lengths for Throne of Glass Canon
-std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Throne of Glass Series", "throne_of_glass", "canon")
+#std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Throne of Glass Series", "throne_of_glass", "canon")
 
 # Mendenhall Curve Sentence Lengths for Grishaverse Canon
-std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Grishaverse Books", "grishaverse", "canon")
+#std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Grishaverse Books", "grishaverse", "canon")
 
 # POS Tag frequencies for TOG
-pos_tag_frequencies(read_works_into_string(f"throne_of_glass/data/canon_works"), "throne_of_glass", "canon")
+#pos_tag_frequencies(read_works_into_string(f"throne_of_glass/data/canon_works"), "throne_of_glass", "canon")
 
 # POS Tag frequencies for Grishaverse
-pos_tag_frequencies(read_works_into_string(f"grishaverse/data/canon_works"), "grishaverse", "canon")
+#pos_tag_frequencies(read_works_into_string(f"grishaverse/data/canon_works"), "grishaverse", "canon")
 
 def run_functions(directory_path):
     """
@@ -574,6 +439,6 @@ data_overview = pd.DataFrame(
 
 if __name__ == "__main__":
-    run_functions("grishaverse/data/split_txt_fanfics")
-    run_functions("throne_of_glass/data/split_txt_fanfics")
-    data_overview.to_csv(f"data_overview/data_overview.csv")
+    #run_functions("grishaverse/data/split_txt_fanfics")
+    #run_functions("throne_of_glass/data/split_txt_fanfics")
+    #data_overview.to_csv(f"data_overview/data_overview.csv")
\ No newline at end of file