Commit 15403abf authored by chrysanthopoulou

Add mediocre punctuation workaround

parent 5d267e22
Changed images:
  • grishaverse/freq_distribution/canon_pos_tag_frequencies.png (52.7 KiB)
  • grishaverse/freq_distribution/canon_punctuation_frequencies.png (33.2 KiB)
  • grishaverse/freq_distribution/canon_sent_len_long.png (38.1 KiB → 37.1 KiB)
  • grishaverse/freq_distribution/canon_sent_len_short.png (34.8 KiB → 33.8 KiB)
@@ -5,6 +5,7 @@ import os
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag
import pandas as pd
import statistics
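# Note: word_tokenize, sent_tokenize and pos_tag assume the corresponding NLTK
# data packages are installed locally; if they are missing, a one-time download
# along these lines is typically needed (package names may vary by NLTK version):
#
#   import nltk
#   nltk.download("punkt")                       # tokenizer models
#   nltk.download("averaged_perceptron_tagger")  # default English POS tagger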
@@ -77,14 +78,9 @@ def standardised_type_token_ratio(tokens):
return std_ttr
# this function takes a corpus as its input and gives a Mendenhall curve, i.e. a frequency distribution of tokens as its output
# precise input: corpus = string ;
# curve_title = string, the title of the plot that will be produced, e.g., "Mendenhall Curve for Throne of Glass Series"
# plot_destination = string, the (relative) path, including the file name and .png extension, of the plot produced, e.g. f"throne_of_glass/freq_distribution/all_canon_token_len.png"
def tokenize_and_clean_text(text):
def mendenhall_curve(corpus, curve_title, plot_destination):
tokens = word_tokenize(corpus)
tokens = word_tokenize(text)
cleaned_tokens = ([token for token in tokens if any(c.isalpha() for c in token)])
short_clean_tokens = [] # when looking at the results, there were some strange token lengths, because somewhere in the data conversion hyphens
# had been added in the wrong places. I had the tokens with very large lengths printed and they had this format, e.g. "everywhere—assassin"
@@ -116,8 +112,20 @@ def mendenhall_curve(corpus, curve_title, plot_destination):
short_clean_tokens.append(token) #catching the tokens that didn't have any special characters; but not the dehyphenated ones twice
elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
short_clean_tokens.append(''.join(map(str, dehyphenated_token)))
return short_clean_tokens
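# The per-character loop in tokenize_and_clean_text() boils down to keeping only
# the alphabetic runs of each token, so that "everywhere—assassin" counts as two
# tokens rather than one 19-character token. A minimal regex sketch of the same
# idea (hypothetical helper; edge cases may differ slightly from the loop above):
import re

def split_into_alphabetic_runs(tokens):
    # e.g. ["everywhere—assassin", "red-blue"] -> ["everywhere", "assassin", "red", "blue"]
    cleaned = []
    for token in tokens:
        cleaned.extend(re.findall(r"[^\W\d_]+", token))
    return cleaned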
# this function takes a corpus as its input and gives a Mendenhall curve, i.e. a frequency distribution of tokens as its output
# precise input: corpus = string ;
# curve_title = string, the title of the plot that will be produced, e.g., "Mendenhall Curve for Throne of Glass Series"
# plot_destination = string, the (relative) path, including the file name and .png extension, of the plot produced, e.g. f"throne_of_glass/freq_distribution/all_canon_token_len.png"
def mendenhall_curve(corpus, curve_title, plot_destination):
short_clean_tokens = tokenize_and_clean_text(corpus)
# create the distribution of token lengths / Mendenhall curve
token_lengths = [len(token) for token in short_clean_tokens]
@@ -169,46 +177,11 @@ def mendenhall_curve(corpus, curve_title, plot_destination):
def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
sents = sent_tokenize(corpus)
sent_lens = []
for sent in sents:
#print(sent)
tokens = word_tokenize(sent)
#print(tokens)
cleaned_tokens = ([token for token in tokens if any(c.isalpha() for c in token)])
#print(cleaned_tokens)
short_clean_tokens = [] # when looking at the results, there were some strange token lengths, because somewhere in the data conversion hyphens
# had been added in the wrong places. I had the tokens with very large lengths printed and they had this format, e.g. "everywhere—assassin"
# and were counted as a single token, in this instance 19 characters long, with others up to 45 characters long: "walking-as-fast-as-they-could-without-running"
for token in cleaned_tokens:
dehyphenated_token = []
letter_present = 0
dehyphenated = 0
second_word_in_compound = 0
for c in token:
if c.isalpha() == True:
dehyphenated_token.append(c)
letter_present = 1
if dehyphenated == 1:
second_word_in_compound = 1
elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
#bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
# high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
# relatively speaking
dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
#print(dehyphenated_token_joined)
short_clean_tokens.append(dehyphenated_token_joined)
dehyphenated_token = []
letter_present = 0
dehyphenated = 1
second_word_in_compound = 0
if letter_present == 1 and dehyphenated == 0:
short_clean_tokens.append(token) #catching the tokens that didn't have any special characters; but not the dehyphenated ones twice
elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
short_clean_tokens.append(''.join(map(str, dehyphenated_token)))
#print(short_clean_tokens)
#print(len(short_clean_tokens))
short_clean_tokens = tokenize_and_clean_text(sent)
sent_lens.append(len(short_clean_tokens))
#if len(short_clean_tokens)>= 90:
#print(f"This sentence: \n {sent} \n is this long: {len(short_clean_tokens)}")
@@ -261,7 +234,6 @@ def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
for i in range(0, len(new_sent_len_dist_short.index)):
#for index in new_token_len_dist.index:
new_sent_len_dist_short.iat[i] = round(new_sent_len_dist_short.iat[i]/len(sent_lens), 2) # normalise each sentence-length count to a proportion of all sentences and round to two decimals
# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))
@@ -284,20 +256,209 @@ def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
return standard_deviation_sent, mean_sent
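# sentence_metrics() above and pos_tag_frequencies() below both repeat the same
# "FreqDist -> pandas Series -> sort -> normalise -> round" steps. A possible
# shared helper, sketched here under a hypothetical name:
def normalised_freq_series(observations):
    # turn a list of observations into proportions, sorted by index and
    # rounded to two decimals
    freq_dist = FreqDist(observations)
    series = pd.Series(dict(freq_dist)).sort_index()
    return (series / len(observations)).round(2)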
def most_frequent_words(corpus, curve_title, series, canon_or_fanfic):
pass # placeholder so the stub parses; not implemented yet
# overall pos_tag frequency distribution
# pos_tag ngrams; (maybe exclude stopwords?)
# tag collocates for specific tags --> adjectives most frequently with nouns
# most frequent words
# most frequent words for specific tags --> punctuation;
# most frequent adjectives
def pos_tag_frequencies(corpus, curve_title, series, canon_or_fanfic):
#nltk.pos_tag(text) --> [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
#('completely', 'RB'), ('different', 'JJ')]
tokens = word_tokenize(corpus)
short_tokens = []
for token in tokens:
dehyphenated_token = []
letter_present = 0
dehyphenated = 0
second_word_in_compound = 0
for c in token:
if c.isalpha() == True:
dehyphenated_token.append(c)
letter_present = 1
if dehyphenated == 1:
second_word_in_compound = 1
elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
#bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
# high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
# relatively speaking
dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
#print(dehyphenated_token_joined)
short_tokens.append(dehyphenated_token_joined)
short_tokens.append(c) #append the hyphen/ other punctuation --> we're also interested in that
dehyphenated_token = []
letter_present = 0
dehyphenated = 1
second_word_in_compound = 0
if letter_present == 1 and dehyphenated == 0:
short_tokens.append(token) #catching the tokens that didn't have any special characters; but not the dehyphenated ones twice
elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
short_tokens.append(''.join(map(str, dehyphenated_token)))
tag_token_tuples = pos_tag(short_tokens)
"""
#coordinating_conjunction = [] # tag: CC
#subordinating_conjunction = [] # tag: IN
#determiner = [] # tag: DT; PDT; WDT
#noun = [] # tags: NN, NNS
#proper_noun = [] # tags: NNP; NNPS
#pronoun = [] # tags: PRP, PRP$; WP; WP$
#adverb = [] # RB; RBR; RBS; WRB
#verb = [] # tags: MD; VB; VBD; VBG; VBN; VBP; VBZ
#adjective = [] #tags: JJ; JJR; JJS
other_tags = [] # tags: $; CD; EX; LS; POS; SYM; TO; UH; RP; FW
#punctuation
quotation_marks = [] # tags: '', ``
comma = [] # tags: ,
dash = [] # tags: --
sentence_terminator = [] # tag: .
parentheses = [] # tags: (; )
semicolon = [] # for tag : --> token: ;
ellipsis_punct = [] # for tag : --> token: ...
"""
punctuation_tags = []
summarised_tags = []
for tuple in tag_token_tuples:
if tuple[1] in ["MD" , "VB" , "VBD", "VBG" , "VBN" , "VBP" , "VBZ"]:
summarised_tags.append("verb")
elif tuple[1] in ["JJ" , "JJR" , "JJS"]:
summarised_tags.append("adjective")
elif tuple[1] in ["RB" , "RBR" , "RBS" , "WRB"]:
summarised_tags.append("adverb")
elif tuple[1] in ["PRP" , "PRP$" , "WP" , "WP$"]:
summarised_tags.append("pronoun")
elif tuple[1] in ["NNP" , "NNPS"]:
summarised_tags.append("proper_noun")
elif tuple[1] in ["NN" , "NNS"]:
summarised_tags.append("common_noun")
elif tuple[1] in [ "DT" , "PDT" , "WDT"]:
summarised_tags.append("determiner")
elif tuple[1] == "CC":
summarised_tags.append("coordinating_conj")
elif tuple[1] == "IN":
summarised_tags.append("subordinating_conj")
elif tuple[1] in ["$" , "CD" , "EX" , "LS" , "POS" , "SYM" , "TO" , "UH" , "RP" , "FW"]:
summarised_tags.append("other_tag")
# now comes the punctuation
elif tuple[1] in [ "''" , "``"]:
summarised_tags.append("punctuation")
punctuation_tags.append("quotation_marks")
elif tuple[1] == ",":
summarised_tags.append("punctuation")
punctuation_tags.append("comma")
elif tuple[1] == ".":
summarised_tags.append("punctuation")
punctuation_tags.append("sentence_terminator")
elif tuple[1] in ["(" , ")"]:
summarised_tags.append("punctuation")
punctuation_tags.append("parentheses")
elif tuple[1] == "--":
summarised_tags.append("punctuation")
punctuation_tags.append("dash")
elif tuple[1] == ":":
summarised_tags.append("punctuation")
if tuple[0] == ";":
punctuation_tags.append("semicolon")
elif tuple[0] == "...":
punctuation_tags.append("ellipsis")
elif tuple[0] == ":":
punctuation_tags.append("colon")
else:
summarised_tags.append(tuple[1])
tag_freq_dist = FreqDist(summarised_tags)
#print(tag_freq_dist)
# convert FreqDist object to a pandas series for easier processing
tag_freq_dist_panda = pd.Series(dict(tag_freq_dist))
print(tag_freq_dist_panda)
# sort, normalise and round the panda series
new_tag_freq_dist = tag_freq_dist_panda.sort_index()
#print(new_sent_len_dist)
for i in range(0, len(new_tag_freq_dist.index)):
#for index in new_token_len_dist.index:
new_tag_freq_dist.iat[i] = round(new_tag_freq_dist.iat[i]/len(tag_token_tuples), 2) # normalise each tag count to a proportion of all tagged tokens and round to two decimals
print(new_tag_freq_dist)
# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))
# call function for bar (value) labels
addlabels(x=new_tag_freq_dist.index, y=new_tag_freq_dist.values)
plt.title(curve_title)
ax.set_xlabel("POS Tags")
ax.set_ylabel("Percentage of Occurence")
sns.barplot(x=new_tag_freq_dist.index, y=new_tag_freq_dist.values, ax=ax, palette="flare")
plt.xticks(rotation=30) # !!! very useful for words
plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_pos_tag_frequencies.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
#punctuation frequency distribution
punct_tag_freq_dist = FreqDist(punctuation_tags)
#print(tag_freq_dist)
# convert FreqDist object to a pandas series for easier processing
punct_tag_freq_dist_panda = pd.Series(dict(punct_tag_freq_dist))
print(punct_tag_freq_dist_panda)
# sort, normalise and round the panda series
new_punct_tag_freq_dist = punct_tag_freq_dist_panda.sort_index()
#print(new_sent_len_dist)
for i in range(0, len(new_punct_tag_freq_dist.index)):
#for index in new_token_len_dist.index:
new_punct_tag_freq_dist.iat[i] = round(new_punct_tag_freq_dist.iat[i]/len(punctuation_tags), 2) # normalise each count to a proportion of all punctuation tokens and round to two decimals
print(new_punct_tag_freq_dist)
# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))
# call function for bar (value) labels
addlabels(x=new_punct_tag_freq_dist.index, y=new_punct_tag_freq_dist.values)
plt.title(curve_title)
ax.set_xlabel("Types of Punctuation")
ax.set_ylabel("Percentage of Occurence")
sns.barplot(x=new_punct_tag_freq_dist.index, y=new_punct_tag_freq_dist.values, ax=ax, palette="flare")
plt.xticks(rotation=30) # !!! very useful for words
plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_punctuation_frequencies.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
#create the Mendenhall Curve for the Throne of Glass Series
std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
#std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
#create the Mendenhall Curve for the Grishaverse Books
std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
#std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
# Mendenhall Curve Sentence Lengths for Throne of Glass Canon
std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Throne of Glass Series", "throne_of_glass", "canon")
#std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Throne of Glass Series", "throne_of_glass", "canon")
# Mendenhall Curve Sentence Lengths for Grishaverse Canon
std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Grishaverse Books", "grishaverse", "canon")
#std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Grishaverse Books", "grishaverse", "canon")
# POS Tag frequencies for TOG
#pos_tag_frequencies(read_works_into_string(f"throne_of_glass/data/canon_works"), "POS Tag Frequencies for the Throne of Glass Series", "throne_of_glass", "canon")
# POS Tag frequencies for Grishaverse
pos_tag_frequencies(read_works_into_string(f"grishaverse/data/canon_works"), "POS Tag Frequencies for the Grishaverse Books", "grishaverse", "canon")
# create a dataframe to store all the overview statistics in
@@ -307,7 +468,7 @@ std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(r
# tag_ngram_frequencies
# punctuation frequencies
# token/type ratio
"""
data_overview = pd.DataFrame(
{"mean_tokens":[mean_tokens_tog_canon, mean_tokens_grishaverse_canon],
"std_dev":[std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon],
@@ -317,4 +478,6 @@ data_overview = pd.DataFrame(
index= ["throne_of_glass_canon", "grishaverse_canon"]
)
data_overview.to_csv(f"data_overview/data_overview.csv")
\ No newline at end of file
data_overview.to_csv(f"data_overview/data_overview.csv")
"""
\ No newline at end of file
Changed images:
  • throne_of_glass/freq_distribution/canon_pos_tag_frequencies.png (67.9 KiB)
  • throne_of_glass/freq_distribution/canon_punctuation_frequencies.png (33.8 KiB)