Commit ecc66a90 authored by chrysanthopoulou

Add standardised type token ratio

parent 4349e916
data_overview/data_overview.csv:

,mean_tokens,std_dev,type_token_ratio
throne_of_glass_canon,4.9282660487582,2.033132292363114,0.5932999999999999
grishaverse_canon,5.046888277930518,2.20148396165822,0.6335263157894737

@@ -54,6 +54,33 @@ def read_works_into_string(directory_path):
# curve_title = string, the title of the plot that will be produced, e.g., "Mendenhall Curve for Throne of Glass Series"
# plot_destination = string, the (relative) path, including the file name and .png extension, of the plot produced, e.g., f"throne_of_glass/freq_distribution/all_canon_token_len.png"

# by subdividing the text into segments of 1,000 tokens, this calculates the
# type token ratio for each segment and then averages over them;
# this keeps the type token ratios comparable across varying text sizes
def standardised_type_token_ratio(tokens):
    ttrs = []
    segment_tokens = []
    segment = 0
    for token in tokens:
        segment_tokens.append(token)
        segment += 1
        if segment == 1000:
            # segment is full: record its type token ratio and start a new one
            types = set(segment_tokens)
            ttrs.append(len(types) / len(segment_tokens))
            segment_tokens = []
            segment = 0
    if len(ttrs) <= 1:
        # fall back to the plain type token ratio for texts shorter than two full segments
        types = set(tokens)
        std_ttr = len(types) / len(tokens)
        print("Warning: Text was too short for segmentation!")
    else:
        std_ttr = statistics.mean(ttrs)
    return std_ttr
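
# a quick, hypothetical sanity check for standardised_type_token_ratio
# (toy data, not from the corpora analysed below): a highly repetitive
# "text" of 3,000 tokens yields three full segments, each containing the
# same 5 types, so the standardised ratio comes out at 5/1000 = 0.005
#   toy_tokens = ["the", "cat", "sat", "on", "the", "mat"] * 500
#   standardised_type_token_ratio(toy_tokens)  # -> 0.005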

def mendenhall_curve(corpus, curve_title, plot_destination):
    tokens = word_tokenize(corpus)

@@ -120,8 +147,35 @@ def mendenhall_curve(corpus, curve_title, plot_destination):
    #fig_freq_dist = token_length_freq_dist_plot.get_figure()
    #fig_freq_dist.savefig(plot_destination)

    # calculate the standard deviation, mean, and type token ratio
    standard_deviation = statistics.stdev(token_lengths)
    mean = statistics.mean(token_lengths)

    # to get the number of unique tokens, i.e., types, the token list used to
    # be converted to a set; that plain ratio is kept below for reference, but
    # has been replaced by the standardised version, which stays comparable
    # across texts of different lengths
    #type_token_ratio = len(set(short_clean_tokens))/len(short_clean_tokens)
    type_token_ratio = standardised_type_token_ratio(short_clean_tokens)

    return standard_deviation, mean, type_token_ratio
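
# for reference, a toy illustration of the summary statistics used above
# (hypothetical values, not computed from the corpora):
#   statistics.mean([3, 5, 4, 6])   # -> 4.5
#   statistics.stdev([3, 5, 4, 6])  # -> 1.2909944487358056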

# create the Mendenhall Curve for the Throne of Glass series
std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")

# create the Mendenhall Curve for the Grishaverse books
std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
# create a dataframe to store all the overview statistics in
# columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
# mean_sent; std_dev_sent; freq_sent_len ....
# tag_frequencies
# tag_ngram_frequencies
# punctuation frequencies
# token/type ratio
data_overview = pd.DataFrame(
    {"mean_tokens": [mean_tokens_tog_canon, mean_tokens_grishaverse_canon],
     "std_dev": [std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon],
     "type_token_ratio": [type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon]},
    index=["throne_of_glass_canon", "grishaverse_canon"])
data_overview.to_csv(f"data_overview/data_overview.csv")
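
# hypothetical sanity check (not part of the original script): the overview
# table can be read back with its row labels restored via
#   pd.read_csv("data_overview/data_overview.csv", index_col=0)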