Commit 8e10c2a3 authored by chrysanthopoulou

Add internal metrics

parent 6b5f0c38
data_overview/data_overview.png (107 KiB): updated image (binary diff not shown)
data_overview/internal_fanfic_metrics.png (52.9 KiB): new image

data_overview/singular_and_then_averaged_over_grishaverse_fanfic_metrics.csv (new file):
,good_mean_tks,bad_mean_tks,medium_mean_tks,good_std_dev_tks,bad_std_dev_tks,medium_std_dev_tks,good_ttrs,bad_ttrs,medium_ttrs,good_mean_sts,bad_mean_sts,medium_mean_sts,good_std_dev_sts,bad_std_dev_sts,medium_std_dev_sts
mean,4.140289952930083,4.154665851096502,4.119264102863776,2.0910767253049865,2.0866604932525625,2.0540022678155148,0.5345067272238934,0.5155143859596709,0.5227749657351375,13.620579300977804,13.511268770843815,13.772946678683581,8.689111002392622,8.198019897846702,8.449796302593901
standard deviation,0.11405170037213595,0.2688458429473845,0.1383546205572587,0.11337704274860502,0.2330295882628002,0.10478244527410657,0.0310030576193801,0.052961321228198396,0.031734463567397224,3.2088570715550557,4.522934129320814,3.582074898221741,3.086223875092333,3.5259506534982767,3.1792547488596856
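Each column pairs one of the three kudos tiers used in the script below (good, bad, medium) with one metric: the tks columns come from the token-level mendenhall_curve metrics, the sts columns from the sentence-level metrics, and ttrs is the standardised type token ratio; the two rows are the mean and standard deviation of each metric across fics. For a quick, readable view of this wide CSV, something like the following works (a display-only sketch; the transpose and rounding are purely for inspection):

import pandas as pd

# Transpose so each metric becomes a row, then round for readability.
metrics = pd.read_csv(
    "data_overview/singular_and_then_averaged_over_grishaverse_fanfic_metrics.csv",
    index_col=0,
)
print(metrics.T.round(3))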
@@ -41,4 +41,8 @@ data_pairplot.savefig(r"project\pictures_general\data_pairplot.png")
 data_overview_styled = data_overview.style.background_gradient(cmap=cm)
-dfi.export(data_overview_styled, "data_overview/data_overview.png", table_conversion = "matplotlib")
\ No newline at end of file
+dfi.export(data_overview_styled, "data_overview/data_overview.png", table_conversion = "matplotlib")
+singular_and_averaged_over_fanfics = pd.DataFrame(pd.read_csv("data_overview/singular_and_then_averaged_over_grishaverse_fanfic_metrics.csv"))
+saaof = singular_and_averaged_over_fanfics.style.background_gradient(cmap=cm)
+dfi.export(saaof, "data_overview/internal_fanfic_metrics.png", table_conversion = "matplotlib")
\ No newline at end of file
@@ -4,21 +4,23 @@ from nltk.tokenize import sent_tokenize
 import pandas as pd
 import statistics
 import re
+import numpy as np
+import math

 # you'll have to also download "punkt" from nltk

 # by subdividing the text into segments, it calculates the type token ratio for each segment and then averages over them
 # this ensures comparability of the type token ratios for varying text sizes

 #too_short_segments_for_ttr = 0

 def standardised_type_token_ratio(tokens):
     ttrs = []
     segment_tokens = []
     segment = 0
     for token in tokens:
-        if segment < 1000:
+        if segment < 500: # much smaller segments, since with segment = 1000, 1703 texts were too short, no longer very meaningful; with 500 it's 639
             segment_tokens.append(token)
             segment += 1
-        elif segment == 1000:
+        elif segment == 500:
             types = set(segment_tokens)
             ttr = len(types)/len(segment_tokens)
             ttrs.append(ttr)
@@ -27,7 +29,7 @@ def standardised_type_token_ratio(tokens):
     if len(ttrs) <= 1:
         types = set(tokens)
         std_ttr = len(types)/len(tokens)
-        print("Warning: Text was too short for segmentation!")
+        print("Warning: Too short segment for ttr")
     else:
         std_ttr = statistics.mean(ttrs)
     return std_ttr
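For readers unfamiliar with the standardised type token ratio: a plain TTR (unique tokens divided by total tokens) falls as texts get longer, so the TTR of a 500-word ficlet and a 100k-word longfic are not directly comparable; averaging per-segment TTRs over fixed-size segments removes that length effect. A minimal sketch of the same idea (not the repository's exact implementation; the segment size and the short-text fallback mirror the function above):

import statistics

def plain_ttr(tokens):
    return len(set(tokens)) / len(tokens)

def sttr_sketch(tokens, segment_size=500):
    # split the token stream into full segments of segment_size tokens
    segments = [tokens[i:i + segment_size] for i in range(0, len(tokens), segment_size)]
    segments = [seg for seg in segments if len(seg) == segment_size]
    if len(segments) <= 1:
        # too short to segment: fall back to the plain TTR, as in the script
        return plain_ttr(tokens)
    return statistics.mean(plain_ttr(seg) for seg in segments)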
@@ -84,12 +86,23 @@ def mendenhall_curve(corpus):
     # create the distribution of token lengths / Mendenhall curve
     token_lengths = [len(token) for token in short_clean_tokens]

+    # Calculate the trimmed token lengths (trim_percent = 0.005, i.e. 0.5% of the values, split between both ends).
+    # We need to remove the outliers, because even after preprocessing there are still some very wrong lengths,
+    # which entirely skew the metrics and also ruin our p-values later on.
+    trim_percent = 0.005
+    trim_len = int(len(token_lengths) * trim_percent / 2)
+    token_lengths = sorted(token_lengths)[trim_len:-trim_len]

     # calculate the standard deviation, mean, token/type ratio
-    standard_deviation = statistics.stdev(token_lengths)
-    mean = statistics.mean(token_lengths)
-    type_token_ratio = standardised_type_token_ratio(short_clean_tokens)
+    if len(token_lengths) >= 3:
+        standard_deviation = statistics.stdev(token_lengths)
+        mean = statistics.mean(token_lengths)
+        type_token_ratio = standardised_type_token_ratio(short_clean_tokens)
+    else:
+        standard_deviation = math.nan
+        mean = math.nan
+        type_token_ratio = math.nan

     return standard_deviation, mean, type_token_ratio
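To make the trimming concrete, here is the arithmetic on a made-up corpus size (hypothetical numbers, not taken from the data): with trim_percent = 0.005, half of that fraction is cut from each end of the sorted lengths.

token_lengths = list(range(10_000))                      # hypothetical token lengths
trim_percent = 0.005
trim_len = int(len(token_lengths) * trim_percent / 2)    # int(10000 * 0.005 / 2) = 25
trimmed = sorted(token_lengths)[trim_len:-trim_len]      # drop the 25 smallest and 25 largest values
assert len(trimmed) == 9_950                             # 0.5% of the values removed in total

One caveat baked into this approach: for lists shorter than 400 values, trim_len comes out as 0 and the slice [0:-0] evaluates to an empty list, which is exactly the kind of case the len(token_lengths) >= 3 guard above then maps to NaN.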
@@ -100,16 +113,26 @@ def sentence_metrics(corpus):
     sent_lens = []
     for sent in sents:
         short_clean_tokens = tokenize_and_clean_text(sent)
         #if len(short_clean_tokens) >= 3: #exclude texts that due to tagging errors have less than three sentences, otherwise the standard dev won't work
         sent_lens.append(len(short_clean_tokens))

+    trim_percent = 0.05
+    trim_len = int(len(sent_lens) * trim_percent / 2)
+    sent_lens = sorted(sent_lens)[trim_len:-trim_len]

     # calculate the standard deviation, mean
-    standard_deviation_sent = statistics.stdev(sent_lens)
-    mean_sent = statistics.mean(sent_lens)
-    return standard_deviation_sent, mean_sent
+    if len(sent_lens) >= 3: # exclude texts that due to tagging errors have fewer than three sentences, otherwise the standard deviation won't work
+        standard_deviation_sent = statistics.stdev(sent_lens)
+        mean_sent = statistics.mean(sent_lens)
+        return standard_deviation_sent, mean_sent
+    else:
+        standard_deviation_sent = math.nan
+        mean_sent = math.nan
+        return standard_deviation_sent, mean_sent


-def run_functions(directory_path):
+def run_functions(series):
     good_mean_tks = []
     bad_mean_tks = []
     medium_mean_tks = []
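The sentence splitting in sentence_metrics relies on NLTK's sent_tokenize, which is why the comment near the imports says to also download "punkt"; a minimal sketch of that setup (the sample text is made up):

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")  # one-time download of the sentence tokenizer model

sample = "Alina crossed the Fold. The volcra never saw her coming."
print(sent_tokenize(sample))
# ['Alina crossed the Fold.', 'The volcra never saw her coming.']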
@@ -130,7 +153,7 @@ def run_functions(directory_path):
     few_kudos = 100
     medium_kudos = 1500

-    for index, body in grisha_fanfics["body"]:
+    for index, body in enumerate(grisha_fanfics["body"]):
         published = pd.to_datetime(grisha_fanfics["published"][index])
         if published.year != 2023:
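The enumerate change fixes a real bug: iterating a pandas Series directly yields only its values, not (index, value) pairs, so the old loop would have tried to unpack each fic body string into two variables. A small sketch of the difference (toy Series, not the fanfic data); Series.items() would be an equivalent alternative to enumerate here, as long as the index is the default RangeIndex:

import pandas as pd

bodies = pd.Series(["first fic text", "second fic text"])

# Old pattern: iterates over the values only, so this raises
# "ValueError: too many values to unpack" (each body is a single string).
# for index, body in bodies:
#     ...

# Fixed pattern: enumerate supplies a positional index alongside each value.
for index, body in enumerate(bodies):
    print(index, body[:10])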
@@ -139,13 +162,14 @@
         if kudos <= few_kudos:
             std_dev_tk, mean_tk, ttr = mendenhall_curve(body)
-            std_dev_st, mean_st = sentence_metrics(body)
-            bad_mean_tks.append(mean_tk)
-            bad_std_dev_tks.append(std_dev_tk)
-            bad_ttrs.append(ttr)
+            bad_mean_tks.append(mean_tk)
+            bad_std_dev_tks.append(std_dev_tk)
+            bad_ttrs.append(ttr)
+
+            std_dev_st, mean_st = sentence_metrics(body)
             bad_mean_sts.append(mean_st)
             bad_std_dev_sts.append(std_dev_st)
         elif kudos <= medium_kudos:
             std_dev_tk, mean_tk, ttr = mendenhall_curve(body)
@@ -169,42 +193,53 @@ def run_functions(directory_path):
             print(f"Missing kudos value for row {index}")

-    lists = []
-    #idx = []
+    lists = [
+        good_mean_tks,
+        bad_mean_tks,
+        medium_mean_tks,
+        good_std_dev_tks,
+        bad_std_dev_tks,
+        medium_std_dev_tks,
+        good_ttrs,
+        bad_ttrs,
+        medium_ttrs,
+        good_mean_sts,
+        bad_mean_sts,
+        medium_mean_sts,
+        good_std_dev_sts,
+        bad_std_dev_sts,
+        medium_std_dev_sts,
+    ]
+
+    means = []
+    stds = []
+    for lst in lists:
+        means.append(np.nanmean(lst))
+        stds.append(np.nanstd(lst))
+
+    # Create DataFrame with means and standard deviations
+    singular_and_then_averaged_over_fanfic_metrics = pd.DataFrame(
+        {'good_mean_tks': [means[0], stds[0]],
+         'bad_mean_tks': [means[1], stds[1]],
+         'medium_mean_tks': [means[2], stds[2]],
+         'good_std_dev_tks': [means[3], stds[3]],
+         'bad_std_dev_tks': [means[4], stds[4]],
+         'medium_std_dev_tks': [means[5], stds[5]],
+         'good_ttrs': [means[6], stds[6]],
+         'bad_ttrs': [means[7], stds[7]],
+         'medium_ttrs': [means[8], stds[8]],
+         'good_mean_sts': [means[9], stds[9]],
+         'bad_mean_sts': [means[10], stds[10]],
+         'medium_mean_sts': [means[11], stds[11]],
+         'good_std_dev_sts': [means[12], stds[12]],
+         'bad_std_dev_sts': [means[13], stds[13]],
+         'medium_std_dev_sts': [means[14], stds[14]]},
+        index=['mean', 'standard deviation'])
+    singular_and_then_averaged_over_fanfic_metrics.to_csv(f"data_overview/singular_and_then_averaged_over_{series}_fanfic_metrics.csv")
-    for list in lists:
-        #idx.append(grisha_fanfics["work_id"][index])
 grisha_fanfics = pd.read_csv("grishaverse/data/fanfics/grishaverse_fics.csv")
 #grishaverse/data/split_txt_fanfics

 # create lists for each of the columns of the dataframe we'll create
 mean_tokens = []
 std_dev_tokens = []
 type_token_ratio = []
 mean_sent = []
 std_dev_sent = []
 index = []

 # create a dataframe to store all the overview statistics in
 # columns: mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
 # mean_sent; std_dev_sent; freq_sent_len ...;
 # tag_frequencies
 # tag_ngram_frequencies
 # punctuation frequencies
 # token/type ratio
 data_overview = pd.DataFrame(
     {"mean_tokens": mean_tokens,
      "std_dev_tokens": std_dev_tokens,
      "type_token_ratio": type_token_ratio,
      "mean_sent": mean_sent,
      "std_dev_sent": std_dev_sent},
     index=index
 )

 data_overview.to_csv(f"data_overview/data_overview.csv")

 run_functions("grishaverse")
\ No newline at end of file
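The switch to np.nanmean / np.nanstd matters because mendenhall_curve and sentence_metrics now return math.nan for fics that are too short: a single NaN would otherwise poison the whole group average. A small sketch of the difference (toy values, not actual metrics):

import math
import numpy as np

bad_mean_tks = [4.1, 4.3, math.nan, 4.0]   # hypothetical per-fic means, one fic was too short

print(np.mean(bad_mean_tks))      # nan: one missing value contaminates the plain mean
print(np.nanmean(bad_mean_tks))   # 4.133...: NaNs are ignored
print(np.nanstd(bad_mean_tks))    # standard deviation over the three valid values

Note that np.nanstd defaults to the population standard deviation (ddof=0), whereas statistics.stdev used inside the per-fic metrics is the sample standard deviation; passing ddof=1 to np.nanstd would make the two consistent.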