diff --git a/data_overview/data_overview.png b/data_overview/data_overview.png
index 7afd43d04e64eb69d5dd14aae0c4c7c068325c93..b95f4204f49ebf3f6a981278ee54bf115d53520d 100644
Binary files a/data_overview/data_overview.png and b/data_overview/data_overview.png differ
diff --git a/data_overview/internal_fanfic_metrics.png b/data_overview/internal_fanfic_metrics.png
new file mode 100644
index 0000000000000000000000000000000000000000..7668db0423cfa8218ed643273cf9f7a6c3b17d76
Binary files /dev/null and b/data_overview/internal_fanfic_metrics.png differ
diff --git a/data_overview/singular_and_then_averaged_over_grishaverse_fanfic_metrics.csv b/data_overview/singular_and_then_averaged_over_grishaverse_fanfic_metrics.csv
new file mode 100644
index 0000000000000000000000000000000000000000..92a96ab9715ed9771437c9ce5d1277f2e54590d8
--- /dev/null
+++ b/data_overview/singular_and_then_averaged_over_grishaverse_fanfic_metrics.csv
@@ -0,0 +1,3 @@
+,good_mean_tks,bad_mean_tks,medium_mean_tks,good_std_dev_tks,bad_std_dev_tks,medium_std_dev_tks,good_ttrs,bad_ttrs,medium_ttrs,good_mean_sts,bad_mean_sts,medium_mean_sts,good_std_dev_sts,bad_std_dev_sts,medium_std_dev_sts
+mean,4.140289952930083,4.154665851096502,4.119264102863776,2.0910767253049865,2.0866604932525625,2.0540022678155148,0.5345067272238934,0.5155143859596709,0.5227749657351375,13.620579300977804,13.511268770843815,13.772946678683581,8.689111002392622,8.198019897846702,8.449796302593901
+standard deviation,0.11405170037213595,0.2688458429473845,0.1383546205572587,0.11337704274860502,0.2330295882628002,0.10478244527410657,0.0310030576193801,0.052961321228198396,0.031734463567397224,3.2088570715550557,4.522934129320814,3.582074898221741,3.086223875092333,3.5259506534982767,3.1792547488596856
diff --git a/data_visualisation.py b/data_visualisation.py
index f6aea3fba502c664011e89ddb773e2d0160c1659..8a25702b5844142e37184886564365f7bad5f55d 100644
--- a/data_visualisation.py
+++ b/data_visualisation.py
@@ -41,4 +41,8 @@
 data_pairplot.savefig(r"project\pictures_general\data_pairplot.png")
 
 data_overview_styled = data_overview.style.background_gradient(cmap=cm)
-dfi.export(data_overview_styled, "data_overview/data_overview.png", table_conversion = "matplotlib")
\ No newline at end of file
+dfi.export(data_overview_styled, "data_overview/data_overview.png", table_conversion = "matplotlib")
+
+singular_and_averaged_over_fanfics = pd.read_csv("data_overview/singular_and_then_averaged_over_grishaverse_fanfic_metrics.csv", index_col=0)
+saaof = singular_and_averaged_over_fanfics.style.background_gradient(cmap=cm)
+dfi.export(saaof, "data_overview/internal_fanfic_metrics.png", table_conversion = "matplotlib")
\ No newline at end of file
diff --git a/fanfic_internal_metrics.py b/fanfic_internal_metrics.py
index 63c06702123979c6c55a2984a4ecdc58ffd42c67..a441e04ac33681db890e5772a9e4e01db649bf3d 100644
--- a/fanfic_internal_metrics.py
+++ b/fanfic_internal_metrics.py
@@ -4,21 +4,23 @@
 from nltk.tokenize import sent_tokenize
 import pandas as pd
 import statistics
 import re
-
+import numpy as np
+import math
 
 # you'll have to also download "punkt" from nltk
 
-# by subdiving the text into segments of 1000, it calculates the type token ratio for each segment and then averages over them
-# this ensures a comparability of the type token ratios for varying text sizes
+# by subdividing the text into segments of 500, it calculates the type-token ratio for each segment and then averages over them
+# this keeps the type-token ratios comparable across varying text sizes
+#too_short_segments_for_ttr = 0
 def standardised_type_token_ratio(tokens):
     ttrs = []
     segment_tokens = []
     segment = 0
     for token in tokens:
-        if segment < 1000:
+        if segment < 500: # smaller segments: with segment = 1000, 1703 texts were too short to segment; with 500 only 639 are
             segment_tokens.append(token)
             segment += 1
-        elif segment == 1000:
+        elif segment == 500:
             types = set(segment_tokens)
             ttr = len(types)/len(segment_tokens)
             ttrs.append(ttr)
@@ -27,7 +29,7 @@
     if len(ttrs) <= 1:
         types = set(tokens)
         std_ttr = len(types)/len(tokens)
-        print("Warning: Text was too short for segmentation!")
+        print("Warning: text too short for segmented TTR")
     else:
         std_ttr = statistics.mean(ttrs)
     return std_ttr
@@ -84,12 +86,23 @@ def mendenhall_curve(corpus):
     # create the distribution of token lengths / Mendenhall curve
     token_lengths = [len(token) for token in short_clean_tokens]
+
+    # trim 0.5% of the token lengths (0.25% from each tail) to remove outliers: even after preprocessing,
+    # some clearly wrong token lengths remain, which skews the metrics and distorts the p-values later on
+    trim_percent = 0.005
+    trim_len = int(len(token_lengths) * trim_percent / 2)
+    token_lengths = sorted(token_lengths)[trim_len:len(token_lengths) - trim_len]
 
     # calculate the standard deviation, mean, token/type ratio
-    standard_deviation = statistics.stdev(token_lengths)
-    mean = statistics.mean(token_lengths)
+    if len(token_lengths) >= 3:
+        standard_deviation = statistics.stdev(token_lengths)
+        mean = statistics.mean(token_lengths)
 
-    type_token_ratio = standardised_type_token_ratio(short_clean_tokens)
+        type_token_ratio = standardised_type_token_ratio(short_clean_tokens)
+    else:
+        standard_deviation = math.nan
+        mean = math.nan
+        type_token_ratio = math.nan
 
     return standard_deviation, mean, type_token_ratio
 
@@ -100,16 +113,26 @@ def sentence_metrics(corpus):
     sent_lens = []
     for sent in sents:
         short_clean_tokens = tokenize_and_clean_text(sent)
         sent_lens.append(len(short_clean_tokens))
+
+    trim_percent = 0.05
+    trim_len = int(len(sent_lens) * trim_percent / 2)
+    sent_lens = sorted(sent_lens)[trim_len:len(sent_lens) - trim_len]
 
     # calculate the standard deviation, mean
-    standard_deviation_sent = statistics.stdev(sent_lens)
-    mean_sent = statistics.mean(sent_lens)
+    if len(sent_lens) >= 3: # exclude texts that, due to tagging errors, have fewer than three sentences; otherwise the standard deviation is undefined
+        standard_deviation_sent = statistics.stdev(sent_lens)
+        mean_sent = statistics.mean(sent_lens)
 
-    return standard_deviation_sent, mean_sent
+        return standard_deviation_sent, mean_sent
+    else:
+        standard_deviation_sent = math.nan
+        mean_sent = math.nan
+        return standard_deviation_sent, mean_sent
 
-def run_functions(directory_path):
+def run_functions(series):
     good_mean_tks = []
     bad_mean_tks = []
     medium_mean_tks = []
@@ ... @@
     few_kudos = 100
     medium_kudos = 1500
 
-    for index, body in grisha_fanfics["body"]:
+    for index, body in enumerate(grisha_fanfics["body"]):
         published = pd.to_datetime(grisha_fanfics["published"][index])
         if published.year != 2023:
@@ ... @@
             if kudos <= few_kudos:
                 std_dev_tk, mean_tk, ttr = mendenhall_curve(body)
-                std_dev_st, mean_st = sentence_metrics(body)
-                bad_mean_tks.append(mean_tk)
-                bad_std_dev_tks.append(std_dev_tk)
-                bad_ttrs.append(ttr)
                 bad_mean_tks.append(mean_tk)
                 bad_std_dev_tks.append(std_dev_tk)
                 bad_ttrs.append(ttr)
+
+                std_dev_st, mean_st = sentence_metrics(body)
+                bad_mean_sts.append(mean_st)
+                bad_std_dev_sts.append(std_dev_st)
+
             elif kudos <= medium_kudos:
                 std_dev_tk, mean_tk, ttr = mendenhall_curve(body)
@@ -169,42 +193,53 @@
             print(f"Missing kudos value for row {index}")
 
-    lists = []
+    lists = [
+        good_mean_tks,
+        bad_mean_tks,
+        medium_mean_tks,
+        good_std_dev_tks,
+        bad_std_dev_tks,
+        medium_std_dev_tks,
+        good_ttrs,
+        bad_ttrs,
+        medium_ttrs,
+        good_mean_sts,
+        bad_mean_sts,
+        medium_mean_sts,
+        good_std_dev_sts,
+        bad_std_dev_sts,
+        medium_std_dev_sts,
+    ]
+
+    means = []
+    stds = []
+    for lst in lists:
+        means.append(np.nanmean(lst))
+        stds.append(np.nanstd(lst))
+
+    # create a DataFrame with the means and standard deviations, one column per metric
+    singular_and_then_averaged_over_fanfic_metrics = pd.DataFrame(
+        {'good_mean_tks': [means[0], stds[0]],
+         'bad_mean_tks': [means[1], stds[1]],
+         'medium_mean_tks': [means[2], stds[2]],
+         'good_std_dev_tks': [means[3], stds[3]],
+         'bad_std_dev_tks': [means[4], stds[4]],
+         'medium_std_dev_tks': [means[5], stds[5]],
+         'good_ttrs': [means[6], stds[6]],
+         'bad_ttrs': [means[7], stds[7]],
+         'medium_ttrs': [means[8], stds[8]],
+         'good_mean_sts': [means[9], stds[9]],
+         'bad_mean_sts': [means[10], stds[10]],
+         'medium_mean_sts': [means[11], stds[11]],
+         'good_std_dev_sts': [means[12], stds[12]],
+         'bad_std_dev_sts': [means[13], stds[13]],
+         'medium_std_dev_sts': [means[14], stds[14]]},
+        index=['mean', 'standard deviation'])
+    singular_and_then_averaged_over_fanfic_metrics.to_csv(f"data_overview/singular_and_then_averaged_over_{series}_fanfic_metrics.csv")
 
-    for list in lists:
-
-
-        #idx.append(grisha_fanfics["work_id"][index])
 
 grisha_fanfics = pd.read_csv("grishaverse/data/fanfics/grishaverse_fics.csv") #grishaverse/data/split_txt_fanfics
 
-#create lists for each of the columns of the dataframe we'll create
-
-mean_tokens = []
-std_dev_tokens = []
-type_token_ratio = []
-mean_sent = []
-std_dev_tokens = []
-index = []
-
-
-# create a dataframe to store all the overview statistics in
-# columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
-# mean_sent; std_dev_sent; freq_sent_len ....
-# tag_frequencies
-# tag_ngram_frequencies
-# punctuation frequencies
-# token/type ratio
-
-data_overview = pd.DataFrame(
-    {"mean_tokens":mean_tokens,
-    "std_dev_tokens":std_dev_tokens,
-    "type_token_ratio":type_token_ratio,
-    "mean_sent":mean_sent,
-    "std_dev_tokens":std_dev_tokens},
-    index = index
-)
-
-data_overview.to_csv(f"data_overview/data_overview.csv")
+run_functions("grishaverse")
\ No newline at end of file
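
Reviewer note: a minimal standalone sketch of the two techniques this patch combines, the segmented ("standardised") type-token ratio and symmetric trimming with NaN fallbacks. The function names, defaults, and toy data below are illustrative assumptions, not code from the repository.

import math
import statistics

def segmented_ttr(tokens, segment_size=500):
    # average the type-token ratio over fixed-size segments so texts of
    # different lengths stay comparable; fall back to a plain TTR (with a
    # warning) when the text yields at most one full segment
    segments = [tokens[i:i + segment_size] for i in range(0, len(tokens), segment_size)]
    ttrs = [len(set(seg)) / len(seg) for seg in segments if len(seg) == segment_size]
    if len(ttrs) <= 1:
        print("Warning: text too short for segmented TTR")
        return len(set(tokens)) / len(tokens)
    return statistics.mean(ttrs)

def trimmed_mean_std(values, trim_fraction=0.005):
    # drop trim_fraction of the values (half from each tail) before computing
    # the mean and sample standard deviation; NaN when too few values remain
    trim = int(len(values) * trim_fraction / 2)
    trimmed = sorted(values)[trim:len(values) - trim]
    if len(trimmed) < 3:
        return math.nan, math.nan
    return statistics.mean(trimmed), statistics.stdev(trimmed)

tokens = ("the quick brown fox jumps over the lazy dog " * 120).split()
print(segmented_ttr(tokens))                       # two full 500-token segments
print(trimmed_mean_std([len(t) for t in tokens]))  # trimmed length statistics

Note the slice upper bound is written as len(values) - trim rather than -trim: when trim rounds down to 0, [trim:-trim] becomes [0:0] and silently empties the list, turning every short text into NaN. The trimming lines in the patch above use the len(...) - trim_len form for the same reason.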
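The 15-key dict literal in run_functions can also be built programmatically from the metric names. A hypothetical condensed equivalent, with invented toy data, is:

import numpy as np
import pandas as pd

# toy per-category metric lists; NaNs mark texts that were too short to score
metric_lists = {
    "good_mean_tks": [4.1, 4.3, float("nan")],
    "bad_mean_tks": [4.0, 4.2, 4.5],
}
# one column per metric; rows hold the nan-aware mean and std over all texts
summary = pd.DataFrame(
    {name: [np.nanmean(vals), np.nanstd(vals)] for name, vals in metric_lists.items()},
    index=["mean", "standard deviation"],
)
print(summary)

One caveat: np.nanstd defaults to the population standard deviation (ddof=0), whereas statistics.stdev, used per text above, is the sample standard deviation; pass ddof=1 to np.nanstd if the two should match.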