Commit 8e10c2a3 authored by chrysanthopoulou

Add internal metrics

parent 6b5f0c38
data_overview/data_overview.png (107 KiB): updated image (binary diff not shown)
data_overview/internal_fanfic_metrics.png (52.9 KiB): new image

data_overview/singular_and_then_averaged_over_grishaverse_fanfic_metrics.csv (new file):
,good_mean_tks,bad_mean_tks,medium_mean_tks,good_std_dev_tks,bad_std_dev_tks,medium_std_dev_tks,good_ttrs,bad_ttrs,medium_ttrs,good_mean_sts,bad_mean_sts,medium_mean_sts,good_std_dev_sts,bad_std_dev_sts,medium_std_dev_sts
mean,4.140289952930083,4.154665851096502,4.119264102863776,2.0910767253049865,2.0866604932525625,2.0540022678155148,0.5345067272238934,0.5155143859596709,0.5227749657351375,13.620579300977804,13.511268770843815,13.772946678683581,8.689111002392622,8.198019897846702,8.449796302593901
standard deviation,0.11405170037213595,0.2688458429473845,0.1383546205572587,0.11337704274860502,0.2330295882628002,0.10478244527410657,0.0310030576193801,0.052961321228198396,0.031734463567397224,3.2088570715550557,4.522934129320814,3.582074898221741,3.086223875092333,3.5259506534982767,3.1792547488596856
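Each column pairs one of the three kudos tiers used in the script below (good, bad, medium) with one metric: the tks columns come from the token-level mendenhall_curve metrics, the sts columns from the sentence-level metrics, and ttrs is the standardised type token ratio; the two rows are the mean and standard deviation of each metric across fics. For a quick, readable view of this wide CSV, something like the following works (a display-only sketch; the transpose and rounding are purely for inspection):

import pandas as pd

# Transpose so each metric becomes a row, then round for readability.
metrics = pd.read_csv(
    "data_overview/singular_and_then_averaged_over_grishaverse_fanfic_metrics.csv",
    index_col=0,
)
print(metrics.T.round(3))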
@@ -41,4 +41,8 @@ data_pairplot.savefig(r"project\pictures_general\data_pairplot.png")
 data_overview_styled = data_overview.style.background_gradient(cmap=cm)
-dfi.export(data_overview_styled, "data_overview/data_overview.png", table_conversion = "matplotlib")
\ No newline at end of file
+dfi.export(data_overview_styled, "data_overview/data_overview.png", table_conversion = "matplotlib")
+singular_and_averaged_over_fanfics = pd.DataFrame(pd.read_csv("data_overview/singular_and_then_averaged_over_grishaverse_fanfic_metrics.csv"))
+saaof = singular_and_averaged_over_fanfics.style.background_gradient(cmap=cm)
+dfi.export(saaof, "data_overview/internal_fanfic_metrics.png", table_conversion = "matplotlib")
\ No newline at end of file
@@ -4,21 +4,23 @@ from nltk.tokenize import sent_tokenize
 import pandas as pd
 import statistics
 import re
+import numpy as np
+import math

 # you'll have to also download "punkt" from nltk

 # by subdividing the text into segments, it calculates the type token ratio for each segment and then averages over them
 # this ensures comparability of the type token ratios for varying text sizes

 #too_short_segments_for_ttr = 0

 def standardised_type_token_ratio(tokens):
     ttrs = []
     segment_tokens = []
     segment = 0
     for token in tokens:
-        if segment < 1000:
+        if segment < 500: # much smaller segments, since with segment = 1000, 1703 texts were too short, no longer very meaningful; with 500 it's 639
             segment_tokens.append(token)
             segment += 1
-        elif segment == 1000:
+        elif segment == 500:
             types = set(segment_tokens)
             ttr = len(types)/len(segment_tokens)
             ttrs.append(ttr)
@@ -27,7 +29,7 @@ def standardised_type_token_ratio(tokens):
     if len(ttrs) <= 1:
         types = set(tokens)
         std_ttr = len(types)/len(tokens)
-        print("Warning: Text was too short for segmentation!")
+        print("Warning: Too short segment for ttr")
     else:
         std_ttr = statistics.mean(ttrs)
     return std_ttr
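For readers unfamiliar with the standardised type token ratio: a plain TTR (unique tokens divided by total tokens) falls as texts get longer, so the TTR of a 500-word ficlet and a 100k-word longfic are not directly comparable; averaging per-segment TTRs over fixed-size segments removes that length effect. A minimal sketch of the same idea (not the repository's exact implementation; the segment size and the short-text fallback mirror the function above):

import statistics

def plain_ttr(tokens):
    return len(set(tokens)) / len(tokens)

def sttr_sketch(tokens, segment_size=500):
    # split the token stream into full segments of segment_size tokens
    segments = [tokens[i:i + segment_size] for i in range(0, len(tokens), segment_size)]
    segments = [seg for seg in segments if len(seg) == segment_size]
    if len(segments) <= 1:
        # too short to segment: fall back to the plain TTR, as in the script
        return plain_ttr(tokens)
    return statistics.mean(plain_ttr(seg) for seg in segments)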
@@ -84,12 +86,23 @@ def mendenhall_curve(corpus):
     # create the distribution of token lengths / Mendenhall curve
     token_lengths = [len(token) for token in short_clean_tokens]

+    # Calculate the trimmed token lengths (trim_percent = 0.005, i.e. 0.5% of the values, split between both ends).
+    # We need to remove the outliers, because even after preprocessing there are still some very wrong lengths,
+    # which entirely skew the metrics and also ruin our p-values later on.
+    trim_percent = 0.005
+    trim_len = int(len(token_lengths) * trim_percent / 2)
+    token_lengths = sorted(token_lengths)[trim_len:-trim_len]

     # calculate the standard deviation, mean, token/type ratio
-    standard_deviation = statistics.stdev(token_lengths)
-    mean = statistics.mean(token_lengths)
-    type_token_ratio = standardised_type_token_ratio(short_clean_tokens)
+    if len(token_lengths) >= 3:
+        standard_deviation = statistics.stdev(token_lengths)
+        mean = statistics.mean(token_lengths)
+        type_token_ratio = standardised_type_token_ratio(short_clean_tokens)
+    else:
+        standard_deviation = math.nan
+        mean = math.nan
+        type_token_ratio = math.nan

     return standard_deviation, mean, type_token_ratio
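To make the trimming concrete, here is the arithmetic on a made-up corpus size (hypothetical numbers, not taken from the data): with trim_percent = 0.005, half of that fraction is cut from each end of the sorted lengths.

token_lengths = list(range(10_000))                      # hypothetical token lengths
trim_percent = 0.005
trim_len = int(len(token_lengths) * trim_percent / 2)    # int(10000 * 0.005 / 2) = 25
trimmed = sorted(token_lengths)[trim_len:-trim_len]      # drop the 25 smallest and 25 largest values
assert len(trimmed) == 9_950                             # 0.5% of the values removed in total

One caveat baked into this approach: for lists shorter than 400 values, trim_len comes out as 0 and the slice [0:-0] evaluates to an empty list, which is exactly the kind of case the len(token_lengths) >= 3 guard above then maps to NaN.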
@@ -100,16 +113,26 @@ def sentence_metrics(corpus):
     sent_lens = []
     for sent in sents:
         short_clean_tokens = tokenize_and_clean_text(sent)
         #if len(short_clean_tokens) >= 3: #exclude texts that due to tagging errors have less than three sentences, otherwise the standard dev won't work
         sent_lens.append(len(short_clean_tokens))

+    trim_percent = 0.05
+    trim_len = int(len(sent_lens) * trim_percent / 2)
+    sent_lens = sorted(sent_lens)[trim_len:-trim_len]

     # calculate the standard deviation, mean
-    standard_deviation_sent = statistics.stdev(sent_lens)
-    mean_sent = statistics.mean(sent_lens)
-    return standard_deviation_sent, mean_sent
+    if len(sent_lens) >= 3: # exclude texts that due to tagging errors have fewer than three sentences, otherwise the standard deviation won't work
+        standard_deviation_sent = statistics.stdev(sent_lens)
+        mean_sent = statistics.mean(sent_lens)
+        return standard_deviation_sent, mean_sent
+    else:
+        standard_deviation_sent = math.nan
+        mean_sent = math.nan
+        return standard_deviation_sent, mean_sent


-def run_functions(directory_path):
+def run_functions(series):
     good_mean_tks = []
     bad_mean_tks = []
     medium_mean_tks = []
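The sentence splitting in sentence_metrics relies on NLTK's sent_tokenize, which is why the comment near the imports says to also download "punkt"; a minimal sketch of that setup (the sample text is made up):

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")  # one-time download of the sentence tokenizer model

sample = "Alina crossed the Fold. The volcra never saw her coming."
print(sent_tokenize(sample))
# ['Alina crossed the Fold.', 'The volcra never saw her coming.']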
@@ -130,7 +153,7 @@ def run_functions(directory_path):
     few_kudos = 100
     medium_kudos = 1500

-    for index, body in grisha_fanfics["body"]:
+    for index, body in enumerate(grisha_fanfics["body"]):
         published = pd.to_datetime(grisha_fanfics["published"][index])
         if published.year != 2023:
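The enumerate change fixes a real bug: iterating a pandas Series directly yields only its values, not (index, value) pairs, so the old loop would have tried to unpack each fic body string into two variables. A small sketch of the difference (toy Series, not the fanfic data); Series.items() would be an equivalent alternative to enumerate here, as long as the index is the default RangeIndex:

import pandas as pd

bodies = pd.Series(["first fic text", "second fic text"])

# Old pattern: iterates over the values only, so this raises
# "ValueError: too many values to unpack" (each body is a single string).
# for index, body in bodies:
#     ...

# Fixed pattern: enumerate supplies a positional index alongside each value.
for index, body in enumerate(bodies):
    print(index, body[:10])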
@@ -139,13 +162,14 @@
         if kudos <= few_kudos:
             std_dev_tk, mean_tk, ttr = mendenhall_curve(body)
-            std_dev_st, mean_st = sentence_metrics(body)
-            bad_mean_tks.append(mean_tk)
-            bad_std_dev_tks.append(std_dev_tk)
-            bad_ttrs.append(ttr)
+            bad_mean_tks.append(mean_tk)
+            bad_std_dev_tks.append(std_dev_tk)
+            bad_ttrs.append(ttr)
+
+            std_dev_st, mean_st = sentence_metrics(body)
             bad_mean_sts.append(mean_st)
             bad_std_dev_sts.append(std_dev_st)
         elif kudos <= medium_kudos:
             std_dev_tk, mean_tk, ttr = mendenhall_curve(body)
@@ -169,42 +193,53 @@ def run_functions(directory_path):
             print(f"Missing kudos value for row {index}")

-    lists = []
-    #idx = []
+    lists = [
+        good_mean_tks,
+        bad_mean_tks,
+        medium_mean_tks,
+        good_std_dev_tks,
+        bad_std_dev_tks,
+        medium_std_dev_tks,
+        good_ttrs,
+        bad_ttrs,
+        medium_ttrs,
+        good_mean_sts,
+        bad_mean_sts,
+        medium_mean_sts,
+        good_std_dev_sts,
+        bad_std_dev_sts,
+        medium_std_dev_sts,
+    ]
+
+    means = []
+    stds = []
+    for lst in lists:
+        means.append(np.nanmean(lst))
+        stds.append(np.nanstd(lst))
+
+    # Create DataFrame with means and standard deviations
+    singular_and_then_averaged_over_fanfic_metrics = pd.DataFrame(
+        {'good_mean_tks': [means[0], stds[0]],
+         'bad_mean_tks': [means[1], stds[1]],
+         'medium_mean_tks': [means[2], stds[2]],
+         'good_std_dev_tks': [means[3], stds[3]],
+         'bad_std_dev_tks': [means[4], stds[4]],
+         'medium_std_dev_tks': [means[5], stds[5]],
+         'good_ttrs': [means[6], stds[6]],
+         'bad_ttrs': [means[7], stds[7]],
+         'medium_ttrs': [means[8], stds[8]],
+         'good_mean_sts': [means[9], stds[9]],
+         'bad_mean_sts': [means[10], stds[10]],
+         'medium_mean_sts': [means[11], stds[11]],
+         'good_std_dev_sts': [means[12], stds[12]],
+         'bad_std_dev_sts': [means[13], stds[13]],
+         'medium_std_dev_sts': [means[14], stds[14]]},
+        index=['mean', 'standard deviation'])
+    singular_and_then_averaged_over_fanfic_metrics.to_csv(f"data_overview/singular_and_then_averaged_over_{series}_fanfic_metrics.csv")
-    for list in lists:
-        #idx.append(grisha_fanfics["work_id"][index])
 grisha_fanfics = pd.read_csv("grishaverse/data/fanfics/grishaverse_fics.csv")
 #grishaverse/data/split_txt_fanfics

 # create lists for each of the columns of the dataframe we'll create
 mean_tokens = []
 std_dev_tokens = []
 type_token_ratio = []
 mean_sent = []
 std_dev_sent = []
 index = []

 # create a dataframe to store all the overview statistics in
 # columns: mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
 # mean_sent; std_dev_sent; freq_sent_len ...;
 # tag_frequencies
 # tag_ngram_frequencies
 # punctuation frequencies
 # token/type ratio
 data_overview = pd.DataFrame(
     {"mean_tokens": mean_tokens,
      "std_dev_tokens": std_dev_tokens,
      "type_token_ratio": type_token_ratio,
      "mean_sent": mean_sent,
      "std_dev_sent": std_dev_sent},
     index=index
 )

 data_overview.to_csv(f"data_overview/data_overview.csv")

 run_functions("grishaverse")
\ No newline at end of file
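The switch to np.nanmean / np.nanstd matters because mendenhall_curve and sentence_metrics now return math.nan for fics that are too short: a single NaN would otherwise poison the whole group average. A small sketch of the difference (toy values, not actual metrics):

import math
import numpy as np

bad_mean_tks = [4.1, 4.3, math.nan, 4.0]   # hypothetical per-fic means, one fic was too short

print(np.mean(bad_mean_tks))      # nan: one missing value contaminates the plain mean
print(np.nanmean(bad_mean_tks))   # 4.133...: NaNs are ignored
print(np.nanstd(bad_mean_tks))    # standard deviation over the three valid values

Note that np.nanstd defaults to the population standard deviation (ddof=0), whereas statistics.stdev used inside the per-fic metrics is the sample standard deviation; passing ddof=1 to np.nanstd would make the two consistent.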