Skip to content
Snippets Groups Projects
Commit afbbdaeb authored by chrysanthopoulou's avatar chrysanthopoulou
Browse files

Update code that ignores outliers

parent b87c53f4
No related branches found
No related tags found
No related merge requests found
Showing
with 33 additions and 13 deletions
,mean_tokens,std_dev_tokens,type_token_ratio,mean_sent,std_dev_sent
throne_of_glass_canon,4.20580153308561,2.0348877670869365,0.4612289416846652,14.468550677890269,9.820105672393566
grishaverse_canon,4.1116821403167725,2.1047643402022285,0.4679412861136999,14.026379022147932,9.42692548599567
grishaverse_good_fics,4.128605681546294,2.12767094657917,0.44176648168701443,12.920361563144626,10.031898461069263
grishaverse_bad_fics,4.192839204109023,2.1961898296996827,0.4488349209373214,13.098263374311202,10.83490565859641
grishaverse_medium_fics,4.125989775260719,2.1266952539859654,0.4420552018160678,13.1788589173054,10.270865275375563
throne_of_glass_good_fics,4.197038090427363,2.0907564170382065,0.4495104669887279,13.376067824328105,9.013067041149515
throne_of_glass_bad_fics,4.123089252572971,2.075327500013793,0.43527116374871266,12.966996479535549,9.797982354809053
throne_of_glass_medium_fics,4.123495735120379,2.072193436253281,0.4337096917417227,12.511614522473558,8.912865289012412
throne_of_glass_canon,4.193697929352685,1.9929822413221485,0.4612289416846652,13.97989113172623,8.213478462376772
grishaverse_canon,4.098013126056176,2.0593782606875544,0.4679412861136999,13.531299829548578,7.838649564219618
grishaverse_good_fics,4.114071994071994,2.0786329634637415,0.44176648168701443,12.333374549139293,7.801698270484631
grishaverse_bad_fics,4.176622194950878,2.1405686491651563,0.4488349209373214,12.38953592875258,8.124021833578277
grishaverse_medium_fics,4.111530457967681,2.077913266534855,0.4420552018160678,12.57301135687885,8.128911796201574
throne_of_glass_good_fics,4.1833786567246225,2.04344750307249,0.4495104669887279,12.940001358972617,7.548352348996178
throne_of_glass_bad_fics,4.108812222902329,2.024923068917895,0.43527116374871266,12.422276893350578,7.5708298807850705
throne_of_glass_medium_fics,4.109371642911989,2.022966253498263,0.4337096917417227,12.053047978584209,7.3979633769372635
data_overview/delta_scores_grouped_fanfics.png

46.6 KiB | W: | H:

data_overview/delta_scores_grouped_fanfics.png

47 KiB | W: | H:

data_overview/delta_scores_grouped_fanfics.png
data_overview/delta_scores_grouped_fanfics.png
data_overview/delta_scores_grouped_fanfics.png
data_overview/delta_scores_grouped_fanfics.png
  • 2-up
  • Swipe
  • Onion skin
data_overview/p_values_all_data.png

96.1 KiB

data_overview/z_scores_all_data.png

105 KiB | W: | H:

data_overview/z_scores_all_data.png

98.7 KiB | W: | H:

data_overview/z_scores_all_data.png
data_overview/z_scores_all_data.png
data_overview/z_scores_all_data.png
data_overview/z_scores_all_data.png
  • 2-up
  • Swipe
  • Onion skin
......@@ -38,8 +38,14 @@ for index, row in data_overview.iterrows():
cell_value = data_overview.loc[index, column]
z_score = (cell_value - mean) / std_dev
z_scores_all_data.loc[index, column] = z_score
p_value = scipy.stats.norm.sf(abs(z_score))
p_values_all_data[index, column] = p_value
#p_value = scipy.stats.norm.sf(abs(z_score))
#p_values_all_data[index, column] = p_value
for index, row in z_scores_all_data.iterrows():
for column in z_scores_all_data.columns:
cell_value = z_scores_all_data.loc[index, column]
p_value = scipy.stats.norm.sf(abs(cell_value))
p_values_all_data.loc[index, column] = p_value
dfi.export(z_scores_all_data, "data_overview/z_scores_all_data.png", table_conversion = "matplotlib")
......
grishaverse/freq_distribution/all_canon_token_len.png

36 KiB | W: | H:

grishaverse/freq_distribution/all_canon_token_len.png

35 KiB | W: | H:

grishaverse/freq_distribution/all_canon_token_len.png
grishaverse/freq_distribution/all_canon_token_len.png
grishaverse/freq_distribution/all_canon_token_len.png
grishaverse/freq_distribution/all_canon_token_len.png
  • 2-up
  • Swipe
  • Onion skin
grishaverse/freq_distribution/bad_fics_sent_len_long.png

33.4 KiB | W: | H:

grishaverse/freq_distribution/bad_fics_sent_len_long.png

39 KiB | W: | H:

grishaverse/freq_distribution/bad_fics_sent_len_long.png
grishaverse/freq_distribution/bad_fics_sent_len_long.png
grishaverse/freq_distribution/bad_fics_sent_len_long.png
grishaverse/freq_distribution/bad_fics_sent_len_long.png
  • 2-up
  • Swipe
  • Onion skin
grishaverse/freq_distribution/bad_fics_sent_len_short.png

34.9 KiB | W: | H:

grishaverse/freq_distribution/bad_fics_sent_len_short.png

35.2 KiB | W: | H:

grishaverse/freq_distribution/bad_fics_sent_len_short.png
grishaverse/freq_distribution/bad_fics_sent_len_short.png
grishaverse/freq_distribution/bad_fics_sent_len_short.png
grishaverse/freq_distribution/bad_fics_sent_len_short.png
  • 2-up
  • Swipe
  • Onion skin
grishaverse/freq_distribution/bad_fics_token_len.png

38.8 KiB | W: | H:

grishaverse/freq_distribution/bad_fics_token_len.png

37.5 KiB | W: | H:

grishaverse/freq_distribution/bad_fics_token_len.png
grishaverse/freq_distribution/bad_fics_token_len.png
grishaverse/freq_distribution/bad_fics_token_len.png
grishaverse/freq_distribution/bad_fics_token_len.png
  • 2-up
  • Swipe
  • Onion skin
grishaverse/freq_distribution/canon_sent_len_long.png

38.1 KiB | W: | H:

grishaverse/freq_distribution/canon_sent_len_long.png

39.6 KiB | W: | H:

grishaverse/freq_distribution/canon_sent_len_long.png
grishaverse/freq_distribution/canon_sent_len_long.png
grishaverse/freq_distribution/canon_sent_len_long.png
grishaverse/freq_distribution/canon_sent_len_long.png
  • 2-up
  • Swipe
  • Onion skin
grishaverse/freq_distribution/canon_sent_len_short.png

34.6 KiB | W: | H:

grishaverse/freq_distribution/canon_sent_len_short.png

34.9 KiB | W: | H:

grishaverse/freq_distribution/canon_sent_len_short.png
grishaverse/freq_distribution/canon_sent_len_short.png
grishaverse/freq_distribution/canon_sent_len_short.png
grishaverse/freq_distribution/canon_sent_len_short.png
  • 2-up
  • Swipe
  • Onion skin
grishaverse/freq_distribution/good_fics_sent_len_long.png

36.2 KiB | W: | H:

grishaverse/freq_distribution/good_fics_sent_len_long.png

39.7 KiB | W: | H:

grishaverse/freq_distribution/good_fics_sent_len_long.png
grishaverse/freq_distribution/good_fics_sent_len_long.png
grishaverse/freq_distribution/good_fics_sent_len_long.png
grishaverse/freq_distribution/good_fics_sent_len_long.png
  • 2-up
  • Swipe
  • Onion skin
grishaverse/freq_distribution/good_fics_sent_len_short.png

34.8 KiB | W: | H:

grishaverse/freq_distribution/good_fics_sent_len_short.png

35 KiB | W: | H:

grishaverse/freq_distribution/good_fics_sent_len_short.png
grishaverse/freq_distribution/good_fics_sent_len_short.png
grishaverse/freq_distribution/good_fics_sent_len_short.png
grishaverse/freq_distribution/good_fics_sent_len_short.png
  • 2-up
  • Swipe
  • Onion skin
grishaverse/freq_distribution/good_fics_token_len.png

36.9 KiB | W: | H:

grishaverse/freq_distribution/good_fics_token_len.png

36.1 KiB | W: | H:

grishaverse/freq_distribution/good_fics_token_len.png
grishaverse/freq_distribution/good_fics_token_len.png
grishaverse/freq_distribution/good_fics_token_len.png
grishaverse/freq_distribution/good_fics_token_len.png
  • 2-up
  • Swipe
  • Onion skin
grishaverse/freq_distribution/medium_fics_sent_len_long.png

34.5 KiB | W: | H:

grishaverse/freq_distribution/medium_fics_sent_len_long.png

39.2 KiB | W: | H:

grishaverse/freq_distribution/medium_fics_sent_len_long.png
grishaverse/freq_distribution/medium_fics_sent_len_long.png
grishaverse/freq_distribution/medium_fics_sent_len_long.png
grishaverse/freq_distribution/medium_fics_sent_len_long.png
  • 2-up
  • Swipe
  • Onion skin
grishaverse/freq_distribution/medium_fics_sent_len_short.png

35.2 KiB | W: | H:

grishaverse/freq_distribution/medium_fics_sent_len_short.png

35.2 KiB | W: | H:

grishaverse/freq_distribution/medium_fics_sent_len_short.png
grishaverse/freq_distribution/medium_fics_sent_len_short.png
grishaverse/freq_distribution/medium_fics_sent_len_short.png
grishaverse/freq_distribution/medium_fics_sent_len_short.png
  • 2-up
  • Swipe
  • Onion skin
grishaverse/freq_distribution/medium_fics_token_len.png

37.7 KiB | W: | H:

grishaverse/freq_distribution/medium_fics_token_len.png

36.3 KiB | W: | H:

grishaverse/freq_distribution/medium_fics_token_len.png
grishaverse/freq_distribution/medium_fics_token_len.png
grishaverse/freq_distribution/medium_fics_token_len.png
grishaverse/freq_distribution/medium_fics_token_len.png
  • 2-up
  • Swipe
  • Onion skin
......@@ -130,6 +130,14 @@ def mendenhall_curve(corpus, curve_title, plot_destination):
# create the distribution of token lengths / Mendenhall curve
token_lengths = [len(token) for token in short_clean_tokens]
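# Trim the sorted token lengths by 0.5% in total (0.25% from each end). We need to remove the outliers because, even after preprocessing,
# some tokens still have wildly wrong lengths, which skews the metrics and also ruins our p-values later on.
# NOTE: if trim_len is 0 (fewer than roughly 400 tokens), the slice [0:-0] below yields an empty list.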
trim_percent = 0.005
trim_len = int(len(token_lengths) * trim_percent / 2)
token_lengths = sorted(token_lengths)[trim_len:-trim_len]
token_length_distribution = FreqDist(token_lengths).most_common(15)
# convert to FreqDist object to a pandas series for easier processing
......@@ -187,6 +195,12 @@ def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
#if len(short_clean_tokens)>= 90:
#print(f"This sentence: \n {sent} \n is this long: {len(short_clean_tokens)}")
# Trim the sorted sentence lengths by 5% in total (2.5% from each end). We need to remove the outliers because, even after preprocessing,
# there are still some sentences that are 1200 tokens long, which skews the metrics and also ruins our p-values later on.
# NOTE: if trim_len is 0 (fewer than roughly 40 sentences), the slice [0:-0] below yields an empty list.
trim_percent = 0.05
trim_len = int(len(sent_lens) * trim_percent / 2)
sent_lens = sorted(sent_lens)[trim_len:-trim_len]
sent_len_dist = FreqDist(sent_lens)
#print(sent_len_dist)
......@@ -343,7 +357,7 @@ def pos_tag_frequencies(corpus, series, canon_or_fanfic):
# convert FreqDist object to a pandas series for easier processing
tag_freq_dist_panda = pd.Series(dict(tag_freq_dist))
print(tag_freq_dist_panda)
#print(tag_freq_dist_panda)
# sort, normalise and round the panda series
......@@ -378,7 +392,7 @@ def pos_tag_frequencies(corpus, series, canon_or_fanfic):
# convert FreqDist object to a pandas series for easier processing
punct_tag_freq_dist_panda = pd.Series(dict(punct_tag_freq_dist))
print(punct_tag_freq_dist_panda)
#print(punct_tag_freq_dist_panda)
# sort, normalise and round the panda series
......@@ -389,7 +403,7 @@ def pos_tag_frequencies(corpus, series, canon_or_fanfic):
#for index in new_token_len_dist.index:
new_punct_tag_freq_dist.iat[i] = round(new_punct_tag_freq_dist.iat[i]/len(punctuation_tags), 3) #index-1 bc the index starts counting from zero, the word lengths not
print(new_punct_tag_freq_dist)
#print(new_punct_tag_freq_dist)
# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))
......
throne_of_glass/freq_distribution/all_canon_token_len.png

36.7 KiB | W: | H:

throne_of_glass/freq_distribution/all_canon_token_len.png

35.1 KiB | W: | H:

throne_of_glass/freq_distribution/all_canon_token_len.png
throne_of_glass/freq_distribution/all_canon_token_len.png
throne_of_glass/freq_distribution/all_canon_token_len.png
throne_of_glass/freq_distribution/all_canon_token_len.png
  • 2-up
  • Swipe
  • Onion skin
throne_of_glass/freq_distribution/bad_fics_sent_len_long.png

33.8 KiB | W: | H:

throne_of_glass/freq_distribution/bad_fics_sent_len_long.png

40.4 KiB | W: | H:

throne_of_glass/freq_distribution/bad_fics_sent_len_long.png
throne_of_glass/freq_distribution/bad_fics_sent_len_long.png
throne_of_glass/freq_distribution/bad_fics_sent_len_long.png
throne_of_glass/freq_distribution/bad_fics_sent_len_long.png
  • 2-up
  • Swipe
  • Onion skin
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment