Commit ba1c60ba authored by chrysanthopoulou

Add plots for the fanfictions

parent eb2bfe91
Showing with 325 additions and 3 deletions
data_overview/data_overview.csv

old version:
,mean_tokens,std_dev,type_token_ratio,mean_sent
throne_of_glass_canon,4.20580153308561,9.820105672393566,0.4612289416846652,14.468550677890269
grishaverse_canon,4.1116821403167725,9.42692548599567,0.4679412861136999,14.026379022147932

new version:
,mean_tokens,std_dev_tokens,type_token_ratio,mean_sent,std_dev_sent
throne_of_glass_canon,4.20580153308561,2.0348877670869365,0.4612289416846652,14.468550677890269,9.820105672393566
grishaverse_canon,4.1116821403167725,2.1047643402022285,0.4679412861136999,14.026379022147932,9.42692548599567
grishaverse_good_fics,4.128605681546294,2.12767094657917,0.44176648168701443,12.920361563144626,10.031898461069263
grishaverse_bad_fics,4.192839204109023,2.1961898296996827,0.4488349209373214,13.098263374311202,10.83490565859641
grishaverse_medium_fics,4.125989775260719,2.1266952539859654,0.4420552018160678,13.1788589173054,10.270865275375563
throne_of_glass_good_fics,4.197038090427363,2.0907564170382065,0.4495104669887279,13.376067824328105,9.013067041149515
throne_of_glass_bad_fics,4.123089252572971,2.075327500013793,0.43527116374871266,12.966996479535549,9.797982354809053
throne_of_glass_medium_fics,4.123495735120379,2.072193436253281,0.4337096917417227,12.511614522473558,8.912865289012412
data_overview/data_overview.png

107 KiB

data_overview/delta_scores_grouped_fanfics.png

46.6 KiB

data_overview/z_scores_all_data.png

105 KiB

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from cycler import cycler
import json
import dataframe_image as dfi
#make the plots a bit less ugly
CB91_Blue = '#2CBDFE'
CB91_Green = '#47DBCD'
CB91_Pink = '#F3A0F2'
CB91_Purple = '#9D2EC5'
CB91_Violet = '#661D98'
CB91_Amber = '#F5B14C'
color_list = [CB91_Pink, CB91_Blue, CB91_Green, CB91_Amber,
CB91_Purple, CB91_Violet]
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
#some colour palette playing around
cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)
#read data
data_overview = pd.DataFrame(pd.read_csv("data_overview/data_overview.csv"))
# pairplot initial features -- kinda useless in our case, but hey
"""
data_pairplot = sns.pairplot(ceramics_motives)
data_pairplot.savefig(r"project\pictures_general\data_pairplot.png")
"""
data_overview_styled = data_overview.style.background_gradient(cmap=cm)
dfi.export(data_overview_styled, "data_overview/data_overview.png", table_conversion = "matplotlib")
import matplotlib.pyplot as plt
import os
from nltk.probability import FreqDist
import pandas as pd
import statistics
import re
import dataframe_image as dfi
data_overview = pd.DataFrame(pd.read_csv("data_overview/data_overview.csv", index_col=0))
"""
data_overview = pd.DataFrame(
{"mean_tokens":mean_tokens,
"std_dev_tokens":std_dev_tokens,
"type_token_ratio":type_token_ratio,
"mean_sent":mean_sent,
"std_dev_tokens":std_dev_tokens},
index = index
)
data_overview.to_csv(f"data_overview/data_overview.csv")
"""
z_score_provider = data_overview.drop(["grishaverse_bad_fics", "grishaverse_good_fics", "grishaverse_medium_fics"], axis=0)
mean_std_dev_list = [[columnName, columnData.mean(), columnData.std()] for columnName, columnData in z_score_provider.items()]
# Create a new DataFrame with the same column names and index labels as data_overview
z_scores_all_data = pd.DataFrame(columns=data_overview.columns, index=data_overview.index)
# Iterate over each cell in the data_overview DataFrame and write the corresponding z-score into the z_scores_all_data DataFrame
for index, row in data_overview.iterrows():
    for column in data_overview.columns:
        mean, std_dev = [elem[1:] for elem in mean_std_dev_list if elem[0] == column][0]
        cell_value = data_overview.loc[index, column]
        z_score = (cell_value - mean) / std_dev
        z_scores_all_data.loc[index, column] = z_score
dfi.export(z_scores_all_data, "data_overview/z_scores_all_data.png", table_conversion="matplotlib")
print(z_scores_all_data)
delta_scores_grouped_fanfics = pd.DataFrame(columns=["throne_of_glass_canon", "grishaverse_canon", "throne_of_glass_bad_fics", "throne_of_glass_good_fics", "throne_of_glass_medium_fics"], index=["grishaverse_bad_fics", "grishaverse_good_fics", "grishaverse_medium_fics"])
for fic in ["grishaverse_bad_fics", "grishaverse_good_fics", "grishaverse_medium_fics"]:
    delta_scores = []
    for index, row in z_scores_all_data.iterrows():
        if index not in ["grishaverse_bad_fics", "grishaverse_good_fics", "grishaverse_medium_fics"]:
            # mean absolute z-score difference across all features
            delta_score = 0
            for column in z_scores_all_data.columns:
                delta_score += abs(row[column] - z_scores_all_data.loc[fic, column])
            delta_score /= len(z_scores_all_data.columns)
            delta_scores.append(delta_score)
            delta_scores_grouped_fanfics.loc[fic, index] = delta_score
    print(delta_scores)
dfi.export(delta_scores_grouped_fanfics, "data_overview/delta_scores_grouped_fanfics.png", table_conversion="matplotlib")
print(delta_scores_grouped_fanfics)
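# Optional cross-check sketch: each cell of delta_scores_grouped_fanfics is simply the mean
# absolute z-score difference between a grishaverse fanfic group and one of the comparison
# corpora, so the loop above can also be verified in a vectorised way (assuming the same
# z_scores_all_data frame as built above).
z_float = z_scores_all_data.astype(float)
for fic in ["grishaverse_bad_fics", "grishaverse_good_fics", "grishaverse_medium_fics"]:
    for other in delta_scores_grouped_fanfics.columns:
        vectorised_delta = (z_float.loc[fic] - z_float.loc[other]).abs().mean()
        assert abs(vectorised_delta - delta_scores_grouped_fanfics.loc[fic, other]) < 1e-9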
import os
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import pandas as pd
import statistics
import re
# you'll have to also download "punkt" from nltk
# by subdividing the text into segments of 1000 tokens, this calculates the type-token ratio for each segment and then averages over them
# this ensures comparability of the type-token ratios for texts of varying sizes
def standardised_type_token_ratio(tokens):
    ttrs = []
    segment_tokens = []
    segment = 0
    for token in tokens:
        if segment < 1000:
            segment_tokens.append(token)
            segment += 1
        elif segment == 1000:
            types = set(segment_tokens)
            ttr = len(types) / len(segment_tokens)
            ttrs.append(ttr)
            # start the next segment with the current token so it isn't dropped
            segment_tokens = [token]
            segment = 1
    if len(ttrs) <= 1:
        types = set(tokens)
        std_ttr = len(types) / len(tokens)
        print("Warning: Text was too short for segmentation!")
    else:
        std_ttr = statistics.mean(ttrs)
    return std_ttr
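# Usage sketch (hypothetical text file, not from the corpus): the function only needs a flat
# list of tokens, so e.g.
#   standardised_type_token_ratio(word_tokenize(open("some_text.txt").read()))
# returns the type-token ratio averaged over consecutive 1000-token segments.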
def tokenize_and_clean_text(text):
    tokens = word_tokenize(text)
    cleaned_tokens = [token for token in tokens if any(c.isalpha() for c in token)]
    # When looking at the results, there were some strange token lengths, because somewhere in the
    # data conversion hyphens had been added in the wrong places. Tokens with very large lengths were
    # printed and had formats like "everywhere—assassin" (counted as 19 characters) or even
    # "walking-as-fast-as-they-could-without-running" (45 characters).
    short_clean_tokens = []
    for token in cleaned_tokens:
        dehyphenated_token = []
        letter_present = 0
        dehyphenated = 0
        second_word_in_compound = 0
        for c in token:
            if c.isalpha():
                dehyphenated_token.append(c)
                letter_present = 1
                if dehyphenated == 1:
                    second_word_in_compound = 1
            elif not c.isalpha() and letter_present == 1:
                # Split on both dashes and hyphens: counting "red-blue" as a single 9-character token
                # would boost the share of long tokens significantly. All texts are preprocessed the
                # same way, so relatively speaking this shouldn't make a difference.
                dehyphenated_token_joined = ''.join(dehyphenated_token)
                #print(dehyphenated_token_joined)
                short_clean_tokens.append(dehyphenated_token_joined)
                dehyphenated_token = []
                letter_present = 0
                dehyphenated = 1
                second_word_in_compound = 0
        if letter_present == 1 and dehyphenated == 0:
            # token contained no special characters; append it unchanged (and only once)
            short_clean_tokens.append(token)
        elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
            # append the final part of a dehyphenated compound
            short_clean_tokens.append(''.join(dehyphenated_token))
    return short_clean_tokens
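# Behaviour sketch for the examples mentioned in the comments above: a token like
# "everywhere—assassin" comes out of the dehyphenation loop as two tokens,
# ["everywhere", "assassin"], and "walking-as-fast-as-they-could-without-running"
# is split into its individual alphabetic parts.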
# This function takes a corpus (string) as its input and computes the statistics underlying a
# Mendenhall curve, i.e. the frequency distribution of token lengths: it returns the standard
# deviation and mean of the token lengths together with the standardised type-token ratio.
def mendenhall_curve(corpus):
    short_clean_tokens = tokenize_and_clean_text(corpus)
    # create the distribution of token lengths / Mendenhall curve
    token_lengths = [len(token) for token in short_clean_tokens]
    # calculate the standard deviation, mean and type-token ratio
    standard_deviation = statistics.stdev(token_lengths)
    mean = statistics.mean(token_lengths)
    type_token_ratio = standardised_type_token_ratio(short_clean_tokens)
    return standard_deviation, mean, type_token_ratio
def sentence_metrics(corpus):
    sents = sent_tokenize(corpus)
    sent_lens = []
    for sent in sents:
        short_clean_tokens = tokenize_and_clean_text(sent)
        sent_lens.append(len(short_clean_tokens))
    # calculate the standard deviation and mean of the sentence lengths
    standard_deviation_sent = statistics.stdev(sent_lens)
    mean_sent = statistics.mean(sent_lens)
    return standard_deviation_sent, mean_sent
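# Usage sketch (hypothetical file path): both helpers take a plain string, e.g.
#   text = open("grishaverse/data/some_fic.txt", encoding="utf-8").read()
#   std_dev_tk, mean_tk, ttr = mendenhall_curve(text)
#   std_dev_st, mean_st = sentence_metrics(text)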
def run_functions(directory_path):
    good_mean_tks = []
    bad_mean_tks = []
    medium_mean_tks = []
    #idx = []
    good_std_dev_tks = []
    bad_std_dev_tks = []
    medium_std_dev_tks = []
    good_ttrs = []
    bad_ttrs = []
    medium_ttrs = []
    good_mean_sts = []
    bad_mean_sts = []
    medium_mean_sts = []
    good_std_dev_sts = []
    bad_std_dev_sts = []
    medium_std_dev_sts = []
    few_kudos = 100
    medium_kudos = 1500
    for index, body in grisha_fanfics["body"].items():
        published = pd.to_datetime(grisha_fanfics["published"][index])
        if published.year != 2023:
            if not pd.isna(grisha_fanfics["kudos"][index]):
                kudos = pd.to_numeric(grisha_fanfics["kudos"][index], errors="coerce")
                if kudos <= few_kudos:
                    std_dev_tk, mean_tk, ttr = mendenhall_curve(body)
                    std_dev_st, mean_st = sentence_metrics(body)
                    bad_mean_tks.append(mean_tk)
                    bad_std_dev_tks.append(std_dev_tk)
                    bad_ttrs.append(ttr)
                    bad_mean_sts.append(mean_st)
                    bad_std_dev_sts.append(std_dev_st)
                elif kudos <= medium_kudos:
                    std_dev_tk, mean_tk, ttr = mendenhall_curve(body)
                    std_dev_st, mean_st = sentence_metrics(body)
                    medium_mean_tks.append(mean_tk)
                    medium_std_dev_tks.append(std_dev_tk)
                    medium_ttrs.append(ttr)
                    medium_mean_sts.append(mean_st)
                    medium_std_dev_sts.append(std_dev_st)
                elif kudos > medium_kudos:
                    std_dev_tk, mean_tk, ttr = mendenhall_curve(body)
                    std_dev_st, mean_st = sentence_metrics(body)
                    good_mean_tks.append(mean_tk)
                    good_std_dev_tks.append(std_dev_tk)
                    good_ttrs.append(ttr)
                    good_mean_sts.append(mean_st)
                    good_std_dev_sts.append(std_dev_st)
            else:
                print(f"Missing kudos value for row {index}")
    # unfinished scaffolding for collecting the per-group results, kept commented out:
    #lists = []
    #for lst in lists:
    #    idx.append(grisha_fanfics["work_id"][index])
grisha_fanfics = pd.read_csv("grishaverse/data/fanfics/grishaverse_fics.csv")
#grishaverse/data/split_txt_fanfics
# create lists for each of the columns of the dataframe we'll create
mean_tokens = []
std_dev_tokens = []
type_token_ratio = []
mean_sent = []
std_dev_sent = []
index = []
# create a dataframe to store all the overview statistics in
# columns: mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
# mean_sent; std_dev_sent; freq_sent_len ...
# tag_frequencies
# tag_ngram_frequencies
# punctuation frequencies
# token/type ratio
data_overview = pd.DataFrame(
    {"mean_tokens": mean_tokens,
     "std_dev_tokens": std_dev_tokens,
     "type_token_ratio": type_token_ratio,
     "mean_sent": mean_sent,
     "std_dev_sent": std_dev_sent},
    index=index
)
data_overview.to_csv("data_overview/data_overview.csv")
grishaverse/freq_distribution/bad_fics_pos_tag_frequencies.png

55.2 KiB

grishaverse/freq_distribution/bad_fics_punctuation_frequencies.png

53.2 KiB

grishaverse/freq_distribution/bad_fics_sent_len_long.png

33.4 KiB

grishaverse/freq_distribution/bad_fics_sent_len_short.png

34.9 KiB

grishaverse/freq_distribution/bad_fics_token_len.png

38.8 KiB

grishaverse/freq_distribution/canon_pos_tag_frequencies.png

55 KiB → 55 KiB

grishaverse/freq_distribution/canon_punctuation_frequencies.png

49.8 KiB → 50.1 KiB

grishaverse/freq_distribution/canon_sent_len_short.png

34.8 KiB → 34.6 KiB
grishaverse/freq_distribution/good_fics_pos_tag_frequencies.png

57 KiB

grishaverse/freq_distribution/good_fics_punctuation_frequencies.png

54.2 KiB

grishaverse/freq_distribution/good_fics_sent_len_long.png

36.2 KiB

grishaverse/freq_distribution/good_fics_sent_len_short.png

34.8 KiB

grishaverse/freq_distribution/good_fics_token_len.png

36.9 KiB
