Commit 3488607f authored by chrysanthopoulou
parents a35c0eda a8cfdec0
# fanfic and stylometry :chipmunk:
:lizard: :blowfish: :chipmunk: :unicorn: :deer:
:books: :notebook: :ledger: :page_with_curl: :scroll: :page_facing_up: :newspaper: :rolled_up_newspaper: :old_key: :key: :dagger: :crossed_swords: :bow_and_arrow:
![](/general_pictures/cute_otter.jpg)
1. [Project Description :black_nib:](#project-description-✒️)
2. [Data :card_index_dividers:](#data-🗂️)
3. [Roadmap :telescope: :world_map:](#roadmap-🔭-🗺️)
4. [Support :dragon: & Author](#support-🐉--author)
## Project Description :black_nib:
This project aims to perform a stylometric analysis of a web-scraped corpus of fanfiction and to compare it to the original works it is based on.
## Data :card_index_dividers:
- [Grishaverse Series by Leigh Bardugo]()
## Roadmap :telescope: :world_map:
- [x] Upload Datasets
- [x] Reformat SCI
- [x] Reformat MNLI
- [x] Reformat Veridicality Dataset
- [x] Test Berti :penguin: <br>
imminent to-do stuff :exclamation:
***
- [x] Train Multi :ghost:
- [x] Train Verdi :crocodile:
- [x] Test Multi :ghost:
- [x] Test Verdi :crocodile:
- [x] Figure out Evaluation Metric
- [ ] Apply Evaluation Metric
- [ ] Make Interpretability Datasets
- [ ] Interpretability
## Support :dragon: & Author
Lea Kyveli Chrysanthopoulou: leakyveli.chrysanthopoulou@stud.uni-heidelberg.de<br>
import seaborn as sns
import matplotlib.pyplot as plt
from cycler import cycler
import os
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag
import pandas as pd
import statistics
import re
# create function for bar (value) labels
def addlabels(x,y):
for i in range(len(x)):
plt.text(i, y[i], y[i], ha = "center")
# function compiling all works in a directory into a single string. Input required:
# path of the directory containing the text files as a string,
# for example: "throne_of_glass/data/canon_works/"
def read_works_into_string(directory_path):
strings = []
works = os.listdir(directory_path)
for work in works:
with open(f"{directory_path}"+f"/{work}", "r", errors='ignore') as f: #ignores mostly unicode errors due to problematic encoding of text files
strings.append(f.read())
return "\n".join(strings)
# By subdividing the text into segments of 1,000 tokens, this calculates the type-token ratio for each segment and then averages over them.
# This makes the type-token ratios comparable across texts of varying sizes.
def standardised_type_token_ratio(tokens):
ttrs = []
segment_tokens = []
segment = 0
for token in tokens:
if segment < 1000:
segment_tokens.append(token)
segment += 1
elif segment == 1000:
types = set(segment_tokens)
ttr = len(types)/len(segment_tokens)
ttrs.append(ttr)
segment_tokens = [token] # start the next segment with the current token instead of silently dropping it
segment = 1
if len(ttrs) <= 1:
types = set(tokens)
std_ttr = len(types)/len(tokens)
print("Warning: Text was too short for segmentation!")
else:
std_ttr = statistics.mean(ttrs)
return std_ttr
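# Tiny worked example (hypothetical numbers): for a 2,000-token text whose two 1,000-token segments
# contain 430 and 410 distinct types respectively, the segment TTRs are 0.430 and 0.410,
# so standardised_type_token_ratio returns their mean, 0.420.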
def tokenize_and_clean_text(text):
tokens = word_tokenize(text)
cleaned_tokens = ([token for token in tokens if any(c.isalpha() for c in token)])
short_clean_tokens = [] # when looking at the results, there were some strange token lengths, because somewhere in the data conversion hyphens
# had been added in the wrong places. Printing the tokens with very large lengths showed this format, e.g. "everywhere—assassin",
# which was counted as a single 19-character token, with some reaching 45 characters: "walking-as-fast-as-they-could-without-running"
for token in cleaned_tokens:
dehyphenated_token = []
letter_present = 0
dehyphenated = 0
second_word_in_compound = 0
for c in token:
if c.isalpha() == True:
dehyphenated_token.append(c)
letter_present = 1
if dehyphenated == 1:
second_word_in_compound = 1
elif c.isalpha() == False and letter_present == 1: # here I am eliminating both dashes and hyphens,
# because it skews the word-length metric if "red-blue" is counted as one 9-character token, boosting the count of
# high-character-count tokens significantly. All texts are preprocessed the same way, so it shouldn't make a difference,
# relatively speaking
dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
#print(dehyphenated_token_joined)
short_clean_tokens.append(dehyphenated_token_joined)
dehyphenated_token = []
letter_present = 0
dehyphenated = 1
second_word_in_compound = 0
if letter_present == 1 and dehyphenated == 0:
short_clean_tokens.append(token) #catching the tokens that didn't have any special characters; but not the dehyphenated ones twice
elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
short_clean_tokens.append(''.join(map(str, dehyphenated_token)))
return short_clean_tokens
def calculate_freq_dist_as_clean_panda(list_of_items, most_common_limit=False):
if not most_common_limit:
freq_dist = FreqDist(list_of_items)
else:
freq_dist = FreqDist(list_of_items).most_common(most_common_limit)
# convert the FreqDist object to a pandas Series for easier processing
dist_panda = pd.Series(dict(freq_dist))
# sort the Series by its index, then normalise by the total number of items and round to three decimals
new_dist = (dist_panda.sort_index() / len(list_of_items)).round(3)
return new_dist
def mendenhall_token_metrics(tokens):
# create the distribution of token lengths / Mendenhall curve
token_lengths = [len(token) for token in tokens]
# Calculate the trimmed token lengths (trimming 0.5% of the values in total, 0.25% from each end). We need to remove the outliers, bc even despite preprocessing,
# there still are some very wrong lengths, which entirely skew the metrics and also ruin our p-values later on
trim_percent = 0.005
trim_len = int(len(token_lengths) * trim_percent / 2)
token_lengths = sorted(token_lengths)[trim_len:-trim_len]
new_token_len_dist = calculate_freq_dist_as_clean_panda(token_lengths, most_common_limit=15) # token len freq dist
standard_deviation = statistics.stdev(token_lengths)
mean = statistics.mean(token_lengths)
return new_token_len_dist, standard_deviation, mean
def plot_distribution(x, y, plt_title, file_path_for_pic:str, x_label="Number of Kudos", y_label="Percentage of Occurrence", palette="flare", plt_type="barplot", add_labels=True, rotate_ticks=True):
plt.figure(figsize=(10,10))
plt.title(plt_title)
plt.xlabel(x_label)
plt.ylabel(y_label)
if add_labels:
addlabels(x=x.index, y=y.values)
match plt_type:
case "scatterplot":
sns.scatterplot(x=x.index, y=y.values, palette=palette)
case "lineplot":
sns.lineplot(x=x.index, y=y.values, palette=palette)
case "barplot":
sns.barplot(x=x.index, y=y.values, palette=palette)
case "histplot":
sns.histplot(x=x.index, y=y.values, palette=palette)
case _:
print(f"{plt_type} is not a valid format for this function")
if rotate_ticks:
plt.xticks(rotation=30) # !!! very useful for words
plt.savefig(file_path_for_pic)
plt.close()
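# (note: the match statement in plot_distribution above requires Python 3.10 or newer)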
def pos_tag_freq(tokens):
#nltk.pos_tag(text) --> [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
#('completely', 'RB'), ('different', 'JJ')]
tag_token_tuples = pos_tag(tokens)
punctuation_regex = r"[^\w\s]+"
summarised_tags = []
punctuation_tags = []
modal_verbs = []
index = 0
for token, tag in tag_token_tuples:
if re.match(punctuation_regex, token):
summarised_tags.append("punctuation")
if re.match(r"[\"\'“”’‘]+", token):
punctuation_tags.append("quotation_marks")
elif re.match(r"[,;:.?!-]+", token):
try:
punctuation_tags.append("ellipsis" if token == "." and tag_token_tuples[index+1][1] == "." and tag_token_tuples[index+2][1] == "." else "full_stop" if token == "." else "question_mark" if token == "?" else "exclamation_mark" if token == "!" else "comma" if token == "," else "semicolon" if token == ";" else "dash" if token == "-" else "other_punct")
except:
punctuation_tags.append("full_stop" if token == "." else "question_mark" if token == "?" else "exclamation_mark" if token == "!" else "comma" if token == "," else "semicolon" if token == ";" else "dash" if token == "-" else "other_punct")
else:
if tag in ["MD"]:
summarised_tags.append("modal verb")
modal_verbs.append(token.lower())
elif tag in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
summarised_tags.append("verb")
elif tag in ["JJ", "JJR", "JJS"]:
summarised_tags.append("adjective")
elif tag in ["RB", "RBR", "RBS", "WRB"]:
summarised_tags.append("adverb")
elif tag in ["PRP", "PRP$", "WP", "WP$"]:
summarised_tags.append("pronoun")
elif tag in ["NNP", "NNPS"]:
summarised_tags.append("proper_noun")
elif tag in ["NN", "NNS"]:
summarised_tags.append("common_noun")
elif tag in ["DT", "PDT", "WDT"]:
summarised_tags.append("determiner")
elif tag == "CC":
summarised_tags.append("coordinating_conj")
elif tag == "IN":
summarised_tags.append("subordinating_conj")
elif tag in ["$", "CD", "EX", "LS", "POS", "SYM", "TO", "UH", "RP", "FW"]:
summarised_tags.append("other_tag")
index += 1
#pos tag freq dist
new_tag_freq_dist = calculate_freq_dist_as_clean_panda(summarised_tags)
#punctuation frequency distribution
new_punct_tag_freq_dist = calculate_freq_dist_as_clean_panda(punctuation_tags)
# modal verbs in more detail
new_md_freq_dist_panda = calculate_freq_dist_as_clean_panda(modal_verbs, most_common_limit=10)
return new_tag_freq_dist, new_punct_tag_freq_dist, new_md_freq_dist_panda
#f"throne_of_glass/data/canon_works"
def extract_info_from_directory_path(directory_path):
#for txt_fic in os.listdir(directory_path):
works = os.listdir(directory_path)
pattern = r"^[a-zA-Z_]+(?=/)" # get series from directory path
match = re.search(pattern, directory_path)
if match:
series = match.group(0)
for work in works:
with open(f"{directory_path}"+f"/{work}", "r") as f:
f = f.read()
std_dev_tk, mean_tk, ttr = mendenhall_curve(f, f"Mendenhall Curve for the {series.replace('_' , ' ').title()} {work[:-4].replace('_' , ' ').title()}", f"{series}/freq_distribution/{work[:-4]}_token_len.png")
mean_tokens.append(mean_tk)
def calculate_sent_len_dist(text):
sents = sent_tokenize(text)
sent_lens = []
for sent in sents:
short_clean_tokens = tokenize_and_clean_text(sent)
sent_lens.append(len(short_clean_tokens))
#if len(short_clean_tokens)>= 90:
#print(f"This sentence: \n {sent} \n is this long: {len(short_clean_tokens)}")
# Calculate the trimmed mean sentence length (with 5% trimming) We need to remove the outliers, bc even despite preprocessing,
# there still are some sentences that are 1200 tokens long, which entirely skews the metrics and also ruins our p-values later on
trim_percent = 0.05
trim_len = int(len(sent_lens) * trim_percent / 2)
sent_lens = sorted(sent_lens)[trim_len:-trim_len]
sent_len_dist = calculate_freq_dist_as_clean_panda(sent_lens) #new_sent_len_dist
# plot the 25 most frequent sentence lengths as a barplot for a more detailed insight
sent_len_dist_short = calculate_freq_dist_as_clean_panda(sent_lens, most_common_limit=25)
# calculate the standard deviation, mean
standard_deviation_sent = statistics.stdev(sent_lens)
mean_sent = statistics.mean(sent_lens)
return sent_len_dist, sent_len_dist_short, standard_deviation_sent, mean_sent
class StylometryMetrics:
def __init__(self, directory_path, name_of_work, quality="", fanfiction=True):
self.text = read_works_into_string(directory_path)
self.clean_tokens = tokenize_and_clean_text(self.text)
self.name = name_of_work
self.fanfiction = fanfiction
self.quality = quality # good medium bad
def determine_titles(self, plot_topic):
if self.fanfiction:
plt_title = f"{plot_topic} for the {self.name} {self.quality} Fanfiction"
else:
plt_title = f"{plot_topic} for the {self.name} Canon"
return plt_title
def calculate_standardised_ttr(self):
self.sttr = standardised_type_token_ratio(self.clean_tokens)
def calculate_mendenhall_token_metrics(self):
self.tk_len_dist, self.tk_len_std, self.tk_len_mean = mendenhall_token_metrics(self.clean_tokens)
def plot_token_metrics(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Token Frequency Distribution")
plot_distribution(x=self.tk_len_dist, y=self.tk_len_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Token Length", y_label="Percentage of Occurrence")
def calculate_pos_tag_distribution(self):
self.tag_freq_dist, self.punct_tag_freq_dist, self.md_freq_dist = pos_tag_freq(self.clean_tokens)
def calculate_sent_len_distribution(self):
self.sent_len_dist, self.sent_len_dist_short, self.sent_std_dev, self.sent_mean = calculate_sent_len_dist(self.text)
def plot_long_sent_len_dist(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Full Sentence Length Distribution")
plot_distribution(x=self.sent_len_dist, y=self.sent_len_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Sentence Lengths", y_label="Percentage of Occurrence", plt_type="lineplot")
def plot_short_sent_len_dist(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Short Sentence Length Distribution")
plot_distribution(x=self.sent_len_dist_short, y=self.sent_len_dist_short, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Sentence Lengths", y_label="Percentage of Occurrence")
def plot_pos_tag_freq(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="POS Tag Frequencies")
plot_distribution(x=self.tag_freq_dist, y=self.tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="POS Tags", y_label="Percentage of Occurrence")
def plot_md_freq(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Modal Verb Frequencies")
plot_distribution(x=self.md_freq_dist, y=self.md_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Modal Verbs", y_label="Percentage of Occurrence")
def plot_punct_freq(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Punctuation Frequencies")
plot_distribution(x=self.punct_tag_freq_dist, y=self.punct_tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Types of Punctuation", y_label="Percentage of Occurrence")
# overall pos_tag frequency distribution
# pos_tag ngrams; (maybe exclude stopwords?)
# tag collocates for specific tags --> adjectives most frequently with nouns
# most frequent words
# most frequent words for specific tags --> punctuation;
# most frequent adjectives
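# A minimal sketch of one of the planned analyses above (POS-tag bigram frequencies), reusing the
# helpers in this file; pos_tag_bigram_freq is a hypothetical name, not yet part of the pipeline.
def pos_tag_bigram_freq(tokens, most_common_limit=20):
    # tag the tokens, join each pair of adjacent tags into one label, then reuse the shared FreqDist helper
    tag_token_tuples = pos_tag(tokens)
    tag_bigrams = [f"{first[1]}_{second[1]}" for first, second in zip(tag_token_tuples, tag_token_tuples[1:])]
    return calculate_freq_dist_as_clean_panda(tag_bigrams, most_common_limit=most_common_limit)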
#create the Mendenhall Curve for the Throne of Glass Series
#std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
#create the Mendenhall Curve for the Grishaverse Books
#std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
# Mendenhall Curve Sentence Lengths for Throne of Glass Canon
#std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Throne of Glass Series", "throne_of_glass", "canon")
# Mendenhall Curve Sentence Lengths for Grishaverse Canon
#std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Grishaverse Books", "grishaverse", "canon")
# POS Tag frequencies for TOG
#pos_tag_frequencies(read_works_into_string(f"throne_of_glass/data/canon_works"), "throne_of_glass", "canon")
# POS Tag frequencies for Grishaverse
#pos_tag_frequencies(read_works_into_string(f"grishaverse/data/canon_works"), "grishaverse", "canon")
def run_functions(directory_path):
"""
mean_tks = []
idx = []
std_dev_tks = []
ttrs = []
mean_sts= []
std_dev_sts = []
"""
#for txt_fic in os.listdir(directory_path):
works = os.listdir(directory_path)
pattern = r"^[a-zA-Z_]+(?=/)" # get series from directory path
match = re.search(pattern, directory_path)
if match:
series = match.group(0)
for work in works:
with open(f"{directory_path}"+f"/{work}", "r") as f:
f = f.read()
std_dev_tk, mean_tk, ttr = mendenhall_curve(f, f"Mendenhall Curve for the {series.replace('_' , ' ').title()} {work[:-4].replace('_' , ' ').title()}", f"{series}/freq_distribution/{work[:-4]}_token_len.png")
mean_tokens.append(mean_tk)
std_dev_tokens.append(std_dev_tk)
type_token_ratio.append(ttr)
std_dev_st, mean_st = sentence_metrics(f, f"Mendenhall Curve for Sentence Lengths for the {series.replace('_' , ' ').title()} {work[:-4].replace('_' , ' ').title()}", series, work[:-4])
mean_sent.append(mean_st)
std_dev_sents.append(std_dev_st)
pos_tag_frequencies(f, series, work[:-4])
index.append(f"{series}_{work[:-4]}")
#grishaverse/data/split_txt_fanfics
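# NOTE: the *_canon variables used below are produced by the commented-out legacy calls further up;
# with those calls disabled this function would raise a NameError, and the DataFrame it builds is neither returned nor saved.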
def create_dataframe_with_overview_info():
#create lists for each of the columns of the dataframe we'll create
mean_tokens = [mean_tokens_tog_canon, mean_tokens_grishaverse_canon]
std_dev_tokens = [std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon]
type_token_ratio = [type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon]
mean_sent = [mean_sent_tog_canon, mean_sent_grishaverse_canon]
std_dev_sents = [std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon]
index = ["throne_of_glass_canon", "grishaverse_canon"]
# create a dataframe to store all the overview statistics in
# columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
# mean_sent; std_dev_sent; freq_sent_len ....
# tag_frequencies
# tag_ngram_frequencies
# punctuation frequencies
# token/type ratio
data_overview = pd.DataFrame(
{"mean_tokens":mean_tokens,
"std_dev_tokens":std_dev_tokens,
"type_token_ratio":type_token_ratio,
"mean_sent":mean_sent,
"std_dev_sent":std_dev_sents},
index = index
)
if __name__ == "__main__":
#run_functions("grishaverse/data/split_txt_fanfics")
#run_functions("throne_of_glass/data/split_txt_fanfics")
#data_overview.to_csv(f"data_overview/data_overview.csv")
GrishaverseCanon = StylometryMetrics(directory_path="grishaverse/data/canon_works", name_of_work="Grishaverse", fanfiction=False)
GrishaverseCanon.calculate_pos_tag_distribution()
GrishaverseCanon.plot_md_freq("grishaverse/plots/canon/md_freq.png")
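# A minimal usage sketch for the fanfiction side (the directory and output paths are assumptions; adjust to the actual folder layout):
# GrishaverseGoodFanfics = StylometryMetrics(directory_path="grishaverse/data/split_txt_fanfics/good", name_of_work="Grishaverse", quality="Good", fanfiction=True)
# GrishaverseGoodFanfics.calculate_standardised_ttr()
# GrishaverseGoodFanfics.calculate_mendenhall_token_metrics()
# GrishaverseGoodFanfics.plot_token_metrics("grishaverse/plots/good_fanfics/token_len.png")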
# code snippets for prettifying plots
#colours
CB91_Blue = '#2CBDFE'
CB91_Green = '#47DBCD'
CB91_Pink = '#F3A0F2'
CB91_Purple = '#9D2EC5'
CB91_Violet = '#661D98'
CB91_Amber = '#F5B14C'
color_list = [CB91_Pink, CB91_Blue, CB91_Green, CB91_Amber,
CB91_Purple, CB91_Violet]
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
#some colour palette playing around
cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)
CB91_Purple = '#9D2EC5'
CB91_Violet = '#661D98'
CB91_Amber = '#F5B14C'
color_list = [ blue_grey, CB91_Amber, pink, light_green, CB91_Green, CB91_Pink, CB91_Blue,
CB91_Purple, CB91_Violet]
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)
#palette_1 = sns.color_palette("flare")
#palette_2 = sns.color_palette("mako_r", as_cmap=True)
# actual preprocessing code
#file header:
# work_id,title,author,rating,category,fandom,relationship,character,additional tags,language,published,status,status date,words,chapters,comments,kudos,bookmarks,hits,all_kudos,all_bookmarks,body
# 27852922,Dealing with Our Demons,['ravenyenn19'],Mature,F/M,"Six of Crows Series",Kaz Brekker/Inej Ghafa,"Kaz B","Romance,Kanej - Freeform, Eventual Smut",English,2020-12-03,Updated,2023-03-16,747673,162/?,8573,12204,1373,709212,"['ud4m', 'book_addict_1228', 'ephemeraldelights', 'bluedelilah25', 'sunshinecorsets', 'I_do_not_like_purple_glasses', 'beep_boop_00', 'schleswigholstein', 'moonandstars75', 'ewerythingoes', 'mindfighters', 'rosibunnis', 'Lizie06', 'ghostlatte', 'aguswolman', 'QueenofEnglan', 'JenBoyette04', 'gnitneb_reads', 'gloomysunshine', 'v1ofvs', 'BazzaKrekker', 'BookGeek', 'poppyflower19', 'Cassanibal', 'vanilla_chai_tea', 'Honorthyword', 'mariaarmengol', 'luc1inda', 'zarawrites', 'monmough', 'Guilty__Pleasures', 'Ilyann', 'folieadeux_0_0', 'dragonguard', 'Emeliemarx', 'angrydabee', 'slythxrclaw', 'samaram0215', 'letsgetthisbread69', 'Mintmew', 'biblichour', 'Katloupet', 'Miss_ginger', 'inejsquake', 'Arabella_7833', 'flossy_flo99', 'a_k123', 'hushedwanderer', 'siriuslymichele', 'AnnaAvinaVTDX']",[],"Dear Kaz,
grisha_fanfics = pd.read_csv("grishaverse/data/fanfics/grishaverse_fics.csv")
tog_fanfics = pd.read_csv("throne_of_glass/data/fanfics/throne_of_glass_fics.csv")
""" def read_csv_to_pd(file_path, name_of_file) -> pd: #fix type hints
# plot distribution of kudos for Grishaverse Fanfics name_of_file = pd.read_csv(file_path)
return name_of_file
grisha_kudos = grisha_fanfics["kudos"].values.tolist()
grisha_kudos_freq_dist = FreqDist(grisha_kudos)
# convert to FreqDist object to a pandas series for easier processing
dist_panda = pd.Series(dict(grisha_kudos_freq_dist))
#print(dist_panda)
# sort, normalise and round the panda series
new_dist = dist_panda.sort_index()
for i in range(0, len(new_dist.index)):
#for index in new_token_len_dist.index:
new_dist.iat[i] = round(new_dist.iat[i]/len(grisha_kudos), 3) #index-1 bc the index starts counting from zero, the word lengths not
#if float(new_token_len_dist.iat[i]) == 0.00:
# new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
#calculate cumulative distribution
cum_dist = np.cumsum(new_dist.values)
# plot using matplotlib and seaborn
# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))
# call function for bar (value) labels
#addlabels(x=new_sent_len_dist.index, y=new_sent_len_dist.values)
plt.title("Grishaverse Cumulative Frequency Distribution of All Kudos")
ax.set_xlabel("Number of Kudos")
ax.set_ylabel("Percentage of Occurence")
def calculate_cum_kudo_distribution(fanfic_pd):
fanfic_kudos = fanfic_pd["kudos"].values.tolist()
fanfic_kudos_freq_dist = FreqDist(fanfic_kudos)
# convert to FreqDist object to a pandas series for easier processing
dist_panda = pd.Series(dict(fanfic_kudos_freq_dist))
sns.lineplot(x=new_dist.index, y=cum_dist, ax=ax) # sort, normalise and round the panda series
#plt.xticks(rotation=30) !!! very useful for words new_dist = dist_panda.sort_index()
fig.savefig(f"grishaverse/freq_distribution/fanfic_kudo_freq_dist.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
""" for i in range(0, len(new_dist.index)):
# plot distribution of kudos for Throne of Glass Fanfics #for index in new_token_len_dist.index:
new_dist.iat[i] = round(new_dist.iat[i]/len(fanfic_kudos), 3) #index-1 bc the index starts counting from zero, the word lengths not
#if float(new_token_len_dist.iat[i]) == 0.00:
# new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
tog_kudos = tog_fanfics["kudos"].values.tolist() #calculate cumulative distribution
cum_dist = np.cumsum(new_dist.values)
return new_dist, cum_dist
tog_kudos_freq_dist = FreqDist(tog_kudos)
# convert to FreqDist object to a pandas series for easier processing
dist_panda = pd.Series(dict(tog_kudos_freq_dist))
#print(dist_panda)
# sort, normalise and round the panda series def plot_distribution(new_dist, cum_dist, plt_title, file_path_for_pic:str, x_label="Number of Kudos", y_label="Percentage of Occurence", scatter_plt=False, max_ticks=10):
plt.figure(figsize=(10,10))
plt.title(plt_title)
plt.xlabel(x_label)
plt.ylabel(y_label)
if scatter_plt:
sns.scatterplot(x=new_dist.index, y=cum_dist)
#plt.xticks(new_dist.index[::100], new_dist.index[::100])
new_dist = dist_panda.sort_index() else:
sns.lineplot(x=new_dist.index, y=cum_dist)
for i in range(0, len(new_dist.index)): plt.savefig(file_path_for_pic)
#for index in new_token_len_dist.index: plt.close()
new_dist.iat[i] = round(new_dist.iat[i]/len(tog_kudos), 3) #index-1 bc the index starts counting from zero, the word lengths not
#if float(new_token_len_dist.iat[i]) == 0.00:
# new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
#calculate cumulative distribution def separate_fanfics_by_good_medium_bad(df, series):
cum_dist = np.cumsum(new_dist.values)
# plot using matplotlib and seaborn
# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))
# call function for bar (value) labels
#addlabels(x=new_sent_len_dist.index, y=new_sent_len_dist.values)
plt.title("Throne of Glass Cumulative Frequency Distribution of Kudos")
ax.set_xlabel("Number of Kudos")
ax.set_ylabel("Percentage of Occurence")
sns.lineplot(x=new_dist.index, y=cum_dist, ax=ax)
#plt.xticks(rotation=30) !!! very useful for words
fig.savefig(f"throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
"""
def preprocess_data(df, series):
good_fics = [] good_fics = []
medium_fics = [] medium_fics = []
bad_fics = [] bad_fics = []
...@@ -163,6 +122,16 @@ def preprocess_data(df, series): ...@@ -163,6 +122,16 @@ def preprocess_data(df, series):
f.write(medium_fics_joined) f.write(medium_fics_joined)
preprocess_data(grisha_fanfics, "grishaverse") if __name__ == "__main__":
preprocess_data(tog_fanfics, "throne_of_glass") #grishaverse
""" #grisha_fanfics = read_csv_to_pd(file_path="grishaverse/data/fanfics/grishaverse_fics.csv", name_of_file=grisha_fanfics)
\ No newline at end of file #new_dist, cum_dist = calculate_cum_kudo_distribution(grisha_fanfics)
#plot_distribution(new_dist=new_dist, cum_dist=cum_dist, plt_title="Grishaverse Cumulative Frequency Distribution of All Kudos", file_path_for_pic="grishaverse/freq_distribution/fanfic_kudo_freq_dist.png", scatter_plt=_plt=True)
#throne of glass
tog_fanfics = read_csv_to_pd(file_path="throne_of_glass/data/fanfics/throne_of_glass_fics.csv", name_of_file=tog_fanfics)
new_dist, cum_dist = calculate_cum_kudo_distribution(tog_fanfics)
plot_distribution(new_dist=new_dist, cum_dist=cum_dist, plt_title="Throne of Glass Cumulative Frequency Distribution of All Kudos", file_path_for_pic= "throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png", scatter_plt=True)
#separate_fanfics_by_good_medium_bad(grisha_fanfics, "grishaverse")
#separate_fanfics_by_good_medium_bad(tog_fanfics, "throne_of_glass")
[Image diff: grishaverse/freq_distribution/fanfic_kudo_freq_dist.png (33 KiB → 41.1 KiB)]
[Image: grishaverse/plots/canon/md_freq.png (43.6 KiB)]
[Image: grishaverse/plots/canon/pos_tag_freq.png (55.9 KiB)]
alki is stupid
me is good for filling stuffs
# you'll have to also download "punkt" from nltk
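# A minimal one-time setup sketch, assuming the standard NLTK resource names:
# word_tokenize / sent_tokenize need "punkt", and pos_tag needs the "averaged_perceptron_tagger" model.
import nltk
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")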
# code snippets for prettifying plots
#colours
CB91_Blue = '#2CBDFE'
CB91_Green = '#47DBCD'
CB91_Pink = '#F3A0F2'
CB91_Purple = '#9D2EC5'
CB91_Violet = '#661D98'
CB91_Amber = '#F5B14C'
color_list = [CB91_Pink, CB91_Blue, CB91_Green, CB91_Amber,
CB91_Purple, CB91_Violet]
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
#some colour palette playing around
cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)
# create function for bar (value) labels
def addlabels(x,y):
for i in range(len(x)):
"std_dev_sent":std_dev_sents}, "std_dev_sent":std_dev_sents},
index = index index = index
) )
if __name__ == "__main__":
run_functions("grishaverse/data/split_txt_fanfics")
run_functions("throne_of_glass/data/split_txt_fanfics")
data_overview.to_csv(f"data_overview/data_overview.csv")
[Image diff: throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png (33.2 KiB → 41.5 KiB)]