Commit 6f37468b authored by Lea Kyveli Chrysanthopoulou's avatar Lea Kyveli Chrysanthopoulou
Clean up code and put it into classes

parent 8dda779c
import seaborn as sns
import matplotlib.pyplot as plt
from cycler import cycler
import os
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag
import pandas as pd
import statistics
import re
# note: you'll also have to download "punkt" (and, for pos_tag, "averaged_perceptron_tagger") from nltk

# helper function for adding bar (value) labels to a plot
def addlabels(x, y):
    for i in range(len(x)):
        plt.text(i, y[i], y[i], ha="center")
# compiles the works in a directory into a single string.
# input: the path of the directory containing the .txt files as a string,
# e.g. "throne_of_glass/data/canon_works"
def read_works_into_string(directory_path):
    strings = []
    works = os.listdir(directory_path)
    for work in works:
        with open(f"{directory_path}/{work}", "r") as f:
            strings.append(f.read())
    return "\n".join(strings)
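# example (using the canon path from further below):
# canon_text = read_works_into_string("throne_of_glass/data/canon_works")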
# by subdividing the text into segments of 1000 tokens, this calculates the type-token ratio
# for each segment and then averages over the segments; this keeps the type-token ratio
# comparable across texts of varying length
def standardised_type_token_ratio(tokens):
    ttrs = []
    segment_tokens = []
    segment = 0
    for token in tokens:
        if segment < 1000:
            segment_tokens.append(token)
            segment += 1
        elif segment == 1000:
            types = set(segment_tokens)
            ttrs.append(len(types) / len(segment_tokens))
            # start the next segment with the current token instead of dropping it
            segment_tokens = [token]
            segment = 1
    if len(ttrs) <= 1:
        types = set(tokens)
        std_ttr = len(types) / len(tokens)
        print("Warning: Text was too short for segmentation!")
    else:
        std_ttr = statistics.mean(ttrs)
    return std_ttr
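# illustration of the segmentation: a 2,500-token text yields two full 1,000-token segments
# whose type-token ratios are averaged (the ~500-token remainder is ignored); texts of roughly
# 2,000 tokens or fewer fall back to the plain type-token ratio and print the warning above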
def tokenize_and_clean_text(text):
    tokens = word_tokenize(text)
    cleaned_tokens = [token for token in tokens if any(c.isalpha() for c in token)]
    # some tokens have implausible lengths because hyphens/dashes were inserted in the wrong
    # places during data conversion, e.g. "everywhere—assassin" (counted as one 19-character
    # token) or "walking-as-fast-as-they-could-without-running" (45 characters). To avoid
    # skewing the word-length metric, such compounds are split at every non-letter character;
    # all texts are preprocessed the same way, so relatively speaking this shouldn't make a
    # difference.
    short_clean_tokens = []
    for token in cleaned_tokens:
        dehyphenated_token = []
        letter_present = 0
        dehyphenated = 0
        second_word_in_compound = 0
        for c in token:
            if c.isalpha():
                dehyphenated_token.append(c)
                letter_present = 1
                if dehyphenated == 1:
                    second_word_in_compound = 1
            elif not c.isalpha() and letter_present == 1:
                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
                short_clean_tokens.append(dehyphenated_token_joined)
                dehyphenated_token = []
                letter_present = 0
                dehyphenated = 1
                second_word_in_compound = 0
        if letter_present == 1 and dehyphenated == 0:
            # token contained no special characters; append it as-is (but don't append the dehyphenated ones twice)
            short_clean_tokens.append(token)
        elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
            short_clean_tokens.append(''.join(map(str, dehyphenated_token)))
    return short_clean_tokens
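# e.g. the problematic token "everywhere—assassin" mentioned above comes out as two tokens,
# "everywhere" and "assassin", instead of one 19-character token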
def mendenhall_token_metrics(tokens):
    # create the distribution of token lengths / Mendenhall curve
    token_lengths = [len(token) for token in tokens]
    # trim the token lengths (0.5% of values, split between both ends) to remove outliers:
    # even after preprocessing some very wrong lengths remain, which would skew the metrics
    # and ruin the p-values later on
    trim_percent = 0.005
    trim_len = int(len(token_lengths) * trim_percent / 2)
    token_lengths = sorted(token_lengths)[trim_len:-trim_len]
    token_length_distribution = FreqDist(token_lengths).most_common(15)
    # convert the FreqDist object to a pandas series for easier processing
    token_len_dist_panda = pd.Series(dict(token_length_distribution))
    # sort, normalise and round the series
    new_token_len_dist = token_len_dist_panda.sort_index()
    for i in range(0, len(new_token_len_dist.index)):
        new_token_len_dist.iat[i] = round(new_token_len_dist.iat[i] / len(tokens), 3)
    standard_deviation = statistics.stdev(token_lengths)
    mean = statistics.mean(token_lengths)
    return new_token_len_dist, standard_deviation, mean
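# note: the 15 most common lengths are normalised by the total token count (not the trimmed
# count), so the returned shares will not sum exactly to 1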
def plot_distribution(x, y, plt_title, file_path_for_pic: str, x_label="Number of Kudos", y_label="Percentage of Occurrence", palette="flare", plt_type="barplot", add_labels=True):
    plt.figure(figsize=(10, 10))
    plt.title(plt_title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    if add_labels:
        addlabels(x=x.index, y=y.values)
    # choose the plot type
    if plt_type == "barplot":
        sns.barplot(x=x.index, y=y.values, palette=palette)
    elif plt_type == "scatterplot":
        sns.scatterplot(x=x.index, y=y.values, palette=palette)
        #plt.xticks(new_dist.index[::100], new_dist.index[::100])
    else:
        sns.lineplot(x=x.index, y=y.values, palette=palette)
    plt.savefig(file_path_for_pic)
    plt.close()


class StylometryMetrics:

    def __init__(self, directory_path):
        self.text = read_works_into_string(directory_path)
        self.clean_tokens = tokenize_and_clean_text(self.text)

    def calculate_standardised_ttr(self):
        self.sttr = standardised_type_token_ratio(self.clean_tokens)

    def calculate_mendenhall_token_metrics(self):
        self.tk_len_dist, self.tk_len_std, self.tk_len_mean = mendenhall_token_metrics(self.clean_tokens)

    def plot_token_length_distribution(self, plt_title, file_path_for_pic):
        # assumed completion of the unfinished plot method: plots the Mendenhall token-length
        # distribution via plot_distribution (calculate_mendenhall_token_metrics must run first)
        plot_distribution(self.tk_len_dist, self.tk_len_dist, plt_title, file_path_for_pic,
                          x_label="Word Length", y_label="Percentage of Occurrence")
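# usage sketch (paths mirror the canon directories used below):
# tog_canon = StylometryMetrics("throne_of_glass/data/canon_works")
# tog_canon.calculate_standardised_ttr()
# tog_canon.calculate_mendenhall_token_metrics()
# tog_canon.plot_token_length_distribution("Mendenhall Curve for the Throne of Glass Series",
#                                          "throne_of_glass/freq_distribution/all_canon_token_len.png")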
# this function takes a corpus as its input and produces a Mendenhall curve, i.e. a frequency
# distribution of token lengths, as its output
# inputs:
# corpus = string
# curve_title = string, the title of the plot that will be produced, e.g. "Mendenhall Curve for Throne of Glass Series"
# plot_destination = string, the (relative) path, including the file name and .png extension, of the plot produced,
# e.g. "throne_of_glass/freq_distribution/all_canon_token_len.png"
def mendenhall_curve(corpus, curve_title, plot_destination):
    short_clean_tokens = tokenize_and_clean_text(corpus)
    # create the distribution of token lengths / Mendenhall curve
    token_lengths = [len(token) for token in short_clean_tokens]
    # trim the token lengths (0.5% of values, split between both ends) to remove outliers:
    # even after preprocessing some very wrong lengths remain, which would skew the metrics
    # and ruin the p-values later on
    trim_percent = 0.005
    trim_len = int(len(token_lengths) * trim_percent / 2)
    token_lengths = sorted(token_lengths)[trim_len:-trim_len]
    token_length_distribution = FreqDist(token_lengths).most_common(15)
    # convert the FreqDist object to a pandas series for easier processing
    token_len_dist_panda = pd.Series(dict(token_length_distribution))
    # sort, normalise and round the series
    new_token_len_dist = token_len_dist_panda.sort_index()
    for i in range(0, len(new_token_len_dist.index)):
        new_token_len_dist.iat[i] = round(new_token_len_dist.iat[i] / len(short_clean_tokens), 3)
    # plot using matplotlib and seaborn
    fig, ax = plt.subplots(figsize=(10, 10))
    # add bar (value) labels
    addlabels(x=new_token_len_dist.index, y=new_token_len_dist.values)
    plt.title(curve_title)
    ax.set_xlabel("Word Length")
    ax.set_ylabel("Percentage of Occurrence")
    sns.barplot(x=new_token_len_dist.index, y=new_token_len_dist.values, ax=ax, palette="flare")
    #plt.xticks(rotation=30)  # very useful for word labels
    plt.savefig(plot_destination)
    # calculate the standard deviation, mean and standardised type-token ratio
    standard_deviation = statistics.stdev(token_lengths)
    mean = statistics.mean(token_lengths)
    type_token_ratio = standardised_type_token_ratio(short_clean_tokens)
    return standard_deviation, mean, type_token_ratio
def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
    sents = sent_tokenize(corpus)
    sent_lens = []
    for sent in sents:
        short_clean_tokens = tokenize_and_clean_text(sent)
        sent_lens.append(len(short_clean_tokens))
    # calculate the trimmed sentence lengths (5% trimming, split between both ends): we need to
    # remove the outliers, because even despite preprocessing there are still sentences that are
    # 1200 tokens long, which entirely skews the metrics and ruins the p-values later on
    trim_percent = 0.05
    trim_len = int(len(sent_lens) * trim_percent / 2)
    sent_lens = sorted(sent_lens)[trim_len:-trim_len]
    sent_len_dist = FreqDist(sent_lens)
    # convert the FreqDist object to a pandas series for easier processing
    sent_len_dist_panda = pd.Series(dict(sent_len_dist))
    # sort, normalise and round the series
    new_sent_len_dist = sent_len_dist_panda.sort_index()
    for i in range(0, len(new_sent_len_dist.index)):
        new_sent_len_dist.iat[i] = round(new_sent_len_dist.iat[i] / len(sent_lens), 2)
    # plot the full distribution as a line plot
    fig, ax = plt.subplots(figsize=(10, 10))
    plt.title(curve_title)
    ax.set_xlabel("Sentence Length")
    ax.set_ylabel("Percentage of Occurrence")
    sns.lineplot(x=new_sent_len_dist.index, y=new_sent_len_dist.values, ax=ax, palette="crest")
    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_sent_len_long.png")
    # plot the 25 most frequent sentence lengths as a bar plot for a more detailed insight
    sent_len_dist_short = FreqDist(sent_lens).most_common(25)
    sent_len_dist_short_panda = pd.Series(dict(sent_len_dist_short))
    new_sent_len_dist_short = sent_len_dist_short_panda.sort_index()
    for i in range(0, len(new_sent_len_dist_short.index)):
        new_sent_len_dist_short.iat[i] = round(new_sent_len_dist_short.iat[i] / len(sent_lens), 2)
    fig, ax = plt.subplots(figsize=(10, 10))
    # add bar (value) labels
    addlabels(x=new_sent_len_dist_short.index, y=new_sent_len_dist_short.values)
    plt.title(curve_title)
    ax.set_xlabel("Sentence Length")
    ax.set_ylabel("Percentage of Occurrence")
    sns.barplot(x=new_sent_len_dist_short.index, y=new_sent_len_dist_short.values, ax=ax, palette="YlGnBu")
    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_sent_len_short.png")
    # calculate the standard deviation and mean of the sentence lengths
    standard_deviation_sent = statistics.stdev(sent_lens)
    mean_sent = statistics.mean(sent_lens)
    return standard_deviation_sent, mean_sent
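# worked example of the trimming: with 10,000 sentences, trim_len = int(10000 * 0.05 / 2) = 250,
# so the 250 shortest and 250 longest sentences are dropped before the mean and standard
# deviation are calculated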
# overall pos_tag frequency distribution
# pos_tag ngrams; (maybe exclude stopwords?)
# tag collocates for specific tags --> adjectives most frequently with nouns
# most frequent words
# most frequent words for specific tags --> punctuation;
# most frequent adjectives
def pos_tag_frequencies(corpus, series, canon_or_fanfic):
    # nltk.pos_tag(text) --> [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
    # ('completely', 'RB'), ('different', 'JJ')]
    tokens = word_tokenize(corpus)
    """
    # disabled earlier version of the token cleaning that also kept the punctuation characters
    short_tokens = []
    for token in tokens:
        dehyphenated_token = []
        letter_present = 0
        dehyphenated = 0
        second_word_in_compound = 0
        for c in token:
            if c.isalpha() == True:
                dehyphenated_token.append(c)
                letter_present = 1
                if dehyphenated == 1:
                    second_word_in_compound = 1
            elif c.isalpha() == False and letter_present == 1:
                # eliminate both dashes and hyphens, because it skews the word metric if
                # red-blue is counted as a 9 character token; all texts are preprocessed the
                # same way, so relatively speaking it shouldn't make a difference
                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
                short_tokens.append(dehyphenated_token_joined)
                short_tokens.append(c)  # append the hyphen / other punctuation --> we're also interested in that
                dehyphenated_token = []
                letter_present = 0
                dehyphenated = 1
                second_word_in_compound = 0
        if letter_present == 1 and dehyphenated == 0:
            short_tokens.append(token)  # catch the tokens that didn't have any special characters
        elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
            short_tokens.append(''.join(map(str, dehyphenated_token)))
    """
    tag_token_tuples = pos_tag(tokens)
    punctuation_regex = r"[^\w\s]+"
    summarised_tags = []
    punctuation_tags = []
    index = 0
    for token, tag in tag_token_tuples:
        if re.match(punctuation_regex, token):
            summarised_tags.append("punctuation")
            if re.match(r"[\"\'“”’‘]+", token):
                punctuation_tags.append("quotation_marks")
            elif re.match(r"[,;:.?!-]+", token):
                if token == ".":
                    # look ahead to catch ellipses that were tokenized as three separate full stops
                    try:
                        is_ellipsis = (tag_token_tuples[index + 1][1] == "." and tag_token_tuples[index + 2][1] == ".")
                    except IndexError:
                        is_ellipsis = False
                    punctuation_tags.append("ellipsis" if is_ellipsis else "full_stop")
                elif token == "?":
                    punctuation_tags.append("question_mark")
                elif token == "!":
                    punctuation_tags.append("exclamation_mark")
                elif token == ",":
                    punctuation_tags.append("comma")
                elif token == ";":
                    punctuation_tags.append("semicolon")
                elif token == "-":
                    punctuation_tags.append("dash")
                else:
                    punctuation_tags.append("other_punct")
        else:
            if tag in ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
                summarised_tags.append("verb")
            elif tag in ["JJ", "JJR", "JJS"]:
                summarised_tags.append("adjective")
            elif tag in ["RB", "RBR", "RBS", "WRB"]:
                summarised_tags.append("adverb")
            elif tag in ["PRP", "PRP$", "WP", "WP$"]:
                summarised_tags.append("pronoun")
            elif tag in ["NNP", "NNPS"]:
                summarised_tags.append("proper_noun")
            elif tag in ["NN", "NNS"]:
                summarised_tags.append("common_noun")
            elif tag in ["DT", "PDT", "WDT"]:
                summarised_tags.append("determiner")
            elif tag == "CC":
                summarised_tags.append("coordinating_conj")
            elif tag == "IN":
                summarised_tags.append("subordinating_conj")
            elif tag in ["$", "CD", "EX", "LS", "POS", "SYM", "TO", "UH", "RP", "FW"]:
                summarised_tags.append("other_tag")
        index += 1
    # summarised POS tag frequency distribution
    tag_freq_dist = FreqDist(summarised_tags)
    # convert the FreqDist object to a pandas series for easier processing
    tag_freq_dist_panda = pd.Series(dict(tag_freq_dist))
    # sort, normalise and round the series
    new_tag_freq_dist = tag_freq_dist_panda.sort_index()
    for i in range(0, len(new_tag_freq_dist.index)):
        new_tag_freq_dist.iat[i] = round(new_tag_freq_dist.iat[i] / len(tag_token_tuples), 2)
    print(new_tag_freq_dist)
    fig, ax = plt.subplots(figsize=(10, 10))
    # add bar (value) labels
    addlabels(x=new_tag_freq_dist.index, y=new_tag_freq_dist.values)
    plt.title(f"POS Tag Frequencies for the {series.replace('_', ' ').title()} {canon_or_fanfic.replace('_', ' ').title()}")
    ax.set_xlabel("POS Tags")
    ax.set_ylabel("Percentage of Occurrence")
    sns.barplot(x=new_tag_freq_dist.index, y=new_tag_freq_dist.values, ax=ax, palette="RdPu")
    plt.xticks(rotation=30)  # very useful for word labels
    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_pos_tag_frequencies.png")
    # punctuation frequency distribution
    punct_tag_freq_dist = FreqDist(punctuation_tags)
    punct_tag_freq_dist_panda = pd.Series(dict(punct_tag_freq_dist))
    new_punct_tag_freq_dist = punct_tag_freq_dist_panda.sort_index()
    for i in range(0, len(new_punct_tag_freq_dist.index)):
        new_punct_tag_freq_dist.iat[i] = round(new_punct_tag_freq_dist.iat[i] / len(punctuation_tags), 3)
    fig, ax = plt.subplots(figsize=(10, 10))
    addlabels(x=new_punct_tag_freq_dist.index, y=new_punct_tag_freq_dist.values)
    plt.title(f"Punctuation Frequencies for the {series.replace('_', ' ').title()} {canon_or_fanfic.replace('_', ' ').title()}")
    ax.set_xlabel("Types of Punctuation")
    ax.set_ylabel("Percentage of Occurrence")
    sns.barplot(x=new_punct_tag_freq_dist.index, y=new_punct_tag_freq_dist.values, ax=ax, palette="OrRd")
    plt.xticks(rotation=30)
    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_punctuation_frequencies.png")
#create the Mendenhall Curve for the Throne of Glass Series
std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
#create the Mendenhall Curve for the Grishaverse Books
std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
# Mendenhall Curve Sentence Lengths for Throne of Glass Canon
std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Throne of Glass Series", "throne_of_glass", "canon")
# Mendenhall Curve Sentence Lengths for Grishaverse Canon
std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Grishaverse Books", "grishaverse", "canon")
# POS Tag frequencies for TOG
pos_tag_frequencies(read_works_into_string(f"throne_of_glass/data/canon_works"), "throne_of_glass", "canon")
# POS Tag frequencies for Grishaverse
pos_tag_frequencies(read_works_into_string(f"grishaverse/data/canon_works"), "grishaverse", "canon")
def run_functions(directory_path):
    """
    mean_tks = []
    idx = []
    std_dev_tks = []
    ttrs = []
    mean_sts = []
    std_dev_sts = []
    """
    #for txt_fic in os.listdir(directory_path):
    works = os.listdir(directory_path)
    pattern = r"^[a-zA-Z_]+(?=/)"  # get the series name from the directory path
    match = re.search(pattern, directory_path)
    if match:
        series = match.group(0)
    for work in works:
        with open(f"{directory_path}/{work}", "r") as f:
            text = f.read()
            std_dev_tk, mean_tk, ttr = mendenhall_curve(text, f"Mendenhall Curve for the {series.replace('_', ' ').title()} {work[:-4].replace('_', ' ').title()}", f"{series}/freq_distribution/{work[:-4]}_token_len.png")
            mean_tokens.append(mean_tk)
            std_dev_tokens.append(std_dev_tk)
            type_token_ratio.append(ttr)
            std_dev_st, mean_st = sentence_metrics(text, f"Mendenhall Curve for Sentence Lengths for the {series.replace('_', ' ').title()} {work[:-4].replace('_', ' ').title()}", series, work[:-4])
            mean_sent.append(mean_st)
            std_dev_sents.append(std_dev_st)
            pos_tag_frequencies(text, series, work[:-4])
            index.append(f"{series}_{work[:-4]}")
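# the series prefix is taken from the directory path, e.g.
# re.search(r"^[a-zA-Z_]+(?=/)", "grishaverse/data/split_txt_fanfics").group(0) == "grishaverse"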
# create lists for each of the columns of the dataframe we'll create,
# pre-filled with the canon values computed above
mean_tokens = [mean_tokens_tog_canon, mean_tokens_grishaverse_canon]
std_dev_tokens = [std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon]
type_token_ratio = [type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon]
mean_sent = [mean_sent_tog_canon, mean_sent_grishaverse_canon]
std_dev_sents = [std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon]
index = ["throne_of_glass_canon", "grishaverse_canon"]

if __name__ == "__main__":
    run_functions("grishaverse/data/split_txt_fanfics")
    run_functions("throne_of_glass/data/split_txt_fanfics")
    # create a dataframe to store all the overview statistics in
    # columns: mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
    # mean_sent; std_dev_sent; freq_sent_len ...
    # tag_frequencies
    # tag_ngram_frequencies
    # punctuation frequencies
    # token/type ratio
    # (built only after run_functions has appended the fanfic rows to the lists above,
    # otherwise the CSV would only contain the two canon rows)
    data_overview = pd.DataFrame(
        {"mean_tokens": mean_tokens,
         "std_dev_tokens": std_dev_tokens,
         "type_token_ratio": type_token_ratio,
         "mean_sent": mean_sent,
         "std_dev_sent": std_dev_sents},
        index=index
    )
    data_overview.to_csv("data_overview/data_overview.csv")
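# the resulting data_overview.csv then has one row per corpus, e.g.
# ,mean_tokens,std_dev_tokens,type_token_ratio,mean_sent,std_dev_sent
# throne_of_glass_canon,...
# grishaverse_canon,...
# grishaverse_<fanfic_file>,...   (one row per fanfic file appended by run_functions)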
# code snippets for prettifying plots
#colours
CB91_Blue = '#2CBDFE'
CB91_Green = '#47DBCD'
CB91_Pink = '#F3A0F2'
CB91_Purple = '#9D2EC5'
CB91_Violet = '#661D98'
CB91_Amber = '#F5B14C'
color_list = [CB91_Pink, CB91_Blue, CB91_Green, CB91_Amber,
CB91_Purple, CB91_Violet]
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
#some colour palette playing around
cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)
@@ -22,7 +22,7 @@ CB91_Purple = '#9D2EC5'
CB91_Violet = '#661D98'
CB91_Amber = '#F5B14C'
-color_list = [pink, light_green, purple_grey, blue_grey, CB91_Green, CB91_Pink, CB91_Blue, CB91_Amber,
+color_list = [blue_grey, CB91_Amber, pink, light_green, CB91_Green, CB91_Pink, CB91_Blue,
CB91_Purple, CB91_Violet]
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
@@ -32,11 +32,11 @@ cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)
#palette_1 = sns.color_palette("flare")
#palette_2 = sns.color_palette("mako_r", as_cmap=True)
# actual preprocessing code
#file header:
# work_id,title,author,rating,category,fandom,relationship,character,additional tags,language,published,status,status date,words,chapters,comments,kudos,bookmarks,hits,all_kudos,all_bookmarks,body
# 27852922,Dealing with Our Demons,['ravenyenn19'],Mature,F/M,"Six of Crows Series",Kaz Brekker/Inej Ghafa,"Kaz B","Romance,Kanej - Freeform, Eventual Smut",English,2020-12-03,Updated,2023-03-16,747673,162/?,8573,12204,1373,709212,"['ud4m', 'book_addict_1228', 'ephemeraldelights', 'bluedelilah25', 'sunshinecorsets', 'I_do_not_like_purple_glasses', 'beep_boop_00', 'schleswigholstein', 'moonandstars75', 'ewerythingoes', 'mindfighters', 'rosibunnis', 'Lizie06', 'ghostlatte', 'aguswolman', 'QueenofEnglan', 'JenBoyette04', 'gnitneb_reads', 'gloomysunshine', 'v1ofvs', 'BazzaKrekker', 'BookGeek', 'poppyflower19', 'Cassanibal', 'vanilla_chai_tea', 'Honorthyword', 'mariaarmengol', 'luc1inda', 'zarawrites', 'monmough', 'Guilty__Pleasures', 'Ilyann', 'folieadeux_0_0', 'dragonguard', 'Emeliemarx', 'angrydabee', 'slythxrclaw', 'samaram0215', 'letsgetthisbread69', 'Mintmew', 'biblichour', 'Katloupet', 'Miss_ginger', 'inejsquake', 'Arabella_7833', 'flossy_flo99', 'a_k123', 'hushedwanderer', 'siriuslymichele', 'AnnaAvinaVTDX']",[],"Dear Kaz,
@@ -45,90 +45,49 @@ cm2 = sns.cubehelix_palette(as_cmap=True)
grisha_fanfics = pd.read_csv("grishaverse/data/fanfics/grishaverse_fics.csv")
tog_fanfics = pd.read_csv("throne_of_glass/data/fanfics/throne_of_glass_fics.csv")

# functions added in this commit

def read_csv_to_pd(file_path, name_of_file) -> pd.DataFrame:
    name_of_file = pd.read_csv(file_path)
    return name_of_file

def calculate_cum_kudo_distribution(fanfic_pd):
    fanfic_kudos = fanfic_pd["kudos"].values.tolist()
    fanfic_kudos_freq_dist = FreqDist(fanfic_kudos)
    # convert the FreqDist object to a pandas series for easier processing
    dist_panda = pd.Series(dict(fanfic_kudos_freq_dist))
    # sort, normalise and round the series
    new_dist = dist_panda.sort_index()
    for i in range(0, len(new_dist.index)):
        new_dist.iat[i] = round(new_dist.iat[i] / len(fanfic_kudos), 3)
    # calculate the cumulative distribution
    cum_dist = np.cumsum(new_dist.values)
    return new_dist, cum_dist

def plot_distribution(new_dist, cum_dist, plt_title, file_path_for_pic: str, x_label="Number of Kudos", y_label="Percentage of Occurrence", scatter_plt=False, max_ticks=10):
    plt.figure(figsize=(10, 10))
    plt.title(plt_title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    if scatter_plt:
        sns.scatterplot(x=new_dist.index, y=cum_dist)
        #plt.xticks(new_dist.index[::100], new_dist.index[::100])
    else:
        sns.lineplot(x=new_dist.index, y=cum_dist)
    plt.savefig(file_path_for_pic)
    plt.close()

# module-level plotting code removed in this commit (its logic now lives in the functions above)
"""
# plot distribution of kudos for Grishaverse Fanfics
grisha_kudos = grisha_fanfics["kudos"].values.tolist()
grisha_kudos_freq_dist = FreqDist(grisha_kudos)
# convert the FreqDist object to a pandas series for easier processing
dist_panda = pd.Series(dict(grisha_kudos_freq_dist))
# sort, normalise and round the panda series
new_dist = dist_panda.sort_index()
for i in range(0, len(new_dist.index)):
    new_dist.iat[i] = round(new_dist.iat[i]/len(grisha_kudos), 3)
# calculate cumulative distribution
cum_dist = np.cumsum(new_dist.values)
# plot using matplotlib and seaborn
fig, ax = plt.subplots(figsize=(10,10))
plt.title("Grishaverse Cumulative Frequency Distribution of All Kudos")
ax.set_xlabel("Number of Kudos")
ax.set_ylabel("Percentage of Occurence")
sns.lineplot(x=new_dist.index, y=cum_dist, ax=ax)
fig.savefig("grishaverse/freq_distribution/fanfic_kudo_freq_dist.png")

# plot distribution of kudos for Throne of Glass Fanfics
tog_kudos = tog_fanfics["kudos"].values.tolist()
tog_kudos_freq_dist = FreqDist(tog_kudos)
dist_panda = pd.Series(dict(tog_kudos_freq_dist))
new_dist = dist_panda.sort_index()
for i in range(0, len(new_dist.index)):
    new_dist.iat[i] = round(new_dist.iat[i]/len(tog_kudos), 3)
cum_dist = np.cumsum(new_dist.values)
fig, ax = plt.subplots(figsize=(10,10))
plt.title("Throne of Glass Cumulative Frequency Distribution of Kudos")
ax.set_xlabel("Number of Kudos")
ax.set_ylabel("Percentage of Occurence")
sns.lineplot(x=new_dist.index, y=cum_dist, ax=ax)
fig.savefig("throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png")
"""

def separate_fanfics_by_good_medium_bad(df, series):  # renamed from preprocess_data(df, series)
    good_fics = []
    medium_fics = []
    bad_fics = []
@@ -163,6 +122,16 @@ def preprocess_data(df, series):
        f.write(medium_fics_joined)

# removed module-level calls (superseded by the __main__ block below)
"""
preprocess_data(grisha_fanfics, "grishaverse")
preprocess_data(tog_fanfics, "throne_of_glass")
"""

if __name__ == "__main__":
    #grishaverse
    #grisha_fanfics = read_csv_to_pd(file_path="grishaverse/data/fanfics/grishaverse_fics.csv", name_of_file=grisha_fanfics)
    #new_dist, cum_dist = calculate_cum_kudo_distribution(grisha_fanfics)
    #plot_distribution(new_dist=new_dist, cum_dist=cum_dist, plt_title="Grishaverse Cumulative Frequency Distribution of All Kudos", file_path_for_pic="grishaverse/freq_distribution/fanfic_kudo_freq_dist.png", scatter_plt=True)
    #throne of glass
    tog_fanfics = read_csv_to_pd(file_path="throne_of_glass/data/fanfics/throne_of_glass_fics.csv", name_of_file=tog_fanfics)
    new_dist, cum_dist = calculate_cum_kudo_distribution(tog_fanfics)
    plot_distribution(new_dist=new_dist, cum_dist=cum_dist, plt_title="Throne of Glass Cumulative Frequency Distribution of All Kudos", file_path_for_pic="throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png", scatter_plt=True)
    #separate_fanfics_by_good_medium_bad(grisha_fanfics, "grishaverse")
    #separate_fanfics_by_good_medium_bad(tog_fanfics, "throne_of_glass")
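# note: because new_dist is normalised and sorted by kudo count, cum_dist[i] is the share of
# fanfics that received at most new_dist.index[i] kudos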
[image diff: grishaverse/freq_distribution/fanfic_kudo_freq_dist.png updated (33 KiB → 41.1 KiB)]
@@ -13,27 +13,6 @@ import re
# you'll have to also download "punkt" from nltk
# code snippets for prettifying plots
#colours
CB91_Blue = '#2CBDFE'
CB91_Green = '#47DBCD'
CB91_Pink = '#F3A0F2'
CB91_Purple = '#9D2EC5'
CB91_Violet = '#661D98'
CB91_Amber = '#F5B14C'
color_list = [CB91_Pink, CB91_Blue, CB91_Green, CB91_Amber,
CB91_Purple, CB91_Violet]
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)
#some colour palette playing around
cm = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
cm1 = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)
cm2 = sns.cubehelix_palette(as_cmap=True)
# create function for bar (value) labels
def addlabels(x,y):
for i in range(len(x)):
@@ -482,10 +461,6 @@ mean_sent = [mean_sent_tog_canon, mean_sent_grishaverse_canon]
std_dev_sents = [std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon]
index = ["throne_of_glass_canon", "grishaverse_canon"]
run_functions("grishaverse/data/split_txt_fanfics")
run_functions("throne_of_glass/data/split_txt_fanfics")
# create a dataframe to store all the overview statistics in
# columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
# mean_sent; std_dev_sent; freq_sent_len ....
@@ -502,5 +477,9 @@ data_overview = pd.DataFrame(
"std_dev_sent":std_dev_sents},
index = index
)
data_overview.to_csv(f"data_overview/data_overview.csv")
if __name__ == "__main__":
    run_functions("grishaverse/data/split_txt_fanfics")
    run_functions("throne_of_glass/data/split_txt_fanfics")
    data_overview.to_csv(f"data_overview/data_overview.csv")
[image diff: throne_of_glass/freq_distribution/fanfic_kudo_freq_dist.png updated (33.2 KiB → 41.5 KiB)]