Commit 57a67ffc authored by chrysanthopoulou

Add sentence length metrics

parent ba349be4
grishaverse/freq_distribution/all_canon_sent_len.png

37.3 KiB

@@ -4,9 +4,11 @@ from cycler import cycler
import os
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize
import pandas as pd
import statistics
# you'll also have to download "punkt" from nltk
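# A hedged one-time setup sketch (not part of this commit): fetch the Punkt models that
# sent_tokenize and word_tokenize rely on, but only if they are not installed already.
import nltk
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")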
# code snippets for prettifying plots
@@ -49,10 +51,6 @@ def read_works_into_string(directory_path):
strings.append(f.read())
return "\n".join(strings)
# this function takes a corpus as its input and gives a Mendenhall curve, i.e. a frequency distribution of tokens as its output
# precise input: corpus = string ;
# curve_title = string, the title of the plot that will be produced, e.g., "Mendenhall Curve for Throne of Glass Series"
# plot_destination = string, the (relative) path, including the file name and .png tag of the plot produced, e.g. f"throne_of_glass/freq_distribution/all_canon_token_len.png"
# by subdividing the text into segments of 1000 tokens, it calculates the type token ratio for each segment and then averages over them
# this ensures comparability of the type token ratios for varying text sizes
@@ -78,7 +76,12 @@ def standardised_type_token_ratio(tokens):
std_ttr = statistics.mean(ttrs)
return std_ttr
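# A minimal sketch (not part of this commit) of the segmentation idea described above:
# split the token list into 1000-token segments, compute the type-token ratio of each
# full segment, and average the per-segment ratios. The function name and the fallback
# for texts shorter than one segment are illustrative assumptions, not repository code.
def sttr_sketch(tokens, segment_size=1000):
    ttrs = []
    for start in range(0, len(tokens) - segment_size + 1, segment_size):
        segment = tokens[start:start + segment_size]
        ttrs.append(len(set(segment)) / segment_size)
    # assumes a non-empty token list; short texts fall back to a plain type-token ratio
    return statistics.mean(ttrs) if ttrs else len(set(tokens)) / len(tokens)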
# this function takes a corpus as its input and gives a Mendenhall curve, i.e. a frequency distribution of tokens as its output
# precise input: corpus = string ;
# curve_title = string, the title of the plot that will be produced, e.g., "Mendenhall Curve for Throne of Glass Series"
# plot_destination = string, the (relative) path, including the file name and .png tag of the plot produced, e.g. f"throne_of_glass/freq_distribution/all_canon_token_len.png"
def mendenhall_curve(corpus, curve_title, plot_destination):
tokens = word_tokenize(corpus)
@@ -154,11 +157,88 @@ def mendenhall_curve(corpus, curve_title, plot_destination):
return standard_deviation, mean, type_token_ratio
def sentence_metrics(corpus, curve_title, plot_destination):
sents = sent_tokenize(corpus)
sent_lens = []
for sent in sents:
tokens = word_tokenize(sent)
#cleaned_tokens = ([token for token in tokens if any(c.isalpha() for c in token)])
"""
short_clean_tokens = [] # when looking at the results, there were some strange token lengths, because somewhere in the data conversion hyphens
# had been added in the wrong places. I had the tokens with very large lengths printed and they had this format, e.g. "everywhere—assassin",
# which was counted as 19 characters long, with some up to 45 characters long: "walking-as-fast-as-they-could-without-running" (a shorter regex-based sketch follows after this function)
for token in cleaned_tokens:
dehyphenated_token = []
letter_present = 0
for c in token:
if c.isalpha() == True:
dehyphenated_token.append(c)
letter_present = 1
elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
#bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
# high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
# relatively speaking
dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
#print(dehyphenated_token_joined)
short_clean_tokens.append(dehyphenated_token_joined)
dehyphenated_token = []
letter_present = 0
"""
sent_lens.append(len(tokens))
sent_len_dist = FreqDist(sent_lens).most_common(50)
# convert the FreqDist object to a pandas Series for easier processing
sent_len_dist_panda = pd.Series(dict(sent_len_dist))
# sort, normalise and round the pandas Series
new_sent_len_dist = sent_len_dist_panda.sort_index()
print(new_sent_len_dist)
for i in range(0, len(new_sent_len_dist.index)):
new_sent_len_dist.iat[i] = round(new_sent_len_dist.iat[i]/len(sent_lens), 2) # normalise each count to a relative frequency
# plot using matplotlib and seaborn
# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))
# call function for bar (value) labels
#addlabels(x=new_sent_len_dist.index, y=new_sent_len_dist.values)
plt.title(curve_title)
ax.set_xlabel("Sentence Length")
ax.set_ylabel("Percentage of Occurrence")
sns.lineplot(x=new_sent_len_dist.index, y=new_sent_len_dist.values, ax=ax, palette="flare")
#plt.xticks(rotation=30) !!! very useful for words
plt.savefig(plot_destination)
# calculate the standard deviation and mean of the sentence lengths
standard_deviation_sent = statistics.stdev(sent_lens)
mean_sent = statistics.mean(sent_lens)
return standard_deviation_sent, mean_sent
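# A compact regex-based sketch (not part of this commit; the function name is hypothetical)
# of the dehyphenation logic described in the commented-out block above: keep every
# alphabetic run of a token, so "everywhere—assassin" yields ["everywhere", "assassin"].
# Unlike str.isalpha(), the ASCII character class below ignores accented letters.
import re
def split_hyphenated_tokens(tokens):
    short_clean_tokens = []
    for token in tokens:
        short_clean_tokens.extend(re.findall(r"[A-Za-z]+", token))
    return short_clean_tokens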
#create the Mendenhall Curve for the Throne of Glass Series
std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
#std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
#create the Mendenhall Curve for the Grishaverse Books
std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
#std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
# Mendenhall Curve Sentence Lengths for Throne of Glass Canon
std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_sent_len.png")
# Mendenhall Curve Sentence Lengths for Grishaverse Canon
std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_sent_len.png")
# create a dataframe to store all the overview statistics in
# columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
@@ -167,5 +247,14 @@ std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_rati
# tag_ngram_frequencies
# punctuation frequencies
# token/type ratio
data_overview = pd.DataFrame({"mean_tokens":[mean_tokens_tog_canon, mean_tokens_grishaverse_canon], "std_dev":[std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon], "type_token_ratio":[type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon]}, index= ["throne_of_glass_canon", "grishaverse_canon"])
data_overview.to_csv(f"data_overview/data_overview.csv")
\ No newline at end of file
"""
data_overview = pd.DataFrame(
{"mean_tokens":[mean_tokens_tog_canon, mean_tokens_grishaverse_canon],
"std_dev":[std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon],
"type_token_ratio":[type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon],
"mean_sent":[mean_sent_tog_canon, mean_sent_grishaverse_canon],
"std_dev":[std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon]},
index= ["throne_of_glass_canon", "grishaverse_canon"]
)
"""
#data_overview.to_csv(f"data_overview/data_overview.csv")
\ No newline at end of file
throne_of_glass/freq_distribution/all_canon_sent_len.png

40.6 KiB
