Commit ecc66a90 authored by chrysanthopoulou

Add standardised type token ratio

parent 4349e916
data_overview/data_overview.csv:

,mean_tokens,std_dev,type_token_ratio
throne_of_glass_canon,4.9282660487582,2.033132292363114,0.5932999999999999
grishaverse_canon,5.046888277930518,2.20148396165822,0.6335263157894737

@@ -54,6 +54,33 @@ def read_works_into_string(directory_path):
# curve_title = string, the title of the plot that will be produced, e.g., "Mendenhall Curve for Throne of Glass Series"
# plot_destination = string, the (relative) path, including the file name and .png extension, of the plot produced, e.g., f"throne_of_glass/freq_distribution/all_canon_token_len.png"

# by subdividing the text into segments of 1,000 tokens, this calculates the
# type token ratio for each segment and then averages over them;
# this keeps the type token ratios comparable across varying text sizes
def standardised_type_token_ratio(tokens):
    ttrs = []
    segment_tokens = []
    segment = 0
    for token in tokens:
        segment_tokens.append(token)
        segment += 1
        if segment == 1000:
            # segment is full: record its type token ratio and start a new one
            types = set(segment_tokens)
            ttrs.append(len(types) / len(segment_tokens))
            segment_tokens = []
            segment = 0
    if len(ttrs) <= 1:
        # fall back to the plain type token ratio for texts shorter than two full segments
        types = set(tokens)
        std_ttr = len(types) / len(tokens)
        print("Warning: Text was too short for segmentation!")
    else:
        std_ttr = statistics.mean(ttrs)
    return std_ttr
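
# a quick, hypothetical sanity check for standardised_type_token_ratio
# (toy data, not from the corpora analysed below): a highly repetitive
# "text" of 3,000 tokens yields three full segments, each containing the
# same 5 types, so the standardised ratio comes out at 5/1000 = 0.005
#   toy_tokens = ["the", "cat", "sat", "on", "the", "mat"] * 500
#   standardised_type_token_ratio(toy_tokens)  # -> 0.005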

def mendenhall_curve(corpus, curve_title, plot_destination):
    tokens = word_tokenize(corpus)

@@ -120,8 +147,35 @@ def mendenhall_curve(corpus, curve_title, plot_destination):
    #fig_freq_dist = token_length_freq_dist_plot.get_figure()
    #fig_freq_dist.savefig(plot_destination)

    # calculate the standard deviation, mean, and type token ratio
    standard_deviation = statistics.stdev(token_lengths)
    mean = statistics.mean(token_lengths)

    # to get the number of unique tokens, i.e., types, the token list used to
    # be converted to a set; that plain ratio is kept below for reference, but
    # has been replaced by the standardised version, which stays comparable
    # across texts of different lengths
    #type_token_ratio = len(set(short_clean_tokens))/len(short_clean_tokens)
    type_token_ratio = standardised_type_token_ratio(short_clean_tokens)

    return standard_deviation, mean, type_token_ratio
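
# for reference, a toy illustration of the summary statistics used above
# (hypothetical values, not computed from the corpora):
#   statistics.mean([3, 5, 4, 6])   # -> 4.5
#   statistics.stdev([3, 5, 4, 6])  # -> 1.2909944487358056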

# create the Mendenhall Curve for the Throne of Glass series
std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")

# create the Mendenhall Curve for the Grishaverse books
std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
# create a dataframe to store all the overview statistics in
# columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
# mean_sent; std_dev_sent; freq_sent_len ....
# tag_frequencies
# tag_ngram_frequencies
# punctuation frequencies
# token/type ratio
data_overview = pd.DataFrame(
    {"mean_tokens": [mean_tokens_tog_canon, mean_tokens_grishaverse_canon],
     "std_dev": [std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon],
     "type_token_ratio": [type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon]},
    index=["throne_of_glass_canon", "grishaverse_canon"])
data_overview.to_csv(f"data_overview/data_overview.csv")
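
# hypothetical sanity check (not part of the original script): the overview
# table can be read back with its row labels restored via
#   pd.read_csv("data_overview/data_overview.csv", index_col=0)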