diff --git a/data_overview/data_overview.csv b/data_overview/data_overview.csv
new file mode 100644
index 0000000000000000000000000000000000000000..b9f85eaba6b9c09402036b9663b3c56018fc17d1
--- /dev/null
+++ b/data_overview/data_overview.csv
@@ -0,0 +1,3 @@
+,mean_tokens,std_dev,type_token_ratio
+throne_of_glass_canon,4.9282660487582,2.033132292363114,0.5932999999999999
+grishaverse_canon,5.046888277930518,2.20148396165822,0.6335263157894737
diff --git a/stylometry_code.py b/stylometry_code.py
index 1c8f2a186949b994b3cab04dc8e6691452d7d09d..16fdcaf23956c2c332a2d7025a34053bacd753a4 100644
--- a/stylometry_code.py
+++ b/stylometry_code.py
@@ -54,6 +54,33 @@ def read_works_into_string(directory_path):
 # curve_title = string, the title of the plot that will be produced, e.g., "Mendenhall Curve for Throne of Glass Series"
 # plot_destination = string, the (relative) path, including the file name and .png tag of the plot produced, e.g. f"throne_of_glass/freq_distribution/all_canon_token_len.png"
 
+# by subdividing the text into segments of 1000, it calculates the type token ratio for each segment and then averages over them
+# this ensures a comparability of the type token ratios for varying text sizes
+def standardised_type_token_ratio(tokens):
+    ttrs = []
+    segment_tokens = []
+    segment = 0
+    for token in tokens:
+        # append before testing the segment size so no token is dropped at a segment boundary
+        segment_tokens.append(token)
+        segment += 1
+        if segment == 1000:
+            types = set(segment_tokens)
+            ttr = len(types)/len(segment_tokens)
+            ttrs.append(ttr)
+            segment_tokens = []
+            segment = 0
+    if len(ttrs) <= 1:
+        types = set(tokens)
+        std_ttr = len(types)/len(tokens)
+        print("Warning: Text was too short for segmentation!")
+        print(ttrs)
+    else:
+        std_ttr = statistics.mean(ttrs)
+        print(ttrs)
+    return std_ttr
+
+
 def mendenhall_curve(corpus, curve_title, plot_destination):
 
     tokens = word_tokenize(corpus)
@@ -120,8 +147,35 @@ def mendenhall_curve(corpus, curve_title, plot_destination):
 
     #fig_freq_dist = token_length_freq_dist_plot.get_figure()
     #fig_freq_dist.savefig(plot_destination)
+    # calculate the standard deviation, mean, token/type ratio
+    standard_deviation = statistics.stdev(token_lengths)
+    mean = statistics.mean(token_lengths)
+
+    # to get the number of unique tokens, i.e., types, I'm converting
+    # my list to a set (and back). I could also transform it to a pandas
+    # series, and drop the duplicates, but: if it is stupid and it works
+    # it isn't stupid
+
+    types_set = set(short_clean_tokens)
+    #type_token_ratio = len(types_set)/len(short_clean_tokens)
+    type_token_ratio = standardised_type_token_ratio(short_clean_tokens)
+
+    return standard_deviation, mean, type_token_ratio
+
+
+
 #create the Mendenhall Curve for the Throne of Glass Series
-mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
+std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
 
 #create the Mendenhall Curve for the Grishaverse Books
-mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
+std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
+
+# create a dataframe to store all the overview statistics in
+# columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
+# mean_sent; std_dev_sent; freq_sent_len ....
+# tag_frequencies
+# tag_ngram_frequencies
+# punctuation frequencies
+# token/type ratio
+data_overview = pd.DataFrame({"mean_tokens":[mean_tokens_tog_canon, mean_tokens_grishaverse_canon], "std_dev":[std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon], "type_token_ratio":[type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon]}, index= ["throne_of_glass_canon", "grishaverse_canon"])
+data_overview.to_csv(f"data_overview/data_overview.csv")
\ No newline at end of file