Skip to content
Snippets Groups Projects
Commit d2138256 authored by chrysanthopoulou's avatar chrysanthopoulou
Browse files

Add more stylometry code

parent b459b28e
No related branches found
No related tags found
No related merge requests found
Showing
with 84 additions and 55 deletions
......@@ -340,32 +340,6 @@ class StylometryMetrics:
file_path_for_pic = f"{self.file_path_template}punct_freq.png"
plot_distribution(x=self.punct_tag_freq_dist, y=self.punct_tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Types of Punctuation", y_label="Percentage of Occurence")
#grishaverse/data/split_txt_fanfics
def create_dataframe_with_overview_info():
    """Create the empty DataFrame that will hold the overview statistics.

    The frame starts with no rows; its columns are ``mean_tokens``,
    ``std_dev_tokens``, ``mean_sent`` and ``std_dev_sent``.

    Returns:
        The newly created, empty overview DataFrame. (It was previously
        built and then discarded, so callers had no way to use it.)
    """
    # Further columns planned for this overview:
    #   freq_token_len_1 ... freq_token_len_15, freq_sent_len ...,
    #   tag frequencies, tag n-gram frequencies, punctuation frequencies,
    #   token/type ratio
    column_names = ["mean_tokens", "std_dev_tokens", "mean_sent", "std_dev_sent"]
    data_overview = pd.DataFrame(
        {name: [] for name in column_names},
        index=[],
    )
    return data_overview
def execute_funcs(dir_paths, ExampleClass, plt_stuff=False, save_distributions_to_csv=False, data_overview_csv=False):
#create lists for each of the columns of the dataframe we'll create
mean_tokens = []
......
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
import matplotlib.pyplot as plt
import os
from nltk.probability import FreqDist
import pandas as pd
import statistics
import re
import dataframe_image as dfi
import scipy.stats
# Load the overview statistics once; the first CSV column holds the row labels.
# read_csv already returns a DataFrame, so no extra pd.DataFrame() wrapper
# (and no second read of the same file) is needed.
data_overview = pd.read_csv("data_overview/data_overview.csv", index_col=0)
# ,mean_tokens,std_dev_tokens,type_token_ratio,mean_sent,std_dev_sent
#(f"{C.name_of_universe}_canon") f"{C.name_of_universe}_{quality}"
"""
data_overview = pd.DataFrame(
{"mean_tokens":mean_tokens,
"std_dev_tokens":std_dev_tokens,
"type_token_ratio":type_token_ratio,
"mean_sent":mean_sent,
"std_dev_tokens":std_dev_tokens},
index = index
)
data_overview.to_csv(f"data_overview/data_overview.csv")
"""
# Reference rows for the statistics: everything except the grouped Grishaverse fanfics.
z_score_provider = data_overview.drop(["grishaverse_bad_fics", "grishaverse_good_fics", "grishaverse_medium_fics"], axis=0)
# One [column name, mean, standard deviation] entry per column.
# DataFrame.items() is used because iteritems() was removed in pandas 2.0.
mean_std_dev_list = [[column_name, column_data.mean(), column_data.std()] for column_name, column_data in z_score_provider.items()]
# Create a new DataFrame with the same column names and index labels as data_overview
......@@ -47,14 +35,10 @@ for index, row in z_scores_all_data.iterrows():
p_value = scipy.stats.norm.sf(abs(cell_value))
p_values_all_data.loc[index, column] = p_value
z_scores_all_data.to_csv("data_overview/z_scores_all_data.csv")
p_values_all_data.to_csv("data_overview/p_values_all_data.csv")
dfi.export(z_scores_all_data, "data_overview/z_scores_all_data.png", table_conversion = "matplotlib")
dfi.export(p_values_all_data, "data_overview/p_values_all_data.png", table_conversion = "matplotlib")
print(z_scores_all_data)
delta_scores_grouped_fanfics = pd.DataFrame(columns=["throne_of_glass_canon", "grishaverse_canon", "throne_of_glass_bad_fics", "throne_of_glass_good_fics", "throne_of_glass_medium_fics"], index=["grishaverse_bad_fics", "grishaverse_good_fics", "grishaverse_medium_fics"])
delta_scores_grouped = pd.DataFrame(columns=["throne_of_glass_canon", "grishaverse_canon", "throne_of_glass_bad_fics", "throne_of_glass_good_fics", "throne_of_glass_medium_fics"], index=["grishaverse_bad_fics", "grishaverse_good_fics", "grishaverse_medium_fics"])
for fic in ["grishaverse_bad_fics", "grishaverse_good_fics", "grishaverse_medium_fics"]:
delta_scores = []
......@@ -65,12 +49,9 @@ for fic in ["grishaverse_bad_fics", "grishaverse_good_fics", "grishaverse_medium
delta_score += abs(row[column] - z_scores_all_data.loc[fic, column])
delta_score /= len(z_scores_all_data.columns)
delta_scores.append(delta_score)
delta_scores_grouped_fanfics.loc[fic, index] = delta_score
delta_scores_grouped.loc[fic, index] = delta_score
#delta_scores_grouped_fanfics.loc[fic, :] = delta_score
print(delta_scores)
dfi.export(delta_scores_grouped_fanfics, "data_overview/delta_scores_grouped_fanfics.png", table_conversion = "matplotlib")
print(delta_scores_grouped_fanfics)
delta_scores_grouped.to_csv("data_overview/delta_scores_grouped_fanfics.csv")
\ No newline at end of file
import os
import pandas as pd
import re
import scipy.stats
def calculate_delta(path_to_data_overview, type_of_distance):
    """Compute pairwise delta scores between all rows of the overview CSV.

    Every column of the CSV is standardised to z-scores (column mean and
    sample standard deviation). The delta score between two rows is the mean
    absolute difference of their z-scores across all columns.

    Two files are written into the ``data_overview`` directory:
    ``z_scores_all_data_<type_of_distance>.csv`` and
    ``delta_scores_<type_of_distance>.csv``.

    Args:
        path_to_data_overview: Path of the CSV to read; its first column is
            used as the row labels.
        type_of_distance: Label used as the suffix of both output file names.

    Returns:
        A square DataFrame whose index and columns are the row labels of the
        input and whose cells hold the delta score of each pair.
    """
    data_overview = pd.read_csv(path_to_data_overview, index_col=0)

    # The output location is fixed; make sure it exists before writing.
    os.makedirs("data_overview", exist_ok=True)

    # Calculate z-scores for each cell in the data_overview DataFrame
    z_scores_all_data = (data_overview - data_overview.mean()) / data_overview.std()
    z_scores_all_data.to_csv(f"data_overview/z_scores_all_data_{type_of_distance}.csv")

    # The matrix is keyed by row labels on both axes. Keying it by the
    # feature columns would make the .loc row lookups below fail.
    labels = data_overview.index
    delta_scores_grouped = pd.DataFrame(index=labels, columns=labels, dtype=float)
    for fic in labels:
        # Subtracting one row broadcasts over all rows; averaging across the
        # feature axis yields one delta score per compared row.
        differences = (z_scores_all_data - z_scores_all_data.loc[fic]).abs()
        delta_scores_grouped.loc[fic] = differences.mean(axis=1)

    delta_scores_grouped.to_csv(f"data_overview/delta_scores_{type_of_distance}.csv")
    return delta_scores_grouped
# Script entry: run the delta computation on the overview CSV. The
# "tk_sent_len" label becomes the suffix of the output file names.
calculate_delta(path_to_data_overview="data_overview/data_overview.csv", type_of_distance="tk_sent_len")
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
percy/plots/medium/full_sent_len_dist.png

130 B

percy/plots/medium/md_freq.png

130 B

percy/plots/medium/pos_tag_freq.png

130 B

percy/plots/medium/punct_freq.png

130 B

percy/plots/medium/short_sent_len_dist.png

130 B

percy/plots/medium/token_freq_len_dist.png

130 B

0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment