Skip to content
Snippets Groups Projects
Commit a73fbc7c authored by chrysanthopoulou's avatar chrysanthopoulou
Browse files

Add code unified dfs for freq dists + trial folder

parent b1bfe205
No related branches found
No related tags found
No related merge requests found
......@@ -345,23 +345,6 @@ class StylometryMetrics:
plot_distribution(x=self.punct_tag_freq_dist, y=self.punct_tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Types of Punctuation", y_label="Percentage of Occurence")
def execute_funcs(dir_paths, ExampleClass, plt_stuff=False, save_distributions_to_csv=False, data_overview_csv=False):
# prepping code for the different freq dist dfs
"""
data_overview = pd.read_csv(path_to_df, index_col=0)
not_first = 0
for index in data_overview.index:
feature_name = f"{type_of_features}_{index}"
delta_scores_new = calculate_delta(data_overview, [index], type_of_features=feature_name)
if not_first == 0:
delta_scores = delta_scores_new
not_first += 1
elif not_first:
delta_scores = pd.merge(delta_scores, delta_scores_new, how='outer')
delta_scores.index = data_overview.index
"""
#create lists for each of the columns of the dataframe we'll create
mean_tokens = []
std_dev_tokens = []
......@@ -396,11 +379,11 @@ def execute_funcs(dir_paths, ExampleClass, plt_stuff=False, save_distributions_t
if os.path.exists(path_for_dist) == False:
os.makedirs(path_for_dist)
C.md_freq_dist.name = f"{C.name_of_universe}_canon_works"
C.tk_len_dist = f"{C.name_of_universe}_canon_works"
C.tag_freq_dist = f"{C.name_of_universe}_canon_works"
C.punct_tag_freq_dist = f"{C.name_of_universe}_canon_works"
C.sent_len_dist = f"{C.name_of_universe}_canon_works"
C.sent_len_dist_short = f"{C.name_of_universe}_canon_works"
C.tk_len_dist.name = f"{C.name_of_universe}_canon_works"
C.tag_freq_dist.name = f"{C.name_of_universe}_canon_works"
C.punct_tag_freq_dist.name = f"{C.name_of_universe}_canon_works"
C.sent_len_dist.name = f"{C.name_of_universe}_canon_works"
C.sent_len_dist_short.name = f"{C.name_of_universe}_canon_works"
if not_first == 0:
md_freq_dist = C.md_freq_dist
tk_len_dist = C.tk_len_dist
......@@ -410,12 +393,12 @@ def execute_funcs(dir_paths, ExampleClass, plt_stuff=False, save_distributions_t
sent_len_dist_short = C.sent_len_dist_short
not_first += 1
elif not_first:
md_freq_dist = pd.merge(md_freq_dist, C.md_freq_dist, how='outer')
tk_len_dist = pd.merge(tk_len_dist, C.tk_len_dist, how='outer')
tag_freq_dist = pd.merge(tag_freq_dist, C.tag_freq_dist, how='outer')
punct_tag_freq_dist = pd.merge(punct_tag_freq_dist, C.punct_tag_freq_dist, how='outer')
sent_len_dist = pd.merge(sent_len_dist, C.sent_len_dist, how='outer')
sent_len_dist_short = pd.merge(sent_len_dist_short, C.sent_len_dist_short, how='outer')
md_freq_dist = pd.merge(md_freq_dist, C.md_freq_dist, how='outer', left_index=True, right_index=True)
tk_len_dist = pd.merge(tk_len_dist, C.tk_len_dist, how='outer', left_index=True, right_index=True)
tag_freq_dist = pd.merge(tag_freq_dist, C.tag_freq_dist, how='outer', left_index=True, right_index=True)
punct_tag_freq_dist = pd.merge(punct_tag_freq_dist, C.punct_tag_freq_dist, how='outer', left_index=True, right_index=True)
sent_len_dist = pd.merge(sent_len_dist, C.sent_len_dist, how='outer', left_index=True, right_index=True)
sent_len_dist_short = pd.merge(sent_len_dist_short, C.sent_len_dist_short, how='outer', left_index=True, right_index=True)
#plot stuff (much plot, much stuff)
if plt_stuff:
......@@ -456,11 +439,11 @@ def execute_funcs(dir_paths, ExampleClass, plt_stuff=False, save_distributions_t
if os.path.exists(path_for_dist) == False:
os.makedirs(path_for_dist)
C.md_freq_dist.name = f"{C.name_of_universe}_{quality}"
C.tk_len_dist = f"{C.name_of_universe}_{quality}"
C.tag_freq_dist = f"{C.name_of_universe}_{quality}"
C.punct_tag_freq_dist = f"{C.name_of_universe}_{quality}"
C.sent_len_dist = f"{C.name_of_universe}_{quality}"
C.sent_len_dist_short = f"{C.name_of_universe}_{quality}"
C.tk_len_dist.name = f"{C.name_of_universe}_{quality}"
C.tag_freq_dist.name = f"{C.name_of_universe}_{quality}"
C.punct_tag_freq_dist.name = f"{C.name_of_universe}_{quality}"
C.sent_len_dist.name = f"{C.name_of_universe}_{quality}"
C.sent_len_dist_short.name = f"{C.name_of_universe}_{quality}"
if not_first == 0:
md_freq_dist = C.md_freq_dist
tk_len_dist = C.tk_len_dist
......@@ -470,12 +453,12 @@ def execute_funcs(dir_paths, ExampleClass, plt_stuff=False, save_distributions_t
sent_len_dist_short = C.sent_len_dist_short
not_first += 1
elif not_first:
md_freq_dist = pd.merge(md_freq_dist, C.md_freq_dist, how='outer')
tk_len_dist = pd.merge(tk_len_dist, C.tk_len_dist, how='outer')
tag_freq_dist = pd.merge(tag_freq_dist, C.tag_freq_dist, how='outer')
punct_tag_freq_dist = pd.merge(punct_tag_freq_dist, C.punct_tag_freq_dist, how='outer')
sent_len_dist = pd.merge(sent_len_dist, C.sent_len_dist, how='outer')
sent_len_dist_short = pd.merge(sent_len_dist_short, C.sent_len_dist_short, how='outer')
md_freq_dist = pd.merge(md_freq_dist, C.md_freq_dist, how='outer', left_index=True, right_index=True)
tk_len_dist = pd.merge(tk_len_dist, C.tk_len_dist, how='outer', left_index=True, right_index=True)
tag_freq_dist = pd.merge(tag_freq_dist, C.tag_freq_dist, how='outer', left_index=True, right_index=True)
punct_tag_freq_dist = pd.merge(punct_tag_freq_dist, C.punct_tag_freq_dist, how='outer', left_index=True, right_index=True)
sent_len_dist = pd.merge(sent_len_dist, C.sent_len_dist, how='outer', left_index=True, right_index=True)
sent_len_dist_short = pd.merge(sent_len_dist_short, C.sent_len_dist_short, how='outer', left_index=True, right_index=True)
#plot stuff (much plot, much stuff)
if plt_stuff:
......@@ -495,24 +478,25 @@ def execute_funcs(dir_paths, ExampleClass, plt_stuff=False, save_distributions_t
)
data_overview.to_csv("data_overview/data_overview.csv")
if save_distributions_to_csv:
md_freq_dist.index = index
md_freq_dist = md_freq_dist.T
md_freq_dist.to_csv("data_overview/md_freq_dist.csv")
tk_len_dist.index = index
tk_len_dist = tk_len_dist.T
tk_len_dist.to_csv("data_overview/tk_len_dist.csv")
tag_freq_dist.index = index
tag_freq_dist = tag_freq_dist.T
tag_freq_dist.to_csv("data_overview/tag_freq_dist.csv")
punct_tag_freq_dist.index = index
punct_tag_freq_dist = punct_tag_freq_dist.T
punct_tag_freq_dist.to_csv("data_overview/punct_tag_freq_dist.csv")
sent_len_dist.index = index
sent_len_dist = sent_len_dist.T
sent_len_dist.to_csv("data_overview/sent_len_dist.csv")
sent_len_dist_short.index = index
sent_len_dist_short = sent_len_dist_short.T
sent_len_dist_short.to_csv("data_overview/sent_len_dist_short.csv")
if __name__ == "__main__":
    # Script entry point: run the stylometry pipeline over the data
    # directories listed below and write the frequency distributions to CSV.
    #
    # Smaller alternatives kept for reference (presumably for quick trial
    # runs -- confirm before relying on them):
    #   dir_paths = ['cosmere/data']
    #   dir_paths = ['trial_times']
    dir_paths = [
        'cosmere/data',
        'divergent/data',
        'grishaverse/data',
        'maze_runner/data',
        'murderbot/data',
        'percy/data',
        'school_for_good_and_evil/data',
        'throne_of_glass/data',
    ]

    # Alternative invocation that also enables plotting and the data-overview CSV:
    #   execute_funcs(dir_paths=dir_paths, ExampleClass=StylometryMetrics, plt_stuff=True, save_distributions_to_csv=True, data_overview_csv=True)
    execute_funcs(dir_paths=dir_paths, ExampleClass=StylometryMetrics, plt_stuff=False, save_distributions_to_csv=True)
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment