Commit 8a31a9ff authored by chrysanthopoulou

Modify the separation some more

parent 428c3c1f
@@ -104,7 +104,7 @@ def calculate_freq_dist_as_clean_panda(list_of_items, most_common_limit=False):
     for i in range(0, len(new_dist.index)):
         #for index in new_token_len_dist.index:
-        new_dist.iat[i] = round(new_dist.iat[i]/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
+        new_dist.iat[i] = round(float(new_dist.iat[i])/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
         #if float(new_token_len_dist.iat[i]) == 0.00:
         # new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
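Context for this hunk, which recurs in three files of this commit: the only change is the float() cast before the division. FreqDist counts are integers, and writing a float share back into an integer Series via .iat can truncate or trigger dtype warnings in newer pandas. A minimal sketch of an equivalent version that avoids the per-cell cast, assuming the function wraps an nltk FreqDist in a pandas Series (the construction of new_dist is not shown in the diff, so the names here are inferred):

    import pandas as pd
    from nltk import FreqDist

    def calculate_freq_dist_as_clean_panda(list_of_items, most_common_limit=False):
        counts = FreqDist(list_of_items).most_common(most_common_limit or None)
        new_dist = pd.Series(dict(counts), dtype=float)   # float dtype up front
        return (new_dist / len(list_of_items)).round(3)   # relative frequencies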
@@ -90,13 +90,13 @@ def separate_fanfics_by_good_medium_bad(df, series, few_kudos_thres, medium_kudo
     if os.path.exists(f"{series}/data/split_txt_fanfics/medium_fics") == False:
         os.makedirs(f"{series}/data/split_txt_fanfics/medium_fics")
-    with open(f"{series}/data/split_txt_fanfics/good_fics.txt", "w") as f:
+    with open(f"{series}/data/split_txt_fanfics/good_fics/good_fics.txt", "w") as f:
         f.write(good_fics_joined)
-    with open(f"{series}/data/split_txt_fanfics/bad_fics.txt", "w") as f:
+    with open(f"{series}/data/split_txt_fanfics/bad_fics/bad_fics.txt", "w") as f:
         f.write(bad_fics_joined)
-    with open(f"{series}/data/split_txt_fanfics/medium_fics.txt", "w") as f:
+    with open(f"{series}/data/split_txt_fanfics/medium_fics/medium_fics.txt", "w") as f:
         f.write(medium_fics_joined)

 def clean_fanfic_dataset(file_path):
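The hunk reroutes each output file into its own subdirectory, but the visible context only creates medium_fics; good_fics/ and bad_fics/ must be created in the lines above the hunk for these open(..., "w") calls to succeed. A sketch of the same directory-plus-write pattern with the idiomatic exist_ok guard (write_split is a hypothetical helper, not part of the commit):

    import os

    def write_split(series, label, joined_text):
        out_dir = f"{series}/data/split_txt_fanfics/{label}"
        os.makedirs(out_dir, exist_ok=True)   # no-op if the directory already exists
        with open(f"{out_dir}/{label}.txt", "w") as f:
            f.write(joined_text)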
@@ -156,6 +156,7 @@ def run_functions(file_paths):

 if __name__ == "__main__":
+    """
     #clean_fanfic_dataset("cosmere/data/fanfics/cosmere_fanfics.csv")
@@ -186,27 +187,40 @@ if __name__ == "__main__":
     """
     call_me_by_your_name = pd.read_csv("call_me_by_your_name/data/fanfics/call_me_by_your_name_fanfics_new.csv")
     separate_fanfics_by_good_medium_bad(call_me_by_your_name, "call_me_by_your_name", 69, 201) #I select the thresholds to have a 50% 30% 20% split
+    # numbers underneath for 50% 35% 15% 69 276
     cosmere = pd.read_csv("cosmere/data/fanfics/cosmere_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(cosmere, "cosmere", 36, 73) # I select the thresholds to have a 50% 25% 25% split
+    separate_fanfics_by_good_medium_bad(cosmere, "cosmere", 40, 88)
+    # 40 108
     divergent = pd.read_csv("divergent/data/fanfics/divergent_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(divergent, "divergent", 32, 70)
+    separate_fanfics_by_good_medium_bad(divergent, "divergent", 33, 94)
+    # 33 119
     grishaverse = pd.read_csv("grishaverse/data/fanfics/grisha_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(grishaverse, "grishaverse", 131, 284)
+    separate_fanfics_by_good_medium_bad(grishaverse, "grishaverse", 134, 346)
+    # 134 440
     maze_runner = pd.read_csv("maze_runner/data/fanfics/mazerunner_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(maze_runner, "maze_runner", 84, 188)
+    separate_fanfics_by_good_medium_bad(maze_runner, "maze_runner", 89, 239)
+    # 89 299
     murderbot = pd.read_csv("murderbot/data/fanfics/murderbot_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(murderbot, "murderbot", 63, 143)
+    separate_fanfics_by_good_medium_bad(murderbot, "murderbot", 95, 209)
+    # 95 239
     percy = pd.read_csv("percy/data/fanfics/percy_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(percy, "percy", 94, 233)
+    separate_fanfics_by_good_medium_bad(percy, "percy", 99, 291)
+    # 99 343
     red_white_royal_blue = pd.read_csv("red_white_royal_blue/data/fanfics/red_white_royal_blue_fanfics_new.csv")
     separate_fanfics_by_good_medium_bad(red_white_royal_blue, "red_white_royal_blue", 280, 666)
+    # 280 884
     school_for_good_and_evil = pd.read_csv("school_for_good_and_evil/data/fanfics/school_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(school_for_good_and_evil, "school_for_good_and_evil", 63, 143)
+    separate_fanfics_by_good_medium_bad(school_for_good_and_evil, "school_for_good_and_evil", 63, 169)
+    # 63 198
     simonverse = pd.read_csv("simonverse/data/fanfics/simonverse_fanfics_new.csv")
     separate_fanfics_by_good_medium_bad(simonverse, "simonverse", 181, 347)
+    # 181 376
     song_of_achilles = pd.read_csv("song_of_achilles/data/fanfics/song_of_achilles_fanfics_new.csv")
     separate_fanfics_by_good_medium_bad(song_of_achilles, "song_of_achilles", 122, 285)
+    # 122 329
     throne_of_glass = pd.read_csv("throne_of_glass/data/fanfics/throne_of_glass_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(throne_of_glass, "throne_of_glass", 56, 109)
-    """
\ No newline at end of file
+    separate_fanfics_by_good_medium_bad(throne_of_glass, "throne_of_glass", 56, 131)
+    # 56 153
+    #"""
\ No newline at end of file
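The inline comments pair each series with its thresholds and target shares (e.g. "50% 30% 20%"), which suggests the cut-offs are read off the kudos distribution itself. A sketch of how such thresholds could be derived from quantiles (kudos_thresholds is a hypothetical helper; the quantile levels are assumptions matching the 50/30/20 comment):

    import pandas as pd

    def kudos_thresholds(df, few_q=0.50, medium_q=0.80):
        kudos = pd.to_numeric(df["kudos"], errors="coerce").dropna()
        # bottom 50% -> few kudos, next 30% -> medium, top 20% -> good
        return int(kudos.quantile(few_q)), int(kudos.quantile(medium_q))

For call_me_by_your_name, kudos_thresholds(call_me_by_your_name) would then play the role of the hand-picked 69 and 201.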
@@ -87,7 +87,7 @@ def calculate_freq_dist_as_clean_panda(list_of_items, most_common_limit=False):
     for i in range(0, len(new_dist.index)):
         #for index in new_token_len_dist.index:
-        new_dist.iat[i] = round(new_dist.iat[i]/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
+        new_dist.iat[i] = round(float(new_dist.iat[i])/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
         #if float(new_token_len_dist.iat[i]) == 0.00:
         # new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
@@ -109,14 +109,14 @@ def mendenhall_token_metrics(tokens):
     token_lengths = sorted(token_lengths)[trim_len:-trim_len]
     new_token_len_dist = calculate_freq_dist_as_clean_panda(token_lengths, most_common_limit=15) # token len freq dist
-    try:
-        standard_deviation = statistics.stdev(token_lengths)
-        mean = statistics.mean(token_lengths)
+    standard_deviation = statistics.stdev(token_lengths)
+    mean = statistics.mean(token_lengths)
+    """
     except:
         print("too short not enough tokens")
         standard_deviation = np.nan
         mean = np.nan
+    """
     return new_token_len_dist, standard_deviation, mean
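Context for the disabled branch: statistics.stdev() raises StatisticsError when given fewer than two data points (and statistics.mean() raises it on an empty sequence), which the old except mapped to NaN. With the handler commented out, a too-short token list now propagates the exception to the caller. A sketch of the narrower form the old behaviour corresponds to, with safe_stdev_mean as a hypothetical helper:

    import statistics
    import numpy as np

    def safe_stdev_mean(values):
        try:
            return statistics.stdev(values), statistics.mean(values)
        except statistics.StatisticsError:   # fewer than two data points
            return np.nan, np.nan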
@@ -221,14 +221,15 @@ def calculate_sent_len_dist(text):
     sent_len_dist_short = calculate_freq_dist_as_clean_panda(sent_lens, most_common_limit=25)
     # calculate the standard deviation, mean
-    try:
-        standard_deviation_sent = statistics.stdev(sent_lens)
-        mean_sent = statistics.mean(sent_lens)
+    standard_deviation_sent = statistics.stdev(sent_lens)
+    mean_sent = statistics.mean(sent_lens)
+    """
     except:
         print("too short not enough sents")
         standard_deviation_sent = np.nan
         mean_sent = np.nan
+    """
     return sent_len_dist, sent_len_dist_short, standard_deviation_sent, mean_sent
@@ -272,8 +273,9 @@ def execute_funcs(file_paths, ExampleClass, save_distributions_to_csv=False, dat
     mean_sent = []
     std_dev_sents = []
     nums_of_kudos = []
-    not_first = 0
+    not_first = 0
     for file_path in file_paths:
+        print(file_path)
         #"cosmere/data/fanfics/cosmere_fanfics_new.csv"
         fanfic_df = pd.read_csv(file_path, thousands=",")
         # get series from directory path
@@ -290,13 +292,15 @@
         num_of_kudos = pd.to_numeric(row["kudos"], errors='coerce')
         text = row["body"]
         C = ExampleClass(text=text, num_of_kudos=num_of_kudos)
-        if len(C.clean_tokens) <= 10:
+        try:
+            # Calculate Stuff
+            C.calculate_mendenhall_token_metrics()
+            C.calculate_pos_tag_distribution()
+            C.calculate_sent_len_distribution()
+            C.calculate_punct_distribution()
+        except:
             print(f"faulty datapoint {index} \n {C.clean_tokens}")
             continue
-        # Calculate Stuff
-        C.calculate_mendenhall_token_metrics()
-        C.calculate_pos_tag_distribution()
-        C.calculate_sent_len_distribution()
-        C.calculate_punct_distribution()
         # data overview csv
         if data_overview_csv:
             mean_tokens.append(C.tk_len_mean)
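This hunk swaps the fixed clean_tokens length guard for a try/except around the four metric calls, so any per-row failure is skipped rather than only short texts. One caveat: a bare except also swallows KeyboardInterrupt and SystemExit; a sketch of a narrower form that keeps Ctrl-C working while preserving the skip-and-continue behaviour:

    try:
        C.calculate_mendenhall_token_metrics()
        C.calculate_pos_tag_distribution()
        C.calculate_sent_len_distribution()
        C.calculate_punct_distribution()
    except Exception as e:                    # narrower than a bare except
        print(f"faulty datapoint {index}: {e}")
        continue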
@@ -366,15 +370,14 @@ if __name__ == "__main__":
     file_path_list = ['call_me_by_your_name/data', 'cosmere/data', 'divergent/data', 'grishaverse/data', 'maze_runner/data', 'murderbot/data', 'percy/data', 'red_white_royal_blue/data', 'school_for_good_and_evil/data', 'simonverse/data', 'song_of_achilles/data', 'throne_of_glass/data',]
-    #file_path_list = ['song_of_achilles/data', 'throne_of_glass/data', 'simonverse/data', ]
     file_paths_lists = [os.listdir(f"{file_path}/fanfics") for file_path in file_path_list]

     file_paths = []
     for file_path in file_paths_lists:
         #print(*file_path_list)
         file_paths.append(*file_path)

     file_paths_new = [f"{folder}/fanfics/{file}" for folder, file in zip(file_path_list, file_paths)]
     execute_funcs(file_paths=file_paths_new, ExampleClass=StylometryMetrics, save_distributions_to_csv=True, data_overview_csv=True)
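A caveat on the path assembly here: list.append() takes exactly one argument, so file_paths.append(*file_path) only works while each fanfics/ directory contains a single file and raises TypeError otherwise; zip(file_path_list, file_paths) likewise pairs one file per folder. A sketch of a flattening that keeps the folder/file pairing for any number of files per folder:

    import os

    file_paths_new = [
        f"{folder}/fanfics/{file}"
        for folder in file_path_list
        for file in os.listdir(f"{folder}/fanfics")
    ]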
@@ -109,7 +109,7 @@ def calculate_freq_dist_as_clean_panda(list_of_items, most_common_limit=False):
     for i in range(0, len(new_dist.index)):
         #for index in new_token_len_dist.index:
-        new_dist.iat[i] = round(new_dist.iat[i]/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
+        new_dist.iat[i] = round(float(new_dist.iat[i])/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
         #if float(new_token_len_dist.iat[i]) == 0.00:
         # new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
@@ -601,7 +601,10 @@ def execute_funcs(dir_paths, ExampleClass, plt_stuff=False, save_distributions_t

 if __name__ == "__main__":
     #dir_paths = ['cosmere/data', 'divergent/data', 'grishaverse/data', 'maze_runner/data', 'murderbot/data', 'percy/data', 'school_for_good_and_evil/data', 'throne_of_glass/data',]
-    dir_paths = ['trial_times']
+    #dir_paths = ['trial_times']
+    dir_paths = ['call_me_by_your_name/data', 'cosmere/data', 'divergent/data', 'grishaverse/data', 'maze_runner/data', 'murderbot/data', 'percy/data', 'red_white_royal_blue/data', 'school_for_good_and_evil/data', 'simonverse/data', 'song_of_achilles/data', 'throne_of_glass/data',]
+    #execute_funcs(dir_paths=dir_paths, ExampleClass=StylometryMetrics, plt_stuff=True, save_distributions_to_csv=True, data_overview_csv=True)
+    execute_funcs(dir_paths=dir_paths, ExampleClass=StylometryMetrics, plt_stuff=False, save_distributions_to_csv=True)