Commit 8a31a9ff authored by chrysanthopoulou

Modify the separation some more

parent 428c3c1f
@@ -104,7 +104,7 @@ def calculate_freq_dist_as_clean_panda(list_of_items, most_common_limit=False):
     for i in range(0, len(new_dist.index)):
         #for index in new_token_len_dist.index:
-        new_dist.iat[i] = round(new_dist.iat[i]/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
+        new_dist.iat[i] = round(float(new_dist.iat[i])/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
         #if float(new_token_len_dist.iat[i]) == 0.00:
         # new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
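Context for this hunk, which recurs in three files of this commit: the only change is the float() cast before the division. FreqDist counts are integers, and writing a float share back into an integer Series via .iat can truncate or trigger dtype warnings in newer pandas. A minimal sketch of an equivalent version that avoids the per-cell cast, assuming the function wraps an nltk FreqDist in a pandas Series (the construction of new_dist is not shown in the diff, so the names here are inferred):

    import pandas as pd
    from nltk import FreqDist

    def calculate_freq_dist_as_clean_panda(list_of_items, most_common_limit=False):
        counts = FreqDist(list_of_items).most_common(most_common_limit or None)
        new_dist = pd.Series(dict(counts), dtype=float)   # float dtype up front
        return (new_dist / len(list_of_items)).round(3)   # relative frequencies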
@@ -90,13 +90,13 @@ def separate_fanfics_by_good_medium_bad(df, series, few_kudos_thres, medium_kudo
     if os.path.exists(f"{series}/data/split_txt_fanfics/medium_fics") == False:
         os.makedirs(f"{series}/data/split_txt_fanfics/medium_fics")
-    with open(f"{series}/data/split_txt_fanfics/good_fics.txt", "w") as f:
+    with open(f"{series}/data/split_txt_fanfics/good_fics/good_fics.txt", "w") as f:
         f.write(good_fics_joined)
-    with open(f"{series}/data/split_txt_fanfics/bad_fics.txt", "w") as f:
+    with open(f"{series}/data/split_txt_fanfics/bad_fics/bad_fics.txt", "w") as f:
         f.write(bad_fics_joined)
-    with open(f"{series}/data/split_txt_fanfics/medium_fics.txt", "w") as f:
+    with open(f"{series}/data/split_txt_fanfics/medium_fics/medium_fics.txt", "w") as f:
         f.write(medium_fics_joined)

 def clean_fanfic_dataset(file_path):
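The hunk reroutes each output file into its own subdirectory, but the visible context only creates medium_fics; good_fics/ and bad_fics/ must be created in the lines above the hunk for these open(..., "w") calls to succeed. A sketch of the same directory-plus-write pattern with the idiomatic exist_ok guard (write_split is a hypothetical helper, not part of the commit):

    import os

    def write_split(series, label, joined_text):
        out_dir = f"{series}/data/split_txt_fanfics/{label}"
        os.makedirs(out_dir, exist_ok=True)   # no-op if the directory already exists
        with open(f"{out_dir}/{label}.txt", "w") as f:
            f.write(joined_text)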
@@ -156,6 +156,7 @@ def run_functions(file_paths):

 if __name__ == "__main__":
+    """
     #clean_fanfic_dataset("cosmere/data/fanfics/cosmere_fanfics.csv")
@@ -186,27 +187,40 @@ if __name__ == "__main__":
     """
     call_me_by_your_name = pd.read_csv("call_me_by_your_name/data/fanfics/call_me_by_your_name_fanfics_new.csv")
     separate_fanfics_by_good_medium_bad(call_me_by_your_name, "call_me_by_your_name", 69, 201) #I select the thresholds to have a 50% 30% 20% split
+    # numbers underneath for 50% 35% 15% 69 276
     cosmere = pd.read_csv("cosmere/data/fanfics/cosmere_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(cosmere, "cosmere", 36, 73) # I select the thresholds to have a 50% 25% 25% split
+    separate_fanfics_by_good_medium_bad(cosmere, "cosmere", 40, 88)
+    # 40 108
     divergent = pd.read_csv("divergent/data/fanfics/divergent_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(divergent, "divergent", 32, 70)
+    separate_fanfics_by_good_medium_bad(divergent, "divergent", 33, 94)
+    # 33 119
     grishaverse = pd.read_csv("grishaverse/data/fanfics/grisha_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(grishaverse, "grishaverse", 131, 284)
+    separate_fanfics_by_good_medium_bad(grishaverse, "grishaverse", 134, 346)
+    # 134 440
     maze_runner = pd.read_csv("maze_runner/data/fanfics/mazerunner_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(maze_runner, "maze_runner", 84, 188)
+    separate_fanfics_by_good_medium_bad(maze_runner, "maze_runner", 89, 239)
+    # 89 299
     murderbot = pd.read_csv("murderbot/data/fanfics/murderbot_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(murderbot, "murderbot", 63, 143)
+    separate_fanfics_by_good_medium_bad(murderbot, "murderbot", 95, 209)
+    # 95 239
     percy = pd.read_csv("percy/data/fanfics/percy_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(percy, "percy", 94, 233)
+    separate_fanfics_by_good_medium_bad(percy, "percy", 99, 291)
+    # 99 343
     red_white_royal_blue = pd.read_csv("red_white_royal_blue/data/fanfics/red_white_royal_blue_fanfics_new.csv")
     separate_fanfics_by_good_medium_bad(red_white_royal_blue, "red_white_royal_blue", 280, 666)
+    # 280 884
     school_for_good_and_evil = pd.read_csv("school_for_good_and_evil/data/fanfics/school_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(school_for_good_and_evil, "school_for_good_and_evil", 63, 143)
+    separate_fanfics_by_good_medium_bad(school_for_good_and_evil, "school_for_good_and_evil", 63, 169)
+    # 63 198
     simonverse = pd.read_csv("simonverse/data/fanfics/simonverse_fanfics_new.csv")
     separate_fanfics_by_good_medium_bad(simonverse, "simonverse", 181, 347)
+    # 181 376
     song_of_achilles = pd.read_csv("song_of_achilles/data/fanfics/song_of_achilles_fanfics_new.csv")
     separate_fanfics_by_good_medium_bad(song_of_achilles, "song_of_achilles", 122, 285)
+    # 122 329
     throne_of_glass = pd.read_csv("throne_of_glass/data/fanfics/throne_of_glass_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(throne_of_glass, "throne_of_glass", 56, 109)
-    """
\ No newline at end of file
+    separate_fanfics_by_good_medium_bad(throne_of_glass, "throne_of_glass", 56, 131)
+    # 56 153
+    #"""
\ No newline at end of file
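The inline comments pair each series with its thresholds and target shares (e.g. "50% 30% 20%"), which suggests the cut-offs are read off the kudos distribution itself. A sketch of how such thresholds could be derived from quantiles (kudos_thresholds is a hypothetical helper; the quantile levels are assumptions matching the 50/30/20 comment):

    import pandas as pd

    def kudos_thresholds(df, few_q=0.50, medium_q=0.80):
        kudos = pd.to_numeric(df["kudos"], errors="coerce").dropna()
        # bottom 50% -> few kudos, next 30% -> medium, top 20% -> good
        return int(kudos.quantile(few_q)), int(kudos.quantile(medium_q))

For call_me_by_your_name, kudos_thresholds(call_me_by_your_name) would then play the role of the hand-picked 69 and 201.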
@@ -87,7 +87,7 @@ def calculate_freq_dist_as_clean_panda(list_of_items, most_common_limit=False):
     for i in range(0, len(new_dist.index)):
         #for index in new_token_len_dist.index:
-        new_dist.iat[i] = round(new_dist.iat[i]/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
+        new_dist.iat[i] = round(float(new_dist.iat[i])/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
         #if float(new_token_len_dist.iat[i]) == 0.00:
         # new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
@@ -109,14 +109,14 @@ def mendenhall_token_metrics(tokens):
     token_lengths = sorted(token_lengths)[trim_len:-trim_len]
     new_token_len_dist = calculate_freq_dist_as_clean_panda(token_lengths, most_common_limit=15) # token len freq dist
-    try:
-        standard_deviation = statistics.stdev(token_lengths)
-        mean = statistics.mean(token_lengths)
+    standard_deviation = statistics.stdev(token_lengths)
+    mean = statistics.mean(token_lengths)
+    """
     except:
         print("too short not enough tokens")
         standard_deviation = np.nan
         mean = np.nan
+    """
     return new_token_len_dist, standard_deviation, mean
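Context for the disabled branch: statistics.stdev() raises StatisticsError when given fewer than two data points (and statistics.mean() raises it on an empty sequence), which the old except mapped to NaN. With the handler commented out, a too-short token list now propagates the exception to the caller. A sketch of the narrower form the old behaviour corresponds to, with safe_stdev_mean as a hypothetical helper:

    import statistics
    import numpy as np

    def safe_stdev_mean(values):
        try:
            return statistics.stdev(values), statistics.mean(values)
        except statistics.StatisticsError:   # fewer than two data points
            return np.nan, np.nan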
@@ -221,14 +221,15 @@ def calculate_sent_len_dist(text):
     sent_len_dist_short = calculate_freq_dist_as_clean_panda(sent_lens, most_common_limit=25)
     # calculate the standard deviation, mean
-    try:
-        standard_deviation_sent = statistics.stdev(sent_lens)
-        mean_sent = statistics.mean(sent_lens)
+    standard_deviation_sent = statistics.stdev(sent_lens)
+    mean_sent = statistics.mean(sent_lens)
+    """
     except:
         print("too short not enough sents")
         standard_deviation_sent = np.nan
         mean_sent = np.nan
+    """
     return sent_len_dist, sent_len_dist_short, standard_deviation_sent, mean_sent
@@ -272,8 +273,9 @@ def execute_funcs(file_paths, ExampleClass, save_distributions_to_csv=False, dat
     mean_sent = []
     std_dev_sents = []
     nums_of_kudos = []
-    not_first = 0
+    not_first = 0
     for file_path in file_paths:
+        print(file_path)
         #"cosmere/data/fanfics/cosmere_fanfics_new.csv"
         fanfic_df = pd.read_csv(file_path, thousands=",")
         # get series from directory path
@@ -290,13 +292,15 @@
         num_of_kudos = pd.to_numeric(row["kudos"], errors='coerce')
         text = row["body"]
         C = ExampleClass(text=text, num_of_kudos=num_of_kudos)
-        if len(C.clean_tokens) <= 10:
+        try:
+            # Calculate Stuff
+            C.calculate_mendenhall_token_metrics()
+            C.calculate_pos_tag_distribution()
+            C.calculate_sent_len_distribution()
+            C.calculate_punct_distribution()
+        except:
             print(f"faulty datapoint {index} \n {C.clean_tokens}")
             continue
-        # Calculate Stuff
-        C.calculate_mendenhall_token_metrics()
-        C.calculate_pos_tag_distribution()
-        C.calculate_sent_len_distribution()
-        C.calculate_punct_distribution()
         # data overview csv
         if data_overview_csv:
             mean_tokens.append(C.tk_len_mean)
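This hunk swaps the fixed clean_tokens length guard for a try/except around the four metric calls, so any per-row failure is skipped rather than only short texts. One caveat: a bare except also swallows KeyboardInterrupt and SystemExit; a sketch of a narrower form that keeps Ctrl-C working while preserving the skip-and-continue behaviour:

    try:
        C.calculate_mendenhall_token_metrics()
        C.calculate_pos_tag_distribution()
        C.calculate_sent_len_distribution()
        C.calculate_punct_distribution()
    except Exception as e:                    # narrower than a bare except
        print(f"faulty datapoint {index}: {e}")
        continue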
@@ -366,15 +370,14 @@ if __name__ == "__main__":
     file_path_list = ['call_me_by_your_name/data', 'cosmere/data', 'divergent/data', 'grishaverse/data', 'maze_runner/data', 'murderbot/data', 'percy/data', 'red_white_royal_blue/data', 'school_for_good_and_evil/data', 'simonverse/data', 'song_of_achilles/data', 'throne_of_glass/data',]
-    #file_path_list = ['song_of_achilles/data', 'throne_of_glass/data', 'simonverse/data', ]
     file_paths_lists = [os.listdir(f"{file_path}/fanfics") for file_path in file_path_list]

     file_paths = []
     for file_path in file_paths_lists:
         #print(*file_path_list)
         file_paths.append(*file_path)

     file_paths_new = [f"{folder}/fanfics/{file}" for folder, file in zip(file_path_list, file_paths)]
     execute_funcs(file_paths=file_paths_new, ExampleClass=StylometryMetrics, save_distributions_to_csv=True, data_overview_csv=True)
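A caveat on the path assembly here: list.append() takes exactly one argument, so file_paths.append(*file_path) only works while each fanfics/ directory contains a single file and raises TypeError otherwise; zip(file_path_list, file_paths) likewise pairs one file per folder. A sketch of a flattening that keeps the folder/file pairing for any number of files per folder:

    import os

    file_paths_new = [
        f"{folder}/fanfics/{file}"
        for folder in file_path_list
        for file in os.listdir(f"{folder}/fanfics")
    ]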
@@ -109,7 +109,7 @@ def calculate_freq_dist_as_clean_panda(list_of_items, most_common_limit=False):
     for i in range(0, len(new_dist.index)):
         #for index in new_token_len_dist.index:
-        new_dist.iat[i] = round(new_dist.iat[i]/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
+        new_dist.iat[i] = round(float(new_dist.iat[i])/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
         #if float(new_token_len_dist.iat[i]) == 0.00:
         # new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
@@ -601,7 +601,10 @@ def execute_funcs(dir_paths, ExampleClass, plt_stuff=False, save_distributions_t

 if __name__ == "__main__":
     #dir_paths = ['cosmere/data', 'divergent/data', 'grishaverse/data', 'maze_runner/data', 'murderbot/data', 'percy/data', 'school_for_good_and_evil/data', 'throne_of_glass/data',]
-    dir_paths = ['trial_times']
+    #dir_paths = ['trial_times']
+    dir_paths = ['call_me_by_your_name/data', 'cosmere/data', 'divergent/data', 'grishaverse/data', 'maze_runner/data', 'murderbot/data', 'percy/data', 'red_white_royal_blue/data', 'school_for_good_and_evil/data', 'simonverse/data', 'song_of_achilles/data', 'throne_of_glass/data',]
+    #execute_funcs(dir_paths=dir_paths, ExampleClass=StylometryMetrics, plt_stuff=True, save_distributions_to_csv=True, data_overview_csv=True)
+    execute_funcs(dir_paths=dir_paths, ExampleClass=StylometryMetrics, plt_stuff=False, save_distributions_to_csv=True)