Add Pearson and Spearman correlation calculations

904f7a80 · chrysanthopoulou · cf65f872 · 904f7a80 · 904f7a80 · 904f7a80
Commit 904f7a80 authored 1 year ago by chrysanthopoulou
--- a/correlation.py
+++ b/correlation.py
+import seaborn as sns
+import pandas as pd
+import matplotlib.pyplot as plt
+from scipy import stats
+import os
+
+#sing_fanfic = pd.read_csv("cosmere/fanfiction_stylo_data/deltas/md_freq_dist.csv", index_col=0)
+#sns.scatterplot(data=sing_fanfic, x="grishaverse_good", y=sing_fanfic.index)
+#plt.savefig("cosmere/fanfiction_stylo_data/deltas/grishaverse_good.png")
+
+# Names of the feature tables to correlate; each is the stem of a CSV file.
+types_of_feature = ["data_overview", "md_freq_dist", "tag_freq_dist","punct_tag_freq_dist", "sent_len_dist", "tk_len_dist",]
+fandoms = ["call_me_by_your_name"]
+# deltas
+# For each fandom, correlate the CSV index (bound to `kudos`; presumably each
+# fic's kudos count -- confirm against the data) with every delta column of
+# every feature table, then write one Pearson and one Spearman table with
+# feature types as rows and delta columns as columns.
+for fandom in fandoms:
+    print(fandom)
+    pearsons_by_feature = {}
+    spearman_by_feature = {}
+    for type_of_feature in types_of_feature:
+        sing_fanfic = pd.read_csv(f"data_overview/single_fic_deltas/{fandom}/{type_of_feature}.csv", index_col=0)
+        # The index is the same for every column, so look it up once.
+        kudos = sing_fanfic.index
+        pearsons_list = []
+        spearman_list = []
+        for column in sing_fanfic.columns:
+            delta = sing_fanfic[column]
+
+            # Both scipy calls return (coefficient, p-value); only the
+            # coefficient is kept.
+            corr, _ = stats.pearsonr(kudos, delta)
+            pearsons_list.append(corr)
+
+            corr, _ = stats.spearmanr(kudos, delta)
+            spearman_list.append(corr)
+        # Label each coefficient with the column it was computed from, so the
+        # final table does not depend on which CSV happened to be read last.
+        pearsons_by_feature[type_of_feature] = pd.Series(pearsons_list, index=sing_fanfic.columns, dtype=float)
+        spearman_by_feature[type_of_feature] = pd.Series(spearman_list, index=sing_fanfic.columns, dtype=float)
+
+    # Dict keys become columns; transpose so feature types are the rows.
+    df_pearsons = pd.DataFrame(pearsons_by_feature).T
+    df_spearman = pd.DataFrame(spearman_by_feature).T
+
+    # to_csv does not create missing directories, so make sure it exists.
+    output_dir = f"correlation/deltas/{fandom}"
+    os.makedirs(output_dir, exist_ok=True)
+    df_pearsons.to_csv(f"{output_dir}/pearsons.csv")
+    df_spearman.to_csv(f"{output_dir}/spearman.csv")
+
+# stylo features
+    
+# For each feature table, correlate the CSV index (bound to `kudos`; presumably
+# each fic's kudos count -- confirm against the data) with every feature column
+# per fandom, then write one Pearson and one Spearman table per feature type
+# with fandoms as rows and feature columns as columns.
+for type_of_feature in types_of_feature:
+    pearsons_list = []
+    spearman_list = []
+    for fandom in fandoms:
+        print(fandom)
+        feature_fanfic = pd.read_csv(f"{fandom}/fanfiction_stylo_data/stylo_data/{type_of_feature}.csv", index_col=0)
+        # Missing cells are replaced with 0 before correlating.
+        feature_fanfic = feature_fanfic.fillna(0)
+        # The index is the same for every column, so look it up once.
+        kudos = feature_fanfic.index
+        pearsons_dict = {}
+        spearman_dict = {}
+        for column in feature_fanfic.columns:
+            delta = feature_fanfic[column]
+
+            # Both scipy calls return (coefficient, p-value); only the
+            # coefficient is kept.
+            corr, _ = stats.pearsonr(kudos, delta)
+            pearsons_dict[column] = corr
+
+            corr, _ = stats.spearmanr(kudos, delta)
+            spearman_dict[column] = corr
+        pearsons_list.append(pearsons_dict)
+        spearman_list.append(spearman_dict)
+
+    # One row per fandom (in `fandoms` order), one column per feature column.
+    df_pearsons = pd.DataFrame(pearsons_list, index=fandoms)
+    df_spearman = pd.DataFrame(spearman_list, index=fandoms)
+
+    # exist_ok avoids the separate existence check and its race window.
+    data_path = f"correlation/stylo_features/{type_of_feature}"
+    os.makedirs(data_path, exist_ok=True)
+
+    df_pearsons.to_csv(f"{data_path}/pearsons.csv")
+    df_spearman.to_csv(f"{data_path}/spearman.csv")
\ No newline at end of file
--- a/correlation.txt
+++ b/correlation.txt
--- a/correlation/deltas/call_me_by_your_name/pearsons.csv
+++ b/correlation/deltas/call_me_by_your_name/pearsons.csv
--- a/correlation/deltas/call_me_by_your_name/spearman.csv
+++ b/correlation/deltas/call_me_by_your_name/spearman.csv
--- a/correlation/stylo_features/data_overview/pearsons.csv
+++ b/correlation/stylo_features/data_overview/pearsons.csv
--- a/correlation/stylo_features/data_overview/spearman.csv
+++ b/correlation/stylo_features/data_overview/spearman.csv
--- a/correlation/stylo_features/md_freq_dist/pearsons.csv
+++ b/correlation/stylo_features/md_freq_dist/pearsons.csv
--- a/correlation/stylo_features/md_freq_dist/spearman.csv
+++ b/correlation/stylo_features/md_freq_dist/spearman.csv
--- a/correlation/stylo_features/punct_tag_freq_dist/pearsons.csv
+++ b/correlation/stylo_features/punct_tag_freq_dist/pearsons.csv
--- a/correlation/stylo_features/punct_tag_freq_dist/spearman.csv
+++ b/correlation/stylo_features/punct_tag_freq_dist/spearman.csv
--- a/correlation/stylo_features/sent_len_dist/pearsons.csv
+++ b/correlation/stylo_features/sent_len_dist/pearsons.csv
--- a/correlation/stylo_features/sent_len_dist/spearman.csv
+++ b/correlation/stylo_features/sent_len_dist/spearman.csv
--- a/correlation/stylo_features/tag_freq_dist/pearsons.csv
+++ b/correlation/stylo_features/tag_freq_dist/pearsons.csv
--- a/correlation/stylo_features/tag_freq_dist/spearman.csv
+++ b/correlation/stylo_features/tag_freq_dist/spearman.csv
--- a/correlation/stylo_features/tk_len_dist/pearsons.csv
+++ b/correlation/stylo_features/tk_len_dist/pearsons.csv
--- a/correlation/stylo_features/tk_len_dist/spearman.csv
+++ b/correlation/stylo_features/tk_len_dist/spearman.csv
--- a/data_overview/single_fic_deltas/call_me_by_your_name/data_overview.csv
+++ b/data_overview/single_fic_deltas/call_me_by_your_name/data_overview.csv
--- a/delta_measure_rewrite.txt
+++ b/delta_measure_rewrite.txt
--- a/delta_rewrite.py
+++ b/delta_rewrite.py
@@ -135,7 +135,7 @@ def run_functions_single_fanfic(path_to_general_df:str, path_to_singular_df:str,
    
 if __name__ == "__main__":
    
-    types_of_feature = ["sent_len_dist", "tk_len_dist","punct_tag_freq_dist", "md_freq_dist", "tag_freq_dist", ] #"data_overview",
+    types_of_feature = ["data_overview","sent_len_dist", "tk_len_dist","punct_tag_freq_dist", "md_freq_dist", "tag_freq_dist", ] #"data_overview",
    fandoms = ["call_me_by_your_name"] #"cosmere"
    for type_of_feature in types_of_feature:
        print(type_of_feature)

--- a/plt_sing_fanfics.py
+++ b/plt_sing_fanfics.py
-import seaborn as sns
-import pandas as pd
-import matplotlib.pyplot as plt
-from scipy import stats
-
-#sing_fanfic = pd.read_csv("cosmere/fanfiction_stylo_data/deltas/md_freq_dist.csv", index_col=0)
-#sns.scatterplot(data=sing_fanfic, x="grishaverse_good", y=sing_fanfic.index)
-#plt.savefig("cosmere/fanfiction_stylo_data/deltas/grishaverse_good.png")
-
-types_of_feature = ["data_overview_deltas", "md_freq_dist", "tag_freq_dist","punct_tag_freq_dist", "sent_len_dist", "tk_len_dist",]
-for type_of_feature in types_of_feature:
-    sing_fanfic = pd.read_csv(f"cosmere/fanfiction_stylo_data/deltas/{type_of_feature}.csv", index_col=0)
-    for column in sing_fanfic.columns:
-        kudos = sing_fanfic.index
-        delta = sing_fanfic[column]
-
-        corr, _ = stats.pearsonr(kudos, delta)
-        print(f"\n{type_of_feature}")
-        print('Pearsons correlation: %.3f' % corr)
-
-        corr, _ = stats.spearmanr(kudos, delta)
-        print(f"\n{type_of_feature}")
-        print('Spearman correlation: %.3f' % corr)