From f6c5a8823f39b52ac2d1891f96be9ce10e4630ac Mon Sep 17 00:00:00 2001
From: Erik Perov <perov@cl.uni-heidelberg.de>
Date: Thu, 27 Mar 2025 16:09:30 +0100
Subject: [PATCH] return the average PMI and TF-IDF scores instead of one
 value per word or word pair

---
 src/automatic_metrics.py | 15 +++++++++------
 src/text_extraction.py   |  6 +++---
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/automatic_metrics.py b/src/automatic_metrics.py
index 42d240f..096eacd 100644
--- a/src/automatic_metrics.py
+++ b/src/automatic_metrics.py
@@ -16,6 +16,8 @@ def count_syllables(word):
         syllable_count -= 1
     return max(syllable_count, 1)
 
+
+
 class Compute_Metrics(object):
     
     def __init__(self):
@@ -74,8 +76,9 @@ class Compute_Metrics(object):
                 pmi = math.log2(p_w1_w2 / (p_w1 * p_w2))
                 pmi_scores[word1, word2] = pmi
 
-        sorted_pmi_scores = dict(sorted(pmi_scores.items(), key=lambda item: item[1], reverse=True))
-        return sorted_pmi_scores
+        # sorted_pmi_scores = dict(sorted(pmi_scores.items(), key=lambda item: item[1], reverse=True))
+        avg_pmi = sum(pmi_scores.values()) / len(pmi_scores)
+        return avg_pmi
 
     def compute_tfidf(self, new_text) -> dict[str, float]:
         corpus = copy.deepcopy(self.corpus_strings)
@@ -91,12 +94,12 @@ class Compute_Metrics(object):
         feature_indices = [i for i, word in enumerate(feature_names) if word in new_text_tokens]
         tfidf_dict = {feature_names[i]: tfidf_scores[i] for i in feature_indices}
         
-        sorted_tfidf = dict(sorted(tfidf_dict.items(), key=lambda item: item[1], reverse=True))
-        
-        return sorted_tfidf
+        # sorted_tfidf = dict(sorted(tfidf_dict.items(), key=lambda item: item[1], reverse=True))
+        avg_tfidf = sum(tfidf_dict.values()) / len(tfidf_dict)
+        return avg_tfidf
 
 def main():
-    test_text = """I have been dreaming of this day for years, when the sky is blue. But now I'm tired enough to see it coming up out there in my dreams as if they were some kind thing that was born from nothing but sunlight…
+    test_text = """I have been dreaming of this day for years, when the sky is blue. But now I\'m tired enough to see it coming up out there in my dreams as if they were some kind thing that was born from nothing but sunlight… It\'s so bright here at night because you\'re awake right next door; just like before! The sun goes down on your back while every single other part has fallen into place by then – all over me with its own little circle around us."
     """
     Corpus = Compute_Metrics()
     pmi = Corpus.compute_pmi(test_text)
diff --git a/src/text_extraction.py b/src/text_extraction.py
index 1899008..1907371 100644
--- a/src/text_extraction.py
+++ b/src/text_extraction.py
@@ -114,7 +114,7 @@ if __name__ == "__main__":
     
     # output_directory = r"C:\Users"
     
-    # write_file(output_directory, "poetry_newlines.txt", poetry_text[0])
-    # write_file(output_directory, "wiki.txt", wiki_text)
-    # write_file(output_directory, "sport_bbc.txt", sports_text)
+    # write_file(output_directory, "poetry_newlines", poetry_text[0])
+    # write_file(output_directory, "wiki", wiki_text)
+    # write_file(output_directory, "sport_bbc", sports_text)
     
-- 
GitLab