From acf7f897b039a97b22376b77f3a9ca51c600836e Mon Sep 17 00:00:00 2001
From: Erik Perov <perov@cl.uni-heidelberg.de>
Date: Thu, 27 Mar 2025 16:11:06 +0100
Subject: [PATCH] add a script to assess the texts used in the survey
 automatically

---
 src/evaluate_automatic.py | 275 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 275 insertions(+)
 create mode 100644 src/evaluate_automatic.py

diff --git a/src/evaluate_automatic.py b/src/evaluate_automatic.py
new file mode 100644
index 0000000..0c4d03d
--- /dev/null
+++ b/src/evaluate_automatic.py
@@ -0,0 +1,275 @@
+import automatic_metrics as am
+from pathlib import Path
+import re
+import copy
+
+def extract_marked_text(file_path):
+    """Extracts the passages marked for the survey from a numbered text file.
+
+    Passage headers look like "<number>: ...". A header ending in "X" marks the
+    passage for extraction; passages without an X are ignored.
+    """
+    with open(file_path, 'r', encoding='utf-8') as file:
+        lines = file.readlines()
+
+    final_poems = {}
+    current_poem = []
+    collecting = False
+    idx = 0
+    for line in lines:
+        stripped = line.strip()
+        marked_header = re.match(r"(\d+):.*\bX\s*$", stripped)
+        unmarked_header = re.match(r"\d+:", stripped)
+
+        if marked_header or unmarked_header:
+            # A new passage starts: store the one collected so far.
+            if collecting and current_poem:
+                final_poems[idx] = "\n".join(current_poem).strip()
+                idx += 1
+                current_poem = []
+            # Keep collecting only if the new passage is marked with an X.
+            collecting = bool(marked_header)
+        elif collecting:
+            current_poem.append(stripped)
+
+    # Store the last passage if the file ends while still collecting.
+    if collecting and current_poem:
+        final_poems[idx] = "\n".join(current_poem).strip()
+    return final_poems
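+
+# Assumed input format for extract_marked_text (illustrative content, not a real survey file):
+#   1: first passage, marked for extraction X
+#   line one of the passage
+#   line two of the passage
+#   2: second passage without the X marker
+#   these lines are ignored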
+
+def get_all_data_from_folder(foldername, datatype="txt"):
+    """Extracts the marked text from all files of the given type in the given folder."""
+    script_dir = Path(__file__).resolve().parent
+
+    data_dir = script_dir.parent / foldername
+
+    files = list(data_dir.rglob(f"*.{datatype}"))
+
+    all_extracted_text = {}
+    for file in files:
+        relative_file_location = file.relative_to(data_dir)
+        text = extract_marked_text(file)
+        all_extracted_text[str(relative_file_location)] = text
+    return all_extracted_text
+
+def calculate_scores_texts(all_texts):
+    """Calculates the FRE, TTR, PMI and TF-IDF scores for every extracted text."""
+    texts = copy.deepcopy(all_texts)
+    evaluator = am.Compute_Metrics()
+    evaluated_texts = {}
+    for filename in texts:
+        for idx in texts[filename]:
+            text = texts[filename][idx]
+            calc_metrics = [
+                evaluator.compute_fre(text),
+                evaluator.compute_ttr(text),
+                evaluator.compute_pmi(text),
+                evaluator.compute_tfidf(text),
+            ]
+            evaluated_texts[f"{filename}\\{idx}"] = calc_metrics
+    return evaluated_texts  # {filename\idx: [fre, ttr, pmi, tfidf]}
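+
+# Example of the structure returned by calculate_scores_texts (values are made up):
+#   {"ai\\gpt2_poem.txt\\0": [74.3, 0.62, 1.81, 0.31], ...}
+# Note: the filename part of each key uses the OS path separator, while the index is joined
+# with a literal backslash, so the hard-coded keys in __main__ assume Windows-style paths.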
+
+class Calculate_Parameters(object):
+    """Automated procedure to rate the survey parameters from the computed metrics."""
+    def __init__(self, metrics_ai, metrics_human, question_num):
+        # The metric lists hold FRE, TTR, PMI and TF-IDF at index 0, 1, 2 and 3.
+        self.ai_fre, self.ai_ttr, self.ai_pmi, self.ai_tfidf = metrics_ai
+        self.human_fre, self.human_ttr, self.human_pmi, self.human_tfidf = metrics_human
+        self.question_num = question_num
+    
+    def calculate_coherence(self):
+        score = 0
+        
+        if abs(self.ai_fre - self.human_fre) >= 20:
+            if self.ai_fre > self.human_fre:
+                score += 1
+            else:
+                score -= 1
+
+        if abs(self.ai_pmi - self.human_pmi) >= 0.8:
+            if self.ai_pmi > self.human_pmi:
+                score += 2
+            else:
+                score -= 2
+        
+        if abs(self.ai_tfidf - self.human_tfidf) >= 0.2:
+            if self.ai_tfidf > self.human_tfidf:
+                score += 1
+            elif self.ai_tfidf < self.human_tfidf:
+                score -= 1
+        
+        if score > 0:
+            return "ai"
+        if score < 0:
+            return "human"
+        if score == 0:
+            return "equal"
+
+    def calculate_conciseness(self):
+        score = 0
+
+        if abs(self.ai_pmi - self.human_pmi) >= 1:
+            if self.ai_pmi > self.human_pmi:
+                score += 1
+            else:
+                score -= 1
+        
+        if abs(self.ai_ttr - self.human_ttr) >= 0.1:
+            if self.ai_ttr < self.human_ttr:
+                score += 2
+            elif self.ai_ttr > self.human_ttr:
+                score -= 2 
+        
+        if score > 0:
+            return "ai"
+        if score < 0:
+            return "human"
+        if score == 0:
+            return "equal"
+    
+    def calculate_creativity(self):
+        score = 0
+
+        if abs(self.ai_pmi - self.human_pmi) >= 1:
+            if self.ai_pmi < self.human_pmi:
+                score += 1
+            else:
+                score -= 1
+        
+        if abs(self.ai_ttr - self.human_ttr) >= 0.1:
+            if self.ai_ttr > self.human_ttr:
+                score += 1
+            elif self.ai_ttr < self.human_ttr:
+                score -= 1
+        
+        if abs(self.ai_fre - self.human_fre) >= 20:
+            if self.ai_fre < self.human_fre:
+                score += 1
+            else:
+                score -= 1
+        
+        if score > 0:
+            return "ai"
+        if score < 0:
+            return "human"
+        if score == 0:
+            return "equal"
+    
+    def calculate_clarity_of_concept(self):
+        score = 0
+
+        if abs(self.ai_pmi - self.human_pmi) >= 1:
+            if self.ai_pmi < self.human_pmi:
+                score += 1
+            else:
+                score -= 1
+        
+        if abs(self.ai_ttr - self.human_ttr) >= 0.1:
+            if self.ai_ttr < self.human_ttr:
+                score += 1
+            else:
+                score -= 1
+        
+        if abs(self.ai_fre - self.human_fre) >= 20:
+            if self.ai_fre < self.human_fre:
+                score += 1
+            else:
+                score -= 1
+        
+        if abs(self.ai_tfidf - self.human_tfidf) >= 0.2:
+            if self.ai_tfidf > self.human_tfidf:
+                score += 1
+            elif self.ai_tfidf < self.human_tfidf:
+                score -= 1
+        
+        if score > 0:
+            return "ai"
+        if score < 0:
+            return "human"
+        if score == 0:
+            return "equal"
+
+def predict_human_ai(survey_assessment):
+    """
+    Counts how often the human text and how often the AI text scored better on the rated
+    parameters and predicts a tag from that.
+    The output tag is the prediction; "equal" means no decision could be made.
+    """
+    predicted_tags = {}
+
+    for question_num, rated_param in survey_assessment.items():
+        ai = 0
+        human = 0
+
+        keys_to_check = set(rated_param.keys())
+
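+        # Depending on the question block (poems, wiki articles, sport articles),
+        # only a subset of the parameters is considered.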
+        if question_num <= 6:
+            keys_to_check.discard("clarity_of_concept")
+        elif 6 < question_num <= 12:
+            keys_to_check.discard("creativity")
+        elif 12 < question_num <= 18:
+            keys_to_check.discard("clarity_of_concept")
+            keys_to_check.discard("creativity")
+
+        for key in keys_to_check:
+            if rated_param[key] == "ai":
+                ai += 1
+            elif rated_param[key] == "human":
+                human += 1
+
+        # Determine result
+        if human > ai:
+            predicted_tags[question_num] = "human"
+        elif human < ai:
+            predicted_tags[question_num] = "ai"
+        else:
+            predicted_tags[question_num] = "equal"
+
+    return predicted_tags
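+
+# Sketch of the expected input and output of predict_human_ai (tags are illustrative):
+#   predict_human_ai({1: {"coherence": "ai", "conciseness": "human", "creativity": "ai"}})
+#   # -> {1: "ai"}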
+    
+
+if __name__ == '__main__':
+    survey_texts = get_all_data_from_folder("data", "txt")
+    evaluated_texts = calculate_scores_texts(survey_texts)
+
+    # The texts are listed manually in the order in which they appear in the survey
+    survey_ai_texts = ['ai\\gpt2_poem.txt\\0', 'ai\\gpt2_poem.txt\\1', 'ai\\opt_poem.txt\\0', 'ai\\opt_poem.txt\\1', 'ai\\gpt4o_poem.txt\\0',  'ai\\gpt4o_poem.txt\\1',
+                       'ai\\gpt4o_wiki.txt\\0', 'ai\\gpt4o_wiki.txt\\1', 'ai\\opt_wiki.txt\\0', 'ai\\opt_wiki.txt\\1', 'ai\\gpt2_wiki.txt\\0', 'ai\\gpt2_wiki.txt\\1',
+                       'ai\\opt_sport.txt\\0', 'ai\\opt_sport.txt\\1', 'ai\\gpt4o_sports.txt\\0', 'ai\\gpt4o_sports.txt\\1', 'ai\\gpt2_sport.txt\\0', 'ai\\gpt2_sport.txt\\1'
+                       ]
+    survey_human_texts = ["human\\poetry.txt\\0", 'human\\poetry.txt\\1', 'human\\poetry.txt\\2', 'human\\poetry.txt\\3', 'human\\poetry.txt\\4', 'human\\poetry.txt\\5',
+                          'human\\wiki.txt\\0', 'human\\wiki.txt\\1', 'human\\wiki.txt\\2', 'human\\wiki.txt\\3', 'human\\wiki.txt\\4', 'human\\wiki.txt\\5',
+                          'human\\sport_bbc.txt\\0', 'human\\sport_bbc.txt\\1', 'human\\sport_bbc.txt\\2', 'human\\sport_bbc.txt\\3', 'human\\sport_bbc.txt\\4', 'human\\sport_bbc.txt\\5'
+                          ]
+    survey_groups = zip(survey_ai_texts, survey_human_texts)
+
+    # Rate the parameters coherence, conciseness, creativity and clarity of concept for every survey group
+    survey_assessment = {}
+    for i, (ai_name, human_name) in enumerate(survey_groups, start=1):
+        metrics_ai = evaluated_texts[ai_name]
+        metrics_human = evaluated_texts[human_name]
+
+        evaluation_metrics = Calculate_Parameters(metrics_ai, metrics_human, i)
+
+        survey_assessment[i] = {
+            "coherence": evaluation_metrics.calculate_coherence(),
+            "conciseness": evaluation_metrics.calculate_conciseness(),
+            "creativity": evaluation_metrics.calculate_creativity(),
+            "clarity_of_concept": evaluation_metrics.calculate_clarity_of_concept(),
+        }
+
+    # Automatically assess whether the texts are human or AI generated
+    result = predict_human_ai(survey_assessment)
+    print(result)
-- 
GitLab