From acf7f897b039a97b22376b77f3a9ca51c600836e Mon Sep 17 00:00:00 2001
From: Erik Perov <perov@cl.uni-heidelberg.de>
Date: Thu, 27 Mar 2025 16:11:06 +0100
Subject: [PATCH] add a script to assess the texts used in the survey automatically

---
 src/evaluate_automatic.py | 275 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 275 insertions(+)
 create mode 100644 src/evaluate_automatic.py

diff --git a/src/evaluate_automatic.py b/src/evaluate_automatic.py
new file mode 100644
index 0000000..0c4d03d
--- /dev/null
+++ b/src/evaluate_automatic.py
@@ -0,0 +1,275 @@
+import automatic_metrics as am
+from pathlib import Path
+import re
+import copy
+
+def extract_marked_text(file_path):
+    """Uses "X" as a marker to find which lines/texts to extract. Passages without an X are ignored."""
+    with open(file_path, 'r', encoding='utf-8') as file:
+        lines = file.readlines()
+
+    final_poems = {}
+    poems = []
+    current_poem = []
+    collecting = False
+    idx = 0
+    for line in lines:
+        match = re.match(r"(\d+):.*\bX\s*$", line.strip())
+        # Flush the line collected in the previous iteration before handling the current one.
+        if collecting and current_poem:
+            if any(part.strip() for part in current_poem):
+                poems.append("\n".join(current_poem).strip())
+            current_poem = []
+
+        if match:
+            # A numbered header ending in "X" starts a new marked passage.
+            if collecting:
+                final_poems[idx] = "\n".join(poems)
+                idx += 1
+            poems = []
+            collecting = True
+
+        elif re.match(r"\d+:", line.strip()):
+            # A numbered header without "X" ends the current passage.
+            if collecting:
+                final_poems[idx] = "\n".join(poems)
+                idx += 1
+            poems = []
+            collecting = False
+
+        elif collecting:
+            current_poem.append(line.strip())
+
+    if collecting and current_poem:
+        poems.append("\n".join(current_poem).strip())
+
+    if collecting and poems:
+        final_poems[idx] = "\n".join(poems)
+    return final_poems
+
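+# Illustration of the marker format that extract_marked_text expects. The
+# survey data files themselves are not part of this patch, so the content
+# below is only a hypothetical example: a numbered header ending in "X"
+# starts a passage to extract, and the next numbered header ends it.
+#
+#   1: first poem X
+#   Some line of the poem
+#   Another line of the poem
+#   2: second poem
+#   These lines are ignored because their header has no "X".
+#
+# For this input the function returns
+# {0: "Some line of the poem\nAnother line of the poem"}.
+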
+def get_all_data_from_folder(foldername, datatype="txt"):
+    """Extracts all files of the given type from the given folder for further processing."""
+    script_dir = Path(__file__).resolve().parent
+
+    data_dir = script_dir.parent / foldername
+
+    files = list(data_dir.rglob(f"*.{datatype}"))
+
+    all_extracted_text = {}
+    for file in files:
+        # Use a POSIX-style relative path as the key so the lookups below work on every platform.
+        relative_file_location = file.relative_to(data_dir).as_posix()
+        text = extract_marked_text(file)
+        all_extracted_text[relative_file_location] = text
+    return all_extracted_text
+
+def calculate_scores_texts(text):
+    """Calculates the metric scores for the given texts."""
+    texts = copy.deepcopy(text)
+    evaluator = am.Compute_Metrics()
+    evaluated_texts = {}
+    for filename in texts:
+        for idx in texts[filename]:
+            passage = texts[filename][idx]
+            calc_metrics = []
+            calc_metrics.append(evaluator.compute_fre(passage))
+            calc_metrics.append(evaluator.compute_ttr(passage))
+            calc_metrics.append(evaluator.compute_pmi(passage))
+            calc_metrics.append(evaluator.compute_tfidf(passage))
+            evaluated_texts[f"{filename}/{idx}"] = calc_metrics
+    return evaluated_texts  # {filename/idx: [fre, ttr, pmi, tfidf]}
+
+class Calculate_Parameters(object):
+    """Automated procedure to calculate the survey parameters from the metric scores."""
+    def __init__(self, metrics_ai, metrics_human, question_num):
+        # FRE, TTR, PMI, and TF-IDF are at index 0, 1, 2, and 3
+        self.ai_fre, self.ai_ttr, self.ai_pmi, self.ai_tfidf = metrics_ai[0], metrics_ai[1], metrics_ai[2], metrics_ai[3]
+        self.human_fre, self.human_ttr, self.human_pmi, self.human_tfidf = metrics_human[0], metrics_human[1], metrics_human[2], metrics_human[3]
+        self.question_num = question_num
+
+    def calculate_coherence(self):
+        score = 0
+
+        if abs(self.ai_fre - self.human_fre) >= 20:
+            if self.ai_fre > self.human_fre:
+                score += 1
+            else:
+                score -= 1
+
+        if abs(self.ai_pmi - self.human_pmi) >= 0.8:
+            if self.ai_pmi > self.human_pmi:
+                score += 2
+            else:
+                score -= 2
+
+        if abs(self.ai_tfidf - self.human_tfidf) >= 0.2:
+            if self.ai_tfidf > self.human_tfidf:
+                score += 1
+            elif self.ai_tfidf < self.human_tfidf:
+                score -= 1
+
+        if score > 0:
+            return "ai"
+        if score < 0:
+            return "human"
+        if score == 0:
+            return "equal"
+
+    def calculate_conciseness(self):
+        score = 0
+
+        if abs(self.ai_pmi - self.human_pmi) >= 1:
+            if self.ai_pmi > self.human_pmi:
+                score += 1
+            else:
+                score -= 1
+
+        if abs(self.ai_ttr - self.human_ttr) >= 0.1:
+            if self.ai_ttr < self.human_ttr:
+                score += 2
+            elif self.ai_ttr > self.human_ttr:
+                score -= 2
+
+        if score > 0:
+            return "ai"
+        if score < 0:
+            return "human"
+        if score == 0:
+            return "equal"
+
+    def calculate_creativity(self):
+        score = 0
+
+        if abs(self.ai_pmi - self.human_pmi) >= 1:
+            if self.ai_pmi < self.human_pmi:
+                score += 1
+            else:
+                score -= 1
+
+        if abs(self.ai_ttr - self.human_ttr) >= 0.1:
+            if self.ai_ttr > self.human_ttr:
+                score += 1
+            elif self.ai_ttr < self.human_ttr:
+                score -= 1
+
+        if abs(self.ai_fre - self.human_fre) >= 20:
+            if self.ai_fre < self.human_fre:
+                score += 1
+            else:
+                score -= 1
+
+        if score > 0:
+            return "ai"
+        if score < 0:
+            return "human"
+        if score == 0:
+            return "equal"
+
+    def calculate_clarity_of_concept(self):
+        score = 0
+
+        if abs(self.ai_pmi - self.human_pmi) >= 1:
+            if self.ai_pmi < self.human_pmi:
+                score += 1
+            else:
+                score -= 1
+
+        if abs(self.ai_ttr - self.human_ttr) >= 0.1:
+            if self.ai_ttr < self.human_ttr:
+                score += 1
+            else:
+                score -= 1
+
+        if abs(self.ai_fre - self.human_fre) >= 20:
+            if self.ai_fre < self.human_fre:
+                score += 1
+            else:
+                score -= 1
+
+        if abs(self.ai_tfidf - self.human_tfidf) >= 0.2:
+            if self.ai_tfidf > self.human_tfidf:
+                score += 1
+            elif self.ai_tfidf < self.human_tfidf:
+                score -= 1
+
+        if score > 0:
+            return "ai"
+        if score < 0:
+            return "human"
+        if score == 0:
+            return "equal"
+
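+# The four parameter heuristics above only count a metric when the gap between
+# the AI and the human text is large enough: FRE differences below 20, PMI
+# differences below 0.8 (coherence) or 1.0 (the other parameters), TTR
+# differences below 0.1 and TF-IDF differences below 0.2 are treated as ties;
+# not every parameter looks at every metric.
+# Worked example with hypothetical metric values ordered [fre, ttr, pmi, tfidf]:
+# Calculate_Parameters([70.0, 0.55, 2.0, 0.5], [45.0, 0.50, 1.0, 0.2], 1).calculate_coherence()
+# returns "ai", because the FRE gap (25), the PMI gap (1.0) and the TF-IDF gap
+# (0.3) all exceed their thresholds and all favour the AI text.
+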
+def predict_human_ai(survey_assessment):
+    """
+    Counts how often the human text scored better on the rated parameters and how often
+    the AI text scored better.
+    The output tag is the predicted tag; "equal" means it could not decide.
+    """
+    predicted_tags = {}
+
+    for question_num, rated_param in survey_assessment.items():
+        ai = 0
+        human = 0
+
+        keys_to_check = set(rated_param.keys())
+
+        # Each survey block only uses a subset of the parameters:
+        # questions 1-6 (poems) skip clarity of concept, 7-12 (wiki) skip
+        # creativity, and 13-18 (sport) skip both.
+        if question_num <= 6:
+            keys_to_check.discard("clarity_of_concept")
+        elif 6 < question_num <= 12:
+            keys_to_check.discard("creativity")
+        elif 12 < question_num <= 18:
+            keys_to_check.discard("clarity_of_concept")
+            keys_to_check.discard("creativity")
+
+        for key in keys_to_check:
+            if rated_param[key] == "ai":
+                ai += 1
+            elif rated_param[key] == "human":
+                human += 1
+
+        # Determine result
+        if human > ai:
+            predicted_tags[question_num] = "human"
+        elif human < ai:
+            predicted_tags[question_num] = "ai"
+        else:
+            predicted_tags[question_num] = "equal"
+
+    return predicted_tags
+
+
+if __name__ == '__main__':
+    survey_texts = get_all_data_from_folder("data", "txt")
+    evaluated_texts = calculate_scores_texts(survey_texts)
+
+    # I manually ordered the texts in the order used in the survey
+    survey_ai_texts = ['ai/gpt2_poem.txt/0', 'ai/gpt2_poem.txt/1', 'ai/opt_poem.txt/0', 'ai/opt_poem.txt/1', 'ai/gpt4o_poem.txt/0', 'ai/gpt4o_poem.txt/1',
+                       'ai/gpt4o_wiki.txt/0', 'ai/gpt4o_wiki.txt/1', 'ai/opt_wiki.txt/0', 'ai/opt_wiki.txt/1', 'ai/gpt2_wiki.txt/0', 'ai/gpt2_wiki.txt/1',
+                       'ai/opt_sport.txt/0', 'ai/opt_sport.txt/1', 'ai/gpt4o_sports.txt/0', 'ai/gpt4o_sports.txt/1', 'ai/gpt2_sport.txt/0', 'ai/gpt2_sport.txt/1'
+                       ]
+    survey_human_texts = ['human/poetry.txt/0', 'human/poetry.txt/1', 'human/poetry.txt/2', 'human/poetry.txt/3', 'human/poetry.txt/4', 'human/poetry.txt/5',
+                          'human/wiki.txt/0', 'human/wiki.txt/1', 'human/wiki.txt/2', 'human/wiki.txt/3', 'human/wiki.txt/4', 'human/wiki.txt/5',
+                          'human/sport_bbc.txt/0', 'human/sport_bbc.txt/1', 'human/sport_bbc.txt/2', 'human/sport_bbc.txt/3', 'human/sport_bbc.txt/4', 'human/sport_bbc.txt/5'
+                          ]
+    survey_groups = zip(survey_ai_texts, survey_human_texts)
+
+    # Rate the parameters coherence, creativity, conciseness and clarity of concept for each survey group
+    survey_assessment = {}
+    for i, (ai_name, human_name) in enumerate(survey_groups, start=1):
+        metrics_ai = evaluated_texts[ai_name]
+        metrics_human = evaluated_texts[human_name]
+
+        evaluation_metrics = Calculate_Parameters(metrics_ai, metrics_human, i)
+
+        coherence_score = evaluation_metrics.calculate_coherence()
+        conciseness_score = evaluation_metrics.calculate_conciseness()
+        creativity_score = evaluation_metrics.calculate_creativity()
+        clarity_score = evaluation_metrics.calculate_clarity_of_concept()
+
+        survey_assessment[i] = {
+            "coherence": coherence_score,
+            "conciseness": conciseness_score,
+            "creativity": creativity_score,
+            "clarity_of_concept": clarity_score
+        }
+
+    # Automatically assess whether each text is human- or AI-generated
+    result = predict_human_ai(survey_assessment)
--
GitLab
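Note on usage: as committed, the script only builds the `result` dictionary and does not print it. A minimal follow-up sketch (not part of the patch; it assumes the lines are appended to the `__main__` block) for reporting the prediction per survey question:

    for question, tag in sorted(result.items()):
        print(f"question {question}: predicted {tag}")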