Commit 91c0e1f1 authored by perov

add functionality to display results as graphs and improve calculations. Fixed bugs.

parent 15ca88e2
Showing 355 additions and 62 deletions
# NLP_Evaluation
## Name
Evaluation of generative AI using human ... and automatic metrics
Evaluation of generative AI using human judgment and automatic metrics
## Description
Three models were used to generate texts: GPT2, OPT, and GPT4o. Texts were generated in three categories: poems, science-related topics, and sport summaries. Similar prompts were used across all models.
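For context, a minimal sketch of how the GPT-2 and OPT texts could be produced with the Hugging Face `transformers` library; the prompts, decoding settings, and checkpoints below are illustrative assumptions rather than the exact generation script, and GPT-4o would be queried through the OpenAI API instead.

```python
# Hypothetical generation sketch; prompts and settings are placeholders.
from transformers import pipeline

prompts = {
    "poem": "Write a short poem about autumn:",
    "science": "Explain how photosynthesis works:",
    "sport": "Summarise yesterday's football match:",
}

for model_name in ["gpt2", "facebook/opt-350m"]:
    generator = pipeline("text-generation", model=model_name)
    for category, prompt in prompts.items():
        out = generator(prompt, max_new_tokens=150, do_sample=True, top_p=0.9)
        print(model_name, category, out[0]["generated_text"][:80], "...")
```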
......
results/ai_usage.png

20.1 KiB

results/automated_prediciton.png

14.7 KiB

results/automated_prediction.png

14.8 KiB

results/correct_guess.png

27.7 KiB

results/expert_group.png

12.7 KiB

results/model_results.png

22.1 KiB

results/param_matching.png

33.7 KiB

results/param_scores.png

33.7 KiB

results/survey_groups_correct.png

24.5 KiB

results/time_spent.png

19.9 KiB

# Source Explanation:
## Description
This file explains the functionality of each script.
File added
File added
File added
from pathlib import Path
import csv
from datetime import datetime, timedelta
from collections import defaultdict
def get_all_data_from_folder(foldername, datatype="txt"):
"""extracts all files from given folder for further processing"""
......@@ -59,12 +60,12 @@ class Proccess_Data(object):
for num, (start, end) in enumerate(zip(time_start, time_end), start=1):
start_time = datetime.strptime(start, "%H:%M:%S")
end_time = datetime.strptime(end, "%H:%M:%S")
if end_time < start_time:
continue
time = end_time - start_time
if time.total_seconds() / 60 < 180:
if 10 <= time.total_seconds() / 60 <= 180:
total_time += (end_time - start_time)
user_time[num] = time.total_seconds() // 60
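A small standalone illustration of the duration filter above (skip pairs where the end time precedes the start time, then keep only durations inside the 10-180 minute window); the sample timestamps are made up.

```python
from datetime import datetime

samples = [("14:00:00", "14:45:00"),   # 45 min -> kept
           ("14:00:00", "13:50:00"),   # end before start -> skipped
           ("14:00:00", "14:05:00"),   # 5 min -> below the 10-minute floor
           ("09:00:00", "13:30:00")]   # 270 min -> above the 180-minute cap

kept = []
for start, end in samples:
    start_time = datetime.strptime(start, "%H:%M:%S")
    end_time = datetime.strptime(end, "%H:%M:%S")
    if end_time < start_time:
        continue
    minutes = (end_time - start_time).total_seconds() / 60
    if 10 <= minutes <= 180:
        kept.append(minutes)
print(kept)  # [45.0]
```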
......@@ -155,30 +156,123 @@ class Proccess_Data(object):
gpt4o_count += 1
return {"gpt2": round(gpt2 / gpt2_count, 2), "opt": round(opt / opt_count, 2), "gpt4o": round(gpt4o / gpt4o_count, 2)}
def average_parameter(self, parameter, model_name):
def average_parameter_model(self, parameter, model_name):
"""Looks at the parameters like coherence, conciseness, creativity and clarity of concept and calculates the average."""
count, rate = 0, 0
if model_name != "human":
for question_num, correct_label in enumerate(self.correct_labels):
for keys in self.survey_data.keys():
if parameter in keys and correct_label in keys:
model = self.models[question_num]
if model == model_name:
for rating in self.survey_data[keys]:
if rating:
rate += int(rating)
count += 1
if model_name == "human":
for question_num, correct_label in enumerate(self.correct_labels):
for keys in self.survey_data.keys():
if parameter in keys and correct_label not in keys:
model = self.models[question_num]
for rating in self.survey_data[keys]:
question_index = 0
last_group = None
for key, ratings in self.survey_data.items():
if question_index >= len(self.correct_labels):
break
group_id = key.split('/')[-1].strip()
# If this key belongs to the same group as the last processed one, skip it.
if group_id == last_group:
continue
correct_label = self.correct_labels[question_index]
if parameter in key and correct_label in key:
model = self.models[question_index]
if model == model_name:
for rating in ratings:
if rating:
rate += int(rating)
count += 1
question_index += 1
last_group = group_id
if model_name == "human":
question_index = 0
last_group = None
for key, ratings in self.survey_data.items():
if question_index >= len(self.correct_labels):
break
group_id = key.split('/')[-1].strip()
# If this key belongs to the same group as the last processed one, skip it.
if group_id == last_group:
continue
correct_label = self.correct_labels[question_index]
if parameter in key and correct_label not in key:
model = self.models[question_index]
for rating in ratings:
if rating:
rate += int(rating)
count += 1
question_index += 1
last_group = group_id
return round(rate / count, 2)
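To make the group-deduplication step concrete, a toy walk over made-up survey column names; the key format (rating columns sharing a trailing `/ group N` suffix) is an assumption inferred from the `key.split('/')[-1]` logic, not taken from the actual survey export.

```python
# Hypothetical column names; only the trailing "/ group N" part matters here.
survey_keys = [
    "Coherence [Text 1] / group 1",
    "Coherence [Text 2] / group 1",   # same group suffix as the previous key -> skipped
    "Coherence [Text 1] / group 2",
]

last_group = None
for key in survey_keys:
    group_id = key.split('/')[-1].strip()
    if group_id == last_group:
        continue
    print("processing", key)
    last_group = group_id
```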
def parameter_survey_group_assesment(self, parameter):
"""Counts """
count_human, rate_human, count_ai, rate_ai = 0, 0, 0, 0
question_index = 0
last_group = None
data_iter = iter(self.survey_data.items())
parameter_human_response = []
for key, ratings in data_iter:
if question_index >= len(self.correct_labels):
break
group_id = key.split('/')[-1].strip()
# If this key belongs to the same group as the last processed one, skip it.
if group_id == last_group:
continue
correct_label = self.correct_labels[question_index]
if parameter in key and correct_label in key:
for rating in ratings:
if rating:
rate_ai += int(rating)
count_ai += 1
# the next element is the human text
next_key, next_ratings = next(data_iter)
for rating in next_ratings:
if rating:
rate_human += int(rating)
count_human += 1
question_index += 1
last_group = group_id
if rate_human / count_human > rate_ai / count_ai:
parameter_human_response.append("human")
else:
parameter_human_response.append("ai")
count_human, rate_human, count_ai, rate_ai = 0, 0, 0, 0
elif parameter in key and correct_label not in key:
for rating in ratings:
if rating:
rate_human += int(rating)
count_human += 1
# the next element is the ai text
next_key, next_ratings = next(data_iter)
for rating in next_ratings:
if rating:
rate_ai += int(rating)
count_ai += 1
question_index += 1
last_group = group_id
if rate_human / count_human > rate_ai / count_ai:
parameter_human_response.append("human")
elif rate_human / count_human < rate_ai / count_ai:
parameter_human_response.append("ai")
else:
parameter_human_response.append("equal")
count_human, rate_human, count_ai, rate_ai = 0, 0, 0, 0
return parameter_human_response
def get_time(self):
time_start = self.survey_data['What time is it now?']
time_end = self.survey_data['Please enter the time.']
total_time = self.calculate_total_time(time_start, time_end)
avg_time = total_time[0]
return avg_time, total_time[1]
if __name__ == "__main__":
answers = get_all_data_from_folder("results", "csv")
......@@ -186,8 +280,6 @@ if __name__ == "__main__":
only_answers = answers[1:]
survey_data = process_survey_data(headers, only_answers)
#print(survey_data.keys())
# correct answers for each survey group (LLM generated text is the correct answer)
correct_text = ["Text 2", "Text 2", "Text 1", "Text 2", "Text 1", "Text 2",
......@@ -199,23 +291,23 @@ if __name__ == "__main__":
"opt", "opt", "gpt4o", "gpt4o", "gpt2", "gpt2"]
evaluator = Proccess_Data(correct_text, survey_data, models)
#total_correct = evaluator.calculate_correct_answers()
total_correct = evaluator.calculate_correct_answers()
# expert_group = evaluator.compare_groups("expert")
# ai_usage_group = evaluator.compare_groups("ai_usage")
# time_group = evaluator.compare_groups("time")
expert_group = evaluator.compare_groups("expert")
ai_usage_group = evaluator.compare_groups("ai_usage")
time_group = evaluator.compare_groups("time")
#correct_percentage = [i[2] for i in total_correct.values()] # extracts average percentage of correct answers
#model_results = evaluator.compare_ai(correct_percentage)
correct_percentage = [i[2] for i in total_correct.values()] # extracts average percentage of correct answers
model_results = evaluator.compare_ai(correct_percentage)
parameters = ["Coherence", "Conciseness", "Creativity", "Clarity of Concept"]
models = ["gpt2", "gpt4o", "opt", "human"]
model_names = ["gpt2", "gpt4o", "opt", "human"]
avg_scores = {
param: {model: evaluator.average_parameter(param, model) for model in models}
avg_scores_model = {
param: {model: evaluator.average_parameter_model(param, model) for model in model_names}
for param in parameters
}
# build a dict keyed by parameter; each value is a list indexed by survey question number,
# where "ai" means the AI text scored higher, "human" the human text, and ties are "equal"
avg_scores_question = {param: evaluator.parameter_survey_group_assesment(param) for param in parameters}
import automatic_metrics as am
import compute_metrics as cm
from pathlib import Path
import re
import copy
......@@ -62,7 +62,7 @@ def get_all_data_from_folder(foldername, datatype="txt"):
def calculate_scores_texts(text):
"""Calculates scores for given text"""
texts = copy.deepcopy(text)
evaluator = am.Compute_Metrics()
evaluator = cm.Compute_Metrics()
evaluated_texts = {}
for filename in texts:
for idx in texts[filename]:
......@@ -76,7 +76,7 @@ def calculate_scores_texts(text):
return evaluated_texts # {filename\idx: [fre, ttr, pmi, tfidf]}
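The metric implementations themselves live in `compute_metrics.py`, which is not part of this diff. As a rough, self-contained sketch of two of the four scores in that list, type-token ratio and Flesch Reading Ease can be computed as below; the syllable count is a crude heuristic, and PMI and TF-IDF are omitted.

```python
import re

def type_token_ratio(text):
    """TTR: number of unique tokens divided by total tokens."""
    tokens = re.findall(r"[a-zA-Z']+", text.lower())
    return len(set(tokens)) / len(tokens) if tokens else 0.0

def flesch_reading_ease(text):
    """FRE = 206.835 - 1.015*(words/sentences) - 84.6*(syllables/words)."""
    sentences = max(1, len(re.findall(r"[.!?]+", text)))
    words = re.findall(r"[a-zA-Z']+", text.lower())
    if not words:
        return 0.0
    # Rough syllable heuristic: count vowel groups, at least one per word.
    syllables = sum(max(1, len(re.findall(r"[aeiouy]+", w))) for w in words)
    return 206.835 - 1.015 * (len(words) / sentences) - 84.6 * (syllables / len(words))

sample = "The cat sat on the mat. It was a warm and sunny day."
print(round(type_token_ratio(sample), 2), round(flesch_reading_ease(sample), 1))
```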
class Calculate_Parameters(object):
""""automated procedure to calculate parameters"""
""""Automated procedure to calculate parameters. Only says which text has the higher one doesnt rate from 1-5."""
def __init__(self, metrics_ai, metrics_human, question_num):
# FRE, TTR, PMI, and TF-IDF are at index 0, 1, 2, and 3
self.ai_fre, self.ai_ttr, self.ai_pmi, self.ai_tfidf = metrics_ai[0], metrics_ai[1], metrics_ai[2], metrics_ai[3]
......@@ -210,12 +210,12 @@ def predict_human_ai(survey_assessment):
keys_to_check = set(rated_param.keys())
if question_num <= 6:
keys_to_check.discard("clarity_of_concept")
keys_to_check.discard("Clarity of Concept")
elif 6 < question_num <= 12:
keys_to_check.discard("creativity")
keys_to_check.discard("Creativity")
elif 12 < question_num <= 18:
keys_to_check.discard("clarity_of_concept")
keys_to_check.discard("creativity")
keys_to_check.discard("Clarity of Concept")
keys_to_check.discard("Creativity")
for key in keys_to_check:
if rated_param[key] == "ai":
......@@ -232,24 +232,9 @@ def predict_human_ai(survey_assessment):
predicted_tags[question_num] = "equal"
return predicted_tags
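The tallying in the middle of `predict_human_ai` is elided by the diff; judging from the surrounding lines it amounts to a majority vote over the remaining parameter verdicts, roughly as in this sketch (the helper name and inputs are illustrative, not from the repository).

```python
def majority_vote(verdicts):
    """Return "ai", "human", or "equal" depending on which side wins more parameters."""
    ai_votes = sum(1 for v in verdicts if v == "ai")
    human_votes = sum(1 for v in verdicts if v == "human")
    if ai_votes > human_votes:
        return "ai"
    if human_votes > ai_votes:
        return "human"
    return "equal"

print(majority_vote(["ai", "human", "ai"]))  # ai
print(majority_vote(["human", "ai"]))        # equal
```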
if __name__ == '__main__':
survey_texts = get_all_data_from_folder("data", "txt")
evaluated_texts = calculate_scores_texts(survey_texts)
# I manually ordered the texts in the order used in the survey
survey_ai_texts = ['ai\\gpt2_poem.txt\\0', 'ai\\gpt2_poem.txt\\1', 'ai\\opt_poem.txt\\0', 'ai\\opt_poem.txt\\1', 'ai\\gpt4o_poem.txt\\0', 'ai\\gpt4o_poem.txt\\1',
'ai\\gpt4o_wiki.txt\\0', 'ai\\gpt4o_wiki.txt\\1', 'ai\\opt_wiki.txt\\0', 'ai\\opt_wiki.txt\\1', 'ai\\gpt2_wiki.txt\\0', 'ai\\gpt2_wiki.txt\\1',
'ai\\opt_sport.txt\\0', 'ai\\opt_sport.txt\\1', 'ai\\gpt4o_sports.txt\\0', 'ai\\gpt4o_sports.txt\\1', 'ai\\gpt2_sport.txt\\0', 'ai\\gpt2_sport.txt\\1'
]
survey_human_texts = ["human\\poetry.txt\\0", 'human\\poetry.txt\\1', 'human\\poetry.txt\\2', 'human\\poetry.txt\\3', 'human\\poetry.txt\\4', 'human\\poetry.txt\\5',
'human\\wiki.txt\\0', 'human\\wiki.txt\\1', 'human\\wiki.txt\\2', 'human\\wiki.txt\\3', 'human\\wiki.txt\\4', 'human\\wiki.txt\\5',
'human\\sport_bbc.txt\\0', 'human\\sport_bbc.txt\\1', 'human\\sport_bbc.txt\\2', 'human\\sport_bbc.txt\\3', 'human\\sport_bbc.txt\\4', 'human\\sport_bbc.txt\\5'
]
survey_groups = zip(survey_ai_texts, survey_human_texts)
# Rate parameters Coherence, Creativity, Conciseness, Clarity of Concepts between survey groups
def compute_parameters(survey_groups, evaluated_texts):
"""Rate parameters Coherence, Creativity, Conciseness, Clarity of Concepts between survey groups."""
survey_assessment = {}
for i, group in enumerate(survey_groups, start=1):
for idx, name in enumerate(group):
......@@ -265,11 +250,35 @@ if __name__ == '__main__':
clarity_score = evaluation_metrics.calculate_clarity_of_concept()
survey_assessment[i] = {
"coherence": coherence_score,
"conciseness": conciseness_score,
"creativity": creativity_score,
"clarity_of_concept": clarity_score
"Coherence": coherence_score,
"Conciseness": conciseness_score,
"Creativity": creativity_score,
"Clarity of Concept": clarity_score
}
return survey_assessment
def calculated_predictions():
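"""Run the automated metric pipeline over the survey texts and return (predicted_tags, survey_assessment)."""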
survey_texts = get_all_data_from_folder("data", "txt")
evaluated_texts = calculate_scores_texts(survey_texts)
# I manually ordered the texts in the order used in the survey
survey_ai_texts = ['ai\\gpt2_poem.txt\\0', 'ai\\gpt2_poem.txt\\1', 'ai\\opt_poem.txt\\0', 'ai\\opt_poem.txt\\1', 'ai\\gpt4o_poem.txt\\0', 'ai\\gpt4o_poem.txt\\1',
'ai\\gpt4o_wiki.txt\\0', 'ai\\gpt4o_wiki.txt\\1', 'ai\\opt_wiki.txt\\0', 'ai\\opt_wiki.txt\\1', 'ai\\gpt2_wiki.txt\\0', 'ai\\gpt2_wiki.txt\\1',
'ai\\opt_sport.txt\\0', 'ai\\opt_sport.txt\\1', 'ai\\gpt4o_sports.txt\\0', 'ai\\gpt4o_sports.txt\\1', 'ai\\gpt2_sport.txt\\0', 'ai\\gpt2_sport.txt\\1'
]
survey_human_texts = ["human\\poetry.txt\\0", 'human\\poetry.txt\\1', 'human\\poetry.txt\\2', 'human\\poetry.txt\\3', 'human\\poetry.txt\\4', 'human\\poetry.txt\\5',
'human\\wiki.txt\\0', 'human\\wiki.txt\\1', 'human\\wiki.txt\\2', 'human\\wiki.txt\\3', 'human\\wiki.txt\\4', 'human\\wiki.txt\\5',
'human\\sport_bbc.txt\\0', 'human\\sport_bbc.txt\\1', 'human\\sport_bbc.txt\\2', 'human\\sport_bbc.txt\\3', 'human\\sport_bbc.txt\\4', 'human\\sport_bbc.txt\\5'
]
survey_groups = zip(survey_ai_texts, survey_human_texts)
# compute the automated parameter verdicts for each survey group
survey_assessment = compute_parameters(survey_groups, evaluated_texts)
# Automatically assess whether each text is human- or AI-generated
result = predict_human_ai(survey_assessment)
return result, survey_assessment # result holds the predicted tags, survey_assessment the per-group parameter verdicts
if __name__ == '__main__':
result = calculated_predictions()
print(result)
\ No newline at end of file
File moved
import asses_results
import automatic_prediciton
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
def collect_data():
answers = asses_results.get_all_data_from_folder("results", "csv")
headers = answers[0]
only_answers = answers[1:]
survey_data = asses_results.process_survey_data(headers, only_answers)
# correct answers for each survey group (LLM generated text is the correct answer)
correct_text = ["Text 2", "Text 2", "Text 1", "Text 2", "Text 1", "Text 2",
"Text 2", "Text 2", "Text 1", "Text 1", "Text 2", "Text 1",
"Text 1", "Text 2", "Text 2", "Text 1", "Text 1", "Text 2",]
models = ["gpt2", "gpt2", "opt", "opt", "gpt4o", "gpt4o",
"gpt4o", "gpt4o", "opt", "opt", "gpt2", "gpt2",
"opt", "opt", "gpt4o", "gpt4o", "gpt2", "gpt2"]
evaluator = asses_results.Proccess_Data(correct_text, survey_data, models)
total_correct = evaluator.calculate_correct_answers()
total_time = evaluator.get_time()
expert_group = evaluator.compare_groups("expert")
ai_usage_group = evaluator.compare_groups("ai_usage")
time_group = evaluator.compare_groups("time")
correct_percentage = [i[2] for i in total_correct.values()] # extracts average percentage of correct answers
model_results = evaluator.compare_ai(correct_percentage)
parameters = ["Coherence", "Conciseness", "Creativity", "Clarity of Concept"]
models = ["gpt2", "gpt4o", "opt", "human"]
avg_scores_model = {
param: {model: evaluator.average_parameter_model(param, model) for model in models}
for param in parameters
}
avg_scores_question = {param: evaluator.parameter_survey_group_assesment(param) for param in parameters}
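# Return order (indexed positionally in __main__): 0 correct-answer totals, 1 expert group,
# 2 AI-usage group, 3 time groups, 4 per-model accuracy, 5 per-model parameter averages,
# 6 (average time, per-participant minutes), 7 per-question parameter winners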
return total_correct, expert_group, ai_usage_group, time_group, model_results, avg_scores_model, total_time, avg_scores_question
def save_bar_chart(categories, values, filename="barchart.png", title="Average Correct Guess", y="percentage"):
"""Creates and saves a bar chart as a PNG file in the results folder."""
script_dir = Path(__file__).resolve().parent
results_dir = script_dir.parent / "results"
# Create bar chart
plt.figure(figsize=(8, 6))
plt.bar(categories, values, color='steelblue')
if y == "percentage":
# Set y-axis limits from 0 to 1
plt.ylim(0, 1)
# Format y-axis as percentages
plt.gca().set_yticks([i / 10 for i in range(11)]) # 0.0, 0.1, ..., 1.0
plt.gca().set_yticklabels([f"{int(y*100)}%" for y in plt.gca().get_yticks()])
plt.ylabel('Percentage')
if y == "int":
max_value = max(values)
max_y = max_value + 1
# Set y-axis limits from 0 to max_y
plt.ylim(0, max_y)
# Set y-ticks from 0 to max_y with a step of 10
plt.gca().set_yticks(range(0, int(max_y) + 1, 10)) # Step of 10
plt.ylabel('Time (in minutes)')
# Labels and title
plt.xlabel('Categories')
plt.title(title)
# Save plot as a PNG file
save_path = results_dir / filename
plt.savefig(save_path, bbox_inches='tight')
plt.close() # Close the plot to free memory
return save_path
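A quick usage example for `save_bar_chart` with made-up numbers; the percentage axis expects values between 0 and 1, and the file is written to the repository's `results/` folder.

```python
# Hypothetical data, just to show the call signature.
save_bar_chart(["gpt2", "opt", "gpt4o"], [0.35, 0.48, 0.72],
               filename="demo_correct_guess.png", title="Demo Correct Guess", y="percentage")
```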
def save_grouped_bar_chart(results, filename="param_scores.png"):
"""Creates and saves a grouped bar chart with model names inside the bars."""
script_dir = Path(__file__).resolve().parent
results_dir = script_dir.parent / "results"
parameters = list(results.keys()) # Extract parameter names
model_names = list(next(iter(results.values())).keys()) # Extract model names
# Convert dictionary into a 2D list (rows: parameters, columns: models)
values = []
for param in parameters:
param_values = []
for model in model_names:
param_values.append(results[param][model])
values.append(param_values)
values = np.array(values)
# Plot settings
x = np.arange(len(parameters)) # X positions for groups
width = 0.2 # Width of bars
plt.figure(figsize=(10, 6))
# Create bars for each model
for i, model in enumerate(model_names):
bars = plt.bar(x + i * width - (width * (len(model_names) - 1) / 2), values[:, i], width, label=model)
# Add model name inside each bar
for bar in bars:
height = bar.get_height()
plt.text(bar.get_x() + bar.get_width() / 2, height / 2, model, ha='center', va='center', fontsize=10, color='white', fontweight='bold')
# Labels and styling
plt.xlabel("Evaluation Parameters")
plt.ylabel("Scores")
plt.title("Model Performance Across Evaluation Parameters")
plt.xticks(x, parameters)
plt.ylim(0, 5) # Assuming scores range between 0 and 5
plt.legend(title="Models")
# Save the figure
save_path = results_dir / filename
plt.savefig(save_path, bbox_inches='tight')
plt.close() # Close plot to free memory
return save_path
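`save_grouped_bar_chart` expects the `{parameter: {model: score}}` mapping produced by `average_parameter_model`; a made-up example of the shape:

```python
# Hypothetical scores on the 1-5 survey scale.
demo_scores = {
    "Coherence": {"gpt2": 2.1, "gpt4o": 4.3, "opt": 2.8, "human": 3.9},
    "Creativity": {"gpt2": 2.5, "gpt4o": 4.0, "opt": 2.6, "human": 4.1},
}
save_grouped_bar_chart(demo_scores, filename="demo_param_scores.png")
```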
if __name__ == '__main__':
results = collect_data()
# Average correct guess for all the survey groups across different participant groups
categories = ['Average correct', 'Experts', 'Freq. AI usage', 'Below avg Time', 'Above avg Time']
values = [results[0][0][2], results[1][0][2], results[2][0][2], results[3][0][0][2], results[3][1][0][2]]
save_bar_chart(categories, values, "correct_guess.png")
# Average correct guess across all survey groups
categories = [str(num) for num in list(results[0].keys())]
values = [total[2] for total in list(results[0].values())]
save_bar_chart(categories, values, "survey_groups_correct.png")
# Time spent with survey
categories = [str(participant) for participant in list(results[6][1].keys())]
categories.append("0")
values = [time_spent for time_spent in list(results[6][1].values())]
values.append(results[6][0])
save_bar_chart(categories, values, "time_spent.png", "Time Spent", y="int")
# Average scores for each parameter across models
save_grouped_bar_chart(results[5])
# Correct Guess for each model
categories = list(results[4].keys())
values = list(results[4].values())
save_bar_chart(categories, values, "model_results.png")
predictions = automatic_prediciton.calculated_predictions()
# Show how often the simple model predicts correctly
predictions_tag = predictions[0]
categories = [str(question_num) for question_num in predictions_tag.keys()]
values = []
for tag in predictions_tag.values():
# map "human" to 1 and anything else ("ai" or "equal") to 0 so the number of
# values always matches the number of question labels passed to the bar chart
values.append(1 if tag == "human" else 0)
save_bar_chart(categories, values, "automated_prediction.png", "Automated Prediction", y="int")
# count how often the automated parameter verdicts agree with the human survey verdicts
predictions_param = predictions[1]
human_param = results[7]
count, count_total = 0, 0
for question_num, answers in enumerate(predictions_param.values()):
for param, predicted_tag in answers.items():
count_total += 1
try:
human_response = human_param[param][question_num]
if predicted_tag == human_response:
count += 1
except (KeyError, IndexError):
pass
match_accuracy = count / count_total
print(f"Automated/human parameter agreement: {match_accuracy:.2%}")
\ No newline at end of file