diff --git a/README.md b/README.md index b6ec8235a44ef41b63bb48fc22fa9e564bd14271..a2a285a4e9e490f35949db315c8f39ff0245fe3c 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # NLP_Evaluation ## Name -Evaluation of generative AI using human ... and automatic metrics +Evaluation of generative AI using human judgment and automatic metrics ## Description 3 Models were used to generate texts: GPT2, OPT and GPT4o. Text were generated in 3 categories: Poems, science-related topics and sport summaries. Similar prompts were used on all the models. diff --git a/results/ai_usage.png b/results/ai_usage.png new file mode 100644 index 0000000000000000000000000000000000000000..7c421684b841a344bf664df28dc9280f29cc9fd4 Binary files /dev/null and b/results/ai_usage.png differ diff --git a/results/automated_prediciton.png b/results/automated_prediciton.png new file mode 100644 index 0000000000000000000000000000000000000000..1547537ab1c62d84e99da1432d1b4cd2aed4d848 Binary files /dev/null and b/results/automated_prediciton.png differ diff --git a/results/automated_prediction.png b/results/automated_prediction.png new file mode 100644 index 0000000000000000000000000000000000000000..590a8a538f58a8b666546ed1ddf531b9e6539435 Binary files /dev/null and b/results/automated_prediction.png differ diff --git a/results/correct_guess.png b/results/correct_guess.png new file mode 100644 index 0000000000000000000000000000000000000000..5573579f40ec8e806bbfb7ceb88b125704a98b55 Binary files /dev/null and b/results/correct_guess.png differ diff --git a/results/expert_group.png b/results/expert_group.png new file mode 100644 index 0000000000000000000000000000000000000000..863856dbbac3898e7e40c6443aee957dcb260811 Binary files /dev/null and b/results/expert_group.png differ diff --git a/results/model_results.png b/results/model_results.png new file mode 100644 index 0000000000000000000000000000000000000000..19ae2a17ce5efcf32969eddff5733526f4ec30ab Binary files /dev/null and b/results/model_results.png differ diff --git a/results/param_matching.png b/results/param_matching.png new file mode 100644 index 0000000000000000000000000000000000000000..2f831a4bee79b06b03f47dc7dd01e7967a97e410 Binary files /dev/null and b/results/param_matching.png differ diff --git a/results/param_scores.png b/results/param_scores.png new file mode 100644 index 0000000000000000000000000000000000000000..2f831a4bee79b06b03f47dc7dd01e7967a97e410 Binary files /dev/null and b/results/param_scores.png differ diff --git a/results/survey_groups_correct.png b/results/survey_groups_correct.png new file mode 100644 index 0000000000000000000000000000000000000000..4b1ac3b31c5dfef204306d38b9ba4d59787ebb80 Binary files /dev/null and b/results/survey_groups_correct.png differ diff --git a/results/time_spent.png b/results/time_spent.png new file mode 100644 index 0000000000000000000000000000000000000000..4b2c8e7a1a80284d9815101ab5d1b1ddac4e2366 Binary files /dev/null and b/results/time_spent.png differ diff --git a/src/README.md b/src/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ba8a6a25ebfc29c96d14b027048b2ad5878cf27b --- /dev/null +++ b/src/README.md @@ -0,0 +1,4 @@ +# Source Explanation: + +## Description +This file explains the functionality of each script. 
diff --git a/src/__pycache__/asses_results.cpython-312.pyc b/src/__pycache__/asses_results.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62805477b13d740ef88cf051c9aac1974ed4b26d Binary files /dev/null and b/src/__pycache__/asses_results.cpython-312.pyc differ diff --git a/src/__pycache__/automatic_prediciton.cpython-312.pyc b/src/__pycache__/automatic_prediciton.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88b5ecab2080b7ef68bdf101107f17bac4ed71fa Binary files /dev/null and b/src/__pycache__/automatic_prediciton.cpython-312.pyc differ diff --git a/src/__pycache__/compute_metrics.cpython-312.pyc b/src/__pycache__/compute_metrics.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..119d86d170e1c3101a78cd6b3ef9293b15aeffc8 Binary files /dev/null and b/src/__pycache__/compute_metrics.cpython-312.pyc differ diff --git a/src/asses_results.py b/src/asses_results.py index 1f5d548fd7eb2cee99c92499a5e3f24421edc67b..4027e7ba4f6692ad37f201f041273c774398252d 100644 --- a/src/asses_results.py +++ b/src/asses_results.py @@ -1,6 +1,7 @@ from pathlib import Path import csv from datetime import datetime, timedelta +from collections import defaultdict def get_all_data_from_folder(foldername, datatype="txt"): """extracts all files from given folder for further processing""" @@ -59,12 +60,12 @@ class Proccess_Data(object): for num, (start, end) in enumerate(zip(time_start, time_end), start=1): start_time = datetime.strptime(start, "%H:%M:%S") end_time = datetime.strptime(end, "%H:%M:%S") - + if end_time < start_time: continue time = end_time - start_time - if time.total_seconds() / 60 < 180: + if 10 <= time.total_seconds() / 60 <= 180: total_time += (end_time - start_time) user_time[num] = time.total_seconds() // 60 @@ -155,30 +156,123 @@ class Proccess_Data(object): gpt4o_count += 1 return {"gpt2": round(gpt2 / gpt2_count, 2), "opt": round(opt / opt_count, 2), "gpt4o": round(gpt4o / gpt4o_count, 2)} - def average_parameter(self, parameter, model_name): + def average_parameter_model(self, parameter, model_name): """Looks at the parameters like coherence, conciseness, creativity and clarity of concept and calculates the average.""" count, rate = 0, 0 if model_name != "human": - for question_num, correct_label in enumerate(self.correct_labels): - for keys in self.survey_data.keys(): - if parameter in keys and correct_label in keys: - model = self.models[question_num] - if model == model_name: - for rating in self.survey_data[keys]: - if rating: - rate += int(rating) - count += 1 - if model_name == "human": - for question_num, correct_label in enumerate(self.correct_labels): - for keys in self.survey_data.keys(): - if parameter in keys and correct_label not in keys: - model = self.models[question_num] - for rating in self.survey_data[keys]: + question_index = 0 + last_group = None + for key, ratings in self.survey_data.items(): + if question_index >= len(self.correct_labels): + break + + group_id = key.split('/')[-1].strip() + # If this key belongs to the same group as the last processed one, skip it. 
+ if group_id == last_group: + continue + + correct_label = self.correct_labels[question_index] + if parameter in key and correct_label in key: + model = self.models[question_index] + if model == model_name: + for rating in ratings: if rating: rate += int(rating) count += 1 + question_index += 1 + last_group = group_id + + if model_name == "human": + question_index = 0 + last_group = None + for key, ratings in self.survey_data.items(): + if question_index >= len(self.correct_labels): + break + + group_id = key.split('/')[-1].strip() + # If this key belongs to the same group as the last processed one, skip it. + if group_id == last_group: + continue + + correct_label = self.correct_labels[question_index] + if parameter in key and correct_label not in key: + model = self.models[question_index] + for rating in ratings: + if rating: + rate += int(rating) + count += 1 + question_index += 1 + last_group = group_id return round(rate / count, 2) + + def parameter_survey_group_assesment(self, parameter): + """Counts """ + count_human, rate_human, count_ai, rate_ai = 0, 0, 0, 0 + question_index = 0 + last_group = None + data_iter = iter(self.survey_data.items()) + parameter_human_response = [] + for key, ratings in data_iter: + if question_index >= len(self.correct_labels): + break + + group_id = key.split('/')[-1].strip() + # If this key belongs to the same group as the last processed one, skip it. + if group_id == last_group: + continue + + correct_label = self.correct_labels[question_index] + if parameter in key and correct_label in key: + for rating in ratings: + if rating: + rate_ai += int(rating) + count_ai += 1 + # the next element is the human text + next_key, next_ratings = next(data_iter) + for rating in next_ratings: + if rating: + rate_human += int(rating) + count_human += 1 + question_index += 1 + last_group = group_id + if rate_human / count_human > rate_ai / count_ai: + parameter_human_response.append("human") + else: + parameter_human_response.append("ai") + count_human, rate_human, count_ai, rate_ai = 0, 0, 0, 0 + + elif parameter in key and correct_label not in key: + for rating in ratings: + if rating: + rate_human += int(rating) + count_human += 1 + # the next element is the ai text + next_key, next_ratings = next(data_iter) + for rating in next_ratings: + if rating: + rate_ai += int(rating) + count_ai += 1 + question_index += 1 + last_group = group_id + if rate_human / count_human > rate_ai / count_ai: + parameter_human_response.append("human") + elif rate_human / count_human < rate_ai / count_ai: + parameter_human_response.append("ai") + else: + parameter_human_response.append("equal") + count_human, rate_human, count_ai, rate_ai = 0, 0, 0, 0 + + + return parameter_human_response + + + def get_time(self): + time_start = self.survey_data['What time is it now?'] + time_end = self.survey_data['Please enter the time.'] + total_time = self.calculate_total_time(time_start, time_end) + avg_time = total_time[0] + return avg_time, total_time[1] if __name__ == "__main__": answers = get_all_data_from_folder("results", "csv") @@ -186,8 +280,6 @@ if __name__ == "__main__": only_answers = answers[1:] survey_data = process_survey_data(headers, only_answers) - - #print(survey_data.keys()) # correct answers for each survey group (LLM generated text is the correct answer) correct_text = ["Text 2", "Text 2", "Text 1", "Text 2", "Text 1", "Text 2", @@ -199,23 +291,23 @@ if __name__ == "__main__": "opt", "opt", "gpt4o", "gpt4o", "gpt2", "gpt2"] evaluator = Proccess_Data(correct_text, survey_data, 
models) - #total_correct = evaluator.calculate_correct_answers() + total_correct = evaluator.calculate_correct_answers() - # expert_group = evaluator.compare_groups("expert") - # ai_usage_group = evaluator.compare_groups("ai_usage") - # time_group = evaluator.compare_groups("time") + expert_group = evaluator.compare_groups("expert") + ai_usage_group = evaluator.compare_groups("ai_usage") + time_group = evaluator.compare_groups("time") - #correct_percentage = [i[2] for i in total_correct.values()] # extracts average percentage of correct answers - #model_results = evaluator.compare_ai(correct_percentage) + correct_percentage = [i[2] for i in total_correct.values()] # extracts average percentage of correct answers + model_results = evaluator.compare_ai(correct_percentage) parameters = ["Coherence", "Conciseness", "Creativity", "Clarity of Concept"] - models = ["gpt2", "gpt4o", "opt", "human"] + model_names = ["gpt2", "gpt4o", "opt", "human"] - avg_scores = { - param: {model: evaluator.average_parameter(param, model) for model in models} + avg_scores_model = { + param: {model: evaluator.average_parameter_model(param, model) for model in model_names} for param in parameters } - - - + # make a dict with the key being the parameter and the value being a list where the index + # is the question_num in the survey and ai means ai scored higher and vice versa + avg_scores_question = {param: evaluator.parameter_survey_group_assesment(param) for param in parameters} diff --git a/src/evaluate_automatic.py b/src/automatic_prediciton.py similarity index 89% rename from src/evaluate_automatic.py rename to src/automatic_prediciton.py index 0c4d03d8cadf08f3d4b8fe97efc2d09b23ff2a35..5e13458fe36d16e614189e8efa6353f371577050 100644 --- a/src/evaluate_automatic.py +++ b/src/automatic_prediciton.py @@ -1,4 +1,4 @@ -import automatic_metrics as am +import compute_metrics as cm from pathlib import Path import re import copy @@ -62,7 +62,7 @@ def get_all_data_from_folder(foldername, datatype="txt"): def calculate_scores_texts(text): """Calculates scores for given text""" texts = copy.deepcopy(text) - evaluator = am.Compute_Metrics() + evaluator = cm.Compute_Metrics() evaluated_texts = {} for filename in texts: for idx in texts[filename]: @@ -76,7 +76,7 @@ def calculate_scores_texts(text): return evaluated_texts # {filename\idx: [fre, ttr, pmi, tfidf]} class Calculate_Parameters(object): - """"automated procedure to calculate parameters""" + """"Automated procedure to calculate parameters. 
Only says which text has the higher one doesnt rate from 1-5.""" def __init__(self, metrics_ai, metrics_human, question_num): # FRE, TTR, PMI, and TF-IDF are at index 0, 1, 2, and 3 self.ai_fre, self.ai_ttr, self.ai_pmi, self.ai_tfidf = metrics_ai[0], metrics_ai[1], metrics_ai[2], metrics_ai[3] @@ -210,12 +210,12 @@ def predict_human_ai(survey_assessment): keys_to_check = set(rated_param.keys()) if question_num <= 6: - keys_to_check.discard("clarity_of_concept") + keys_to_check.discard("Clarity of Concept") elif 6 < question_num <= 12: - keys_to_check.discard("creativity") + keys_to_check.discard("Creativity") elif 12 < question_num <= 18: - keys_to_check.discard("clarity_of_concept") - keys_to_check.discard("creativity") + keys_to_check.discard("Clarity of Concept") + keys_to_check.discard("Creativity") for key in keys_to_check: if rated_param[key] == "ai": @@ -232,24 +232,9 @@ def predict_human_ai(survey_assessment): predicted_tags[question_num] = "equal" return predicted_tags - - -if __name__ == '__main__': - survey_texts = get_all_data_from_folder("data", "txt") - evaluated_texts = calculate_scores_texts(survey_texts) - # I manually ordered the texts in the order used in the survey - survey_ai_texts = ['ai\\gpt2_poem.txt\\0', 'ai\\gpt2_poem.txt\\1', 'ai\\opt_poem.txt\\0', 'ai\\opt_poem.txt\\1', 'ai\\gpt4o_poem.txt\\0', 'ai\\gpt4o_poem.txt\\1', - 'ai\\gpt4o_wiki.txt\\0', 'ai\\gpt4o_wiki.txt\\1', 'ai\\opt_wiki.txt\\0', 'ai\\opt_wiki.txt\\1', 'ai\\gpt2_wiki.txt\\0', 'ai\\gpt2_wiki.txt\\1', - 'ai\\opt_sport.txt\\0', 'ai\\opt_sport.txt\\1', 'ai\\gpt4o_sports.txt\\0', 'ai\\gpt4o_sports.txt\\1', 'ai\\gpt2_sport.txt\\0', 'ai\\gpt2_sport.txt\\1' - ] - survey_human_texts = ["human\\poetry.txt\\0", 'human\\poetry.txt\\1', 'human\\poetry.txt\\2', 'human\\poetry.txt\\3', 'human\\poetry.txt\\4', 'human\\poetry.txt\\5', - 'human\\wiki.txt\\0', 'human\\wiki.txt\\1', 'human\\wiki.txt\\2', 'human\\wiki.txt\\3', 'human\\wiki.txt\\4', 'human\\wiki.txt\\5', - 'human\\sport_bbc.txt\\0', 'human\\sport_bbc.txt\\1', 'human\\sport_bbc.txt\\2', 'human\\sport_bbc.txt\\3', 'human\\sport_bbc.txt\\4', 'human\\sport_bbc.txt\\5' - ] - survey_groups = zip(survey_ai_texts, survey_human_texts) - - # Rate parameters Coherence, Creativity, Conciseness, Clarity of Concepts between survey groups +def compute_parameters(survey_groups, evaluated_texts): + """Rate parameters Coherence, Creativity, Conciseness, Clarity of Concepts between survey groups.""" survey_assessment = {} for i, group in enumerate(survey_groups, start=1): for idx, name in enumerate(group): @@ -265,11 +250,35 @@ if __name__ == '__main__': clarity_score = evaluation_metrics.calculate_clarity_of_concept() survey_assessment[i] = { - "coherence": coherence_score, - "conciseness": conciseness_score, - "creativity": creativity_score, - "clarity_of_concept": clarity_score + "Coherence": coherence_score, + "Conciseness": conciseness_score, + "Creativity": creativity_score, + "Clarity of Concept": clarity_score } - + return survey_assessment + +def calculated_predictions(): + survey_texts = get_all_data_from_folder("data", "txt") + evaluated_texts = calculate_scores_texts(survey_texts) + + # I manually ordered the texts in the order used in the survey + survey_ai_texts = ['ai\\gpt2_poem.txt\\0', 'ai\\gpt2_poem.txt\\1', 'ai\\opt_poem.txt\\0', 'ai\\opt_poem.txt\\1', 'ai\\gpt4o_poem.txt\\0', 'ai\\gpt4o_poem.txt\\1', + 'ai\\gpt4o_wiki.txt\\0', 'ai\\gpt4o_wiki.txt\\1', 'ai\\opt_wiki.txt\\0', 'ai\\opt_wiki.txt\\1', 'ai\\gpt2_wiki.txt\\0', 'ai\\gpt2_wiki.txt\\1', + 
'ai\\opt_sport.txt\\0', 'ai\\opt_sport.txt\\1', 'ai\\gpt4o_sports.txt\\0', 'ai\\gpt4o_sports.txt\\1', 'ai\\gpt2_sport.txt\\0', 'ai\\gpt2_sport.txt\\1' + ] + survey_human_texts = ["human\\poetry.txt\\0", 'human\\poetry.txt\\1', 'human\\poetry.txt\\2', 'human\\poetry.txt\\3', 'human\\poetry.txt\\4', 'human\\poetry.txt\\5', + 'human\\wiki.txt\\0', 'human\\wiki.txt\\1', 'human\\wiki.txt\\2', 'human\\wiki.txt\\3', 'human\\wiki.txt\\4', 'human\\wiki.txt\\5', + 'human\\sport_bbc.txt\\0', 'human\\sport_bbc.txt\\1', 'human\\sport_bbc.txt\\2', 'human\\sport_bbc.txt\\3', 'human\\sport_bbc.txt\\4', 'human\\sport_bbc.txt\\5' + ] + survey_groups = zip(survey_ai_texts, survey_human_texts) + + # compute the per-parameter comparisons for each survey group + survey_assessment = compute_parameters(survey_groups, evaluated_texts) + # Automatically assess whether each text is human- or AI-generated result = predict_human_ai(survey_assessment) + return result, survey_assessment # result has the predicted tags, survey_assessment the per-parameter assessments + +if __name__ == '__main__': + result = calculated_predictions() + print(result) \ No newline at end of file diff --git a/src/automatic_metrics.py b/src/compute_metrics.py similarity index 100% rename from src/automatic_metrics.py rename to src/compute_metrics.py diff --git a/src/display_results.py b/src/display_results.py new file mode 100644 index 0000000000000000000000000000000000000000..d82f3a1a031f2ca1d8251051fd5b652541b5c53c --- /dev/null +++ b/src/display_results.py @@ -0,0 +1,188 @@ +import asses_results +import automatic_prediciton +import matplotlib.pyplot as plt +from pathlib import Path +import numpy as np + +def collect_data(): + answers = asses_results.get_all_data_from_folder("results", "csv") + headers = answers[0] + only_answers = answers[1:] + + survey_data = asses_results.process_survey_data(headers, only_answers) + + # correct answers for each survey group (LLM generated text is the correct answer) + correct_text = ["Text 2", "Text 2", "Text 1", "Text 2", "Text 1", "Text 2", + "Text 2", "Text 2", "Text 1", "Text 1", "Text 2", "Text 1", + "Text 1", "Text 2", "Text 2", "Text 1", "Text 1", "Text 2",] + + models = ["gpt2", "gpt2", "opt", "opt", "gpt4o", "gpt4o", + "gpt4o", "gpt4o", "opt", "opt", "gpt2", "gpt2", + "opt", "opt", "gpt4o", "gpt4o", "gpt2", "gpt2"] + + evaluator = asses_results.Proccess_Data(correct_text, survey_data, models) + total_correct = evaluator.calculate_correct_answers() + total_time = evaluator.get_time() + + expert_group = evaluator.compare_groups("expert") + ai_usage_group = evaluator.compare_groups("ai_usage") + time_group = evaluator.compare_groups("time") + + correct_percentage = [i[2] for i in total_correct.values()] # extracts average percentage of correct answers + model_results = evaluator.compare_ai(correct_percentage) + + parameters = ["Coherence", "Conciseness", "Creativity", "Clarity of Concept"] + model_names = ["gpt2", "gpt4o", "opt", "human"] + + avg_scores_model = { + param: {model: evaluator.average_parameter_model(param, model) for model in model_names} + for param in parameters + } + + avg_scores_question = {param: evaluator.parameter_survey_group_assesment(param) for param in parameters} + + return total_correct, expert_group, ai_usage_group, time_group, model_results, avg_scores_model, total_time, avg_scores_question + +def save_bar_chart(categories, values, filename="barchart.png", title="Average Correct Guess", y="percentage"): + """Creates and saves a bar chart as a PNG file in the results folder.""" + script_dir = Path(__file__).resolve().parent + results_dir = script_dir.parent / "results" + + # Create bar chart
plt.figure(figsize=(8, 6)) + plt.bar(categories, values, color='steelblue') + if y == "percentage": + # Set y-axis limits from 0 to 1 + plt.ylim(0, 1) + + # Format y-axis as percentages + plt.gca().set_yticks([i / 10 for i in range(11)]) # 0.0, 0.1, ..., 1.0 + plt.gca().set_yticklabels([f"{int(y*100)}%" for y in plt.gca().get_yticks()]) + plt.ylabel('Percentage') + if y == "int": + max_value = max(values) + max_y = max_value + 1 + # Set y-axis limits from 0 to max_y + plt.ylim(0, max_y) + + # Set y-ticks from 0 to max_y with a step of 10 + plt.gca().set_yticks(range(0, int(max_y) + 1, 10)) # Step of 10 + plt.ylabel('Time (in minutes)') + + # Labels and title + plt.xlabel('Categories') + + plt.title(title) + + # Save plot as a PNG file + save_path = results_dir / filename + plt.savefig(save_path, bbox_inches='tight') + plt.close() # Close the plot to free memory + + return save_path + +def save_grouped_bar_chart(results, filename="param_scores.png"): + """Creates and saves a grouped bar chart with model names inside the bars.""" + script_dir = Path(__file__).resolve().parent + results_dir = script_dir.parent / "results" + + parameters = list(results.keys()) # Extract parameter names + model_names = list(next(iter(results.values())).keys()) # Extract model names + + # Convert dictionary into a 2D list (rows: parameters, columns: models) + values = [] + for param in parameters: + param_values = [] + for model in model_names: + param_values.append(results[param][model]) + values.append(param_values) + + values = np.array(values) + + # Plot settings + x = np.arange(len(parameters)) # X positions for groups + width = 0.2 # Width of bars + + plt.figure(figsize=(10, 6)) + + # Create bars for each model + for i, model in enumerate(model_names): + bars = plt.bar(x + i * width - (width * (len(model_names) - 1) / 2), values[:, i], width, label=model) + + # Add model name inside each bar + for bar in bars: + height = bar.get_height() + plt.text(bar.get_x() + bar.get_width() / 2, height / 2, model, ha='center', va='center', fontsize=10, color='white', fontweight='bold') + + # Labels and styling + plt.xlabel("Evaluation Parameters") + plt.ylabel("Scores") + plt.title("Model Performance Across Evaluation Parameters") + plt.xticks(x, parameters) + plt.ylim(0, 5) # Assuming scores range between 0 and 5 + plt.legend(title="Models") + + # Save the figure + save_path = results_dir / filename + plt.savefig(save_path, bbox_inches='tight') + plt.close() # Close plot to free memory + + return save_path + +if __name__ == '__main__': + results = collect_data() + + # Average correct guess for all the survey groups across different participant groups + categories = ['Average correct', 'Experts', 'Freq. 
AI usage', 'Below avg Time', 'Above avg Time'] + values = [results[0][0][2], results[1][0][2], results[2][0][2], results[3][0][0][2], results[3][1][0][2]] + save_bar_chart(categories, values, "correct_guess.png") + + # Average correct guess across all survey groups + categories = [str(num) for num in list(results[0].keys())] + values = [total[2] for total in list(results[0].values())] + save_bar_chart(categories, values, "survey_groups_correct.png") + + # Time spent with survey + categories = [str(participant) for participant in list(results[6][1].keys())] + categories.append("0") + values = [time_spent for time_spent in list(results[6][1].values())] + values.append(results[6][0]) + save_bar_chart(categories, values, "time_spent.png", "Time Spent", y="int") + + # Average scores for each parameter across models + save_grouped_bar_chart(results[5]) + + # Correct Guess for each model + categories = list(results[4].keys()) + values = list(results[4].values()) + save_bar_chart(categories, values, "model_results.png") + + predictions = automatic_prediciton.calculated_predictions() + + # Show how often the simple model predicts correctly + predictions_tag = predictions[0] + categories = [str(question_num) for question_num in predictions_tag.keys()] + values = [] + for tag in predictions_tag.values(): + if tag == "equal": + values.append(0) + if tag == "human": + values.append(1) + save_bar_chart(categories, values, "automated_prediction.png", "Automated Prediction", y="int") + + # Count how often the automatic assessment (which text scored higher per parameter) matches the survey participants' ratings + predictions_param = predictions[1] + human_param = results[7] + count, count_total = 0, 0 + for question_num, answers in enumerate(predictions_param.values()): + for param, predicted_tag in answers.items(): + count_total += 1 + try: + human_response = human_param[param][question_num] + if predicted_tag == human_response: + count += 1 + except (KeyError, IndexError): + # no survey rating for this parameter/question; counted as a non-match + pass + + # share of parameter comparisons where the automatic prediction agrees with the survey outcome + match_accuracy = count / count_total if count_total else 0.0 + print(f"Automatic vs. survey agreement: {match_accuracy:.2f}") \ No newline at end of file
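
For reference, below is a minimal, self-contained sketch of the agreement check performed at the end of display_results.py. The toy dictionaries are invented purely for illustration; they only mimic the shapes of `predictions[1]` (per-question parameter assessments from the automatic metrics) and `results[7]` (per-parameter survey outcomes), which in the real script come from `calculated_predictions()` and `collect_data()`.

```python
# Illustrative only: toy data shaped like the survey_assessment dict
# (question group -> parameter -> "ai"/"human"/"equal") and the
# avg_scores_question dict (parameter -> one outcome per question group).
predicted = {
    1: {"Coherence": "ai", "Conciseness": "human"},
    2: {"Coherence": "human", "Conciseness": "human"},
}
survey = {
    "Coherence": ["ai", "human"],
    "Conciseness": ["human", "equal"],
}

matches, total = 0, 0
for question_idx, answers in enumerate(predicted.values()):
    for param, predicted_tag in answers.items():
        total += 1
        # a comparison counts as a match when the automatic assessment
        # agrees with the survey outcome for the same parameter/question
        if survey[param][question_idx] == predicted_tag:
            matches += 1

print(f"match accuracy: {matches / total:.2f}")  # 0.75 for this toy data
```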