diff --git a/README.md b/README.md index b6ec8235a44ef41b63bb48fc22fa9e564bd14271..a2a285a4e9e490f35949db315c8f39ff0245fe3c 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # NLP_Evaluation ## Name -Evaluation of generative AI using human ... and automatic metrics +Evaluation of generative AI using human judgment and automatic metrics ## Description 3 Models were used to generate texts: GPT2, OPT and GPT4o. Text were generated in 3 categories: Poems, science-related topics and sport summaries. Similar prompts were used on all the models. diff --git a/results/ai_usage.png b/results/ai_usage.png new file mode 100644 index 0000000000000000000000000000000000000000..7c421684b841a344bf664df28dc9280f29cc9fd4 Binary files /dev/null and b/results/ai_usage.png differ diff --git a/results/automated_prediciton.png b/results/automated_prediciton.png new file mode 100644 index 0000000000000000000000000000000000000000..1547537ab1c62d84e99da1432d1b4cd2aed4d848 Binary files /dev/null and b/results/automated_prediciton.png differ diff --git a/results/automated_prediction.png b/results/automated_prediction.png new file mode 100644 index 0000000000000000000000000000000000000000..590a8a538f58a8b666546ed1ddf531b9e6539435 Binary files /dev/null and b/results/automated_prediction.png differ diff --git a/results/correct_guess.png b/results/correct_guess.png new file mode 100644 index 0000000000000000000000000000000000000000..5573579f40ec8e806bbfb7ceb88b125704a98b55 Binary files /dev/null and b/results/correct_guess.png differ diff --git a/results/expert_group.png b/results/expert_group.png new file mode 100644 index 0000000000000000000000000000000000000000..863856dbbac3898e7e40c6443aee957dcb260811 Binary files /dev/null and b/results/expert_group.png differ diff --git a/results/model_results.png b/results/model_results.png new file mode 100644 index 0000000000000000000000000000000000000000..19ae2a17ce5efcf32969eddff5733526f4ec30ab Binary files /dev/null and b/results/model_results.png differ diff --git a/results/param_matching.png b/results/param_matching.png new file mode 100644 index 0000000000000000000000000000000000000000..2f831a4bee79b06b03f47dc7dd01e7967a97e410 Binary files /dev/null and b/results/param_matching.png differ diff --git a/results/param_scores.png b/results/param_scores.png new file mode 100644 index 0000000000000000000000000000000000000000..2f831a4bee79b06b03f47dc7dd01e7967a97e410 Binary files /dev/null and b/results/param_scores.png differ diff --git a/results/survey_groups_correct.png b/results/survey_groups_correct.png new file mode 100644 index 0000000000000000000000000000000000000000..4b1ac3b31c5dfef204306d38b9ba4d59787ebb80 Binary files /dev/null and b/results/survey_groups_correct.png differ diff --git a/results/time_spent.png b/results/time_spent.png new file mode 100644 index 0000000000000000000000000000000000000000..4b2c8e7a1a80284d9815101ab5d1b1ddac4e2366 Binary files /dev/null and b/results/time_spent.png differ diff --git a/src/README.md b/src/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ba8a6a25ebfc29c96d14b027048b2ad5878cf27b --- /dev/null +++ b/src/README.md @@ -0,0 +1,4 @@ +# Source Explanation: + +## Description +This file explains the functionality of each script. 
diff --git a/src/__pycache__/asses_results.cpython-312.pyc b/src/__pycache__/asses_results.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62805477b13d740ef88cf051c9aac1974ed4b26d Binary files /dev/null and b/src/__pycache__/asses_results.cpython-312.pyc differ diff --git a/src/__pycache__/automatic_prediciton.cpython-312.pyc b/src/__pycache__/automatic_prediciton.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88b5ecab2080b7ef68bdf101107f17bac4ed71fa Binary files /dev/null and b/src/__pycache__/automatic_prediciton.cpython-312.pyc differ diff --git a/src/__pycache__/compute_metrics.cpython-312.pyc b/src/__pycache__/compute_metrics.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..119d86d170e1c3101a78cd6b3ef9293b15aeffc8 Binary files /dev/null and b/src/__pycache__/compute_metrics.cpython-312.pyc differ diff --git a/src/asses_results.py b/src/asses_results.py index 1f5d548fd7eb2cee99c92499a5e3f24421edc67b..4027e7ba4f6692ad37f201f041273c774398252d 100644 --- a/src/asses_results.py +++ b/src/asses_results.py @@ -1,6 +1,7 @@ from pathlib import Path import csv from datetime import datetime, timedelta +from collections import defaultdict def get_all_data_from_folder(foldername, datatype="txt"): """extracts all files from given folder for further processing""" @@ -59,12 +60,12 @@ class Proccess_Data(object): for num, (start, end) in enumerate(zip(time_start, time_end), start=1): start_time = datetime.strptime(start, "%H:%M:%S") end_time = datetime.strptime(end, "%H:%M:%S") - + if end_time < start_time: continue time = end_time - start_time - if time.total_seconds() / 60 < 180: + if 10 <= time.total_seconds() / 60 <= 180: total_time += (end_time - start_time) user_time[num] = time.total_seconds() // 60 @@ -155,30 +156,123 @@ class Proccess_Data(object): gpt4o_count += 1 return {"gpt2": round(gpt2 / gpt2_count, 2), "opt": round(opt / opt_count, 2), "gpt4o": round(gpt4o / gpt4o_count, 2)} - def average_parameter(self, parameter, model_name): + def average_parameter_model(self, parameter, model_name): """Looks at the parameters like coherence, conciseness, creativity and clarity of concept and calculates the average.""" count, rate = 0, 0 if model_name != "human": - for question_num, correct_label in enumerate(self.correct_labels): - for keys in self.survey_data.keys(): - if parameter in keys and correct_label in keys: - model = self.models[question_num] - if model == model_name: - for rating in self.survey_data[keys]: - if rating: - rate += int(rating) - count += 1 - if model_name == "human": - for question_num, correct_label in enumerate(self.correct_labels): - for keys in self.survey_data.keys(): - if parameter in keys and correct_label not in keys: - model = self.models[question_num] - for rating in self.survey_data[keys]: + question_index = 0 + last_group = None + for key, ratings in self.survey_data.items(): + if question_index >= len(self.correct_labels): + break + + group_id = key.split('/')[-1].strip() + # If this key belongs to the same group as the last processed one, skip it. 
+ if group_id == last_group: + continue + + correct_label = self.correct_labels[question_index] + if parameter in key and correct_label in key: + model = self.models[question_index] + if model == model_name: + for rating in ratings: if rating: rate += int(rating) count += 1 + question_index += 1 + last_group = group_id + + if model_name == "human": + question_index = 0 + last_group = None + for key, ratings in self.survey_data.items(): + if question_index >= len(self.correct_labels): + break + + group_id = key.split('/')[-1].strip() + # If this key belongs to the same group as the last processed one, skip it. + if group_id == last_group: + continue + + correct_label = self.correct_labels[question_index] + if parameter in key and correct_label not in key: + model = self.models[question_index] + for rating in ratings: + if rating: + rate += int(rating) + count += 1 + question_index += 1 + last_group = group_id return round(rate / count, 2) + + def parameter_survey_group_assesment(self, parameter): + """Counts """ + count_human, rate_human, count_ai, rate_ai = 0, 0, 0, 0 + question_index = 0 + last_group = None + data_iter = iter(self.survey_data.items()) + parameter_human_response = [] + for key, ratings in data_iter: + if question_index >= len(self.correct_labels): + break + + group_id = key.split('/')[-1].strip() + # If this key belongs to the same group as the last processed one, skip it. + if group_id == last_group: + continue + + correct_label = self.correct_labels[question_index] + if parameter in key and correct_label in key: + for rating in ratings: + if rating: + rate_ai += int(rating) + count_ai += 1 + # the next element is the human text + next_key, next_ratings = next(data_iter) + for rating in next_ratings: + if rating: + rate_human += int(rating) + count_human += 1 + question_index += 1 + last_group = group_id + if rate_human / count_human > rate_ai / count_ai: + parameter_human_response.append("human") + else: + parameter_human_response.append("ai") + count_human, rate_human, count_ai, rate_ai = 0, 0, 0, 0 + + elif parameter in key and correct_label not in key: + for rating in ratings: + if rating: + rate_human += int(rating) + count_human += 1 + # the next element is the ai text + next_key, next_ratings = next(data_iter) + for rating in next_ratings: + if rating: + rate_ai += int(rating) + count_ai += 1 + question_index += 1 + last_group = group_id + if rate_human / count_human > rate_ai / count_ai: + parameter_human_response.append("human") + elif rate_human / count_human < rate_ai / count_ai: + parameter_human_response.append("ai") + else: + parameter_human_response.append("equal") + count_human, rate_human, count_ai, rate_ai = 0, 0, 0, 0 + + + return parameter_human_response + + + def get_time(self): + time_start = self.survey_data['What time is it now?'] + time_end = self.survey_data['Please enter the time.'] + total_time = self.calculate_total_time(time_start, time_end) + avg_time = total_time[0] + return avg_time, total_time[1] if __name__ == "__main__": answers = get_all_data_from_folder("results", "csv") @@ -186,8 +280,6 @@ if __name__ == "__main__": only_answers = answers[1:] survey_data = process_survey_data(headers, only_answers) - - #print(survey_data.keys()) # correct answers for each survey group (LLM generated text is the correct answer) correct_text = ["Text 2", "Text 2", "Text 1", "Text 2", "Text 1", "Text 2", @@ -199,23 +291,23 @@ if __name__ == "__main__": "opt", "opt", "gpt4o", "gpt4o", "gpt2", "gpt2"] evaluator = Proccess_Data(correct_text, survey_data, 
models) - #total_correct = evaluator.calculate_correct_answers() + total_correct = evaluator.calculate_correct_answers() - # expert_group = evaluator.compare_groups("expert") - # ai_usage_group = evaluator.compare_groups("ai_usage") - # time_group = evaluator.compare_groups("time") + expert_group = evaluator.compare_groups("expert") + ai_usage_group = evaluator.compare_groups("ai_usage") + time_group = evaluator.compare_groups("time") - #correct_percentage = [i[2] for i in total_correct.values()] # extracts average percentage of correct answers - #model_results = evaluator.compare_ai(correct_percentage) + correct_percentage = [i[2] for i in total_correct.values()] # extracts average percentage of correct answers + model_results = evaluator.compare_ai(correct_percentage) parameters = ["Coherence", "Conciseness", "Creativity", "Clarity of Concept"] - models = ["gpt2", "gpt4o", "opt", "human"] + model_names = ["gpt2", "gpt4o", "opt", "human"] - avg_scores = { - param: {model: evaluator.average_parameter(param, model) for model in models} + avg_scores_model = { + param: {model: evaluator.average_parameter_model(param, model) for model in model_names} for param in parameters } - - - + # make a dict with the key being the parameter and the value being a list where the index + # is the question_num in the survey and ai means ai scored higher and vice versa + avg_scores_question = {param: evaluator.parameter_survey_group_assesment(param) for param in parameters} diff --git a/src/evaluate_automatic.py b/src/automatic_prediciton.py similarity index 89% rename from src/evaluate_automatic.py rename to src/automatic_prediciton.py index 0c4d03d8cadf08f3d4b8fe97efc2d09b23ff2a35..5e13458fe36d16e614189e8efa6353f371577050 100644 --- a/src/evaluate_automatic.py +++ b/src/automatic_prediciton.py @@ -1,4 +1,4 @@ -import automatic_metrics as am +import compute_metrics as cm from pathlib import Path import re import copy @@ -62,7 +62,7 @@ def get_all_data_from_folder(foldername, datatype="txt"): def calculate_scores_texts(text): """Calculates scores for given text""" texts = copy.deepcopy(text) - evaluator = am.Compute_Metrics() + evaluator = cm.Compute_Metrics() evaluated_texts = {} for filename in texts: for idx in texts[filename]: @@ -76,7 +76,7 @@ def calculate_scores_texts(text): return evaluated_texts # {filename\idx: [fre, ttr, pmi, tfidf]} class Calculate_Parameters(object): - """"automated procedure to calculate parameters""" + """"Automated procedure to calculate parameters. 
Only says which text has the higher one doesnt rate from 1-5.""" def __init__(self, metrics_ai, metrics_human, question_num): # FRE, TTR, PMI, and TF-IDF are at index 0, 1, 2, and 3 self.ai_fre, self.ai_ttr, self.ai_pmi, self.ai_tfidf = metrics_ai[0], metrics_ai[1], metrics_ai[2], metrics_ai[3] @@ -210,12 +210,12 @@ def predict_human_ai(survey_assessment): keys_to_check = set(rated_param.keys()) if question_num <= 6: - keys_to_check.discard("clarity_of_concept") + keys_to_check.discard("Clarity of Concept") elif 6 < question_num <= 12: - keys_to_check.discard("creativity") + keys_to_check.discard("Creativity") elif 12 < question_num <= 18: - keys_to_check.discard("clarity_of_concept") - keys_to_check.discard("creativity") + keys_to_check.discard("Clarity of Concept") + keys_to_check.discard("Creativity") for key in keys_to_check: if rated_param[key] == "ai": @@ -232,24 +232,9 @@ def predict_human_ai(survey_assessment): predicted_tags[question_num] = "equal" return predicted_tags - - -if __name__ == '__main__': - survey_texts = get_all_data_from_folder("data", "txt") - evaluated_texts = calculate_scores_texts(survey_texts) - # I manually ordered the texts in the order used in the survey - survey_ai_texts = ['ai\\gpt2_poem.txt\\0', 'ai\\gpt2_poem.txt\\1', 'ai\\opt_poem.txt\\0', 'ai\\opt_poem.txt\\1', 'ai\\gpt4o_poem.txt\\0', 'ai\\gpt4o_poem.txt\\1', - 'ai\\gpt4o_wiki.txt\\0', 'ai\\gpt4o_wiki.txt\\1', 'ai\\opt_wiki.txt\\0', 'ai\\opt_wiki.txt\\1', 'ai\\gpt2_wiki.txt\\0', 'ai\\gpt2_wiki.txt\\1', - 'ai\\opt_sport.txt\\0', 'ai\\opt_sport.txt\\1', 'ai\\gpt4o_sports.txt\\0', 'ai\\gpt4o_sports.txt\\1', 'ai\\gpt2_sport.txt\\0', 'ai\\gpt2_sport.txt\\1' - ] - survey_human_texts = ["human\\poetry.txt\\0", 'human\\poetry.txt\\1', 'human\\poetry.txt\\2', 'human\\poetry.txt\\3', 'human\\poetry.txt\\4', 'human\\poetry.txt\\5', - 'human\\wiki.txt\\0', 'human\\wiki.txt\\1', 'human\\wiki.txt\\2', 'human\\wiki.txt\\3', 'human\\wiki.txt\\4', 'human\\wiki.txt\\5', - 'human\\sport_bbc.txt\\0', 'human\\sport_bbc.txt\\1', 'human\\sport_bbc.txt\\2', 'human\\sport_bbc.txt\\3', 'human\\sport_bbc.txt\\4', 'human\\sport_bbc.txt\\5' - ] - survey_groups = zip(survey_ai_texts, survey_human_texts) - - # Rate parameters Coherence, Creativity, Conciseness, Clarity of Concepts between survey groups +def compute_parameters(survey_groups, evaluated_texts): + """Rate parameters Coherence, Creativity, Conciseness, Clarity of Concepts between survey groups.""" survey_assessment = {} for i, group in enumerate(survey_groups, start=1): for idx, name in enumerate(group): @@ -265,11 +250,35 @@ if __name__ == '__main__': clarity_score = evaluation_metrics.calculate_clarity_of_concept() survey_assessment[i] = { - "coherence": coherence_score, - "conciseness": conciseness_score, - "creativity": creativity_score, - "clarity_of_concept": clarity_score + "Coherence": coherence_score, + "Conciseness": conciseness_score, + "Creativity": creativity_score, + "Clarity of Concept": clarity_score } - + return survey_assessment + +def calculated_predictions(): + survey_texts = get_all_data_from_folder("data", "txt") + evaluated_texts = calculate_scores_texts(survey_texts) + + # I manually ordered the texts in the order used in the survey + survey_ai_texts = ['ai\\gpt2_poem.txt\\0', 'ai\\gpt2_poem.txt\\1', 'ai\\opt_poem.txt\\0', 'ai\\opt_poem.txt\\1', 'ai\\gpt4o_poem.txt\\0', 'ai\\gpt4o_poem.txt\\1', + 'ai\\gpt4o_wiki.txt\\0', 'ai\\gpt4o_wiki.txt\\1', 'ai\\opt_wiki.txt\\0', 'ai\\opt_wiki.txt\\1', 'ai\\gpt2_wiki.txt\\0', 'ai\\gpt2_wiki.txt\\1', + 
'ai\\opt_sport.txt\\0', 'ai\\opt_sport.txt\\1', 'ai\\gpt4o_sports.txt\\0', 'ai\\gpt4o_sports.txt\\1', 'ai\\gpt2_sport.txt\\0', 'ai\\gpt2_sport.txt\\1' + ] + survey_human_texts = ["human\\poetry.txt\\0", 'human\\poetry.txt\\1', 'human\\poetry.txt\\2', 'human\\poetry.txt\\3', 'human\\poetry.txt\\4', 'human\\poetry.txt\\5', + 'human\\wiki.txt\\0', 'human\\wiki.txt\\1', 'human\\wiki.txt\\2', 'human\\wiki.txt\\3', 'human\\wiki.txt\\4', 'human\\wiki.txt\\5', + 'human\\sport_bbc.txt\\0', 'human\\sport_bbc.txt\\1', 'human\\sport_bbc.txt\\2', 'human\\sport_bbc.txt\\3', 'human\\sport_bbc.txt\\4', 'human\\sport_bbc.txt\\5' + ] + survey_groups = zip(survey_ai_texts, survey_human_texts) + + # compute the per-parameter comparisons for each survey group + survey_assessment = compute_parameters(survey_groups, evaluated_texts) + # Automatically assess whether each text is human- or AI-generated result = predict_human_ai(survey_assessment) + return result, survey_assessment # result has the predicted tags, survey_assessment the per-parameter assessments + +if __name__ == '__main__': + result = calculated_predictions() + print(result) \ No newline at end of file diff --git a/src/automatic_metrics.py b/src/compute_metrics.py similarity index 100% rename from src/automatic_metrics.py rename to src/compute_metrics.py diff --git a/src/display_results.py b/src/display_results.py new file mode 100644 index 0000000000000000000000000000000000000000..d82f3a1a031f2ca1d8251051fd5b652541b5c53c --- /dev/null +++ b/src/display_results.py @@ -0,0 +1,188 @@ +import asses_results +import automatic_prediciton +import matplotlib.pyplot as plt +from pathlib import Path +import numpy as np + +def collect_data(): + answers = asses_results.get_all_data_from_folder("results", "csv") + headers = answers[0] + only_answers = answers[1:] + + survey_data = asses_results.process_survey_data(headers, only_answers) + + # correct answers for each survey group (LLM generated text is the correct answer) + correct_text = ["Text 2", "Text 2", "Text 1", "Text 2", "Text 1", "Text 2", + "Text 2", "Text 2", "Text 1", "Text 1", "Text 2", "Text 1", + "Text 1", "Text 2", "Text 2", "Text 1", "Text 1", "Text 2",] + + models = ["gpt2", "gpt2", "opt", "opt", "gpt4o", "gpt4o", + "gpt4o", "gpt4o", "opt", "opt", "gpt2", "gpt2", + "opt", "opt", "gpt4o", "gpt4o", "gpt2", "gpt2"] + + evaluator = asses_results.Proccess_Data(correct_text, survey_data, models) + total_correct = evaluator.calculate_correct_answers() + total_time = evaluator.get_time() + + expert_group = evaluator.compare_groups("expert") + ai_usage_group = evaluator.compare_groups("ai_usage") + time_group = evaluator.compare_groups("time") + + correct_percentage = [i[2] for i in total_correct.values()] # extracts average percentage of correct answers + model_results = evaluator.compare_ai(correct_percentage) + + parameters = ["Coherence", "Conciseness", "Creativity", "Clarity of Concept"] + model_names = ["gpt2", "gpt4o", "opt", "human"] + + avg_scores_model = { + param: {model: evaluator.average_parameter_model(param, model) for model in model_names} + for param in parameters + } + + avg_scores_question = {param: evaluator.parameter_survey_group_assesment(param) for param in parameters} + + return total_correct, expert_group, ai_usage_group, time_group, model_results, avg_scores_model, total_time, avg_scores_question + +def save_bar_chart(categories, values, filename="barchart.png", title="Average Correct Guess", y="percentage"): + """Creates and saves a bar chart as a PNG file in the results folder.""" + script_dir = Path(__file__).resolve().parent + results_dir = script_dir.parent / "results" + + # Create bar chart
plt.figure(figsize=(8, 6)) + plt.bar(categories, values, color='steelblue') + if y == "percentage": + # Set y-axis limits from 0 to 1 + plt.ylim(0, 1) + + # Format y-axis as percentages + plt.gca().set_yticks([i / 10 for i in range(11)]) # 0.0, 0.1, ..., 1.0 + plt.gca().set_yticklabels([f"{int(y*100)}%" for y in plt.gca().get_yticks()]) + plt.ylabel('Percentage') + if y == "int": + max_value = max(values) + max_y = max_value + 1 + # Set y-axis limits from 0 to max_y + plt.ylim(0, max_y) + + # Set y-ticks from 0 to max_y with a step of 10 + plt.gca().set_yticks(range(0, int(max_y) + 1, 10)) # Step of 10 + plt.ylabel('Time (in minutes)') + + # Labels and title + plt.xlabel('Categories') + + plt.title(title) + + # Save plot as a PNG file + save_path = results_dir / filename + plt.savefig(save_path, bbox_inches='tight') + plt.close() # Close the plot to free memory + + return save_path + +def save_grouped_bar_chart(results, filename="param_scores.png"): + """Creates and saves a grouped bar chart with model names inside the bars.""" + script_dir = Path(__file__).resolve().parent + results_dir = script_dir.parent / "results" + + parameters = list(results.keys()) # Extract parameter names + model_names = list(next(iter(results.values())).keys()) # Extract model names + + # Convert dictionary into a 2D list (rows: parameters, columns: models) + values = [] + for param in parameters: + param_values = [] + for model in model_names: + param_values.append(results[param][model]) + values.append(param_values) + + values = np.array(values) + + # Plot settings + x = np.arange(len(parameters)) # X positions for groups + width = 0.2 # Width of bars + + plt.figure(figsize=(10, 6)) + + # Create bars for each model + for i, model in enumerate(model_names): + bars = plt.bar(x + i * width - (width * (len(model_names) - 1) / 2), values[:, i], width, label=model) + + # Add model name inside each bar + for bar in bars: + height = bar.get_height() + plt.text(bar.get_x() + bar.get_width() / 2, height / 2, model, ha='center', va='center', fontsize=10, color='white', fontweight='bold') + + # Labels and styling + plt.xlabel("Evaluation Parameters") + plt.ylabel("Scores") + plt.title("Model Performance Across Evaluation Parameters") + plt.xticks(x, parameters) + plt.ylim(0, 5) # Assuming scores range between 0 and 5 + plt.legend(title="Models") + + # Save the figure + save_path = results_dir / filename + plt.savefig(save_path, bbox_inches='tight') + plt.close() # Close plot to free memory + + return save_path + +if __name__ == '__main__': + results = collect_data() + + # Average correct guess for all the survey groups across different participant groups + categories = ['Average correct', 'Experts', 'Freq. 
AI usage', 'Below avg Time', 'Above avg Time'] + values = [results[0][0][2], results[1][0][2], results[2][0][2], results[3][0][0][2], results[3][1][0][2]] + save_bar_chart(categories, values, "correct_guess.png") + + # Average correct guess across all survey groups + categories = [str(num) for num in list(results[0].keys())] + values = [total[2] for total in list(results[0].values())] + save_bar_chart(categories, values, "survey_groups_correct.png") + + # Time spent with survey + categories = [str(participant) for participant in list(results[6][1].keys())] + categories.append("0") + values = [time_spent for time_spent in list(results[6][1].values())] + values.append(results[6][0]) + save_bar_chart(categories, values, "time_spent.png", "Time Spent", y="int") + + # Average scores for each parameter across models + save_grouped_bar_chart(results[5]) + + # Correct Guess for each model + categories = list(results[4].keys()) + values = list(results[4].values()) + save_bar_chart(categories, values, "model_results.png") + + predictions = automatic_prediciton.calculated_predictions() + + # Show how often the simple model predicts correctly + predictions_tag = predictions[0] + categories = [str(question_num) for question_num in predictions_tag.keys()] + values = [] + for tag in predictions_tag.values(): + if tag == "equal": + values.append(0) + if tag == "human": + values.append(1) + save_bar_chart(categories, values, "automated_prediction.png", "Automated Prediction", y="int") + + # Count how often the automatic assessment (which text scored higher per parameter) matches the survey participants' ratings + predictions_param = predictions[1] + human_param = results[7] + count, count_total = 0, 0 + for question_num, answers in enumerate(predictions_param.values()): + for param, predicted_tag in answers.items(): + count_total += 1 + try: + human_response = human_param[param][question_num] + if predicted_tag == human_response: + count += 1 + except (KeyError, IndexError): + # no survey rating for this parameter/question; counted as a non-match + pass + + # share of parameter comparisons where the automatic prediction agrees with the survey outcome + match_accuracy = count / count_total if count_total else 0.0 + print(f"Automatic vs. survey agreement: {match_accuracy:.2f}") \ No newline at end of file
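
For reference, below is a minimal, self-contained sketch of the agreement check performed at the end of display_results.py. The toy dictionaries are invented purely for illustration; they only mimic the shapes of `predictions[1]` (per-question parameter assessments from the automatic metrics) and `results[7]` (per-parameter survey outcomes), which in the real script come from `calculated_predictions()` and `collect_data()`.

```python
# Illustrative only: toy data shaped like the survey_assessment dict
# (question group -> parameter -> "ai"/"human"/"equal") and the
# avg_scores_question dict (parameter -> one outcome per question group).
predicted = {
    1: {"Coherence": "ai", "Conciseness": "human"},
    2: {"Coherence": "human", "Conciseness": "human"},
}
survey = {
    "Coherence": ["ai", "human"],
    "Conciseness": ["human", "equal"],
}

matches, total = 0, 0
for question_idx, answers in enumerate(predicted.values()):
    for param, predicted_tag in answers.items():
        total += 1
        # a comparison counts as a match when the automatic assessment
        # agrees with the survey outcome for the same parameter/question
        if survey[param][question_idx] == predicted_tag:
            matches += 1

print(f"match accuracy: {matches / total:.2f}")  # 0.75 for this toy data
```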