Compare revisions

lkim · lkim · b4cebef1 · b4cebef1 · b4cebef1 · b4cebef1
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ Sample dataset used for CoT analysis

 ## Prompts and Evaluation Results

-Prompts used to evaluate SummEval with GPT-4 are in prompts/summeval (by G-Eval paper)  
+Prompts used to evaluate SummEval with GPT-4 (by G-Eval paper), along with the base and detailed prompts for CoT analysis, are in prompts/summeval  
 Auto-CoT prompts are in prompts/cot_analysis

 GPT-4 G-eval results on SummEval are in results (by G-Eval paper)  

--- a/auto_cot.py
+++ b/auto_cot.py
 from google import genai
 from google.genai import types
 from groq import Groq
+from openai import OpenAI
 import os

-api_key = "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"
-
 def read_prompts_from_file(filename, delimiter):
    """Reads a file and splits the content into prompts based on the delimiter."""
    with open(filename, "r", encoding="utf-8") as file:
@@ -30,7 +29,7 @@ def call_gemini(prompt):
 def call_Qwen(prompt):
    """Call Qwen via Groq and return the response."""
    try:
-        client = Groq(api_key= api_key)#os.environ.get("GROQ_API_KEY"))
+        client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

        response = client.chat.completions.create(
        model="qwen-2.5-32b",
@@ -45,12 +44,12 @@ def call_Qwen(prompt):
        return f"Groq API error: {e}"
    
 def call_llama(prompt):
-    """Call Llama via Groq and return the response."""
+    """Call Llama via Hyperbolic and return the response."""
    try:
-        client = Groq(api_key= api_key)#os.environ.get("GROQ_API_KEY"))
+        client = OpenAI(api_key=os.environ.get("HYPERBOLIC_API_KEY"), base_url="https://api.hyperbolic.xyz/v1")

        response = client.chat.completions.create(
-        model="llama3-70b-8192",
+        model="meta-llama/Meta-Llama-3-70B-Instruct",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
        )
@@ -58,7 +57,7 @@ def call_llama(prompt):
        return response.choices[0].message.content

    except Exception as e:
-        return f"Groq API error: {e}"
+        return f"Hyperbolic API error: {e}"
    
 if __name__ == "__main__":
    input_filename = 'prompts\\cot_analysis\\base_prompts.txt'

--- a/correlation_tables/gemini_output.txt
+++ b/correlation_tables/gemini_output.txt

-Filename: .\results\gemini\gemini_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.3529411764705883
+Filename: .\results\gemini\gemini_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.3529411764705883, Skipped Scores: 5
 +---------+----------+---------+
 | Pearson | Spearman | Kendall |
 +---------+----------+---------+
 |  0.6866 |  0.6713  |  0.6578 |
 +---------+----------+---------+
-Filename: .\results\gemini\gemini_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.5
+Filename: .\results\gemini\gemini_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.5, Skipped Scores: 1
 +---------+----------+---------+
 | Pearson | Spearman | Kendall |
 +---------+----------+---------+
 |  0.9922 |   1.0    |   1.0   |
 +---------+----------+---------+
-Filename: .\results\gemini\gemini_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.8596491228070176
+Filename: .\results\gemini\gemini_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.8596491228070176, Skipped Scores: 1
 +---------+----------+---------+
 | Pearson | Spearman | Kendall |
 +---------+----------+---------+
 |  0.6381 |  0.6364  |  0.6267 |
 +---------+----------+---------+
-Filename: .\results\gemini\llama_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.466666666666667
+Filename: .\results\gemini\llama_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.466666666666667, Skipped Scores: 24
 +---------+----------+---------+
 | Pearson | Spearman | Kendall |
 +---------+----------+---------+
 |  0.5714 |  0.5714  |  0.5714 |
 +---------+----------+---------+
-Filename: .\results\gemini\llama_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.5
+Filename: .\results\gemini\llama_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.5, Skipped Scores: 0
 +---------+----------+---------+
 | Pearson | Spearman | Kendall |
 +---------+----------+---------+
 |   1.0   |   1.0    |   1.0   |
 +---------+----------+---------+
-Filename: .\results\gemini\llama_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.85
+Filename: .\results\gemini\llama_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.85, Skipped Scores: 0
 +---------+----------+---------+
 | Pearson | Spearman | Kendall |
 +---------+----------+---------+
 |  0.5857 |  0.575   |  0.5612 |
 +---------+----------+---------+
-Filename: .\results\gemini\qwen_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.3333333333333335
+Filename: .\results\gemini\qwen_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.3333333333333335, Skipped Scores: 16
 +---------+----------+---------+
 | Pearson | Spearman | Kendall |
 +---------+----------+---------+
 |  0.6275 |  0.6065  |  0.5855 |
 +---------+----------+---------+
-Filename: .\results\gemini\qwen_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.823529411764706
+Filename: .\results\gemini\qwen_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.823529411764706, Skipped Scores: 0
 +---------+----------+---------+
 | Pearson | Spearman | Kendall |
 +---------+----------+---------+
 |  0.9768 |  0.9833  |  0.9771 |
 +---------+----------+---------+
-Filename: .\results\gemini\qwen_rel_detailed.json, Dimension: relevance, Average Prediction Score: 4.022222222222222
+Filename: .\results\gemini\qwen_rel_detailed.json, Dimension: relevance, Average Prediction Score: 4.022222222222222, Skipped Scores: 0
 +---------+----------+---------+
 | Pearson | Spearman | Kendall |
 +---------+----------+---------+
 |  0.6407 |  0.6244  |  0.6148 |
 +---------+----------+---------+
-Filename: .\results\gemini\scrambled_coh.json, Dimension: coherence, Average Prediction Score: 3.24
+Filename: .\results\gemini\scrambled_coh.json, Dimension: coherence, Average Prediction Score: 3.24, Skipped Scores: 16
 +---------+----------+---------+
 | Pearson | Spearman | Kendall |
 +---------+----------+---------+
 |  0.7879 |  0.7855  |  0.7749 |
 +---------+----------+---------+
-Filename: .\results\gemini\scrambled_con.json, Dimension: consistency, Average Prediction Score: 3.5384615384615383
+Filename: .\results\gemini\scrambled_con.json, Dimension: consistency, Average Prediction Score: 3.5384615384615383, Skipped Scores: 0
 +---------+----------+---------+
 | Pearson | Spearman | Kendall |
 +---------+----------+---------+
 |  0.969  |  0.9777  |  0.9694 |
 +---------+----------+---------+
-Filename: .\results\gemini\scrambled_rel.json, Dimension: relevance, Average Prediction Score: 3.95
+Filename: .\results\gemini\scrambled_rel.json, Dimension: relevance, Average Prediction Score: 3.95, Skipped Scores: 0
 +---------+----------+---------+
 | Pearson | Spearman | Kendall |
 +---------+----------+---------+
 |  0.7872 |  0.773   |  0.7661 |
 +---------+----------+---------+
-Filename: .\results\gemini\short_coh.json, Dimension: coherence, Average Prediction Score: 3.5952380952380953
+Filename: .\results\gemini\short_coh.json, Dimension: coherence, Average Prediction Score: 3.41025641025641, Skipped Scores: 9
 +---------+----------+---------+
 | Pearson | Spearman | Kendall |
 +---------+----------+---------+
-|  0.4153 |  0.4092  |  0.4065 |
+|  0.614  |  0.5931  |  0.596  |
 +---------+----------+---------+
-Filename: .\results\gemini\short_con.json, Dimension: consistency, Average Prediction Score: 3.3333333333333335
+Filename: .\results\gemini\short_con.json, Dimension: consistency, Average Prediction Score: 3.6, Skipped Scores: 1
 +---------+----------+---------+
 | Pearson | Spearman | Kendall |
 +---------+----------+---------+
-|  0.9735 |  0.9809  |  0.9738 |
+|  0.9794 |  0.9851  |  0.9796 |
 +---------+----------+---------+
-Filename: .\results\gemini\short_rel.json, Dimension: relevance, Average Prediction Score: 3.8333333333333335
+Filename: .\results\gemini\short_rel.json, Dimension: relevance, Average Prediction Score: 3.8292682926829267, Skipped Scores: 0
 +---------+----------+---------+
 | Pearson | Spearman | Kendall |
 +---------+----------+---------+
-|  0.5161 |  0.5083  |  0.503  |
+|  0.5082 |  0.5004  |  0.4906 |
 +---------+----------+---------+
\ No newline at end of file
--- a/correlation_tables/llama_output.txt
+++ b/correlation_tables/llama_output.txt
+
+Filename: .\results\llama\gemini_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.0454545454545454, Skipped Scores: 0
+---------+----------+---------+
+| Pearson | Spearman | Kendall |
+---------+----------+---------+
+|  0.6394 |  0.626   |  0.6219 |
+---------+----------+---------+
+Filename: .\results\llama\gemini_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.7142857142857144, Skipped Scores: 0
+---------+----------+---------+
+| Pearson | Spearman | Kendall |
+---------+----------+---------+
+|  0.7144 |  0.7222  |  0.7222 |
+---------+----------+---------+
+Filename: .\results\llama\gemini_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.675, Skipped Scores: 0
+---------+----------+---------+
+| Pearson | Spearman | Kendall |
+---------+----------+---------+
+|  0.7344 |  0.7322  |  0.7299 |
+---------+----------+---------+
+Filename: .\results\llama\llama_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.088888888888889, Skipped Scores: 0
+---------+----------+---------+
+| Pearson | Spearman | Kendall |
+---------+----------+---------+
+|  0.7767 |  0.7727  |  0.7656 |
+---------+----------+---------+
+Filename: .\results\llama\llama_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.7, Skipped Scores: 0
+---------+----------+---------+
+| Pearson | Spearman | Kendall |
+---------+----------+---------+
+|  0.7849 |  0.7927  |  0.7982 |
+---------+----------+---------+
+Filename: .\results\llama\llama_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.6315789473684212, Skipped Scores: 0
+---------+----------+---------+
+| Pearson | Spearman | Kendall |
+---------+----------+---------+
+|   0.5   |  0.4839  |  0.4853 |
+---------+----------+---------+
+Filename: .\results\llama\qwen_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.0833333333333335, Skipped Scores: 0
+---------+----------+---------+
+| Pearson | Spearman | Kendall |
+---------+----------+---------+
+|  0.6922 |  0.6791  |  0.6651 |
+---------+----------+---------+
+Filename: .\results\llama\qwen_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.5238095238095237, Skipped Scores: 0
+---------+----------+---------+
+| Pearson | Spearman | Kendall |
+---------+----------+---------+
+|  0.7571 |  0.7629  |  0.7574 |
+---------+----------+---------+
+Filename: .\results\llama\qwen_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.7333333333333334, Skipped Scores: 0
+---------+----------+---------+
+| Pearson | Spearman | Kendall |
+---------+----------+---------+
+|  0.6286 |  0.6109  |  0.6095 |
+---------+----------+---------+
+Filename: .\results\llama\scrambled_coh.json, Dimension: coherence, Average Prediction Score: 3.1777777777777776, Skipped Scores: 0
+---------+----------+---------+
+| Pearson | Spearman | Kendall |
+---------+----------+---------+
+|  0.7154 |  0.705   |  0.6921 |
+---------+----------+---------+
+Filename: .\results\llama\scrambled_con.json, Dimension: consistency, Average Prediction Score: 3.5, Skipped Scores: 0
+---------+----------+---------+
+| Pearson | Spearman | Kendall |
+---------+----------+---------+
+|  0.8255 |  0.8333  |  0.8333 |
+---------+----------+---------+
+Filename: .\results\llama\scrambled_rel.json, Dimension: relevance, Average Prediction Score: 3.8666666666666667, Skipped Scores: 0
+---------+----------+---------+
+| Pearson | Spearman | Kendall |
+---------+----------+---------+
+|  0.5328 |  0.539   |  0.5402 |
+---------+----------+---------+
+Filename: .\results\llama\short_coh.json, Dimension: coherence, Average Prediction Score: 3.475, Skipped Scores: 0
+---------+----------+---------+
+| Pearson | Spearman | Kendall |
+---------+----------+---------+
+|  0.697  |  0.6901  |  0.6745 |
+---------+----------+---------+
+Filename: .\results\llama\short_con.json, Dimension: consistency, Average Prediction Score: 3.857142857142857, Skipped Scores: 0
+---------+----------+---------+
+| Pearson | Spearman | Kendall |
+---------+----------+---------+
+|  0.8255 |  0.8333  |  0.8333 |
+---------+----------+---------+
+Filename: .\results\llama\short_rel.json, Dimension: relevance, Average Prediction Score: 4.038461538461538, Skipped Scores: 0
+---------+----------+---------+
+| Pearson | Spearman | Kendall |
+---------+----------+---------+
+|  0.4965 |  0.4944  |  0.4976 |
+---------+----------+---------+
\ No newline at end of file
--- a/gemini_eval.py
+++ b/gemini_eval.py
@@ -22,7 +22,7 @@ sample_summeval = [summeval[i] for i in sample_indices]

 ct, ignore = 0, 0

-for file in os.listdir(prompt_dir):
+for file in os.listdir(prompt_dir)[12:]:
    if file.endswith(".txt"):
        prompt_fp = os.path.join(prompt_dir, file)
        output_fp = os.path.join(output_dir, file.replace(".txt", ".json"))
@@ -41,7 +41,7 @@ for file in os.listdir(prompt_dir):
                        contents=cur_prompt,
                        config=types.GenerateContentConfig(
                        temperature=2,
-                        max_output_tokens=5,
+                        max_output_tokens=5,    #Should have been slightly higher to accommodate invalid model responses
                        top_p=1,
                        frequency_penalty=0,
                        presence_penalty=0,

--- a/llama3_eval.py
+++ b/llama3_eval.py
 from groq import Groq
+from openai import OpenAI
 import random
 import os
 import json
 import tqdm
 import time

-client = Groq(api_key="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")
+client = OpenAI(
+    api_key=os.environ["HYPERBOLIC_API_KEY"],
+    base_url="https://api.hyperbolic.xyz/v1",
+    )

 prompt_dir = 'prompts\\cot_analysis'
 output_dir = 'results\\llama'
 summeval_dir = 'data\\summeval.json'
-model = 'llama3-70b-8192'
+model = 'meta-llama/Meta-Llama-3-70B-Instruct'

 os.makedirs(output_dir, exist_ok=True)
 summeval = json.load(open(summeval_dir))
@@ -21,7 +25,7 @@ sample_summeval = [summeval[i] for i in sample_indices]

 ct, ignore = 0, 0

-for file in os.listdir(prompt_dir):
+for file in os.listdir(prompt_dir)[12:]:
    if file.endswith(".txt"):
        prompt_fp = os.path.join(prompt_dir, file)
        output_fp = os.path.join(output_dir, file.replace(".txt", ".json"))
@@ -38,8 +42,10 @@ for file in os.listdir(prompt_dir):
                try:
                    _response = client.chat.completions.create(
                        model=model, 
-                        messages=[{"role": "system", "content": "You are a helpful assistant that compares the Source Text and Summary based on the prompt given. You only give numerical values as response, based on the Evalation Steps given, and do not respond with text."},
-                            {"role": "user", "content": cur_prompt}], 
+                        messages=[
+                            {"role": "system", "content": "You are a helpful assistant that compares the Source Text and Summary based on the prompt given. You only give numerical values as response, based on the Evalation Steps given, and do not respond with text."},
+                            {"role": "user", "content": cur_prompt}
+                            ], 
                        temperature=0,
                        max_completion_tokens=10,
                        top_p=1,
@@ -47,7 +53,7 @@ for file in os.listdir(prompt_dir):
                        presence_penalty=0,
                        stop=None,
                        #logprobs=True,     Not yet supported
-                        #n=20               Currently only n=1 supported
+                        #n=20               Possible to estimate logprobs via high temperature and n
                    )
                    time.sleep(0.5)


--- a/meta_eval_summeval.py
+++ b/meta_eval_summeval.py
@@ -9,7 +9,7 @@ from glob import glob
 def calculate_correlation(pred_score, human_score, result):
    assert len(pred_score) == len(human_score)

-    #Part of original code, not sure how to fix without breaking it
+    #Part of original code: len(result) is evaluated before the None check, so result=None raises TypeError; not sure how to fix without breaking it
    if (len(result) == 0) or result is None:
        result = {'pearson': 0, 'spearman': 0, 'kendalltau': 0}
    result['pearson'] += pearsonr(pred_score, human_score)[0]
@@ -18,7 +18,7 @@ def calculate_correlation(pred_score, human_score, result):

    return result

-#Very problematic way of doing this, but wasn't sure how else to do it (alternative would be to change the filenames to name the dimensions directly)
+#Problematic approach, but wasn't sure how else to do it (alternative is changing the filenames to name the dimensions directly)
 def dimension_from_filename(filename):

    #Asked ChatGPT for this Regex
@@ -37,24 +37,21 @@ def dimension_from_filename(filename):
    else:
        return None

-def print_correlations(result, n, input_file, dimension, avg_pred_score, output_file):
+def print_correlations(result, n, input_file, dimension, avg_pred_score, skipped_scores, output_file):
    table = PrettyTable(['Pearson', 'Spearman', 'Kendall'])
    if (n == 0):
        n = 1
    table.add_row(
        [round(result['pearson'] / n, 4), round(result['spearman'] / n, 4), round(result['kendalltau'] / n, 4)])
    
-    output_file.write(f"\nFilename: {input_file}, Dimension: {dimension}, Average Prediction Score: {avg_pred_score}\n")
+    output_file.write(f"\nFilename: {input_file}, Dimension: {dimension}, Average Prediction Score: {avg_pred_score}, Skipped Scores: {skipped_scores}\n")
    output_file.write(str(table))

 def parse_output(output):
-    #Asked ChatGPT for this Regex (changed it to capture model responses that do not start with a numerical value)
-    matched = re.search(r"([0-9]+(?:\.[0-9]*)?)", output)
+    #Asked ChatGPT for this Regex (changed it from original code to capture model responses that do not start with a numerical value)
+    matched = re.search(r"([1-5]+(?:\.[1-5]*)?)", output)
    if matched:
-        try:
-            score = float(matched.group(1))
-        except:
-            score = 0
+        score = float(matched.group(1))
    else:
        score = 0
    return score
@@ -115,6 +112,6 @@ if __name__ == '__main__':
                    d_ctr += 1

                avg_pred_score = total_pred_score / scores_count
-                print_correlations(results, n=d_ctr, input_file = input_file, dimension = dimension, avg_pred_score = avg_pred_score, output_file = output_file)
+                print_correlations(results, n=d_ctr, input_file = input_file, dimension = dimension, avg_pred_score = avg_pred_score, skipped_scores = skipped_scores, output_file = output_file)
            else:
                print(f"Skipping file: {input_file}")
\ No newline at end of file
--- a/prompts/cot_analysis/short_coh.txt
+++ b/prompts/cot_analysis/short_coh.txt
@@ -10,11 +10,11 @@ Coherence (1-5) - the collective quality of all sentences. We align this dimensi

 Evaluation Steps:

-1. Read.
-2. Assess quality.
-3. Determine if well-structured and well-organized.
-4. Decide if it forms a coherent body.
-5. Rate on scale of 1-5 for Coherence.
+1. read
+2. assess quality
+3. determine if structured/organized
+4. decide if coherent body
+5. rate on scale 1-5 for Coherence


 Example:

--- a/prompts/cot_analysis/short_con.txt
+++ b/prompts/cot_analysis/short_con.txt
@@ -10,10 +10,10 @@ Consistency (1-5) - the factual alignment between the summary and the summarized

 Evaluation Steps:

-1. Read news article.
-2. Read summary.
-3. Identify inconsistencies/hallucinations.
-4. Rate on scale of 1-5 for Consistency.
+1. read news article
+2. read summary
+3. identify inconsistencies/hallucinations
+4. rate on scale 1-5 for Consistency


 Example:

--- a/prompts/cot_analysis/short_rel.txt
+++ b/prompts/cot_analysis/short_rel.txt
@@ -10,11 +10,11 @@ Relevance (1-5) - selection of important content from the source. The summary sh

 Evaluation Steps:

-1. Read.
-2. Identify important information.
-3. Determine if only important information included.
-4. Check redundancies/excess information.
-5. Rate on scale of 1-5 for Relevance.
+1. read
+2. identify important information
+3. determine if only important information included
+4. check redundancies/excess information
+5. rate on scale 1-5 for Relevance


 Example:

--- a/prompts/cot_analysis/base_prompts.txt
+++ b/prompts/cot_analysis/base_prompts.txt
--- a/prompts/cot_analysis/detailed_prompts.txt
+++ b/prompts/cot_analysis/detailed_prompts.txt
--- a/qwen_eval.py
+++ b/qwen_eval.py
@@ -5,7 +5,7 @@ import json
 import time
 import tqdm

-client = Groq(api_key="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")
+client = Groq(api_key=os.environ["GROQ_API_KEY"])

 prompt_dir = 'prompts\\cot_analysis'
 output_dir = 'results\\qwen'
@@ -20,8 +20,6 @@ sample_indices = sorted(random.sample(range(len(summeval)), 100))
 sample_summeval = [summeval[i] for i in sample_indices]

 for file in os.listdir(prompt_dir):
-    if file.startswith(("gemini", "llama", "qwen")):
-        continue
    if file.endswith(".txt"):
        prompt_fp = os.path.join(prompt_dir, file)
        output_fp = os.path.join(output_dir, file.replace(".txt", ".json"))
@@ -41,7 +39,7 @@ for file in os.listdir(prompt_dir):
                        model=model, 
                        messages=[{"role": "user", "content": cur_prompt}], 
                        temperature=0,
-                        max_completion_tokens=5,
+                        max_completion_tokens=10,
                        top_p=1,
                        frequency_penalty=0,
                        presence_penalty=0,

--- a/results/gemini/short_coh.json
+++ b/results/gemini/short_coh.json
--- a/results/gemini/short_con.json
+++ b/results/gemini/short_con.json
--- a/results/gemini/short_rel.json
+++ b/results/gemini/short_rel.json
--- a/results/llama/gemini_coh_detailed.json
+++ b/results/llama/gemini_coh_detailed.json
--- a/results/llama/gemini_con_detailed.json
+++ b/results/llama/gemini_con_detailed.json
--- a/results/llama/gemini_rel_detailed.json
+++ b/results/llama/gemini_rel_detailed.json
--- a/results/llama/llama_coh_detailed.json
+++ b/results/llama/llama_coh_detailed.json
No results found