Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • lkim/analysis-of-chain-of-thought-in-llm-scoring
1 result
Show changes
Commits on Source (2)
Showing
with 7006 additions and 475 deletions
......@@ -14,7 +14,7 @@ Sample dataset used for CoT analysis
## Prompts and Evaluation Results
Prompts used to evaluate SummEval with GPT-4 are in prompts/summeval (by G-Eval paper)
The prompts used to evaluate SummEval with GPT-4 (from the G-Eval paper), as well as the base and detailed prompts for CoT analysis, are in prompts/summeval
Auto-CoT prompts are in prompts/cot_analysis
GPT-4 G-eval results on SummEval are in results (by G-Eval paper)
......
from google import genai
from google.genai import types
from groq import Groq
from openai import OpenAI
import os
api_key = "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"
def read_prompts_from_file(filename, delimiter):
"""Reads a file and splits the content into prompts based on the delimiter."""
with open(filename, "r", encoding="utf-8") as file:
......@@ -30,7 +29,7 @@ def call_gemini(prompt):
def call_Qwen(prompt):
"""Call Qwen via Groq and return the response."""
try:
client = Groq(api_key= api_key)#os.environ.get("GROQ_API_KEY"))
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
response = client.chat.completions.create(
model="qwen-2.5-32b",
......@@ -45,12 +44,12 @@ def call_Qwen(prompt):
return f"Groq API error: {e}"
def call_llama(prompt):
"""Call Llama via Groq and return the response."""
"""Call Llama via Hyperbolic and return the response."""
try:
client = Groq(api_key= api_key)#os.environ.get("GROQ_API_KEY"))
client = OpenAI(api_key=os.environ.get("HYPERBOLIC_API_KEY"), base_url="https://api.hyperbolic.xyz/v1")
response = client.chat.completions.create(
model="llama3-70b-8192",
model="meta-llama/Meta-Llama-3-70B-Instruct",
messages=[{"role": "user", "content": prompt}],
temperature=0.0,
)
......@@ -58,7 +57,7 @@ def call_llama(prompt):
return response.choices[0].message.content
except Exception as e:
return f"Groq API error: {e}"
return f"Hyperbolic API error: {e}"
if __name__ == "__main__":
input_filename = 'prompts\\cot_analysis\\base_prompts.txt'
......
Filename: .\results\gemini\gemini_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.3529411764705883
Filename: .\results\gemini\gemini_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.3529411764705883, Skipped Scores: 5
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.6866 | 0.6713 | 0.6578 |
+---------+----------+---------+
Filename: .\results\gemini\gemini_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.5
Filename: .\results\gemini\gemini_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.5, Skipped Scores: 1
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.9922 | 1.0 | 1.0 |
+---------+----------+---------+
Filename: .\results\gemini\gemini_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.8596491228070176
Filename: .\results\gemini\gemini_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.8596491228070176, Skipped Scores: 1
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.6381 | 0.6364 | 0.6267 |
+---------+----------+---------+
Filename: .\results\gemini\llama_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.466666666666667
Filename: .\results\gemini\llama_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.466666666666667, Skipped Scores: 24
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.5714 | 0.5714 | 0.5714 |
+---------+----------+---------+
Filename: .\results\gemini\llama_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.5
Filename: .\results\gemini\llama_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.5, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 1.0 | 1.0 | 1.0 |
+---------+----------+---------+
Filename: .\results\gemini\llama_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.85
Filename: .\results\gemini\llama_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.85, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.5857 | 0.575 | 0.5612 |
+---------+----------+---------+
Filename: .\results\gemini\qwen_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.3333333333333335
Filename: .\results\gemini\qwen_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.3333333333333335, Skipped Scores: 16
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.6275 | 0.6065 | 0.5855 |
+---------+----------+---------+
Filename: .\results\gemini\qwen_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.823529411764706
Filename: .\results\gemini\qwen_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.823529411764706, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.9768 | 0.9833 | 0.9771 |
+---------+----------+---------+
Filename: .\results\gemini\qwen_rel_detailed.json, Dimension: relevance, Average Prediction Score: 4.022222222222222
Filename: .\results\gemini\qwen_rel_detailed.json, Dimension: relevance, Average Prediction Score: 4.022222222222222, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.6407 | 0.6244 | 0.6148 |
+---------+----------+---------+
Filename: .\results\gemini\scrambled_coh.json, Dimension: coherence, Average Prediction Score: 3.24
Filename: .\results\gemini\scrambled_coh.json, Dimension: coherence, Average Prediction Score: 3.24, Skipped Scores: 16
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7879 | 0.7855 | 0.7749 |
+---------+----------+---------+
Filename: .\results\gemini\scrambled_con.json, Dimension: consistency, Average Prediction Score: 3.5384615384615383
Filename: .\results\gemini\scrambled_con.json, Dimension: consistency, Average Prediction Score: 3.5384615384615383, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.969 | 0.9777 | 0.9694 |
+---------+----------+---------+
Filename: .\results\gemini\scrambled_rel.json, Dimension: relevance, Average Prediction Score: 3.95
Filename: .\results\gemini\scrambled_rel.json, Dimension: relevance, Average Prediction Score: 3.95, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7872 | 0.773 | 0.7661 |
+---------+----------+---------+
Filename: .\results\gemini\short_coh.json, Dimension: coherence, Average Prediction Score: 3.5952380952380953
Filename: .\results\gemini\short_coh.json, Dimension: coherence, Average Prediction Score: 3.41025641025641, Skipped Scores: 9
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.4153 | 0.4092 | 0.4065 |
| 0.614 | 0.5931 | 0.596 |
+---------+----------+---------+
Filename: .\results\gemini\short_con.json, Dimension: consistency, Average Prediction Score: 3.3333333333333335
Filename: .\results\gemini\short_con.json, Dimension: consistency, Average Prediction Score: 3.6, Skipped Scores: 1
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.9735 | 0.9809 | 0.9738 |
| 0.9794 | 0.9851 | 0.9796 |
+---------+----------+---------+
Filename: .\results\gemini\short_rel.json, Dimension: relevance, Average Prediction Score: 3.8333333333333335
Filename: .\results\gemini\short_rel.json, Dimension: relevance, Average Prediction Score: 3.8292682926829267, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.5161 | 0.5083 | 0.503 |
| 0.5082 | 0.5004 | 0.4906 |
+---------+----------+---------+
\ No newline at end of file
Filename: .\results\llama\gemini_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.0454545454545454, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.6394 | 0.626 | 0.6219 |
+---------+----------+---------+
Filename: .\results\llama\gemini_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.7142857142857144, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7144 | 0.7222 | 0.7222 |
+---------+----------+---------+
Filename: .\results\llama\gemini_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.675, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7344 | 0.7322 | 0.7299 |
+---------+----------+---------+
Filename: .\results\llama\llama_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.088888888888889, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7767 | 0.7727 | 0.7656 |
+---------+----------+---------+
Filename: .\results\llama\llama_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.7, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7849 | 0.7927 | 0.7982 |
+---------+----------+---------+
Filename: .\results\llama\llama_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.6315789473684212, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.5 | 0.4839 | 0.4853 |
+---------+----------+---------+
Filename: .\results\llama\qwen_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.0833333333333335, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.6922 | 0.6791 | 0.6651 |
+---------+----------+---------+
Filename: .\results\llama\qwen_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.5238095238095237, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7571 | 0.7629 | 0.7574 |
+---------+----------+---------+
Filename: .\results\llama\qwen_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.7333333333333334, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.6286 | 0.6109 | 0.6095 |
+---------+----------+---------+
Filename: .\results\llama\scrambled_coh.json, Dimension: coherence, Average Prediction Score: 3.1777777777777776, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7154 | 0.705 | 0.6921 |
+---------+----------+---------+
Filename: .\results\llama\scrambled_con.json, Dimension: consistency, Average Prediction Score: 3.5, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.8255 | 0.8333 | 0.8333 |
+---------+----------+---------+
Filename: .\results\llama\scrambled_rel.json, Dimension: relevance, Average Prediction Score: 3.8666666666666667, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.5328 | 0.539 | 0.5402 |
+---------+----------+---------+
Filename: .\results\llama\short_coh.json, Dimension: coherence, Average Prediction Score: 3.475, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.697 | 0.6901 | 0.6745 |
+---------+----------+---------+
Filename: .\results\llama\short_con.json, Dimension: consistency, Average Prediction Score: 3.857142857142857, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.8255 | 0.8333 | 0.8333 |
+---------+----------+---------+
Filename: .\results\llama\short_rel.json, Dimension: relevance, Average Prediction Score: 4.038461538461538, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.4965 | 0.4944 | 0.4976 |
+---------+----------+---------+
\ No newline at end of file
......@@ -22,7 +22,7 @@ sample_summeval = [summeval[i] for i in sample_indices]
ct, ignore = 0, 0
for file in os.listdir(prompt_dir):
for file in os.listdir(prompt_dir)[12:]:
if file.endswith(".txt"):
prompt_fp = os.path.join(prompt_dir, file)
output_fp = os.path.join(output_dir, file.replace(".txt", ".json"))
......@@ -41,7 +41,7 @@ for file in os.listdir(prompt_dir):
contents=cur_prompt,
config=types.GenerateContentConfig(
temperature=2,
max_output_tokens=5,
max_output_tokens=5, # Should have been slightly higher to accommodate invalid model responses
top_p=1,
frequency_penalty=0,
presence_penalty=0,
......
from groq import Groq
from openai import OpenAI
import random
import os
import json
import tqdm
import time
client = Groq(api_key="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")
client = OpenAI(
api_key=os.environ["HYPERBOLIC_API_KEY"],
base_url="https://api.hyperbolic.xyz/v1",
)
prompt_dir = 'prompts\\cot_analysis'
output_dir = 'results\\llama'
summeval_dir = 'data\\summeval.json'
model = 'llama3-70b-8192'
model = 'meta-llama/Meta-Llama-3-70B-Instruct'
os.makedirs(output_dir, exist_ok=True)
summeval = json.load(open(summeval_dir))
......@@ -21,7 +25,7 @@ sample_summeval = [summeval[i] for i in sample_indices]
ct, ignore = 0, 0
for file in os.listdir(prompt_dir):
for file in os.listdir(prompt_dir)[12:]:
if file.endswith(".txt"):
prompt_fp = os.path.join(prompt_dir, file)
output_fp = os.path.join(output_dir, file.replace(".txt", ".json"))
......@@ -38,8 +42,10 @@ for file in os.listdir(prompt_dir):
try:
_response = client.chat.completions.create(
model=model,
messages=[{"role": "system", "content": "You are a helpful assistant that compares the Source Text and Summary based on the prompt given. You only give numerical values as response, based on the Evalation Steps given, and do not respond with text."},
{"role": "user", "content": cur_prompt}],
messages=[
{"role": "system", "content": "You are a helpful assistant that compares the Source Text and Summary based on the prompt given. You only give numerical values as response, based on the Evalation Steps given, and do not respond with text."},
{"role": "user", "content": cur_prompt}
],
temperature=0,
max_completion_tokens=10,
top_p=1,
......@@ -47,7 +53,7 @@ for file in os.listdir(prompt_dir):
presence_penalty=0,
stop=None,
#logprobs=True, Not yet supported
#n=20 Currently only n=1 supported
#n=20 Possible to estimate logprobs via high temperature and n
)
time.sleep(0.5)
......
......@@ -9,7 +9,7 @@ from glob import glob
def calculate_correlation(pred_score, human_score, result):
assert len(pred_score) == len(human_score)
#Part of original code, not sure how to fix without breaking it
# NOTE: throws an error when result is None, because len(result) is evaluated before the None check. Part of the original code; not sure how to fix it without breaking it.
if (len(result) == 0) or result is None:
result = {'pearson': 0, 'spearman': 0, 'kendalltau': 0}
result['pearson'] += pearsonr(pred_score, human_score)[0]
......@@ -18,7 +18,7 @@ def calculate_correlation(pred_score, human_score, result):
return result
#Very problematic way of doing this, but wasn't sure how else to do it (alternative would be to change the filenames to name the dimensions directly)
# Problematic approach, but I wasn't sure how else to do it (the alternative is to rename the files so that they name the dimensions directly)
def dimension_from_filename(filename):
#Asked ChatGPT for this Regex
......@@ -37,24 +37,21 @@ def dimension_from_filename(filename):
else:
return None
def print_correlations(result, n, input_file, dimension, avg_pred_score, output_file):
def print_correlations(result, n, input_file, dimension, avg_pred_score, skipped_scores, output_file):
table = PrettyTable(['Pearson', 'Spearman', 'Kendall'])
if (n == 0):
n = 1
table.add_row(
[round(result['pearson'] / n, 4), round(result['spearman'] / n, 4), round(result['kendalltau'] / n, 4)])
output_file.write(f"\nFilename: {input_file}, Dimension: {dimension}, Average Prediction Score: {avg_pred_score}\n")
output_file.write(f"\nFilename: {input_file}, Dimension: {dimension}, Average Prediction Score: {avg_pred_score}, Skipped Scores: {skipped_scores}\n")
output_file.write(str(table))
def parse_output(output):
#Asked ChatGPT for this Regex (changed it to capture model responses that do not start with a numerical value)
matched = re.search(r"([0-9]+(?:\.[0-9]*)?)", output)
#Asked ChatGPT for this Regex (changed it from original code to capture model responses that do not start with a numerical value)
matched = re.search(r"([1-5]+(?:\.[1-5]*)?)", output)
if matched:
try:
score = float(matched.group(1))
except:
score = 0
score = float(matched.group(1))
else:
score = 0
return score
......@@ -115,6 +112,6 @@ if __name__ == '__main__':
d_ctr += 1
avg_pred_score = total_pred_score / scores_count
print_correlations(results, n=d_ctr, input_file = input_file, dimension = dimension, avg_pred_score = avg_pred_score, output_file = output_file)
print_correlations(results, n=d_ctr, input_file = input_file, dimension = dimension, avg_pred_score = avg_pred_score, skipped_scores = skipped_scores, output_file = output_file)
else:
print(f"Skipping file: {input_file}")
\ No newline at end of file
......@@ -10,11 +10,11 @@ Coherence (1-5) - the collective quality of all sentences. We align this dimensi
Evaluation Steps:
1. Read.
2. Assess quality.
3. Determine if well-structured and well-organized.
4. Decide if it forms a coherent body.
5. Rate on scale of 1-5 for Coherence.
1. read
2. assess quality
3. determine if structured/organized
4. decide if coherent body
5. rate on scale 1-5 for Coherence
Example:
......
......@@ -10,10 +10,10 @@ Consistency (1-5) - the factual alignment between the summary and the summarized
Evaluation Steps:
1. Read news article.
2. Read summary.
3. Identify inconsistencies/hallucinations.
4. Rate on scale of 1-5 for Consistency.
1. read news article
2. read summary
3. identify inconsistencies/hallucinations
4. rate on scale 1-5 for Consistency
Example:
......
......@@ -10,11 +10,11 @@ Relevance (1-5) - selection of important content from the source. The summary sh
Evaluation Steps:
1. Read.
2. Identify important information.
3. Determine if only important information included.
4. Check redundancies/excess information.
5. Rate on scale of 1-5 for Relevance.
1. read
2. identify important information
3. determine if only important information included
4. check redundancies/excess information
5. rate on scale 1-5 for Relevance
Example:
......
......@@ -5,7 +5,7 @@ import json
import time
import tqdm
client = Groq(api_key="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")
client = Groq(api_key=os.environ["GROQ_API_KEY"])
prompt_dir = 'prompts\\cot_analysis'
output_dir = 'results\\qwen'
......@@ -20,8 +20,6 @@ sample_indices = sorted(random.sample(range(len(summeval)), 100))
sample_summeval = [summeval[i] for i in sample_indices]
for file in os.listdir(prompt_dir):
if file.startswith(("gemini", "llama", "qwen")):
continue
if file.endswith(".txt"):
prompt_fp = os.path.join(prompt_dir, file)
output_fp = os.path.join(output_dir, file.replace(".txt", ".json"))
......@@ -41,7 +39,7 @@ for file in os.listdir(prompt_dir):
model=model,
messages=[{"role": "user", "content": cur_prompt}],
temperature=0,
max_completion_tokens=5,
max_completion_tokens=10,
top_p=1,
frequency_penalty=0,
presence_penalty=0,
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.