Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • lkim/analysis-of-chain-of-thought-in-llm-scoring
1 result
Show changes
Commits on Source (2)
Showing
with 7006 additions and 475 deletions
...@@ -14,7 +14,7 @@ Sample dataset used for CoT analysis ...@@ -14,7 +14,7 @@ Sample dataset used for CoT analysis
## Prompts and Evaluation Results ## Prompts and Evaluation Results
Prompts used to evaluate SummEval with GPT-4 are in prompts/summeval (by G-Eval paper) Prompts used to evaluate SummEval with GPT-4 & base and detailed prompts for CoT analysis are in prompts/summeval (by G-Eval paper)
Auto-CoT prompts are in prompts/cot_analysis Auto-CoT prompts are in prompts/cot_analysis
GPT-4 G-eval results on SummEval are in results (by G-Eval paper) GPT-4 G-eval results on SummEval are in results (by G-Eval paper)
......
from google import genai from google import genai
from google.genai import types from google.genai import types
from groq import Groq from groq import Groq
from openai import OpenAI
import os import os
api_key = "REDACTED_GROQ_API_KEY"
def read_prompts_from_file(filename, delimiter): def read_prompts_from_file(filename, delimiter):
"""Reads a file and splits the content into prompts based on the delimiter.""" """Reads a file and splits the content into prompts based on the delimiter."""
with open(filename, "r", encoding="utf-8") as file: with open(filename, "r", encoding="utf-8") as file:
...@@ -30,7 +29,7 @@ def call_gemini(prompt): ...@@ -30,7 +29,7 @@ def call_gemini(prompt):
def call_Qwen(prompt): def call_Qwen(prompt):
"""Call Qwen via Groq and return the response.""" """Call Qwen via Groq and return the response."""
try: try:
client = Groq(api_key= api_key)#os.environ.get("GROQ_API_KEY")) client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
response = client.chat.completions.create( response = client.chat.completions.create(
model="qwen-2.5-32b", model="qwen-2.5-32b",
...@@ -45,12 +44,12 @@ def call_Qwen(prompt): ...@@ -45,12 +44,12 @@ def call_Qwen(prompt):
return f"Groq API error: {e}" return f"Groq API error: {e}"
def call_llama(prompt): def call_llama(prompt):
"""Call Llama via Groq and return the response.""" """Call Llama via Hyperbolic and return the response."""
try: try:
client = Groq(api_key= api_key)#os.environ.get("GROQ_API_KEY")) client = OpenAI(api_key=os.environ.get("HYPERBOLIC_API_KEY"), base_url="https://api.hyperbolic.xyz/v1")
response = client.chat.completions.create( response = client.chat.completions.create(
model="llama3-70b-8192", model="meta-llama/Meta-Llama-3-70B-Instruct",
messages=[{"role": "user", "content": prompt}], messages=[{"role": "user", "content": prompt}],
temperature=0.0, temperature=0.0,
) )
...@@ -58,7 +57,7 @@ def call_llama(prompt): ...@@ -58,7 +57,7 @@ def call_llama(prompt):
return response.choices[0].message.content return response.choices[0].message.content
except Exception as e: except Exception as e:
return f"Groq API error: {e}" return f"Hyperbolic API error: {e}"
if __name__ == "__main__": if __name__ == "__main__":
input_filename = 'prompts\\cot_analysis\\base_prompts.txt' input_filename = 'prompts\\cot_analysis\\base_prompts.txt'
......
Filename: .\results\gemini\gemini_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.3529411764705883 Filename: .\results\gemini\gemini_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.3529411764705883, Skipped Scores: 5
+---------+----------+---------+ +---------+----------+---------+
| Pearson | Spearman | Kendall | | Pearson | Spearman | Kendall |
+---------+----------+---------+ +---------+----------+---------+
| 0.6866 | 0.6713 | 0.6578 | | 0.6866 | 0.6713 | 0.6578 |
+---------+----------+---------+ +---------+----------+---------+
Filename: .\results\gemini\gemini_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.5 Filename: .\results\gemini\gemini_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.5, Skipped Scores: 1
+---------+----------+---------+ +---------+----------+---------+
| Pearson | Spearman | Kendall | | Pearson | Spearman | Kendall |
+---------+----------+---------+ +---------+----------+---------+
| 0.9922 | 1.0 | 1.0 | | 0.9922 | 1.0 | 1.0 |
+---------+----------+---------+ +---------+----------+---------+
Filename: .\results\gemini\gemini_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.8596491228070176 Filename: .\results\gemini\gemini_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.8596491228070176, Skipped Scores: 1
+---------+----------+---------+ +---------+----------+---------+
| Pearson | Spearman | Kendall | | Pearson | Spearman | Kendall |
+---------+----------+---------+ +---------+----------+---------+
| 0.6381 | 0.6364 | 0.6267 | | 0.6381 | 0.6364 | 0.6267 |
+---------+----------+---------+ +---------+----------+---------+
Filename: .\results\gemini\llama_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.466666666666667 Filename: .\results\gemini\llama_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.466666666666667, Skipped Scores: 24
+---------+----------+---------+ +---------+----------+---------+
| Pearson | Spearman | Kendall | | Pearson | Spearman | Kendall |
+---------+----------+---------+ +---------+----------+---------+
| 0.5714 | 0.5714 | 0.5714 | | 0.5714 | 0.5714 | 0.5714 |
+---------+----------+---------+ +---------+----------+---------+
Filename: .\results\gemini\llama_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.5 Filename: .\results\gemini\llama_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.5, Skipped Scores: 0
+---------+----------+---------+ +---------+----------+---------+
| Pearson | Spearman | Kendall | | Pearson | Spearman | Kendall |
+---------+----------+---------+ +---------+----------+---------+
| 1.0 | 1.0 | 1.0 | | 1.0 | 1.0 | 1.0 |
+---------+----------+---------+ +---------+----------+---------+
Filename: .\results\gemini\llama_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.85 Filename: .\results\gemini\llama_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.85, Skipped Scores: 0
+---------+----------+---------+ +---------+----------+---------+
| Pearson | Spearman | Kendall | | Pearson | Spearman | Kendall |
+---------+----------+---------+ +---------+----------+---------+
| 0.5857 | 0.575 | 0.5612 | | 0.5857 | 0.575 | 0.5612 |
+---------+----------+---------+ +---------+----------+---------+
Filename: .\results\gemini\qwen_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.3333333333333335 Filename: .\results\gemini\qwen_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.3333333333333335, Skipped Scores: 16
+---------+----------+---------+ +---------+----------+---------+
| Pearson | Spearman | Kendall | | Pearson | Spearman | Kendall |
+---------+----------+---------+ +---------+----------+---------+
| 0.6275 | 0.6065 | 0.5855 | | 0.6275 | 0.6065 | 0.5855 |
+---------+----------+---------+ +---------+----------+---------+
Filename: .\results\gemini\qwen_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.823529411764706 Filename: .\results\gemini\qwen_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.823529411764706, Skipped Scores: 0
+---------+----------+---------+ +---------+----------+---------+
| Pearson | Spearman | Kendall | | Pearson | Spearman | Kendall |
+---------+----------+---------+ +---------+----------+---------+
| 0.9768 | 0.9833 | 0.9771 | | 0.9768 | 0.9833 | 0.9771 |
+---------+----------+---------+ +---------+----------+---------+
Filename: .\results\gemini\qwen_rel_detailed.json, Dimension: relevance, Average Prediction Score: 4.022222222222222 Filename: .\results\gemini\qwen_rel_detailed.json, Dimension: relevance, Average Prediction Score: 4.022222222222222, Skipped Scores: 0
+---------+----------+---------+ +---------+----------+---------+
| Pearson | Spearman | Kendall | | Pearson | Spearman | Kendall |
+---------+----------+---------+ +---------+----------+---------+
| 0.6407 | 0.6244 | 0.6148 | | 0.6407 | 0.6244 | 0.6148 |
+---------+----------+---------+ +---------+----------+---------+
Filename: .\results\gemini\scrambled_coh.json, Dimension: coherence, Average Prediction Score: 3.24 Filename: .\results\gemini\scrambled_coh.json, Dimension: coherence, Average Prediction Score: 3.24, Skipped Scores: 16
+---------+----------+---------+ +---------+----------+---------+
| Pearson | Spearman | Kendall | | Pearson | Spearman | Kendall |
+---------+----------+---------+ +---------+----------+---------+
| 0.7879 | 0.7855 | 0.7749 | | 0.7879 | 0.7855 | 0.7749 |
+---------+----------+---------+ +---------+----------+---------+
Filename: .\results\gemini\scrambled_con.json, Dimension: consistency, Average Prediction Score: 3.5384615384615383 Filename: .\results\gemini\scrambled_con.json, Dimension: consistency, Average Prediction Score: 3.5384615384615383, Skipped Scores: 0
+---------+----------+---------+ +---------+----------+---------+
| Pearson | Spearman | Kendall | | Pearson | Spearman | Kendall |
+---------+----------+---------+ +---------+----------+---------+
| 0.969 | 0.9777 | 0.9694 | | 0.969 | 0.9777 | 0.9694 |
+---------+----------+---------+ +---------+----------+---------+
Filename: .\results\gemini\scrambled_rel.json, Dimension: relevance, Average Prediction Score: 3.95 Filename: .\results\gemini\scrambled_rel.json, Dimension: relevance, Average Prediction Score: 3.95, Skipped Scores: 0
+---------+----------+---------+ +---------+----------+---------+
| Pearson | Spearman | Kendall | | Pearson | Spearman | Kendall |
+---------+----------+---------+ +---------+----------+---------+
| 0.7872 | 0.773 | 0.7661 | | 0.7872 | 0.773 | 0.7661 |
+---------+----------+---------+ +---------+----------+---------+
Filename: .\results\gemini\short_coh.json, Dimension: coherence, Average Prediction Score: 3.5952380952380953 Filename: .\results\gemini\short_coh.json, Dimension: coherence, Average Prediction Score: 3.41025641025641, Skipped Scores: 9
+---------+----------+---------+ +---------+----------+---------+
| Pearson | Spearman | Kendall | | Pearson | Spearman | Kendall |
+---------+----------+---------+ +---------+----------+---------+
| 0.4153 | 0.4092 | 0.4065 | | 0.614 | 0.5931 | 0.596 |
+---------+----------+---------+ +---------+----------+---------+
Filename: .\results\gemini\short_con.json, Dimension: consistency, Average Prediction Score: 3.3333333333333335 Filename: .\results\gemini\short_con.json, Dimension: consistency, Average Prediction Score: 3.6, Skipped Scores: 1
+---------+----------+---------+ +---------+----------+---------+
| Pearson | Spearman | Kendall | | Pearson | Spearman | Kendall |
+---------+----------+---------+ +---------+----------+---------+
| 0.9735 | 0.9809 | 0.9738 | | 0.9794 | 0.9851 | 0.9796 |
+---------+----------+---------+ +---------+----------+---------+
Filename: .\results\gemini\short_rel.json, Dimension: relevance, Average Prediction Score: 3.8333333333333335 Filename: .\results\gemini\short_rel.json, Dimension: relevance, Average Prediction Score: 3.8292682926829267, Skipped Scores: 0
+---------+----------+---------+ +---------+----------+---------+
| Pearson | Spearman | Kendall | | Pearson | Spearman | Kendall |
+---------+----------+---------+ +---------+----------+---------+
| 0.5161 | 0.5083 | 0.503 | | 0.5082 | 0.5004 | 0.4906 |
+---------+----------+---------+ +---------+----------+---------+
\ No newline at end of file
Filename: .\results\llama\gemini_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.0454545454545454, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.6394 | 0.626 | 0.6219 |
+---------+----------+---------+
Filename: .\results\llama\gemini_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.7142857142857144, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7144 | 0.7222 | 0.7222 |
+---------+----------+---------+
Filename: .\results\llama\gemini_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.675, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7344 | 0.7322 | 0.7299 |
+---------+----------+---------+
Filename: .\results\llama\llama_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.088888888888889, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7767 | 0.7727 | 0.7656 |
+---------+----------+---------+
Filename: .\results\llama\llama_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.7, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7849 | 0.7927 | 0.7982 |
+---------+----------+---------+
Filename: .\results\llama\llama_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.6315789473684212, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.5 | 0.4839 | 0.4853 |
+---------+----------+---------+
Filename: .\results\llama\qwen_coh_detailed.json, Dimension: coherence, Average Prediction Score: 3.0833333333333335, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.6922 | 0.6791 | 0.6651 |
+---------+----------+---------+
Filename: .\results\llama\qwen_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.5238095238095237, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7571 | 0.7629 | 0.7574 |
+---------+----------+---------+
Filename: .\results\llama\qwen_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.7333333333333334, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.6286 | 0.6109 | 0.6095 |
+---------+----------+---------+
Filename: .\results\llama\scrambled_coh.json, Dimension: coherence, Average Prediction Score: 3.1777777777777776, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7154 | 0.705 | 0.6921 |
+---------+----------+---------+
Filename: .\results\llama\scrambled_con.json, Dimension: consistency, Average Prediction Score: 3.5, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.8255 | 0.8333 | 0.8333 |
+---------+----------+---------+
Filename: .\results\llama\scrambled_rel.json, Dimension: relevance, Average Prediction Score: 3.8666666666666667, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.5328 | 0.539 | 0.5402 |
+---------+----------+---------+
Filename: .\results\llama\short_coh.json, Dimension: coherence, Average Prediction Score: 3.475, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.697 | 0.6901 | 0.6745 |
+---------+----------+---------+
Filename: .\results\llama\short_con.json, Dimension: consistency, Average Prediction Score: 3.857142857142857, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.8255 | 0.8333 | 0.8333 |
+---------+----------+---------+
Filename: .\results\llama\short_rel.json, Dimension: relevance, Average Prediction Score: 4.038461538461538, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.4965 | 0.4944 | 0.4976 |
+---------+----------+---------+
\ No newline at end of file
...@@ -22,7 +22,7 @@ sample_summeval = [summeval[i] for i in sample_indices] ...@@ -22,7 +22,7 @@ sample_summeval = [summeval[i] for i in sample_indices]
ct, ignore = 0, 0 ct, ignore = 0, 0
for file in os.listdir(prompt_dir): for file in os.listdir(prompt_dir)[12:]:
if file.endswith(".txt"): if file.endswith(".txt"):
prompt_fp = os.path.join(prompt_dir, file) prompt_fp = os.path.join(prompt_dir, file)
output_fp = os.path.join(output_dir, file.replace(".txt", ".json")) output_fp = os.path.join(output_dir, file.replace(".txt", ".json"))
...@@ -41,7 +41,7 @@ for file in os.listdir(prompt_dir): ...@@ -41,7 +41,7 @@ for file in os.listdir(prompt_dir):
contents=cur_prompt, contents=cur_prompt,
config=types.GenerateContentConfig( config=types.GenerateContentConfig(
temperature=2, temperature=2,
max_output_tokens=5, max_output_tokens=5, #Should have been slightly higher to accommodate invalid model responses
top_p=1, top_p=1,
frequency_penalty=0, frequency_penalty=0,
presence_penalty=0, presence_penalty=0,
......
from groq import Groq from groq import Groq
from openai import OpenAI
import random import random
import os import os
import json import json
import tqdm import tqdm
import time import time
client = Groq(api_key="REDACTED_GROQ_API_KEY") client = OpenAI(
api_key=os.environ["HYPERBOLIC_API_KEY"],
base_url="https://api.hyperbolic.xyz/v1",
)
prompt_dir = 'prompts\\cot_analysis' prompt_dir = 'prompts\\cot_analysis'
output_dir = 'results\\llama' output_dir = 'results\\llama'
summeval_dir = 'data\\summeval.json' summeval_dir = 'data\\summeval.json'
model = 'llama3-70b-8192' model = 'meta-llama/Meta-Llama-3-70B-Instruct'
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
summeval = json.load(open(summeval_dir)) summeval = json.load(open(summeval_dir))
...@@ -21,7 +25,7 @@ sample_summeval = [summeval[i] for i in sample_indices] ...@@ -21,7 +25,7 @@ sample_summeval = [summeval[i] for i in sample_indices]
ct, ignore = 0, 0 ct, ignore = 0, 0
for file in os.listdir(prompt_dir): for file in os.listdir(prompt_dir)[12:]:
if file.endswith(".txt"): if file.endswith(".txt"):
prompt_fp = os.path.join(prompt_dir, file) prompt_fp = os.path.join(prompt_dir, file)
output_fp = os.path.join(output_dir, file.replace(".txt", ".json")) output_fp = os.path.join(output_dir, file.replace(".txt", ".json"))
...@@ -38,8 +42,10 @@ for file in os.listdir(prompt_dir): ...@@ -38,8 +42,10 @@ for file in os.listdir(prompt_dir):
try: try:
_response = client.chat.completions.create( _response = client.chat.completions.create(
model=model, model=model,
messages=[{"role": "system", "content": "You are a helpful assistant that compares the Source Text and Summary based on the prompt given. You only give numerical values as response, based on the Evalation Steps given, and do not respond with text."}, messages=[
{"role": "user", "content": cur_prompt}], {"role": "system", "content": "You are a helpful assistant that compares the Source Text and Summary based on the prompt given. You only give numerical values as response, based on the Evalation Steps given, and do not respond with text."},
{"role": "user", "content": cur_prompt}
],
temperature=0, temperature=0,
max_completion_tokens=10, max_completion_tokens=10,
top_p=1, top_p=1,
...@@ -47,7 +53,7 @@ for file in os.listdir(prompt_dir): ...@@ -47,7 +53,7 @@ for file in os.listdir(prompt_dir):
presence_penalty=0, presence_penalty=0,
stop=None, stop=None,
#logprobs=True, Not yet supported #logprobs=True, Not yet supported
#n=20 Currently only n=1 supported #n=20 Possible to estimate logprobs via high temperature and n
) )
time.sleep(0.5) time.sleep(0.5)
......
...@@ -9,7 +9,7 @@ from glob import glob ...@@ -9,7 +9,7 @@ from glob import glob
def calculate_correlation(pred_score, human_score, result): def calculate_correlation(pred_score, human_score, result):
assert len(pred_score) == len(human_score) assert len(pred_score) == len(human_score)
#Part of original code, not sure how to fix without breaking it #Throws TypeError when result is None (len(result) is evaluated before the None check): Part of original code, not sure how to fix without breaking it
if (len(result) == 0) or result is None: if (len(result) == 0) or result is None:
result = {'pearson': 0, 'spearman': 0, 'kendalltau': 0} result = {'pearson': 0, 'spearman': 0, 'kendalltau': 0}
result['pearson'] += pearsonr(pred_score, human_score)[0] result['pearson'] += pearsonr(pred_score, human_score)[0]
...@@ -18,7 +18,7 @@ def calculate_correlation(pred_score, human_score, result): ...@@ -18,7 +18,7 @@ def calculate_correlation(pred_score, human_score, result):
return result return result
#Very problematic way of doing this, but wasn't sure how else to do it (alternative would be to change the filenames to name the dimensions directly) #Problematic approach, but wasn't sure how else to do it (alternative is changing the filenames to name the dimensions directly)
def dimension_from_filename(filename): def dimension_from_filename(filename):
#Asked ChatGPT for this Regex #Asked ChatGPT for this Regex
...@@ -37,24 +37,21 @@ def dimension_from_filename(filename): ...@@ -37,24 +37,21 @@ def dimension_from_filename(filename):
else: else:
return None return None
def print_correlations(result, n, input_file, dimension, avg_pred_score, output_file): def print_correlations(result, n, input_file, dimension, avg_pred_score, skipped_scores, output_file):
table = PrettyTable(['Pearson', 'Spearman', 'Kendall']) table = PrettyTable(['Pearson', 'Spearman', 'Kendall'])
if (n == 0): if (n == 0):
n = 1 n = 1
table.add_row( table.add_row(
[round(result['pearson'] / n, 4), round(result['spearman'] / n, 4), round(result['kendalltau'] / n, 4)]) [round(result['pearson'] / n, 4), round(result['spearman'] / n, 4), round(result['kendalltau'] / n, 4)])
output_file.write(f"\nFilename: {input_file}, Dimension: {dimension}, Average Prediction Score: {avg_pred_score}\n") output_file.write(f"\nFilename: {input_file}, Dimension: {dimension}, Average Prediction Score: {avg_pred_score}, Skipped Scores: {skipped_scores}\n")
output_file.write(str(table)) output_file.write(str(table))
def parse_output(output): def parse_output(output):
#Asked ChatGPT for this Regex (changed it to capture model responses that do not start with a numerical value) #Asked ChatGPT for this Regex (changed it from original code to capture model responses that do not start with a numerical value)
matched = re.search(r"([0-9]+(?:\.[0-9]*)?)", output) matched = re.search(r"([1-5]+(?:\.[1-5]*)?)", output)
if matched: if matched:
try: score = float(matched.group(1))
score = float(matched.group(1))
except:
score = 0
else: else:
score = 0 score = 0
return score return score
...@@ -115,6 +112,6 @@ if __name__ == '__main__': ...@@ -115,6 +112,6 @@ if __name__ == '__main__':
d_ctr += 1 d_ctr += 1
avg_pred_score = total_pred_score / scores_count avg_pred_score = total_pred_score / scores_count
print_correlations(results, n=d_ctr, input_file = input_file, dimension = dimension, avg_pred_score = avg_pred_score, output_file = output_file) print_correlations(results, n=d_ctr, input_file = input_file, dimension = dimension, avg_pred_score = avg_pred_score, skipped_scores = skipped_scores, output_file = output_file)
else: else:
print(f"Skipping file: {input_file}") print(f"Skipping file: {input_file}")
\ No newline at end of file
...@@ -10,11 +10,11 @@ Coherence (1-5) - the collective quality of all sentences. We align this dimensi ...@@ -10,11 +10,11 @@ Coherence (1-5) - the collective quality of all sentences. We align this dimensi
Evaluation Steps: Evaluation Steps:
1. Read. 1. read
2. Assess quality. 2. assess quality
3. Determine if well-structured and well-organized. 3. determine if structured/organized
4. Decide if it forms a coherent body. 4. decide if coherent body
5. Rate on scale of 1-5 for Coherence. 5. rate on scale 1-5 for Coherence
Example: Example:
......
...@@ -10,10 +10,10 @@ Consistency (1-5) - the factual alignment between the summary and the summarized ...@@ -10,10 +10,10 @@ Consistency (1-5) - the factual alignment between the summary and the summarized
Evaluation Steps: Evaluation Steps:
1. Read news article. 1. read news article
2. Read summary. 2. read summary
3. Identify inconsistencies/hallucinations. 3. identify inconsistencies/hallucinations
4. Rate on scale of 1-5 for Consistency. 4. rate on scale 1-5 for Consistency
Example: Example:
......
...@@ -10,11 +10,11 @@ Relevance (1-5) - selection of important content from the source. The summary sh ...@@ -10,11 +10,11 @@ Relevance (1-5) - selection of important content from the source. The summary sh
Evaluation Steps: Evaluation Steps:
1. Read. 1. read
2. Identify important information. 2. identify important information
3. Determine if only important information included. 3. determine if only important information included
4. Check redundancies/excess information. 4. check redundancies/excess information
5. Rate on scale of 1-5 for Relevance. 5. rate on scale 1-5 for Relevance
Example: Example:
......
...@@ -5,7 +5,7 @@ import json ...@@ -5,7 +5,7 @@ import json
import time import time
import tqdm import tqdm
client = Groq(api_key="REDACTED_GROQ_API_KEY") client = Groq(api_key=os.environ["GROQ_API_KEY"])
prompt_dir = 'prompts\\cot_analysis' prompt_dir = 'prompts\\cot_analysis'
output_dir = 'results\\qwen' output_dir = 'results\\qwen'
...@@ -20,8 +20,6 @@ sample_indices = sorted(random.sample(range(len(summeval)), 100)) ...@@ -20,8 +20,6 @@ sample_indices = sorted(random.sample(range(len(summeval)), 100))
sample_summeval = [summeval[i] for i in sample_indices] sample_summeval = [summeval[i] for i in sample_indices]
for file in os.listdir(prompt_dir): for file in os.listdir(prompt_dir):
if file.startswith(("gemini", "llama", "qwen")):
continue
if file.endswith(".txt"): if file.endswith(".txt"):
prompt_fp = os.path.join(prompt_dir, file) prompt_fp = os.path.join(prompt_dir, file)
output_fp = os.path.join(output_dir, file.replace(".txt", ".json")) output_fp = os.path.join(output_dir, file.replace(".txt", ".json"))
...@@ -41,7 +39,7 @@ for file in os.listdir(prompt_dir): ...@@ -41,7 +39,7 @@ for file in os.listdir(prompt_dir):
model=model, model=model,
messages=[{"role": "user", "content": cur_prompt}], messages=[{"role": "user", "content": cur_prompt}],
temperature=0, temperature=0,
max_completion_tokens=5, max_completion_tokens=10,
top_p=1, top_p=1,
frequency_penalty=0, frequency_penalty=0,
presence_penalty=0, presence_penalty=0,
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.