Commit 4628d31d authored by lkim's avatar lkim

Rework Qwen requests

parent b4cebef1
MIT License
Copyright (c) 2024 Yang Liu
Copyright (c) 2025 Long Kim
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......
......
@@ -60,7 +60,7 @@ def call_llama(prompt):
return f"Hyperbolic API error: {e}"
if __name__ == "__main__":
input_filename = 'prompts\\cot_analysis\\base_prompts.txt'
input_filename = 'prompts\\summeval\\base_prompts.txt'
output_folder = os.path.dirname(input_filename)
detailed_prompts = os.path.join(output_folder, "detailed_prompts.txt")
prompts = read_prompts_from_file(input_filename, delimiter="\n\n\n")
......
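The `read_prompts_from_file` helper called above is not shown in this hunk; a minimal sketch of what it plausibly does, assuming it simply splits the file contents on the given delimiter:

```python
def read_prompts_from_file(filename, delimiter="\n\n\n"):
    """Read a prompt file and split it into individual prompts on the delimiter."""
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()
    # Drop empty chunks left by trailing delimiters or extra blank lines.
    return [p.strip() for p in text.split(delimiter) if p.strip()]
```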
Filename: .\results\qwen\gemini_coh_detailed.json, Dimension: coherence, Average Prediction Score: 2.774193548387097, Skipped Scores: 17
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.4145 | 0.419 | 0.4155 |
+---------+----------+---------+
Filename: .\results\qwen\gemini_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.642857142857143, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.8275 | 0.8333 | 0.8333 |
+---------+----------+---------+
Filename: .\results\qwen\gemini_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.0, Skipped Scores: 17
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.8011 | 0.8023 | 0.7952 |
+---------+----------+---------+
Filename: .\results\qwen\llama_coh_detailed.json, Dimension: coherence, Average Prediction Score: 2.6285714285714286, Skipped Scores: 2
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.8298 | 0.8081 | 0.8018 |
+---------+----------+---------+
Filename: .\results\qwen\llama_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.3214285714285716, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7466 | 0.75 | 0.75 |
+---------+----------+---------+
Filename: .\results\qwen\llama_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.0, Skipped Scores: 3
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.6837 | 0.6911 | 0.6878 |
+---------+----------+---------+
Filename: .\results\qwen\qwen_coh_detailed.json, Dimension: coherence, Average Prediction Score: 2.4, Skipped Scores: 1
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.6534 | 0.6298 | 0.6272 |
+---------+----------+---------+
Filename: .\results\qwen\qwen_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.6153846153846154, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.851 | 0.8515 | 0.847 |
+---------+----------+---------+
Filename: .\results\qwen\qwen_rel_detailed.json, Dimension: relevance, Average Prediction Score: 2.925, Skipped Scores: 4
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.806 | 0.794 | 0.7861 |
+---------+----------+---------+
Filename: .\results\qwen\scrambled_coh.json, Dimension: coherence, Average Prediction Score: 2.4583333333333335, Skipped Scores: 14
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7428 | 0.7313 | 0.7225 |
+---------+----------+---------+
Filename: .\results\qwen\scrambled_con.json, Dimension: consistency, Average Prediction Score: 3.8333333333333335, Skipped Scores: 13
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.6231 | 0.6237 | 0.6166 |
+---------+----------+---------+
Filename: .\results\qwen\scrambled_rel.json, Dimension: relevance, Average Prediction Score: 2.772727272727273, Skipped Scores: 17
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.9355 | 0.9296 | 0.9241 |
+---------+----------+---------+
Filename: .\results\qwen\short_coh.json, Dimension: coherence, Average Prediction Score: 2.710526315789474, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.5334 | 0.5004 | 0.49 |
+---------+----------+---------+
Filename: .\results\qwen\short_con.json, Dimension: consistency, Average Prediction Score: 3.48, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.8023 | 0.806 | 0.8015 |
+---------+----------+---------+
Filename: .\results\qwen\short_rel.json, Dimension: relevance, Average Prediction Score: 2.9285714285714284, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.8288 | 0.8115 | 0.81 |
+---------+----------+---------+
\ No newline at end of file
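Each "Filename / Dimension" block above pairs model predictions with SummEval human ratings. A minimal sketch of how one such row could be recomputed with scipy, assuming each results JSON stores a numeric `model_response` and a per-dimension human score (the field names are assumptions, not confirmed by this diff):

```python
import json
from scipy.stats import kendalltau, pearsonr, spearmanr

def score_file(result_fp, dimension):
    """Recompute one summary row: average prediction, skipped count,
    and Pearson/Spearman/Kendall correlations against human scores."""
    with open(result_fp, "r", encoding="utf-8") as f:
        instances = json.load(f)

    preds, humans, skipped = [], [], 0
    for inst in instances:
        try:
            # Assumed field names; non-numeric model responses count as skipped.
            pred = float(inst["model_response"].strip())
            human = float(inst["scores"][dimension])
        except (KeyError, ValueError, AttributeError):
            skipped += 1
            continue
        preds.append(pred)
        humans.append(human)

    pearson, _ = pearsonr(preds, humans)
    spearman, _ = spearmanr(preds, humans)
    kendall, _ = kendalltau(preds, humans)
    return {
        "average_prediction": sum(preds) / len(preds),
        "skipped": skipped,
        "pearson": pearson,
        "spearman": spearman,
        "kendall": kendall,
    }
```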
......
@@ -20,14 +20,14 @@ random.seed(42)
sample_indices = sorted(random.sample(range(len(summeval)), 100))
sample_summeval = [summeval[i] for i in sample_indices]
ct, ignore = 0, 0
for file in os.listdir(prompt_dir)[12:]:
for file in os.listdir(prompt_dir):
if file.endswith(".txt"):
prompt_fp = os.path.join(prompt_dir, file)
output_fp = os.path.join(output_dir, file.replace(".txt", ".json"))
prompt = open(prompt_fp, "r").read()
ct, ignore = 0, 0
new_json = []
for instance in tqdm.tqdm(sample_summeval):
source = instance['source']
......
@@ -58,7 +58,7 @@ for file in os.listdir(prompt_dir)[12:]:
if ct >= 15: # Avoid rate-limit errors (the API allows 15 requests per minute)
time.sleep(60)
ct=0
break
except Exception as e:
print(e)
......
@@ -67,7 +67,6 @@ for file in os.listdir(prompt_dir)[12:]:
else:
ignore += 1
print('ignored', ignore)
break
print('ignored total', ignore)
......
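The `ct >= 15` check above paces requests to stay under the 15-requests-per-minute quota. A standalone sketch of the same pacing pattern, where `call_model` is a hypothetical placeholder for the actual API call:

```python
import time

def run_rate_limited(prompts, call_model, per_minute=15):
    """Pace requests as in the loop above: after `per_minute` calls,
    sleep 60 seconds so the next batch starts in a fresh minute."""
    responses, sent = [], 0
    for p in prompts:
        if sent >= per_minute:
            time.sleep(60)  # wait out the current minute before continuing
            sent = 0
        responses.append(call_model(p))  # placeholder callable, not part of the repo
        sent += 1
    return responses
```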
from groq import Groq
from openai import OpenAI
import random
import os
......
@@ -6,10 +5,7 @@ import json
import tqdm
import time
client = OpenAI(
api_key=os.environ["HYPERBOLIC_API_KEY"],
base_url="https://api.hyperbolic.xyz/v1",
)
client = OpenAI(api_key=os.environ["HYPERBOLIC_API_KEY"], base_url="https://api.hyperbolic.xyz/v1",)
prompt_dir = 'prompts\\cot_analysis'
output_dir = 'results\\llama'
......
@@ -23,27 +19,27 @@ random.seed(42)
sample_indices = sorted(random.sample(range(len(summeval)), 100))
sample_summeval = [summeval[i] for i in sample_indices]
ct, ignore = 0, 0
for file in os.listdir(prompt_dir)[12:]:
for file in os.listdir(prompt_dir):
if file.endswith(".txt"):
prompt_fp = os.path.join(prompt_dir, file)
output_fp = os.path.join(output_dir, file.replace(".txt", ".json"))
prompt = open(prompt_fp, "r").read()
ct, ignore = 0, 0
new_json = []
for instance in tqdm.tqdm(sample_summeval):
source = instance['source']
system_output = instance['system_output']
cur_prompt = prompt.replace('{{Document}}', source).replace('{{Summary}}', system_output)
instance['prompt'] = cur_prompt
while True:
try:
_response = client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": "You are a helpful assistant that compares the Source Text and Summary based on the prompt given. You only give numerical values as response, based on the Evalation Steps given, and do not respond with text."},
{"role": "system", "content": "You are a helpful assistant that compares the Source Text and Summary based on the prompt given. "
"You only give numerical values as response, based on the Evalation Steps given, and do not respond with text."},
{"role": "user", "content": cur_prompt}
],
temperature=0,
......
@@ -60,7 +56,6 @@ for file in os.listdir(prompt_dir)[12:]:
instance['model_response'] = _response.choices[0].message.content
new_json.append(instance)
ct += 1
break
except Exception as e:
print(e)
......
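For readability, a condensed sketch of the request loop this file now uses, assuming an OpenAI-compatible Hyperbolic endpoint; the model name and instance field names are illustrative assumptions rather than confirmed values from the repository:

```python
import os
from openai import OpenAI

# Hypothetical stand-in for whatever `model` is set to elsewhere in the script.
MODEL = "meta-llama/Meta-Llama-3.1-70B-Instruct"

client = OpenAI(
    api_key=os.environ["HYPERBOLIC_API_KEY"],
    base_url="https://api.hyperbolic.xyz/v1",
)

def score_instance(prompt_template, instance):
    """Fill the prompt template and retry the request until it succeeds,
    mirroring the while/try/except loop in the diff above."""
    cur_prompt = (prompt_template
                  .replace("{{Document}}", instance["source"])
                  .replace("{{Summary}}", instance["system_output"]))
    while True:
        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": "You only give numerical values as a response."},
                    {"role": "user", "content": cur_prompt},
                ],
                temperature=0,
            )
            return response.choices[0].message.content
        except Exception as e:
            print(e)  # transient API errors: loop and retry
```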
(The remaining file diffs in this commit are collapsed or too large to display; view the blobs for the full changes.)