Commit 4628d31d authored by lkim's avatar lkim

Rework Qwen requests

parent b4cebef1
MIT License
Copyright (c) 2024 Yang Liu
Copyright (c) 2025 Long Kim
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......
......
@@ -60,7 +60,7 @@ def call_llama(prompt):
return f"Hyperbolic API error: {e}"
if __name__ == "__main__":
input_filename = 'prompts\\cot_analysis\\base_prompts.txt'
input_filename = 'prompts\\summeval\\base_prompts.txt'
output_folder = os.path.dirname(input_filename)
detailed_prompts = os.path.join(output_folder, "detailed_prompts.txt")
prompts = read_prompts_from_file(input_filename, delimiter="\n\n\n")
......
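The `read_prompts_from_file` helper called above is not shown in this hunk; a minimal sketch of what it plausibly does, assuming it simply splits the file contents on the given delimiter:

```python
def read_prompts_from_file(filename, delimiter="\n\n\n"):
    """Read a prompt file and split it into individual prompts on the delimiter."""
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()
    # Drop empty chunks left by trailing delimiters or extra blank lines.
    return [p.strip() for p in text.split(delimiter) if p.strip()]
```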
Filename: .\results\qwen\gemini_coh_detailed.json, Dimension: coherence, Average Prediction Score: 2.774193548387097, Skipped Scores: 17
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.4145 | 0.419 | 0.4155 |
+---------+----------+---------+
Filename: .\results\qwen\gemini_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.642857142857143, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.8275 | 0.8333 | 0.8333 |
+---------+----------+---------+
Filename: .\results\qwen\gemini_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.0, Skipped Scores: 17
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.8011 | 0.8023 | 0.7952 |
+---------+----------+---------+
Filename: .\results\qwen\llama_coh_detailed.json, Dimension: coherence, Average Prediction Score: 2.6285714285714286, Skipped Scores: 2
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.8298 | 0.8081 | 0.8018 |
+---------+----------+---------+
Filename: .\results\qwen\llama_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.3214285714285716, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7466 | 0.75 | 0.75 |
+---------+----------+---------+
Filename: .\results\qwen\llama_rel_detailed.json, Dimension: relevance, Average Prediction Score: 3.0, Skipped Scores: 3
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.6837 | 0.6911 | 0.6878 |
+---------+----------+---------+
Filename: .\results\qwen\qwen_coh_detailed.json, Dimension: coherence, Average Prediction Score: 2.4, Skipped Scores: 1
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.6534 | 0.6298 | 0.6272 |
+---------+----------+---------+
Filename: .\results\qwen\qwen_con_detailed.json, Dimension: consistency, Average Prediction Score: 3.6153846153846154, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.851 | 0.8515 | 0.847 |
+---------+----------+---------+
Filename: .\results\qwen\qwen_rel_detailed.json, Dimension: relevance, Average Prediction Score: 2.925, Skipped Scores: 4
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.806 | 0.794 | 0.7861 |
+---------+----------+---------+
Filename: .\results\qwen\scrambled_coh.json, Dimension: coherence, Average Prediction Score: 2.4583333333333335, Skipped Scores: 14
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.7428 | 0.7313 | 0.7225 |
+---------+----------+---------+
Filename: .\results\qwen\scrambled_con.json, Dimension: consistency, Average Prediction Score: 3.8333333333333335, Skipped Scores: 13
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.6231 | 0.6237 | 0.6166 |
+---------+----------+---------+
Filename: .\results\qwen\scrambled_rel.json, Dimension: relevance, Average Prediction Score: 2.772727272727273, Skipped Scores: 17
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.9355 | 0.9296 | 0.9241 |
+---------+----------+---------+
Filename: .\results\qwen\short_coh.json, Dimension: coherence, Average Prediction Score: 2.710526315789474, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.5334 | 0.5004 | 0.49 |
+---------+----------+---------+
Filename: .\results\qwen\short_con.json, Dimension: consistency, Average Prediction Score: 3.48, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.8023 | 0.806 | 0.8015 |
+---------+----------+---------+
Filename: .\results\qwen\short_rel.json, Dimension: relevance, Average Prediction Score: 2.9285714285714284, Skipped Scores: 0
+---------+----------+---------+
| Pearson | Spearman | Kendall |
+---------+----------+---------+
| 0.8288 | 0.8115 | 0.81 |
+---------+----------+---------+
\ No newline at end of file
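Each "Filename / Dimension" block above pairs model predictions with SummEval human ratings. A minimal sketch of how one such row could be recomputed with scipy, assuming each results JSON stores a numeric `model_response` and a per-dimension human score (the field names are assumptions, not confirmed by this diff):

```python
import json
from scipy.stats import kendalltau, pearsonr, spearmanr

def score_file(result_fp, dimension):
    """Recompute one summary row: average prediction, skipped count,
    and Pearson/Spearman/Kendall correlations against human scores."""
    with open(result_fp, "r", encoding="utf-8") as f:
        instances = json.load(f)

    preds, humans, skipped = [], [], 0
    for inst in instances:
        try:
            # Assumed field names; non-numeric model responses count as skipped.
            pred = float(inst["model_response"].strip())
            human = float(inst["scores"][dimension])
        except (KeyError, ValueError, AttributeError):
            skipped += 1
            continue
        preds.append(pred)
        humans.append(human)

    pearson, _ = pearsonr(preds, humans)
    spearman, _ = spearmanr(preds, humans)
    kendall, _ = kendalltau(preds, humans)
    return {
        "average_prediction": sum(preds) / len(preds),
        "skipped": skipped,
        "pearson": pearson,
        "spearman": spearman,
        "kendall": kendall,
    }
```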
......
@@ -20,14 +20,14 @@ random.seed(42)
sample_indices = sorted(random.sample(range(len(summeval)), 100))
sample_summeval = [summeval[i] for i in sample_indices]
ct, ignore = 0, 0
for file in os.listdir(prompt_dir)[12:]:
for file in os.listdir(prompt_dir):
if file.endswith(".txt"):
prompt_fp = os.path.join(prompt_dir, file)
output_fp = os.path.join(output_dir, file.replace(".txt", ".json"))
prompt = open(prompt_fp, "r").read()
ct, ignore = 0, 0
new_json = []
for instance in tqdm.tqdm(sample_summeval):
source = instance['source']
......
@@ -58,7 +58,7 @@ for file in os.listdir(prompt_dir)[12:]:
if ct >= 15: # Avoid rate-limit errors (the API allows 15 requests per minute)
time.sleep(60)
ct=0
break
except Exception as e:
print(e)
......
@@ -67,7 +67,6 @@ for file in os.listdir(prompt_dir)[12:]:
else:
ignore += 1
print('ignored', ignore)
break
print('ignored total', ignore)
......
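The `ct >= 15` check above paces requests to stay under the 15-requests-per-minute quota. A standalone sketch of the same pacing pattern, where `call_model` is a hypothetical placeholder for the actual API call:

```python
import time

def run_rate_limited(prompts, call_model, per_minute=15):
    """Pace requests as in the loop above: after `per_minute` calls,
    sleep 60 seconds so the next batch starts in a fresh minute."""
    responses, sent = [], 0
    for p in prompts:
        if sent >= per_minute:
            time.sleep(60)  # wait out the current minute before continuing
            sent = 0
        responses.append(call_model(p))  # placeholder callable, not part of the repo
        sent += 1
    return responses
```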
from groq import Groq
from openai import OpenAI
import random
import os
......
@@ -6,10 +5,7 @@ import json
import tqdm
import time
client = OpenAI(
api_key=os.environ["HYPERBOLIC_API_KEY"],
base_url="https://api.hyperbolic.xyz/v1",
)
client = OpenAI(api_key=os.environ["HYPERBOLIC_API_KEY"], base_url="https://api.hyperbolic.xyz/v1",)
prompt_dir = 'prompts\\cot_analysis'
output_dir = 'results\\llama'
......
@@ -23,27 +19,27 @@ random.seed(42)
sample_indices = sorted(random.sample(range(len(summeval)), 100))
sample_summeval = [summeval[i] for i in sample_indices]
ct, ignore = 0, 0
for file in os.listdir(prompt_dir)[12:]:
for file in os.listdir(prompt_dir):
if file.endswith(".txt"):
prompt_fp = os.path.join(prompt_dir, file)
output_fp = os.path.join(output_dir, file.replace(".txt", ".json"))
prompt = open(prompt_fp, "r").read()
ct, ignore = 0, 0
new_json = []
for instance in tqdm.tqdm(sample_summeval):
source = instance['source']
system_output = instance['system_output']
cur_prompt = prompt.replace('{{Document}}', source).replace('{{Summary}}', system_output)
instance['prompt'] = cur_prompt
while True:
try:
_response = client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": "You are a helpful assistant that compares the Source Text and Summary based on the prompt given. You only give numerical values as response, based on the Evalation Steps given, and do not respond with text."},
{"role": "system", "content": "You are a helpful assistant that compares the Source Text and Summary based on the prompt given. "
"You only give numerical values as response, based on the Evalation Steps given, and do not respond with text."},
{"role": "user", "content": cur_prompt}
],
temperature=0,
......
@@ -60,7 +56,6 @@ for file in os.listdir(prompt_dir)[12:]:
instance['model_response'] = _response.choices[0].message.content
new_json.append(instance)
ct += 1
break
except Exception as e:
print(e)
......
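For readability, a condensed sketch of the request loop this file now uses, assuming an OpenAI-compatible Hyperbolic endpoint; the model name and instance field names are illustrative assumptions rather than confirmed values from the repository:

```python
import os
from openai import OpenAI

# Hypothetical stand-in for whatever `model` is set to elsewhere in the script.
MODEL = "meta-llama/Meta-Llama-3.1-70B-Instruct"

client = OpenAI(
    api_key=os.environ["HYPERBOLIC_API_KEY"],
    base_url="https://api.hyperbolic.xyz/v1",
)

def score_instance(prompt_template, instance):
    """Fill the prompt template and retry the request until it succeeds,
    mirroring the while/try/except loop in the diff above."""
    cur_prompt = (prompt_template
                  .replace("{{Document}}", instance["source"])
                  .replace("{{Summary}}", instance["system_output"]))
    while True:
        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": "You only give numerical values as a response."},
                    {"role": "user", "content": cur_prompt},
                ],
                temperature=0,
            )
            return response.choices[0].message.content
        except Exception as e:
            print(e)  # transient API errors: loop and retry
```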
(The remaining file diffs in this commit are collapsed or too large to display; view the blobs for the full changes.)