Commit f95c1cf6 authored by kulcsar

clean up some code

parent 550a5c4a
@@ -9,12 +9,10 @@ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import logging
from transformers import get_scheduler, AdamW
from t5_model import preprocess
#from accelerate import Accelerator
import argparse

logging.basicConfig(filename='log_t5.log', level=logging.DEBUG)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#accelerator=Accelerator()
#device=accelerator.device

def run():
    print(args.saved_model)
    print(args.tokenizer)
@@ -22,22 +20,14 @@ def run():
logging.info("performing evaluation")
logging.info("loading saved model")
#checkpoint=torch.load(args.saved_model)
#model.load_state_dict(checkpoint)
model=BioGptForCausalLM.from_pretrained(args.saved_model)
#model=AutoModelForCausalLM.from_pretrained(args.saved_model)
#model=T5ForConditionalGeneration.from_pretrained("t5-small")
#model.load_state_dict(torch.load(args.saved_model))
#model=accelerator.load("./t5_small_deepspeed_train_test_2.pt")
#model=accelerator.load_state("./t5_small_deepspee_train_test.pt")
#model=BioGptForCausalLM.from_pretrained(args.saved_model)
model=AutoModelForCausalLM.from_pretrained(args.saved_model)
tokenizer=AutoTokenizer.from_pretrained(args.tokenizer)
tokenizer.padding_side="left"
tokenizer.pad_token=tokenizer.eos_token
model.config.pad_token_id=model.config.eos_token_id
test_dataset=preprocess(tokenizer, args.test_dataset)
#with open(args.test_dataset, "rb") as f:
# data=pkl.load(f)
#test_dataset=DiagnosesDataset(data, tokenizer)
logging.info("running evaluation")
res=evaluate_model_loop(model, args.config_name, test_dataset, args.batch_size,tokenizer, args.topk, args.temp, args.num_beams, args.early_stopping, args.no_rep_ngram, args.num_return_sequences, args.metrics, args.do_sample, args.generative, args.icd_codes)
logging.info(res)
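Editorial aside on the padding setup above (not part of the commit): decoder-only models like BioGPT append generated tokens to the right of the prompt, so batched prompts must be padded on the left; with right padding, shorter prompts would continue from pad tokens. A minimal self-contained sketch, using gpt2 purely as an illustrative stand-in for args.saved_model:

from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")   # stand-in checkpoint, not the repo's model
tok.padding_side = "left"                     # every prompt ends at the same index
tok.pad_token = tok.eos_token                 # GPT-style vocabularies ship without a pad token

model = AutoModelForCausalLM.from_pretrained("gpt2")
model.config.pad_token_id = model.config.eos_token_id

batch = tok(["short prompt", "a noticeably longer prompt"], padding=True, return_tensors="pt")
out = model.generate(**batch, max_new_tokens=8)
# with right padding, the short prompt's continuation would start after pad tokens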
@@ -62,25 +52,9 @@ class DiagnosesDataset(torch.utils.data.Dataset):
        label_instruction=self.tokenizer(labels)
        i=len(tokenized_instruction["input_ids"])-1
        #while 1<len(item["input_ids"])
        #print("Len of item labels before ", len(item["labels"]))
        item["labels"][i:]=label_instruction["input_ids"]
        #item.pop("token_type_ids")
        #print(item["labels"])
        #we now need to pad to 2048
        #print("Len labels: ", len(item["labels"]))
        #print("Len input ids: ", len(item["input_ids"]))
        #print("\n\n")
        #try:
        #    assert len(item["labels"]) == len(item["input_ids"])
        #except AssertionError:
        #    print(len(item["labels"]))
        #    print(len(item["input_ids"]))
        #    print(len(tokenized_instruction["input_ids"]))
        #    print("\n\n")
        #    break
        return item

    def tokenize(self, prompt):
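For orientation (an editorial sketch, not the repo's code): __getitem__ splices the tokenized gold labels over the tail of the label vector, so the loss is computed only on answer tokens; positions holding -100 are ignored by PyTorch's cross-entropy. In miniature, with made-up token ids:

prompt_ids = [12, 55, 7, 901]                      # tokenized instruction/prompt
answer_ids = [44, 8, 2]                            # tokenized diagnosis string

input_ids = prompt_ids + answer_ids
labels = [-100] * len(prompt_ids) + answer_ids     # -100 is the ignore_index of CrossEntropyLoss
assert len(labels) == len(input_ids)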
@@ -90,24 +64,7 @@ class DiagnosesDataset(torch.utils.data.Dataset):
            max_length=1024,
            padding=False,
            return_tensors=None)
        #print(type(result_prompt))
        #print(len(result_prompt["input_ids"]))
        #result_labels=self.tokenizer(labels,
        #    truncation=True,
        #    max_length=1024,
        #    padding=False,
        #    return_tensors=None)
        #old_labels=result_labels["input_ids"].copy()
        #result_prompt["labels"]=[-100 for i in result_prompt["input_ids"]] + result_labels["input_ids"]
        #result_prompt["input_ids"]=result_prompt["input_ids"] + old_labels
        #print(result_prompt["input_ids"])
        #result_prompt["labels"] = [-100 for i in result_prompt["input_ids"]] + result_labels["input_ids"]
        #print(len(result_prompt["labels"]))
        #assert len(result_prompt["input_ids"]) == len(result_prompt["labels"])
        result_prompt["labels"]=[-100]*len(result_prompt["input_ids"])
        #print(result_prompt["labels"])
        return result_prompt

    def __len__(self):
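Note that tokenize() passes padding=False, so items leave the dataset unpadded; padding then presumably happens per batch in a collator (an assumption, the collator is outside this diff). A sketch with Hugging Face's DataCollatorForSeq2Seq, which pads labels with -100 and input ids with the pad token; something like this is also what the eval DataLoader further down would need, since default collation cannot stack variable-length lists:

from transformers import AutoTokenizer, DataCollatorForSeq2Seq

tok = AutoTokenizer.from_pretrained("gpt2")        # illustrative checkpoint
tok.pad_token = tok.eos_token
collator = DataCollatorForSeq2Seq(tok, label_pad_token_id=-100)

features = [{"input_ids": [1, 2, 3], "labels": [-100, -100, 3]},
            {"input_ids": [4, 5], "labels": [-100, 5]}]
batch = collator(features)                         # tensors padded to the batch maximum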
@@ -137,10 +94,6 @@ def evaluate_model_loop(model, config_name, test_dataset, batch_size, tokenizer,
print("num_return_sequences: ", num_return_sequences)
print("metrics: ", metrics)
print("generative? ", generative)
#config=GenerationConfig.from_pretrained(config_name, top_k=top_k, temperature=temp, num_beams=num_beams, early_stopping=early_stopping, no_repeat_ngram_size=no_rep, num_return_sequences=num_return_sequences, max_length=512)
#print(config.num_return_sequences)
#print(tokenizer.max_length)
eval_sampler=SequentialSampler(test_dataset)
eval_dataloader=DataLoader(test_dataset, sampler=eval_sampler, batch_size=batch_size)
accuracies=[]
@@ -149,35 +102,15 @@
    precs=[]
    for index, batch in tqdm(enumerate(eval_dataloader)):
        with torch.no_grad():
            #print(batch.to(device))
            #if index == 20:
            #    break
            print(len(batch["input_ids"]))
            #input_ids=torch.tensor(batch[0]).unsqueeze(0).to(device)
            input_ids=batch["input_ids"]
            #print(input_ids.size())
            attention_mask=batch["attention_mask"]
            labels=batch["labels"]
            # batches are dicts, so index the tensors by key rather than by position
            labels_len=len(labels[labels != tokenizer.pad_token_id])
            input_ids_len=len(input_ids[0])
            print("Len input ids: ", input_ids_len)
            print("Len labels: ", labels_len)
            #attention_mask=torch.tensor(batch[1]).unsqueeze(0).to(device)
            #labels=torch.tensor(batch[2]).unsqueeze(0).to(device)
            #last_inp_token_index=batch[2][::-1].index(-100)
            #last_occurrence=len(batch[2]) - last_inp_token_index
            #print(last_occurrence)
            #label_length=len(batch["input_ids"][last_occurrence])
            #print(label_length)
            #outputs=model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=512, top_k=top_k, temperature=temp, num_beams=num_beams, early_stopping=early_stopping, no_repeat_ngram_size=no_rep, num_return_sequences=num_return_sequences)
            outputs=model.generate(input_ids=input_ids, attention_mask=attention_mask, top_k=top_k, temperature=temp, num_beams=num_beams, early_stopping=early_stopping, no_repeat_ngram_size=no_rep, num_return_sequences=num_return_sequences, max_new_tokens=labels_len+8)  # , length_penalty=-0.8)
            #print("Outputs: ", len(outputs[0]))
            #print("Len input ids: ", labels_len)
......
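One more editorial sketch on the generate() call above: because padding is on the left, every sequence in outputs ends its prompt at the same index, so the generated continuation is a uniform slice past the prompt length. Assuming the same illustrative gpt2 stand-in as before:

from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
tok.padding_side = "left"
tok.pad_token = tok.eos_token
model = AutoModelForCausalLM.from_pretrained("gpt2")

batch = tok(["patient presents with"], return_tensors="pt")   # illustrative prompt
outputs = model.generate(**batch, num_beams=4, early_stopping=True,
                         no_repeat_ngram_size=2, max_new_tokens=16,
                         pad_token_id=tok.eos_token_id)
new_tokens = outputs[:, batch["input_ids"].shape[1]:]         # strip the prompt
print(tok.batch_decode(new_tokens, skip_special_tokens=True))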