Skip to content
Snippets Groups Projects
Commit 000608d3 authored by kulcsar's avatar kulcsar
Browse files

add aki stuff

parent f95c1cf6
No related branches found
No related tags found
No related merge requests found
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler, TrainingArguments, Trainer
import argparse
import pickle as pkl
import random
from sklearn import metrics
import os
import numpy as np
#import accelerate
import evaluate
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
# Select the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
#from accelerate import Accelerator
#accelerator=Accelerator()
# Module-level CLI parser; options are registered in the __main__ guard.
parser = argparse.ArgumentParser()
class AKIDataset(torch.utils.data.Dataset):
    """Dataset over (prompt, label) pairs; prompts are tokenized on access.

    Each item is the tokenizer's encoding (input_ids, token_type_ids,
    attention_mask, squeezed to 1-D) plus the raw label under "label".
    """

    def __init__(self, instances, tokenizer):
        self.instances = instances
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        prompt, label = self.instances[idx][0], self.instances[idx][1]
        item = self.tokenize(prompt)
        # Drop the leading batch dimension added by return_tensors="pt";
        # the DataLoader re-batches the 1-D tensors itself.
        for field in ("input_ids", "token_type_ids", "attention_mask"):
            item[field] = item[field].squeeze()
        item["label"] = label
        return item

    def tokenize(self, prompt):
        # Every prompt is padded/truncated to a fixed length of 512 tokens.
        return self.tokenizer(prompt, truncation=True, max_length=512, padding="max_length", return_tensors="pt")

    def __len__(self):
        return len(self.instances)
def set_seed(seed):
    """Seed all RNGs (numpy, random, torch CPU/CUDA) for reproducibility.

    Also forces deterministic cuDNN kernels and pins PYTHONHASHSEED so
    hash-based ordering is stable across runs.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Fixed typo: was "banchmark", which silently set a nonexistent attribute
    # and left cuDNN autotuning (a source of nondeterminism) enabled.
    torch.backends.cudnn.benchmark = False
    os.environ["PYTHONHASHSEED"] = str(seed)
    # Fixed: original string lacked the f-prefix and printed the literal "{seed}".
    print(f"Random seed set as : {seed}")
def train(model, dataset, seed, batch_size, epochs, lr, gradient_accumulation_steps=1, model_save_path=None):
    """Fine-tune a sequence-classification model with a manual training loop.

    Args:
        model: a HF AutoModelForSequenceClassification-style model (must
            accept input_ids/attention_mask/token_type_ids/labels and return
            an output with .loss and .logits).
        dataset: an AKIDataset (or compatible torch Dataset).
        seed: random seed passed to set_seed().
        batch_size: per-step batch size.
        epochs: number of passes over the data.
        lr: learning rate for AdamW.
        gradient_accumulation_steps: number of batches whose gradients are
            accumulated before each optimizer step.
        model_save_path: if given, the trained model is saved there via
            save_pretrained().

    Returns:
        The trained model.
    """
    model.train()
    model.to(device)
    set_seed(seed)
    optimizer = AdamW(model.parameters(), lr=lr)
    train_sampler = RandomSampler(dataset)
    train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=batch_size)
    num_training_steps = epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    model.zero_grad()
    for _ in tqdm(range(epochs)):
        model.train()
        for index, batch in tqdm(enumerate(train_dataloader)):
            # NOTE(review): .squeeze() drops ALL size-1 dims, so a final
            # batch of size 1 yields 1-D input_ids — confirm the dataset
            # size is not congruent to 1 mod batch_size, or use squeeze(1).
            input_ids = batch["input_ids"].squeeze().to(device)
            attention_mask = batch["attention_mask"].squeeze().to(device)
            token_type_ids = batch["token_type_ids"].squeeze().to(device)
            labels = batch["label"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)
            # Scale the loss so accumulated gradients average over the
            # accumulation window instead of summing.
            loss = outputs.loss / gradient_accumulation_steps
            loss.backward()
            # Fixed off-by-one: step after every `gradient_accumulation_steps`
            # batches and on the final batch. The original condition
            # (`index % steps == 0 or index == len(train_dataloader)`)
            # stepped on the very first batch and never matched the last one,
            # leaving trailing gradients unapplied.
            if (index + 1) % gradient_accumulation_steps == 0 or (index + 1) == len(train_dataloader):
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
    # Fixed: only save when a path was supplied; the original called
    # save_pretrained(None) unconditionally.
    if model_save_path is not None:
        model.save_pretrained(model_save_path)
    return model
def main():
    """Build model/tokenizer, load pickled datasets, fine-tune via HF Trainer."""
    # Loaded for parity with the original script; the Trainer below uses
    # compute_metrics rather than this metric object.
    metric = evaluate.load("accuracy", average="macro")
    model = AutoModelForSequenceClassification.from_pretrained(args.model, num_labels=3)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
    with open(args.train_dataset, "rb") as f:
        data = pkl.load(f)
    print(len(data[:-1]))
    with open(args.test_dataset, "rb") as f:
        test_data = pkl.load(f)
    test_dataset = AKIDataset(test_data, tokenizer)
    # NOTE(review): the last training instance is dropped here — presumably a
    # malformed trailing entry; confirm against the pickle's contents.
    dataset = AKIDataset(data[:-1], tokenizer)
    training_args = TrainingArguments(
        output_dir="./outputs_3_eps_full",
        per_device_train_batch_size=args.train_batch_size,
        gradient_accumulation_steps=2,
        learning_rate=args.learning_rate,
        evaluation_strategy="epoch",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        train_dataset=dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()
def compute_metrics(eval_pred):
    """Compute accuracy and micro/macro F1 from a HF (logits, labels) pair.

    Returns a dict with keys "accuracy", "f1_score_micro", "f1_score_macro".
    """
    logits, labels = eval_pred
    # argmax over log_softmax equals argmax over raw logits; the
    # normalization is kept for parity with the original implementation.
    log_probs = torch.nn.functional.log_softmax(torch.from_numpy(logits), dim=1)
    predictions = torch.argmax(log_probs, dim=-1)
    accuracy = metrics.accuracy_score(labels, predictions)
    f1_micro = metrics.f1_score(labels, predictions, average="micro")
    f1_macro = metrics.f1_score(labels, predictions, average="macro")
    print(f"Accuracy score: {accuracy}")
    print(f"F1 score (micro): {f1_micro}")
    print(f"F1 score (macro): {f1_macro}")
    return {
        "accuracy": accuracy,
        "f1_score_micro": f1_micro,
        "f1_score_macro": f1_macro,
    }
if __name__ == "__main__":
    # Register CLI options in the original order, parse into the
    # module-level `args`, then run training.
    _cli_options = [
        ("--model", str),
        ("--tokenizer", str),
        ("--train_dataset", str),
        ("--test_dataset", str),
        ("--seed", int),
        ("--epochs", int),
        ("--learning_rate", float),
        ("--train_batch_size", int),
        ("--gradient_accumulation_steps", int),
        ("--model_save_path", str),
    ]
    for _flag, _type in _cli_options:
        parser.add_argument(_flag, type=_type)
    args = parser.parse_args()
    main()
#!/bin/bash
#
# SLURM batch script: fine-tune Bio_ClinicalBERT for AKI classification
# on the "students" partition with one GPU and 40G of memory.
#SBATCH --job-name=aki_classification_bert
#SBATCH --output=output_train_biobert_aki.txt
#SBATCH --mail-user=kulcsar@cl.uni-heidelberg.de
#SBATCH --mail-type=ALL
#SBATCH --partition=students
#SBATCH --mem 40G
#SBATCH --gres=gpu:1
#SBATCH --ntasks=1
#JOB STEPS
#srun hostname
# Activate the conda environment that has the (older) transformers stack.
cd /home/students/kulcsar/
source /home/students/kulcsar/anaconda3/etc/profile.d/conda.sh
conda activate huggingface_older
cd /home/students/kulcsar/Bachelor/for_dataset/aki_classification
#accelerate config
#python -m torch.distributed.launch --nproc_per_node=2 --use_env t5_model.py --model luqh/ClinicalT5-large --tokenizer luqh/ClinicalT5-large --dataset ./dev/dev_dataset_diagnoses_icd_48_pref.pkl --ff --seed 42 --batch_size 8 --epochs 3 --learning_rate 0.000005 --model_save_path t5_small_10000_icd_48_rs42_prefix.pt --do_eval --test_dataset ./test/test_dataset_diagnoses_icd_48_pref.pkl --topk 3 --temp 0.9 --num_beams 4 --no_rep_ngram 2 --do_sample --metrics accuracy --log log_clinicalt5.log
# Run the classification training script (effective batch size 4 * 2 = 8).
python classification_3.py --model emilyalsentzer/Bio_ClinicalBERT --tokenizer emilyalsentzer/Bio_ClinicalBERT --train_dataset ./train_dataset_bert.pkl --test_dataset ./test_dataset_bert.pkl --seed 42 --train_batch_size 4 --gradient_accumulation_steps 2 --epochs 3 --learning_rate 5e-5 --model_save_path bio_bert_base_cased_aki_classification_10_epochs_3_epochs.pt
#--do_eval --test_dataset ./test/test_dataset_diagnoses_48.pkl --topk 30 --temp 0.9 --num_beams 4 --no_rep_ngram 2 --do_sample --metrics accuracy --num_return_sequences 1
#python t5_model.py --model luqh/ClinicalT5-large --tokenizer luqh/ClinicalT5-large --dataset ./dev/dev_dataset_diagnoses_icd_48.pkl --seed 13 --batch_size 16 --epochs 3 --learning_rate 0.000005 --model_save_path clint5_large_10000_icd_48_rs13.pt --do_eval --test_dataset ./test/test_dataset_diagnoses_48.pkl --topk 3 --temp 0.9 --num_beams 4 --no_rep_ngram 2 -do_sample --metrics accuracy
#python t5_model.py --model luqh/ClinicalT5-large --tokenizer luqh/ClinicalT5-large --dataset ./dev/dev_dataset_diagnoses_icd_48.pkl --seed 9 --batch_size 16 --epochs 3 --learning_rate 0.000005 --model_save_path clint5_large_10000_icd_48_rs9.pt --do_eval --test_dataset ./test/test_dataset_diagnoses_48.pkl --topk 3 --temp 0.9 --num_beams 4 --no_rep_ngram 2 -do_sample --metrics accuracy
File added
import evaluate
import torch
from statistics import mean
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
import argparse
import pickle as pkl
import random
import os
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
#import accelerate
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
# Use the GPU if present, else the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
#from accelerate import Accelerator
#accelerator=Accelerator()
# Module-level CLI parser; options are registered in the __main__ guard.
parser = argparse.ArgumentParser()
def main():
    """Load a fine-tuned classifier plus a pickled dataset and evaluate it."""
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
    model = AutoModelForSequenceClassification.from_pretrained(args.model)
    with open(args.dataset, "rb") as f:
        data = pkl.load(f)
    print(len(data))
    dataset = AKIDataset(data, tokenizer)
    evaluate_model_loop(model, args.seed, dataset, args.batch_size, args.metrics)
class AKIDataset(torch.utils.data.Dataset):
    """Dataset over (prompt, label) pairs; prompts are tokenized on access.

    Tensors keep the leading batch dimension added by return_tensors="pt";
    the evaluation loop squeezes them after batching.
    """

    def __init__(self, instances, tokenizer):
        self.instances = instances
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        prompt, label = self.instances[idx][0], self.instances[idx][1]
        item = self.tokenize(prompt)
        item["label"] = label
        return item

    def tokenize(self, prompt):
        # Every prompt is padded/truncated to a fixed length of 512 tokens.
        return self.tokenizer(prompt, truncation=True, max_length=512, padding="max_length", return_tensors="pt")

    def __len__(self):
        return len(self.instances)
def set_seed(seed):
    """Seed all RNGs (numpy, random, torch CPU/CUDA) for reproducibility.

    Also forces deterministic cuDNN kernels and pins PYTHONHASHSEED so
    hash-based ordering is stable across runs.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Fixed typo: was "banchmark", which silently set a nonexistent attribute
    # and left cuDNN autotuning (a source of nondeterminism) enabled.
    torch.backends.cudnn.benchmark = False
    os.environ["PYTHONHASHSEED"] = str(seed)
    # Fixed: original string lacked the f-prefix and printed the literal "{seed}".
    print(f"Random seed set as : {seed}")
def evaluate_model_loop(model, seed, test_dataset, batch_size, metrics):
    """Run batched, no-grad evaluation and print the mean per-batch accuracy.

    Args:
        model: HF sequence-classification model (called with input_ids /
            attention_mask / token_type_ids; .logits is read).
        seed: random seed passed to set_seed().
        test_dataset: AKIDataset yielding tokenized items plus "label".
        batch_size: evaluation batch size.
        metrics: CLI metric names; only printed here — accuracy is hard-coded.

    Returns:
        None (results are printed only).
    """
    torch.cuda.empty_cache()
    set_seed(seed)
    print("Testing model")
    print("Metric: ", metrics)
    #if len(metrics) > 1:
    #	metric=evaluate.combine(metrics)
    #else:
    #	print("here")
    #	metric=evaluate.load(metrics[0])
    #metric=evaluate.combine([evaluate.load("accuracy", average=None),
    #			evaluate.load("f1",average=None),
    #			evaluate.load("precision", average=None),
    #			evaluate.load("recall", average=None)])
    metric=evaluate.load("accuracy", average=None)
    model.eval().to(device)
    print("Batch size: ", batch_size)
    print("metrics: ", metrics)
    all_accs=[]
    all_precs=[]
    all_recs=[]
    all_f1=[]
    #config=GenerationConfig.from_pretrained(config_name, top_k=top_k, temperature=temp, num_beams=num_beams, early_stopping=early_stopping, no_repeat_ngram_size=no_rep, num_return_sequences=num_return_sequences, max_length=512)
    #print(config.num_return_sequences)
    #print(tokenizer.max_length)
    # SequentialSampler keeps dataset order, so runs are comparable.
    eval_sampler=SequentialSampler(test_dataset)
    eval_dataloader=DataLoader(test_dataset, sampler=eval_sampler, batch_size=batch_size)
    iterations=0
    for index, batch in tqdm(enumerate(eval_dataloader)):
        with torch.no_grad():
            #print(batch.to(device))
            #if index == 10:
            #	break
            # NOTE(review): .squeeze() drops ALL size-1 dims, so a final
            # batch of size 1 yields 1-D input_ids — confirm dataset size
            # is not congruent to 1 mod batch_size.
            input_ids=batch["input_ids"].squeeze().to(device)
            #print(input_ids.size())
            attention_mask=batch["attention_mask"].squeeze().to(device)
            token_type_ids=batch["token_type_ids"].squeeze().to(device)
            labels=batch["label"].to(device)
            outputs=model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            # softmax before argmax does not change the argmax; kept as-is.
            prediction=torch.argmax(outputs.logits.softmax(1), dim=-1)
            print(prediction)
            print(outputs.logits.softmax(1))
            accuracy_res=metric.compute(predictions=prediction, references=labels)
            print(accuracy_res)
            all_accs.append(accuracy_res["accuracy"])
            iterations+=1
            #all_accs.append(accuracy_score(labels.cpu().squeeze(), prediction.cpu().squeeze()))
            #all_precs.append(precision_score(labels.cpu().squeeze(), prediction.cpu().squeeze()))
            #all_recs.append(recall_score(labels.cpu().squeeze(), prediction.cpu().squeeze()))
            #all_f1.append(f1_scorer(labels.cpu().squeeze(), prediction.cpu().squeeze()))
    #print(len(eval_dataloader))
    #print(len(list(eval_dataloader.items())))
    #print(sum(all_accs))
    # NOTE(review): this is the unweighted mean of per-batch accuracies, so a
    # smaller final batch is over-weighted relative to example-level accuracy.
    acc=sum(all_accs)/iterations
    #prec=sum(all_precs)/len(eval_dataloader)
    #rec=sum(all_recs)/len(eval_dataloader)
    #f1=sum(all_f1)/len(eval_dataloader)
    print(acc)
    #res=metric.compute(average=None)
    #print("RESULTS:\n\nAccuracy: {0}\nPrecision: {1}\nRecall: {2}\nF1: {3}".format(round(acc*100, 2),round(prec*100, 2), round(rec*100, 2), round(f1*100, 2)))
    return None
if __name__ == "__main__":
    # Register CLI options in the original order, parse into the
    # module-level `args`, then run evaluation.
    for _flag, _type in [
        ("--model", str),
        ("--tokenizer", str),
        ("--dataset", str),
        ("--seed", int),
        ("--batch_size", int),
    ]:
        parser.add_argument(_flag, type=_type)
    parser.add_argument("--metrics", nargs="+")
    args = parser.parse_args()
    main()
File added
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment