Commit 700a2517 authored by kulcsar

add diagnosis classification

parent 000608d3
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler, TrainingArguments, Trainer
import argparse
import pickle as pkl
import random
import os
from sklearn import metrics
import numpy as np
#import accelerate
import evaluate
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
from datasets import load_metric
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
#from accelerate import Accelerator
#accelerator=Accelerator()
parser=argparse.ArgumentParser()
class AKIDataset(torch.utils.data.Dataset):
    """Wraps the pickled prompt/label instances for multi-label diagnosis classification."""
    def __init__(self, instances, tokenizer):
        self.instances=instances
        self.tokenizer=tokenizer

    def __getitem__(self, idx):
        instance=self.instances[idx]
        prompt=instance["prompt"]
        labels=instance["label"]
        item=self.tokenize(prompt)
        item["input_ids"]=item["input_ids"].squeeze()
        item["token_type_ids"]=item["token_type_ids"].squeeze()
        item["attention_mask"]=item["attention_mask"].squeeze()
        # the label is stored as a string-encoded multi-hot vector; take this out if the created dataset is already correct
        item["label"]=torch.tensor(eval(labels[0]), dtype=torch.float32)
        return item

    def tokenize(self, prompt):
        result=self.tokenizer(prompt, truncation=True, max_length=512, padding="max_length", return_tensors="pt")
        return result

    def __len__(self):
        return len(self.instances)
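# Illustrative note (assumption, not taken from the original data): given the field
# accesses above and the eval(labels[0]) call with num_labels=127, each pickled
# instance is presumably a dict along the lines of
#   {"prompt": "admission note text ...", "label": ["[0, 1, 0, ..., 0]"]}
# i.e. a 127-dimensional multi-hot label vector serialized as a string.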
def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic=True
    torch.backends.cudnn.benchmark=False
    os.environ["PYTHONHASHSEED"]=str(seed)
    print(f"Random seed set as: {seed}")
def train(model, dataset, seed, batch_size, epochs, lr, gradient_accumulation_steps=1, model_save_path=None):
    model.train()
    model.to(device)
    set_seed(seed)
    optimizer=AdamW(model.parameters(), lr=lr)
    train_sampler=RandomSampler(dataset)
    train_dataloader=DataLoader(dataset, sampler=train_sampler, batch_size=batch_size)
    num_training_steps=epochs*len(train_dataloader)
    lr_scheduler=get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    #model, optimizer, train_dataloader, lr_scheduler=accelerator.prepare(model, optimizer, train_dataloader, lr_scheduler)
    model.zero_grad()
    for e in tqdm(range(epochs)):
        model.train()
        for index, batch in tqdm(enumerate(train_dataloader)):
            input_ids=batch["input_ids"].squeeze().to(device)
            attention_mask=batch["attention_mask"].squeeze().to(device)
            token_type_ids=batch["token_type_ids"].squeeze().to(device)
            labels=batch["label"].to(device)
            outputs=model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)
            print("Model prediction: ", torch.argmax(outputs.logits, dim=-1))
            print("True label: ", labels)
            loss=outputs.loss
            loss=loss/gradient_accumulation_steps
            loss.backward()
            print(loss)
            # only update the weights every gradient_accumulation_steps batches (or on the last batch)
            if ((index+1) % gradient_accumulation_steps==0) or (index+1==len(train_dataloader)):
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
    if model_save_path is not None:
        model.save_pretrained(model_save_path)
    return model
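# Sketch (assumption, not part of the original script): a model saved via
# save_pretrained() above can later be reloaded for inference with the matching
# tokenizer, e.g.
#   model=AutoModelForSequenceClassification.from_pretrained(model_save_path)
#   tokenizer=AutoTokenizer.from_pretrained("bert-base-cased")
#   inputs=tokenizer(prompt, truncation=True, max_length=512, return_tensors="pt")
#   probabilities=torch.sigmoid(model(**inputs).logits)  # multi-label: sigmoid, not softmax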
def main():
    model=AutoModelForSequenceClassification.from_pretrained(args.model, num_labels=127, problem_type="multi_label_classification")
    tokenizer=AutoTokenizer.from_pretrained(args.tokenizer)
    with open(args.train_dataset, "rb") as f:
        data=pkl.load(f)
    print(len(data[:-1]))
    with open(args.test_dataset, "rb") as f:
        test_data=pkl.load(f)
    test_dataset=AKIDataset(test_data, tokenizer)
    dataset=AKIDataset(data[:-1], tokenizer)
    training_args=TrainingArguments(output_dir="./outputs_3_eps_full",
                                    per_device_train_batch_size=args.train_batch_size,
                                    per_device_eval_batch_size=1,
                                    num_train_epochs=args.epochs,
                                    gradient_accumulation_steps=2,
                                    learning_rate=args.learning_rate,
                                    evaluation_strategy="epoch")
    trainer=Trainer(model=model,
                    args=training_args,
                    tokenizer=tokenizer,
                    train_dataset=dataset,
                    eval_dataset=test_dataset,
                    compute_metrics=compute_metrics)
    trainer.train()
    #alternative: run the manual loop instead of the Trainer
    #trained_model=train(model, dataset, args.seed, args.train_batch_size, args.epochs, args.learning_rate, args.gradient_accumulation_steps, args.model_save_path)
def compute_metrics(eval_pred):
    logits, labels=eval_pred
    # sigmoid + rounding turns the raw logits into a multi-hot vector of only 1s and 0s
    normalized_logits=torch.sigmoid(torch.from_numpy(logits)).cpu().detach().numpy().round()
    normalized_logits=torch.tensor(normalized_logits, dtype=torch.int32)
    labels=torch.tensor(labels, dtype=torch.int32)
    accuracy=metrics.accuracy_score(labels, normalized_logits)
    f1_score_micro=metrics.f1_score(labels, normalized_logits, average="micro")
    f1_score_macro=metrics.f1_score(labels, normalized_logits, average="macro")
    print(f"Accuracy score: {accuracy}")
    print(f"F1 score (micro): {f1_score_micro}")
    print(f"F1 score (macro): {f1_score_macro}")
    return_dict={"accuracy":accuracy, "f1_score_micro":f1_score_micro, "f1_score_macro":f1_score_macro}
    return return_dict
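# Toy illustration (assumption: the implicit decision threshold is 0.5 via round()):
#   torch.sigmoid(torch.tensor([[2.0, -1.0, 0.3]])).round()  ->  tensor([[1., 0., 1.]])
# i.e. any logit above 0 becomes a predicted label, everything else stays 0.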
if __name__ == "__main__":
    parser.add_argument("--model", type=str)
    parser.add_argument("--tokenizer", type=str)
    parser.add_argument("--train_dataset", type=str)
    parser.add_argument("--test_dataset", type=str)
    parser.add_argument("--seed", type=int)
    parser.add_argument("--epochs", type=int)
    parser.add_argument("--learning_rate", type=float)
    parser.add_argument("--train_batch_size", type=int)
    parser.add_argument("--gradient_accumulation_steps", type=int)
    parser.add_argument("--model_save_path", type=str)
    args=parser.parse_args()
    main()
#!/bin/bash
#
#SBATCH --job-name=diagnosis_classification_multi_epoch
#SBATCH --output=output_train_bert_min100_100_epochs.txt
#SBATCH --mail-user=kulcsar@cl.uni-heidelberg.de
#SBATCH --mail-type=ALL
#SBATCH --partition=students
#SBATCH --mem 40G
#SBATCH --gres=gpu:1
#SBATCH --ntasks=1
#JOB STEPS
#srun hostname
cd /home/students/kulcsar/
source /home/students/kulcsar/anaconda3/etc/profile.d/conda.sh
conda activate software_bubble
cd /home/students/kulcsar/Bachelor/for_dataset/diagnosis_classification
#accelerate config
#python -m torch.distributed.launch --nproc_per_node=2 --use_env t5_model.py --model luqh/ClinicalT5-large --tokenizer luqh/ClinicalT5-large --dataset ./dev/dev_dataset_diagnoses_icd_48_pref.pkl --ff --seed 42 --batch_size 8 --epochs 3 --learning_rate 0.000005 --model_save_path t5_small_10000_icd_48_rs42_prefix.pt --do_eval --test_dataset ./test/test_dataset_diagnoses_icd_48_pref.pkl --topk 3 --temp 0.9 --num_beams 4 --no_rep_ngram 2 --do_sample --metrics accuracy --log log_clinicalt5.log
python classification_3.py --model bert-base-cased --tokenizer bert-base-cased --train_dataset ./train_dataset_diagnoses_shortened_min_100_vectors_split.pkl --test_dataset ./test_dataset_diagnoses_shortened_min_100_vectors_split.pkl --seed 42 --train_batch_size 4 --gradient_accumulation_steps 2 --epochs 100 --learning_rate 5e-5 --model_save_path bert_base_cased_aki_classification_100_epochs_min_100.pt
#--do_eval --test_dataset ./test/test_dataset_diagnoses_48.pkl --topk 30 --temp 0.9 --num_beams 4 --no_rep_ngram 2 --do_sample --metrics accuracy --num_return_sequences 1
#python t5_model.py --model luqh/ClinicalT5-large --tokenizer luqh/ClinicalT5-large --dataset ./dev/dev_dataset_diagnoses_icd_48.pkl --seed 13 --batch_size 16 --epochs 3 --learning_rate 0.000005 --model_save_path clint5_large_10000_icd_48_rs13.pt --do_eval --test_dataset ./test/test_dataset_diagnoses_48.pkl --topk 3 --temp 0.9 --num_beams 4 --no_rep_ngram 2 -do_sample --metrics accuracy
#python t5_model.py --model luqh/ClinicalT5-large --tokenizer luqh/ClinicalT5-large --dataset ./dev/dev_dataset_diagnoses_icd_48.pkl --seed 9 --batch_size 16 --epochs 3 --learning_rate 0.000005 --model_save_path clint5_large_10000_icd_48_rs9.pt --do_eval --test_dataset ./test/test_dataset_diagnoses_48.pkl --topk 3 --temp 0.9 --num_beams 4 --no_rep_ngram 2 -do_sample --metrics accuracy
#!/bin/bash
#
#SBATCH --job-name=diagnosis_classification_multi_epoch_biobert
#SBATCH --output=output_train_biobert_min100_100_epochs.txt
#SBATCH --mail-user=kulcsar@cl.uni-heidelberg.de
#SBATCH --mail-type=ALL
#SBATCH --partition=students
#SBATCH --mem 40G
#SBATCH --gres=gpu:1
#SBATCH --ntasks=1
#JOB STEPS
#srun hostname
cd /home/students/kulcsar/
source /home/students/kulcsar/anaconda3/etc/profile.d/conda.sh
conda activate software_bubble
cd /home/students/kulcsar/Bachelor/for_dataset/diagnosis_classification
#accelerate config
#python -m torch.distributed.launch --nproc_per_node=2 --use_env t5_model.py --model luqh/ClinicalT5-large --tokenizer luqh/ClinicalT5-large --dataset ./dev/dev_dataset_diagnoses_icd_48_pref.pkl --ff --seed 42 --batch_size 8 --epochs 3 --learning_rate 0.000005 --model_save_path t5_small_10000_icd_48_rs42_prefix.pt --do_eval --test_dataset ./test/test_dataset_diagnoses_icd_48_pref.pkl --topk 3 --temp 0.9 --num_beams 4 --no_rep_ngram 2 --do_sample --metrics accuracy --log log_clinicalt5.log
python classification_3.py --model emilyalsentzer/Bio_ClinicalBERT --tokenizer emilyalsentzer/Bio_ClinicalBERT --train_dataset ./train_dataset_diagnoses_shortened_min_100_vectors_split.pkl --test_dataset ./test_dataset_diagnoses_shortened_min_100_vectors_split.pkl --seed 42 --train_batch_size 4 --gradient_accumulation_steps 2 --epochs 100 --learning_rate 5e-5 --model_save_path biobert_diagnosis_classification_100_epochs_min_100.pt
#--do_eval --test_dataset ./test/test_dataset_diagnoses_48.pkl --topk 30 --temp 0.9 --num_beams 4 --no_rep_ngram 2 --do_sample --metrics accuracy --num_return_sequences 1
#python t5_model.py --model luqh/ClinicalT5-large --tokenizer luqh/ClinicalT5-large --dataset ./dev/dev_dataset_diagnoses_icd_48.pkl --seed 13 --batch_size 16 --epochs 3 --learning_rate 0.000005 --model_save_path clint5_large_10000_icd_48_rs13.pt --do_eval --test_dataset ./test/test_dataset_diagnoses_48.pkl --topk 3 --temp 0.9 --num_beams 4 --no_rep_ngram 2 -do_sample --metrics accuracy
#python t5_model.py --model luqh/ClinicalT5-large --tokenizer luqh/ClinicalT5-large --dataset ./dev/dev_dataset_diagnoses_icd_48.pkl --seed 9 --batch_size 16 --epochs 3 --learning_rate 0.000005 --model_save_path clint5_large_10000_icd_48_rs9.pt --do_eval --test_dataset ./test/test_dataset_diagnoses_48.pkl --topk 3 --temp 0.9 --num_beams 4 --no_rep_ngram 2 -do_sample --metrics accuracy