From 4e71dcedc137e84d818e1da062f4f1d7ee06169e Mon Sep 17 00:00:00 2001
From: kupper <kupper@login.cl.uni-heidelberg.de>
Date: Thu, 20 Mar 2025 12:40:41 +0100
Subject: [PATCH] Finetuning for T5 MLM

---
 ...5_finetune_cl.sh => T5_MLM_finetune_cl.sh} |  6 +-
 scripts/T5_NLI_finetune_cl.sh                 | 16 +++++
 .../finetune_T5/finetune_T5_MLM.py            | 30 ++++++++
 .../{finetune_T5.py => finetune_T5_NLI.py}    |  0
 src/models/T5_MLM_label.py                    | 68 +++++++++++++++++++
 src/models/T5_NLI.py                          |  3 +-
 6 files changed, 119 insertions(+), 4 deletions(-)
 rename scripts/{T5_finetune_cl.sh => T5_MLM_finetune_cl.sh} (65%)
 create mode 100644 scripts/T5_NLI_finetune_cl.sh
 create mode 100644 src/experiments/finetune_T5/finetune_T5_MLM.py
 rename src/experiments/finetune_T5/{finetune_T5.py => finetune_T5_NLI.py} (100%)

diff --git a/scripts/T5_finetune_cl.sh b/scripts/T5_MLM_finetune_cl.sh
similarity index 65%
rename from scripts/T5_finetune_cl.sh
rename to scripts/T5_MLM_finetune_cl.sh
index c3df044..e9bf9fd 100644
--- a/scripts/T5_finetune_cl.sh
+++ b/scripts/T5_MLM_finetune_cl.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-#SBATCH --job-name=finetune_T5
-#SBATCH --output=logs/finetune_T5_%j.txt
+#SBATCH --job-name=finetune_T5_MLM
+#SBATCH --output=logs/finetune_T5_MLM_%j.txt
 #SBATCH --ntasks=1
 #SBATCH --time=24:00:00
 #SBATCH --mem=8000
@@ -13,4 +13,4 @@
 #SBATCH --gres=gpu
 
 export PYTHONUNBUFFERED=1
-srun python3 -m src.experiments.finetune_T5.finetune_T5
+srun python3 -m src.experiments.finetune_T5.finetune_T5_MLM
diff --git a/scripts/T5_NLI_finetune_cl.sh b/scripts/T5_NLI_finetune_cl.sh
new file mode 100644
index 0000000..9922273
--- /dev/null
+++ b/scripts/T5_NLI_finetune_cl.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+#SBATCH --job-name=finetune_T5_NLI
+#SBATCH --output=logs/finetune_T5_NLI_%j.txt
+#SBATCH --ntasks=1
+#SBATCH --time=24:00:00
+#SBATCH --mem=8000
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=kupper@cl.uni-heidelberg.de
+#SBATCH --partition=students
+#SBATCH --cpus-per-task=4
+#SBATCH --qos=batch
+#SBATCH --gres=gpu
+
+export PYTHONUNBUFFERED=1
+srun python3 -m src.experiments.finetune_T5.finetune_T5_NLI
diff --git a/src/experiments/finetune_T5/finetune_T5_MLM.py b/src/experiments/finetune_T5/finetune_T5_MLM.py
new file mode 100644
index 0000000..e70bfc6
--- /dev/null
+++ b/src/experiments/finetune_T5/finetune_T5_MLM.py
@@ -0,0 +1,30 @@
+import data.data_manager as data_manager
+from src.models.T5_MLM_label import finetune_model
+
+def finetune_t5(dataset):
+    print("start")
+    annotated_sentences = data_manager.get_annotated_sentences(dataset, 1000)
+    labels = data_manager.get_labels(dataset)
+
+    sentences = []
+    entities = []
+    labels = []
+
+    for annotated_sentence in annotated_sentences:
+        sentence = annotated_sentence[0]
+
+        for annotation in annotated_sentence[1]:
+            sentences.append(sentence)
+            entities.append(annotation[0])
+            labels.append(annotation[1])
+
+    print(f"Finetuning on {len(sentences)} examples")
+
+    for i in range(min(len(sentences), 50)):
+        print(f"sentence: {sentences[i]}, entity: {entities[i]}, label: {labels[i]}")
+
+    epochs = 20
+
+    finetune_model(sentences, entities, labels, output_dir=f"./src/models/t5_mlm_finetuned_model/pretrained_{dataset}_epoch{epochs}", epochs=epochs)
+
+finetune_t5("CoNLL")
diff --git a/src/experiments/finetune_T5/finetune_T5.py b/src/experiments/finetune_T5/finetune_T5_NLI.py
similarity index 100%
rename from src/experiments/finetune_T5/finetune_T5.py
rename to src/experiments/finetune_T5/finetune_T5_NLI.py
diff --git a/src/models/T5_MLM_label.py b/src/models/T5_MLM_label.py
index d893ca2..c0711ad 100644
--- a/src/models/T5_MLM_label.py
+++ b/src/models/T5_MLM_label.py
@@ -1,6 +1,8 @@
 import torch
+import numpy as np
 from torch.nn.functional import softmax
 from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
+from datasets import Dataset, DatasetDict
 
 model_name = "google-t5/t5-base"
 
@@ -21,3 +23,69 @@ def classify_entity(sentence, entity, labels):
 
     min_loss = min(results.keys())
     return results[min_loss]
+
+
+def finetune_model(sentences, entities, labels, output_dir, epochs=10):
+    input_texts = []
+    target_texts = []
+
+    for i in range(len(sentences)):
+        sentence = sentences[i]
+        entity = entities[i]
+        label = labels[i]
+        input_texts.append(f"{sentence} {entity} is a <extra_id_0>.")
+        target_texts.append(f"<extra_id_0> {label} <extra_id_1>")
+
+    model_input = tokenizer(input_texts, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
+    targets = tokenizer(target_texts, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
+
+    model_input["input_ids"] = np.array(model_input["input_ids"])
+    model_input["attention_mask"] = np.array(model_input["attention_mask"])
+    model_input["labels"] = np.array(targets["input_ids"])
+
+    dataset = Dataset.from_dict({
+        "input_ids": model_input["input_ids"],
+        "attention_mask": model_input["attention_mask"],
+        "labels": model_input["labels"]
+    })
+
+    print(dataset)
+
+    # split into training and validation data (20% used for validation)
+    train_test_split = dataset.train_test_split(test_size=0.2, shuffle=True, seed=0)
+
+    dataset = DatasetDict({
+        "train": train_test_split["train"],
+        "validation": train_test_split["test"]
+    })
+
+    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
+
+    training_args = TrainingArguments(
+        output_dir="./src/models/t5_nli_finetuned_model",
+        eval_strategy="epoch",
+        learning_rate=5e-5,
+        per_device_train_batch_size=8,
+        per_device_eval_batch_size=8,
+        num_train_epochs=epochs,
+        weight_decay=0.01,
+        save_strategy="no",
+        push_to_hub=False,
+        logging_dir="./logs",
+    )
+
+    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
+
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["validation"],
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+    )
+
+    trainer.train()
+
+    model.save_pretrained(output_dir)
+    tokenizer.save_pretrained(output_dir)
diff --git a/src/models/T5_NLI.py b/src/models/T5_NLI.py
index 2a1ca8d..f24058a 100644
--- a/src/models/T5_NLI.py
+++ b/src/models/T5_NLI.py
@@ -83,7 +83,7 @@ def preprocess_data(sample):
     input_text = f"nli hypothesis: {sample['hypothesis']} premise: {sample['premise']}"
     target_text = label_map[bool(sample['entailment'])]
 
-    tokenized_input = tokenizer(input_text, padding="max_length", truncation=True, max_length=512)
+    tokenized_input = tokenizer(input_text, padding="max_length", truncation=True, max_length=128)
     tokenized_target = tokenizer(target_text, padding="max_length", truncation=True, max_length=10)
 
     tokenized_input["labels"] = tokenized_target["input_ids"]
@@ -139,3 +139,4 @@ def finetune_model(premises, hypotheses, entailment, output_dir, epochs=10):
     tokenizer.save_pretrained(output_dir)
 
 load_base()
+# load_finetuned("./src/models/t5_nli_finetuned_model/pretrained_CoNLL_epoch20")
-- 
GitLab
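
Context note (not part of the patch): the new finetune_model trains T5 on the same span-infilling pattern that classify_entity in src/models/T5_MLM_label.py appears to score at inference time, i.e. the loss of the target "<extra_id_0> {label} <extra_id_1>" given the prompt "{sentence} {entity} is a <extra_id_0>.". The following is a minimal, self-contained sketch of that scoring idea, assuming the google-t5/t5-base checkpoint and the prompt format visible in the diff; it is an illustration, not the repository's actual implementation.

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Same base checkpoint as in the patch; any T5 variant would work the same way.
model_name = "google-t5/t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def classify_entity_sketch(sentence, entity, labels):
    # Prompt with a sentinel token where the label should be filled in.
    input_ids = tokenizer(f"{sentence} {entity} is a <extra_id_0>.",
                          return_tensors="pt").input_ids
    results = {}
    for label in labels:
        # Candidate infill for the sentinel span.
        target_ids = tokenizer(f"<extra_id_0> {label} <extra_id_1>",
                               return_tensors="pt").input_ids
        with torch.no_grad():
            # Seq2seq cross-entropy of this candidate given the prompt.
            loss = model(input_ids=input_ids, labels=target_ids).loss.item()
        results[loss] = label
    # Lowest loss = most plausible label, mirroring min(results.keys()) in the patch.
    return results[min(results.keys())]

print(classify_entity_sketch("Heidelberg is a city in Germany.",
                             "Heidelberg",
                             ["person", "location", "organization"]))

Fine-tuning on the same prompt/target pairs, as the patch's finetune_model does, should sharpen the loss separation between the dataset's labels; the exact examples depend on data_manager's output, which is not shown here.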