Commit 4e71dced authored by kupper

Finetuning for T5 MLM

parent 1e3beafe
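
This commit adds an MLM-style fine-tuning path for T5: each training example hides the entity's label behind a T5 sentinel token and the model is trained to fill it in. As a rough illustration of the input/target pair that finetune_model constructs further down (the sentence, entity and label here are invented for the example):

sentence, entity, label = "Heidelberg lies on the Neckar.", "Heidelberg", "location"
input_text = f"{sentence} {entity} is a <extra_id_0>."    # "... Heidelberg is a <extra_id_0>."
target_text = f"<extra_id_0> {label} <extra_id_1>"        # "<extra_id_0> location <extra_id_1>"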
#!/bin/bash
-#SBATCH --job-name=finetune_T5
-#SBATCH --output=logs/finetune_T5_%j.txt
+#SBATCH --job-name=finetune_T5_MLM
+#SBATCH --output=logs/finetune_T5_MLM_%j.txt
#SBATCH --ntasks=1
#SBATCH --time=24:00:00
#SBATCH --mem=8000
@@ -13,4 +13,4 @@
#SBATCH --gres=gpu
export PYTHONUNBUFFERED=1
-srun python3 -m src.experiments.finetune_T5.finetune_T5
+srun python3 -m src.experiments.finetune_T5.finetune_T5_MLM
#!/bin/bash
#SBATCH --job-name=finetune_T5_NLI
#SBATCH --output=logs/finetune_T5_NLI_%j.txt
#SBATCH --ntasks=1
#SBATCH --time=24:00:00
#SBATCH --mem=8000
#SBATCH --mail-type=ALL
#SBATCH --mail-user=kupper@cl.uni-heidelberg.de
#SBATCH --partition=students
#SBATCH --cpus-per-task=4
#SBATCH --qos=batch
#SBATCH --gres=gpu
export PYTHONUNBUFFERED=1
srun python3 -m src.experiments.finetune_T5.finetune_T5_NLI
import data.data_manager as data_manager
from src.models.T5_MLM_label import finetune_model


def finetune_t5(dataset):
    print("start")
    annotated_sentences = data_manager.get_annotated_sentences(dataset, 1000)
    labels = data_manager.get_labels(dataset)  # full label inventory for the dataset (rebuilt per example below)

    # Flatten the annotated sentences into parallel lists:
    # one (sentence, entity, label) triple per annotation.
    sentences = []
    entities = []
    labels = []
    for annotated_sentence in annotated_sentences:
        sentence = annotated_sentence[0]
        for annotation in annotated_sentence[1]:
            sentences.append(sentence)
            entities.append(annotation[0])
            labels.append(annotation[1])

    print(f"Finetuning on {len(sentences)} examples")
    for i in range(min(len(sentences), 50)):
        print(f"sentence: {sentences[i]}, entity: {entities[i]}, label: {labels[i]}")

    epochs = 20
    finetune_model(sentences, entities, labels,
                   output_dir=f"./src/models/t5_mlm_finetuned_model/pretrained_{dataset}_epoch{epochs}",
                   epochs=epochs)


finetune_t5("CoNLL")
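
The loop above implies a particular shape for what data_manager.get_annotated_sentences returns; data_manager itself is not part of this diff, so the following is only an assumed sketch of that structure (concrete sentences and labels are made up):

# Assumed shape: a list of (sentence, annotations) pairs,
# where each annotation is an (entity, label) tuple.
annotated_sentences = [
    ("Angela Merkel visited Paris.", [("Angela Merkel", "person"), ("Paris", "location")]),
    ("EU rejects German call.", [("EU", "organization"), ("German", "miscellaneous")]),
]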
import torch
import numpy as np
from torch.nn.functional import softmax
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict
model_name = "google-t5/t5-base"
@@ -21,3 +23,69 @@ def classify_entity(sentence, entity, labels):
    min_loss = min(results.keys())
    return results[min_loss]
def finetune_model(sentences, entities, labels, output_dir, epochs=10):
    # Build MLM-style training pairs: the entity's label is masked with a sentinel
    # token in the input and has to be produced as the target.
    input_texts = []
    target_texts = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        entity = entities[i]
        label = labels[i]
        input_texts.append(f"{sentence} {entity} is a <extra_id_0>.")
        target_texts.append(f"<extra_id_0> {label} <extra_id_1>")

    model_input = tokenizer(input_texts, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(target_texts, return_tensors="pt", padding="max_length", truncation=True, max_length=128)

    model_input["input_ids"] = np.array(model_input["input_ids"])
    model_input["attention_mask"] = np.array(model_input["attention_mask"])
    model_input["labels"] = np.array(targets["input_ids"])

    dataset = Dataset.from_dict({
        "input_ids": model_input["input_ids"],
        "attention_mask": model_input["attention_mask"],
        "labels": model_input["labels"]
    })
    print(dataset)

    # split into training and validation data (20% used for validation)
    train_test_split = dataset.train_test_split(test_size=0.2, shuffle=True, seed=0)
    dataset = DatasetDict({
        "train": train_test_split["train"],
        "validation": train_test_split["test"]
    })
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    training_args = TrainingArguments(
        output_dir="./src/models/t5_nli_finetuned_model",
        eval_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=epochs,
        weight_decay=0.01,
        save_strategy="no",
        push_to_hub=False,
        logging_dir="./logs",
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
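
To sketch how the saved model might be used afterwards: the snippet below is not part of the commit (paths, labels and the helper name are placeholders). It reloads the fine-tuned weights with the standard transformers from_pretrained API and picks the label whose sentinel target yields the lowest loss, mirroring the loss-based selection visible in the classify_entity fragment above.

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_dir = "./src/models/t5_mlm_finetuned_model/pretrained_CoNLL_epoch20"  # directory written by finetune_t5
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)

def pick_label(sentence, entity, candidate_labels):
    # Score each candidate label by the seq2seq loss of its sentinel-masked target.
    losses = {}
    for label in candidate_labels:
        enc = tokenizer(f"{sentence} {entity} is a <extra_id_0>.", return_tensors="pt")
        target_ids = tokenizer(f"<extra_id_0> {label} <extra_id_1>", return_tensors="pt").input_ids
        with torch.no_grad():
            loss = model(input_ids=enc.input_ids, attention_mask=enc.attention_mask, labels=target_ids).loss.item()
        losses[loss] = label
    return losses[min(losses)]  # lowest loss wins, as in classify_entity

print(pick_label("Heidelberg lies on the Neckar.", "Heidelberg", ["person", "location", "organization"]))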
@@ -83,7 +83,7 @@ def preprocess_data(sample):
    input_text = f"nli hypothesis: {sample['hypothesis']} premise: {sample['premise']}"
    target_text = label_map[bool(sample['entailment'])]
-   tokenized_input = tokenizer(input_text, padding="max_length", truncation=True, max_length=512)
+   tokenized_input = tokenizer(input_text, padding="max_length", truncation=True, max_length=128)
    tokenized_target = tokenizer(target_text, padding="max_length", truncation=True, max_length=10)
    tokenized_input["labels"] = tokenized_target["input_ids"]
@@ -139,3 +139,4 @@ def finetune_model(premises, hypotheses, entailment, output_dir, epochs=10):
    tokenizer.save_pretrained(output_dir)

load_base()
+# load_finetuned("./src/models/t5_nli_finetuned_model/pretrained_CoNLL_epoch20")