import torch
import numpy as np
import evaluate  # Hugging Face evaluate library (used for the accuracy metric below)
import evaluation  # local evaluation module (evaluate_model)
import json
import random
import math
from tqdm.auto import tqdm
from transformers import BertTokenizer, RobertaTokenizer, BertModel, RobertaModel, RobertaPreTrainedModel, RobertaConfig, BertConfig, BertPreTrainedModel, PreTrainedModel, AutoModel, AutoTokenizer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AdamW, get_scheduler, Trainer, TrainingArguments
from torch import nn
from torch.nn import CrossEntropyLoss
import matplotlib.pyplot as plt
import os
import pandas as pd
import sklearn
metric = evaluate.load("accuracy")
torch.cuda.empty_cache()
def train(model, name, seed, gradient_accumulation_steps, mixup, threshold, lambda_value, mixepoch, tmix, mixlayer, train_dataset, test_dataset, num_epochs, learning_rate, batch_size, test_batch_size):
    """Training loop for a BERT/RoBERTa word-classification model on the given train dataset,
    optionally applying MixUp on the pooled hidden states (mixup) or inside the encoder (tmix)."""
    #set_seed(seed)
    #if model_name[0] == "b":
    #    model = BertForWordClassification.from_pretrained(model_name).to("cuda")
    #elif model_name[0] == "r":
    #    model = RobertaForWordClassification.from_pretrained(model_name).to("cuda")
    print("batch size: ", batch_size)
    print("test batch size: ", test_batch_size)
    print("mix up: ", mixup)
    model.train().to("cuda")
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
    num_training_steps = num_epochs * len(train_dataloader)
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8, weight_decay=0.1)
    lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=10, num_training_steps=num_training_steps)
    for epoch in range(num_epochs):
        #for param_tensor in model.state_dict():
        #    print(param_tensor, "\t", model.state_dict()[param_tensor])
        for batch in tqdm(train_dataloader):  # batch loop assumed here; `batch` is referenced below but was otherwise undefined
            batch = tuple(t.to("cuda") for t in batch)  # make sure every tensor in the batch lives on the GPU
            if name[0] == "b":
                if tmix == False:
                    inputs = {'input_ids': batch[0],
                              'attention_mask': batch[1],
                              'token_type_ids': batch[2],
                              'start_position': batch[3],
                              'end_position': batch[4],
                              'labels': batch[5]}
                    labels = batch[5]
                    start_positions = batch[3]
                    end_positions = batch[4]
                if tmix == True:
                    if epoch == mixepoch:
                        inputs = {'input_ids': batch[0],
                                  'attention_mask': batch[1],
                                  'token_type_ids': batch[2],
                                  'start_position': batch[3],
                                  'end_position': batch[4],
                                  'labels': batch[5],
                                  'mixepoch': True,
                                  'mixlayer': mixlayer,  # index of the encoder layer to mix in
                                  'lambda_value': lambda_value}
                    else:
                        inputs = {'input_ids': batch[0],
                                  'attention_mask': batch[1],
                                  'token_type_ids': batch[2],
                                  'start_position': batch[3],
                                  'end_position': batch[4],
                                  'labels': batch[5],
                                  'mixepoch': False,
                                  'mixlayer': mixlayer,
                                  'lambda_value': lambda_value}
            if name[0] == "r":
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'start_position': batch[2],
                          'end_position': batch[3],
                          'labels': batch[4]}
                labels = batch[4]
                start_positions = batch[2]
                end_positions = batch[3]
            outputs = model(**inputs)
            loss = outputs[0]
            print("Loss: ", loss.item())
            loss.backward()  # retain_graph is not needed: the MixUp pass below works on detached hidden states
            #if (index+1) % gradient_accumulation_steps == 0:  # gradient accumulation is currently disabled
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            model.zero_grad()
            if mixup == True:
                # interpolate pairs of hidden states (outputs[2]) and their labels
                new_matrix_batch, new_labels_batch = mixup_function(outputs[2], labels, lambda_value, threshold)
                # pool each mixed sequence over its annotated span
                span_output = torch.randn(new_matrix_batch.shape[0], new_matrix_batch.shape[-1]).to("cuda")
                for i in range(new_matrix_batch.shape[0]):
                    span_output[i] = new_matrix_batch[i][start_positions[i]:end_positions[i]].mean(dim=0)
                logits = model.classifier(span_output.detach())
                loss_2 = SoftCrossEntropyLoss(logits.view(-1, 2).to("cuda"), new_labels_batch.view(-1).to("cuda"), lambda_value)
                print("MixUp Loss: ", loss_2.item())
                # update the entire model on the mixed examples as well
                loss_2.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()
        evaluation_test = evaluation.evaluate_model(model, name, test_dataset, learning_rate, test_batch_size)
        evaluation_train = evaluation.evaluate_model(model, name, train_dataset, learning_rate, test_batch_size)
        print("DEV: ", evaluation_test)
        print("TRAIN: ", evaluation_train)
def SoftCrossEntropyLoss(logits, target, l):
    """Cross-entropy loss that also accepts the soft (mixed) labels produced by MixUp.

    Partially reconstructed: the per-example loop comes from the original fragment, while
    the log-softmax, the `results` buffer and the mean reduction are assumptions."""
    results = torch.empty(0, device="cuda")
    for i in range(logits.shape[0]):
        lg = logits[i:i+1, :]
        logprobs = torch.nn.functional.log_softmax(lg, dim=1)  # log-probabilities for this example (assumed)
        t = target[i]
        value = t.item()  # gets the label value (0., 1., or a mixed value such as 0.4)
        if value == 1 or value == 0:
            one_hot = torch.tensor([1 - value, value], device='cuda:0')  # one-hot vector, e.g. [0., 1.]
            loss_clear_labels = -((one_hot[0] * logprobs[0][0]) + (one_hot[1] * logprobs[0][1]))
            results = torch.cat((loss_clear_labels.view(1), results), dim=0)
        value_r = round(value, 1)  # to make it comparable to lambda_value, e.g. 0.4
        if value_r == l:
            # create the soft label vector, e.g. [l, 1-l]
            mixed_vec = torch.tensor([l, 1 - l], device='cuda')
            loss_mixed_labels = -((mixed_vec[0] * logprobs[0][0]) + (mixed_vec[1] * logprobs[0][1]))
            results = torch.cat((loss_mixed_labels.view(1), results), dim=0)
    return results.mean()
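# Illustrative check (an assumption, not part of the original training code): with hard 0/1
# labels the soft cross entropy above reduces to an ordinary negative log-likelihood, so it
# can be compared against torch.nn.functional.nll_loss on the same log-probabilities.
def _soft_ce_sanity_check():
    if not torch.cuda.is_available():  # the loss above builds its label vectors on the GPU
        return
    logits = torch.randn(4, 2, device="cuda")
    hard_labels = torch.tensor([0., 1., 1., 0.], device="cuda")
    soft = SoftCrossEntropyLoss(logits, hard_labels, l=0.4)
    nll = torch.nn.functional.nll_loss(torch.nn.functional.log_softmax(logits, dim=1), hard_labels.long())
    print(float(soft), float(nll))  # the two values should agree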
def mixup_function(batch_of_matrices, batch_of_labels, l, t):
    """Interpolate consecutive pairs of hidden-state matrices and their labels (MixUp).

    Partially reconstructed: the loop over pairs and the return statement are assumptions;
    the original fragment only contained the body for a single pair."""
    results = []
    result_labels = []
    runs = math.floor(batch_of_matrices.size()[0] / 2)
    counter = 0
    print("doing interpolation with lambda: {0} and threshold: {1}...".format(l, t))
    for run in range(runs):
        matrix1 = batch_of_matrices[counter]
        label1 = batch_of_labels[counter]
        matrix2 = batch_of_matrices[counter + 1]
        label2 = batch_of_labels[counter + 1]
        new_matrix, new_label = interpolate(matrix1, label1, matrix2, label2, l, t)
        if new_matrix is not None:
            results.append(new_matrix)
            result_labels.append(new_label)
        counter += 2
    results = torch.stack(results)
    result_labels = torch.stack(result_labels)  # torch.LongTensor(result_labels)
    return results, result_labels
def interpolate(matrix1, label1, matrix2, label2, l, threshold):
    new_matrix = (matrix1 * l) + (matrix2 * (1 - l))
    new_label = (label1 * l) + (label2 * (1 - l))
    #if new_label > 0.5 + threshold:
    #    new_label = 1
    #elif new_label < 0.5 - threshold:
    #    new_label = 0
    #else:
    #    print("in undefined zone")
    #    return None, None
    return new_matrix, new_label  # torch.tensor([new_label])
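# Minimal sketch (toy shapes, an assumption rather than the original experiment setup): run
# the MixUp helpers above on dummy tensors to see how four examples collapse into two
# interpolated ones with lambda = 0.4 and threshold = 0.2.
def _mixup_sanity_check():
    dummy_hidden = torch.randn(4, 8, 16)           # stand-in for (batch, seq_len, hidden) encoder output
    dummy_labels = torch.tensor([0., 1., 1., 0.])  # binary labels as floats so they can be interpolated
    mixed_states, mixed_labels = mixup_function(dummy_hidden, dummy_labels, l=0.4, t=0.2)
    print(mixed_states.shape, mixed_labels)        # expected: torch.Size([2, 8, 16]) and two soft labels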
def train_salami(model, seed, train_set, test_set, batch_size, test_batch_size, learning_rate, epochs):
    """Train and evaluate the model with the Hugging Face Trainer API, returning the eval results."""
    results = []
    #for num_run, seed in enumerate(random.sample(range(1, 100), num_runs)):
    #if model_name[0] == "b":
    #    model = BertForWordClassification.from_pretrained(model_name)
    #else:
    #    model = RobertaForWordClassification.from_pretrained(model_name)
    #set_seed(seed)
    training_args = TrainingArguments(
        output_dir="./results",                      # output directory
        num_train_epochs=epochs,                     # total number of training epochs
        per_device_train_batch_size=batch_size,      # batch size per device during training
        per_device_eval_batch_size=test_batch_size,  # batch size for evaluation
        warmup_steps=10,                             # number of warmup steps for the learning rate scheduler
        weight_decay=0.1,                            # strength of weight decay
        learning_rate=learning_rate,
        evaluation_strategy="no",                    # evaluate never, per epoch, or every eval_steps
        eval_steps=10,
        logging_dir="./logs",                        # directory for storing logs
        seed=seed,                                   # explicitly set the seed
        save_strategy="no",                          # do not save checkpoints
    )
    trainer = Trainer(
        model=model,
        train_dataset=train_set,
        eval_dataset=test_set,
        args=training_args,
    )
    trainer.train()
    test_set_results = trainer.evaluate()
    results.append(test_set_results)
    print(test_set_results)
    return results
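# Hypothetical usage sketch (the dataset variables and the word-classification model class are
# placeholders taken from the commented-out code above, not defined in this module):
#
#   model = BertForWordClassification.from_pretrained("bert-base-uncased")
#   results = train_salami(model, seed=42, train_set=train_dataset, test_set=dev_dataset,
#                          batch_size=16, test_batch_size=64, learning_rate=2e-5, epochs=3)
#   print(results)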