Commit 8655276c authored by friebolin
parents d094438e 152b69bc
@@ -42,32 +42,27 @@ def set_seed(seed: int = 42) -> None:
class WordClassificationModel(torch.nn.Module): #use AutoModel from the library
"""This class is needed to enable BERT to work with our input. We apply a dropout layer
and the linear classifier layer to make it a binary decision problem. In the forward step
and the linear classifier layer (or a 2-layer MLP) to make it a binary decision problem. In the forward step
we specify the classification over the span given by end and start position and compute the
loss function with cross entropy. The predictions (logits) are made by our classifier layer."""
def __init__(self, config_name, tmix=False, imdb=False): #mixlayer=-1, lambda_value=0.0):
super(WordClassificationModel, self).__init__()
#self.num_labels=config.num_labels
self.tmix=tmix
self.imdb=imdb
#self.mixlayer=mixlayer
if tmix:
print("initializing BertModelTMix")
self.embedding_model=BertModelTMix(config=AutoConfig.from_pretrained(config_name)).to("cuda")
#print("name or path: ", repr(self.embedding_model.name_or_path))
else:
self.embedding_model=AutoModel.from_pretrained(config_name, config=AutoConfig.from_pretrained(config_name)).to("cuda")
self.dropout=nn.Dropout(0.1) #config.hidden_dropout_prob for BERT: 0.1; hidden_dropout_prob is not present in the RoBERTa config
self.classifier = nn.Linear(768, 2) #first argument: hidden size, defaults to 768; should we change it to 512 like in the email? Also 768 in the other two classes per default
#self.embedding_model.init_weights() #do we need to reimplement this?
self.dropout=nn.Dropout(0.1)
self.classifier = nn.Linear(768, 2)
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, start_position=None, end_position=None, labels=None, mixepoch=False, mixlayer=None, lambda_value=None):
#print("mixepoch? ", mixepoch)
#print("mixlayer in forward: ", mixlayer)
if self.tmix==True:
print("here for bert model tmix")
#print("labels in WordModelClassification: ", labels)
outputs = self.embedding_model(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
@@ -88,22 +83,9 @@ class WordClassificationModel(torch.nn.Module): #use AutoModel from the library
head_mask=head_mask,
return_dict=False,
output_hidden_states=False)
#print("length just after embedding model: ", len(outputs))
#print("outputs from embedding model 1: ", outputs[0].size()) #hidden states of all the batch
#print("outputs from embedding model 2: ", outputs[1].size()) #pooled output, not needed
#print("outputs from embedding model 3: ", outputs[2].size()) #in case of tmix these will be labels
output = outputs[0].to("cuda") #get outputs from bert in the case of tmix, this will be the sequence outputs
#print("outputs: ", outputs)
#print("outputs size: ", len(outputs)) #with tmix: 3, wihtout: 2
#print(output[0].size())
output = self.dropout(output)#apply droput
#print(output)
#span_output = torch.randn(output.shape[0],output.shape[-1]).to(output.device)
#print(span_output)
#for i in range(output.shape[0]):
# span_output[i] = output[i][start_position[i]:end_position[i]].mean(dim=0)
output = outputs[0].to("cuda")
output = self.dropout(output)
if self.imdb==False:
span_output=torch.randn(output.shape[0], output.shape[-1]).to(output.device)
@@ -116,29 +98,19 @@ class WordClassificationModel(torch.nn.Module): #use AutoModel from the library
for i in range(output.shape[0]):
span_output[i]=output[i].mean(dim=0)
logits=self.classifier(span_output)
#print(logits)
#print(logits.size())
#outputs = (logits,) + outputs[2:]
if self.tmix==True and mixepoch == True:
outputs = (logits,) + outputs[2:] #this will result in (logits, labels) and have length of 2
#print(len(outputs))
#print(outputs[0].size())
#print("labels size: ", outputs[1].size())
outputs = (logits,) + outputs[2:]
loss = train.cross_entropy(logits[:math.floor((logits.size()[0]/2))], outputs[1][:math.floor((outputs[1].size()[0]/2))], lambda_value) #special cross-entropy loss for soft labels
#loss = loss_fct(logits.view(-1, 2), labels.view(-1)) #changed from self.num_labels to 2 because our labels are 0 and 1
outputs = (loss,) + outputs
#print("outputs: ", outputs)
#print("outputs size: ", len(outputs))
else:
outputs = (logits,) + outputs[:2]
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, 2), labels.view(-1)) #changed from self.num_labels to 2 because our labels are 0 and 1
loss = loss_fct(logits.view(-1, 2), labels.view(-1))
outputs = (loss,) + outputs
#print("outputs: ", outputs)
#print("outputs size: ", len(outputs))
return outputs
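# NOTE (sketch, not project code): `train.cross_entropy` used above is the
# project's special cross-entropy for the soft labels produced by mixup; its
# implementation lives in train.py and is not shown in this diff. A minimal
# version, assuming `soft_labels` already holds the interpolated values
# lambda*y_i + (1-lambda)*y_j in [0, 1], could look like this:
def soft_label_cross_entropy_sketch(logits, soft_labels):
    """Cross entropy of binary logits against scalar soft labels in [0, 1]."""
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    # Turn each scalar soft label into a two-class target distribution.
    targets = torch.stack([1.0 - soft_labels, soft_labels], dim=-1)
    return -(targets * log_probs).sum(dim=-1).mean()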
@@ -147,11 +119,9 @@ class BertForWordClassification(BertPreTrainedModel): #use AutoModel from the library
"""This class is needed to enable BERT to work with our input. We apply a dropout layer
and the linear classifier layer to make it a binary decision problem. In the forward step
we specify the classification over the span given by end and start position and compute the
loss function with cross entropy. The predictions (logits) are made by our classifier layer."""
loss function with cross entropy. The predictions (logits) are made by our classifier layer.
"""
def __init__(self, config):
#print(config)
#if config[0]=="b":
# BertModel.from_pretrained(config)
super(BertForWordClassification, self).__init__(config)
self.num_labels=config.num_labels
@@ -164,7 +134,7 @@ class BertForWordClassification(BertPreTrainedModel): #use AutoModel from the library
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
start_position=None, end_position=None, labels=None):
#print("in here")
outputs = self.bert(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
@@ -193,9 +163,6 @@ class RobertaForWordClassification(RobertaPreTrainedModel): #use AutoModel from the library
we specify the classification over the span given by end and start position and compute the
loss function with cross entropy. The predictions (logits) are made by our classifier layer."""
def __init__(self, config):
#print(config)
#if config[0]=="b":
# BertModel.from_pretrained(config)
super(RobertaForWordClassification, self).__init__(config)
self.num_labels=config.num_labels
@@ -232,13 +199,8 @@ class RobertaForWordClassification(RobertaPreTrainedModel): #use AutoModel from the library
class BertModelTMix(BertPreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in [Attention is
all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder` argument and
`add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
Model that overrides the forward function of the encoder (copied and slightly modified from
transformers).
"""
def __init__(self, config, add_pooling_layer=True):
@@ -269,13 +231,8 @@ class BertModelTMix(BertPreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads)
def post_init(self):
print("jadijaid")
#@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
#@add_code_sample_docstrings(
# checkpoint=_CHECKPOINT_FOR_DOC,
# output_type=BaseModelOutputWithPoolingAndCrossAttentions,
# config_class=_CONFIG_FOR_DOC,
#)
print("init")
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
@@ -296,7 +253,7 @@ class BertModelTMix(BertPreTrainedModel):
mixlayer=None,
lambda_value=None
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
r"""
"""
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
@@ -380,8 +337,7 @@ class BertModelTMix(BertPreTrainedModel):
inputs_embeds=inputs_embeds,
past_key_values_length=past_key_values_length,
)
#print("mixlayer in Model class: ", mixlayer)
#print("labels in Model class: ", labels)
#call special encoder
encoder_outputs = self.encoder(
embedding_output,
attention_mask=extended_attention_mask,
@@ -399,7 +355,6 @@ class BertModelTMix(BertPreTrainedModel):
lambda_value=lambda_value
)
sequence_output = encoder_outputs[0]
#labels=encoder_outputs[1]
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
if not return_dict:
@@ -416,16 +371,14 @@ class BertModelTMix(BertPreTrainedModel):
class BertTMixEncoder(torch.nn.Module):
"""Used for Tmix. When using Tmix the only change that has to be done, is to be able to modify at which layer to start
training and what the input of that layer is. We can do so, by making i in forward a variable. However, how do we specify
the input of the layer?"""
"""Used for Tmix. When using Tmix the only change that has to be done, is to be able to modify layers in model.
This way, we can apply the Mixup function to a batch of hidden states at a certain layer """
def __init__(self, config):
super().__init__()
self.config = config
BertLayer.forward=forward_new(BertLayer.forward) #Monkey Patch
self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
#self.batch_size=batch_size
def forward(
self,
@@ -447,12 +400,8 @@ class BertTMixEncoder(torch.nn.Module):
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
all_labels=()
#print(labels)
#self.layer.forward=forward_new(self.layer.forward) #Monkeypatch here?
#print("labels in encoder class: ", labels)
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
#print("layer: ", i)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
@@ -483,7 +432,6 @@ class BertTMixEncoder(torch.nn.Module):
encoder_attention_mask,
)
else:
#print("before")
layer_outputs = layer_module(
hidden_states,
labels,
@@ -499,23 +447,10 @@ class BertTMixEncoder(torch.nn.Module):
mixepoch=mixepoch
)
#print("after")
hidden_states = layer_outputs[0]
labels=layer_outputs[1]
#all_labels=all_labels+(labels,)
#internal_batch_size=layer_outputs[2]
#print(positions)
#print("hidden states: ", hidden_states.size()) #should be [batch size, 512, 768]
#print("labels: ", layer_outputs[1]) #should be [batch size]
#positions = layer_outputs[1]
#print(hidden_states)
#if use_cache:
# next_decoder_cache += (layer_outputs[-1],)
#if output_attentions:
# all_self_attentions = all_self_attentions + (layer_outputs[1],)
# if self.config.add_cross_attention:
# all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
@@ -545,7 +480,6 @@ class BertTMixEncoder(torch.nn.Module):
#Monkey patching the forward function of BertLayer for mixup -> use decorators here to call the old forward function on the newly computed hidden_state
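# For a pair of examples (i, j) drawn from the batch, the interpolation performed
# in forward_mix below is the standard mixup rule:
#     h_mix = lambda_value * h_i + (1 - lambda_value) * h_j
#     y_mix = lambda_value * y_i + (1 - lambda_value) * y_j
# The mixed hidden states are then passed to the original BertLayer forward, so the
# loss is computed (and backpropagated) on interpolated representations and soft labels.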
def forward_new(forward):
#print("over here")
def forward_mix(self, hidden_states: torch.Tensor,
labels: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
@@ -561,69 +495,27 @@ def forward_new(forward):
mixlayer: list=None,
nowlayer: int=0,
mixepoch: bool=False)-> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
#print("in Monkey Patch function")
#print("attention mask:", attention_mask.size())#return something
#print("hidden states: ", hidden_states.size())#return something
#print("past key values: ", past_key_values.size())#None
#print("head mask: ", head_mask.size())#None
#print("encoder hidden states: ", encoder_hidden_states.size())#None
#print("labels in forward patch: ", labels)
#runs = math.floor(hidden_states.size()[0]/2) -> we need a different way of calculating the runs, so that we don't count torch.zeros
#print("mixlayer: ", mixlayer)
#print("nowlayer: ", nowlayer)
new_matrices=[]
#print("positions in Monkey Patch: ", positions)
#all_position_lists=copy.deepcopy(positions)
if nowlayer == mixlayer and mixepoch==True:
#print("performing tmix")
#if len(positions) == 0:
# list_positions=[] #if no previous positons -> create new list
#else:
# list_positions=copy.deepcopy(positions[-1]) #else take last input to modify it
#1.step: do interpolation...
#for i in range(hidden_states.size()[0]):
#print("hidden state: ", hidden_states.size())
#if torch.equal(hidden_states[i], torch.zeros([hidden_states[i].size()[0], hidden_states[i].size()[1]])):
# print("not counting")
#list_positions[i] = (-1, -1)
#else:
# runs+=1
#print("batch size in forward patch: ", hidden_states.size()[0])
runs = math.floor(hidden_states.size()[0]/2)
#internal_batch_size=runs
#print("runs: ", runs)
#print("lambda_value: ", lambda_value)
counter=0
#new_matrices=[]
new_attention_masks=[]
new_labels=[]
#print("lists positions: ", len(list_positions))
for i in range(runs):
#if len(list_positions) < 32:
# list_positions.append((counter, counter+1))
#else:
# list_positions[i]=(counter, counter+1)
#print(positions)
#two hidden states, labels and attention masks to perform interpolation
hidden_states_1=hidden_states[counter]
attention_mask_1=attention_mask[counter]
#print("mask 1: ", attention_mask_1)
label_1=labels[counter]
#print("erstes label: ", label_1)
#print(attention_mask_1.size())
hidden_states_2=hidden_states[counter+1]
attention_mask_2=attention_mask[counter+1]
#print("mask 2: ", attention_mask_2)
label_2=labels[counter+1]
#print("zweites Label: ", label_2)
#print(attention_mask_2.size())
#perform interpolation
new_matrix = (lambda_value*hidden_states_1) + ((1-lambda_value)*hidden_states_2) #do interpolation
new_label = (lambda_value*label_1) + ((1-lambda_value)*label_2)
#new_attention_mask = (lambda_value*attention_mask*attention_mask_1) + ((1-lambda_value)*attention_mask_2)
#new_attention_masks.append(new_attention_mask)
#select the longer of the two attention masks
try:
index1=((attention_mask_1[0][0]== -10000.).nonzero(as_tuple=False)[0]).item()
except IndexError:
@@ -634,65 +526,37 @@ def forward_new(forward):
except IndexError:
print(attention_mask_2.size())
index2=attention_mask_2.size()[0]
#print("index 1: ", index1)
#print("index 2: ", index2)
if index1>= index2:
selected_attention_mask=attention_mask_1
#print("attention mask shape", attention_mask_1.size())
else:
selected_attention_mask=attention_mask_2
#print("attention mask shape", attention_mask_2.size())
#print(attention_mask_1)
new_matrices.append(new_matrix) #do interpolations
#add everything to respective lists
new_matrices.append(new_matrix)
new_attention_masks.append(selected_attention_mask)
new_labels.append(new_label)
counter+=2
#print("new labels before padding: ", new_labels)
for i in range(runs, hidden_states.size()[0]):
#Pad to batch size
new_matrices.append(torch.zeros([hidden_states.size()[1], hidden_states.size()[2]]).to("cuda"))
new_labels.append(0)
#list_positions.append((-1, -1))
new_attention_masks.append(torch.zeros([1, 1, hidden_states.size()[1]]).to("cuda"))
new_matrices=torch.stack(new_matrices).to("cuda")
new_attention_masks=torch.stack(new_attention_masks).to("cuda")
#print("nem matrices: ", new_matrices)
#print("new attention masks: ", new_attention_masks)
new_labels=torch.Tensor(new_labels).to("cuda")
#new_labels=torch.stack(new_labels)
#print(new_labels)
#print(new_matrices.size())
#all_position_lists.append(list_positions)
#2.step: feed interpolations to old outputs function for backpropagation
#when performing interpolation, pass back the new hidden states and labels
outputs=forward(self, hidden_states=new_matrices, head_mask=head_mask, attention_mask=new_attention_masks, encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask, past_key_value=past_key_values, output_attentions=output_attentions) #I"m a bit confused here... do we have to add self or rather not?
labels=copy.deepcopy(new_labels)
else:
#compute outputs on hidden states
#print("not performing interpolation")
#when not performing interpolation, pass inputs directly to old forward function
print("not performing interpolation")
outputs=forward(self, hidden_states=hidden_states, head_mask=head_mask, attention_mask=attention_mask, encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask, past_key_value=past_key_values, output_attentions=output_attentions)
labels=copy.deepcopy(labels)
#outputs=list(zip(positions, outputs))
#print("old outputs len: ", len(outputs))
#print(outputs[0].size()) #outputs is only of length 1 because we don't have output attentions etc, so we only give back the last hidden state
#outputs_with_pos=[]
#if len(new_matrices)>0:
# print("performed interpolation, adding new outputs")
#outputs_with_pos.append(outputs[0])
#outputs_with_pos.append(all_position_lists)
#print(len(outputs_with_pos))
#for i in range(len(new_matrices)):
# outputs_with_pos.append([outputs[0][i], positions[i]]) #the list we return has len batch size, each tuple consists of (last hidden state, (position1, position2))
#print("list len: ", len(outputs_with_pos))
#print(len(outputs_with_pos[1]))
#print(len(outputs_with_pos[0]))
return [outputs[0], labels] #, internal_batch_size]
return [outputs[0], labels]
return forward_mix
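# A minimal, self-contained illustration of the decorator-style monkey patch used
# above (toy code under assumed names, not part of the project): the original
# forward is captured once, and the wrapper interpolates the first two items of
# the batch before delegating to it. Unlike the project code, this toy version
# shrinks the batch instead of padding it back to full size with zeros.
def _example_mixup_patch(original_forward):
    def wrapped(self, hidden_states, *args, lam=0.5, do_mix=False, **kwargs):
        if do_mix and hidden_states.size(0) >= 2:
            mixed = lam * hidden_states[0] + (1.0 - lam) * hidden_states[1]
            hidden_states = torch.cat([mixed.unsqueeze(0), hidden_states[2:]], dim=0)
        return original_forward(self, hidden_states, *args, **kwargs)
    return wrapped
# Usage would mirror the patch in BertTMixEncoder.__init__:
#   BertLayer.forward = _example_mixup_patch(BertLayer.forward)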
@@ -23,7 +23,26 @@ torch.cuda.empty_cache()
#with torch.autocast("cuda"):
def train(model, name, imdb, seed,mixup,lambda_value, mixepoch, tmix, mixlayer, train_dataset, test_dataset, num_epochs, learning_rate, batch_size, test_batch_size):
"""Train loop for models. Iterates over epochs and batches and gives inputs to model. After training, call evaluation.py for evaluation of finetuned model."""
"""Train loop for models. Iterates over epochs and batches and gives inputs to model. After training, call evaluation.py for evaluation of finetuned model.
Params:
model: model out of models.py
name: str
imdb: bool
seed: int
mixup: bool
lambda_value: float
mixepoch:int
tmix: bool
mixlayer: int in {0, 11}
train_dataset: Dataset
test_dataset: Dataset
num_epochs: int
learning_rate: float
batch_size:
test_batch_size:
Returns:"""
model.train().to("cuda")
train_sampler = RandomSampler(train_dataset)
train_dataloader=DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
@@ -46,59 +65,27 @@ def train(model, name, imdb, seed,mixup,lambda_value, mixepoch, tmix, mixlayer,
'start_position': batch[3],
'end_position': batch[4],
'labels': batch[5]}
labels=batch[5]
start_positions=batch[3]
end_positions=batch[4]
if tmix==True:
#print("Hello, tmix is set as true")
if epoch == mixepoch:
if imdb == False:
print("this is miuxup epoch")
#print(batch[5])
#print("mixlayer: ", mixlayer)
#print("lambda: ", lambda_value)
inputs={'input_ids': batch[0],
'attention_mask': batch[1],
'token_type_ids': batch[2],
'start_position': batch[3],
'end_position': batch[4],
'labels': batch[5],
'mixepoch': True,
'mixlayer':mixlayer,
'lambda_value':lambda_value}
if imdb==True:
print("this is a mixup epoch with imdb")
inputs={'input_ids':batch[0],
if imdb == False:
print("this is mixup epoch")
inputs={'input_ids': batch[0],
'attention_mask': batch[1],
'token_type_ids': batch[2],
'labels': batch[3],
'start_position': batch[3],
'end_position': batch[4],
'labels': batch[5],
'mixepoch': True,
'mixlayer': mixlayer,
'lambda_value': lambda_value}
else:
if imdb == False:
print("this is a non mixup epoch")
#print(batch[5])
inputs={'input_ids': batch[0],
'attention_mask': batch[1],
'token_type_ids': batch[2],
'start_position': batch[3],
'end_position': batch[4],
'labels': batch[5],
'mixepoch': False,
'mixlayer':mixlayer,
'lambda_value':lambda_value}
elif imdb == True:
print("non mixup epoch with imbd")
inputs={'input_ids': batch[0],
'attention_mask': batch[1],
'token_type_ids': batch[2],
'labels': batch[3],
'mixepoch': False,
'mixlayer': mixlayer,
'mixlayer':mixlayer,
'lambda_value':lambda_value}
if imdb==True:
print("this is a mixup epoch with imdb")
inputs={'input_ids':batch[0],
'attention_mask': batch[1],
'token_type_ids': batch[2],
'labels': batch[3],
'mixepoch': True,
'mixlayer': mixlayer,
'lambda_value': lambda_value}
if name[0] == "r":
......
@@ -3,3 +3,8 @@ numpy==1.23.5
pandas==1.5.2
torch==1.13.0+cu116
tqdm==4.64.1
evaluate==0.3.0
matplotlib==3.5.2
scikit_learn==1.2.1
transformers==4.26.1
\ No newline at end of file