diff --git a/Code/models.py b/Code/models.py index c903d11661d506d7b4a23eba0f94b584ecdb0f6c..f488662e9b5a9c027e6de596fd58be90c7d3954f 100644 --- a/Code/models.py +++ b/Code/models.py @@ -58,20 +58,26 @@ class WordClassificationModel(torch.nn.Module): model and the computed loss value. """ - def __init__(self, config_name, tmix=False, imdb=False): #mixlayer=-1, lambda_value=0.0): + def __init__(self, config_name, tmix=False, imdb=False, mlp_flag=False): #mixlayer=-1, lambda_value=0.0): super(WordClassificationModel, self).__init__() self.tmix=tmix self.imdb=imdb + self.mlp_flag=mlp_flag #self.mixlayer=mixlayer if tmix: print("initializing BertModelTMix") - self.embedding_model=BertModelTMix(config=AutoConfig.from_pretrained(config_name)).to(device) + self.embedding_model=BertModelTMix.from_pretrained(config_name, config=AutoConfig.from_pretrained(config_name)).to(device) else: self.embedding_model=AutoModel.from_pretrained(config_name, config=AutoConfig.from_pretrained(config_name)).to(device) - - self.dropout=nn.Dropout(0.1) - self.classifier = nn.Linear(768, 2) + if mlp_flag==False: + print("Using Linear Classifier") + self.classifier=nn.Linear(768, 2) + elif mlp_flag==True: + print("Using two layer Multi Layer Perceptron") + self.classifier=nn.Sequential(nn.Linear(768, 128), nn.Tanh(), nn.Linear(128, 2)) + + self.dropout=nn.Dropout(0.1) def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, start_position=None, end_position=None, labels=None, mixepoch=False, mixlayer=None, lambda_value=None): if self.tmix==True: @@ -82,9 +88,8 @@ class WordClassificationModel(torch.nn.Module): position_ids=position_ids, head_mask=head_mask, return_dict=False, - output_hidden_states=False, - labels=labels, - mixepoch=mixepoch, + output_hidden_states=False, + labels=labels, mixlayer=mixlayer, lambda_value=lambda_value) else: @@ -107,14 +112,11 @@ class WordClassificationModel(torch.nn.Module): logits = self.classifier(span_output) else: - span_output=torch.randn(output.shape[0], output.shape[-1]).to(output.device) - for i in range(output.shape[0]): - span_output[i]=output[i].mean(dim=0) + span_output=torch.mean(output, 1) logits=self.classifier(span_output) if self.tmix==True and mixepoch == True: outputs = (logits,) + outputs[2:] - loss = train.cross_entropy(logits[:math.floor((logits.size()[0]/2))], outputs[1][:math.floor((outputs[1].size()[0]/2))], lambda_value) #special CEL for soft labels outputs = (loss,) + outputs @@ -152,7 +154,7 @@ class BertForWordClassification(BertPreTrainedModel): self.bert=BertModel(config) self.dropout=nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) #selbst machen!! + self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) self.init_weights() @@ -167,7 +169,7 @@ class BertForWordClassification(BertPreTrainedModel): head_mask=head_mask) output = outputs[0] - output = self.dropout(output) #apply droput + output = self.dropout(output) span_output = torch.randn(output.shape[0],output.shape[-1]).to(output.device) for i in range(output.shape[0]): span_output[i] = output[i][start_position[i]:end_position[i]].mean(dim=0) @@ -208,7 +210,7 @@ class RobertaForWordClassification(RobertaPreTrainedModel): self.roberta=RobertaModel(config) self.dropout=nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) #selbst machen!! 
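# Illustrative sketch, not part of the diff: the hunk above swaps a per-example
# Python loop for torch.mean(output, 1). A quick self-contained check (tensor
# sizes are made up) that the vectorised call gives the same sequence-dimension
# mean as the old loop:
import torch

output = torch.randn(3, 7, 768)                    # (batch, seq_len, hidden)

looped = torch.empty(output.shape[0], output.shape[-1])
for i in range(output.shape[0]):                   # old style: fill row by row
    looped[i] = output[i].mean(dim=0)

vectorised = torch.mean(output, 1)                 # new style used in the diff
assert torch.allclose(looped, vectorised, atol=1e-6)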
+ self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) self.init_weights() @@ -220,8 +222,8 @@ class RobertaForWordClassification(RobertaPreTrainedModel): position_ids=position_ids, head_mask=head_mask) - output = outputs[0] #get outputs from bert - output = self.dropout(output) #apply droput + output = outputs[0] + output = self.dropout(output) span_output = torch.randn(output.shape[0],output.shape[-1]).to(output.device) for i in range(output.shape[0]): span_output[i] = output[i][start_position[i]:end_position[i]].mean(dim=0) @@ -305,7 +307,6 @@ class BertModelTMix(BertPreTrainedModel): output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, labels=None, - mixepoch=False, mixlayer=None, lambda_value=None ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: @@ -406,7 +407,6 @@ class BertModelTMix(BertPreTrainedModel): output_hidden_states=output_hidden_states, return_dict=return_dict, labels=labels, - mixepoch=mixepoch, mixlayer=mixlayer, lambda_value=lambda_value ) @@ -460,12 +460,10 @@ class BertTMixEncoder(torch.nn.Module): output_hidden_states: Optional[bool] = False, return_dict: Optional[bool] = True, mixlayer: int = None, - lambda_value: float=0.0, - mixepoch: bool = False) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + lambda_value: float=0.0) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None - all_labels=() next_decoder_cache = () if use_cache else None for i, layer_module in enumerate(self.layer): @@ -509,8 +507,7 @@ class BertTMixEncoder(torch.nn.Module): output_attentions, lambda_value, mixlayer=mixlayer, - nowlayer=i, - mixepoch=mixepoch + nowlayer=i ) hidden_states = layer_outputs[0] @@ -573,10 +570,9 @@ def forward_new(forward): return_dict: Optional[bool] = True, lambda_value: float=0.4, mixlayer: list=None, - nowlayer: int=0, - mixepoch: bool=False)-> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + nowlayer: int=0)-> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: new_matrices=[] - if nowlayer == mixlayer and mixepoch==True: + if nowlayer == mixlayer: runs = math.floor(hidden_states.size()[0]/2) counter=0 new_attention_masks=[] @@ -599,12 +595,10 @@ def forward_new(forward): try: index1=((attention_mask_1[0][0]== -10000.).nonzero(as_tuple=False)[0]).item() except IndexError: - print(attention_mask_1.size()) index1=attention_mask_1.size()[0] try: index2=((attention_mask_2[0][0]== -10000.).nonzero(as_tuple=False)[0]).item() except IndexError: - print(attention_mask_2.size()) index2=attention_mask_2.size()[0] if index1>= index2: selected_attention_mask=attention_mask_1 @@ -625,9 +619,8 @@ def forward_new(forward): new_attention_masks=torch.stack(new_attention_masks).to(device) new_labels=torch.Tensor(new_labels).to(device) - #when performing interpolation, pass back th new hidden states and labels - outputs=forward(self, hidden_states=new_matrices, head_mask=head_mask, attention_mask=new_attention_masks, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, past_key_value=past_key_values, output_attentions=output_attentions) #I"m a bit confused here... do we have to add self or rather not? 
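# Sketch only: the pairing and interpolation itself happens in unchanged lines
# that the diff does not show. TMix-style mixing combines the hidden states of
# the first and second half of a batch at one chosen encoder layer,
# h_mix = lambda * h_i + (1 - lambda) * h_j (lambda_value defaults to 0.4 above);
# the exact pairing below is an assumption.
import torch

def mix_hidden_states(hidden_states: torch.Tensor, lam: float) -> torch.Tensor:
    """Interpolate example i with example i + batch_size/2."""
    half = hidden_states.size(0) // 2
    h1, h2 = hidden_states[:half], hidden_states[half:2 * half]
    return lam * h1 + (1.0 - lam) * h2

mixed = mix_hidden_states(torch.randn(8, 128, 768), lam=0.4)
assert mixed.shape == (4, 128, 768)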
+ #when performing interpolation, pass back the new hidden states and labels + outputs=[new_matrices, new_attention_masks] labels=copy.deepcopy(new_labels) else: diff --git a/Code/preprocess.py b/Code/preprocess.py index a3586e19d63e8e8207172ecc8ae0bfff442173c2..7c179cae581441849b2b9d185076a81412ff25c4 100644 --- a/Code/preprocess.py +++ b/Code/preprocess.py @@ -16,16 +16,16 @@ import os import pandas as pd import sklearn -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #metric=evaluate.load("accuracy") torch.cuda.empty_cache() def reposition(dp, old_dataset=False): - """Reposition fucntion to find the character level indices of the metonymy (to map back in tokenier_new + """Reposition function to find the character level indices of the metonymy (to map back in tokenizer_new function by char_to_tokens) params: - dp -> json readin of li et al shaped dataset + dp -> json read-in of a Li et al. shaped dataset (or the original dataset) + old_dataset: bool -> whether the original dataset is used instead of the Li et al. data (True: original, False: Li et al.) returns: new_start -> int: new start position of metonymy on character level (including whitespaces) @@ -34,7 +34,6 @@ def reposition(dp, old_dataset=False): new_end=0 if old_dataset ==False: new_dp= " ".join(dp["sentence"]).lower() - if dp["pos"][0]==0: new_start=len(" ".join(dp["sentence"][:dp["pos"][0]])) else: @@ -86,62 +85,35 @@ def tokenizer_new(tokenizer, input, max_length, masked=False, old_dataset=False, dp["sentence"][dp["pos"][0] : dp["pos"][1]] == "<mask>" - #if old_dataset == False: - #find new char-pos for metonymic word - #new_start_pos, new_end_pos = reposition(dp) - #old_target=" ".join(dp["sentence"][dp["pos"][0]:dp["pos"][1]]).lower() - - #### implement rest: tokenize metonymic sentence and encode the rest and pad with it - - - #old_target=new_dp[dp["pos"][0]: dp["pos"][1]].lower() #old target already on character level - - #assert new_dp[new_start_pos:new_end_pos].strip() == "".join(dp["sentence"][dp["pos"][0]: dp["pos"][1]]).lower() if old_dataset == False: + #if Li et al dataset: reposition positions on character level, encode sentence and extract target new_start_pos, new_end_pos = reposition(dp, old_dataset=False) new_dp= " ".join(dp["sentence"]).lower() encoded_inp=tokenizer.encode_plus(new_dp, add_special_tokens=True, max_length=max_length, padding="max_length", truncation=True) - #tf_tokens=tokenizer.convert_ids_to_tokens(encoded_inp["input_ids"]) - #print(tf_tokens) - #print(typ(encoded_inp)) - #print("start pos: ", new_start_pos) - #print("end pos: ", new_end_pos) - #print("sentence: ", new_dp) old_target="".join(dp["sentence"][dp["pos"][0]:dp["pos"][1]]).lower() else: - #print("new dataset") + #if old dataset (original Markert SemEval): reposition and add context new_start_pos, new_end_pos = reposition(dp, old_dataset=True) new_dp= " ".join(dp["sentence"][1]).lower() encoded_inp=tokenizer.encode_plus(new_dp, add_special_tokens=True) #dont add max length and padding so we can do it manually length_metonymies = len(encoded_inp["input_ids"]) context_len=max_length - length_metonymies #length of how much context tokens we can add.
We add from left to right - #print("metonymy sentece: ", new_dp) - #print("metonymy sentence inputs: ", encoded_inp) - #print("context length: ", context_len) inp_before=" ".join(dp["sentence"][0]).lower() - #print("input before: ", inp_before) encoded_inp_before=tokenizer.encode_plus(inp_before, add_special_tokens=True) #encode before and after context - - #print("encoded inputs before: ", encoded_inp_before) - #print("\n") inp_after=" ".join(dp["sentence"][2]).lower() - #print("input after: ", inp_after) encoded_inp_after=tokenizer.encode_plus(inp_after , add_special_tokens=True) - #print("encoded inputs after: ", encoded_inp_after) - #print("\n") - #print("\n") #Preprare input for new dictionary with context context_input_ids=[] context_attention_masks=[] - if tokenizer.name_or_path[0] == "b": #BER Tokenizer has token type ids too + if tokenizer.name_or_path[0] == "b": #BERT Tokenizer has token type ids too context_token_type_ids=[] length_before=len(encoded_inp_before["input_ids"]) length_after=len(encoded_inp_after["input_ids"]) - + #Pad before if length_before>=context_len/2 and length_after>=context_len/2: index_before=int(context_len/2) index_after=int(context_len/2) @@ -155,6 +127,7 @@ def tokenizer_new(tokenizer, input, max_length, masked=False, old_dataset=False, else: index_after=int(math.ceil(wanted_from_after)) + #Pad after elif length_after<context_len/2 and length_before>=context_len/2: index_after=length_after difference_after=(context_len/2)-length_after @@ -168,108 +141,37 @@ def tokenizer_new(tokenizer, input, max_length, masked=False, old_dataset=False, index_before=length_before index_after=length_after - #print("len before: ", length_before) - #print("len after: ", length_after) - #print("index_before: ", index_before) - #print("index_after: ", index_after) - #print("not used: ", context_len-index_before-index_after) - - #Use the calculated indices to append the right tokens and pad to 512 if needed, recalculate metonymy position and prepare for decoding metonymy - - #before_decoded="".join(tokenizer.decode(encoded_inp_before["input_ids"][length_before-index_before:length_before])) - #if tokenizer.name_or_path[0]=="b": - # before_decoded.replace("[CLS]", "") #.replace(" [SEP]", "") - #print(before_decoded) - context_input_ids=context_input_ids + encoded_inp_before["input_ids"][length_before-index_before:length_before] context_input_ids=context_input_ids + encoded_inp["input_ids"] context_input_ids=context_input_ids + encoded_inp_after["input_ids"][length_after-index_after:length_after] context_input_ids=context_input_ids+([0]*(512-len(context_input_ids))) #pad - #print("new input ids: ", len(context_input_ids)) context_attention_masks= context_attention_masks+encoded_inp_before["attention_mask"][length_before-index_before:length_before] context_attention_masks=context_attention_masks+encoded_inp["attention_mask"] context_attention_masks=context_attention_masks+encoded_inp_after["attention_mask"][length_after-index_after:length_after] context_attention_masks=context_attention_masks+([0]* (512-len(context_attention_masks))) #pad - #print("new attention maks: ", len(context_attention_masks)) - if tokenizer.name_or_path[0] == "b": #BER Tokenizer has token type ids too + if tokenizer.name_or_path[0] == "b": #BERT Tokenizer has token type ids too context_token_type_ids=context_token_type_ids + encoded_inp_before["token_type_ids"][length_before-index_before:length_before] context_token_type_ids=context_token_type_ids +encoded_inp["token_type_ids"] 
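# Simplified sketch of the context budgeting above (the helper name and exact
# rounding are assumptions; the real code also tracks attention masks and
# token_type_ids): each side of the metonymy sentence gets up to context_len/2
# tokens, and a short side donates its unused budget to the other side.
import math

def split_context(context_len, length_before, length_after):
    half = context_len / 2
    if length_before >= half and length_after >= half:
        return int(half), int(half)
    if length_before < half <= length_after:
        spare = half - length_before
        return length_before, min(length_after, int(math.ceil(half + spare)))
    if length_after < half <= length_before:
        spare = half - length_after
        return min(length_before, int(math.ceil(half + spare))), length_after
    return length_before, length_after             # both sides fit completely

assert split_context(100, 80, 80) == (50, 50)
assert split_context(100, 10, 200) == (10, 90)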
context_token_type_ids=context_token_type_ids +encoded_inp_after["token_type_ids"][length_after-index_after:length_after] context_token_type_ids=context_token_type_ids+([0]*(512-len(context_token_type_ids))) - #print("new token type ids: ", len(context_token_type_ids)) assert len(context_token_type_ids) == 512 - + + #make sure we pad to maximum assert len(context_input_ids) == 512 and len(context_attention_masks) == 512 - print(len(context_input_ids)) - - #get tokeniized words for before sentence and the metonymy sentence + + #get tokenized words for before sentence and the metonymy sentence tokenized_before=[] for i in range(len(" ".join(dp["sentence"][0]).lower())): tokenized_before.append((encoded_inp.char_to_token(i, sequence_index=0))) - #print(tokenized_before) - - #tokenized_words = [] - #for i in range(len(new_dp)): #range(len(new_dp)) - # tokenized_words.append((encoded_inp.char_to_token(i, sequence_index=0))) - #print(tokenized_words) - - #span=[] - - #for i in tokenized_words[new_start_pos:new_end_pos]: - # if i is not None: - # span.append(i+len(encoded_inp_before["input_ids"])) - - #new_start_pos=new_start_pos+len(encoded_inp_before["input_ids"]) #update inces by adding the number of tokens that are in before sentence - #new_end_pos=new_end_pos+len(encoded_inp_before["input_ids"]) - #print(span) - #indices_to_tokens=list(set(span)) - #indices_to_tokens.sort() - #print(indices_to_tokens) - #if len(indices_to_tokens)==1: - # print("decoding 1") - # decoded="".join(tokenizer.decode(context_input_ids[indices_to_tokens[0]])).strip().replace(" ", "") - #else: - # print("decoding 2") - # #print("indices_to_tokens: ", indices_to_tokens) - # decoded="".join(tokenizer.decode(context_input_ids[indices_to_tokens[0]:indices_to_tokens[-1]+1])).strip().replace(" ", "") - #print(decoded) old_target="".join(dp["sentence"][1][dp["pos"][0]:dp["pos"][1]]).lower() - #print("old_target: ", old_target) - #make an encoded_inp dictionary -> not needed, because we use lists directly - #encoded_inp={"input_ids": context_input_ids, "attention_mask": context_attention_masks} - - #if tokenizer.name_or_path[0] =="b": - # encoded_inp["token_type_ids"]=context_token_type_ids - #print(encoded_inp) - - - #print(len(encoded_inp["input_ids"])) - - #li et al approach - """ - if old_dataset==False: - orig_to_tok_index2=[] - all_tokens2 = ['[CLS]'] - for (i, token) in enumerate(dp["sentence"]): - orig_to_tok_index2.append(len(all_tokens2)) - sub_tokens = tokenizer#.tokenize(token) - for sub_token in sub_tokens: - all_tokens2.append(sub_token) - orig_to_tok_index2.append(len(all_tokens2)) - new_target="".join(tf_tokens[orig_to_tok_index2[dp["pos"][0]]:orig_to_tok_index2[dp["pos"][1]]]).replace("##", "").lower() - print("orig to tok index: ", [orig_to_tok_index2[dp["pos"][0]], orig_to_tok_index2[dp["pos"][1]]]) - print("new_target: ", repr(new_target)) - """ tokenized_words = [] - for i in range(len(new_dp)): #range(len(new_dp)) - #if(new_dp[i])==" ": - # continue #spaces are connected with the words with the roberta tokenizer and are thus always mapped to None + for i in range(len(new_dp)): tokenized_words.append((encoded_inp.char_to_token(i, sequence_index=0))) span=[] @@ -279,68 +181,35 @@ def tokenizer_new(tokenizer, input, max_length, masked=False, old_dataset=False, span.append(i+index_before) else: span.append(i) - #if old_dataset==True: - # new_start_pos=new_start_pos+len(encoded_inp_before["input_ids"]) #update inces by adding the number of tokens that are in before sentence - # 
new_end_pos=new_end_pos+len(encoded_inp_before["input_ids"]) indices_to_tokens=list(set(span)) indices_to_tokens.sort() - #print(indices_to_tokens) - #print("indices to tokens: ", indices_to_tokens) + #decode new positioned tokens to check for false mapping if old_dataset==False: if len(indices_to_tokens)==1: - #print("decoding 1") decoded="".join(tokenizer.decode(encoded_inp["input_ids"][indices_to_tokens[0]])).strip().replace(" ", "") else: - #print("decoding 2") - #print("indices_to_tokens: ", indices_to_tokens) decoded="".join(tokenizer.decode(encoded_inp["input_ids"][indices_to_tokens[0]:indices_to_tokens[-1]+1])).strip().replace(" ", "") else: if len(indices_to_tokens)==1: - #print("decoding 1") decoded="".join(tokenizer.decode(context_input_ids[indices_to_tokens[0]])).strip().replace(" ", "") else: - #print("decoding 2") - #print("indices_to_tokens: ", indices_to_tokens) decoded="".join(tokenizer.decode(context_input_ids[indices_to_tokens[0]:indices_to_tokens[-1]+1])).strip().replace(" ", "") - - #print("newly_decoded: ", decoded) - - #old_dp=" ".join(dp["sentence"]).lower() - #print(old_dp) - #old_target="".join(old_dp[dp["pos"][0]: dp["pos"][1]]).lower() - #old_target="".join(dp["sentence"][dp["pos"][0]:dp["pos"][1]]).lower() + if old_target!=decoded: print("wrong mapping") - if old_dataset == True: - print("new_start_pos: ", new_start_pos) - print("lenght of before: ", len(encoded_inp_before["input_ids"])) - print("lengh of after: ", len(encoded_inp_after["input_ids"])) - print("after input ids: ", encoded_inp_after["input_ids"]) - print("Used from before: ", index_before) - print("Used from after: ", index_after) - print("metonomy sentence length: ", len(encoded_inp["input_ids"])) - print("left for filling: ", context_len) - print("indices to tokens: ", indices_to_tokens) - print("decoded: ", decoded) - print("old target: ", old_target) - print(dp) - #print(old_dp) - #mapping_counter+=1 continue - + all_start_positions.append(indices_to_tokens[0]) all_end_positions.append(indices_to_tokens[-1]+1) all_labels.append(dp["label"]) if old_dataset==False: all_input_ids.append(encoded_inp["input_ids"]) - #print("len input ids: ", len(all_input_ids)) all_attention_masks.append(encoded_inp["attention_mask"]) else: all_input_ids.append(context_input_ids) - #print("len input ids: ", len(all_input_ids)) all_attention_masks.append(context_attention_masks) if tokenizer.name_or_path[0] == "b": @@ -349,37 +218,33 @@ def tokenizer_new(tokenizer, input, max_length, masked=False, old_dataset=False, else: all_token_type_ids.append(context_token_type_ids) - #if tokenizer.name_or_path[0] == "b": - # print(len(all_start_positions)) - - #print("len end pos: ", len(all_end_positions)) - #print("len all labels: ", len(all_labels)) - #print("len attention masks: ", len(all_attention_masks[0])) - #print("len start pos: ", len(all_start_positions)) - #print("len toke type ids: ", len(all_token_type_ids[0])) - if tokenizer.name_or_path[0] == "r": #if tokenizer is roberta we dont have token_type ids print("roberta tokenizer") - dataset=TensorDataset(torch.tensor(all_input_ids, dtype=torch.long).to(device) , - torch.tensor(all_attention_masks, dtype=torch.long).to(device) , - torch.tensor(all_start_positions,dtype=torch.long).to(device), - torch.tensor(all_end_positions, dtype=torch.long).to(device), - torch.tensor(all_labels,dtype=torch.long).to(device)) + dataset=TensorDataset(torch.tensor(all_input_ids, dtype=torch.long).to("cuda") , + torch.tensor(all_attention_masks, dtype=torch.long).to("cuda") , + 
torch.tensor(all_start_positions,dtype=torch.long).to("cuda"), + torch.tensor(all_end_positions, dtype=torch.long).to("cuda"), + torch.tensor(all_labels,dtype=torch.long).to("cuda")) if tokenizer.name_or_path[0] =="b": print("bert tokenizer") - dataset=TensorDataset(torch.tensor(all_input_ids, dtype=torch.long).to(device), - torch.tensor(all_attention_masks, dtype=torch.long).to(device), - torch.tensor(all_token_type_ids, dtype=torch.long).to(device), - torch.tensor(all_start_positions,dtype=torch.long).to(device), - torch.tensor(all_end_positions, dtype=torch.long).to(device), - torch.tensor(all_labels,dtype=torch.long).to(device)) + dataset=TensorDataset(torch.tensor(all_input_ids, dtype=torch.long).to("cuda"), + torch.tensor(all_attention_masks, dtype=torch.long).to("cuda"), + torch.tensor(all_token_type_ids, dtype=torch.long).to("cuda"), + torch.tensor(all_start_positions,dtype=torch.long).to("cuda"), + torch.tensor(all_end_positions, dtype=torch.long).to("cuda"), + torch.tensor(all_labels,dtype=torch.long).to("cuda")) print("created dataset") - #print(mapping_counter) return dataset def tokenizer_imdb(tokenizer, dataset, max_length): + """Tokenizer for the imdb dataset (used to validate our TMix implementation). + + Params: + tokenizer: AutoTokenizer -> Tokenizer (in our case BERT base uncased) + dataset: list of dicts -> dataset (imdb from huggingface) to be preprocessed + max_length: int -> maximum length for padding/truncation""" all_input_ids=[] all_attention_masks=[] all_token_type_ids=[] @@ -387,17 +252,12 @@ def tokenizer_imdb(tokenizer, dataset, max_length): for dp in dataset: encoded_inp=tokenizer.encode_plus(dp["text"], add_special_tokens=True, max_length=max_length, truncation=True, padding="max_length") - #print("encoded input:",encoded_inp) all_labels.append(dp["label"]) all_input_ids.append(encoded_inp["input_ids"]) all_attention_masks.append(encoded_inp["attention_mask"]) all_token_type_ids.append(encoded_inp["token_type_ids"]) - print("labels: ", len(all_labels)) - print("input_ids: ", len(all_input_ids)) - print("token_type_ids: ", len(all_token_type_ids)) - print("attention_masks: ", len(all_attention_masks)) - dataset=TensorDataset(torch.tensor(all_input_ids, dtype=torch.long).to(device), torch.tensor(all_attention_masks, dtype=torch.long).to(device), torch.tensor(all_token_type_ids, dtype=torch.long).to(device), torch.tensor(all_labels, dtype=torch.long).to(device)) + dataset=TensorDataset(torch.tensor(all_input_ids, dtype=torch.long).to("cuda"), torch.tensor(all_attention_masks, dtype=torch.long).to("cuda"), torch.tensor(all_token_type_ids, dtype=torch.long).to("cuda"), torch.tensor(all_labels, dtype=torch.long).to("cuda")) print("created imdb dataset") return dataset @@ -405,6 +265,7 @@ def tokenizer_imdb(tokenizer, dataset, max_length): class EncodedTokenDataset(torch.utils.data.Dataset): """ + Dataset class used by the salami tokenizer. A dataset, containing encoded sentences, integer labels and the starting and ending position of the target word.
""" @@ -431,7 +292,7 @@ class EncodedTokenDataset(torch.utils.data.Dataset): def salami_tokenizer(tokenizer, input, max_length, masked=False): - + """Salami tokenizer for input sentences (used together with EncodedTokenDataset)""" print("salami tokenizer") bots_token, eots_token = "[bots]", "[eots]" tokenizer.add_tokens([bots_token, eots_token]) diff --git a/Code/train.py b/Code/train.py index 919fb063a8d58f0db930d2af9898da11567e1e53..a215b15503c7748fe1e450158e2b7f26ccea4e22 100644 --- a/Code/train.py +++ b/Code/train.py @@ -11,7 +11,7 @@ from transformers import BertTokenizer, RobertaTokenizer, BertModel, RobertaMode from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from transformers import AdamW, get_scheduler from torch import nn -from torch.nn import CrossEntropyLoss +from torch.nn import CrossEntropyLoss, DataParallel import matplotlib.pyplot as plt import os import pandas as pd @@ -23,49 +23,64 @@ torch.cuda.empty_cache() #with torch.autocast("cuda"): -def train(model, name, imdb, seed,mixup,lambda_value, mixepoch, tmix, mixlayer, train_dataset, test_dataset, num_epochs, learning_rate, batch_size, test_batch_size, model_save_path=None): +def train(model, name, train_dataset, test_dataset, seed, batch_size, test_batch_size, num_epochs, imdb=False, mixup=False, lambda_value=None, mixepoch=None, tmix=False, mixlayer=None, learning_rate=None, mlp_learning_rate=None, model_save_path=None): """Train loop for models. Iterates over epochs and batches and gives inputs to model. After training, call evaluation.py for evaluation of finetuned model. Params: model: model out of models.py name: str - imdb: bool + train_dataset: Dataset + test_dataset: Dataset seed: int + batch_size: int + test_batch_size: int + num_epochs: int + imdb: bool mixup: bool lambda_value: float mixepoch:int tmix: bool mixlayer: int in {0, 11} - train_dataset: Dataset - test_dataset: Dataset - num_epochs: int learning_rate: float - batch_size: - test_batch_size: + mlp_learning_rate: float + - Returns:""" + Returns: evaluation results for the train and test datasets (accuracy, F1, precision and recall)""" model.train().to(device) train_sampler = RandomSampler(train_dataset) - train_dataloader=DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size) + train_dataloader=DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size) #RandomSampler already shuffles; DataLoader rejects sampler combined with shuffle=True num_training_steps=num_epochs*len(train_dataloader) - optimizer=AdamW(model.parameters(), lr=learning_rate, eps=1e-8, weight_decay=0.1) + if mlp_learning_rate is None: + print("initializing one learning rate") + optimizer=AdamW(model.parameters(), lr=learning_rate, eps=1e-8, weight_decay=0.1) + else: + print("initializing separate learning rates") + model=nn.DataParallel(model) + optimizer=AdamW([ + {'params': model.module.embedding_model.parameters(), 'lr': learning_rate}, + {'params': model.module.classifier.parameters(), 'lr': mlp_learning_rate} + ]) lr_scheduler=get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=10, num_training_steps=num_training_steps) model.zero_grad() for epoch in range(num_epochs): - index=0 - for batch in train_dataloader: print(len(batch)) if name[0] == "b": if tmix==False: - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'token_type_ids': batch[2], - 'start_position': batch[3], - 'end_position': batch[4], - 'labels': batch[5]} + if imdb==False: + inputs = {'input_ids': batch[0], + 'attention_mask': batch[1], + 'token_type_ids': batch[2], + 'start_position': batch[3], +
'end_position': batch[4], + 'labels': batch[5]} + if imdb==True: + inputs={'input_ids':batch[0], + 'attention_mask': batch[1], + 'token_type_ids': batch[2], + 'labels': batch[3]} if tmix==True: if imdb == False: print("this is mixup epoch") @@ -100,7 +115,7 @@ def train(model, name, imdb, seed,mixup,lambda_value, mixepoch, tmix, mixlayer, end_positions=batch[3] outputs=model(**inputs) loss=outputs[0] - print("Loss: ", loss) + print("Epoch: {0} Loss: {1}".format(epoch, loss)) loss.backward() optimizer.step() lr_scheduler.step() diff --git a/main.py b/main.py index f969a35d8cfd429eed7140075feee0086dcde2e9..bdf7daab319026b30cb2eb663a1f6ad1393afb2c 100644 --- a/main.py +++ b/main.py @@ -51,8 +51,7 @@ def run(raw_args): test_dataset=Code.preprocess.salami_tokenizer(tokenizer, data_test, args.max_length, masked=args.masking) elif args.tokenizer=="swp": - print("train dataset preprocessing ") - print(args.tcontext) + print("train dataset preprocessing ") train_dataset=Code.preprocess.tokenizer_new(tokenizer, data_train, args.max_length, masked=args.masking, old_dataset=args.tcontext) test_dataset=Code.preprocess.tokenizer_new(tokenizer, data_test, args.max_length, masked=args.masking, old_dataset=False) @@ -66,7 +65,7 @@ def run(raw_args): #train&evaluate... print("training..") if args.train_loop=="swp": - evaluation_test, evaluation_train = Code.train.train(model, args.architecture, args.imdb, args.random_seed, args.mix_up, args.lambda_value, args.mixepoch, args.tmix, args.mixlayer, train_dataset, test_dataset, args.epochs, args.learning_rate, args.batch_size, args.test_batch_size, args.model_save_path) + evaluation_test, evaluation_train = Code.train.train(model, args.architecture, train_dataset, test_dataset, args.random_seed,args.batch_size, args.test_batch_size,args.epochs,args.imdb, args.mix_up, args.lambda_value, args.mixepoch, args.tmix, args.mixlayer, args.learning_rate, args.second_learning_rate, args.model_save_path) elif args.train_loop=="salami": evaluation_test = Code.train.train_salami(model,args.random_seed, train_dataset, test_dataset, args.batch_size, args.test_batch_size, args.learning_rate, args.epochs) else: @@ -111,6 +110,12 @@ if __name__ == "__main__": action="store_true" ) + parser.add_argument( + "--mlp", + help="use two layer multi layer perceptron at the end? (if no, linear classifier)", + action="store_true" + ) + #Datasets parser.add_argument( "-t", @@ -150,7 +155,7 @@ if __name__ == "__main__": "-max", "--max_length", type=int, - help="How big is max length when tokenizing the sentences?") + help="Max sequence length when tokenizing the sentences?") #Train arguments @@ -170,6 +175,14 @@ if __name__ == "__main__": "--learning_rate", type=float, help="Learning rate for training") + + parser.add_argument( + "-lrtwo", + "--second_learning_rate", + type=float, + help="Separate learning rate for multi layer perceptron", + default=None + ) parser.add_argument( "-rs",
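# Sketch of the new two-learning-rate setup wired up by --second_learning_rate:
# the encoder and the classifier head get their own AdamW parameter groups.
# The stub model and the learning-rate values are assumptions; the attribute
# names embedding_model/classifier follow the diff, and torch.optim.AdamW is
# used here so the example stands alone.
import torch
from torch import nn
from torch.optim import AdamW

class StubModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding_model = nn.Linear(768, 768)   # stands in for the BERT encoder
        self.classifier = nn.Sequential(nn.Linear(768, 128), nn.Tanh(), nn.Linear(128, 2))

model = nn.DataParallel(StubModel())                 # same wrapper as in train()
optimizer = AdamW([
    {"params": model.module.embedding_model.parameters(), "lr": 2e-5},
    {"params": model.module.classifier.parameters(), "lr": 1e-3},
])
assert [g["lr"] for g in optimizer.param_groups] == [2e-5, 1e-3]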