diff --git a/Code/evaluation.py b/Code/evaluation.py
index 3ada77aaae3f344f9a4638846b2b97a68b14f693..6212fd1d728b7724133959897570a128347df814 100644
--- a/Code/evaluation.py
+++ b/Code/evaluation.py
@@ -45,7 +45,7 @@ def evaluate_model(model, name,test_dataset, batch_size, imdb=False):
 		with torch.no_grad():
 			if name[0] == "b":
 				if imdb==False:
-					print("Evaluating Bert model")
+					#print("Evaluating Bert model")
 					inputs = {'input_ids': batch[0],
 							  'attention_mask': batch[1],
 							  'token_type_ids': batch[2],
@@ -53,14 +53,14 @@ def evaluate_model(model, name,test_dataset, batch_size, imdb=False):
 							  'end_position': batch[4],
 							  'labels': batch[5]}
 				elif imdb==True:
-					print("Evaluating Bert model on imdb")
+					#print("Evaluating Bert model on imdb")
 					inputs={'input_ids':batch[0],
 							'attention_mask':batch[1],
 							'token_type_ids':batch[2],
 							'labels':batch[3]}
 
 			if name[0] == "r":
-				print("Evaluating roberta model")
+				#print("Evaluating roberta model")
 				inputs = {'input_ids': batch[0],
 						  'attention_mask': batch[1],
 						  'start_position': batch[2],
diff --git a/Code/models.py b/Code/models.py
index f488662e9b5a9c027e6de596fd58be90c7d3954f..6be2a5072379ea6cbda4b50d7a054d0f24777a73 100644
--- a/Code/models.py
+++ b/Code/models.py
@@ -6,7 +6,7 @@ import evaluate
 import json
 import random
 import math
-import train
+import Code.train
 import copy
 from tqdm.auto import tqdm
 from transformers import BertTokenizer, RobertaTokenizer, BertModel, RobertaModel, RobertaPreTrainedModel, RobertaConfig,  BertConfig, BertPreTrainedModel, PreTrainedModel, AutoModel, AutoTokenizer, AutoConfig
@@ -117,7 +117,7 @@ class WordClassificationModel(torch.nn.Module):
 
         if self.tmix==True and mixepoch == True:
             outputs = (logits,) + outputs[2:]
-            loss = train.cross_entropy(logits[:math.floor((logits.size()[0]/2))], outputs[1][:math.floor((outputs[1].size()[0]/2))], lambda_value) #special CEL for soft labels 
+            loss = Code.train.cross_entropy(logits[:math.floor((logits.size()[0]/2))], outputs[1][:math.floor((outputs[1].size()[0]/2))], lambda_value) #special CEL for soft labels 
             outputs = (loss,) + outputs
         
         else:
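
A toy sketch of the half-batch slicing used in the tmix branch above, assuming (as tmix implies) that only the first half of the batch carries mixed examples; the tensors are stand-ins, not the model's real outputs:

import math
import torch

# Stand-ins for binary logits and mixed (soft) labels; B = 8.
batch_logits = torch.randn(8, 2)
mixed_targets = torch.rand(8)

# The loss call above restricts both tensors to the first half of the batch.
half = math.floor(batch_logits.size()[0] / 2)
logits_for_loss = batch_logits[:half]
targets_for_loss = mixed_targets[:half]
print(logits_for_loss.shape, targets_for_loss.shape)  # torch.Size([4, 2]) torch.Size([4])
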
diff --git a/Code/preprocess.py b/Code/preprocess.py
index 7c179cae581441849b2b9d185076a81412ff25c4..45c0fe24f7a4f0233664e8d106b83f2c6a42fa95 100644
--- a/Code/preprocess.py
+++ b/Code/preprocess.py
@@ -218,7 +218,7 @@ def tokenizer_new(tokenizer, input, max_length, masked=False, old_dataset=False,
 			else:
 				all_token_type_ids.append(context_token_type_ids)
 
-
+	if tokenizer.name_or_path[0] =="r":
 		print("roberta tokenizer")
 		dataset=TensorDataset(torch.tensor(all_input_ids, dtype=torch.long).to("cuda") , 
 							torch.tensor(all_attention_masks, dtype=torch.long).to("cuda") ,
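
The restored guard above dispatches on the first character of tokenizer.name_or_path; a small sketch of what that attribute holds for the two checkpoints this project names (roberta-base, bert-base-uncased):

from transformers import AutoTokenizer

# name_or_path echoes the checkpoint string passed to from_pretrained,
# so "r" selects the RoBERTa branch and "b" the BERT branch.
for checkpoint in ("roberta-base", "bert-base-uncased"):
	tok = AutoTokenizer.from_pretrained(checkpoint)
	print(checkpoint, "->", tok.name_or_path[0])  # r / b
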
diff --git a/Code/train.py b/Code/train.py
index a215b15503c7748fe1e450158e2b7f26ccea4e22..e45ea60e4eb4779e0c7fbbce3354b58b42d3fc29 100644
--- a/Code/train.py
+++ b/Code/train.py
@@ -1,13 +1,13 @@
 import torch
 import tqdm
 import numpy as np
-import evaluation
+import Code.evaluation
 import evaluate
 import json
 import random
 import math
 from tqdm.auto import tqdm
-from transformers import BertTokenizer, RobertaTokenizer, BertModel, RobertaModel, RobertaPreTrainedModel, RobertaConfig,  BertConfig, BertPreTrainedModel, PreTrainedModel, AutoModel, AutoTokenizer
+from transformers import BertTokenizer, RobertaTokenizer, BertModel, RobertaModel, RobertaPreTrainedModel, RobertaConfig,  BertConfig, BertPreTrainedModel, PreTrainedModel, AutoModel, AutoTokenizer, Trainer, TrainingArguments
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from transformers import AdamW, get_scheduler
 from torch import nn
@@ -27,28 +27,29 @@ def train(model, name,train_dataset, test_dataset, seed, batch_size, test_batch_
 	"""Train loop for models. Iterates over epochs and batches and gives inputs to model. After training, call evaluation.py for evaluation of finetuned model.
 	
 	Params:
-	model: model out of models.py
-	name: str
-	train_dataset: Dataset 
-	test_dataset: Dataset
-	seed: int
-	batch_size: 
-	test_batch_size:
-	num_epochs: int
-	imdb: bool
-	mixup: bool
-	lambda_value: float
-	mixepoch:int
-	tmix: bool
-	mixlayer: int in {0, 11}
-	learning_rate: float
-	mlp_leaning_rate:float
+
+	model: model from models.py -> WordClassificationModel, BertForWordClassification or RobertaForWordClassification
+	name: str -> specifies the model architecture (either bert-base-uncased or roberta-base)
+	train_dataset: Dataset -> train dataset as a torch Dataset object (created in preprocess.py)
+	test_dataset: Dataset -> test dataset as a torch Dataset object (created in preprocess.py)
+	seed: int -> random seed
+	batch_size: int -> batch size for training
+	test_batch_size: int -> batch size for testing
+	num_epochs: int -> number of epochs
+	imdb: bool -> whether the imdb dataset is used
+	mixup: bool -> whether to use mixup during training
+	lambda_value: float -> lambda value to use if mixup or tmix is selected
+	mixepoch: int -> epoch in which to apply mixup
+	tmix: bool -> whether tmix is used in training (distinguishes mixing in training from not mixing in evaluation)
+	mixlayer: int in {0, 11} -> which layer to mix in for tmix
+	learning_rate: float -> learning rate for the Bert/Roberta model, or for WordClassificationModel including the linear classifier
+	mlp_learning_rate: float -> separate learning rate for the multi-layer perceptron
 	
 	
 	Returns: Evaluation Results for train and test dataset in Accuracy, F1, Precision and Recall"""
 	model.train().to(device)
 	train_sampler = RandomSampler(train_dataset)
-	train_dataloader=DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size, shuffle=True)
+	train_dataloader=DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
 	num_training_steps=num_epochs*len(train_dataloader)
 
 	if mlp_learning_rate==None:
@@ -76,11 +77,15 @@ def train(model, name,train_dataset, test_dataset, seed, batch_size, test_batch_
 								'start_position': batch[3],
 								'end_position': batch[4],
 								'labels': batch[5]}
+						labels=batch[5]
+						start_positions=batch[3]
+						end_positions=batch[4]
 					if imdb==True:
 						inputs={'input_ids':batch[0],
 								'attention_mask': batch[1],
 								'token_type_ids': batch[2],
 								'labels': batch[3]}
+					
 				if tmix==True:
 					if imdb == False:
 						print("this is mixup epoch")
@@ -93,6 +98,9 @@ def train(model, name,train_dataset, test_dataset, seed, batch_size, test_batch_
 									'mixepoch': True,
 									'mixlayer':mixlayer,
 									'lambda_value':lambda_value}
+						labels=batch[5]
+						start_positions=batch[3]
+						end_positions=batch[4]
 					if imdb==True:
 						print("this is a mixup epoch with imdb")
 						inputs={'input_ids':batch[0],
@@ -123,7 +131,7 @@ def train(model, name,train_dataset, test_dataset, seed, batch_size, test_batch_
 			model.zero_grad()
 
 			if epoch==mixepoch:
-				#print("mixepoch")
+				print("mixepoch")
 				if mixup == True:
 					#calculate new last hidden states and predictions(logits)
 					new_matrix_batch, new_labels_batch = mixup_function(outputs[2], labels, lambda_value)
@@ -155,7 +163,7 @@ def train(model, name,train_dataset, test_dataset, seed, batch_size, test_batch_
 
 	return evaluation_test, evaluation_train
 
-def cross_entropy(logits, target):
+def cross_entropy(logits, target, l):
 	"""
     Computes the cross-entropy loss between the predicted logits and the target labels.
     
@@ -178,10 +186,10 @@ def cross_entropy(logits, target):
 		if value == 1 or value == 0: #check if non-mixed label
 			one_hot = torch.tensor([1-value,value], device='cuda:0') #creating one-hot vector e.g. [0. ,1.]
 			loss_clear_labels = -((one_hot[0] * logprobs[0][0]) + (one_hot[1] * logprobs[0][1]))
-			#calculation with indexing (- 1-label * )
+			#calculation with indexing 
 			results = torch.cat((loss_clear_labels.view(1), results), dim=0)
 		else:
-			mixed_vec = torch.tensor([value, 1-value]) #creating on-hot mixed vec.
+			mixed_vec = torch.tensor([l, 1-l]) #creating mixed (soft) label vector from lambda
 			logprobs = torch.nn.functional.log_softmax(lg, dim=1)#logits in log probabilities
 			loss_mixed_labels = -((mixed_vec[0] * logprobs[0][0]) + (mixed_vec[1] * logprobs[0][1]))
 			#calculation for mixed with indexing
@@ -219,7 +227,7 @@ def mixup_function(batch_of_matrices, batch_of_labels, l):
 	return results, result_labels
 
 	
-def train_salami(model, seed, train_set, test_set, batch_size, test_batch_size, learning_rate, epochs):
+def train_salami(model, name, seed, train_set, test_set, batch_size, test_batch_size, learning_rate, epochs):
 	"""Train loop of the salami group"""
 	results=[]
 	training_args = TrainingArguments(
@@ -243,7 +251,7 @@ def train_salami(model, seed, train_set, test_set, batch_size, test_batch_size,
 		train_dataset=train_set,
 		eval_dataset=test_set,
 		args=training_args,
-		compute_metrics=evaluation.evaluate_model
+		compute_metrics=Code.evaluation.compute_metrics
 		)
 
 	trainer.train()
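
A minimal sketch of the mixed-label cross-entropy that the new cross_entropy(logits, target, l) signature supports, using toy tensors; it mirrors the clear-label/mixed-label split above but is not a drop-in replacement for the project's function:

import torch
import torch.nn.functional as F

def soft_binary_ce(logits, targets, l):
	"""Per-example CE: one-hot targets for clear labels, [l, 1-l] for mixed ones."""
	losses = []
	for lg, value in zip(logits, targets):
		logprobs = F.log_softmax(lg.unsqueeze(0), dim=1)
		if float(value) in (0.0, 1.0):  # non-mixed label -> ordinary one-hot CE
			vec = torch.tensor([1 - float(value), float(value)])
		else:                           # mixed label -> soft target [l, 1-l]
			vec = torch.tensor([l, 1 - l])
		losses.append(-(vec[0] * logprobs[0][0] + vec[1] * logprobs[0][1]))
	return torch.stack(losses).mean()

toy_logits = torch.randn(4, 2)
toy_targets = torch.tensor([1.0, 0.0, 0.62, 0.38])  # two clear and two mixed labels
print(soft_binary_ce(toy_logits, toy_targets, l=0.4))
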
diff --git a/main.py b/main.py
index 8a33f4ff0965ece565502accf64130a78ee37176..fe8c434ab49eb37416c55ae23e872a1238166cfc 100644
--- a/main.py
+++ b/main.py
@@ -67,12 +67,12 @@ def run(raw_args):
 	if args.train_loop=="swp":
 		evaluation_test, evaluation_train = Code.train.train(model, args.architecture, train_dataset, test_dataset, args.random_seed,args.batch_size, args.test_batch_size,args.epochs,args.imdb,  args.mix_up, args.lambda_value, args.mixepoch, args.tmix, args.mixlayer,   args.learning_rate, args.second_learning_rate, args.model_save_path)
 	elif args.train_loop=="salami":
-		evaluation_test = Code.train.train_salami(model,args.random_seed, train_dataset, test_dataset, args.batch_size, args.test_batch_size, args.learning_rate, args.epochs)
+		evaluation_test = Code.train.train_salami(model,args.architecture, args.random_seed, train_dataset, test_dataset, args.batch_size, args.test_batch_size, args.learning_rate, args.epochs)
 	else:
 		print("no eligible train loop selected")
 	
-	#save
-	if isinstance(args.save_directory, str): 
+	#save results if a save directory was given
+	if args.save_directory is not None:
 		with open(args.save_directory, "x") as f:
 			f.write(str(args))
 			f.write(str(evaluation_test))
@@ -208,8 +208,7 @@ if __name__ == "__main__":
 		"-lambda",
 		"--lambda_value",
 		help="speficies the lambda value for mixup",
-		type=float,
-		default=0.4)
+		type=float)
 
 	parser.add_argument(
 		"-mixepoch", 
@@ -231,7 +230,8 @@ if __name__ == "__main__":
 	parser.add_argument(
 		"-sd",
 		"--save_directory",
-		help="Destination directory for the output results of the run")
+		help="Destination directory for the output results of the run",
+		default=None)
 	
 	parser.add_argument(
 		"-msp",