diff --git a/Code/inference.py b/Code/inference.py
index 2deb942e79d771af84ce76dec32c9488dd81b148..7689fdd09eae18d5ffa670b8d0fec3bb5971ff3a 100644
--- a/Code/inference.py
+++ b/Code/inference.py
@@ -9,6 +9,9 @@ import re
 import train
 from torch.utils.data import DataLoader, RandomSampler
 
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+
 # Get user input
 print("Enter a sentence and enclose the target word(s) between asteriks (e.g. \"I love *New York*\"): ")
 sentence = input()
@@ -56,14 +59,13 @@ label = int(input())
 
 # Convert to data sample for BERT
 data_sample = [{"sentence": sentence, "pos": pos, "label": label}]
-print(data_sample)
+#print(data_sample)
 
 tokenizer=AutoTokenizer.from_pretrained("bert-base-uncased")
 input_as_dataset=preprocess.tokenizer_new(tokenizer, data_sample, max_length=512)
 
 
 # Load model
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 model=models.WordClassificationModel.from_pretrained("bert-base-uncased")
 
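The inference.py hunks above hoist the device selection to the top of the script so it is defined before the model and the tokenized input are moved to it. A minimal, self-contained sketch of the same pattern, assuming only torch and transformers are installed (the sentence and the printed shape are illustrative):

import torch
from transformers import AutoModel, AutoTokenizer

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased").to(device)
model.eval()

# Inputs must live on the same device as the model before the forward pass.
inputs = tokenizer("I love *New York*", return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, seq_len, hidden_size)
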
diff --git a/Code/models.py b/Code/models.py
index adac00aa2d6357f572ab176629a31051da5a7802..530b9f7b00dffdbec44ac8bb29ce3640989d8eac 100644
--- a/Code/models.py
+++ b/Code/models.py
@@ -22,6 +22,7 @@ import pandas as pd
 import sklearn
 from typing import List, Optional, Tuple, Union
 
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 metric=evaluate.load("accuracy")
 torch.cuda.empty_cache()
 
@@ -52,9 +53,9 @@ class WordClassificationModel(torch.nn.Module): #use AutoModel from the library
 		#self.mixlayer=mixlayer
 		if tmix:
 			print("initializing BertModelTMix")
-			self.embedding_model=BertModelTMix(config=AutoConfig.from_pretrained(config_name)).to("cuda")
+			self.embedding_model=BertModelTMix(config=AutoConfig.from_pretrained(config_name)).to(device)
 		else:
-			self.embedding_model=AutoModel.from_pretrained(config_name, config=AutoConfig.from_pretrained(config_name)).to("cuda")
+			self.embedding_model=AutoModel.from_pretrained(config_name, config=AutoConfig.from_pretrained(config_name)).to(device)
         
 
 		self.dropout=nn.Dropout(0.1)
@@ -84,7 +85,7 @@ class WordClassificationModel(torch.nn.Module): #use AutoModel from the library
 	                        	    return_dict=False,
 	                           	 	output_hidden_states=False)
 
-		output = outputs[0].to("cuda") 
+		output = outputs[0].to(device) 
 		output = self.dropout(output)
 		
 		if self.imdb==False:
@@ -207,10 +208,10 @@ class BertModelTMix(BertPreTrainedModel):
         super().__init__(config)
         self.config = config
 
-        self.embeddings = BertEmbeddings(config).to("cuda")
-        self.encoder = BertTMixEncoder(config).to("cuda")
+        self.embeddings = BertEmbeddings(config).to(device)
+        self.encoder = BertTMixEncoder(config).to(device)
 
-        self.pooler = BertPooler(config).to("cuda") if add_pooling_layer else None
+        self.pooler = BertPooler(config).to(device) if add_pooling_layer else None
 
 
         # Initialize weights and apply final processing
@@ -538,12 +539,12 @@ def forward_new(forward):
 				counter+=2
 			for i in range(runs, hidden_states.size()[0]):
 				#Pad to batch size
-				new_matrices.append(torch.zeros([hidden_states.size()[1], hidden_states.size()[2]]).to("cuda"))
+				new_matrices.append(torch.zeros([hidden_states.size()[1], hidden_states.size()[2]]).to(device))
 				new_labels.append(0)
-				new_attention_masks.append(torch.zeros([1, 1, hidden_states.size()[1]]).to("cuda"))
-			new_matrices=torch.stack(new_matrices).to("cuda")
-			new_attention_masks=torch.stack(new_attention_masks).to("cuda")
-			new_labels=torch.Tensor(new_labels).to("cuda")
+				new_attention_masks.append(torch.zeros([1, 1, hidden_states.size()[1]]).to(device))
+			new_matrices=torch.stack(new_matrices).to(device)
+			new_attention_masks=torch.stack(new_attention_masks).to(device)
+			new_labels=torch.Tensor(new_labels).to(device)
 
 			#when performing interpolation, pass back th new hidden states and labels
 			outputs=forward(self, hidden_states=new_matrices, head_mask=head_mask, attention_mask=new_attention_masks, encoder_hidden_states=encoder_hidden_states,
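The forward_new hunk pads the mixed-up batch back to its original size: interpolation leaves fewer examples than hidden_states.size()[0], so zero matrices, zero attention masks, and zero labels are appended before everything is stacked and placed on the shared device. A rough standalone sketch of that padding step, with made-up shapes and values:

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

batch_size, seq_len, hidden_size = 8, 128, 768
# Pretend interpolation produced only 5 mixed examples out of a batch of 8.
mixed = [torch.randn(seq_len, hidden_size, device=device) for _ in range(5)]
masks = [torch.zeros(1, 1, seq_len, device=device) for _ in range(5)]
labels = [1.0, 0.0, 1.0, 1.0, 0.0]

# Pad with all-zero entries so the tensors keep the expected batch dimension.
for _ in range(len(mixed), batch_size):
    mixed.append(torch.zeros(seq_len, hidden_size, device=device))
    masks.append(torch.zeros(1, 1, seq_len, device=device))
    labels.append(0.0)

new_matrices = torch.stack(mixed)             # (batch_size, seq_len, hidden_size)
new_attention_masks = torch.stack(masks)      # (batch_size, 1, 1, seq_len)
new_labels = torch.Tensor(labels).to(device)  # (batch_size,)
print(new_matrices.shape, new_attention_masks.shape, new_labels.shape)
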
diff --git a/Code/preprocess.py b/Code/preprocess.py
index 1e5b5381e48c9a3615c070d6ec54f9f1b41bd2d7..a3586e19d63e8e8207172ecc8ae0bfff442173c2 100644
--- a/Code/preprocess.py
+++ b/Code/preprocess.py
@@ -16,6 +16,7 @@ import os
 import pandas as pd
 import sklearn
 
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 #metric=evaluate.load("accuracy")
 torch.cuda.empty_cache()
 
@@ -359,20 +360,20 @@ def tokenizer_new(tokenizer, input, max_length, masked=False, old_dataset=False,
 	#print("len toke type ids: ", len(all_token_type_ids[0]))
 	if tokenizer.name_or_path[0] == "r": #if tokenizer is roberta we don't have token_type_ids
 		print("roberta tokenizer")
-		dataset=TensorDataset(torch.tensor(all_input_ids, dtype=torch.long).to("cuda") , 
-							torch.tensor(all_attention_masks, dtype=torch.long).to("cuda") ,
-							torch.tensor(all_start_positions,dtype=torch.long).to("cuda"),
-							torch.tensor(all_end_positions, dtype=torch.long).to("cuda"),
-							torch.tensor(all_labels,dtype=torch.long).to("cuda"))
+		dataset=TensorDataset(torch.tensor(all_input_ids, dtype=torch.long).to(device) , 
+							torch.tensor(all_attention_masks, dtype=torch.long).to(device) ,
+							torch.tensor(all_start_positions,dtype=torch.long).to(device),
+							torch.tensor(all_end_positions, dtype=torch.long).to(device),
+							torch.tensor(all_labels,dtype=torch.long).to(device))
 
 	if tokenizer.name_or_path[0] =="b":
 		print("bert tokenizer")
-		dataset=TensorDataset(torch.tensor(all_input_ids, dtype=torch.long).to("cuda"), 
-					torch.tensor(all_attention_masks, dtype=torch.long).to("cuda"),
-					torch.tensor(all_token_type_ids, dtype=torch.long).to("cuda"),
-					torch.tensor(all_start_positions,dtype=torch.long).to("cuda"),
-					torch.tensor(all_end_positions, dtype=torch.long).to("cuda"),
-					torch.tensor(all_labels,dtype=torch.long).to("cuda"))
+		dataset=TensorDataset(torch.tensor(all_input_ids, dtype=torch.long).to(device), 
+					torch.tensor(all_attention_masks, dtype=torch.long).to(device),
+					torch.tensor(all_token_type_ids, dtype=torch.long).to(device),
+					torch.tensor(all_start_positions,dtype=torch.long).to(device),
+					torch.tensor(all_end_positions, dtype=torch.long).to(device),
+					torch.tensor(all_labels,dtype=torch.long).to(device))
 	print("created dataset")
 	#print(mapping_counter)
 
@@ -396,7 +397,7 @@ def tokenizer_imdb(tokenizer, dataset, max_length):
 	print("input_ids: ", len(all_input_ids))
 	print("token_type_ids: ", len(all_token_type_ids))
 	print("attention_masks: ", len(all_attention_masks))
-	dataset=TensorDataset(torch.tensor(all_input_ids, dtype=torch.long).to("cuda"), torch.tensor(all_attention_masks, dtype=torch.long).to("cuda"), torch.tensor(all_token_type_ids, dtype=torch.long).to("cuda"), torch.tensor(all_labels, dtype=torch.long).to("cuda"))
+	dataset=TensorDataset(torch.tensor(all_input_ids, dtype=torch.long).to(device), torch.tensor(all_attention_masks, dtype=torch.long).to(device), torch.tensor(all_token_type_ids, dtype=torch.long).to(device), torch.tensor(all_labels, dtype=torch.long).to(device))
 	print("created imdb dataset")
 	return dataset
 
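Both tokenizer branches in preprocess.py end the same way: each feature list becomes a long tensor, is moved to the shared device, and the tensors are wrapped in a TensorDataset. A condensed sketch with toy values (the real feature lists come from the tokenizer; only input ids, attention masks, and labels are shown):

import torch
from torch.utils.data import DataLoader, TensorDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Toy features for two already-tokenized examples.
all_input_ids = [[101, 2023, 102, 0], [101, 2003, 102, 0]]
all_attention_masks = [[1, 1, 1, 0], [1, 1, 1, 0]]
all_labels = [1, 0]

dataset = TensorDataset(
    torch.tensor(all_input_ids, dtype=torch.long).to(device),
    torch.tensor(all_attention_masks, dtype=torch.long).to(device),
    torch.tensor(all_labels, dtype=torch.long).to(device),
)

for input_ids, attention_mask, label in DataLoader(dataset, batch_size=2):
    print(input_ids.shape, attention_mask.shape, label.shape)

Placing whole feature tensors on the GPU up front mirrors what preprocess.py does; the more memory-friendly alternative is to keep the dataset on the CPU and move each batch to the device inside the training loop.
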
diff --git a/Code/saved_models/test.md b/Code/saved_models/test.md
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/Code/train.py b/Code/train.py
index 6ff8f6d833c21fb61ac8d2921da86456eeaa91a5..a0abfdbd77f4628074f5d0cd9a99695a35e5960e 100644
--- a/Code/train.py
+++ b/Code/train.py
@@ -43,7 +43,7 @@ def train(model, name, imdb, seed,mixup,lambda_value, mixepoch, tmix, mixlayer,
 	test_batch_size:
 	
 	Returns:"""
-	model.train().to("cuda")
+	model.train().to(device)
 	train_sampler = RandomSampler(train_dataset)
 	train_dataloader=DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
 	num_training_steps=num_epochs*len(train_dataloader)
@@ -110,15 +110,15 @@ def train(model, name, imdb, seed,mixup,lambda_value, mixepoch, tmix, mixlayer,
 				#print("mixepoch")
 				if mixup == True:
 					#calculate new last hidden states and predictions(logits)
-					new_matrix_batch, new_labels_batch = mixup_function(outputs[2], labels, lambda_value, threshold)
-					new_matrix_batch.to("cuda")
-					new_labels_batch.to("cuda")
-					span_output=torch.randn(new_matrix_batch.shape[0], new_matrix_batch.shape[-1]).to("cuda")
+					new_matrix_batch, new_labels_batch = mixup_function(outputs[2], labels, lambda_value)
+					new_matrix_batch.to(device)
+					new_labels_batch.to(device)
+					span_output=torch.randn(new_matrix_batch.shape[0], new_matrix_batch.shape[-1]).to(device)
 					for i in range(new_matrix_batch.shape[0]):
 						span_output[i]=new_matrix_batch[i][start_positions[i]:end_positions[i]].mean(dim=0)
 					logits=model.classifier(span_output.detach())
-					logits = logits.view(-1, 2).to("cuda")
-					target = new_labels_batch.view(-1).to("cuda")
+					logits = logits.view(-1, 2).to(device)
+					target = new_labels_batch.view(-1).to(device)
 					loss_2 = cross_entropy(logits, target, lambda_value)
 					
 					#update entire model
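For context on the mixup branch above: each mixed example is reduced to a single vector by mean-pooling the hidden states between its start and end position, and the classifier head scores that pooled vector. A standalone sketch of the pooling step under assumed shapes (the Linear layer stands in for model.classifier, and the repository's own cross_entropy, which also takes the lambda value, is not reproduced here):

import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

batch_size, seq_len, hidden_size = 4, 16, 768
hidden = torch.randn(batch_size, seq_len, hidden_size, device=device)
start_positions = [1, 2, 3, 1]
end_positions = [4, 5, 6, 3]

# Buffer for the pooled span vectors, overwritten row by row as in train.py.
span_output = torch.randn(batch_size, hidden_size, device=device)
for i in range(batch_size):
    span_output[i] = hidden[i][start_positions[i]:end_positions[i]].mean(dim=0)

classifier = nn.Linear(hidden_size, 2).to(device)  # stand-in for model.classifier
logits = classifier(span_output.detach()).view(-1, 2)
print(logits.shape)  # (batch_size, 2)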