diff --git a/Code/models.py b/Code/models.py
index 530b9f7b00dffdbec44ac8bb29ce3640989d8eac..de06b219e2108f30ad8beb969dd96d859f570042 100644
--- a/Code/models.py
+++ b/Code/models.py
@@ -41,11 +41,23 @@ def set_seed(seed: int = 42) -> None:
-class WordClassificationModel(torch.nn.Module): #AutoModel verwenden aus der Bibliothek
-    """This class is needed to enable BERT to work with our input. We apply a dropout layer
-    and the linear classifier layer (/2 layer MLP) to make it a binary decision problem. In the forward step
-    we specify the classification over the span given by end and start position and compute the
-    loss function with cross entropy. The predictions (logits) are made by our classifier layer."""
+class WordClassificationModel(torch.nn.Module):
+    """
+    A PyTorch Module that utilizes BERT for word classification. It applies a dropout layer and
+    a linear classifier layer to make it a binary decision problem. In the forward step, it specifies
+    the classification over the span given by end and start position and computes the loss function
+    with cross entropy. The predictions (logits) are made by our classifier layer.
+
+    Params:
+        config_name (str): The configuration name of the pre-trained BERT model.
+        tmix (bool): Whether to use the TMix layer or not. Default is False.
+        imdb (bool): Whether to use the IMDB dataset or not. Default is False.
+
+    Returns:
+        outputs: The predicted logits along with the hidden states and attention masks from the BERT
+        model and the computed loss value.
+
+    """
     def __init__(self, config_name, tmix=False, imdb=False): #mixlayer=-1, lambda_value=0.0):
         super(WordClassificationModel, self).__init__()
         self.tmix=tmix
@@ -116,12 +128,24 @@ class WordClassificationModel(torch.nn.Module): #AutoModel verwenden aus der Bib
-class BertForWordClassification(BertPreTrainedModel): #AutoModel verwenden aus der Bibliothek
-    """This class is needed to enable BERT to work with our input. We apply a dropout layer
-    and the linear classifier layer to make it a binary decision problem. In the forward step
-    we specify the classification over the span given by end and start position and compute the
-    loss function with cross entropy. The predictions (logits) are made by our classifier layer.
+class BertForWordClassification(BertPreTrainedModel):
     """
+    BERT model for word classification. Applies a dropout layer and a linear classifier layer to make it a binary
+    decision problem. In the forward step, the classification is specified over the span given by end and start
+    position, and the loss function is computed with cross entropy. The predictions (logits) are made by the
+    classifier layer.
+
+    Params:
+        config (:class:`~transformers.BertConfig`):
+            Configuration class for BERT.
+
+    Outputs:
+        if `labels` is not `None`:
+            - `loss`: `torch.FloatTensor` of shape `(1,)`. Classification loss.
+            - `logits`: `torch.FloatTensor` of shape `(batch_size, num_labels)`. Logits (output) produced by the linear classifier layer.
+        if `labels` is `None`:
+            - `logits`: `torch.FloatTensor` of shape `(batch_size, num_labels)`. Logits (output) produced by the linear classifier layer.
+    """
     def __init__(self, config):
         super(BertForWordClassification, self).__init__(config)
         self.num_labels=config.num_labels
@@ -158,11 +182,26 @@ class BertForWordClassification(BertPreTrainedModel): #AutoModel verwenden aus d
         return outputs
-class RobertaForWordClassification(RobertaPreTrainedModel): #AutoModel verwenden aus der Bibliothek
-    """This class is needed to enable BERT to work with our input. We apply a dropout layer
-    and the linear classifier layer to make it a binary decision problem. In the forward step
-    we specify the classification over the span given by end and start position and compute the
-    loss function with cross entropy. The predictions (logits) are made by our classifier layer."""
+class RobertaForWordClassification(RobertaPreTrainedModel):
+    """
+    Fine-tunes a pre-trained RoBERTa model for word classification tasks. Applies a dropout layer
+    and a linear classifier layer to make it a binary decision problem. In the forward step,
+    the model specifies the classification over the span given by end and start position and computes the
+    loss function with cross entropy. The predictions (logits) are made by the classifier layer.
+
+    Args:
+        config (:class:`~transformers.RobertaConfig`):
+            The configuration object that configures the model architecture.
+
+    Outputs:
+        if `labels` is not None:
+            Returns the cross-entropy loss (:obj:`torch.FloatTensor`).
+        if `labels` is None:
+            Returns a tuple of the predicted logits for each class (:obj:`torch.FloatTensor`)
+            and a tuple of outputs from the RoBERTa model, including the last hidden state and
+            the attention mask.
+    """
     def __init__(self, config):
         super(RobertaForWordClassification, self).__init__(config)
         self.num_labels=config.num_labels
@@ -199,11 +238,27 @@ class RobertaForWordClassification(RobertaPreTrainedModel): #AutoModel verwenden
 class BertModelTMix(BertPreTrainedModel):
-    """
-    Model to override forward function in Encoder (copied and slightly modified from
+    """
+    Initializes a BertModelTMix model. Overrides the forward function in the Encoder (copied and slightly modified from
     transformers)
+
+    Params:
+    - config: `BertConfig` instance with the model configuration.
+    - add_pooling_layer: `bool`. Whether to include a pooling layer in the model architecture. Default: True.
+    - *args: Variable length argument list.
+    - **kwargs: Arbitrary keyword arguments.
+
+    Returns:
+    - If return_dict is True, returns a dictionary containing the following keys:
+      'last_hidden_state': a tensor of shape (batch_size, sequence_length, hidden_size) containing the final hidden states of the model.
+      'pooler_output': a tensor of shape (batch_size, hidden_size) containing the output of the model's pooling layer (if add_pooling_layer is True).
+      'past_key_values': a tuple of tensors containing the precomputed key and value hidden states of the attention blocks (if use_cache is True).
+      'hidden_states': a tuple of tensors containing the hidden states of all layers of the model (if output_hidden_states is True).
+      'attentions': a tuple of tensors containing the attention weights of all layers of the model (if output_attentions is True).
+    - Otherwise, returns a tuple containing:
+      a tensor of shape (batch_size, sequence_length, hidden_size) containing the final hidden states of the model.
+      a tensor of shape (batch_size, hidden_size) containing the output of the model's pooling layer (if add_pooling_layer is True).
     """
-
     def __init__(self, config, add_pooling_layer=True):
         super().__init__(config)
         self.config = config
@@ -374,6 +429,17 @@ class BertModelTMix(BertPreTrainedModel):
 class BertTMixEncoder(torch.nn.Module):
     """Used for Tmix. When using Tmix the only change that has to be done, is to be able to modify layers in model.
     This way, we can apply the Mixup function to a batch of hidden states at a certain layer """
+    """Module for Tmix that modifies the model layers to apply the Mixup function to a batch of hidden states at a certain layer.
+
+    Params:
+        config (BertConfig): Configuration class for BertTMixEncoder.
+
+    Returns:
+        - If return_dict is True, returns a BaseModelOutputWithPastAndCrossAttentions object containing the last hidden state,
+          past key values, all hidden states, self-attention weights, and cross-attention weights (if add_cross_attention=True in the model configuration).
+        - If return_dict is False, returns a tuple containing the last hidden state and the last state of labels after interpolation.
+          Any None values are omitted.
+    """
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -481,6 +547,20 @@ class BertTMixEncoder(torch.nn.Module):
 #Moneky Patching the forward function of BertLayer for mixup -> use decorators here to call the old forward function on the newly comptued hidden_state
 def forward_new(forward):
+    """
+    Decorator function that monkey patches the `forward` method of a `BertLayer` instance to implement mixup data
+    augmentation during training. When `nowlayer` and `mixepoch` arguments are specified, performs mixup data
+    augmentation on the hidden states and labels of the layer specified by `nowlayer`. Otherwise, calls the original
+    `forward` method of the `BertLayer` instance on the input tensors and returns the output tensor along with the
+    original labels.
+
+    Params:
+        forward (callable): The original `forward` method of the `BertLayer` instance.
+
+    Returns:
+        callable: A new `forward` method for the `BertLayer` instance that performs mixup data augmentation if
+        specified, otherwise calls the original `forward` method.
+    """
     def forward_mix(self, hidden_states: torch.Tensor,
                     labels: torch.Tensor,
                     attention_mask: Optional[torch.FloatTensor] = None,
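For readers skimming the diff, the TMix behaviour these new docstrings describe (interpolating a batch of hidden states and their labels at one encoder layer before continuing the forward pass) can be illustrated with a short, self-contained sketch. This is not code from the PR: mixup_hidden_states, the reverse-batch pairing, and the fixed lambda_value are assumptions chosen only to show the interpolation step.

    import torch

    def mixup_hidden_states(hidden_states, labels, lambda_value=0.4):
        # Illustrative mixup step (assumed pairing scheme, not the PR's implementation):
        # pair example i with example (batch_size - 1 - i) by reversing the batch, then
        # linearly interpolate hidden states and one-hot labels with the same coefficient.
        partner_states = hidden_states.flip(0)
        partner_labels = labels.flip(0)
        mixed_states = lambda_value * hidden_states + (1.0 - lambda_value) * partner_states
        mixed_labels = lambda_value * labels + (1.0 - lambda_value) * partner_labels
        return mixed_states, mixed_labels

    # Toy batch: 4 sequences, 16 tokens, hidden size 768, binary one-hot labels.
    states = torch.randn(4, 16, 768)
    labels = torch.nn.functional.one_hot(torch.tensor([0, 1, 1, 0]), num_classes=2).float()
    mixed_states, mixed_labels = mixup_hidden_states(states, labels, lambda_value=0.4)

In the patched encoder, an interpolation of this kind is applied once, at the layer selected by `nowlayer`, and the loss is then computed against the interpolated labels rather than the original ones.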