Commit 3956472c authored by F1nnH

Rename files

parent 7148fd81

"""Helper functions to find the optimal parameters for the classifiers using grid search."""
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns


def find_best_params_for_decision_tree(X_train: list[np.ndarray], y_train: np.ndarray, X_dev: list[np.ndarray], y_dev: np.ndarray, feature_description: str) -> dict:
    """
    Finds the optimal parameters for the decision tree classifier using a predefined grid and plots the results.

    Args:
        X_train: Training data.
        y_train: Training labels.
        X_dev: Development data.
        y_dev: Development labels.
        feature_description: A string describing the features used.

    Returns:
        best_params: A dictionary containing the optimal parameters for the decision tree classifier.
    """
    # Create a dictionary of all values we want to test
    param_grid = {
        "max_depth": list(range(10,81,10)) + [None],
        "max_features": ["sqrt", "log2"],
        "min_samples_leaf": [1,2,5,10,20],
        "min_samples_split": [2,5,10,20],
        "criterion": ["gini", "entropy"]
    }
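    # Size of the search: max_depth contributes 8 values plus None (9 in total), so the grid
    # spans 9 * 2 * 5 * 4 * 2 = 720 parameter combinations; with the single predefined
    # train/dev split below, GridSearchCV fits one tree per combination, i.e. 720 fits.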
    # Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
    #   index -1 for training data
    #   index 0 for dev data
    # Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
    split_index = [-1] * len(X_train) + [0] * len(X_dev)
    X_train_dev = np.concatenate((X_train, X_dev))
    y_train_dev = np.concatenate((y_train, y_dev))
    pds = PredefinedSplit(test_fold=split_index)
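    # Illustrative example (toy sizes, not the project data): with 3 training and 2 dev samples,
    # split_index == [-1, -1, -1, 0, 0] and PredefinedSplit yields exactly one split:
    #   train indices [0, 1, 2] (the -1 entries, never used for validation)
    #   test  indices [3, 4]    (fold 0, i.e. the dev set)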
    # Use GridSearch to test all values
    # Adjust n_jobs to the number of cores you have available
    dtree_gscv = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), param_grid=param_grid, cv=pds, verbose=3, n_jobs=8, return_train_score=True)
    dtree_gscv.fit(X_train_dev, y_train_dev)
    # Print best model parameters after tuning
    best_params = dtree_gscv.best_params_
    print(f"Best parameters for decision tree classifier: {best_params}")
    # Plot the GridSearch results
    plot_grid_search_results(dtree_gscv, feature_description, "decision_tree")
    return best_params


def find_best_params_for_random_forest(X_train, y_train, X_dev, y_dev, feature_description):
    """
    Finds the optimal parameters for the random forest classifier using a predefined grid and plots the results.

    Args:
        X_train: Training data.
        y_train: Training labels.
        X_dev: Development data.
        y_dev: Development labels.
        feature_description: A string describing the features used.

    Returns:
        best_params: A dictionary containing the optimal parameters for the random forest classifier.
    """
    # Create a dictionary of all values we want to test
    # Smaller grid for testing purposes:
    # param_grid = {
    #     "max_depth": list(range(5,9,1)),
    #     "n_estimators": [1,2,3,4],
    #     "min_samples_leaf": [1,2],
    #     "min_samples_split": [2,5],
    # }
    param_grid = {
        "max_depth": list(range(10,81,10)) + [None],
        "n_estimators": [10,50,100],
        "max_features": ["sqrt", "log2"],
        "min_samples_leaf": [1,2,5,10,20],
        "min_samples_split": [2,5,10,20]
    }
    # Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
    #   index -1 for training data
    #   index 0 for dev data
    # Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
    split_index = [-1] * len(X_train) + [0] * len(X_dev)
    X_train_dev = np.concatenate((X_train, X_dev))
    y_train_dev = np.concatenate((y_train, y_dev))
    pds = PredefinedSplit(test_fold=split_index)
    # Use GridSearch to test all values
    # Adjust n_jobs to the number of cores you have available
    rforest_gscv = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=pds, verbose=3, n_jobs=8, return_train_score=True, scoring='accuracy')
    rforest_gscv.fit(X_train_dev, y_train_dev)
    # Print best model parameters after tuning
    best_params = rforest_gscv.best_params_
    print(f"Best parameters for random forest classifier: {best_params}")
    # Plot the GridSearch results
    plot_heatmap_for_max_depth_and_n_estimators(rforest_gscv, feature_description)
    plot_grid_search_results(rforest_gscv, feature_description, "random_forest")
    return best_params


def find_best_params_for_naive_bayes(X_train, y_train, X_dev, y_dev, feature_description):
    """
    Finds the optimal parameters for the naive bayes classifier using a predefined grid and plots the results.

    Args:
        X_train: Training data.
        y_train: Training labels.
        X_dev: Development data.
        y_dev: Development labels.
        feature_description: A string describing the features used.

    Returns:
        best_params: A dictionary containing the optimal parameters for the naive bayes classifier.
    """
    # Create a dictionary of all values we want to test; for GaussianNB, var_smoothing is the only hyperparameter
    param_grid = {
        "var_smoothing": np.logspace(0,-20, num=20)
    }
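    # Note on the grid above: np.logspace(0, -20, num=20) yields 20 log-spaced candidates
    # from 1e0 down to 1e-20 (roughly one per decade), so the search covers both heavy
    # and essentially negligible variance smoothing.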
    # Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
    #   index -1 for training data
    #   index 0 for dev data
    # Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
    split_index = [-1] * len(X_train) + [0] * len(X_dev)
    X_train_dev = np.concatenate((X_train, X_dev))
    y_train_dev = np.concatenate((y_train, y_dev))
    pds = PredefinedSplit(test_fold=split_index)
    # Use GridSearch to test all values
    # Adjust n_jobs to the number of cores you have available
    bayes_gscv = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid, cv=pds, verbose=3, n_jobs=6, return_train_score=True)
    bayes_gscv.fit(X_train_dev, y_train_dev)
    best_params = bayes_gscv.best_params_
    print(f"Best parameters for naive bayes classifier: {best_params}")
    # Plot the GridSearch results
    plot_grid_search_results(bayes_gscv, feature_description, "naive_bayes")
    return best_params


def plot_grid_search_results(grid, feature_description, classifier_name):
    """
    Plots the results of the grid search for a classifier and saves the figure.

    Args:
        grid: A trained GridSearchCV object.
        feature_description: A string describing the features used.
        classifier_name: A string describing the classifier used.
    """
    # Results from grid search
    results = grid.cv_results_
    means_test = results['mean_test_score']
    stds_test = results['std_test_score']
    means_train = results['mean_train_score']
    stds_train = results['std_train_score']
    # Build one boolean mask per hyperparameter, marking the cv_results_ rows where it takes its best value
    masks = []
    masks_names = list(grid.best_params_.keys())
    # Move criterion (gini or entropy) to the end because we want the plot for the decision tree to be similar to the one for random forest
    if 'criterion' in masks_names:
        masks_names.remove('criterion')
        masks_names.append('criterion')
    best_params_ordered = {k: v for k, v in grid.best_params_.items() if k != 'criterion'}
    if 'criterion' in grid.best_params_:
        best_params_ordered['criterion'] = grid.best_params_['criterion']
    for p_k, p_v in best_params_ordered.items():
        masks.append(list(results['param_' + p_k].data == p_v))
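    # Illustrative example (hypothetical values, not actual results): if best_params_ were
    # {"max_depth": 20, "min_samples_leaf": 1}, masks[0] would flag every cv_results_ row with
    # max_depth == 20 and masks[1] every row with min_samples_leaf == 1. Combining all masks
    # except the i-th one below keeps only the rows where every other hyperparameter is fixed
    # at its best value while the i-th one varies.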
    params = grid.param_grid
    # Replace None with the string "None" so matplotlib can show it as a tick label
    if 'max_depth' in params:
        if None in params['max_depth']:
            index_none = params['max_depth'].index(None)
            params['max_depth'][index_none] = "None"
    # Create a subplot for each hyperparameter
    fig, ax = plt.subplots(1, len(params), sharex='none', sharey='all', figsize=(30,10))
    fig.suptitle('Score per parameter')
    fig.text(0.04, 0.5, 'MEAN ACCURACY', va='center', rotation='vertical')
    # With a single hyperparameter, plt.subplots() returns a single Axes object, so wrap it in a list
    if len(masks_names) == 1:
        ax = [ax]
    # If there's more than one hyperparameter, we combine the masks of all other hyperparameters to isolate the results for the current one
    for i, p in enumerate(masks_names):
        if len(masks_names) > 1:
            m = np.stack(masks[:i] + masks[i+1:])
            best_params_mask = m.all(axis=0)
            best_index = np.where(best_params_mask)[0]
            x = np.array(params[p])
            y_1 = np.array(means_test[best_index])
            e_1 = np.array(stds_test[best_index])
            y_2 = np.array(means_train[best_index])
            e_2 = np.array(stds_train[best_index])
            ax[i].errorbar(x, y_1, e_1, linestyle='--', marker='o', label='dev')
            ax[i].errorbar(x, y_2, e_2, linestyle='-', marker='^', label='train')
            ax[i].set_xlabel(p.upper())
        else:
            x = np.array(params[p])
            y_1 = np.array(means_test)
            e_1 = np.array(stds_test)
            y_2 = np.array(means_train)
            e_2 = np.array(stds_train)
            ax[i].errorbar(x, y_1, e_1, linestyle='--', marker='o', label='dev')
            ax[i].errorbar(x, y_2, e_2, linestyle='-', marker='^', label='train')
            ax[i].set_xlabel(p.upper())
            ax[i].set_xscale('log')  # for var_smoothing in naive bayes
    plt.legend()
    plt.savefig(f"../figures/{classifier_name}/grid_search_results_{feature_description}.png")


def plot_heatmap_for_max_depth_and_n_estimators(grid, feature_description: str):
    """
    Plots a heatmap for the GridSearch results of the random forest classifier and saves the figure.

    Args:
        grid: A trained GridSearchCV object.
        feature_description: A string describing the features used.
    """
    # For every (max_depth, n_estimators) combination there are several grid points (one per setting
    # of the remaining hyperparameters), so take the best mean dev-set score over those settings
    max_depths = grid.param_grid["max_depth"]
    n_estimators_values = grid.param_grid["n_estimators"]
    depth_per_row = grid.cv_results_["param_max_depth"]
    trees_per_row = grid.cv_results_["param_n_estimators"]
    test_scores = grid.cv_results_["mean_test_score"]
    mean_scores = np.empty((len(max_depths), len(n_estimators_values)))
    for i, depth in enumerate(max_depths):
        for j, trees in enumerate(n_estimators_values):
            row_mask = np.array([d == depth and t == trees for d, t in zip(depth_per_row, trees_per_row)])
            mean_scores[i, j] = test_scores[row_mask].max()
    # Plot heatmap
    plt.figure(figsize=(20, 10))
    plt.imshow(mean_scores.transpose(), interpolation="nearest", cmap="viridis")
    plt.title("Grid search results for random forest classifier", fontsize=18)
    plt.xlabel("max_depth", fontsize=14)
    plt.ylabel("n_estimators", fontsize=14)
    # Label None as "None"
    if None in grid.param_grid["max_depth"]:
        index_none = grid.param_grid["max_depth"].index(None)
        grid.param_grid["max_depth"][index_none] = "None"
    plt.xticks(np.arange(len(grid.param_grid["max_depth"])), grid.param_grid["max_depth"])
    plt.yticks(np.arange(len(grid.param_grid["n_estimators"])), grid.param_grid["n_estimators"])
    # Add scores as text annotations within the heatmap
    for i in range(len(grid.param_grid["max_depth"])):
        for j in range(len(grid.param_grid["n_estimators"])):
            plt.text(i, j, f"{mean_scores[i, j]:.2f}", ha="center", va="center", color="white")
    plt.colorbar(label="Mean accuracy on dev set (best over the remaining hyperparameters)")
    # Save figure
    plt.savefig(f"../figures/random_forest/heatmap_max_depth_n_estimators_{feature_description}.png")
    print(f"Saved figure as ../figures/random_forest/heatmap_max_depth_n_estimators_{feature_description}.png")
"""Helper functions to find the optimal parameters for the classifiers using grid search."""
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
def find_best_params_for_decision_tree(X_train: list[np.ndarray], y_train: str, X_dev: list[np.ndarray], y_dev: str, feature_description: str):
"""
Finds the optimal parameters for the decision tree classifier using a predefined grid and plots the results.
Args:
X_train: Training data.
y_train: Training labels.
X_dev: Development data.
y_dev: Development labels.
feature_description: A string describing the features used.
Returns:
best_params: A dictionary containing the optimal parameters for the decision tree classifier.
"""
# Create a dictionary of all values we want to test
param_grid = {
"max_depth": list(range(10,81,10)) + [None],
"max_features": ["sqrt", "log2"],
"min_samples_leaf": [1,2,5,10,20],
"min_samples_split": [2,5,10,20],
"criterion": ["gini", "entropy"]
}
# Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
# index -1 for training data
# index 0 for dev data
# Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
split_index = [-1] * len(X_train) + [0] * len(X_dev)
X_train_dev = np.concatenate((X_train, X_dev))
y_train_dev = np.concatenate((y_train, y_dev))
pds = PredefinedSplit(test_fold=split_index)
# Use GridSearch to test all values
# Adjust n_jobs to the number of cores you have available
dtree_gscv = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), param_grid=param_grid, cv=pds, verbose=3, n_jobs=8, return_train_score=True)
dtree_gscv.fit(X_train_dev, y_train_dev)
# Print best model parameters after tuning
best_params = dtree_gscv.best_params_
print(f"Best parameters for decision tree classifier: {best_params}")
# Plot the GridSearch results
plot_grid_search_results(dtree_gscv, feature_description, "decision_tree")
return best_params
def find_best_params_for_random_forest(X_train, y_train, X_dev, y_dev, feature_description):
"""
Finds the optimal parameters for the random forest classifier using a predefined grid and plots the results.
Args:
X_train: Training data.
y_train: Training labels.
X_dev: Development data.
y_dev: Development labels.
feature_description: A string describing the features used.
Returns:
best_params: A dictionary containing the optimal parameters for the random forest classifier.
"""
# Create a dictionary of all values we want to test
"""
# Smaller grid for testing purposes
param_grid = {
"max_depth": list(range(5,9,1)),
"n_estimators": [1,2,3,4],
"min_samples_leaf": [1,2],
"min_samples_split": [2,5],
}
"""
param_grid = {
"max_depth": list(range(10,81,10)) + [None],
"n_estimators": [10,50,100],
"max_features": ["sqrt", "log2"],
"min_samples_leaf": [1,2,5,10,20],
"min_samples_split": [2,5,10,20]
}
# Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
# index -1 for training data
# index 0 for dev data
# Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
split_index = [-1] * len(X_train) + [0] * len(X_dev)
X_train_dev = np.concatenate((X_train, X_dev))
y_train_dev = np.concatenate((y_train, y_dev))
pds = PredefinedSplit(test_fold=split_index)
# Use GridSearch to test all values
# Adjust n_jobs to the number of cores you have available
rforest_gscv = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=pds, verbose=3, n_jobs=8, return_train_score=True, scoring='accuracy')
rforest_gscv.fit(X_train_dev, y_train_dev)
# Print best model parameters after tuning
best_params = rforest_gscv.best_params_
print(f"Best parameters for random forest classifier: {best_params}")
# Plot the GridSearch results
plot_heatmap_for_max_depth_and_n_estimators(rforest_gscv, feature_description)
plot_grid_search_results(rforest_gscv, feature_description, "random_forest")
return best_params
def find_best_params_for_naive_bayes(X_train, y_train, X_dev, y_dev, feature_description):
"""
Finds the optimal parameters for the naive bayes classifier using a predefined grid and plots the results.
Args:
X_train: Training data.
y_train: Training labels.
X_dev: Development data.
y_dev: Development labels.
feature_description: A string describing the features used.
Returns:
best_params: A dictionary containing the optimal parameters for the naive bayes classifier.
"""
# Create a dictionary of all values we want to test; for GaussianNB, var_smoothing is the only hyperparameter
param_grid = {
"var_smoothing": np.logspace(0,-20, num=20)
}
# Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
# index -1 for training data
# index 0 for dev data
# Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
split_index = [-1] * len(X_train) + [0] * len(X_dev)
X_train_dev = np.concatenate((X_train, X_dev))
y_train_dev = np.concatenate((y_train, y_dev))
pds = PredefinedSplit(test_fold=split_index)
# Use GridSearch to test all values
# Adjust n_jobs to the number of cores you have available
bayes_gscv = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid, cv=pds, verbose=3, n_jobs=6, return_train_score=True)
bayes_gscv.fit(X_train_dev, y_train_dev)
best_params = bayes_gscv.best_params_
print(f"Best parameters for naive bayes classifier: {best_params}")
# Plot the GridSearch results
plot_grid_search_results(bayes_gscv, feature_description, "naive_bayes")
return best_params
def plot_grid_search_results(grid, feature_description, classifier_name):
"""
Plots the results of the grid search for a classifier and saves the figure.
Args:
grid: A trained GridSearchCV object.
feature_description: A string describing the features used.
classifier_name: A string describing the classifier used.
"""
# Results from grid search
results = grid.cv_results_
means_test = results['mean_test_score']
stds_test = results['std_test_score']
means_train = results['mean_train_score']
stds_train = results['std_train_score']
# Our masks are the names of the hyperparameters
masks=[]
masks_names= list(grid.best_params_.keys())
# Move criterion (gini or entropy) to the end because we want the plot for the decision tree to be similar to the one for random forest
if 'criterion' in masks_names:
masks_names.remove('criterion')
masks_names.append('criterion')
best_params_ordered = {k: v for k, v in grid.best_params_.items() if k != 'criterion'}
if 'criterion' in grid.best_params_:
best_params_ordered['criterion'] = grid.best_params_['criterion']
for p_k, p_v in best_params_ordered.items():
masks.append(list(results['param_'+p_k].data==p_v))
params=grid.param_grid
# Replace 'None' with None because otherwise it will be plotted as a string
if 'max_depth' in params:
if None in params['max_depth']:
index_none = params['max_depth'].index(None)
params['max_depth'][index_none] = "None"
# Create a subplot for each hyperparameter
fig, ax = plt.subplots(1,len(params),sharex='none', sharey='all',figsize=(30,10))
fig.suptitle('Score per parameter')
fig.text(0.04, 0.5, 'MEAN ACCURACY', va='center', rotation='vertical')
print(masks_names)
if len(masks_names) == 1:
ax = [ax]
# If there's more than one hyperparameter, we combine masks for all other hyperparameters to isolate the results for the current hyperparameter
for i, p in enumerate(masks_names):
if len(masks_names) > 1:
m = np.stack(masks[:i] + masks[i+1:])
best_parms_mask = m.all(axis=0)
best_index = np.where(best_parms_mask)[0]
x = np.array(params[p])
y_1 = np.array(means_test[best_index])
e_1 = np.array(stds_test[best_index])
y_2 = np.array(means_train[best_index])
e_2 = np.array(stds_train[best_index])
ax[i].errorbar(x, y_1, e_1, linestyle='--', marker='o', label='dev')
ax[i].errorbar(x, y_2, e_2, linestyle='-', marker='^',label='train' )
ax[i].set_xlabel(p.upper())
else:
x = np.array(params[p])
y_1 = np.array(means_test)
e_1 = np .array(stds_test)
y_2 = np.array(means_train)
e_2 = np.array(stds_train)
ax[i].errorbar(x, y_1, e_1, linestyle='--', marker='o', label='dev')
ax[i].errorbar(x, y_2, e_2, linestyle='-', marker='^',label='train' )
ax[i].set_xlabel(p.upper())
ax[i].set_xscale('log') # for var_smoothing in naive bayes
plt.legend()
plt.savefig(f"../figures/{classifier_name}/grid_search_results_{feature_description}.png")
def plot_heatmap_for_max_depth_and_n_estimators(grid, feature_description: str):
"""
Plots a heatmap for the GridSearch results of the random forest classifier and saves the figure.
Args:
grid: A trained GridSearchCV object.
feature_description: A string describing the features used.
"""
# Mean test scores for all combinations of max_depth and n_estimators
mean_scores = grid.cv_results_["mean_test_score"][:len(grid.param_grid["max_depth"])*len(grid.param_grid["n_estimators"])]
mean_scores = np.array(mean_scores).reshape(len(grid.param_grid["max_depth"]), len(grid.param_grid["n_estimators"]))
# Plot heatmap
plt.figure(figsize=(20, 10))
plt.imshow(mean_scores.transpose(), interpolation="nearest", cmap="viridis")
plt.title("Grid search results for random forest classifier", fontsize=18)
plt.xlabel("max_depth", fontsize=14)
plt.ylabel("n_estimators", fontsize=14)
# Label None as "None"
if None in grid.param_grid["max_depth"]:
index_none = grid.param_grid["max_depth"].index(None)
grid.param_grid["max_depth"][index_none] = "None"
plt.xticks(np.arange(len(grid.param_grid["max_depth"])), grid.param_grid["max_depth"])
plt.yticks(np.arange(len(grid.param_grid["n_estimators"])), grid.param_grid["n_estimators"])
# Add scores as text annotations within the heatmap
for i in range(len(grid.param_grid["max_depth"])):
for j in range(len(grid.param_grid["n_estimators"])):
plt.text(i, j, f"{mean_scores[i, j]:.2f}", ha="center", va="center", color="white")
plt.colorbar(label="Mean accuracy on dev set")
# Save figure
plt.savefig(f"../figures/random_forest/heatmap_max_depth_n_estimators_{feature_description}.png")
print(f"Saved figure as ../figures/random_forest/heatmap_max_depth_n_estimators_{feature_description}.png")
\ No newline at end of file
File moved
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment