Commit 3956472c authored by F1nnH

Rename files

parent 7148fd81

"""Helper functions to find the optimal parameters for the classifiers using grid search."""
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns


def find_best_params_for_decision_tree(X_train: list[np.ndarray], y_train: np.ndarray, X_dev: list[np.ndarray], y_dev: np.ndarray, feature_description: str) -> dict:
    """
    Finds the optimal parameters for the decision tree classifier using a predefined grid and plots the results.

    Args:
        X_train: Training data.
        y_train: Training labels.
        X_dev: Development data.
        y_dev: Development labels.
        feature_description: A string describing the features used.

    Returns:
        best_params: A dictionary containing the optimal parameters for the decision tree classifier.
    """
    # Create a dictionary of all values we want to test
    param_grid = {
        "max_depth": list(range(10,81,10)) + [None],
        "max_features": ["sqrt", "log2"],
        "min_samples_leaf": [1,2,5,10,20],
        "min_samples_split": [2,5,10,20],
        "criterion": ["gini", "entropy"]
    }
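    # Size of the search: max_depth contributes 8 values plus None (9 in total), so the grid
    # spans 9 * 2 * 5 * 4 * 2 = 720 parameter combinations; with the single predefined
    # train/dev split below, GridSearchCV fits one tree per combination, i.e. 720 fits.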
    # Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
    #   index -1 for training data
    #   index 0 for dev data
    # Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
    split_index = [-1] * len(X_train) + [0] * len(X_dev)
    X_train_dev = np.concatenate((X_train, X_dev))
    y_train_dev = np.concatenate((y_train, y_dev))
    pds = PredefinedSplit(test_fold=split_index)
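    # Illustrative example (toy sizes, not the project data): with 3 training and 2 dev samples,
    # split_index == [-1, -1, -1, 0, 0] and PredefinedSplit yields exactly one split:
    #   train indices [0, 1, 2] (the -1 entries, never used for validation)
    #   test  indices [3, 4]    (fold 0, i.e. the dev set)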
    # Use GridSearch to test all values
    # Adjust n_jobs to the number of cores you have available
    dtree_gscv = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), param_grid=param_grid, cv=pds, verbose=3, n_jobs=8, return_train_score=True)
    dtree_gscv.fit(X_train_dev, y_train_dev)
    # Print best model parameters after tuning
    best_params = dtree_gscv.best_params_
    print(f"Best parameters for decision tree classifier: {best_params}")
    # Plot the GridSearch results
    plot_grid_search_results(dtree_gscv, feature_description, "decision_tree")
    return best_params


def find_best_params_for_random_forest(X_train, y_train, X_dev, y_dev, feature_description):
    """
    Finds the optimal parameters for the random forest classifier using a predefined grid and plots the results.

    Args:
        X_train: Training data.
        y_train: Training labels.
        X_dev: Development data.
        y_dev: Development labels.
        feature_description: A string describing the features used.

    Returns:
        best_params: A dictionary containing the optimal parameters for the random forest classifier.
    """
    # Create a dictionary of all values we want to test
    # Smaller grid for testing purposes:
    # param_grid = {
    #     "max_depth": list(range(5,9,1)),
    #     "n_estimators": [1,2,3,4],
    #     "min_samples_leaf": [1,2],
    #     "min_samples_split": [2,5],
    # }
    param_grid = {
        "max_depth": list(range(10,81,10)) + [None],
        "n_estimators": [10,50,100],
        "max_features": ["sqrt", "log2"],
        "min_samples_leaf": [1,2,5,10,20],
        "min_samples_split": [2,5,10,20]
    }
    # Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
    #   index -1 for training data
    #   index 0 for dev data
    # Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
    split_index = [-1] * len(X_train) + [0] * len(X_dev)
    X_train_dev = np.concatenate((X_train, X_dev))
    y_train_dev = np.concatenate((y_train, y_dev))
    pds = PredefinedSplit(test_fold=split_index)
    # Use GridSearch to test all values
    # Adjust n_jobs to the number of cores you have available
    rforest_gscv = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=pds, verbose=3, n_jobs=8, return_train_score=True, scoring='accuracy')
    rforest_gscv.fit(X_train_dev, y_train_dev)
    # Print best model parameters after tuning
    best_params = rforest_gscv.best_params_
    print(f"Best parameters for random forest classifier: {best_params}")
    # Plot the GridSearch results
    plot_heatmap_for_max_depth_and_n_estimators(rforest_gscv, feature_description)
    plot_grid_search_results(rforest_gscv, feature_description, "random_forest")
    return best_params


def find_best_params_for_naive_bayes(X_train, y_train, X_dev, y_dev, feature_description):
    """
    Finds the optimal parameters for the naive bayes classifier using a predefined grid and plots the results.

    Args:
        X_train: Training data.
        y_train: Training labels.
        X_dev: Development data.
        y_dev: Development labels.
        feature_description: A string describing the features used.

    Returns:
        best_params: A dictionary containing the optimal parameters for the naive bayes classifier.
    """
    # Create a dictionary of all values we want to test; for GaussianNB, var_smoothing is the only hyperparameter
    param_grid = {
        "var_smoothing": np.logspace(0,-20, num=20)
    }
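    # Note on the grid above: np.logspace(0, -20, num=20) yields 20 log-spaced candidates
    # from 1e0 down to 1e-20 (roughly one per decade), so the search covers both heavy
    # and essentially negligible variance smoothing.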
    # Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
    #   index -1 for training data
    #   index 0 for dev data
    # Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
    split_index = [-1] * len(X_train) + [0] * len(X_dev)
    X_train_dev = np.concatenate((X_train, X_dev))
    y_train_dev = np.concatenate((y_train, y_dev))
    pds = PredefinedSplit(test_fold=split_index)
    # Use GridSearch to test all values
    # Adjust n_jobs to the number of cores you have available
    bayes_gscv = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid, cv=pds, verbose=3, n_jobs=6, return_train_score=True)
    bayes_gscv.fit(X_train_dev, y_train_dev)
    best_params = bayes_gscv.best_params_
    print(f"Best parameters for naive bayes classifier: {best_params}")
    # Plot the GridSearch results
    plot_grid_search_results(bayes_gscv, feature_description, "naive_bayes")
    return best_params


def plot_grid_search_results(grid, feature_description, classifier_name):
    """
    Plots the results of the grid search for a classifier and saves the figure.

    Args:
        grid: A trained GridSearchCV object.
        feature_description: A string describing the features used.
        classifier_name: A string describing the classifier used.
    """
    # Results from grid search
    results = grid.cv_results_
    means_test = results['mean_test_score']
    stds_test = results['std_test_score']
    means_train = results['mean_train_score']
    stds_train = results['std_train_score']
    # Build one boolean mask per hyperparameter, marking the cv_results_ rows where it takes its best value
    masks = []
    masks_names = list(grid.best_params_.keys())
    # Move criterion (gini or entropy) to the end because we want the plot for the decision tree to be similar to the one for random forest
    if 'criterion' in masks_names:
        masks_names.remove('criterion')
        masks_names.append('criterion')
    best_params_ordered = {k: v for k, v in grid.best_params_.items() if k != 'criterion'}
    if 'criterion' in grid.best_params_:
        best_params_ordered['criterion'] = grid.best_params_['criterion']
    for p_k, p_v in best_params_ordered.items():
        masks.append(list(results['param_' + p_k].data == p_v))
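    # Illustrative example (hypothetical values, not actual results): if best_params_ were
    # {"max_depth": 20, "min_samples_leaf": 1}, masks[0] would flag every cv_results_ row with
    # max_depth == 20 and masks[1] every row with min_samples_leaf == 1. Combining all masks
    # except the i-th one below keeps only the rows where every other hyperparameter is fixed
    # at its best value while the i-th one varies.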
    params = grid.param_grid
    # Replace None with the string "None" so matplotlib can show it as a tick label
    if 'max_depth' in params:
        if None in params['max_depth']:
            index_none = params['max_depth'].index(None)
            params['max_depth'][index_none] = "None"
    # Create a subplot for each hyperparameter
    fig, ax = plt.subplots(1, len(params), sharex='none', sharey='all', figsize=(30,10))
    fig.suptitle('Score per parameter')
    fig.text(0.04, 0.5, 'MEAN ACCURACY', va='center', rotation='vertical')
    # With a single hyperparameter, plt.subplots() returns a single Axes object, so wrap it in a list
    if len(masks_names) == 1:
        ax = [ax]
    # If there's more than one hyperparameter, we combine the masks of all other hyperparameters to isolate the results for the current one
    for i, p in enumerate(masks_names):
        if len(masks_names) > 1:
            m = np.stack(masks[:i] + masks[i+1:])
            best_params_mask = m.all(axis=0)
            best_index = np.where(best_params_mask)[0]
            x = np.array(params[p])
            y_1 = np.array(means_test[best_index])
            e_1 = np.array(stds_test[best_index])
            y_2 = np.array(means_train[best_index])
            e_2 = np.array(stds_train[best_index])
            ax[i].errorbar(x, y_1, e_1, linestyle='--', marker='o', label='dev')
            ax[i].errorbar(x, y_2, e_2, linestyle='-', marker='^', label='train')
            ax[i].set_xlabel(p.upper())
        else:
            x = np.array(params[p])
            y_1 = np.array(means_test)
            e_1 = np.array(stds_test)
            y_2 = np.array(means_train)
            e_2 = np.array(stds_train)
            ax[i].errorbar(x, y_1, e_1, linestyle='--', marker='o', label='dev')
            ax[i].errorbar(x, y_2, e_2, linestyle='-', marker='^', label='train')
            ax[i].set_xlabel(p.upper())
            ax[i].set_xscale('log')  # for var_smoothing in naive bayes
    plt.legend()
    plt.savefig(f"../figures/{classifier_name}/grid_search_results_{feature_description}.png")


def plot_heatmap_for_max_depth_and_n_estimators(grid, feature_description: str):
    """
    Plots a heatmap for the GridSearch results of the random forest classifier and saves the figure.

    Args:
        grid: A trained GridSearchCV object.
        feature_description: A string describing the features used.
    """
    # For every (max_depth, n_estimators) combination there are several grid points (one per setting
    # of the remaining hyperparameters), so take the best mean dev-set score over those settings
    max_depths = grid.param_grid["max_depth"]
    n_estimators_values = grid.param_grid["n_estimators"]
    depth_per_row = grid.cv_results_["param_max_depth"]
    trees_per_row = grid.cv_results_["param_n_estimators"]
    test_scores = grid.cv_results_["mean_test_score"]
    mean_scores = np.empty((len(max_depths), len(n_estimators_values)))
    for i, depth in enumerate(max_depths):
        for j, trees in enumerate(n_estimators_values):
            row_mask = np.array([d == depth and t == trees for d, t in zip(depth_per_row, trees_per_row)])
            mean_scores[i, j] = test_scores[row_mask].max()
    # Plot heatmap
    plt.figure(figsize=(20, 10))
    plt.imshow(mean_scores.transpose(), interpolation="nearest", cmap="viridis")
    plt.title("Grid search results for random forest classifier", fontsize=18)
    plt.xlabel("max_depth", fontsize=14)
    plt.ylabel("n_estimators", fontsize=14)
    # Label None as "None"
    if None in grid.param_grid["max_depth"]:
        index_none = grid.param_grid["max_depth"].index(None)
        grid.param_grid["max_depth"][index_none] = "None"
    plt.xticks(np.arange(len(grid.param_grid["max_depth"])), grid.param_grid["max_depth"])
    plt.yticks(np.arange(len(grid.param_grid["n_estimators"])), grid.param_grid["n_estimators"])
    # Add scores as text annotations within the heatmap
    for i in range(len(grid.param_grid["max_depth"])):
        for j in range(len(grid.param_grid["n_estimators"])):
            plt.text(i, j, f"{mean_scores[i, j]:.2f}", ha="center", va="center", color="white")
    plt.colorbar(label="Mean accuracy on dev set (best over the remaining hyperparameters)")
    # Save figure
    plt.savefig(f"../figures/random_forest/heatmap_max_depth_n_estimators_{feature_description}.png")
    print(f"Saved figure as ../figures/random_forest/heatmap_max_depth_n_estimators_{feature_description}.png")
"""Helper functions to find the optimal parameters for the classifiers using grid search."""
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
def find_best_params_for_decision_tree(X_train: list[np.ndarray], y_train: str, X_dev: list[np.ndarray], y_dev: str, feature_description: str):
"""
Finds the optimal parameters for the decision tree classifier using a predefined grid and plots the results.
Args:
X_train: Training data.
y_train: Training labels.
X_dev: Development data.
y_dev: Development labels.
feature_description: A string describing the features used.
Returns:
best_params: A dictionary containing the optimal parameters for the decision tree classifier.
"""
# Create a dictionary of all values we want to test
param_grid = {
"max_depth": list(range(10,81,10)) + [None],
"max_features": ["sqrt", "log2"],
"min_samples_leaf": [1,2,5,10,20],
"min_samples_split": [2,5,10,20],
"criterion": ["gini", "entropy"]
}
# Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
# index -1 for training data
# index 0 for dev data
# Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
split_index = [-1] * len(X_train) + [0] * len(X_dev)
X_train_dev = np.concatenate((X_train, X_dev))
y_train_dev = np.concatenate((y_train, y_dev))
pds = PredefinedSplit(test_fold=split_index)
# Use GridSearch to test all values
# Adjust n_jobs to the number of cores you have available
dtree_gscv = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), param_grid=param_grid, cv=pds, verbose=3, n_jobs=8, return_train_score=True)
dtree_gscv.fit(X_train_dev, y_train_dev)
# Print best model parameters after tuning
best_params = dtree_gscv.best_params_
print(f"Best parameters for decision tree classifier: {best_params}")
# Plot the GridSearch results
plot_grid_search_results(dtree_gscv, feature_description, "decision_tree")
return best_params
def find_best_params_for_random_forest(X_train, y_train, X_dev, y_dev, feature_description):
"""
Finds the optimal parameters for the random forest classifier using a predefined grid and plots the results.
Args:
X_train: Training data.
y_train: Training labels.
X_dev: Development data.
y_dev: Development labels.
feature_description: A string describing the features used.
Returns:
best_params: A dictionary containing the optimal parameters for the random forest classifier.
"""
# Create a dictionary of all values we want to test
"""
# Smaller grid for testing purposes
param_grid = {
"max_depth": list(range(5,9,1)),
"n_estimators": [1,2,3,4],
"min_samples_leaf": [1,2],
"min_samples_split": [2,5],
}
"""
param_grid = {
"max_depth": list(range(10,81,10)) + [None],
"n_estimators": [10,50,100],
"max_features": ["sqrt", "log2"],
"min_samples_leaf": [1,2,5,10,20],
"min_samples_split": [2,5,10,20]
}
# Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
# index -1 for training data
# index 0 for dev data
# Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
split_index = [-1] * len(X_train) + [0] * len(X_dev)
X_train_dev = np.concatenate((X_train, X_dev))
y_train_dev = np.concatenate((y_train, y_dev))
pds = PredefinedSplit(test_fold=split_index)
# Use GridSearch to test all values
# Adjust n_jobs to the number of cores you have available
rforest_gscv = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=pds, verbose=3, n_jobs=8, return_train_score=True, scoring='accuracy')
rforest_gscv.fit(X_train_dev, y_train_dev)
# Print best model parameters after tuning
best_params = rforest_gscv.best_params_
print(f"Best parameters for random forest classifier: {best_params}")
# Plot the GridSearch results
plot_heatmap_for_max_depth_and_n_estimators(rforest_gscv, feature_description)
plot_grid_search_results(rforest_gscv, feature_description, "random_forest")
return best_params
def find_best_params_for_naive_bayes(X_train, y_train, X_dev, y_dev, feature_description):
"""
Finds the optimal parameters for the naive bayes classifier using a predefined grid and plots the results.
Args:
X_train: Training data.
y_train: Training labels.
X_dev: Development data.
y_dev: Development labels.
feature_description: A string describing the features used.
Returns:
best_params: A dictionary containing the optimal parameters for the naive bayes classifier.
"""
# Create a dictionary of all values we want to test; for GaussianNB, var_smoothing is the only hyperparameter
param_grid = {
"var_smoothing": np.logspace(0,-20, num=20)
}
# Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
# index -1 for training data
# index 0 for dev data
# Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
split_index = [-1] * len(X_train) + [0] * len(X_dev)
X_train_dev = np.concatenate((X_train, X_dev))
y_train_dev = np.concatenate((y_train, y_dev))
pds = PredefinedSplit(test_fold=split_index)
# Use GridSearch to test all values
# Adjust n_jobs to the number of cores you have available
bayes_gscv = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid, cv=pds, verbose=3, n_jobs=6, return_train_score=True)
bayes_gscv.fit(X_train_dev, y_train_dev)
best_params = bayes_gscv.best_params_
print(f"Best parameters for naive bayes classifier: {best_params}")
# Plot the GridSearch results
plot_grid_search_results(bayes_gscv, feature_description, "naive_bayes")
return best_params
def plot_grid_search_results(grid, feature_description, classifier_name):
"""
Plots the results of the grid search for a classifier and saves the figure.
Args:
grid: A trained GridSearchCV object.
feature_description: A string describing the features used.
classifier_name: A string describing the classifier used.
"""
# Results from grid search
results = grid.cv_results_
means_test = results['mean_test_score']
stds_test = results['std_test_score']
means_train = results['mean_train_score']
stds_train = results['std_train_score']
# Our masks are the names of the hyperparameters
masks=[]
masks_names= list(grid.best_params_.keys())
# Move criterion (gini or entropy) to the end because we want the plot for the decision tree to be similar to the one for random forest
if 'criterion' in masks_names:
masks_names.remove('criterion')
masks_names.append('criterion')
best_params_ordered = {k: v for k, v in grid.best_params_.items() if k != 'criterion'}
if 'criterion' in grid.best_params_:
best_params_ordered['criterion'] = grid.best_params_['criterion']
for p_k, p_v in best_params_ordered.items():
masks.append(list(results['param_'+p_k].data==p_v))
params=grid.param_grid
# Replace 'None' with None because otherwise it will be plotted as a string
if 'max_depth' in params:
if None in params['max_depth']:
index_none = params['max_depth'].index(None)
params['max_depth'][index_none] = "None"
# Create a subplot for each hyperparameter
fig, ax = plt.subplots(1,len(params),sharex='none', sharey='all',figsize=(30,10))
fig.suptitle('Score per parameter')
fig.text(0.04, 0.5, 'MEAN ACCURACY', va='center', rotation='vertical')
print(masks_names)
if len(masks_names) == 1:
ax = [ax]
# If there's more than one hyperparameter, we combine masks for all other hyperparameters to isolate the results for the current hyperparameter
for i, p in enumerate(masks_names):
if len(masks_names) > 1:
m = np.stack(masks[:i] + masks[i+1:])
best_parms_mask = m.all(axis=0)
best_index = np.where(best_parms_mask)[0]
x = np.array(params[p])
y_1 = np.array(means_test[best_index])
e_1 = np.array(stds_test[best_index])
y_2 = np.array(means_train[best_index])
e_2 = np.array(stds_train[best_index])
ax[i].errorbar(x, y_1, e_1, linestyle='--', marker='o', label='dev')
ax[i].errorbar(x, y_2, e_2, linestyle='-', marker='^',label='train' )
ax[i].set_xlabel(p.upper())
else:
x = np.array(params[p])
y_1 = np.array(means_test)
e_1 = np .array(stds_test)
y_2 = np.array(means_train)
e_2 = np.array(stds_train)
ax[i].errorbar(x, y_1, e_1, linestyle='--', marker='o', label='dev')
ax[i].errorbar(x, y_2, e_2, linestyle='-', marker='^',label='train' )
ax[i].set_xlabel(p.upper())
ax[i].set_xscale('log') # for var_smoothing in naive bayes
plt.legend()
plt.savefig(f"../figures/{classifier_name}/grid_search_results_{feature_description}.png")
def plot_heatmap_for_max_depth_and_n_estimators(grid, feature_description: str):
"""
Plots a heatmap for the GridSearch results of the random forest classifier and saves the figure.
Args:
grid: A trained GridSearchCV object.
feature_description: A string describing the features used.
"""
# Mean test scores for all combinations of max_depth and n_estimators
mean_scores = grid.cv_results_["mean_test_score"][:len(grid.param_grid["max_depth"])*len(grid.param_grid["n_estimators"])]
mean_scores = np.array(mean_scores).reshape(len(grid.param_grid["max_depth"]), len(grid.param_grid["n_estimators"]))
# Plot heatmap
plt.figure(figsize=(20, 10))
plt.imshow(mean_scores.transpose(), interpolation="nearest", cmap="viridis")
plt.title("Grid search results for random forest classifier", fontsize=18)
plt.xlabel("max_depth", fontsize=14)
plt.ylabel("n_estimators", fontsize=14)
# Label None as "None"
if None in grid.param_grid["max_depth"]:
index_none = grid.param_grid["max_depth"].index(None)
grid.param_grid["max_depth"][index_none] = "None"
plt.xticks(np.arange(len(grid.param_grid["max_depth"])), grid.param_grid["max_depth"])
plt.yticks(np.arange(len(grid.param_grid["n_estimators"])), grid.param_grid["n_estimators"])
# Add scores as text annotations within the heatmap
for i in range(len(grid.param_grid["max_depth"])):
for j in range(len(grid.param_grid["n_estimators"])):
plt.text(i, j, f"{mean_scores[i, j]:.2f}", ha="center", va="center", color="white")
plt.colorbar(label="Mean accuracy on dev set")
# Save figure
plt.savefig(f"../figures/random_forest/heatmap_max_depth_n_estimators_{feature_description}.png")
print(f"Saved figure as ../figures/random_forest/heatmap_max_depth_n_estimators_{feature_description}.png")
\ No newline at end of file
File moved
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment