igraf / exp-ml-2-hillengass-graf / Commits

Commit 3956472c
Authored 1 year ago by F1nnH

Rename files

Parent: 7148fd81

Showing 2 changed files with 275 additions and 275 deletions:

project/src/find_optimal_parameters_basic.py (+275, −275)
project/src/results_basic_classifiers.txt (+0, −0)

project/src/find_optimal_parameters.py → project/src/find_optimal_parameters_basic.py (+275, −275)
"""
Helper functions to find the optimal parameters for the classifiers using grid search.
"""
import
numpy
as
np
import
matplotlib.pyplot
as
plt
import
pandas
as
pd
from
sklearn.model_selection
import
PredefinedSplit
,
GridSearchCV
from
sklearn.tree
import
DecisionTreeClassifier
from
sklearn.naive_bayes
import
GaussianNB
from
sklearn.ensemble
import
RandomForestClassifier
import
seaborn
as
sns
def
find_best_params_for_decision_tree
(
X_train
:
list
[
np
.
ndarray
],
y_train
:
str
,
X_dev
:
list
[
np
.
ndarray
],
y_dev
:
str
,
feature_description
:
str
):
"""
Finds the optimal parameters for the decision tree classifier using a predefined grid and plots the results.
Args:
X_train: Training data.
y_train: Training labels.
X_dev: Development data.
y_dev: Development labels.
feature_description: A string describing the features used.
Returns:
best_params: A dictionary containing the optimal parameters for the decision tree classifier.
"""
# Create a dictionary of all values we want to test
param_grid
=
{
"
max_depth
"
:
list
(
range
(
10
,
81
,
10
))
+
[
None
],
"
max_features
"
:
[
"
sqrt
"
,
"
log2
"
],
"
min_samples_leaf
"
:
[
1
,
2
,
5
,
10
,
20
],
"
min_samples_split
"
:
[
2
,
5
,
10
,
20
],
"
criterion
"
:
[
"
gini
"
,
"
entropy
"
]
}
# Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
# index -1 for training data
# index 0 for dev data
# Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
split_index
=
[
-
1
]
*
len
(
X_train
)
+
[
0
]
*
len
(
X_dev
)
X_train_dev
=
np
.
concatenate
((
X_train
,
X_dev
))
y_train_dev
=
np
.
concatenate
((
y_train
,
y_dev
))
pds
=
PredefinedSplit
(
test_fold
=
split_index
)
# Use GridSearch to test all values
# Adjust n_jobs to the number of cores you have available
dtree_gscv
=
GridSearchCV
(
estimator
=
DecisionTreeClassifier
(
random_state
=
42
),
param_grid
=
param_grid
,
cv
=
pds
,
verbose
=
3
,
n_jobs
=
8
,
return_train_score
=
True
)
dtree_gscv
.
fit
(
X_train_dev
,
y_train_dev
)
# Print best model parameters after tuning
best_params
=
dtree_gscv
.
best_params_
print
(
f
"
Best parameters for decision tree classifier:
{
best_params
}
"
)
# Plot the GridSearch results
plot_grid_search_results
(
dtree_gscv
,
feature_description
,
"
decision_tree
"
)
return
best_params
def
find_best_params_for_random_forest
(
X_train
,
y_train
,
X_dev
,
y_dev
,
feature_description
):
"""
Finds the optimal parameters for the random forest classifier using a predefined grid and plots the results.
Args:
X_train: Training data.
y_train: Training labels.
X_dev: Development data.
y_dev: Development labels.
feature_description: A string describing the features used.
Returns:
best_params: A dictionary containing the optimal parameters for the random forest classifier.
"""
# Create a dictionary of all values we want to test
"""
# Smaller grid for testing purposes
param_grid = {
"
max_depth
"
: list(range(5,9,1)),
"
n_estimators
"
: [1,2,3,4],
"
min_samples_leaf
"
: [1,2],
"
min_samples_split
"
: [2,5],
}
"""
param_grid
=
{
"
max_depth
"
:
list
(
range
(
10
,
81
,
10
))
+
[
None
],
"
n_estimators
"
:
[
10
,
50
,
100
],
"
max_features
"
:
[
"
sqrt
"
,
"
log2
"
],
"
min_samples_leaf
"
:
[
1
,
2
,
5
,
10
,
20
],
"
min_samples_split
"
:
[
2
,
5
,
10
,
20
]
}
# Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
# index -1 for training data
# index 0 for dev data
# Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
split_index
=
[
-
1
]
*
len
(
X_train
)
+
[
0
]
*
len
(
X_dev
)
X_train_dev
=
np
.
concatenate
((
X_train
,
X_dev
))
y_train_dev
=
np
.
concatenate
((
y_train
,
y_dev
))
pds
=
PredefinedSplit
(
test_fold
=
split_index
)
# Use GridSearch to test all values
# Adjust n_jobs to the number of cores you have available
rforest_gscv
=
GridSearchCV
(
estimator
=
RandomForestClassifier
(
random_state
=
42
),
param_grid
=
param_grid
,
cv
=
pds
,
verbose
=
3
,
n_jobs
=
8
,
return_train_score
=
True
,
scoring
=
'
accuracy
'
)
rforest_gscv
.
fit
(
X_train_dev
,
y_train_dev
)
# Print best model parameters after tuning
best_params
=
rforest_gscv
.
best_params_
print
(
f
"
Best parameters for random forest classifier:
{
best_params
}
"
)
# Plot the GridSearch results
plot_heatmap_for_max_depth_and_n_estimators
(
rforest_gscv
,
feature_description
)
plot_grid_search_results
(
rforest_gscv
,
feature_description
,
"
random_forest
"
)
return
best_params
def
find_best_params_for_naive_bayes
(
X_train
,
y_train
,
X_dev
,
y_dev
,
feature_description
):
"""
Finds the optimal parameters for the naive bayes classifier using a predefined grid and plots the results.
Args:
X_train: Training data.
y_train: Training labels.
X_dev: Development data.
y_dev: Development labels.
feature_description: A string describing the features used.
Returns:
best_params: A dictionary containing the optimal parameters for the naive bayes classifier.
"""
# Create a dictionary of all values we want to test; for GaussianNB, var_smoothing is the only hyperparameter
param_grid
=
{
"
var_smoothing
"
:
np
.
logspace
(
0
,
-
20
,
num
=
20
)
}
# Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
# index -1 for training data
# index 0 for dev data
# Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
split_index
=
[
-
1
]
*
len
(
X_train
)
+
[
0
]
*
len
(
X_dev
)
X_train_dev
=
np
.
concatenate
((
X_train
,
X_dev
))
y_train_dev
=
np
.
concatenate
((
y_train
,
y_dev
))
pds
=
PredefinedSplit
(
test_fold
=
split_index
)
# Use GridSearch to test all values
# Adjust n_jobs to the number of cores you have available
bayes_gscv
=
GridSearchCV
(
estimator
=
GaussianNB
(),
param_grid
=
param_grid
,
cv
=
pds
,
verbose
=
3
,
n_jobs
=
6
,
return_train_score
=
True
)
bayes_gscv
.
fit
(
X_train_dev
,
y_train_dev
)
best_params
=
bayes_gscv
.
best_params_
print
(
f
"
Best parameters for naive bayes classifier:
{
best_params
}
"
)
# Plot the GridSearch results
plot_grid_search_results
(
bayes_gscv
,
feature_description
,
"
naive_bayes
"
)
return
best_params
def
plot_grid_search_results
(
grid
,
feature_description
,
classifier_name
):
"""
Plots the results of the grid search for a classifier and saves the figure.
Args:
grid: A trained GridSearchCV object.
feature_description: A string describing the features used.
classifier_name: A string describing the classifier used.
"""
# Results from grid search
results
=
grid
.
cv_results_
means_test
=
results
[
'
mean_test_score
'
]
stds_test
=
results
[
'
std_test_score
'
]
means_train
=
results
[
'
mean_train_score
'
]
stds_train
=
results
[
'
std_train_score
'
]
# Our masks are the names of the hyperparameters
masks
=
[]
masks_names
=
list
(
grid
.
best_params_
.
keys
())
# Move criterion (gini or entropy) to the end because we want the plot for the decision tree to be similar to the one for random forest
if
'
criterion
'
in
masks_names
:
masks_names
.
remove
(
'
criterion
'
)
masks_names
.
append
(
'
criterion
'
)
best_params_ordered
=
{
k
:
v
for
k
,
v
in
grid
.
best_params_
.
items
()
if
k
!=
'
criterion
'
}
if
'
criterion
'
in
grid
.
best_params_
:
best_params_ordered
[
'
criterion
'
]
=
grid
.
best_params_
[
'
criterion
'
]
for
p_k
,
p_v
in
best_params_ordered
.
items
():
masks
.
append
(
list
(
results
[
'
param_
'
+
p_k
].
data
==
p_v
))
params
=
grid
.
param_grid
# Replace 'None' with None because otherwise it will be plotted as a string
if
'
max_depth
'
in
params
:
if
None
in
params
[
'
max_depth
'
]:
index_none
=
params
[
'
max_depth
'
].
index
(
None
)
params
[
'
max_depth
'
][
index_none
]
=
"
None
"
# Create a subplot for each hyperparameter
fig
,
ax
=
plt
.
subplots
(
1
,
len
(
params
),
sharex
=
'
none
'
,
sharey
=
'
all
'
,
figsize
=
(
30
,
10
))
fig
.
suptitle
(
'
Score per parameter
'
)
fig
.
text
(
0.04
,
0.5
,
'
MEAN ACCURACY
'
,
va
=
'
center
'
,
rotation
=
'
vertical
'
)
print
(
masks_names
)
if
len
(
masks_names
)
==
1
:
ax
=
[
ax
]
# If there's more than one hyperparameter, we combine masks for all other hyperparameters to isolate the results for the current hyperparameter
for
i
,
p
in
enumerate
(
masks_names
):
if
len
(
masks_names
)
>
1
:
m
=
np
.
stack
(
masks
[:
i
]
+
masks
[
i
+
1
:])
best_parms_mask
=
m
.
all
(
axis
=
0
)
best_index
=
np
.
where
(
best_parms_mask
)[
0
]
x
=
np
.
array
(
params
[
p
])
y_1
=
np
.
array
(
means_test
[
best_index
])
e_1
=
np
.
array
(
stds_test
[
best_index
])
y_2
=
np
.
array
(
means_train
[
best_index
])
e_2
=
np
.
array
(
stds_train
[
best_index
])
ax
[
i
].
errorbar
(
x
,
y_1
,
e_1
,
linestyle
=
'
--
'
,
marker
=
'
o
'
,
label
=
'
dev
'
)
ax
[
i
].
errorbar
(
x
,
y_2
,
e_2
,
linestyle
=
'
-
'
,
marker
=
'
^
'
,
label
=
'
train
'
)
ax
[
i
].
set_xlabel
(
p
.
upper
())
else
:
x
=
np
.
array
(
params
[
p
])
y_1
=
np
.
array
(
means_test
)
e_1
=
np
.
array
(
stds_test
)
y_2
=
np
.
array
(
means_train
)
e_2
=
np
.
array
(
stds_train
)
ax
[
i
].
errorbar
(
x
,
y_1
,
e_1
,
linestyle
=
'
--
'
,
marker
=
'
o
'
,
label
=
'
dev
'
)
ax
[
i
].
errorbar
(
x
,
y_2
,
e_2
,
linestyle
=
'
-
'
,
marker
=
'
^
'
,
label
=
'
train
'
)
ax
[
i
].
set_xlabel
(
p
.
upper
())
ax
[
i
].
set_xscale
(
'
log
'
)
# for var_smoothing in naive bayes
plt
.
legend
()
plt
.
savefig
(
f
"
../figures/
{
classifier_name
}
/grid_search_results_
{
feature_description
}
.png
"
)
def
plot_heatmap_for_max_depth_and_n_estimators
(
grid
,
feature_description
:
str
):
"""
Plots a heatmap for the GridSearch results of the random forest classifier and saves the figure.
Args:
grid: A trained GridSearchCV object.
feature_description: A string describing the features used.
"""
# Mean test scores for all combinations of max_depth and n_estimators
mean_scores
=
grid
.
cv_results_
[
"
mean_test_score
"
][:
len
(
grid
.
param_grid
[
"
max_depth
"
])
*
len
(
grid
.
param_grid
[
"
n_estimators
"
])]
mean_scores
=
np
.
array
(
mean_scores
).
reshape
(
len
(
grid
.
param_grid
[
"
max_depth
"
]),
len
(
grid
.
param_grid
[
"
n_estimators
"
]))
# Plot heatmap
plt
.
figure
(
figsize
=
(
20
,
10
))
plt
.
imshow
(
mean_scores
.
transpose
(),
interpolation
=
"
nearest
"
,
cmap
=
"
viridis
"
)
plt
.
title
(
"
Grid search results for random forest classifier
"
,
fontsize
=
18
)
plt
.
xlabel
(
"
max_depth
"
,
fontsize
=
14
)
plt
.
ylabel
(
"
n_estimators
"
,
fontsize
=
14
)
# Label None as "None"
if
None
in
grid
.
param_grid
[
"
max_depth
"
]:
index_none
=
grid
.
param_grid
[
"
max_depth
"
].
index
(
None
)
grid
.
param_grid
[
"
max_depth
"
][
index_none
]
=
"
None
"
plt
.
xticks
(
np
.
arange
(
len
(
grid
.
param_grid
[
"
max_depth
"
])),
grid
.
param_grid
[
"
max_depth
"
])
plt
.
yticks
(
np
.
arange
(
len
(
grid
.
param_grid
[
"
n_estimators
"
])),
grid
.
param_grid
[
"
n_estimators
"
])
# Add scores as text annotations within the heatmap
for
i
in
range
(
len
(
grid
.
param_grid
[
"
max_depth
"
])):
for
j
in
range
(
len
(
grid
.
param_grid
[
"
n_estimators
"
])):
plt
.
text
(
i
,
j
,
f
"
{
mean_scores
[
i
,
j
]
:
.
2
f
}
"
,
ha
=
"
center
"
,
va
=
"
center
"
,
color
=
"
white
"
)
plt
.
colorbar
(
label
=
"
Mean accuracy on dev set
"
)
# Save figure
plt
.
savefig
(
f
"
../figures/random_forest/heatmap_max_depth_n_estimators_
{
feature_description
}
.png
"
)
print
(
f
"
Saved figure as ../figures/random_forest/heatmap_max_depth_n_estimators_
{
feature_description
}
.png
"
)
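
For context: all three tuning helpers above rely on the same PredefinedSplit convention to obtain a single fixed train/dev evaluation instead of k-fold cross-validation. Below is a minimal sketch of how that convention behaves and how one of the helpers might be invoked; the toy data and the "toy_features" label are assumptions for illustration only and are not part of this commit.

# Minimal usage sketch (assumes the helpers above are importable and that the
# ../figures/decision_tree/ output directory exists, since the helpers save plots there).
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import PredefinedSplit, train_test_split

# Toy stand-in for the project's feature matrices and label arrays.
X, y = make_classification(n_samples=400, n_features=10, random_state=42)
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.25, random_state=42)

# The -1/0 convention: rows marked -1 stay in the training fold,
# rows marked 0 form the single evaluation ("dev") fold.
split_index = [-1] * len(X_train) + [0] * len(X_dev)
pds = PredefinedSplit(test_fold=split_index)
train_idx, dev_idx = next(iter(pds.split()))
print(len(train_idx), len(dev_idx))  # 300 100: one fixed train/dev split, no k-fold CV

# Hypothetical call into the decision tree helper; "toy_features" only affects
# the file name of the saved figure.
# best_params = find_best_params_for_decision_tree(X_train, y_train, X_dev, y_dev, "toy_features")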
"""
Helper functions to find the optimal parameters for the classifiers using grid search.
"""
import
numpy
as
np
import
matplotlib.pyplot
as
plt
import
pandas
as
pd
from
sklearn.model_selection
import
PredefinedSplit
,
GridSearchCV
from
sklearn.tree
import
DecisionTreeClassifier
from
sklearn.naive_bayes
import
GaussianNB
from
sklearn.ensemble
import
RandomForestClassifier
import
seaborn
as
sns
def
find_best_params_for_decision_tree
(
X_train
:
list
[
np
.
ndarray
],
y_train
:
str
,
X_dev
:
list
[
np
.
ndarray
],
y_dev
:
str
,
feature_description
:
str
):
"""
Finds the optimal parameters for the decision tree classifier using a predefined grid and plots the results.
Args:
X_train: Training data.
y_train: Training labels.
X_dev: Development data.
y_dev: Development labels.
feature_description: A string describing the features used.
Returns:
best_params: A dictionary containing the optimal parameters for the decision tree classifier.
"""
# Create a dictionary of all values we want to test
param_grid
=
{
"
max_depth
"
:
list
(
range
(
10
,
81
,
10
))
+
[
None
],
"
max_features
"
:
[
"
sqrt
"
,
"
log2
"
],
"
min_samples_leaf
"
:
[
1
,
2
,
5
,
10
,
20
],
"
min_samples_split
"
:
[
2
,
5
,
10
,
20
],
"
criterion
"
:
[
"
gini
"
,
"
entropy
"
]
}
# Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
# index -1 for training data
# index 0 for dev data
# Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
split_index
=
[
-
1
]
*
len
(
X_train
)
+
[
0
]
*
len
(
X_dev
)
X_train_dev
=
np
.
concatenate
((
X_train
,
X_dev
))
y_train_dev
=
np
.
concatenate
((
y_train
,
y_dev
))
pds
=
PredefinedSplit
(
test_fold
=
split_index
)
# Use GridSearch to test all values
# Adjust n_jobs to the number of cores you have available
dtree_gscv
=
GridSearchCV
(
estimator
=
DecisionTreeClassifier
(
random_state
=
42
),
param_grid
=
param_grid
,
cv
=
pds
,
verbose
=
3
,
n_jobs
=
8
,
return_train_score
=
True
)
dtree_gscv
.
fit
(
X_train_dev
,
y_train_dev
)
# Print best model parameters after tuning
best_params
=
dtree_gscv
.
best_params_
print
(
f
"
Best parameters for decision tree classifier:
{
best_params
}
"
)
# Plot the GridSearch results
plot_grid_search_results
(
dtree_gscv
,
feature_description
,
"
decision_tree
"
)
return
best_params
def
find_best_params_for_random_forest
(
X_train
,
y_train
,
X_dev
,
y_dev
,
feature_description
):
"""
Finds the optimal parameters for the random forest classifier using a predefined grid and plots the results.
Args:
X_train: Training data.
y_train: Training labels.
X_dev: Development data.
y_dev: Development labels.
feature_description: A string describing the features used.
Returns:
best_params: A dictionary containing the optimal parameters for the random forest classifier.
"""
# Create a dictionary of all values we want to test
"""
# Smaller grid for testing purposes
param_grid = {
"
max_depth
"
: list(range(5,9,1)),
"
n_estimators
"
: [1,2,3,4],
"
min_samples_leaf
"
: [1,2],
"
min_samples_split
"
: [2,5],
}
"""
param_grid
=
{
"
max_depth
"
:
list
(
range
(
10
,
81
,
10
))
+
[
None
],
"
n_estimators
"
:
[
10
,
50
,
100
],
"
max_features
"
:
[
"
sqrt
"
,
"
log2
"
],
"
min_samples_leaf
"
:
[
1
,
2
,
5
,
10
,
20
],
"
min_samples_split
"
:
[
2
,
5
,
10
,
20
]
}
# Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
# index -1 for training data
# index 0 for dev data
# Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
split_index
=
[
-
1
]
*
len
(
X_train
)
+
[
0
]
*
len
(
X_dev
)
X_train_dev
=
np
.
concatenate
((
X_train
,
X_dev
))
y_train_dev
=
np
.
concatenate
((
y_train
,
y_dev
))
pds
=
PredefinedSplit
(
test_fold
=
split_index
)
# Use GridSearch to test all values
# Adjust n_jobs to the number of cores you have available
rforest_gscv
=
GridSearchCV
(
estimator
=
RandomForestClassifier
(
random_state
=
42
),
param_grid
=
param_grid
,
cv
=
pds
,
verbose
=
3
,
n_jobs
=
8
,
return_train_score
=
True
,
scoring
=
'
accuracy
'
)
rforest_gscv
.
fit
(
X_train_dev
,
y_train_dev
)
# Print best model parameters after tuning
best_params
=
rforest_gscv
.
best_params_
print
(
f
"
Best parameters for random forest classifier:
{
best_params
}
"
)
# Plot the GridSearch results
plot_heatmap_for_max_depth_and_n_estimators
(
rforest_gscv
,
feature_description
)
plot_grid_search_results
(
rforest_gscv
,
feature_description
,
"
random_forest
"
)
return
best_params
def
find_best_params_for_naive_bayes
(
X_train
,
y_train
,
X_dev
,
y_dev
,
feature_description
):
"""
Finds the optimal parameters for the naive bayes classifier using a predefined grid and plots the results.
Args:
X_train: Training data.
y_train: Training labels.
X_dev: Development data.
y_dev: Development labels.
feature_description: A string describing the features used.
Returns:
best_params: A dictionary containing the optimal parameters for the naive bayes classifier.
"""
# Create a dictionary of all values we want to test; for GaussianNB, var_smoothing is the only hyperparameter
param_grid
=
{
"
var_smoothing
"
:
np
.
logspace
(
0
,
-
20
,
num
=
20
)
}
# Ensure classifier is trained on training set and evaluated on dev set using PredefinedSplit():
# index -1 for training data
# index 0 for dev data
# Otherwise, GridSearchCV() would by default use a 5-fold cross validation on the training set
split_index
=
[
-
1
]
*
len
(
X_train
)
+
[
0
]
*
len
(
X_dev
)
X_train_dev
=
np
.
concatenate
((
X_train
,
X_dev
))
y_train_dev
=
np
.
concatenate
((
y_train
,
y_dev
))
pds
=
PredefinedSplit
(
test_fold
=
split_index
)
# Use GridSearch to test all values
# Adjust n_jobs to the number of cores you have available
bayes_gscv
=
GridSearchCV
(
estimator
=
GaussianNB
(),
param_grid
=
param_grid
,
cv
=
pds
,
verbose
=
3
,
n_jobs
=
6
,
return_train_score
=
True
)
bayes_gscv
.
fit
(
X_train_dev
,
y_train_dev
)
best_params
=
bayes_gscv
.
best_params_
print
(
f
"
Best parameters for naive bayes classifier:
{
best_params
}
"
)
# Plot the GridSearch results
plot_grid_search_results
(
bayes_gscv
,
feature_description
,
"
naive_bayes
"
)
return
best_params
def
plot_grid_search_results
(
grid
,
feature_description
,
classifier_name
):
"""
Plots the results of the grid search for a classifier and saves the figure.
Args:
grid: A trained GridSearchCV object.
feature_description: A string describing the features used.
classifier_name: A string describing the classifier used.
"""
# Results from grid search
results
=
grid
.
cv_results_
means_test
=
results
[
'
mean_test_score
'
]
stds_test
=
results
[
'
std_test_score
'
]
means_train
=
results
[
'
mean_train_score
'
]
stds_train
=
results
[
'
std_train_score
'
]
# Our masks are the names of the hyperparameters
masks
=
[]
masks_names
=
list
(
grid
.
best_params_
.
keys
())
# Move criterion (gini or entropy) to the end because we want the plot for the decision tree to be similar to the one for random forest
if
'
criterion
'
in
masks_names
:
masks_names
.
remove
(
'
criterion
'
)
masks_names
.
append
(
'
criterion
'
)
best_params_ordered
=
{
k
:
v
for
k
,
v
in
grid
.
best_params_
.
items
()
if
k
!=
'
criterion
'
}
if
'
criterion
'
in
grid
.
best_params_
:
best_params_ordered
[
'
criterion
'
]
=
grid
.
best_params_
[
'
criterion
'
]
for
p_k
,
p_v
in
best_params_ordered
.
items
():
masks
.
append
(
list
(
results
[
'
param_
'
+
p_k
].
data
==
p_v
))
params
=
grid
.
param_grid
# Replace 'None' with None because otherwise it will be plotted as a string
if
'
max_depth
'
in
params
:
if
None
in
params
[
'
max_depth
'
]:
index_none
=
params
[
'
max_depth
'
].
index
(
None
)
params
[
'
max_depth
'
][
index_none
]
=
"
None
"
# Create a subplot for each hyperparameter
fig
,
ax
=
plt
.
subplots
(
1
,
len
(
params
),
sharex
=
'
none
'
,
sharey
=
'
all
'
,
figsize
=
(
30
,
10
))
fig
.
suptitle
(
'
Score per parameter
'
)
fig
.
text
(
0.04
,
0.5
,
'
MEAN ACCURACY
'
,
va
=
'
center
'
,
rotation
=
'
vertical
'
)
print
(
masks_names
)
if
len
(
masks_names
)
==
1
:
ax
=
[
ax
]
# If there's more than one hyperparameter, we combine masks for all other hyperparameters to isolate the results for the current hyperparameter
for
i
,
p
in
enumerate
(
masks_names
):
if
len
(
masks_names
)
>
1
:
m
=
np
.
stack
(
masks
[:
i
]
+
masks
[
i
+
1
:])
best_parms_mask
=
m
.
all
(
axis
=
0
)
best_index
=
np
.
where
(
best_parms_mask
)[
0
]
x
=
np
.
array
(
params
[
p
])
y_1
=
np
.
array
(
means_test
[
best_index
])
e_1
=
np
.
array
(
stds_test
[
best_index
])
y_2
=
np
.
array
(
means_train
[
best_index
])
e_2
=
np
.
array
(
stds_train
[
best_index
])
ax
[
i
].
errorbar
(
x
,
y_1
,
e_1
,
linestyle
=
'
--
'
,
marker
=
'
o
'
,
label
=
'
dev
'
)
ax
[
i
].
errorbar
(
x
,
y_2
,
e_2
,
linestyle
=
'
-
'
,
marker
=
'
^
'
,
label
=
'
train
'
)
ax
[
i
].
set_xlabel
(
p
.
upper
())
else
:
x
=
np
.
array
(
params
[
p
])
y_1
=
np
.
array
(
means_test
)
e_1
=
np
.
array
(
stds_test
)
y_2
=
np
.
array
(
means_train
)
e_2
=
np
.
array
(
stds_train
)
ax
[
i
].
errorbar
(
x
,
y_1
,
e_1
,
linestyle
=
'
--
'
,
marker
=
'
o
'
,
label
=
'
dev
'
)
ax
[
i
].
errorbar
(
x
,
y_2
,
e_2
,
linestyle
=
'
-
'
,
marker
=
'
^
'
,
label
=
'
train
'
)
ax
[
i
].
set_xlabel
(
p
.
upper
())
ax
[
i
].
set_xscale
(
'
log
'
)
# for var_smoothing in naive bayes
plt
.
legend
()
plt
.
savefig
(
f
"
../figures/
{
classifier_name
}
/grid_search_results_
{
feature_description
}
.png
"
)
def
plot_heatmap_for_max_depth_and_n_estimators
(
grid
,
feature_description
:
str
):
"""
Plots a heatmap for the GridSearch results of the random forest classifier and saves the figure.
Args:
grid: A trained GridSearchCV object.
feature_description: A string describing the features used.
"""
# Mean test scores for all combinations of max_depth and n_estimators
mean_scores
=
grid
.
cv_results_
[
"
mean_test_score
"
][:
len
(
grid
.
param_grid
[
"
max_depth
"
])
*
len
(
grid
.
param_grid
[
"
n_estimators
"
])]
mean_scores
=
np
.
array
(
mean_scores
).
reshape
(
len
(
grid
.
param_grid
[
"
max_depth
"
]),
len
(
grid
.
param_grid
[
"
n_estimators
"
]))
# Plot heatmap
plt
.
figure
(
figsize
=
(
20
,
10
))
plt
.
imshow
(
mean_scores
.
transpose
(),
interpolation
=
"
nearest
"
,
cmap
=
"
viridis
"
)
plt
.
title
(
"
Grid search results for random forest classifier
"
,
fontsize
=
18
)
plt
.
xlabel
(
"
max_depth
"
,
fontsize
=
14
)
plt
.
ylabel
(
"
n_estimators
"
,
fontsize
=
14
)
# Label None as "None"
if
None
in
grid
.
param_grid
[
"
max_depth
"
]:
index_none
=
grid
.
param_grid
[
"
max_depth
"
].
index
(
None
)
grid
.
param_grid
[
"
max_depth
"
][
index_none
]
=
"
None
"
plt
.
xticks
(
np
.
arange
(
len
(
grid
.
param_grid
[
"
max_depth
"
])),
grid
.
param_grid
[
"
max_depth
"
])
plt
.
yticks
(
np
.
arange
(
len
(
grid
.
param_grid
[
"
n_estimators
"
])),
grid
.
param_grid
[
"
n_estimators
"
])
# Add scores as text annotations within the heatmap
for
i
in
range
(
len
(
grid
.
param_grid
[
"
max_depth
"
])):
for
j
in
range
(
len
(
grid
.
param_grid
[
"
n_estimators
"
])):
plt
.
text
(
i
,
j
,
f
"
{
mean_scores
[
i
,
j
]
:
.
2
f
}
"
,
ha
=
"
center
"
,
va
=
"
center
"
,
color
=
"
white
"
)
plt
.
colorbar
(
label
=
"
Mean accuracy on dev set
"
)
# Save figure
plt
.
savefig
(
f
"
../figures/random_forest/heatmap_max_depth_n_estimators_
{
feature_description
}
.png
"
)
print
(
f
"
Saved figure as ../figures/random_forest/heatmap_max_depth_n_estimators_
{
feature_description
}
.png
"
)
\ No newline at end of file
This diff is collapsed.
Click to expand it.

project/src/results.txt → project/src/results_basic_classifiers.txt (+0, −0)

File moved