Tuning hyper-parameters

  • parameters: the values learnt by the estimator during training (e.g. the split thresholds of a decision tree)
  • hyper-parameters: the values passed to the estimator from the outside, chosen before training (see the sketch below)
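
To make the distinction concrete, here is a minimal sketch (using scikit-learn's built-in iris data for brevity; the sections below load it from a CSV instead):

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)

# max_depth is a hyper-parameter: chosen from the outside, before training
tree = DecisionTreeClassifier(max_depth=3)
tree.fit(X, y)

# the split thresholds are parameters: learnt from the data during fit
print(tree.tree_.threshold)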

Preparation

In [8]:
import pandas as pd
iris = pd.read_csv("iris.csv")
iris_data = iris.drop('Name', axis=1)
iris_target = iris['Name']

GridSearchCV

GridSearchCV(estimator, parameters, scoring, cv)

  • behaves just like an estimator
    • calling its fit method tries out every combination in the grid
  • parameters
    • an estimator (e.g. a decision tree or a k-NN classifier)
    • the hyper-parameters to try, as a dictionary
      • key is the parameter name
      • value is a list of possible values
      • example: {'param_a': [1, 2, 3], 'param_b': [7, 8, 9]}
    • scoring: the metric to optimize ('accuracy' or another score)
    • cross validation: which cross-validation strategy to use (defaults to 3-fold in the scikit-learn version used here; newer versions default to 5-fold)
In [36]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import GridSearchCV

knn_estimator = KNeighborsClassifier()
parameters = {
    'n_neighbors':[2,3,4,5,6,7,8], 
    'algorithm':['ball_tree', 'kd_tree', 'brute']
}
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

grid_search_estimator = GridSearchCV(knn_estimator, parameters, scoring='accuracy', cv=stratified_10_fold_cv)
grid_search_estimator.fit(iris_data, iris_target)  # this will try out all possibilities
Out[36]:
GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [2, 3, 4, 5, 6, 7, 8], 'algorithm': ['ball_tree', 'kd_tree', 'brute']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)
In [37]:
# one can use the best estimator for further predictions
# with refit=True (the default) it is retrained on the whole dataset with the best hyper-parameters
# grid_search_estimator.best_estimator_.predict(...)
print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_ ))

results = grid_search_estimator.cv_results_

#import pprint
#pprint.pprint(results)

for i in range(len(results['params'])):
    print("{}, {}".format(results['params'][i], results['mean_test_score'][i]))
best score is 0.9733333333333334 with params {'algorithm': 'ball_tree', 'n_neighbors': 5}
{'algorithm': 'ball_tree', 'n_neighbors': 2}, 0.9466666666666667
{'algorithm': 'ball_tree', 'n_neighbors': 3}, 0.9666666666666667
{'algorithm': 'ball_tree', 'n_neighbors': 4}, 0.9666666666666667
{'algorithm': 'ball_tree', 'n_neighbors': 5}, 0.9733333333333334
{'algorithm': 'ball_tree', 'n_neighbors': 6}, 0.96
{'algorithm': 'ball_tree', 'n_neighbors': 7}, 0.96
{'algorithm': 'ball_tree', 'n_neighbors': 8}, 0.9666666666666667
{'algorithm': 'kd_tree', 'n_neighbors': 2}, 0.9466666666666667
{'algorithm': 'kd_tree', 'n_neighbors': 3}, 0.9666666666666667
{'algorithm': 'kd_tree', 'n_neighbors': 4}, 0.9666666666666667
{'algorithm': 'kd_tree', 'n_neighbors': 5}, 0.9733333333333334
{'algorithm': 'kd_tree', 'n_neighbors': 6}, 0.96
{'algorithm': 'kd_tree', 'n_neighbors': 7}, 0.96
{'algorithm': 'kd_tree', 'n_neighbors': 8}, 0.9666666666666667
{'algorithm': 'brute', 'n_neighbors': 2}, 0.9466666666666667
{'algorithm': 'brute', 'n_neighbors': 3}, 0.9666666666666667
{'algorithm': 'brute', 'n_neighbors': 4}, 0.96
{'algorithm': 'brute', 'n_neighbors': 5}, 0.9733333333333334
{'algorithm': 'brute', 'n_neighbors': 6}, 0.96
{'algorithm': 'brute', 'n_neighbors': 7}, 0.96
{'algorithm': 'brute', 'n_neighbors': 8}, 0.9666666666666667
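
Since refit=True by default, best_estimator_ is an ordinary fitted classifier and can be used directly; a minimal sketch (predicting on the first five rows, purely for illustration):

best_knn = grid_search_estimator.best_estimator_
print(best_knn.predict(iris_data[:5]))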

Possible starting point for task 5.2

In [42]:
import pandas as pd
import numpy as np
from scipy.io import arff

adult_arff_data, adult_arff_meta = arff.loadarff('adult.arff')
adult = pd.DataFrame(adult_arff_data)

# loadarff returns byte strings; decode them and strip the surrounding quotes
adult = adult.applymap(lambda x: x.decode('utf8').replace("'", "") if hasattr(x, 'decode') else x)

adult_target = np.array(adult['class'])
print(adult_target[:10])
adult_data = pd.get_dummies(adult.drop('class', axis=1))
adult_data.head()
['<=50K' '<=50K' '>50K' '>50K' '<=50K' '<=50K' '<=50K' '>50K' '<=50K'
 '<=50K']
Out[42]:
age fnlwgt education-num capital-gain capital-loss hours-per-week workclass_? workclass_Federal-gov workclass_Local-gov workclass_Never-worked ... native-country_Portugal native-country_Puerto-Rico native-country_Scotland native-country_South native-country_Taiwan native-country_Thailand native-country_Trinadad&Tobago native-country_United-States native-country_Vietnam native-country_Yugoslavia
0 25.0 226802.0 7.0 0.0 0.0 40.0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
1 38.0 89814.0 9.0 0.0 0.0 50.0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
2 28.0 336951.0 12.0 0.0 0.0 40.0 0 0 1 0 ... 0 0 0 0 0 0 0 1 0 0
3 44.0 160323.0 10.0 7688.0 0.0 40.0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
4 18.0 103497.0 10.0 0.0 0.0 30.0 1 0 0 0 ... 0 0 0 0 0 0 0 1 0 0

5 rows × 108 columns
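
scikit-learn estimators expect numeric features, which is why get_dummies one-hot encodes the categorical columns above. A quick sanity check (a minimal sketch, assuming the cell above has been run):

# all 108 columns should now be numeric
print(adult_data.shape)
print(adult_data.dtypes.value_counts())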

In [ ]:
from sklearn.model_selection import train_test_split
data_train, data_test, target_train, target_test = train_test_split(
    adult_data, adult_target, test_size=0.2, random_state=42)
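
One possible continuation (the parameter grid below is an illustrative assumption, not prescribed by the task): apply the same GridSearchCV pattern to the training split, then evaluate the refitted best model on the held-out test split.

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# illustrative grid; the values are assumptions, adjust as needed
tree_parameters = {'max_depth': [5, 10, 20], 'min_samples_leaf': [1, 5, 10]}

tree_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42),
                                tree_parameters, scoring='accuracy', cv=5)
tree_grid_search.fit(data_train, target_train)

print(tree_grid_search.best_params_, tree_grid_search.best_score_)
# generalisation estimate on the untouched test split
print(tree_grid_search.score(data_test, target_test))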