Functions are defined with the keyword "def", followed by the function's name, parentheses, and a colon.
For example:
def my_function():
    print("Hello From My Function!")
Calling this function is done by:
my_function()
def my_function_with_args(username, greeting):
    print("Hello, {}, I wish you {}".format(username, greeting))
my_function_with_args("Mike", "good day")
Use the return statement wherever you want the function to return a value:
def sum_two_numbers(a, b):
    return a + b
sum_two_numbers(4,5)
Remember that tuples exist: a function can return several values at once as a tuple.
def sum_and_diff(a, b):
    return (a+b, a-b) # the parentheses here are not necessary; one can also write return a+b, a-b
my_tuple = sum_and_diff(4,6)
print(my_tuple) # prints (10, -2)
print(my_tuple[0]) # prints 10
print(my_tuple[1]) # prints -2
(my_sum, my_diff) = sum_and_diff(4,6) # here, too, the parentheses are not essential
print(my_sum)
print(my_diff)
Dictionaries map keys to values. An empty dictionary can be created and then filled entry by entry:
population = {} # or population = dict()
population["Mannheim"] = 305780
population["Ludwigshafen"] = 164718
population["Heidelberg"] = 156267
print(population)
print(population["Heidelberg"])
A dictionary can also be written down directly as a literal:
population = {
    "Mannheim" : 305780,
    "Ludwigshafen" : 164718,
    "Heidelberg" : 156267
}
print(population)
for name, count in population.items():
    print("The population of {} is {}".format(name, count))
del population["Heidelberg"] # remove an entry from the dictionary
print(population)
import pandas as pd
items = [0,4,12,16,16,18,24,26,28]
pd.cut(items, bins=3) # equal-width binning: three bins that span equal value ranges
pd.qcut(items, q=3) # equal-frequency binning: three bins with roughly equal numbers of items
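The difference becomes visible when counting how many items end up in each bin. A quick check, reusing the items list from above (the Series wrapper is only there to get value_counts):
print(pd.Series(pd.cut(items, bins=3)).value_counts()) # counts per bin may differ
print(pd.Series(pd.qcut(items, q=3)).value_counts()) # counts per bin are roughly equal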
iris = pd.read_csv("iris.csv")
pd.cut(iris['SepalLength'], bins=3, labels=['low', 'middle', 'high'])
iris_binned = pd.DataFrame(dict(
    SepalLength = pd.cut(iris['SepalLength'], bins=3, labels=['low', 'middle', 'high']),
    SepalWidth = pd.cut(iris['SepalWidth'], bins=3, labels=['low', 'middle', 'high'])
))
iris_binned.head()
iris_binned_and_encoded = pd.get_dummies(iris_binned) # one-hot encoding: one binary column per category
iris_binned_and_encoded.head()
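Listing the column names makes the encoding visible: each binned feature now contributes one 0/1 column per label:
print(iris_binned_and_encoded.columns.tolist()) # e.g. SepalLength_low, SepalLength_middle, ...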
from sklearn import tree
decision_tree = tree.DecisionTreeClassifier()
decision_tree
decision_tree = tree.DecisionTreeClassifier(max_depth=2) # max_depth=2 keeps the tree small enough to inspect visually
decision_tree.fit(iris_binned_and_encoded, iris['Name'])
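The fitted tree can now classify encoded rows. As a quick sanity check (not a real evaluation, since these rows were used for training), one can predict on the first rows:
print(decision_tree.predict(iris_binned_and_encoded.head()))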
# run the following two commands in console:
# conda install -c conda-forge graphviz
# pip install graphviz
import graphviz
from sklearn.utils.multiclass import unique_labels
dot_data = tree.export_graphviz(decision_tree,
    feature_names=iris_binned_and_encoded.columns.values,
    class_names=unique_labels(iris['Name']),
    filled=True, rounded=True, special_characters=True, out_file=None)
graphviz.Source(dot_data)
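To save the rendered tree to a file instead of displaying it inline, one can call render on the Source object (this writes iris_tree.pdf by default and assumes the Graphviz binaries are installed):
graphviz.Source(dot_data).render("iris_tree")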
decision_tree.tree_.node_count
from sklearn.model_selection import train_test_split
data_train, data_test, target_train, target_test = train_test_split(
    iris_binned_and_encoded, iris['Name'], test_size=0.2, random_state=42, stratify=iris['Name'])
print(data_train.head())
print(target_train.head())
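A simple holdout evaluation then fits on the training part and scores on the held-out test part (for classifiers, score returns the accuracy):
decision_tree.fit(data_train, target_train)
print(decision_tree.score(data_test, target_test))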
from sklearn.model_selection import cross_val_score
accuracy_iris = cross_val_score(decision_tree, iris_binned_and_encoded, iris['Name'], cv=10, scoring='accuracy')
accuracy_iris
accuracy_iris.mean()
from sklearn.model_selection import StratifiedKFold
cross_val = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
acc_each_split = cross_val_score(decision_tree, iris_binned_and_encoded, iris['Name'], cv=cross_val, scoring='accuracy')
acc_each_split.mean()
from sklearn.model_selection import cross_val_predict
predicted = cross_val_predict(decision_tree, iris_binned_and_encoded, iris['Name'], cv=10)
print(predicted)
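These predictions can be compared against the true labels, for example with a confusion matrix from sklearn.metrics:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(iris['Name'], predicted))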
# sometimes you need the raw NumPy array instead of the pandas DataFrame (access it via .values)
data = iris_binned_and_encoded.values
target = iris['Name']
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for train_indices, test_indices in cv.split(data, target):
    train_data = data[train_indices]
    train_target = target[train_indices]
    decision_tree.fit(train_data, train_target)
    test_data = data[test_indices]
    test_target = target[test_indices]
    test_prediction = decision_tree.predict(test_data)
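With such a manual loop any metric can be computed directly. A minimal sketch that prints the accuracy of each fold, reusing data, target and cv from above:
from sklearn.metrics import accuracy_score
for train_indices, test_indices in cv.split(data, target):
    decision_tree.fit(data[train_indices], target[train_indices])
    fold_prediction = decision_tree.predict(data[test_indices])
    print(accuracy_score(target[test_indices], fold_prediction))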
import numpy as np
for train_indices, test_indices in cv.split(data, target):
    train_indices = np.append(train_indices, (target == 'Iris-setosa').values.nonzero()[0]) # always add all setosa rows to the training set
    print(train_indices[:10])
from scipy.io import arff
credit_arff_data, credit_arff_meta = arff.loadarff(open('credit-g.arff', 'r'))
credit = pd.DataFrame(credit_arff_data)
credit.head()
credit_target = np.array([x.decode('ascii') for x in credit['class'].values]) # list comprehension ("inline" for loop in brackets)
print(credit_target[:10])
credit_data = pd.get_dummies(credit.drop('class', axis=1)) # drop yields all columns except the given one (here, the target column)
credit_data.head()
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
knn_estimator = KNeighborsClassifier(3)
data_train, data_test, target_train, target_test = train_test_split(credit_data, credit_target)
knn_estimator.fit(data_train, target_train)
proba_for_each_class = knn_estimator.predict_proba(data_test) # ROC needs scores: use predict_proba or decision_function
fpr, tpr, thresholds = roc_curve(target_test, proba_for_each_class[:,1], pos_label='good') # column 1 holds the probabilities of the second class ('good')
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Luck', alpha=.8) # draw diagonal
plt.plot(fpr,tpr,label='K-NN')
plt.legend()
plt.show()
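The area under the ROC curve (AUC) summarizes the curve in a single number; it can be computed from the fpr and tpr values obtained above:
from sklearn.metrics import auc
print(auc(fpr, tpr))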
from sklearn.metrics import roc_curve, auc
def avg_roc(cv, estimator, data, target, pos_label):
    mean_fpr = np.linspace(0, 1, 100) # 100 evenly spaced points between 0.0 and 1.0
    tprs = []
    aucs = []
    for train_indices, test_indices in cv.split(data, target):
        train_data, train_target = data[train_indices], target[train_indices]
        estimator.fit(train_data, train_target)
        test_data, test_target = data[test_indices], target[test_indices]
        decision_for_each_class = estimator.predict_proba(test_data) # have to use predict_proba or decision_function
        fpr, tpr, thresholds = roc_curve(test_target, decision_for_each_class[:,1], pos_label=pos_label)
        tprs.append(np.interp(mean_fpr, fpr, tpr)) # interpolate the tpr values onto the common fpr grid
        tprs[-1][0] = 0.0 # tprs[-1] accesses the last element
        aucs.append(auc(fpr, tpr))
        #plt.plot(fpr, tpr) # plot for each fold
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0 # set the last tpr to 1
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    return mean_fpr, mean_tpr, mean_auc, std_auc
knn_estimator = KNeighborsClassifier(3)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
mean_fpr, mean_tpr, mean_auc, std_auc = avg_roc(cv, knn_estimator, credit_data.values, credit_target, 'good')
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Luck', alpha=.8) # draw diagonal
plt.plot(mean_fpr,mean_tpr,label='K-NN')
plt.legend()
plt.show()
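The averaged AUC and its standard deviation across the folds can also be printed directly:
print("Mean AUC: {:.3f} (std: {:.3f})".format(mean_auc, std_auc))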
import random
from sklearn.utils.multiclass import unique_labels
def add_noise(raw_target, percentage):
    labels = unique_labels(raw_target)
    target_with_noise = []
    for one_target_label in raw_target:
        if random.randint(1,100) <= percentage: # flip roughly `percentage` percent of the labels
            target_with_noise.append(next(l for l in labels if l != one_target_label)) # pick the first label that differs
        else:
            target_with_noise.append(one_target_label)
    return target_with_noise
credit_target_with_noise = add_noise(credit_target, 10)
for i in range(20):
    print("{:10} - {:10} - {}".format(credit_target[i], credit_target_with_noise[i], credit_target[i]==credit_target_with_noise[i]))