outlook | temperature | humidity | wind | play |
---|---|---|---|---|
sunny | hot | high | false | no |
sunny | hot | high | true | no |
overcast | hot | high | false | yes |
rainy | mild | high | false | yes |
rainy | cold | normal | false | yes |
rainy | cold | normal | true | no |
overcast | cold | normal | true | yes |
sunny | mild | high | false | ? |
[[ 0. 0. 5. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 10. 0. 0.]
[ 0. 0. 0. ..., 16. 9. 0.]
...,
[ 0. 0. 1. ..., 6. 0. 0.]
[ 0. 0. 2. ..., 12. 0. 0.]
[ 0. 0. 10. ..., 12. 1. 0.]]
[
True,
False,
True,
True,
False
...]
# Load the customer spreadsheet and cluster it on the two purchase columns.
import pandas as pd
customer_data = pd.read_excel('CustomerDataSet.xls')
customer_data.head()  # bare expression: only displayed by an interactive session

from sklearn.cluster import KMeans
estimator = KMeans(n_clusters=2)
purchase_columns = ['ItemsBought', 'ItemsReturned']

# Variant 1: fit the estimator and get the label of every row in one call.
labels = estimator.fit_predict(customer_data[purchase_columns])
print(labels)

# Variant 2 (OR): fit first, then read the labels from the fitted estimator.
estimator.fit(customer_data[purchase_columns])
print(estimator.labels_)
To colour the plotted points by cluster, pass the cluster labels to the `c` parameter of the plotting function (`plt.scatter`):
import matplotlib.pyplot as plt

# Scatter the two purchase columns, colouring every point by the label the
# fitted estimator assigned to its row, then decorate and display the figure.
plt.scatter(
    x=customer_data['ItemsBought'],
    y=customer_data['ItemsReturned'],
    c=estimator.labels_,
)
plt.xlabel('ItemsBought')
plt.ylabel('ItemsReturned')
plt.title("KMeans #cluster = 2")
plt.show()
from sklearn.metrics import silhouette_score

# Score the clustering held in `labels` against the two purchase columns.
purchase_features = customer_data[['ItemsBought', 'ItemsReturned']]
silhouette = silhouette_score(purchase_features, labels)
silhouette  # bare expression: only displayed by an interactive session
# Show the two categorical columns as they are before encoding.
categorical_columns = ['Product', 'ZipCode']
print(customer_data[categorical_columns])

from sklearn import preprocessing

# A single LabelEncoder is created and its fit_transform is applied to each
# of the selected columns in turn; the encoded columns are printed afterwards.
column_encoder = preprocessing.LabelEncoder()
customer_data_encoded = customer_data[categorical_columns].apply(column_encoder.fit_transform)
print(customer_data_encoded[categorical_columns])
from sklearn import preprocessing

# In-place alternative (would overwrite the original columns):
#customer_data[['ItemsBought', 'ItemsReturned']] = preprocessing.MinMaxScaler().fit_transform(customer_data[['ItemsBought', 'ItemsReturned']])

# Variant used here: leave customer_data untouched and build a new dataframe
# from the scaled values.
numeric_columns = ['ItemsBought', 'ItemsReturned']
min_max_scaler = preprocessing.MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(customer_data[numeric_columns])
customer_data_normalised = pd.DataFrame(scaled_values, columns=numeric_columns)

# Print the raw and the rescaled columns one after the other for comparison.
print(customer_data[numeric_columns])
print(customer_data_normalised)
# Work on a copy so customer_data itself keeps its original columns, then
# store the fitted estimator's labels in a 'cluster' column.
customer_data_with_cluster = customer_data.copy()
customer_data_with_cluster['cluster'] = estimator.labels_
customer_data_with_cluster  # bare expression: only displayed by an interactive session
# NOTE(review): this looks like a generic usage template rather than a runnable
# step -- `data` is not defined anywhere in view, and `dendrogram` / `Z` are
# only imported / assigned on the lines that follow. Confirm the intent before
# executing this line as part of a script.
dendrogram(Z,labels=data['column'].values)
from scipy.cluster.hierarchy import dendrogram, linkage

# Build the linkage matrix from the two purchase columns with the 'ward' method.
purchase_points = customer_data[['ItemsBought', 'ItemsReturned']]
Z = linkage(purchase_points, method='ward')

# Plot the dendrogram, using each row's 'Customer ID' value as its label.
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Customer IDs')
plt.ylabel('distance')
customer_ids = customer_data['Customer ID'].values
dendrogram(Z, labels=customer_ids)
plt.show()
# Condensed view of the same linkage matrix: truncate_mode='lastp' with p=3
# is what produces the "3 clusters" picture named in the title.
plt.title('Dendrogram - 3 clusters')
plt.xlabel('Count of Customers')
plt.ylabel('distance')
dendrogram(Z, truncate_mode='lastp', p=3)
plt.show()
The orientation of the dendrogram is changed with the `orientation` parameter:
# Draw the dendrogram with orientation='right' and display it.
# NOTE(review): the result is stored as `ax`; scipy documents the return value
# of dendrogram as a dict of plot data rather than a Matplotlib axes --
# confirm before using it as one.
ax = dendrogram(Z, orientation='right')
plt.show()
from sklearn.cluster import AgglomerativeClustering

# Cluster the two purchase columns into three groups and colour the scatter
# plot by the group predicted for each row.
agg = AgglomerativeClustering(n_clusters=3)
purchase_xy = customer_data[['ItemsBought', 'ItemsReturned']]
agg_predictions = agg.fit_predict(purchase_xy)
plt.scatter(
    x=customer_data['ItemsBought'],
    y=customer_data['ItemsReturned'],
    c=agg_predictions,
)
plt.show()
from sklearn.cluster import DBSCAN

# Fit DBSCAN with its default parameters on the two purchase columns, then
# colour every point by the label stored on the fitted estimator.
db = DBSCAN()
db.fit(customer_data[['ItemsBought', 'ItemsReturned']])
plt.scatter(
    x=customer_data['ItemsBought'],
    y=customer_data['ItemsReturned'],
    c=db.labels_,
)
plt.show()
from scipy.io import arff

# Read the ARFF file inside a `with` block so the file handle is closed as
# soon as loading finishes; a bare open() passed straight to loadarff is
# never closed explicitly.
with open('zoo.arff', 'r') as zoo_file:
    zoo_arff_data, zoo_arff_meta = arff.loadarff(zoo_file)

# Wrap the loaded records in a dataframe and peek at the first rows.
zoo_data = pd.DataFrame(zoo_arff_data)
zoo_data.head()  # bare expression: only displayed by an interactive session
# Fit KMeans once per cluster count and show each result in its own figure.
# The loop body must be indented under the `for`; without the indentation the
# snippet does not parse.
# Equivalent loop header: for i in range(2,5):
for i in [2, 3, 4]:
    estimator = KMeans(n_clusters=i)
    estimator.fit(customer_data[['ItemsBought', 'ItemsReturned']])
    plt.title("#cluster = {}".format(i))
    plt.scatter(customer_data['ItemsBought'], customer_data['ItemsReturned'], c=estimator.labels_)
    # Show inside the loop so each cluster count is drawn as a separate plot
    # instead of all scatters piling up on one set of axes.
    plt.show()
# Compare several cluster counts side by side in one figure laid out as a
# 3x2 grid of subplots.
plt.figure(1, (10, 10))  # otherwise the image is really small

# Subplot positions are 1-based, so enumerate starts counting at one.
for counter, i in enumerate([2, 3, 4, 5, 6], start=1):
    plt.subplot(3, 2, counter)  # plt.subplot(rows, columns, current_index)
    #plt.tight_layout() # sometimes you need it, when the plots are a bit overlapping
    estimator = KMeans(n_clusters=i)
    estimator.fit(customer_data[['ItemsBought', 'ItemsReturned']])
    plt.title("#cluster = {}".format(i))
    plt.scatter(customer_data['ItemsBought'], customer_data['ItemsReturned'], c=estimator.labels_)

# Show once, after the loop, so all subplots appear together in the figure.
plt.show()