Chapter 15

Chapter 15 Cluster Analysis

Original Code Credit: Shmueli, Galit; Bruce, Peter C.; Gedeck, Peter; Patel, Nitin R. Data Mining for Business Analytics. Wiley.

Modifications have been made from the original textbook examples due to version changes in library dependencies and/or for clarity.

Download this notebook and data here.

Import Libraries

import os
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import pairwise
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.cluster import KMeans
import matplotlib.pylab as plt
import seaborn as sns
from pandas.plotting import parallel_coordinates

15.1 Introduction

Example: Public Utilities

# Read the utilities data and use the company name as the row label.
# NOTE(review): the cluster listings printed later in this notebook show some
# company names with a trailing space (e.g. 'Arizona '); consider stripping
# the index if exact-name lookups are needed.
utilities_path = os.path.join('data', 'Utilities.csv')
utilities_df = pd.read_csv(utilities_path).set_index('Company')
# Casting everything to float64 is optional, but it avoids a warning about
# integer input when the columns are scaled later on.
utilities_df = utilities_df.astype('float64')
# Pairwise Euclidean distances between all utilities on the raw measurements.
d = pairwise.pairwise_distances(utilities_df, metric='euclidean')
# Label rows and columns with the company names for display.
pd.DataFrame(d, index=utilities_df.index, columns=utilities_df.index)

Company	Arizona	Boston	Central	Commonwealth	NY	Florida	Hawaiian	Idaho	Kentucky	Madison	...	Northern	Oklahoma	Pacific	Puget	San Diego	Southern	Texas	Wisconsin	United	Virginia
Company
Arizona	0.000000	3989.408076	140.402855	2654.277632	5777.167672	2050.529440	1435.265019	4006.104187	671.276346	2622.699002	...	1899.279821	598.556633	2609.045363	6914.742065	3363.061626	1063.009074	4430.251585	1790.485648	2427.588875	1016.617691
Boston	3989.408076	0.000000	4125.044132	1335.466502	1788.068027	6039.689076	2554.287162	7994.155985	3318.276558	1367.090634	...	2091.160485	4586.302564	1380.749962	10903.146464	629.760748	5052.331669	8419.610541	2199.721665	1562.210811	5005.081262
Central	140.402855	4125.044132	0.000000	2789.759674	5912.552908	1915.155154	1571.295401	3872.257626	807.920792	2758.559663	...	2035.441520	461.341670	2744.502847	6780.430307	3498.113013	928.749249	4295.014690	1925.772564	2563.637362	883.535455
Commonwealth	2654.277632	1335.466502	2789.759674	0.000000	3123.153215	4704.363099	1219.560005	6659.534567	1983.314354	43.648894	...	756.831954	3250.984589	56.644626	9568.434429	710.292965	3717.202963	7084.372839	864.273153	232.476871	3670.018191
NY	5777.167672	1788.068027	5912.552908	3123.153215	0.000000	7827.429211	4342.093798	9782.158178	5106.094153	3155.095594	...	3879.167462	6373.743249	3168.177463	12691.155108	2414.698757	6840.150291	10207.392630	3987.335962	3350.073118	6793.035300
Florida	2050.529440	6039.689076	1915.155154	4704.363099	7827.429211	0.000000	3485.671562	1959.731080	2721.706296	4672.829286	...	3949.092316	1454.292604	4659.356262	4866.111649	5413.093004	988.044559	2380.124974	3840.227943	4478.028874	1035.981475
Hawaiian	1435.265019	2554.287162	1571.295401	1219.560005	4342.093798	3485.671562	0.000000	5440.461781	764.083188	1187.941143	...	466.559118	2032.614245	1174.075616	8349.366438	1928.441480	2498.149024	5865.447190	358.476293	992.453252	2451.185161
Idaho	4006.104187	7994.155985	3872.257626	6659.534567	9782.158178	1959.731080	5440.461781	0.000000	4676.638384	6627.291780	...	5903.395450	3412.263965	6614.499239	2909.014679	7368.815437	2943.535570	447.828673	5795.958815	6432.132202	2989.963982
Kentucky	671.276346	3318.276558	807.920792	1983.314354	5106.094153	2721.706296	764.083188	4676.638384	0.000000	1951.628580	...	1228.436327	1269.102099	1938.026557	7585.467294	2692.212361	1734.103297	5101.414140	1119.940014	1756.378966	1687.236030
Madison	2622.699002	1367.090634	2758.559663	43.648894	3155.095594	4672.829286	1187.941143	6627.291780	1951.628580	0.000000	...	724.096182	3219.825109	53.301401	9536.242192	744.253668	3685.510088	7052.723883	833.472995	199.228400	3638.097548
Nevada	8364.031051	12353.062698	8229.223281	11018.057812	14141.022579	6314.359092	9799.015552	4359.599605	9035.007488	10986.098011	...	10262.157285	7768.384793	10973.010950	1452.162005	11727.066293	7301.040864	3934.617521	10154.118793	10791.049271	7348.049019
New England	2923.136103	1066.579432	3058.707429	271.452731	2854.099482	4973.506840	1488.014909	6928.326174	2252.026717	304.277034	...	1026.482994	3519.977565	314.354030	9837.281834	442.132760	3986.102433	7353.379146	1134.145010	496.687413	3939.100355
Northern	1899.279821	2091.160485	2035.441520	756.831954	3879.167462	3949.092316	466.559118	5903.395450	1228.436327	724.096182	...	0.000000	2496.638890	713.665046	8812.303559	1466.991954	2961.834750	6328.917948	119.981262	531.476328	2914.204993
Oklahoma	598.556633	4586.302564	461.341670	3250.984589	6373.743249	1454.292604	2032.614245	3412.263965	1269.102099	3219.825109	...	2496.638890	0.000000	3205.748876	6319.933836	3959.240748	470.164792	3834.012257	2386.942751	3024.952355	428.065259
Pacific	2609.045363	1380.749962	2744.502847	56.644626	3168.177463	4659.356262	1174.075616	6614.499239	1938.026557	53.301401	...	713.665046	3205.748876	0.000000	9523.413499	754.612093	3672.035402	7039.262070	820.164297	186.388651	3625.118869
Puget	6914.742065	10903.146464	6780.430307	9568.434429	12691.155108	4866.111649	8349.366438	2909.014679	7585.467294	9536.242192	...	8812.303559	6319.933836	9523.413499	0.000000	10277.660378	5851.893307	2488.432223	8704.721278	9341.126615	5898.576962
San Diego	3363.061626	629.760748	3498.113013	710.292965	2414.698757	5413.093004	1928.441480	7368.815437	2692.212361	744.253668	...	1466.991954	3959.240748	754.612093	10277.660378	0.000000	4426.041889	7793.083947	1573.408379	938.522726	4379.211818
Southern	1063.009074	5052.331669	928.749249	3717.202963	6840.150291	988.044559	2498.149024	2943.535570	1734.103297	3685.510088	...	2961.834750	470.164792	3672.035402	5851.893307	4426.041889	0.000000	3367.318870	2853.298778	3490.422918	59.325286
Texas	4430.251585	8419.610541	4295.014690	7084.372839	10207.392630	2380.124974	5865.447190	447.828673	5101.414140	7052.723883	...	6328.917948	3834.012257	7039.262070	2488.432223	7793.083947	3367.318870	0.000000	6220.296729	6857.735864	3414.831455
Wisconsin	1790.485648	2199.721665	1925.772564	864.273153	3987.335962	3840.227943	358.476293	5795.958815	1119.940014	833.472995	...	119.981262	2386.942751	820.164297	8704.721278	1573.408379	2853.298778	6220.296729	0.000000	640.786770	2806.165712
United	2427.588875	1562.210811	2563.637362	232.476871	3350.073118	4478.028874	992.453252	6432.132202	1756.378966	199.228400	...	531.476328	3024.952355	186.388651	9341.126615	938.522726	3490.422918	6857.735864	640.786770	0.000000	3443.240967
Virginia	1016.617691	5005.081262	883.535455	3670.018191	6793.035300	1035.981475	2451.185161	2989.963982	1687.236030	3638.097548	...	2914.204993	428.065259	3625.118869	5898.576962	4379.211818	59.325286	3414.831455	2806.165712	3443.240967	0.000000

22 rows × 22 columns

# Standardize every column to z-scores; pandas' std() is the sample standard
# deviation.
utilities_df_norm = (utilities_df - utilities_df.mean())/utilities_df.std()
# Restrict the distance computation to the two variables of interest.
# (The original cell built this slice once as a bare expression and threw it
# away; it is now named and reused.)
sales_fuel_norm = utilities_df_norm[['Sales', 'Fuel_Cost']]
# Euclidean distance between utilities on normalized Sales and Fuel_Cost.
d_norm = pairwise.pairwise_distances(sales_fuel_norm, metric='euclidean')
# Label rows and columns with the company names for display.
pd.DataFrame(d_norm, columns=utilities_df.index, index=utilities_df.index)

Company	Arizona	Boston	Central	Commonwealth	NY	Florida	Hawaiian	Idaho	Kentucky	Madison	...	Northern	Oklahoma	Pacific	Puget	San Diego	Southern	Texas	Wisconsin	United	Virginia
Company
Arizona	0.000000	2.010329	0.774179	0.758738	3.021907	1.244422	1.885248	1.265638	0.461292	0.738650	...	0.564657	0.182648	1.570780	1.947668	2.509043	0.913621	1.247976	0.521491	2.761745	1.252350
Boston	2.010329	0.000000	1.465703	1.582821	1.013370	1.792397	0.740283	3.176654	1.557738	1.719632	...	1.940166	2.166078	0.478334	3.501390	0.679634	1.634425	2.890560	1.654255	1.100595	1.479261
Central	0.774179	1.465703	0.000000	1.015710	2.432528	0.631892	1.156092	1.732777	0.419254	1.102287	...	1.113433	0.855093	0.987772	2.065643	1.836762	0.276440	1.428159	0.838967	2.034824	0.510365
Commonwealth	0.758738	1.582821	1.015710	0.000000	2.571969	1.643857	1.746027	2.003230	0.629994	0.138758	...	0.377004	0.937389	1.258835	2.699060	2.202930	1.278514	1.998818	0.243408	2.547116	1.502093
NY	3.021907	1.013370	2.432528	2.571969	0.000000	2.635573	1.411695	4.162561	2.566439	2.705445	...	2.938637	3.174588	1.462019	4.397433	0.715629	2.558409	3.831132	2.661786	0.952507	2.328691
Florida	1.244422	1.792397	0.631892	1.643857	2.635573	0.000000	1.228805	1.764123	1.025663	1.722510	...	1.698624	1.243634	1.343185	1.767581	1.953423	0.366744	1.277920	1.452417	2.016493	0.313847
Hawaiian	1.885248	0.740283	1.156092	1.746027	1.411695	1.228805	0.000000	2.860189	1.436822	1.880361	...	2.027224	1.997036	0.560997	2.995848	0.726095	1.205034	2.463227	1.711256	0.879934	0.929414
Idaho	1.265638	3.176654	1.732777	2.003230	4.162561	1.764123	2.860189	0.000000	1.650417	1.950296	...	1.708409	1.083449	2.705579	0.992092	3.563727	1.658671	0.600089	1.778813	3.720421	1.980715
Kentucky	0.461292	1.557738	0.419254	0.629994	2.566439	1.025663	1.436822	1.650417	0.000000	0.697674	...	0.694524	0.608401	1.110854	2.180496	2.048098	0.658996	1.493274	0.426780	2.308613	0.929141
Madison	0.738650	1.719632	1.102287	0.138758	2.705445	1.722510	1.880361	1.950296	0.697674	0.000000	...	0.267198	0.908665	1.397240	2.686215	2.341644	1.355786	1.986625	0.274061	2.685340	1.599587
Nevada	2.369479	3.756513	2.375975	3.106084	4.597006	1.971518	3.185311	1.479526	2.550689	3.105627	...	2.923023	2.211990	3.293310	0.487508	3.899212	2.145585	1.133311	2.862756	3.887918	2.284803
New England	2.425975	0.684393	1.737322	2.153831	0.846291	1.831380	0.608107	3.458771	1.966323	2.292531	...	2.480456	2.554109	0.898094	3.598846	0.130663	1.809354	3.071178	2.172473	0.417866	1.536436
Northern	0.564657	1.940166	1.113433	0.377004	2.938637	1.698624	2.027224	1.708409	0.694524	0.267198	...	0.000000	0.711050	1.582591	2.487892	2.538720	1.336887	1.793287	0.316160	2.861293	1.623614
Oklahoma	0.182648	2.166078	0.855093	0.937389	3.174588	1.243634	1.997036	1.083449	0.608401	0.908665	...	0.711050	0.000000	1.716739	1.780656	2.642155	0.944295	1.083449	0.702684	2.876646	1.296548
Pacific	1.570780	0.478334	0.987772	1.258835	1.462019	1.343185	0.560997	2.705579	1.110854	1.397240	...	1.582591	1.716739	0.000000	3.027116	0.958905	1.160017	2.412278	1.276200	1.288563	1.035028
Puget	1.947668	3.501390	2.065643	2.699060	4.397433	1.767581	2.995848	0.992092	2.180496	2.686215	...	2.487892	1.780656	3.027116	0.000000	3.720970	1.867235	0.700313	2.456272	3.763066	2.069314
San Diego	2.509043	0.679634	1.836762	2.202930	0.715629	1.953423	0.726095	3.563727	2.048098	2.341644	...	2.538720	2.642155	0.958905	3.720970	0.000000	1.920035	3.185942	2.234632	0.440163	1.655498
Southern	0.913621	1.634425	0.276440	1.278514	2.558409	0.366744	1.205034	1.658671	0.658996	1.355786	...	1.336887	0.944295	1.160017	1.867235	1.920035	0.000000	1.272784	1.085774	2.062067	0.356298
Texas	1.247976	2.890560	1.428159	1.998818	3.831132	1.277920	2.463227	0.600089	1.493274	1.986625	...	1.793287	1.083449	2.412278	0.700313	3.185942	1.272784	0.000000	1.756136	3.288460	1.541576
Wisconsin	0.521491	1.654255	0.838967	0.243408	2.661786	1.452417	1.711256	1.778813	0.426780	0.274061	...	0.316160	0.702684	1.276200	2.456272	2.234632	1.085774	1.756136	0.000000	2.549040	1.343306
United	2.761745	1.100595	2.034824	2.547116	0.952507	2.016493	0.879934	3.720421	2.308613	2.685340	...	2.861293	2.876646	1.288563	3.763066	0.440163	2.062067	3.288460	2.549040	0.000000	1.749930
Virginia	1.252350	1.479261	0.510365	1.502093	2.328691	0.313847	0.929414	1.980715	0.929141	1.599587	...	1.623614	1.296548	1.035028	2.069314	1.655498	0.356298	1.541576	1.343306	1.749930	0.000000

22 rows × 22 columns

15.4 Hierarchical (Agglomerative) Clustering

We will focus on k-means clustering, but code examples for hierarchical clustering are provided for reference.

# Two stacked panels: single linkage on top, average linkage below.
fig, axes = plt.subplots(2, 1, figsize=(10,10))
ax1, ax2 = axes
# linkage() accepts method =
# 'single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward'
# Each entry: (target axes, linkage method, color threshold, panel title).
panels = [
    (ax1, 'single', 2.75, 'Hierarchical Clustering Dendrogram (Single Linkage)'),
    (ax2, 'average', 3.6, 'Hierarchical Clustering Dendrogram (Average Linkage)'),
]
for panel_ax, link_method, threshold, title in panels:
    # Z is rebound on every pass, so it ends up holding the average linkage.
    Z = linkage(utilities_df_norm, method=link_method)
    dendrogram(Z, labels=utilities_df_norm.index, color_threshold=threshold,
               ax=panel_ax)
    panel_ax.set_title(title)
plt.tight_layout()
plt.show()

png

# Build the single-linkage tree and request six flat clusters from it
# (criterion='maxclust').
single_link = linkage(utilities_df_norm, method='single')
memb = pd.Series(
    fcluster(single_link, t=6, criterion='maxclust'),
    index=utilities_df_norm.index,
)
# Print every cluster id followed by the utilities assigned to it.
for key, item in memb.groupby(memb):
    members = ', '.join(item.index)
    print(key, ': ', members)

1 :  Idaho, Puget
2 :  Arizona , Boston , Commonwealth, Florida , Hawaiian , Kentucky, Madison , New England, Northern, Oklahoma, Pacific , Southern, Texas, Wisconsin, United, Virginia
3 :  Central 
4 :  San Diego
5 :  Nevada
6 :  NY

# Build the average-linkage tree and request six flat clusters from it
# (criterion='maxclust').
average_link = linkage(utilities_df_norm, method='average')
memb = pd.Series(
    fcluster(average_link, t=6, criterion='maxclust'),
    index=utilities_df_norm.index,
)
# Print every cluster id followed by the utilities assigned to it.
for key, item in memb.groupby(memb):
    members = ', '.join(item.index)
    print(key, ': ', members)

1 :  Idaho, Nevada, Puget
2 :  Hawaiian , New England, Pacific , United
3 :  San Diego
4 :  Boston , Commonwealth, Madison , Northern, Wisconsin, Virginia
5 :  Arizona , Central , Florida , Kentucky, Oklahoma, Southern, Texas
6 :  NY

# Prefix every row label with its cluster id ("<cluster>: <utility>") so the
# heatmap rows show the membership. This overwrites the index in place.
labelled_rows = [
    f'{cluster}: {state}'
    for cluster, state in zip(memb, utilities_df_norm.index)
]
utilities_df_norm.index = labelled_rows
# Heatmap with rows clustered by average linkage; columns keep their order.
# The '_r' suffix reverses the color map so that large values are dark.
sns.clustermap(
    utilities_df_norm,
    method='average',
    col_cluster=False,
    cmap='mako_r',
)
plt.show()

png

15.5 Non-Hierarchical Clustering: The k-Means Algorithm

# Standardize the variables column by column (this rescales the measurements
# themselves, not the distances). Rebuilding the frame from utilities_df also
# restores the plain company names as the index.
utilities_df_norm = utilities_df.apply(preprocessing.scale, axis=0)
# Six-cluster k-means; random_state is fixed so reruns give the same result.
kmeans = KMeans(
    n_clusters=6,
    init='k-means++',
    n_init=10,
    max_iter=300,
    random_state=0,
)
kmeans.fit(utilities_df_norm)
# Cluster membership: print every cluster id with its utilities.
memb = pd.Series(kmeans.labels_, index=utilities_df_norm.index)
for key, item in memb.groupby(memb):
    members = ', '.join(item.index)
    print(key, ': ', members)

0 :  Idaho, Puget
1 :  Arizona , Central , Florida , Kentucky, Oklahoma, Southern, Texas
2 :  Commonwealth, Madison , Northern, Wisconsin, Virginia
3 :  Boston , Hawaiian , New England, Pacific , San Diego, United
4 :  Nevada
5 :  NY

# One row per fitted cluster center, with the same column names as the
# normalized input data.
centroids = pd.DataFrame(
    data=kmeans.cluster_centers_,
    columns=utilities_df_norm.columns,
)
# Last expression of the cell: display the table.
centroids

	Fixed_charge	RoR	Cost	Load_factor	Demand_growth	Sales	Nuclear	Fuel_Cost
0	0.088252	-0.541112	1.995766	-0.109502	0.987702	1.621068	-0.731447	-1.174696
1	0.516184	0.797896	-1.009097	-0.345490	-0.501098	0.360140	-0.535523	-0.420198
2	-0.011599	0.339180	0.224086	-0.366466	0.170386	-0.411331	1.601868	-0.609460
3	-0.632893	-0.639936	0.206692	1.175321	0.057691	-0.757719	-0.380962	1.203616
4	-2.019709	-1.476137	0.119723	-1.256665	1.069762	2.458495	-0.731447	-0.616086
5	2.085268	-0.883194	0.591840	-1.325495	-0.735555	-1.618644	0.219434	1.732470

# Distance from every utility to each of the fitted cluster centers.
distances = kmeans.transform(utilities_df_norm)
# Squared distance to the nearest center (smallest squared value per row).
minSquaredDistances = (distances ** 2).min(axis=1)
# Pair each squared distance with the cluster label of that utility.
df = pd.DataFrame(
    {'squaredDistance': minSquaredDistances, 'cluster': kmeans.labels_},
    index=utilities_df_norm.index,
)
# Per cluster: member count and the sum of the squared distances.
for cluster, data in df.groupby('cluster'):
    count = data.shape[0]
    withinClustSS = data['squaredDistance'].sum()
    print(f'Cluster {cluster} ({count} members): {withinClustSS:.2f} within cluster ')

Cluster 0 (2 members): 2.54 within cluster 
Cluster 1 (7 members): 27.77 within cluster 
Cluster 2 (5 members): 10.66 within cluster 
Cluster 3 (6 members): 22.20 within cluster 
Cluster 4 (1 members): 0.00 within cluster 
Cluster 5 (1 members): 0.00 within cluster

# Add a label column ("Cluster <n>") that parallel_coordinates uses to
# distinguish and color the lines.
cluster_names = [f'Cluster {i}' for i in centroids.index]
centroids['cluster'] = cluster_names
# Profile plot: one line per cluster centroid across all variables.
plt.figure(figsize=(10,6))
parallel_coordinates(
    centroids,
    class_column='cluster',
    colormap='Dark2',
    linewidth=5,
)
# Place the legend outside the axes, to the right of the plot.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

png

# Elbow chart: fit k-means for k = 1..6 and plot the inertia divided by k
# against k.
cluster_counts = range(1, 7)
inertia = []
for n_clusters in cluster_counts:
    # n_init is pinned to 10, the value used for the six-cluster fit earlier
    # in this notebook. Without it the library default applies, and that
    # default differs between scikit-learn releases, so the curve would not
    # be reproducible across environments.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(utilities_df_norm)
    # inertia_ summed over all clusters, averaged over the number of clusters
    inertia.append(kmeans.inertia_ / n_clusters)
inertias = pd.DataFrame({'n_clusters': cluster_counts, 'inertia': inertia})
ax = inertias.plot(x='n_clusters', y='inertia')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Average Within-Cluster Squared Distances')
# Start the y-axis at zero and leave 10% headroom above the largest value.
plt.ylim((0, 1.1 * inertias.inertia.max()))
# A single unlabeled series needs no legend.
ax.legend().set_visible(False)
plt.show()

png