Skip to content

Chapter 15

Chapter 15 Cluster Analysis

Original Code Credit: Shmueli, Galit; Bruce, Peter C.; Gedeck, Peter; Patel, Nitin R. Data Mining for Business Analytics. Wiley.

Modifications have been made from the original textbook examples due to version changes in library dependencies and/or for clarity.

Download this notebook and data here.

Import Libraries

import os
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import pairwise
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.cluster import KMeans
import matplotlib.pylab as plt
import seaborn as sns
from pandas.plotting import parallel_coordinates

15.1 Introduction

Example: Public Utilities

# Load the utilities data, use the Company column as the row labels, and
# cast every column to float64. The cast is not required, but it avoids a
# warning when the scale function is applied later on.
utilities_df = (
    pd.read_csv(os.path.join('data', 'Utilities.csv'))
    .set_index('Company')
    .astype('float64')
)
# Pairwise Euclidean distances between all rows, on the unscaled measurements
d = pairwise.pairwise_distances(utilities_df, metric='euclidean')
# Label rows and columns of the distance matrix with the company names
pd.DataFrame(d, index=utilities_df.index, columns=utilities_df.index)
Company Arizona Boston Central Commonwealth NY Florida Hawaiian Idaho Kentucky Madison ... Northern Oklahoma Pacific Puget San Diego Southern Texas Wisconsin United Virginia
Company
Arizona 0.000000 3989.408076 140.402855 2654.277632 5777.167672 2050.529440 1435.265019 4006.104187 671.276346 2622.699002 ... 1899.279821 598.556633 2609.045363 6914.742065 3363.061626 1063.009074 4430.251585 1790.485648 2427.588875 1016.617691
Boston 3989.408076 0.000000 4125.044132 1335.466502 1788.068027 6039.689076 2554.287162 7994.155985 3318.276558 1367.090634 ... 2091.160485 4586.302564 1380.749962 10903.146464 629.760748 5052.331669 8419.610541 2199.721665 1562.210811 5005.081262
Central 140.402855 4125.044132 0.000000 2789.759674 5912.552908 1915.155154 1571.295401 3872.257626 807.920792 2758.559663 ... 2035.441520 461.341670 2744.502847 6780.430307 3498.113013 928.749249 4295.014690 1925.772564 2563.637362 883.535455
Commonwealth 2654.277632 1335.466502 2789.759674 0.000000 3123.153215 4704.363099 1219.560005 6659.534567 1983.314354 43.648894 ... 756.831954 3250.984589 56.644626 9568.434429 710.292965 3717.202963 7084.372839 864.273153 232.476871 3670.018191
NY 5777.167672 1788.068027 5912.552908 3123.153215 0.000000 7827.429211 4342.093798 9782.158178 5106.094153 3155.095594 ... 3879.167462 6373.743249 3168.177463 12691.155108 2414.698757 6840.150291 10207.392630 3987.335962 3350.073118 6793.035300
Florida 2050.529440 6039.689076 1915.155154 4704.363099 7827.429211 0.000000 3485.671562 1959.731080 2721.706296 4672.829286 ... 3949.092316 1454.292604 4659.356262 4866.111649 5413.093004 988.044559 2380.124974 3840.227943 4478.028874 1035.981475
Hawaiian 1435.265019 2554.287162 1571.295401 1219.560005 4342.093798 3485.671562 0.000000 5440.461781 764.083188 1187.941143 ... 466.559118 2032.614245 1174.075616 8349.366438 1928.441480 2498.149024 5865.447190 358.476293 992.453252 2451.185161
Idaho 4006.104187 7994.155985 3872.257626 6659.534567 9782.158178 1959.731080 5440.461781 0.000000 4676.638384 6627.291780 ... 5903.395450 3412.263965 6614.499239 2909.014679 7368.815437 2943.535570 447.828673 5795.958815 6432.132202 2989.963982
Kentucky 671.276346 3318.276558 807.920792 1983.314354 5106.094153 2721.706296 764.083188 4676.638384 0.000000 1951.628580 ... 1228.436327 1269.102099 1938.026557 7585.467294 2692.212361 1734.103297 5101.414140 1119.940014 1756.378966 1687.236030
Madison 2622.699002 1367.090634 2758.559663 43.648894 3155.095594 4672.829286 1187.941143 6627.291780 1951.628580 0.000000 ... 724.096182 3219.825109 53.301401 9536.242192 744.253668 3685.510088 7052.723883 833.472995 199.228400 3638.097548
Nevada 8364.031051 12353.062698 8229.223281 11018.057812 14141.022579 6314.359092 9799.015552 4359.599605 9035.007488 10986.098011 ... 10262.157285 7768.384793 10973.010950 1452.162005 11727.066293 7301.040864 3934.617521 10154.118793 10791.049271 7348.049019
New England 2923.136103 1066.579432 3058.707429 271.452731 2854.099482 4973.506840 1488.014909 6928.326174 2252.026717 304.277034 ... 1026.482994 3519.977565 314.354030 9837.281834 442.132760 3986.102433 7353.379146 1134.145010 496.687413 3939.100355
Northern 1899.279821 2091.160485 2035.441520 756.831954 3879.167462 3949.092316 466.559118 5903.395450 1228.436327 724.096182 ... 0.000000 2496.638890 713.665046 8812.303559 1466.991954 2961.834750 6328.917948 119.981262 531.476328 2914.204993
Oklahoma 598.556633 4586.302564 461.341670 3250.984589 6373.743249 1454.292604 2032.614245 3412.263965 1269.102099 3219.825109 ... 2496.638890 0.000000 3205.748876 6319.933836 3959.240748 470.164792 3834.012257 2386.942751 3024.952355 428.065259
Pacific 2609.045363 1380.749962 2744.502847 56.644626 3168.177463 4659.356262 1174.075616 6614.499239 1938.026557 53.301401 ... 713.665046 3205.748876 0.000000 9523.413499 754.612093 3672.035402 7039.262070 820.164297 186.388651 3625.118869
Puget 6914.742065 10903.146464 6780.430307 9568.434429 12691.155108 4866.111649 8349.366438 2909.014679 7585.467294 9536.242192 ... 8812.303559 6319.933836 9523.413499 0.000000 10277.660378 5851.893307 2488.432223 8704.721278 9341.126615 5898.576962
San Diego 3363.061626 629.760748 3498.113013 710.292965 2414.698757 5413.093004 1928.441480 7368.815437 2692.212361 744.253668 ... 1466.991954 3959.240748 754.612093 10277.660378 0.000000 4426.041889 7793.083947 1573.408379 938.522726 4379.211818
Southern 1063.009074 5052.331669 928.749249 3717.202963 6840.150291 988.044559 2498.149024 2943.535570 1734.103297 3685.510088 ... 2961.834750 470.164792 3672.035402 5851.893307 4426.041889 0.000000 3367.318870 2853.298778 3490.422918 59.325286
Texas 4430.251585 8419.610541 4295.014690 7084.372839 10207.392630 2380.124974 5865.447190 447.828673 5101.414140 7052.723883 ... 6328.917948 3834.012257 7039.262070 2488.432223 7793.083947 3367.318870 0.000000 6220.296729 6857.735864 3414.831455
Wisconsin 1790.485648 2199.721665 1925.772564 864.273153 3987.335962 3840.227943 358.476293 5795.958815 1119.940014 833.472995 ... 119.981262 2386.942751 820.164297 8704.721278 1573.408379 2853.298778 6220.296729 0.000000 640.786770 2806.165712
United 2427.588875 1562.210811 2563.637362 232.476871 3350.073118 4478.028874 992.453252 6432.132202 1756.378966 199.228400 ... 531.476328 3024.952355 186.388651 9341.126615 938.522726 3490.422918 6857.735864 640.786770 0.000000 3443.240967
Virginia 1016.617691 5005.081262 883.535455 3670.018191 6793.035300 1035.981475 2451.185161 2989.963982 1687.236030 3638.097548 ... 2914.204993 428.065259 3625.118869 5898.576962 4379.211818 59.325286 3414.831455 2806.165712 3443.240967 0.000000

22 rows × 22 columns

# Standardize every column: subtract the column mean and divide by the
# column standard deviation (pandas' std() is the sample standard deviation).
utilities_df_norm = (utilities_df - utilities_df.mean()) / utilities_df.std()
# Restrict the distance computation to the normalized Sales and Fuel_Cost
# columns. The selection is bound to a name and reused, instead of being
# evaluated as a bare expression whose result is thrown away.
sales_fuel_norm = utilities_df_norm[['Sales', 'Fuel_Cost']]
d_norm = pairwise.pairwise_distances(sales_fuel_norm, metric='euclidean')
# Label rows and columns of the distance matrix with the company names
pd.DataFrame(d_norm, columns=utilities_df.index, index=utilities_df.index)
Company Arizona Boston Central Commonwealth NY Florida Hawaiian Idaho Kentucky Madison ... Northern Oklahoma Pacific Puget San Diego Southern Texas Wisconsin United Virginia
Company
Arizona 0.000000 2.010329 0.774179 0.758738 3.021907 1.244422 1.885248 1.265638 0.461292 0.738650 ... 0.564657 0.182648 1.570780 1.947668 2.509043 0.913621 1.247976 0.521491 2.761745 1.252350
Boston 2.010329 0.000000 1.465703 1.582821 1.013370 1.792397 0.740283 3.176654 1.557738 1.719632 ... 1.940166 2.166078 0.478334 3.501390 0.679634 1.634425 2.890560 1.654255 1.100595 1.479261
Central 0.774179 1.465703 0.000000 1.015710 2.432528 0.631892 1.156092 1.732777 0.419254 1.102287 ... 1.113433 0.855093 0.987772 2.065643 1.836762 0.276440 1.428159 0.838967 2.034824 0.510365
Commonwealth 0.758738 1.582821 1.015710 0.000000 2.571969 1.643857 1.746027 2.003230 0.629994 0.138758 ... 0.377004 0.937389 1.258835 2.699060 2.202930 1.278514 1.998818 0.243408 2.547116 1.502093
NY 3.021907 1.013370 2.432528 2.571969 0.000000 2.635573 1.411695 4.162561 2.566439 2.705445 ... 2.938637 3.174588 1.462019 4.397433 0.715629 2.558409 3.831132 2.661786 0.952507 2.328691
Florida 1.244422 1.792397 0.631892 1.643857 2.635573 0.000000 1.228805 1.764123 1.025663 1.722510 ... 1.698624 1.243634 1.343185 1.767581 1.953423 0.366744 1.277920 1.452417 2.016493 0.313847
Hawaiian 1.885248 0.740283 1.156092 1.746027 1.411695 1.228805 0.000000 2.860189 1.436822 1.880361 ... 2.027224 1.997036 0.560997 2.995848 0.726095 1.205034 2.463227 1.711256 0.879934 0.929414
Idaho 1.265638 3.176654 1.732777 2.003230 4.162561 1.764123 2.860189 0.000000 1.650417 1.950296 ... 1.708409 1.083449 2.705579 0.992092 3.563727 1.658671 0.600089 1.778813 3.720421 1.980715
Kentucky 0.461292 1.557738 0.419254 0.629994 2.566439 1.025663 1.436822 1.650417 0.000000 0.697674 ... 0.694524 0.608401 1.110854 2.180496 2.048098 0.658996 1.493274 0.426780 2.308613 0.929141
Madison 0.738650 1.719632 1.102287 0.138758 2.705445 1.722510 1.880361 1.950296 0.697674 0.000000 ... 0.267198 0.908665 1.397240 2.686215 2.341644 1.355786 1.986625 0.274061 2.685340 1.599587
Nevada 2.369479 3.756513 2.375975 3.106084 4.597006 1.971518 3.185311 1.479526 2.550689 3.105627 ... 2.923023 2.211990 3.293310 0.487508 3.899212 2.145585 1.133311 2.862756 3.887918 2.284803
New England 2.425975 0.684393 1.737322 2.153831 0.846291 1.831380 0.608107 3.458771 1.966323 2.292531 ... 2.480456 2.554109 0.898094 3.598846 0.130663 1.809354 3.071178 2.172473 0.417866 1.536436
Northern 0.564657 1.940166 1.113433 0.377004 2.938637 1.698624 2.027224 1.708409 0.694524 0.267198 ... 0.000000 0.711050 1.582591 2.487892 2.538720 1.336887 1.793287 0.316160 2.861293 1.623614
Oklahoma 0.182648 2.166078 0.855093 0.937389 3.174588 1.243634 1.997036 1.083449 0.608401 0.908665 ... 0.711050 0.000000 1.716739 1.780656 2.642155 0.944295 1.083449 0.702684 2.876646 1.296548
Pacific 1.570780 0.478334 0.987772 1.258835 1.462019 1.343185 0.560997 2.705579 1.110854 1.397240 ... 1.582591 1.716739 0.000000 3.027116 0.958905 1.160017 2.412278 1.276200 1.288563 1.035028
Puget 1.947668 3.501390 2.065643 2.699060 4.397433 1.767581 2.995848 0.992092 2.180496 2.686215 ... 2.487892 1.780656 3.027116 0.000000 3.720970 1.867235 0.700313 2.456272 3.763066 2.069314
San Diego 2.509043 0.679634 1.836762 2.202930 0.715629 1.953423 0.726095 3.563727 2.048098 2.341644 ... 2.538720 2.642155 0.958905 3.720970 0.000000 1.920035 3.185942 2.234632 0.440163 1.655498
Southern 0.913621 1.634425 0.276440 1.278514 2.558409 0.366744 1.205034 1.658671 0.658996 1.355786 ... 1.336887 0.944295 1.160017 1.867235 1.920035 0.000000 1.272784 1.085774 2.062067 0.356298
Texas 1.247976 2.890560 1.428159 1.998818 3.831132 1.277920 2.463227 0.600089 1.493274 1.986625 ... 1.793287 1.083449 2.412278 0.700313 3.185942 1.272784 0.000000 1.756136 3.288460 1.541576
Wisconsin 0.521491 1.654255 0.838967 0.243408 2.661786 1.452417 1.711256 1.778813 0.426780 0.274061 ... 0.316160 0.702684 1.276200 2.456272 2.234632 1.085774 1.756136 0.000000 2.549040 1.343306
United 2.761745 1.100595 2.034824 2.547116 0.952507 2.016493 0.879934 3.720421 2.308613 2.685340 ... 2.861293 2.876646 1.288563 3.763066 0.440163 2.062067 3.288460 2.549040 0.000000 1.749930
Virginia 1.252350 1.479261 0.510365 1.502093 2.328691 0.313847 0.929414 1.980715 0.929141 1.599587 ... 1.623614 1.296548 1.035028 2.069314 1.655498 0.356298 1.541576 1.343306 1.749930 0.000000

22 rows × 22 columns

15.4 Hierarchical (Agglomerative) Clustering

We will focus on k-means clustering; the hierarchical clustering code below is provided for reference.

# Two stacked dendrograms of the normalized data, one per linkage method.
# linkage() accepts method =
# 'single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward'
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))
ax1, ax2 = axes
panels = [
    ('single', 2.75, ax1, 'Hierarchical Clustering Dendrogram (Single Linkage)'),
    ('average', 3.6, ax2, 'Hierarchical Clustering Dendrogram (Average Linkage)'),
]
for method, threshold, panel_ax, title in panels:
    # Z is rebound on each pass, so it ends up holding the average linkage
    Z = linkage(utilities_df_norm, method=method)
    dendrogram(Z, labels=utilities_df_norm.index, color_threshold=threshold, ax=panel_ax)
    panel_ax.set_title(title)
plt.tight_layout()
plt.show()

png

# Cut the single-linkage tree into at most 6 flat clusters and list the
# members of each cluster on one line.
single_linkage = linkage(utilities_df_norm, method='single')
memb = pd.Series(
    fcluster(single_linkage, 6, criterion='maxclust'),
    index=utilities_df_norm.index,
)
for cluster_id, members in memb.groupby(memb):
    print(cluster_id, ': ', ', '.join(members.index))
1 :  Idaho, Puget
2 :  Arizona , Boston , Commonwealth, Florida , Hawaiian , Kentucky, Madison , New England, Northern, Oklahoma, Pacific , Southern, Texas, Wisconsin, United, Virginia
3 :  Central 
4 :  San Diego
5 :  Nevada
6 :  NY
# Cut the average-linkage tree into at most 6 flat clusters and list the
# members of each cluster on one line.
average_linkage = linkage(utilities_df_norm, method='average')
memb = pd.Series(
    fcluster(average_linkage, 6, criterion='maxclust'),
    index=utilities_df_norm.index,
)
for cluster_id, members in memb.groupby(memb):
    print(cluster_id, ': ', ', '.join(members.index))
1 :  Idaho, Nevada, Puget
2 :  Hawaiian , New England, Pacific , United
3 :  San Diego
4 :  Boston , Commonwealth, Madison , Northern, Wisconsin, Virginia
5 :  Arizona , Central , Florida , Kentucky, Oklahoma, Southern, Texas
6 :  NY
# Build the heatmap input on a copy so the shared utilities_df_norm keeps its
# plain company-name index. Relabelling it in place would stack another
# "cluster: " prefix on every re-run of this cell.
heatmap_df = utilities_df_norm.copy()
# set labels as cluster membership and utility name
heatmap_df.index = [f'{cluster}: {company}'
                    for cluster, company in zip(memb, heatmap_df.index)]
# plot heatmap
# the '_r' suffix reverses the color mapping to large = dark
sns.clustermap(heatmap_df, method='average', col_cluster=False, cmap='mako_r')
plt.show()

png

15.5 Non-Hierarchical Clustering: The k-Means Algorithm

# Standardize each column with sklearn's scale before clustering
utilities_df_norm = utilities_df.apply(preprocessing.scale, axis=0)
# Fit k-means with 6 clusters; random_state is fixed for reproducibility
kmeans = KMeans(n_clusters=6, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(utilities_df_norm)
# Cluster membership: one output line per cluster label
memb = pd.Series(kmeans.labels_, index=utilities_df_norm.index)
for label, companies in memb.groupby(memb):
    print(label, ': ', ', '.join(companies.index))
0 :  Idaho, Puget
1 :  Arizona , Central , Florida , Kentucky, Oklahoma, Southern, Texas
2 :  Commonwealth, Madison , Northern, Wisconsin, Virginia
3 :  Boston , Hawaiian , New England, Pacific , San Diego, United
4 :  Nevada
5 :  NY
# One row per fitted cluster centre, one column per input variable
centroids = pd.DataFrame(
    data=kmeans.cluster_centers_,
    columns=utilities_df_norm.columns,
)
centroids
Fixed_charge RoR Cost Load_factor Demand_growth Sales Nuclear Fuel_Cost
0 0.088252 -0.541112 1.995766 -0.109502 0.987702 1.621068 -0.731447 -1.174696
1 0.516184 0.797896 -1.009097 -0.345490 -0.501098 0.360140 -0.535523 -0.420198
2 -0.011599 0.339180 0.224086 -0.366466 0.170386 -0.411331 1.601868 -0.609460
3 -0.632893 -0.639936 0.206692 1.175321 0.057691 -0.757719 -0.380962 1.203616
4 -2.019709 -1.476137 0.119723 -1.256665 1.069762 2.458495 -0.731447 -0.616086
5 2.085268 -0.883194 0.591840 -1.325495 -0.735555 -1.618644 0.219434 1.732470
# Distance from every data point to every cluster center
distances = kmeans.transform(utilities_df_norm)
# Squared distance from each data point to its closest center
closest = distances.min(axis=1)
minSquaredDistances = closest * closest
# Pair each squared distance with the point's cluster label
df = pd.DataFrame(
    {'squaredDistance': minSquaredDistances, 'cluster': kmeans.labels_},
    index=utilities_df_norm.index,
)
# Per cluster: member count and within-cluster sum of squared distances
for cluster, data in df.groupby('cluster'):
    count = data.shape[0]
    withinClustSS = data['squaredDistance'].sum()
    print(f'Cluster {cluster} ({count} members): {withinClustSS:.2f} within cluster ')
Cluster 0 (2 members): 2.54 within cluster 
Cluster 1 (7 members): 27.77 within cluster 
Cluster 2 (5 members): 10.66 within cluster 
Cluster 3 (6 members): 22.20 within cluster 
Cluster 4 (1 members): 0.00 within cluster 
Cluster 5 (1 members): 0.00 within cluster
# Add a label column ("Cluster 0", "Cluster 1", ...) for the plot legend
centroids['cluster'] = list(map('Cluster {}'.format, centroids.index))
# Profile plot: one line per cluster centroid across all variables
plt.figure(figsize=(10, 6))
parallel_coordinates(centroids, 'cluster', linewidth=5, colormap='Dark2')
# Place the legend outside the axes, to the right of the plot
plt.legend(bbox_to_anchor=(1, 0.5), loc='center left')
plt.show()

png

# Elbow chart: average within-cluster squared distance for k = 1..6.
cluster_counts = range(1, 7)
inertia = []
for n_clusters in cluster_counts:
    # n_init is set explicitly, matching the 6-cluster fit earlier in the
    # notebook, so the curve does not depend on the scikit-learn version's
    # default. A separate name keeps the loop from rebinding `kmeans`.
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(utilities_df_norm)
    # inertia_ divided by the number of clusters, as plotted on the y-axis
    inertia.append(model.inertia_ / n_clusters)
inertias = pd.DataFrame({'n_clusters': cluster_counts, 'inertia': inertia})
ax = inertias.plot(x='n_clusters', y='inertia')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Average Within-Cluster Squared Distances')
# Start the y-axis at zero and leave 10% headroom above the largest value
plt.ylim((0, 1.1 * inertias.inertia.max()))
ax.legend().set_visible(False)
plt.show()

png