Chapter 15
Chapter 15 Cluster Analysis
Original Code Credit: Shmueli, Galit; Bruce, Peter C.; Gedeck, Peter; Patel, Nitin R. Data Mining for Business Analytics. Wiley.
Modifications have been made from the original textbook examples due to version changes in library dependencies and/or for clarity.
Download this notebook and data here.
Import Libraries
import os
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import pairwise
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.cluster import KMeans
import matplotlib.pylab as plt
import seaborn as sns
from pandas.plotting import parallel_coordinates
15.1 Introduction
Example: Public Utilities
# Read the utilities data and use the Company column as the row labels.
utilities_df = pd.read_csv(os.path.join('data', 'Utilities.csv')).set_index('Company')
# Cast every column to float64. This is optional, but it avoids an
# integer-conversion warning when the scale function is applied later.
utilities_df = utilities_df.astype('float64')
# Pairwise Euclidean distances between utilities on the unscaled measurements.
d = pairwise.pairwise_distances(utilities_df, metric='euclidean')
pd.DataFrame(d, index=utilities_df.index, columns=utilities_df.index)
Company | Arizona | Boston | Central | Commonwealth | NY | Florida | Hawaiian | Idaho | Kentucky | Madison | ... | Northern | Oklahoma | Pacific | Puget | San Diego | Southern | Texas | Wisconsin | United | Virginia |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Company | |||||||||||||||||||||
Arizona | 0.000000 | 3989.408076 | 140.402855 | 2654.277632 | 5777.167672 | 2050.529440 | 1435.265019 | 4006.104187 | 671.276346 | 2622.699002 | ... | 1899.279821 | 598.556633 | 2609.045363 | 6914.742065 | 3363.061626 | 1063.009074 | 4430.251585 | 1790.485648 | 2427.588875 | 1016.617691 |
Boston | 3989.408076 | 0.000000 | 4125.044132 | 1335.466502 | 1788.068027 | 6039.689076 | 2554.287162 | 7994.155985 | 3318.276558 | 1367.090634 | ... | 2091.160485 | 4586.302564 | 1380.749962 | 10903.146464 | 629.760748 | 5052.331669 | 8419.610541 | 2199.721665 | 1562.210811 | 5005.081262 |
Central | 140.402855 | 4125.044132 | 0.000000 | 2789.759674 | 5912.552908 | 1915.155154 | 1571.295401 | 3872.257626 | 807.920792 | 2758.559663 | ... | 2035.441520 | 461.341670 | 2744.502847 | 6780.430307 | 3498.113013 | 928.749249 | 4295.014690 | 1925.772564 | 2563.637362 | 883.535455 |
Commonwealth | 2654.277632 | 1335.466502 | 2789.759674 | 0.000000 | 3123.153215 | 4704.363099 | 1219.560005 | 6659.534567 | 1983.314354 | 43.648894 | ... | 756.831954 | 3250.984589 | 56.644626 | 9568.434429 | 710.292965 | 3717.202963 | 7084.372839 | 864.273153 | 232.476871 | 3670.018191 |
NY | 5777.167672 | 1788.068027 | 5912.552908 | 3123.153215 | 0.000000 | 7827.429211 | 4342.093798 | 9782.158178 | 5106.094153 | 3155.095594 | ... | 3879.167462 | 6373.743249 | 3168.177463 | 12691.155108 | 2414.698757 | 6840.150291 | 10207.392630 | 3987.335962 | 3350.073118 | 6793.035300 |
Florida | 2050.529440 | 6039.689076 | 1915.155154 | 4704.363099 | 7827.429211 | 0.000000 | 3485.671562 | 1959.731080 | 2721.706296 | 4672.829286 | ... | 3949.092316 | 1454.292604 | 4659.356262 | 4866.111649 | 5413.093004 | 988.044559 | 2380.124974 | 3840.227943 | 4478.028874 | 1035.981475 |
Hawaiian | 1435.265019 | 2554.287162 | 1571.295401 | 1219.560005 | 4342.093798 | 3485.671562 | 0.000000 | 5440.461781 | 764.083188 | 1187.941143 | ... | 466.559118 | 2032.614245 | 1174.075616 | 8349.366438 | 1928.441480 | 2498.149024 | 5865.447190 | 358.476293 | 992.453252 | 2451.185161 |
Idaho | 4006.104187 | 7994.155985 | 3872.257626 | 6659.534567 | 9782.158178 | 1959.731080 | 5440.461781 | 0.000000 | 4676.638384 | 6627.291780 | ... | 5903.395450 | 3412.263965 | 6614.499239 | 2909.014679 | 7368.815437 | 2943.535570 | 447.828673 | 5795.958815 | 6432.132202 | 2989.963982 |
Kentucky | 671.276346 | 3318.276558 | 807.920792 | 1983.314354 | 5106.094153 | 2721.706296 | 764.083188 | 4676.638384 | 0.000000 | 1951.628580 | ... | 1228.436327 | 1269.102099 | 1938.026557 | 7585.467294 | 2692.212361 | 1734.103297 | 5101.414140 | 1119.940014 | 1756.378966 | 1687.236030 |
Madison | 2622.699002 | 1367.090634 | 2758.559663 | 43.648894 | 3155.095594 | 4672.829286 | 1187.941143 | 6627.291780 | 1951.628580 | 0.000000 | ... | 724.096182 | 3219.825109 | 53.301401 | 9536.242192 | 744.253668 | 3685.510088 | 7052.723883 | 833.472995 | 199.228400 | 3638.097548 |
Nevada | 8364.031051 | 12353.062698 | 8229.223281 | 11018.057812 | 14141.022579 | 6314.359092 | 9799.015552 | 4359.599605 | 9035.007488 | 10986.098011 | ... | 10262.157285 | 7768.384793 | 10973.010950 | 1452.162005 | 11727.066293 | 7301.040864 | 3934.617521 | 10154.118793 | 10791.049271 | 7348.049019 |
New England | 2923.136103 | 1066.579432 | 3058.707429 | 271.452731 | 2854.099482 | 4973.506840 | 1488.014909 | 6928.326174 | 2252.026717 | 304.277034 | ... | 1026.482994 | 3519.977565 | 314.354030 | 9837.281834 | 442.132760 | 3986.102433 | 7353.379146 | 1134.145010 | 496.687413 | 3939.100355 |
Northern | 1899.279821 | 2091.160485 | 2035.441520 | 756.831954 | 3879.167462 | 3949.092316 | 466.559118 | 5903.395450 | 1228.436327 | 724.096182 | ... | 0.000000 | 2496.638890 | 713.665046 | 8812.303559 | 1466.991954 | 2961.834750 | 6328.917948 | 119.981262 | 531.476328 | 2914.204993 |
Oklahoma | 598.556633 | 4586.302564 | 461.341670 | 3250.984589 | 6373.743249 | 1454.292604 | 2032.614245 | 3412.263965 | 1269.102099 | 3219.825109 | ... | 2496.638890 | 0.000000 | 3205.748876 | 6319.933836 | 3959.240748 | 470.164792 | 3834.012257 | 2386.942751 | 3024.952355 | 428.065259 |
Pacific | 2609.045363 | 1380.749962 | 2744.502847 | 56.644626 | 3168.177463 | 4659.356262 | 1174.075616 | 6614.499239 | 1938.026557 | 53.301401 | ... | 713.665046 | 3205.748876 | 0.000000 | 9523.413499 | 754.612093 | 3672.035402 | 7039.262070 | 820.164297 | 186.388651 | 3625.118869 |
Puget | 6914.742065 | 10903.146464 | 6780.430307 | 9568.434429 | 12691.155108 | 4866.111649 | 8349.366438 | 2909.014679 | 7585.467294 | 9536.242192 | ... | 8812.303559 | 6319.933836 | 9523.413499 | 0.000000 | 10277.660378 | 5851.893307 | 2488.432223 | 8704.721278 | 9341.126615 | 5898.576962 |
San Diego | 3363.061626 | 629.760748 | 3498.113013 | 710.292965 | 2414.698757 | 5413.093004 | 1928.441480 | 7368.815437 | 2692.212361 | 744.253668 | ... | 1466.991954 | 3959.240748 | 754.612093 | 10277.660378 | 0.000000 | 4426.041889 | 7793.083947 | 1573.408379 | 938.522726 | 4379.211818 |
Southern | 1063.009074 | 5052.331669 | 928.749249 | 3717.202963 | 6840.150291 | 988.044559 | 2498.149024 | 2943.535570 | 1734.103297 | 3685.510088 | ... | 2961.834750 | 470.164792 | 3672.035402 | 5851.893307 | 4426.041889 | 0.000000 | 3367.318870 | 2853.298778 | 3490.422918 | 59.325286 |
Texas | 4430.251585 | 8419.610541 | 4295.014690 | 7084.372839 | 10207.392630 | 2380.124974 | 5865.447190 | 447.828673 | 5101.414140 | 7052.723883 | ... | 6328.917948 | 3834.012257 | 7039.262070 | 2488.432223 | 7793.083947 | 3367.318870 | 0.000000 | 6220.296729 | 6857.735864 | 3414.831455 |
Wisconsin | 1790.485648 | 2199.721665 | 1925.772564 | 864.273153 | 3987.335962 | 3840.227943 | 358.476293 | 5795.958815 | 1119.940014 | 833.472995 | ... | 119.981262 | 2386.942751 | 820.164297 | 8704.721278 | 1573.408379 | 2853.298778 | 6220.296729 | 0.000000 | 640.786770 | 2806.165712 |
United | 2427.588875 | 1562.210811 | 2563.637362 | 232.476871 | 3350.073118 | 4478.028874 | 992.453252 | 6432.132202 | 1756.378966 | 199.228400 | ... | 531.476328 | 3024.952355 | 186.388651 | 9341.126615 | 938.522726 | 3490.422918 | 6857.735864 | 640.786770 | 0.000000 | 3443.240967 |
Virginia | 1016.617691 | 5005.081262 | 883.535455 | 3670.018191 | 6793.035300 | 1035.981475 | 2451.185161 | 2989.963982 | 1687.236030 | 3638.097548 | ... | 2914.204993 | 428.065259 | 3625.118869 | 5898.576962 | 4379.211818 | 59.325286 | 3414.831455 | 2806.165712 | 3443.240967 | 0.000000 |
22 rows × 22 columns
# Standardize every column to z-scores.
# pandas uses sample standard deviation
utilities_df_norm = (utilities_df - utilities_df.mean()) / utilities_df.std()
# Pick the two standardized variables once and reuse the selection below,
# instead of evaluating it, discarding the result and evaluating it again.
sales_fuel_norm = utilities_df_norm[['Sales', 'Fuel_Cost']]
# compute normalized distance based on Sales and Fuel Cost
d_norm = pairwise.pairwise_distances(sales_fuel_norm, metric='euclidean')
pd.DataFrame(d_norm, columns=utilities_df.index, index=utilities_df.index)
Company | Arizona | Boston | Central | Commonwealth | NY | Florida | Hawaiian | Idaho | Kentucky | Madison | ... | Northern | Oklahoma | Pacific | Puget | San Diego | Southern | Texas | Wisconsin | United | Virginia |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Company | |||||||||||||||||||||
Arizona | 0.000000 | 2.010329 | 0.774179 | 0.758738 | 3.021907 | 1.244422 | 1.885248 | 1.265638 | 0.461292 | 0.738650 | ... | 0.564657 | 0.182648 | 1.570780 | 1.947668 | 2.509043 | 0.913621 | 1.247976 | 0.521491 | 2.761745 | 1.252350 |
Boston | 2.010329 | 0.000000 | 1.465703 | 1.582821 | 1.013370 | 1.792397 | 0.740283 | 3.176654 | 1.557738 | 1.719632 | ... | 1.940166 | 2.166078 | 0.478334 | 3.501390 | 0.679634 | 1.634425 | 2.890560 | 1.654255 | 1.100595 | 1.479261 |
Central | 0.774179 | 1.465703 | 0.000000 | 1.015710 | 2.432528 | 0.631892 | 1.156092 | 1.732777 | 0.419254 | 1.102287 | ... | 1.113433 | 0.855093 | 0.987772 | 2.065643 | 1.836762 | 0.276440 | 1.428159 | 0.838967 | 2.034824 | 0.510365 |
Commonwealth | 0.758738 | 1.582821 | 1.015710 | 0.000000 | 2.571969 | 1.643857 | 1.746027 | 2.003230 | 0.629994 | 0.138758 | ... | 0.377004 | 0.937389 | 1.258835 | 2.699060 | 2.202930 | 1.278514 | 1.998818 | 0.243408 | 2.547116 | 1.502093 |
NY | 3.021907 | 1.013370 | 2.432528 | 2.571969 | 0.000000 | 2.635573 | 1.411695 | 4.162561 | 2.566439 | 2.705445 | ... | 2.938637 | 3.174588 | 1.462019 | 4.397433 | 0.715629 | 2.558409 | 3.831132 | 2.661786 | 0.952507 | 2.328691 |
Florida | 1.244422 | 1.792397 | 0.631892 | 1.643857 | 2.635573 | 0.000000 | 1.228805 | 1.764123 | 1.025663 | 1.722510 | ... | 1.698624 | 1.243634 | 1.343185 | 1.767581 | 1.953423 | 0.366744 | 1.277920 | 1.452417 | 2.016493 | 0.313847 |
Hawaiian | 1.885248 | 0.740283 | 1.156092 | 1.746027 | 1.411695 | 1.228805 | 0.000000 | 2.860189 | 1.436822 | 1.880361 | ... | 2.027224 | 1.997036 | 0.560997 | 2.995848 | 0.726095 | 1.205034 | 2.463227 | 1.711256 | 0.879934 | 0.929414 |
Idaho | 1.265638 | 3.176654 | 1.732777 | 2.003230 | 4.162561 | 1.764123 | 2.860189 | 0.000000 | 1.650417 | 1.950296 | ... | 1.708409 | 1.083449 | 2.705579 | 0.992092 | 3.563727 | 1.658671 | 0.600089 | 1.778813 | 3.720421 | 1.980715 |
Kentucky | 0.461292 | 1.557738 | 0.419254 | 0.629994 | 2.566439 | 1.025663 | 1.436822 | 1.650417 | 0.000000 | 0.697674 | ... | 0.694524 | 0.608401 | 1.110854 | 2.180496 | 2.048098 | 0.658996 | 1.493274 | 0.426780 | 2.308613 | 0.929141 |
Madison | 0.738650 | 1.719632 | 1.102287 | 0.138758 | 2.705445 | 1.722510 | 1.880361 | 1.950296 | 0.697674 | 0.000000 | ... | 0.267198 | 0.908665 | 1.397240 | 2.686215 | 2.341644 | 1.355786 | 1.986625 | 0.274061 | 2.685340 | 1.599587 |
Nevada | 2.369479 | 3.756513 | 2.375975 | 3.106084 | 4.597006 | 1.971518 | 3.185311 | 1.479526 | 2.550689 | 3.105627 | ... | 2.923023 | 2.211990 | 3.293310 | 0.487508 | 3.899212 | 2.145585 | 1.133311 | 2.862756 | 3.887918 | 2.284803 |
New England | 2.425975 | 0.684393 | 1.737322 | 2.153831 | 0.846291 | 1.831380 | 0.608107 | 3.458771 | 1.966323 | 2.292531 | ... | 2.480456 | 2.554109 | 0.898094 | 3.598846 | 0.130663 | 1.809354 | 3.071178 | 2.172473 | 0.417866 | 1.536436 |
Northern | 0.564657 | 1.940166 | 1.113433 | 0.377004 | 2.938637 | 1.698624 | 2.027224 | 1.708409 | 0.694524 | 0.267198 | ... | 0.000000 | 0.711050 | 1.582591 | 2.487892 | 2.538720 | 1.336887 | 1.793287 | 0.316160 | 2.861293 | 1.623614 |
Oklahoma | 0.182648 | 2.166078 | 0.855093 | 0.937389 | 3.174588 | 1.243634 | 1.997036 | 1.083449 | 0.608401 | 0.908665 | ... | 0.711050 | 0.000000 | 1.716739 | 1.780656 | 2.642155 | 0.944295 | 1.083449 | 0.702684 | 2.876646 | 1.296548 |
Pacific | 1.570780 | 0.478334 | 0.987772 | 1.258835 | 1.462019 | 1.343185 | 0.560997 | 2.705579 | 1.110854 | 1.397240 | ... | 1.582591 | 1.716739 | 0.000000 | 3.027116 | 0.958905 | 1.160017 | 2.412278 | 1.276200 | 1.288563 | 1.035028 |
Puget | 1.947668 | 3.501390 | 2.065643 | 2.699060 | 4.397433 | 1.767581 | 2.995848 | 0.992092 | 2.180496 | 2.686215 | ... | 2.487892 | 1.780656 | 3.027116 | 0.000000 | 3.720970 | 1.867235 | 0.700313 | 2.456272 | 3.763066 | 2.069314 |
San Diego | 2.509043 | 0.679634 | 1.836762 | 2.202930 | 0.715629 | 1.953423 | 0.726095 | 3.563727 | 2.048098 | 2.341644 | ... | 2.538720 | 2.642155 | 0.958905 | 3.720970 | 0.000000 | 1.920035 | 3.185942 | 2.234632 | 0.440163 | 1.655498 |
Southern | 0.913621 | 1.634425 | 0.276440 | 1.278514 | 2.558409 | 0.366744 | 1.205034 | 1.658671 | 0.658996 | 1.355786 | ... | 1.336887 | 0.944295 | 1.160017 | 1.867235 | 1.920035 | 0.000000 | 1.272784 | 1.085774 | 2.062067 | 0.356298 |
Texas | 1.247976 | 2.890560 | 1.428159 | 1.998818 | 3.831132 | 1.277920 | 2.463227 | 0.600089 | 1.493274 | 1.986625 | ... | 1.793287 | 1.083449 | 2.412278 | 0.700313 | 3.185942 | 1.272784 | 0.000000 | 1.756136 | 3.288460 | 1.541576 |
Wisconsin | 0.521491 | 1.654255 | 0.838967 | 0.243408 | 2.661786 | 1.452417 | 1.711256 | 1.778813 | 0.426780 | 0.274061 | ... | 0.316160 | 0.702684 | 1.276200 | 2.456272 | 2.234632 | 1.085774 | 1.756136 | 0.000000 | 2.549040 | 1.343306 |
United | 2.761745 | 1.100595 | 2.034824 | 2.547116 | 0.952507 | 2.016493 | 0.879934 | 3.720421 | 2.308613 | 2.685340 | ... | 2.861293 | 2.876646 | 1.288563 | 3.763066 | 0.440163 | 2.062067 | 3.288460 | 2.549040 | 0.000000 | 1.749930 |
Virginia | 1.252350 | 1.479261 | 0.510365 | 1.502093 | 2.328691 | 0.313847 | 0.929414 | 1.980715 | 0.929141 | 1.599587 | ... | 1.623614 | 1.296548 | 1.035028 | 2.069314 | 1.655498 | 0.356298 | 1.541576 | 1.343306 | 1.749930 | 0.000000 |
22 rows × 22 columns
15.4 Hierarchical (Agglomerative) Clustering
We will focus on k-means clustering, but code examples for hierarchical clustering are provided for reference.
fig, axes = plt.subplots(2, 1, figsize=(10, 10))
ax1, ax2 = axes
# linkage() accepts method = 'single', 'complete', 'average', 'weighted',
# 'centroid', 'median' or 'ward'.
# One dendrogram per panel: (axis, linkage method, colour threshold, title).
panels = [
    (ax1, 'single', 2.75, 'Hierarchical Clustering Dendrogram (Single Linkage)'),
    (ax2, 'average', 3.6, 'Hierarchical Clustering Dendrogram (Average Linkage)'),
]
for panel_ax, method, threshold, title in panels:
    Z = linkage(utilities_df_norm, method=method)
    dendrogram(Z, labels=utilities_df_norm.index, color_threshold=threshold, ax=panel_ax)
    panel_ax.set_title(title)
plt.tight_layout()
plt.show()
# Cut the single-linkage tree into flat clusters, asking for a maximum of 6
# (criterion='maxclust'), and label each assignment with its utility name.
memb = fcluster(linkage(utilities_df_norm, method='single'), 6, criterion='maxclust')
memb = pd.Series(memb, index=utilities_df_norm.index)
# Print one line per cluster: the cluster number followed by its members.
for key, item in memb.groupby(memb):
    print(key, ': ', ', '.join(item.index))
1 : Idaho, Puget
2 : Arizona , Boston , Commonwealth, Florida , Hawaiian , Kentucky, Madison , New England, Northern, Oklahoma, Pacific , Southern, Texas, Wisconsin, United, Virginia
3 : Central
4 : San Diego
5 : Nevada
6 : NY
# Cut the average-linkage tree into flat clusters, asking for a maximum of 6
# (criterion='maxclust'), and label each assignment with its utility name.
memb = fcluster(linkage(utilities_df_norm, method='average'), 6, criterion='maxclust')
memb = pd.Series(memb, index=utilities_df_norm.index)
# Print one line per cluster: the cluster number followed by its members.
for key, item in memb.groupby(memb):
    print(key, ': ', ', '.join(item.index))
1 : Idaho, Nevada, Puget
2 : Hawaiian , New England, Pacific , United
3 : San Diego
4 : Boston , Commonwealth, Madison , Northern, Wisconsin, Virginia
5 : Arizona , Central , Florida , Kentucky, Oklahoma, Southern, Texas
6 : NY
# Relabel each row as "<cluster>: <utility name>" so that cluster membership
# can be read directly from the heatmap's row labels.
utilities_df_norm.index = [
    f'{cluster}: {company}'
    for cluster, company in zip(memb, utilities_df_norm.index)
]
# Draw the heatmap with rows clustered by average linkage and columns left in
# their original order. The '_r' suffix reverses the colormap (large = dark).
sns.clustermap(utilities_df_norm, cmap='mako_r', method='average', col_cluster=False)
plt.show()
15.5 Non-Hierarchical Clustering: The k-Means Algorithm
# Standardize the data (not the distances): scale each column of the original
# measurements to zero mean and unit variance before clustering.
utilities_df_norm = utilities_df.apply(preprocessing.scale, axis=0)
# Fit k-means with 6 clusters; random_state is fixed so the result is reproducible.
kmeans = KMeans(n_clusters=6, init='k-means++', max_iter=300, n_init=10, random_state=0).fit(utilities_df_norm)
# Cluster membership
memb = pd.Series(kmeans.labels_, index=utilities_df_norm.index)
# Print one line per cluster: the cluster label followed by its members.
for key, item in memb.groupby(memb):
    print(key, ': ', ', '.join(item.index))
0 : Idaho, Puget
1 : Arizona , Central , Florida , Kentucky, Oklahoma, Southern, Texas
2 : Commonwealth, Madison , Northern, Wisconsin, Virginia
3 : Boston , Hawaiian , New England, Pacific , San Diego, United
4 : Nevada
5 : NY
# Tabulate the fitted cluster centres: one row per cluster, with the columns
# named after the variables of the standardized data.
centroids = pd.DataFrame(kmeans.cluster_centers_)
centroids.columns = utilities_df_norm.columns
centroids
Fixed_charge | RoR | Cost | Load_factor | Demand_growth | Sales | Nuclear | Fuel_Cost | |
---|---|---|---|---|---|---|---|---|
0 | 0.088252 | -0.541112 | 1.995766 | -0.109502 | 0.987702 | 1.621068 | -0.731447 | -1.174696 |
1 | 0.516184 | 0.797896 | -1.009097 | -0.345490 | -0.501098 | 0.360140 | -0.535523 | -0.420198 |
2 | -0.011599 | 0.339180 | 0.224086 | -0.366466 | 0.170386 | -0.411331 | 1.601868 | -0.609460 |
3 | -0.632893 | -0.639936 | 0.206692 | 1.175321 | 0.057691 | -0.757719 | -0.380962 | 1.203616 |
4 | -2.019709 | -1.476137 | 0.119723 | -1.256665 | 1.069762 | 2.458495 | -0.731447 | -0.616086 |
5 | 2.085268 | -0.883194 | 0.591840 | -1.325495 | -0.735555 | -1.618644 | 0.219434 | 1.732470 |
# calculate the distances of each data point to the cluster centers
distances = kmeans.transform(utilities_df_norm)
# squared distance from each data point to its nearest cluster center
minSquaredDistances = distances.min(axis=1) ** 2
# combine with cluster labels into a data frame
df = pd.DataFrame({'squaredDistance': minSquaredDistances, 'cluster': kmeans.labels_},
                  index=utilities_df_norm.index)
# group by cluster and print the member count and the within-cluster sum of
# squared distances for each cluster
for cluster, data in df.groupby('cluster'):
    count = len(data)
    withinClustSS = data.squaredDistance.sum()
    print(f'Cluster {cluster} ({count} members): {withinClustSS:.2f} within cluster ')
Cluster 0 (2 members): 2.54 within cluster
Cluster 1 (7 members): 27.77 within cluster
Cluster 2 (5 members): 10.66 within cluster
Cluster 3 (6 members): 22.20 within cluster
Cluster 4 (1 members): 0.00 within cluster
Cluster 5 (1 members): 0.00 within cluster
# Add a label column so that each centroid is drawn as its own class.
centroids['cluster'] = [f'Cluster {idx}' for idx in centroids.index]
# Profile plot of the centroids: one thick line per cluster across all variables.
plt.figure(figsize=(10, 6))
parallel_coordinates(centroids, 'cluster', colormap='Dark2', linewidth=5)
# Place the legend outside the axes, to the right of the plot.
plt.legend(bbox_to_anchor=(1, 0.5), loc='center left')
plt.show()
# Elbow chart: fit k-means for k = 1..6 and record the inertia divided by k.
cluster_counts = range(1, 7)
inertia = []
for n_clusters in cluster_counts:
    # n_init is set explicitly to match the 6-cluster fit used elsewhere in
    # this notebook; the library default differs between scikit-learn
    # versions, which would otherwise make this curve version-dependent.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(utilities_df_norm)
    inertia.append(kmeans.inertia_ / n_clusters)
inertias = pd.DataFrame({'n_clusters': cluster_counts, 'inertia': inertia})
ax = inertias.plot(x='n_clusters', y='inertia')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Average Within-Cluster Squared Distances')
# Start the y-axis at zero and leave 10% headroom above the largest value.
plt.ylim((0, 1.1 * inertias.inertia.max()))
ax.legend().set_visible(False)
plt.show()