Chapter 7
Chapter 7: k-Nearest Neighbors (k-NN)¶
Original code credit: Shmueli, Galit; Bruce, Peter C.; Gedeck, Peter; Patel, Nitin R. Machine Learning for Business Analytics. Wiley.
Modifications have been made from the original textbook examples due to version changes in library dependencies and/or for clarity.
Import Libraries¶
In [1]:
Copied!
import os
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import matplotlib.pylab as plt
import matplotlib
%matplotlib inline
import os
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import matplotlib.pylab as plt
import matplotlib
%matplotlib inline
7.1 The k-NN Classifier (Categorical Outcome)¶
Example: Riding Mowers¶
In [2]:
Copied!
# Load the riding-mower data and give every household a 1-based label for the plot
mower_df = pd.read_csv(os.path.join('..', 'data', 'RidingMowers.csv'))
mower_df['Number'] = mower_df.index + 1

# 60/40 train/validation split; the fixed seed keeps the partition reproducible
trainData, validData = train_test_split(mower_df, test_size=0.4, random_state=26)

## new household
newHousehold = pd.DataFrame([{'Income': 60, 'Lot_Size': 20}])


## scatter plot
def plotDataset(ax, data):
    """Draw the households in ``data`` on ``ax``, one marker style per ownership class.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes that receives the scatter points and the text labels.
    data : pandas.DataFrame
        Must provide the columns ``Income``, ``Lot_Size``, ``Ownership`` and ``Number``.

    Owners are drawn as circles in color ``C1`` and nonowners as diamonds in color
    ``C0``. Every row is then labelled with its ``Number``, shifted 2 units to the
    right along the Income axis so the label does not sit on top of the marker.
    """
    # (class label, marker, color) in legend order
    class_styles = [
        ('Owner', 'o', 'C1'),
        ('Nonowner', 'D', 'C0'),
    ]
    for label, marker, color in class_styles:
        subset = data.loc[data['Ownership'] == label]
        ax.scatter(subset.Income, subset.Lot_Size, marker=marker,
                   label=label, color=color)
    for number, income, lot_size in zip(data['Number'], data['Income'], data['Lot_Size']):
        ax.annotate(number, (income + 2, lot_size))


fig, ax = plt.subplots(figsize=(6, 4))
plotDataset(ax, trainData)
# Highlight the household that is to be classified
ax.scatter(newHousehold.Income, newHousehold.Lot_Size, marker='*',
           label='New household', color='black', s=150)
ax.set_xlabel('Income')
ax.set_ylabel('Lot size')
ax.set_xlim(20, 120)
ax.set_ylim(13, 25)
ax.legend(loc='lower right')  # same position as the numeric location code 4
plt.show()
7.2 Running k-NN¶
In [3]:
Copied!
predictors = ['Income', 'Lot_Size']
outcome = 'Ownership'

# combine scaling and k-NN classifier into model using a pipeline
model = Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=3)),
])
model.fit(trainData[predictors], trainData[outcome])

# Select the predictor columns explicitly so the new record always has the
# same features, in the same order, as the data the pipeline was fitted on.
newHouseholdX = newHousehold[predictors]
print(model.predict(newHouseholdX))

# The k-NN step was fitted on standardized values, so the query point has to be
# standardized with the fitted scaler before asking for its neighbors.
zNewHousehold = model['scaler'].transform(newHouseholdX)
distances, indices = model['knn'].kneighbors(zNewHousehold)

# kneighbors returns positions within the training data, hence iloc
trainData.iloc[indices[0], :]
['Owner']
Out[3]:
| | Income | Lot_Size | Ownership | Number |
|---|---|---|---|---|
| 3 | 61.5 | 20.8 | Owner | 4 |
| 13 | 52.8 | 20.8 | Nonowner | 14 |
| 0 | 60.0 | 18.4 | Owner | 1 |
7.3 Accuracy of k-NN Predictions in Leave-One-Out Cross-Validation for Various Choices of k¶
In [4]:
Copied!
X = trainData[predictors]
y = trainData[outcome]

# Leave-one-out CV fits each fold on len(X) - 1 rows, so that is the largest
# number of neighbors that can be requested without an error.
max_k = len(X) - 1

results = []
for k in range(1, max_k + 1):
    # Build a fresh pipeline per k instead of calling set_params on the model
    # fitted in the earlier cell, so that model keeps its own configuration.
    candidate = Pipeline([
        ('scaler', preprocessing.StandardScaler()),
        ('knn', KNeighborsClassifier(n_neighbors=k)),
    ])
    accuracy = cross_val_score(candidate, X, y, cv=LeaveOneOut(),
                               scoring='accuracy').mean()
    results.append({'k': k, 'accuracy': accuracy})

pd.DataFrame(results).round(2)
Out[4]:
| | k | accuracy |
|---|---|---|
| 0 | 1 | 0.71 |
| 1 | 2 | 0.50 |
| 2 | 3 | 0.71 |
| 3 | 4 | 0.71 |
| 4 | 5 | 0.79 |
| 5 | 6 | 0.71 |
| 6 | 7 | 0.79 |
| 7 | 8 | 0.71 |
| 8 | 9 | 0.71 |
| 9 | 10 | 0.57 |
| 10 | 11 | 0.50 |
| 11 | 12 | 0.43 |
| 12 | 13 | 0.57 |
7.4 Classifying a New Household Using the Best k (k = 5)¶
In [5]:
Copied!
# Retrain with full dataset
best_k = 5  # number of neighbors used for the final classifier
knn = Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=best_k)),
])
knn.fit(mower_df[predictors], mower_df[outcome])

# Select the predictor columns explicitly so the new record always has the
# same features, in the same order, as the data the pipeline was fitted on.
newHouseholdX = newHousehold[predictors]

# Standardize the query point with the fitted scaler before the neighbor lookup,
# because the k-NN step itself only ever saw standardized values.
zNewHousehold = knn['scaler'].transform(newHouseholdX)
distances, indices = knn['knn'].kneighbors(zNewHousehold)

print('Predicted class', knn.predict(newHouseholdX))
print('Probability', knn.predict_proba(newHouseholdX))
print('Distances',distances)
print('Indices', indices)
# indices are positions within mower_df, hence iloc
print(mower_df.iloc[indices[0], :])
Predicted class ['Owner']
Probability [[0.2 0.8]]
Distances [[0.34532669 0.46448259 0.50133206 0.67307542 0.71721526]]
Indices [[ 3 8 13 0 2]]
Income Lot_Size Ownership Number
3 61.5 20.8 Owner 4
8 69.0 20.0 Owner 9
13 52.8 20.8 Nonowner 14
0 60.0 18.4 Owner 1
2 64.8 21.6 Owner 3
7.5 Column Specific Transformations¶
In [6]:
Copied!
# Small example frame with two categorical and two numeric columns
df = pd.DataFrame({
    'Family': ['Single', 'Married', 'Married', 'Single', 'Married', 'Married'],
    'Car': ['SUV', 'SUV', 'Sports', 'Truck', 'Sports', 'Sports'],
    'Income': [49, 34, 11, 100, 45, 29],
    'CreditCard': [1, 1, 3, 1, 2, 2],
})

# Apply a different transformation to each group of columns; the output columns
# follow the order of the transformers listed here.
preprocess = ColumnTransformer([
    # one-hot encode the categoricals; drop='if_binary' keeps a single column for two-level features
    ('encoder', preprocessing.OneHotEncoder(drop='if_binary'), ['Family', 'Car']),
    # rescale Income to zero mean and unit variance
    ('standardize', preprocessing.StandardScaler(), ['Income']),
    # copy CreditCard through untouched
    ('unchanged', 'passthrough', ['CreditCard']),
])
preprocess.fit_transform(df).round(4)
Out[6]:
array([[ 1. , 1. , 0. , 0. , 0.157 , 1. ],
[ 0. , 1. , 0. , 0. , -0.3864, 1. ],
[ 0. , 0. , 1. , 0. , -1.2194, 3. ],
[ 1. , 0. , 0. , 1. , 2.0042, 1. ],
[ 0. , 0. , 1. , 0. , 0.0121, 2. ],
[ 0. , 0. , 1. , 0. , -0.5675, 2. ]])
In [ ]:
Copied!