Chapter 15 Association Rules and Collaborative Filtering
Original code credit: Shmueli, Galit; Bruce, Peter C.; Gedeck, Peter; Patel, Nitin R. Machine Learning for Business Analytics. Wiley.
Modifications have been made from the original textbook examples due to version changes in library dependencies and/or for clarity.
Import Libraries
In [21]:
import os
import heapq
import random
from collections import defaultdict
import pandas as pd
import matplotlib.pylab as plt
from mlxtend.frequent_patterns import apriori, association_rules
import surprise
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
15.1 Association Rules
Example 1: Synthetic Data on Purchases of Phone Faceplates
In [4]:
# Load and preprocess the data set
fp_df = pd.read_csv(os.path.join('..', 'data', 'Faceplate.csv'))
fp_df.set_index('Transaction', inplace=True)
fp_df = fp_df.astype(bool)

# Create frequent itemsets
itemsets = apriori(fp_df, min_support=0.2, use_colnames=True)

# Convert the itemsets into rules
rules = association_rules(itemsets, num_itemsets=len(fp_df), metric='confidence', min_threshold=0.5)
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage']].sort_values(by=['lift'], ascending=False).head(6)
Out[4]:
| | antecedents | consequents | support | confidence | lift | leverage |
|---|---|---|---|---|---|---|
| 12 | (White, Red) | (Green) | 0.2 | 0.5 | 2.500000 | 0.12 |
| 15 | (Green) | (White, Red) | 0.2 | 1.0 | 2.500000 | 0.12 |
| 4 | (Green) | (Red) | 0.2 | 1.0 | 1.666667 | 0.08 |
| 13 | (White, Green) | (Red) | 0.2 | 1.0 | 1.666667 | 0.08 |
| 7 | (Orange) | (White) | 0.2 | 1.0 | 1.428571 | 0.06 |
| 8 | (Green) | (White) | 0.2 | 1.0 | 1.428571 | 0.06 |
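The metrics in this table can be verified directly from the transaction matrix. Below is a minimal check of the top rule, (White, Red) => (Green), assuming the item columns are named White, Red, and Green as they appear in the rules above; this is a sanity-check sketch, not part of the textbook code.

# Recompute support, confidence, lift, and leverage for (White, Red) => (Green)
# from the boolean transaction matrix.
sup_ant = (fp_df['White'] & fp_df['Red']).mean()   # support of the antecedent
sup_con = fp_df['Green'].mean()                    # support of the consequent
sup_rule = (fp_df['White'] & fp_df['Red'] & fp_df['Green']).mean()
confidence = sup_rule / sup_ant
lift = confidence / sup_con
leverage = sup_rule - sup_ant * sup_con
print(f'support={sup_rule:.2f}, confidence={confidence:.2f}, '
      f'lift={lift:.2f}, leverage={leverage:.2f}')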
Example 2: Rules for Similar Book Purchases
In [6]:
# Load the dataset
all_books_df = pd.read_csv(os.path.join('..', 'data', 'CharlesBookClub.csv'))
ignore = ['Seq#', 'ID#', 'Gender', 'M', 'R', 'F', 'FirstPurch', 'Related Purchase',
          'Mcode', 'Rcode', 'Fcode', 'Yes_Florence', 'No_Florence']
count_books = all_books_df.drop(columns=ignore)
count_books[count_books > 0] = 1
count_books = count_books.astype(bool)

# Create frequent itemsets and rules
itemsets = apriori(count_books, min_support=200/4000, use_colnames=True)
rules = association_rules(itemsets, num_itemsets=len(count_books), metric='confidence', min_threshold=0.5)
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage']].sort_values(by=['lift'], ascending=False).head(10)
Out[6]:
| | antecedents | consequents | support | confidence | lift | leverage |
|---|---|---|---|---|---|---|
| 64 | (RefBks, YouthBks) | (ChildBks, CookBks) | 0.05525 | 0.680000 | 2.809917 | 0.035588 |
| 73 | (DoItYBks, RefBks) | (ChildBks, CookBks) | 0.06125 | 0.662162 | 2.736207 | 0.038865 |
| 60 | (DoItYBks, YouthBks) | (ChildBks, CookBks) | 0.06700 | 0.648910 | 2.681448 | 0.042014 |
| 80 | (RefBks, GeogBks) | (ChildBks, CookBks) | 0.05025 | 0.614679 | 2.539995 | 0.030467 |
| 69 | (GeogBks, YouthBks) | (ChildBks, CookBks) | 0.06325 | 0.605263 | 2.501087 | 0.037961 |
| 77 | (DoItYBks, GeogBks) | (ChildBks, CookBks) | 0.06050 | 0.599010 | 2.475248 | 0.036058 |
| 67 | (GeogBks, ChildBks, CookBks) | (YouthBks) | 0.06325 | 0.577626 | 2.424452 | 0.037162 |
| 72 | (RefBks, ChildBks, CookBks) | (DoItYBks) | 0.06125 | 0.591787 | 2.323013 | 0.034883 |
| 48 | (DoItYBks, GeogBks) | (YouthBks) | 0.05450 | 0.539604 | 2.264864 | 0.030437 |
| 63 | (RefBks, ChildBks, CookBks) | (YouthBks) | 0.05525 | 0.533816 | 2.240573 | 0.030591 |
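With a support cutoff of 200/4000 = 5%, the full rule list runs well past the ten rows shown. A small pruning sketch on the rules DataFrame from the cell above (the thresholds are illustrative, not from the textbook):

# Keep only rules with lift above 2 and confidence of at least 60%, then
# rank by leverage for review; thresholds chosen purely for illustration.
strong = rules[(rules['lift'] > 2) & (rules['confidence'] >= 0.6)]
strong.sort_values(by='leverage', ascending=False)[
    ['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage']].head()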
15.2 Collaborative Filtering
Example 3: Netflix Prize Contest
In [7]:
random.seed(0)
nratings = 5000
randomData = pd.DataFrame({
    'itemID': [random.randint(0, 99) for _ in range(nratings)],
    'userID': [random.randint(0, 999) for _ in range(nratings)],
    'rating': [random.randint(1, 5) for _ in range(nratings)],
})

def get_top_n(predictions, n=10):
    # First map the predictions to each user
    byUser = defaultdict(list)
    for p in predictions:
        byUser[p.uid].append(p)
    # For each user, reduce the predictions to the top-n by estimated rating
    for uid, userPredictions in byUser.items():
        byUser[uid] = heapq.nlargest(n, userPredictions, key=lambda p: p.est)
    return byUser
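get_top_n groups the Prediction objects by user and keeps, for each user, the n predictions with the highest estimated rating. A quick check with hand-made predictions (surprise.Prediction is a namedtuple with fields uid, iid, r_ui, est, details; the values below are made up):

# Fabricated predictions for one user; expect item 1 (est 4.5), then item 3 (est 4.0).
demo = [surprise.Prediction('u1', i, None, est, {})
        for i, est in enumerate([3.0, 4.5, 2.0, 4.0])]
for p in get_top_n(demo, n=2)['u1']:
    print(p.iid, p.est)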
In [8]:
# Convert the data set into the format required by the surprise package.
# The columns must correspond to user id, item id, and rating (in that order).
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(randomData[['userID', 'itemID', 'rating']], reader)

# Split into training and test sets
trainset, testset = train_test_split(data, test_size=0.25, random_state=1)

## User-based filtering
# Compute the cosine similarity between users
sim_options = {'name': 'cosine', 'user_based': True}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

# Predict ratings for the user-item pairs held out in the test set
predictions = algo.test(testset)

# Print the recommended items for each user
top_n = get_top_n(predictions, n=4)
print('Top-4 recommended items for each user')
for uid, user_ratings in list(top_n.items())[:5]:
    print('User {}'.format(uid))
    for prediction in user_ratings:
        print(' Item {0.iid} ({0.est:.2f})'.format(prediction), end='')
    print()
Computing the cosine similarity matrix...
Done computing similarity matrix.
Top-4 recommended items for each user
User 6
 Item 6 (5.00) Item 77 (2.50) Item 60 (1.00)
User 222
 Item 77 (3.50) Item 75 (2.78)
User 424
 Item 14 (3.50) Item 45 (3.10) Item 54 (2.34)
User 87
 Item 27 (3.00) Item 54 (3.00) Item 82 (3.00) Item 32 (1.00)
User 121
 Item 98 (3.48) Item 32 (2.83)
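The similarity measure and neighborhood size are the main levers of KNNBasic. A variation sketch (pearson and min_support are standard scikit-surprise similarity options; the particular values are illustrative):

# Use Pearson correlation instead of cosine, require at least 3 co-rated
# items per user pair, and cap the neighborhood at 20 users.
sim_options = {'name': 'pearson', 'user_based': True, 'min_support': 3}
algo = KNNBasic(k=20, sim_options=sim_options)
algo.fit(trainset)
predictions = algo.test(testset)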
In [9]:
## Item-based filtering
trainset = data.build_full_trainset()

# Compute the cosine similarity between items
sim_options = {'name': 'cosine', 'user_based': False}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

# Predict the rating of user 383 for item 7
algo.predict(383, 7)
Computing the cosine similarity matrix...
Done computing similarity matrix.
Out[9]:
Prediction(uid=383, iid=7, r_ui=None, est=2.3661840936304324, details={'actual_k': 4, 'was_impossible': False})
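The returned Prediction is a namedtuple: est is the predicted rating, and details['actual_k'] reports how many neighbors were actually used. For example:

# Unpack the fields of the Prediction shown above.
pred = algo.predict(383, 7)
print(f'estimated rating: {pred.est:.2f}')
print(f'neighbors used: {pred.details["actual_k"]}')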
Example 4: Predicting Movie Ratings with MovieLens Data
In [18]:
# Load the MovieLens data
ratings = pd.read_csv(os.path.join('..', 'data', 'MovieLensRatings.csv'))
movies = pd.read_csv(os.path.join('..', 'data', 'MovieLensMovies.csv'))

# Create a dictionary of movie titles keyed by movieId
moviesById = {movie['movieId']: movie['title'] for _, movie in movies.iterrows()}

# Convert the ratings into a dataset suitable for scikit-surprise and split
# into training and holdout sets
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
trainset, holdoutset = train_test_split(data, test_size=0.2, random_state=1)
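A quick sanity check of what was loaded, using the column names from the cell above (the printout is just for inspection, not from the textbook):

# Inspect the shapes and a few ratings before modeling.
print(ratings.shape, movies.shape)
print(ratings[['userId', 'movieId', 'rating']].head())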
In [19]:
# UBCF model and prediction
sim_options = {'name': 'cosine', 'user_based': True}
ubcf = KNNBasic(sim_options=sim_options)
ubcf.fit(trainset)

# Predict ratings for the user-item pairs held out in the holdout set
predictions = ubcf.test(holdoutset)
top_n = get_top_n(predictions, n=4)
print('UBCF Top-4 recommended items for each user')
for uid, user_ratings in list(top_n.items())[:3]:
    print(f'User {uid}')
    for prediction in user_ratings:
        print(f' Est. {prediction.est:.2f} / act. {prediction.r_ui}:',
              f' {moviesById[prediction.iid]:40s}')

# IBCF model and prediction
sim_options = {'name': 'cosine', 'user_based': False}
ibcf = KNNBasic(sim_options=sim_options)
ibcf.fit(trainset)

# Predictions
top_n = get_top_n(ibcf.test(holdoutset), n=4)
print('IBCF Top-4 recommended items for each user')
for uid, user_ratings in list(top_n.items())[:3]:
    print(f'User {uid}')
    for prediction in user_ratings:
        print(f' Est. {prediction.est:.2f} / act. {prediction.r_ui}:',
              f' {moviesById[prediction.iid]:40s}')
Computing the cosine similarity matrix...
Done computing similarity matrix.
UBCF Top-4 recommended items for each user
User 469
 Est. 4.45 / act. 4.0:  Usual Suspects, The (1995)
 Est. 4.36 / act. 5.0:  Fargo (1996)
 Est. 4.36 / act. 3.0:  Star Wars: Episode VI - Return of the Jedi (1983)
 Est. 4.33 / act. 5.0:  Harold and Maude (1971)
User 187
 Est. 4.39 / act. 4.5:  Taxi Driver (1976)
 Est. 4.34 / act. 3.0:  Fight Club (1999)
 Est. 4.33 / act. 5.0:  Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
 Est. 4.32 / act. 4.5:  Aliens (1986)
User 399
 Est. 4.24 / act. 5.0:  Forrest Gump (1994)
 Est. 4.18 / act. 5.0:  Lord of the Rings: The Two Towers, The (2002)
 Est. 4.11 / act. 5.0:  Back to the Future (1985)
 Est. 4.07 / act. 2.5:  Terminator 2: Judgment Day (1991)
Computing the cosine similarity matrix...
Done computing similarity matrix.
IBCF Top-4 recommended items for each user
User 469
 Est. 4.00 / act. 3.0:  Bug's Life, A (1998)
 Est. 3.98 / act. 5.0:  Exorcist, The (1973)
 Est. 3.97 / act. 5.0:  Fish Called Wanda, A (1988)
 Est. 3.95 / act. 2.0:  Rocky III (1982)
User 187
 Est. 4.33 / act. 5.0:  Shaun of the Dead (2004)
 Est. 4.30 / act. 4.5:  Aliens (1986)
 Est. 4.29 / act. 4.0:  Pan's Labyrinth (Laberinto del fauno, El) (2006)
 Est. 4.28 / act. 3.0:  Zodiac (2007)
User 399
 Est. 3.97 / act. 3.5:  Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)
 Est. 3.97 / act. 5.0:  Lord of the Rings: The Two Towers, The (2002)
 Est. 3.97 / act. 2.5:  Terminator 2: Judgment Day (1991)
 Est. 3.97 / act. 5.0:  Back to the Future (1985)
In [22]:
ubcf_pred = ubcf.test(holdoutset)
ibcf_pred = ibcf.test(holdoutset)

# Baseline: random predictions drawn uniformly from the rating scale
random_pred = [surprise.Prediction(0, 0, random.randint(1, 5),
                                   random.randint(1, 5), None)
               for _ in range(len(holdoutset))]

pd.DataFrame({
    'UBCF': {'RMSE': surprise.accuracy.rmse(ubcf_pred, verbose=False),
             'MSE': surprise.accuracy.mse(ubcf_pred, verbose=False),
             'MAE': surprise.accuracy.mae(ubcf_pred, verbose=False)},
    'IBCF': {'RMSE': surprise.accuracy.rmse(ibcf_pred, verbose=False),
             'MSE': surprise.accuracy.mse(ibcf_pred, verbose=False),
             'MAE': surprise.accuracy.mae(ibcf_pred, verbose=False)},
    'Random': {'RMSE': surprise.accuracy.rmse(random_pred, verbose=False),
               'MSE': surprise.accuracy.mse(random_pred, verbose=False),
               'MAE': surprise.accuracy.mae(random_pred, verbose=False)},
}).transpose().round(3)
Out[22]:
| | RMSE | MSE | MAE |
|---|---|---|---|
| UBCF | 0.973 | 0.948 | 0.750 |
| IBCF | 0.978 | 0.957 | 0.762 |
| Random | 2.008 | 4.033 | 1.608 |
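Both k-NN models clearly outperform the random baseline. A single holdout split can be noisy, so the comparison can be backed up with cross-validation; a minimal sketch using surprise's built-in cross_validate (the 5-fold choice is illustrative):

# 5-fold cross-validation of the user-based model on the full dataset.
from surprise.model_selection import cross_validate
cv = cross_validate(KNNBasic(sim_options={'name': 'cosine', 'user_based': True}),
                    data, measures=['RMSE', 'MAE'], cv=5, verbose=False)
print(pd.DataFrame(cv).mean().round(3))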