Chapter 14: Association Rules and Collaborative Filtering
Original Code Credit: Shmueli, Galit; Bruce, Peter C.; Gedeck, Peter; Patel, Nitin R. *Data Mining for Business Analytics*. Wiley.
Modifications have been made to the original textbook examples to accommodate version changes in library dependencies and for clarity.
Download this notebook and data here.
Import Libraries
import os
import heapq
import random
from collections import defaultdict
import pandas as pd
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori, association_rules
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
14.1 Association Rules
Example 1: Synthetic Data on Purchases of Phone Faceplates
# Load and preprocess data set
fp_df = pd.read_csv(os.path.join('data', 'Faceplate.csv'))
fp_df.set_index('Transaction', inplace=True)
fp_df = fp_df.astype(bool)  # apriori expects one-hot (boolean) columns
# create frequent itemsets
itemsets = apriori(fp_df, min_support=0.2, use_colnames=True)
# convert into rules
rules = association_rules(itemsets, num_itemsets=len(fp_df), metric='confidence', min_threshold=0.5)
rules.sort_values(by=['lift'], ascending=False).head(6)
| | antecedents | consequents | antecedent support | consequent support | support | confidence | lift | representativity | leverage | conviction | zhangs_metric | jaccard | certainty | kulczynski |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12 | (Red, White) | (Green) | 0.4 | 0.2 | 0.2 | 0.5 | 2.500000 | 1.0 | 0.12 | 1.6 | 1.000 | 0.500000 | 0.375 | 0.750000 |
| 15 | (Green) | (Red, White) | 0.2 | 0.4 | 0.2 | 1.0 | 2.500000 | 1.0 | 0.12 | inf | 0.750 | 0.500000 | 1.000 | 0.750000 |
| 4 | (Green) | (Red) | 0.2 | 0.6 | 0.2 | 1.0 | 1.666667 | 1.0 | 0.08 | inf | 0.500 | 0.333333 | 1.000 | 0.666667 |
| 14 | (White, Green) | (Red) | 0.2 | 0.6 | 0.2 | 1.0 | 1.666667 | 1.0 | 0.08 | inf | 0.500 | 0.333333 | 1.000 | 0.666667 |
| 7 | (Orange) | (White) | 0.2 | 0.7 | 0.2 | 1.0 | 1.428571 | 1.0 | 0.06 | inf | 0.375 | 0.285714 | 1.000 | 0.642857 |
| 8 | (Green) | (White) | 0.2 | 0.7 | 0.2 | 1.0 | 1.428571 | 1.0 | 0.06 | inf | 0.375 | 0.285714 | 1.000 | 0.642857 |
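The reported metrics are interrelated: lift is confidence divided by the consequent's support, and leverage is the rule's support minus the product of the antecedent and consequent supports. A quick sanity check against the rules frame computed above (a minimal sketch) confirms both identities:
top6 = rules.sort_values(by=['lift'], ascending=False).head(6)
# lift = confidence / consequent support
assert ((top6['confidence'] / top6['consequent support'] - top6['lift']).abs() < 1e-9).all()
# leverage = support - antecedent support * consequent support
assert ((top6['support'] - top6['antecedent support'] * top6['consequent support']
         - top6['leverage']).abs() < 1e-9).all()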
Example 2: Rules for Similar Book Purchases
# load dataset
all_books_df = pd.read_csv(os.path.join('data', 'CharlesBookClub.csv'))
ignore = ['Seq#', 'ID#', 'Gender', 'M', 'R', 'F', 'FirstPurch', 'Related Purchase',
'Mcode', 'Rcode', 'Fcode', 'Yes_Florence', 'No_Florence']
count_books = all_books_df.drop(columns=ignore)
count_books[count_books > 0] = 1
count_books = count_books.astype(bool)  # convert the 0/1 flags to booleans for apriori
# create frequent itemsets and rules
itemsets = apriori(count_books, min_support=200/4000, use_colnames=True)
rules = association_rules(itemsets, num_itemsets=len(count_books), metric='confidence', min_threshold=0.5)
rules.sort_values(by=['lift'], ascending=False).head(10)
| | antecedents | consequents | antecedent support | consequent support | support | confidence | lift | representativity | leverage | conviction | zhangs_metric | jaccard | certainty | kulczynski |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 64 | (YouthBks, RefBks) | (ChildBks, CookBks) | 0.08125 | 0.24200 | 0.05525 | 0.680000 | 2.809917 | 1.0 | 0.035588 | 2.368750 | 0.701080 | 0.206157 | 0.577836 | 0.454153 |
| 73 | (RefBks, DoItYBks) | (ChildBks, CookBks) | 0.09250 | 0.24200 | 0.06125 | 0.662162 | 2.736207 | 1.0 | 0.038865 | 2.243680 | 0.699207 | 0.224154 | 0.554304 | 0.457631 |
| 60 | (YouthBks, DoItYBks) | (ChildBks, CookBks) | 0.10325 | 0.24200 | 0.06700 | 0.648910 | 2.681448 | 1.0 | 0.042014 | 2.158993 | 0.699266 | 0.240791 | 0.536821 | 0.462885 |
| 80 | (GeogBks, RefBks) | (ChildBks, CookBks) | 0.08175 | 0.24200 | 0.05025 | 0.614679 | 2.539995 | 1.0 | 0.030467 | 1.967190 | 0.660276 | 0.183729 | 0.491661 | 0.411162 |
| 69 | (GeogBks, YouthBks) | (ChildBks, CookBks) | 0.10450 | 0.24200 | 0.06325 | 0.605263 | 2.501087 | 1.0 | 0.037961 | 1.920267 | 0.670211 | 0.223301 | 0.479239 | 0.433313 |
| 77 | (GeogBks, DoItYBks) | (ChildBks, CookBks) | 0.10100 | 0.24200 | 0.06050 | 0.599010 | 2.475248 | 1.0 | 0.036058 | 1.890321 | 0.662959 | 0.214159 | 0.470989 | 0.424505 |
| 67 | (GeogBks, ChildBks, CookBks) | (YouthBks) | 0.10950 | 0.23825 | 0.06325 | 0.577626 | 2.424452 | 1.0 | 0.037162 | 1.803495 | 0.659782 | 0.222320 | 0.445521 | 0.421552 |
| 70 | (ChildBks, RefBks, CookBks) | (DoItYBks) | 0.10350 | 0.25475 | 0.06125 | 0.591787 | 2.323013 | 1.0 | 0.034883 | 1.825642 | 0.635276 | 0.206229 | 0.452247 | 0.416110 |
| 48 | (GeogBks, DoItYBks) | (YouthBks) | 0.10100 | 0.23825 | 0.05450 | 0.539604 | 2.264864 | 1.0 | 0.030437 | 1.654554 | 0.621215 | 0.191396 | 0.395607 | 0.384178 |
| 63 | (ChildBks, RefBks, CookBks) | (YouthBks) | 0.10350 | 0.23825 | 0.05525 | 0.533816 | 2.240573 | 1.0 | 0.030591 | 1.634013 | 0.617608 | 0.192845 | 0.388010 | 0.382858 |
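In practice one often wants only the rules that lead to a particular item. Because mlxtend stores antecedents and consequents as frozensets, a membership test does the filtering; the sketch below (reusing the rules frame above) keeps rules whose consequent includes 'CookBks':
# Keep only rules that recommend cookbooks
cook_rules = rules[rules['consequents'].apply(lambda items: 'CookBks' in items)]
print(cook_rules.sort_values(by='lift', ascending=False)
      [['antecedents', 'consequents', 'support', 'confidence', 'lift']].head())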
14.2 Collaborative Filtering
Example 3: Netflix Prize Contest
# Simulate a sample of ratings (in place of the actual Netflix data):
# 5000 ratings on a 1-5 scale for 100 items by 1000 users
random.seed(0)
nratings = 5000
randomData = pd.DataFrame({
    'itemID': [random.randint(0, 99) for _ in range(nratings)],
    'userID': [random.randint(0, 999) for _ in range(nratings)],
    'rating': [random.randint(1, 5) for _ in range(nratings)],
})
def get_top_n(predictions, n=10):
    # First map the predictions to each user
    byUser = defaultdict(list)
    for p in predictions:
        byUser[p.uid].append(p)
    # For each user, reduce the predictions to the top-n by estimated rating
    for uid, userPredictions in byUser.items():
        byUser[uid] = heapq.nlargest(n, userPredictions, key=lambda p: p.est)
    return byUser
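Because get_top_n touches only the uid, iid, and est attributes of each prediction, it can be exercised before fitting any model; the toy check below uses a namedtuple as a hypothetical stand-in for surprise's Prediction objects:
from collections import namedtuple

# Hypothetical stand-in for surprise's Prediction; only .uid, .iid, and .est are used
MockPrediction = namedtuple('MockPrediction', ['uid', 'iid', 'est'])
mock = [MockPrediction('u1', iid, est) for iid, est in enumerate([3.2, 4.8, 1.5, 4.1])]
top = get_top_n(mock, n=2)
print([(p.iid, p.est) for p in top['u1']])  # [(1, 4.8), (3, 4.1)]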
# Convert the data set into the format required by the surprise package
# The columns must correspond to user id, item id, and ratings (in that order)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(randomData[['userID', 'itemID', 'rating']], reader)
# Split into training and test set
trainset, testset = train_test_split(data, test_size=.25, random_state=1)
## User-based filtering
# compute cosine similarity between users
sim_options = {'name': 'cosine', 'user_based': True}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)
# predict ratings for the (u, i) pairs held out in the test set
predictions = algo.test(testset)
# Print the recommended items for each user
top_n = get_top_n(predictions, n=4)
print('Top-4 recommended items for each user')
for uid, user_ratings in list(top_n.items())[:5]:
    print('User {}'.format(uid))
    for prediction in user_ratings:
        print(' Item {0.iid} ({0.est:.2f})'.format(prediction), end='')
    print()
Computing the cosine similarity matrix...
Done computing similarity matrix.
Top-4 recommended items for each user
User 6
Item 6 (5.00) Item 77 (2.50) Item 60 (1.00)
User 222
Item 77 (3.50) Item 75 (2.78)
User 424
Item 14 (3.50) Item 45 (3.10) Item 54 (2.34)
User 87
Item 27 (3.00) Item 54 (3.00) Item 82 (3.00) Item 32 (1.00)
User 121
Item 98 (3.48) Item 32 (2.83)
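Top-N lists are only one way to inspect the model; surprise's accuracy module can score the same predictions object for overall error. A minimal sketch (the exact value depends on the simulated ratings):
from surprise import accuracy

# Root mean squared error over the held-out test set
accuracy.rmse(predictions)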
## Item-based filtering
# Rebuild the training set from the full data set (no holdout)
trainset = data.build_full_trainset()
# compute cosine similarity between items rather than users
sim_options = {'name': 'cosine', 'user_based': False}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)
# Predict rating for user 383 and item 7
algo.predict(383, 7)
Computing the cosine similarity matrix...
Done computing similarity matrix.
Prediction(uid=383, iid=7, r_ui=None, est=2.3661840936304324, details={'actual_k': 4, 'was_impossible': False})
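The fitted item-based model can also be queried for similar items directly. Below is a minimal sketch using surprise's get_neighbors, which operates on inner ids (it assumes item 7 appears in the training data, which is near-certain with 5,000 simulated ratings over 100 items):
# Translate the raw item id to surprise's inner id, find its 5 most similar items,
# then translate the neighbors back to raw ids
inner_id = trainset.to_inner_iid(7)
neighbors = algo.get_neighbors(inner_id, k=5)
print([trainset.to_raw_iid(inner) for inner in neighbors])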