
Chapter 14 Association Rules and Collaborative Filtering

Original Code Credit: Shmueli, Galit; Bruce, Peter C.; Gedeck, Peter; Patel, Nitin R. Data Mining for Business Analytics. Wiley.

The original textbook examples have been modified to accommodate version changes in library dependencies and/or for clarity.

Download this notebook and data here.

Import Libraries

import os
import heapq
import random
from collections import defaultdict
import pandas as pd

import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori, association_rules
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split

14.1 Association Rules

Example 1: Synthetic Data on Purchases of Phone Faceplates

# Load and preprocess data set 
fp_df = pd.read_csv(os.path.join('data', 'Faceplate.csv'))
fp_df.set_index('Transaction', inplace=True)
fp_df = fp_df.astype(bool)  # apriori expects a one-hot boolean DataFrame

# create frequent itemsets
itemsets = apriori(fp_df, min_support=0.2, use_colnames=True)

# convert into rules
rules = association_rules(itemsets, num_itemsets=len(fp_df), metric='confidence', min_threshold=0.5)
rules.sort_values(by=['lift'], ascending=False).head(6)
|    | antecedents    | consequents  | antecedent support | consequent support | support | confidence | lift     | representativity | leverage | conviction | zhangs_metric | jaccard  | certainty | kulczynski |
|----|----------------|--------------|--------------------|--------------------|---------|------------|----------|------------------|----------|------------|---------------|----------|-----------|------------|
| 12 | (Red, White)   | (Green)      | 0.4                | 0.2                | 0.2     | 0.5        | 2.500000 | 1.0              | 0.12     | 1.6        | 1.000         | 0.500000 | 0.375     | 0.750000   |
| 15 | (Green)        | (Red, White) | 0.2                | 0.4                | 0.2     | 1.0        | 2.500000 | 1.0              | 0.12     | inf        | 0.750         | 0.500000 | 1.000     | 0.750000   |
| 4  | (Green)        | (Red)        | 0.2                | 0.6                | 0.2     | 1.0        | 1.666667 | 1.0              | 0.08     | inf        | 0.500         | 0.333333 | 1.000     | 0.666667   |
| 14 | (White, Green) | (Red)        | 0.2                | 0.6                | 0.2     | 1.0        | 1.666667 | 1.0              | 0.08     | inf        | 0.500         | 0.333333 | 1.000     | 0.666667   |
| 7  | (Orange)       | (White)      | 0.2                | 0.7                | 0.2     | 1.0        | 1.428571 | 1.0              | 0.06     | inf        | 0.375         | 0.285714 | 1.000     | 0.642857   |
| 8  | (Green)        | (White)      | 0.2                | 0.7                | 0.2     | 1.0        | 1.428571 | 1.0              | 0.06     | inf        | 0.375         | 0.285714 | 1.000     | 0.642857   |
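
The extra columns are all derived from the same base quantities. As a quick sanity check (our addition, not part of the textbook example), lift and leverage can be recomputed directly from the support and confidence columns of the rules table:

import numpy as np

# lift = confidence / consequent support
assert np.allclose(rules['lift'],
                   rules['confidence'] / rules['consequent support'])
# leverage = support - antecedent support * consequent support
assert np.allclose(rules['leverage'],
                   rules['support'] -
                   rules['antecedent support'] * rules['consequent support'])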

Example 2: Rules for Similar Book Purchases

# load dataset
all_books_df = pd.read_csv(os.path.join('data', 'CharlesBookClub.csv'))
ignore = ['Seq#', 'ID#', 'Gender', 'M', 'R', 'F', 'FirstPurch', 'Related Purchase',
          'Mcode', 'Rcode', 'Fcode', 'Yes_Florence', 'No_Florence']
count_books = all_books_df.drop(columns=ignore)
# convert purchase counts to binary incidence, then to booleans for apriori
count_books[count_books > 0] = 1
count_books = count_books.astype(bool)
# create frequent itemsets and rules
itemsets = apriori(count_books, min_support=200/4000, use_colnames=True)
rules = association_rules(itemsets, num_itemsets=len(count_books), metric='confidence', min_threshold=0.5)
rules.sort_values(by=['lift'], ascending=False).head(10)
|    | antecedents                  | consequents         | antecedent support | consequent support | support | confidence | lift     | representativity | leverage | conviction | zhangs_metric | jaccard  | certainty | kulczynski |
|----|------------------------------|---------------------|--------------------|--------------------|---------|------------|----------|------------------|----------|------------|---------------|----------|-----------|------------|
| 64 | (YouthBks, RefBks)           | (ChildBks, CookBks) | 0.08125            | 0.24200            | 0.05525 | 0.680000   | 2.809917 | 1.0              | 0.035588 | 2.368750   | 0.701080      | 0.206157 | 0.577836  | 0.454153   |
| 73 | (RefBks, DoItYBks)           | (ChildBks, CookBks) | 0.09250            | 0.24200            | 0.06125 | 0.662162   | 2.736207 | 1.0              | 0.038865 | 2.243680   | 0.699207      | 0.224154 | 0.554304  | 0.457631   |
| 60 | (YouthBks, DoItYBks)         | (ChildBks, CookBks) | 0.10325            | 0.24200            | 0.06700 | 0.648910   | 2.681448 | 1.0              | 0.042014 | 2.158993   | 0.699266      | 0.240791 | 0.536821  | 0.462885   |
| 80 | (GeogBks, RefBks)            | (ChildBks, CookBks) | 0.08175            | 0.24200            | 0.05025 | 0.614679   | 2.539995 | 1.0              | 0.030467 | 1.967190   | 0.660276      | 0.183729 | 0.491661  | 0.411162   |
| 69 | (GeogBks, YouthBks)          | (ChildBks, CookBks) | 0.10450            | 0.24200            | 0.06325 | 0.605263   | 2.501087 | 1.0              | 0.037961 | 1.920267   | 0.670211      | 0.223301 | 0.479239  | 0.433313   |
| 77 | (GeogBks, DoItYBks)          | (ChildBks, CookBks) | 0.10100            | 0.24200            | 0.06050 | 0.599010   | 2.475248 | 1.0              | 0.036058 | 1.890321   | 0.662959      | 0.214159 | 0.470989  | 0.424505   |
| 67 | (GeogBks, ChildBks, CookBks) | (YouthBks)          | 0.10950            | 0.23825            | 0.06325 | 0.577626   | 2.424452 | 1.0              | 0.037162 | 1.803495   | 0.659782      | 0.222320 | 0.445521  | 0.421552   |
| 70 | (ChildBks, RefBks, CookBks)  | (DoItYBks)          | 0.10350            | 0.25475            | 0.06125 | 0.591787   | 2.323013 | 1.0              | 0.034883 | 1.825642   | 0.635276      | 0.206229 | 0.452247  | 0.416110   |
| 48 | (GeogBks, DoItYBks)          | (YouthBks)          | 0.10100            | 0.23825            | 0.05450 | 0.539604   | 2.264864 | 1.0              | 0.030437 | 1.654554   | 0.621215      | 0.191396 | 0.395607  | 0.384178   |
| 63 | (ChildBks, RefBks, CookBks)  | (YouthBks)          | 0.10350            | 0.23825            | 0.05525 | 0.533816   | 2.240573 | 1.0              | 0.030591 | 1.634013   | 0.617608      | 0.192845 | 0.388010  | 0.382858   |
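
mlxtend stores antecedents and consequents as frozensets, which print awkwardly in reports. A small helper (our own addition, not part of mlxtend or the textbook code) renders them as plain strings:

def itemset_to_str(itemset):
    # Render a frozenset of item names as a sorted, comma-separated string
    return ', '.join(sorted(itemset))

top_rules = rules.sort_values(by='lift', ascending=False).head(10).copy()
top_rules['antecedents'] = top_rules['antecedents'].apply(itemset_to_str)
top_rules['consequents'] = top_rules['consequents'].apply(itemset_to_str)
print(top_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])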

14.2 Collaborative Filtering

Example 3: Netflix Prize Contest

random.seed(0)
nratings = 5000
randomData = pd.DataFrame({
    'itemID': [random.randint(0,99) for _ in range(nratings)],
    'userID': [random.randint(0,999) for _ in range(nratings)],
    'rating': [random.randint(1,5) for _ in range(nratings)]
})
def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    byUser = defaultdict(list)
    for p in predictions:
        byUser[p.uid].append(p)

    # For each user, reduce predictions to top-n
    for uid, userPredictions in byUser.items():
        byUser[uid] = heapq.nlargest(n, userPredictions, key=lambda p: p.est)
    return byUser
# Convert the data set into the format required by the surprise package
# The columns must correspond to user id, item id, and ratings (in that order)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(randomData[['userID', 'itemID', 'rating']], reader)
# Split into training and test set
trainset, testset = train_test_split(data, test_size=.25, random_state=1)
## User-based filtering
# compute cosine similarity between users 
sim_options = {'name': 'cosine', 'user_based': True}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)
# predict ratings for all pairs (u, i) that are NOT in the training set.
predictions = algo.test(testset) 
# Print the recommended items for each user
top_n = get_top_n(predictions, n=4)
print('Top-4 recommended items for each user')
for uid, user_ratings in list(top_n.items())[:5]:
    print('User {}'.format(uid))
    for prediction in user_ratings:
        print('  Item {0.iid} ({0.est:.2f})'.format(prediction), end='')
    print()
Computing the cosine similarity matrix...
Done computing similarity matrix.
Top-4 recommended items for each user
User 6
  Item 6 (5.00)  Item 77 (2.50)  Item 60 (1.00)
User 222
  Item 77 (3.50)  Item 75 (2.78)
User 424
  Item 14 (3.50)  Item 45 (3.10)  Item 54 (2.34)
User 87
  Item 27 (3.00)  Item 54 (3.00)  Item 82 (3.00)  Item 32 (1.00)
User 121
  Item 98 (3.48)  Item 32 (2.83)
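## Item-based filtering
# Retrain on the full dataset; with user_based=False, KNNBasic computes
# cosine similarity between items rather than between users.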
trainset = data.build_full_trainset()
sim_options = {'name': 'cosine', 'user_based': False}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)
# Predict rating for user 383 and item 7
algo.predict(383, 7)
Computing the cosine similarity matrix...
Done computing similarity matrix.
Prediction(uid=383, iid=7, r_ui=None, est=2.3661840936304324, details={'actual_k': 4, 'was_impossible': False})
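
For orientation, here is a sketch (ours, following the KNNBasic formula in the surprise documentation, not the library's actual code) of how such an estimate is formed: a similarity-weighted average of this user's ratings of the k most similar items.

def knn_basic_estimate(sims, ratings):
    # sims: cosine similarities of the neighboring items to the target item
    # ratings: the user's ratings of those neighboring items
    return sum(s * r for s, r in zip(sims, ratings)) / sum(sims)

# Hypothetical neighbors with similarities 0.9 and 0.5, rated 4 and 2:
print(knn_basic_estimate([0.9, 0.5], [4, 2]))  # approx. 3.29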