This notebook prepares the data for the recommendation list analysis.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import product
import ujson
from bookgender.config import data_dir
Load book gender data and clean it up:
book_gender = pd.read_csv('data/author-gender.csv.gz')
book_gender = book_gender.set_index('item')['gender']
# collapse the various 'no-*' resolution codes and unlinked books into 'unknown'
book_gender.loc[book_gender.str.startswith('no-')] = 'unknown'
book_gender.loc[book_gender == 'unlinked'] = 'unknown'
book_gender = book_gender.astype('category')
book_gender.describe()
book_gender.head()
And load hashes:
book_hash = pd.read_parquet('data/book-hash.parquet').rename(columns={'cluster': 'item'})
# dummy code: parity of the last hex digit of the MD5 hash, a pseudo-random 0/1 split
book_hash['dcode'] = book_hash['md5'].apply(lambda x: int(x[-1], 16) % 2)
book_hash = book_hash.set_index('item')
book_hash.head()
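To see what the dummy code does: the final hex digit of an MD5 hash is effectively uniform, so its parity splits books into two pseudo-random halves. A tiny illustration (the hash fragments are made up):
# made-up hash fragments; dcode is the last hex digit mod 2
for h in ['...9c4d1', '...b77a2']:
    print(h, '->', int(h[-1], 16) % 2)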
Load the user profile data:
profiles = pd.read_pickle('data/profile-data.pkl')
profiles.head()
datasets = list(profiles.index.levels[0])
datasets
And load the recommendations:
recs = pd.read_parquet('data/study-recs.parquet')
recs.rename(columns={'dataset': 'Set', 'algorithm': 'Algorithm'}, inplace=True)
recs.head()
The original paper truncated recommendation lists to 50 items. Let's do that too:
recs = recs[recs['rank'] <= 50]
recs.Set.unique()
recs.Algorithm.unique()
We will need to extract the implicit/explicit distinction from those names. In the new paper, we separate implicit and explicit data for presentation; these functions will help with that.
def select_implicit(data, reset=True):
    if reset:
        data = data.reset_index()
    # implicit runs carry an '-imp' algorithm suffix or an '-I' data set suffix
    implicit = data['Algorithm'].str.endswith('-imp') | data['Set'].str.endswith('-I')
    data = data[implicit].assign(Set=data['Set'].str.replace('-I', ''),
                                 Algorithm=data['Algorithm'].str.replace('-imp', ''))
    # WRLS is the implicit-feedback variant of ALS; unify the name
    data['Algorithm'] = data['Algorithm'].str.replace('wrls', 'als')
    return data
def select_explicit(data, reset=True):
    if reset:
        data = data.reset_index()
    # explicit runs are everything that is not implicit
    implicit = data['Algorithm'].str.endswith('-imp') | data['Set'].str.endswith('-I')
    data = data[~implicit].assign(Set=data['Set'].str.replace('-E', ''))
    return data
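As a quick sanity check, these helpers applied to a tiny hypothetical frame (the rows are made up) should split it into the -I/-imp runs and the rest:
demo = pd.DataFrame({
    'Set': ['BX-I', 'BX-E', 'GR-I', 'GR-E'],
    'Algorithm': ['wrls-imp', 'als', 'bpr-imp', 'user-user']
})
select_implicit(demo, reset=False)  # -> BX/als, GR/bpr (suffixes stripped, wrls renamed)
select_explicit(demo, reset=False)  # -> BX/als, GR/user-user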
And give ourselves a handy way to relabel algorithms:
algo_labels = {
'als': 'ALS',
'bpr': 'BPR',
'item-item': 'II',
'user-user': 'UU'
}
Now, let's compute the recommendation list gender data.
recs.drop(columns=['gender'], errors='ignore', inplace=True)
recs = recs.join(book_gender, on='item', how='left')
recs['gender'] = recs['gender'].fillna('unknown')
recs['gender'].describe()
And mix in the dummy code data:
recs.drop(columns=['dcode'], errors='ignore', inplace=True)
recs = recs.join(book_hash['dcode'], on='item', how='left')
recs.head()
Count up the statistics for each list by gender:
rec_stats = recs.groupby(['Set', 'Algorithm', 'user'])['gender'].value_counts().unstack(fill_value=0)
# de-categorize the columns so we can add summary columns alongside the gender counts
rec_stats.columns = rec_stats.columns.astype('object')
rec_stats['Total'] = rec_stats.sum(axis=1)
rec_stats['Known'] = rec_stats['male'] + rec_stats['female']
rec_stats['PropKnown'] = rec_stats['Known'] / rec_stats['Total']
rec_stats['PropFemale'] = rec_stats['female'] / rec_stats['Known']
rec_stats
rec_stats.info()
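A couple of quick consistency checks (these just assert invariants of the truncated lists):
# lists were truncated to 50, and PropFemale is a proportion where defined
assert (rec_stats['Total'] <= 50).all()
assert rec_stats['PropFemale'].dropna().between(0, 1).all()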
Mix in info from dummy codes:
rec_dc_stats = recs.groupby(['Set', 'Algorithm', 'user'])['dcode'].agg(['count', 'sum', 'mean'])
# count = items with a known dummy code, sum = items coded 1, mean = proportion coded 1
rec_dc_stats.rename(columns={'count': 'dcknown', 'sum': 'dcyes', 'mean': 'PropDC'}, inplace=True)
rec_dc_stats['dcyes'] = rec_dc_stats['dcyes'].astype('i4')
rec_dc_stats.head()
rec_stats = rec_stats.join(rec_dc_stats)
rec_stats.head()
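Since the dummy code is hash parity, it behaves like a fair coin, so PropDC should concentrate around 0.5; a quick look to confirm:
rec_stats['PropDC'].describe()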
Quick status check on the number of recommendation lists per algorithm, starting with implicit feedback:
select_implicit(rec_stats).groupby(['Set', 'Algorithm'])['Total'].count().unstack()
Explicit feedback:
select_explicit(rec_stats).groupby(['Set', 'Algorithm'])['Total'].count().unstack()
We also want to compute the makeup of non-personalized recommendations, to establish a baseline level for each data set.
az_ratings = pd.read_parquet('data/AZ/ratings.parquet')
bxi_ratings = pd.read_parquet('data/BX-I/ratings.parquet')
bxe_ratings = pd.read_parquet('data/BX-E/ratings.parquet')
gre_ratings = pd.read_parquet('data/GR-E/ratings.parquet')
gri_ratings = pd.read_parquet('data/GR-I/ratings.parquet')
# the 50 most-rated items in each set stand in for a most-popular recommendation list
istats = pd.concat({
'AZ': az_ratings.groupby('item')['user'].count().nlargest(50),
'BX-I': bxi_ratings.groupby('item')['user'].count().nlargest(50),
'BX-E': bxe_ratings.groupby('item')['user'].count().nlargest(50),
'GR-I': gri_ratings.groupby('item')['user'].count().nlargest(50),
'GR-E': gre_ratings.groupby('item')['user'].count().nlargest(50)
}, names=['Set'])
istats = istats.reset_index(name='count')
istats.head()
irecs = istats.join(book_gender, on='item', how='left')
irecs['gender'] = irecs['gender'].fillna('unknown')
irecs.head()
pop_gender = irecs.groupby(['Set', 'gender']).item.count().unstack().fillna(0).astype('i4')
pop_gender.columns = pop_gender.columns.astype('object')
pop_gender['Total'] = pop_gender.sum(axis=1)
pop_gender['Known'] = pop_gender['male'] + pop_gender['female']
pop_gender['PropKnown'] = pop_gender['Known'] / pop_gender['Total']
pop_gender['PropFemale'] = pop_gender['female'] / pop_gender['Known']
pop_gender
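For a quick visual of the popularity baseline (just a convenience plot):
pop_gender['PropFemale'].plot.bar()
plt.ylabel('PropFemale')
plt.show()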
We do the same with the highest-average-rated items; this only makes sense for the explicit-feedback sets.
astats = pd.concat({
    'AZ': az_ratings.groupby('item')['rating'].mean().nlargest(50),
    'BX-E': bxe_ratings.groupby('item')['rating'].mean().nlargest(50),
    'GR-E': gre_ratings.groupby('item')['rating'].mean().nlargest(50)
}, names=['Set'])
astats = astats.reset_index(name='avg_rating')
astats.head()
arecs = astats.join(book_gender, on='item', how='left')
arecs['gender'] = arecs['gender'].fillna('unknown')
arecs.head()
avg_gender = arecs.groupby(['Set', 'gender']).item.count().unstack().fillna(0).astype('i4')
avg_gender.columns = avg_gender.columns.astype('object')
avg_gender['Total'] = avg_gender.sum(axis=1)
avg_gender['Known'] = avg_gender['male'] + avg_gender['female']
avg_gender['PropKnown'] = avg_gender['Known'] / avg_gender['Total']
avg_gender['PropFemale'] = avg_gender['female'] / avg_gender['Known']
avg_gender
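It can help to see the two non-personalized baselines side by side; average-rating lists only exist for the explicit sets, so the implicit rows will have missing values:
pd.concat({'Popular': pop_gender['PropFemale'],
           'AvgRating': avg_gender['PropFemale']}, axis=1)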
We want to understand the composition of the recommendation lists: how many recommendations each algorithm produces, and how many of them are distinct items.
list_counts = recs.groupby(['Set', 'Algorithm'])['user'].nunique()
list_counts.name = 'Lists'
item_counts = recs.groupby(['Set', 'Algorithm'])['item'].agg(['count', 'nunique'])
item_counts.rename(columns={'count': 'Recs', 'nunique': 'Distinct'}, inplace=True)
item_counts = item_counts.join(list_counts)
item_counts['FracDistinct'] = item_counts['Distinct'] / item_counts['Recs']
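A quick peek at the raw table before reshaping:
item_counts.head()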
What does this look like for implicit feedback?
# reshape to one row per algorithm, with (Set, metric) column pairs
df = select_implicit(item_counts).set_index(['Algorithm', 'Set']).stack().reorder_levels([0, 2, 1]).unstack().unstack()
df = df.rename(index=algo_labels)
df
def f_n(n):
    "Format a count with thousands separators."
    return '{:,.0f}'.format(n)

def f_pct(n):
    "Format a fraction as a percentage."
    return '{:.1f}%'.format(n * 100)
print(df.swaplevel(axis=1).loc[:, ['Recs', 'Distinct', 'FracDistinct']].to_latex(formatters=(
    # after selecting on the metric level, columns group by metric:
    # three sets of Recs, then Distinct, then FracDistinct
    [f_n] * 6 + [f_pct] * 3
)))
And for explicit feedback?
df = select_explicit(item_counts).set_index(['Algorithm', 'Set']).stack().reorder_levels([0, 2, 1]).unstack().unstack()
df = df.rename(index=algo_labels)
df
print(df.swaplevel(axis=1).loc[:, ['Recs', 'Distinct', 'FracDistinct']].to_latex(formatters=(
    [f_n] * 6 + [f_pct] * 3
)))
Mean per-list proportion female, implicit feedback:
select_implicit(rec_stats).groupby(['Algorithm', 'Set']).PropFemale.mean().unstack()
And its standard deviation:
np.sqrt(select_implicit(rec_stats).groupby(['Algorithm', 'Set']).PropFemale.var()).unstack()
The same for explicit feedback:
select_explicit(rec_stats).groupby(['Algorithm', 'Set']).PropFemale.mean().unstack()
np.sqrt(select_explicit(rec_stats).groupby(['Algorithm', 'Set']).PropFemale.var()).unstack()
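The same summaries can be computed in one pass with agg; for example, for implicit feedback (equivalent to the two implicit tables above):
select_implicit(rec_stats).groupby(['Algorithm', 'Set'])['PropFemale'].agg(['mean', 'std']).unstack()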
Now that we have all of this, we can start to look at recommendation list distributions. How is Proportion Female distributed?
grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_implicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.histplot, 'PropFemale', stat='density')
And the proportion of known-gender authors:
grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_implicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.histplot, 'PropKnown', stat='density')
The same distributions for explicit feedback:
grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_explicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.histplot, 'PropFemale', stat='density')
grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_explicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.histplot, 'PropKnown', stat='density')
And the dummy-code proportions, implicit then explicit:
grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_implicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.histplot, 'PropDC', stat='density')
grid = sns.FacetGrid(col='Set', row='Algorithm', data=select_explicit(rec_stats), sharey=False, margin_titles=True)
grid.map(sns.histplot, 'PropDC', stat='density')
With this analysis in hand, we can prepare our recommendation data for modeling.
Because ALS behaves so badly on BX-E, we can't really use it; drop it from further analysis.
rec_stats = rec_stats.drop(('BX-E', 'als'))
rec_stats.to_pickle('data/rec-data.pkl')
We also want to save this data for Stan.
def inf_dir(sname):
    return data_dir / sname / 'inference'
for sname, frame in rec_stats.groupby('Set'):
    print('preparing Stan input for', sname)
    lists = frame.reset_index().astype({'Algorithm': 'category'})
    algos = lists['Algorithm'].cat.categories
    print(sname, 'has algorithms', algos)
    # set up the users, with 1-based numbers for Stan
    users = profiles.loc[sname, :]
    users = users.assign(unum=np.arange(len(users), dtype='i4') + 1)
    lists = lists.join(users[['unum']], on='user')
    # convert columns to plain lists so ujson can serialize them
    data = {
        'A': len(algos),
        'J': len(users),
        'NL': len(lists),
        'n': users['Known'].tolist(),
        'y': users['female'].tolist(),
        'ru': lists['unum'].tolist(),
        'ra': (lists['Algorithm'].cat.codes + 1).tolist(),
        'rn': lists['Known'].tolist(),
        'ry': lists['female'].tolist()
    }
    # and write
    out_dir = inf_dir(sname)
    out_dir.mkdir(exist_ok=True)
    in_fn = out_dir / 'full-inputs.json'
    in_fn.write_text(ujson.dumps(data))
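As a final sanity check (not part of the pipeline itself), we can re-load one set's inputs and verify that the array lengths match the declared sizes; AZ stands in for any of the sets:
check = ujson.loads((inf_dir('AZ') / 'full-inputs.json').read_text())
assert check['J'] == len(check['n']) == len(check['y'])
assert check['NL'] == len(check['ru']) == len(check['ra']) == len(check['rn']) == len(check['ry'])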