This notebook contains correlational analyses of the recommendation list inputs and outputs, for comparison purposes and also to analyze the control bias.
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.special import logit, expit
# Load the pickled profile table and print its column/dtype summary.
# NOTE(review): pickle can execute arbitrary code on load; these files are
# presumably this project's own outputs -- confirm before pointing at others.
with open('data/profile-data.pkl', 'rb') as profile_file:
    profiles = pickle.load(profile_file)
profiles.info()

# Same for the pickled recommendation-list table.
with open('data/rec-data.pkl', 'rb') as rec_file:
    recs = pickle.load(rec_file)
recs.info()
Helpers from RecModel Analysis:
def select_implicit(data, reset=True):
    """
    Select the implicit-feedback rows of a frame and normalize their labels.

    A row is treated as implicit when its ``Set`` ends in ``-I`` or, if the
    frame has an ``Algorithm`` column, when the algorithm name ends in
    ``-imp``.  Frames without an ``Algorithm`` column additionally keep the
    ``AZ`` set.

    Args:
        data: DataFrame carrying ``Set`` (and optionally ``Algorithm``) as a
            column, or as an index level when ``reset`` is true.
        reset: when true, move the index into columns before selecting.

    Returns:
        A new DataFrame holding only the selected rows, with a trailing
        ``-I`` removed from ``Set``, a trailing ``-imp`` removed from
        ``Algorithm``, and ``wrls`` renamed to ``als``.
    """
    if reset:
        data = data.reset_index()
    has_algorithm = 'Algorithm' in data.columns

    is_implicit = data['Set'].str.endswith('-I')
    if has_algorithm:
        is_implicit |= data['Algorithm'].str.endswith('-imp')
    else:
        is_implicit |= data['Set'] == 'AZ'

    # Strip only the *suffix* that was tested above, and state regex=True
    # explicitly: the default of `regex` differs between pandas versions.
    # The callable form derives the new column from the already-filtered
    # rows, so it does not depend on index alignment with the full frame.
    data = data.loc[is_implicit].assign(
        Set=lambda df: df['Set'].str.replace(r'-I$', '', regex=True)
    )
    if has_algorithm:
        data['Algorithm'] = (
            data['Algorithm']
            .str.replace(r'-imp$', '', regex=True)
            .str.replace('wrls', 'als', regex=False)
        )
    return data
def select_explicit(data, reset=True):
    """
    Select the explicit-feedback rows of a frame and normalize ``Set``.

    A row is dropped as implicit when its ``Set`` ends in ``-I`` or, if the
    frame has an ``Algorithm`` column, when the algorithm name ends in
    ``-imp``.  Everything else is kept.

    Args:
        data: DataFrame carrying ``Set`` (and optionally ``Algorithm``) as a
            column, or as an index level when ``reset`` is true.
        reset: when true, move the index into columns before selecting.

    Returns:
        A new DataFrame holding only the non-implicit rows, with a trailing
        ``-E`` removed from ``Set``.  ``Algorithm`` is left untouched.
    """
    if reset:
        data = data.reset_index()

    is_implicit = data['Set'].str.endswith('-I')
    if 'Algorithm' in data.columns:
        is_implicit |= data['Algorithm'].str.endswith('-imp')

    # Strip only a trailing '-E' and state regex=True explicitly: the default
    # of `regex` differs between pandas versions.  The callable form works on
    # the already-filtered rows, so no index alignment is involved.
    return data[~is_implicit].assign(
        Set=lambda df: df['Set'].str.replace(r'-E$', '', regex=True)
    )
What do proportions look like, for both female authors and dummy codes?
# Per-Set mean and standard deviation of both proportions, transposed so
# that the Sets become columns.
(
    profiles
    .groupby('Set')[['PropFemale', 'PropDC']]
    .agg(['mean', 'std'])
    .transpose()
)
What do the distributions of these proportions look like in the user profiles?
# Long form: one row per (Set, user, variable) with the proportion in 'value'.
props = profiles.reset_index().melt(
    id_vars=['Set', 'user'],
    value_vars=['PropFemale', 'PropDC'],
)
props.head()
# One panel per (Set, variable); y axes are independent across panels.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot/displot are the replacements) -- confirm the pinned version.
grid = sns.FacetGrid(data=props, row='Set', col='variable', sharey=False)
grid.map(sns.distplot, 'value')
# Same long-form reshaping for the recommendation lists, keeping the
# algorithm as an extra identifier column.
rec_props = recs.reset_index().melt(
    id_vars=['Set', 'Algorithm', 'user'],
    value_vars=['PropFemale', 'PropDC'],
)
rec_props.head()
Implicit algorithm distributions:
# KDE of each proportion for the implicit rows: one panel per
# (Set, Algorithm), one curve per variable.
grid = sns.FacetGrid(
    data=select_implicit(rec_props),
    hue='variable',
    col='Algorithm',
    row='Set',
)
grid.map(sns.kdeplot, 'value')
grid.add_legend()
Explicit algorithm distributions:
# KDE of each proportion for the explicit rows: one panel per
# (Set, Algorithm), one curve per variable.
grid = sns.FacetGrid(
    data=select_explicit(rec_props),
    hue='variable',
    col='Algorithm',
    row='Set',
)
grid.map(sns.kdeplot, 'value')
grid.add_legend()
First, extract the recommendation data for the implicit-feedback algorithms:
# Implicit recommendation lists in long form: the proportion lands in a
# 'recs' column, indexed and sorted by (Set, user, Algorithm, variable).
imp_rd = (
    select_implicit(recs)
    .melt(
        id_vars=['Set', 'user', 'Algorithm'],
        value_vars=['PropFemale', 'PropDC'],
        value_name='recs',
    )
    .set_index(['Set', 'user', 'Algorithm', 'variable'])
    .sort_index()
)
imp_rd
Now the profile data:
# Profile side of the implicit comparison: every Set that is not an explicit
# ('-E') one, in long form with the proportion in a 'profile' column.
imp_prof = profiles.reset_index()
# .copy() so the column assignment below writes to an independent frame
# rather than to a filtered slice of the original.
imp_prof = imp_prof[~imp_prof['Set'].str.endswith('-E')].copy()
# '-I$' is a regular expression, so regex=True must be explicit: pandas >= 2.0
# defaults to regex=False, which would look for the literal text '-I$', strip
# nothing, and leave these Set labels unable to match the recommendation table.
imp_prof['Set'] = imp_prof['Set'].str.replace(r'-I$', '', regex=True)
imp_prof = imp_prof.melt(
    id_vars=['Set', 'user'],
    value_vars=['PropFemale', 'PropDC'],
    value_name='profile',
)
imp_prof = imp_prof.set_index(['Set', 'user', 'variable']).sort_index()
imp_prof
And now merge it:
# Attach the recommendation-list values to each profile row; this is a left
# join (the DataFrame.join default), so every profile row is retained.
joint = imp_prof.join(imp_rd, how='left')
joint
And compute profile-rec correlations:
# Correlation between profile and recommendation-list proportions within each
# (Set, Algorithm, variable) group; the innermost level is then pivoted to
# columns.
(
    joint
    .groupby(['Set', 'Algorithm', 'variable'])
    .apply(lambda grp: grp['profile'].corr(grp['recs']))
    .unstack()
)
First, extract the recommendation data for the explicit-feedback algorithms:
# Explicit recommendation lists in long form: the proportion lands in a
# 'recs' column, indexed and sorted by (Set, user, Algorithm, variable).
exp_rd = (
    select_explicit(recs)
    .melt(
        id_vars=['Set', 'user', 'Algorithm'],
        value_vars=['PropFemale', 'PropDC'],
        value_name='recs',
    )
    .set_index(['Set', 'user', 'Algorithm', 'variable'])
    .sort_index()
)
exp_rd
Now the profile data:
# Profile side of the explicit comparison: every Set that is not an implicit
# ('-I') one, in long form with the proportion in a 'profile' column.
exp_prof = profiles.reset_index()
# .copy() so the column assignment below writes to an independent frame
# rather than to a filtered slice of the original.
exp_prof = exp_prof[~exp_prof['Set'].str.endswith('-I')].copy()
# '-E$' is a regular expression, so regex=True must be explicit: pandas >= 2.0
# defaults to regex=False, which would look for the literal text '-E$', strip
# nothing, and leave these Set labels unable to match the recommendation table.
exp_prof['Set'] = exp_prof['Set'].str.replace(r'-E$', '', regex=True)
exp_prof = exp_prof.melt(
    id_vars=['Set', 'user'],
    value_vars=['PropFemale', 'PropDC'],
    value_name='profile',
)
exp_prof = exp_prof.set_index(['Set', 'user', 'variable']).sort_index()
exp_prof
And now merge it:
# Attach the recommendation-list values to each profile row; this is a left
# join (the DataFrame.join default), so every profile row is retained.
joint = exp_prof.join(exp_rd, how='left')
joint
And compute profile-rec correlations:
# Correlation between profile and recommendation-list proportions within each
# (Set, Algorithm, variable) group; the innermost level is then pivoted to
# columns.
(
    joint
    .groupby(['Set', 'Algorithm', 'variable'])
    .apply(lambda grp: grp['profile'].corr(grp['recs']))
    .unstack()
)