This notebook analyzes the results of the Bayesian analysis of the user profile data.
We split our Bayesian analyses into three steps:
This is to enable the sampling process and its results to be managed and cached by DVC, and to avoid re-running the Bayesian sampling process when we only want to change how we look at its results.
model = 'profile'
# Parameters
model = "full"
import os
from pathlib import Path
import pandas as pd
import numpy as np
import scipy.special as sps
from scipy import stats
import seaborn as sns
import plotnine as p
import matplotlib.pyplot as plt
from statsmodels.nonparametric.kde import KDEUnivariate
import zarr
from IPython.display import display, Markdown
from bookgender.config import rng_seed
from lenskit.util import init_rng
import bookgender.datatools as dt
from bookgender.nbutils import *
seed = init_rng(rng_seed(), 'ProfileModelAnalysis', propagate=True)
rng = np.random.default_rng(seed)
seed
if model.startswith('profile'):
fig_key = model
else:
fig_key = 'profile-' + model
fig_key
fig_dir = init_figs(f'PMA-{fig_key}')
datasets = list(dt.datasets.keys())
datasets
We need to load the sampling results for each data set:
samples = {}
summary = {}
for ds in datasets:
_zf = zarr.ZipStore(f'data/{ds}/inference/{model}/samples.zarr', mode='r')
_c = zarr.LRUStoreCache(_zf, 2**30)
samples[ds] = zarr.group(_c)
summary[ds] = pd.read_csv(f'data/{ds}/inference/{model}-summary.csv', index_col='name')
summary = pd.concat(summary, names=['Set'])
summary.head()
Extract and verify sample size:
sample_size = len(samples['AZ']['lp__'])
assert all(len(s['lp__']) == sample_size for s in samples.values())
sample_size
Do we have parameters with bad mixing? (high $\hat{R}$)
summary.sort_values('R_hat', ascending=False).head()
Let's compute LPPD and WAIC for each of our models:
def ll_stats(ds):
ll_exp = samples[ds]['ll_exp']
ll_var = samples[ds]['ll_var']
lppd = np.sum(ll_exp)
pwaic = np.sum(ll_var)
return pd.Series({'lppd': lppd, 'pWAIC': pwaic, 'WAIC': -2 * (lppd - pwaic)})
pd.Series(datasets).apply(ll_stats).assign(Set=datasets).set_index('Set')
Extract sample data in a form usable for plotting:
samp_df = pd.concat(dict((ds, pd.DataFrame({
'Mu': samples[ds]['mu'],
'Sigma': samples[ds]['sigma'],
'Theta': samples[ds]['thetaP']
})) for ds in datasets), names=['Set']).copy()
samp_df
We also want to show observed distributions:
profiles = pd.read_pickle('data/profile-data.pkl')
profiles.head()
For each sample, we want to sample a predicted observation. We do this by:
def _sample_ns(df):
known = profiles.loc[df.name, 'Known']
# return known.sample(len(df), replace=True).reset_index(drop=True)
return pd.Series(rng.choice(known, len(df)))
samp_df.groupby('Set').apply(_sample_ns).stack()
samp_df['N'] = samp_df.groupby('Set').apply(_sample_ns).stack()
samp_df['Y'] = rng.binomial(samp_df['N'], samp_df['Theta'], len(samp_df))
samp_df['PropFemale'] = samp_df['Y'] / samp_df['N']
samp_df.head()
We want to report summary statistics of the various parameters. We will start with observed ratios:
profiles.groupby('Set')['PropFemale'].agg(['mean', 'std']).T
print(profiles.groupby('Set')['PropFemale'].agg(['mean', 'std']).T.to_latex(float_format='%.3f'))
What about our est. mean log odds ($\mu$) and variance ($\sigma$)? We also want to the posterior mean tendancy ($\theta$) and its variance.
It's important to distinguish between the estimated variance of users ($\sigma$) and the variance of the estimate (e.g. the variance of $\mu$). Since $\theta$ is a sample of a random unseen user, its variance should be comparable to the variance of the underlying users.
samp_df.groupby('Set')[['Mu', 'Sigma', 'Theta']].agg(['mean', 'std']).T
Find the 95% confidence intervals of $\mu$:
samp_df.groupby('Set')['Mu'].apply(lambda s: {
'Low': s.quantile(0.025),
'High': s.quantile(0.975)
}).unstack()
Let's look at the STAN summary output for these parameters too:
summary.swaplevel().loc['mu', ['Mean', 'StdDev', '5%', '95%']].T
summary.swaplevel().loc['sigma', ['Mean', 'StdDev', '5%', '95%']].T
summary.swaplevel().loc['thetaP', ['Mean', 'StdDev', '5%', '95%']].T
Stack simulated proportions from two sources:
simulated_prop = pd.DataFrame({'value': samp_df[['Theta', 'PropFemale']].stack()})
simulated_prop.reset_index([0,2], inplace=True)
simulated_prop['Set'] = simulated_prop['Set'].astype('category')
simulated_prop.rename(columns={'level_2': 'Method'}, inplace=True)
simulated_prop.reset_index(drop=True, inplace=True)
simulated_prop.head()
obs_prop = profiles.groupby('Set').apply(lambda df: df['PropFemale'].sample(sample_size, replace=True))
obs_prop.name = 'value'
obs_prop = obs_prop.reset_index(0)
obs_prop.reset_index(drop=True, inplace=True)
obs_prop['Method'] = 'Observed'
obs_prop = obs_prop[['Set', 'Method', 'value']]
obs_prop.head()
all_prop = pd.concat([simulated_prop, obs_prop], ignore_index=True)
all_prop['Method'] = all_prop['Method'].astype('category')
all_prop['Method'].cat.reorder_categories(['Theta', 'PropFemale', 'Observed'], inplace=True)
all_prop['Method'].cat.rename_categories({'Theta': 'Smoothed', 'PropFemale': 'Predicted y/n', 'Observed': 'Observed y/n'}, inplace=True)
all_prop.head()
We want the mean for each method to show:
prop_means = all_prop.groupby(['Set', 'Method'])['value'].mean().reset_index()
prop_means
Now finally we can view densities:
grid = sns.FacetGrid(all_prop.assign(hist=all_prop.Method == 'Observed y/n'), row='Set', hue='Method',
sharey=False, height=2, aspect=5)
for (r, c, h), df in grid.facet_data():
if h == 0:
sns.distplot(df['value'], kde=False, norm_hist=True, ax=grid.axes[r, c])
grid.map(sns.kdeplot, 'value', clip=(0,1)).add_legend()
plt.xlim(0, 1)
# plt.savefig(fig_dir / 'dist.pdf')
from bookgender.nbutils import make_plot
make_plot(all_prop, p.aes('value'),
p.geom_histogram(p.aes(y='..density..'), data=obs_prop,
binwidth=0.05, fill='#CCCCCC'),
p.geom_line(p.aes(color='Method', linetype='Method'),
stat='density', bw='scott', adjust=0.5),
p.geom_vline(p.aes(xintercept='value', color='Method', linetype='Method'),
data=prop_means, show_legend=False),
p.facet_grid('Set ~', scales='free_y'),
p.scale_color_brewer('qual', 'Dark2'),
p.xlab("Profile Proportion of Female Authors"),
p.ylab("Density"),
legend_position='top', legend_title=p.element_blank(),
file='profile-dist', width=8, height=6)
The profiling process infers a $g(\theta_u)$ value for each user. What is the variance of those estimates?
def u_stats(ds):
ths = samples[ds]['nTheta'][...].T
return pd.DataFrame({
'mean': np.mean(ths, axis=1),
'var': np.var(ths, axis=1),
'lo': np.quantile(ths, 0.025, axis=1),
'hi': np.quantile(ths, 0.975, axis=1)
})
user_biases = pd.concat(dict(
(ds, u_stats(ds)) for ds in datasets
), names=['Set', 'uidx'])
user_biases
(p.ggplot(user_biases.reset_index(), p.aes('mean'))
+ p.geom_histogram()
+ p.facet_grid('Set ~'))
(p.ggplot(user_biases.reset_index(), p.aes('var'))
+ p.geom_histogram()
+ p.facet_grid('Set ~'))
This indicates quite a few $\theta_u$ values have high posterior variance, so we really do want to use the entire distribution.