This notebook prepares the data for the Bayesian inference used in the model analysis.
# Standard data-science stack plus project-local helpers.
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from plotnine import *
import ujson
from bookgender.config import data_dir
from bookgender.nbutils import *
# init_figs comes from the bookgender.nbutils wildcard import; it presumably
# creates and returns the figure output directory for this notebook — confirm there.
fig_dir = init_figs('ProfileData')
We need to load the author-gender information:
# Map each book (item/cluster ID) to its author-gender label.
book_gender = pd.read_csv(data_dir / 'author-gender.csv.gz')
book_gender = book_gender.set_index('item')['gender']
book_gender.describe()
# Collapse the three distinct "could not link" codes into a single
# 'unlinked' label, then store the labels as a categorical.
_unlinked_codes = {
    'no-viaf-author': 'unlinked',
    'no-loc-author': 'unlinked',
    'no-loc-book': 'unlinked',
}
book_gender = book_gender.replace(_unlinked_codes).astype('category')
book_gender.unique()
And we load book hashes, to set up our dummy bias:
# Book hashes give each book a deterministic pseudo-random bit ("dummy code").
book_hash = pd.read_parquet(data_dir / 'book-hash.parquet').rename(columns={'cluster': 'item'})
# Parity of the final hex digit of the MD5 hash — an effectively random 0/1 label.
book_hash['dcode'] = book_hash['md5'].str[-1].map(lambda h: int(h, 16) % 2)
book_hash = book_hash.set_index('item')
book_hash.head()
Load the sample user ratings for each data set:
# Sampled user ratings for each data set; we only need (Set, user, item) triples.
user_ratings = pd.read_csv(data_dir / 'study-ratings.csv')
user_ratings.drop(columns=['rating'], inplace=True)
user_ratings.rename(columns={'dataset': 'Set'}, inplace=True)
user_ratings.head()
# Attach author gender; items with no gender record become 'unlinked'.
user_ratings = user_ratings.join(book_gender, on='item', how='left')
# FIX: assign the filled column back instead of the chained inplace fillna —
# `df['col'].fillna(..., inplace=True)` raises a FutureWarning on pandas >= 2.1
# and silently has no effect under copy-on-write (pandas 3).
user_ratings['gender'] = user_ratings['gender'].fillna('unlinked')
# Attach the dummy-code bit for each rated item.
user_ratings = user_ratings.join(book_hash['dcode'], on='item', how='left')
user_ratings.head(15)
Now we will summarize user profiles:
def summarize_profile(df):
    """Summarize one user's rating profile.

    Args:
        df: ratings for a single (Set, user) group, with 'gender' and
            'dcode' columns.

    Returns:
        pd.Series with raw counts (count, linked, ambiguous, male, female,
        dcknown, dcyes, Known) and proportions (PropDC, PropFemale, PropKnown).
    """
    gender = df['gender']
    dc = df['dcode']
    n_male = (gender == 'male').sum()
    n_female = (gender == 'female').sum()
    n_known = n_male + n_female
    n_total = len(df)
    stats = {
        'count': n_total,
        'linked': (gender != 'unlinked').sum(),
        'ambiguous': (gender == 'ambiguous').sum(),
        'male': n_male,
        'female': n_female,
        # dcode may be NaN for unmatched books, so count/sum skip missing values.
        'dcknown': dc.count(),
        'dcyes': dc.sum(skipna=True),
        'PropDC': dc.mean(),
        'Known': n_known,
        'PropFemale': n_female / n_known,
        'PropKnown': n_known / n_total,
    }
    return pd.Series(stats)
# One summary row per (data set, user).
profiles = user_ratings.groupby(['Set', 'user']).apply(summarize_profile)
# Count columns become 32-bit ints; the Prop* columns stay float proportions.
count_cols = [c for c in profiles.columns if not c.startswith('Prop')]
profiles[count_cols] = profiles[count_cols].astype('i4')
profiles.head()
How are profile sizes distributed?
# Distribution of total profile size, one facet per data set.
# NOTE(review): sns.distplot is deprecated (removed in seaborn 0.14) —
# migrate to sns.histplot/displot when upgrading; verify the pinned version.
g = sns.FacetGrid(profiles.reset_index(), col='Set', sharex=False, sharey=False, height=2)
g.map(sns.distplot, 'count')
plt.savefig(fig_dir / 'profile-size-all.pdf')
# Same plot restricted to books with known author gender.
g = sns.FacetGrid(profiles.reset_index(), col='Set', sharex=False, sharey=False, height=2)
g.map(sns.distplot, 'Known')
plt.savefig(fig_dir / 'profile-size-known.pdf')
For the paper, we want to make some changes - we're going to show this on a scatter plot.
# Long-format table of profile sizes: one row per (Set, user, Type).
up_sizes = profiles[['count', 'Known']].reset_index()
up_sizes = up_sizes.melt(id_vars=['Set', 'user'], var_name='Type', value_name='Size')
# Rename the size types for display on the plot.
type_labels = {'count': 'All', 'Known': 'Known-Gender'}
up_sizes['Type'] = up_sizes['Type'].astype('category').cat.rename_categories(type_labels)
up_sizes.head()
# Number of users at each exact profile size, per data set and size type.
by_size = up_sizes.groupby(['Set', 'Type', 'Size'])
size_counts = by_size['user'].count().reset_index(name='Users')
# Keep only observed sizes so the log-log scatter has no empty bins.
size_counts = size_counts.query('Users > 0')
size_counts.head()
size_counts['Users'].describe()
# Log-log scatter of (profile size, # users), faceted by size type and data set.
# make_plot comes from the bookgender.nbutils wildcard import; it presumably
# assembles a plotnine ggplot and saves it to fig_dir — confirm in nbutils.
make_plot(size_counts, aes(x='Size', y='Users'),
          geom_point(),
          scale_x_log10(),
          scale_y_log10(),
          facet_grid('Type ~ Set', scales='free'),
          xlab('# of Consumed Items'),
          ylab('# of Users'),
          panel_grid=element_blank(),
          file='profile-size.pdf', width=7, height=3.2)
And what fraction of each profile has known author gender?
# Distribution of the known-gender proportion per data set.
g = sns.FacetGrid(profiles.reset_index(), col='Set')
g.map(sns.distplot, 'PropKnown')
Quick empirical inspection of the proportion of female authors in each profile:
# Empirical distribution, mean, and standard deviation of PropFemale per data set.
g = sns.FacetGrid(profiles.reset_index(), col='Set', sharey=False)
g.map(sns.distplot, 'PropFemale')
profiles.groupby('Set')['PropFemale'].mean()
np.sqrt(profiles.groupby('Set')['PropFemale'].var())
Quick empirical inspection of the dummy-code proportion — since the code is a random bit, this should be pure noise:
# PropDC should be centered near 0.5 if the dummy code is truly random.
g = sns.FacetGrid(profiles.reset_index(), row='Set', sharey=False)
g.map(sns.distplot, 'PropDC')
We save the profile data frame to be reloaded in the Bayesian analysis.
# NOTE(review): hard-coded 'data/' path, unlike the data_dir-relative paths used
# elsewhere — only equivalent when data_dir == 'data'; confirm before changing,
# since the downstream analysis presumably reloads this exact path.
profiles.to_pickle('data/profile-data.pkl')
We also want to save the data for STAN.
def stan_inputs(data, kc, pc):
    """Assemble the STAN input dictionary for one profile frame.

    Args:
        data: profile data frame, one row per user.
        kc: name of the trial-count column (STAN's ``n``).
        pc: name of the success-count column (STAN's ``y``).

    Returns:
        dict with ``J`` (number of users), ``n``, and ``y``.
    """
    return dict(J=len(data), n=data[kc], y=data[pc])
def inf_dir(sname):
    """Return the inference output directory for data set *sname*."""
    set_dir = data_dir / sname
    return set_dir / 'inference'
# Write per-data-set STAN input files as JSON.
for sname, frame in profiles.groupby('Set'):
    print('preparing STAN input for', sname)
    # Renamed from `dir`, which shadowed the builtin of the same name.
    out_dir = inf_dir(sname)
    # parents=True also creates the data-set directory if it does not exist yet;
    # the original mkdir(exist_ok=True) would raise FileNotFoundError in that case.
    out_dir.mkdir(parents=True, exist_ok=True)
    # Gender model: n = books with known gender, y = books by female authors.
    in_fn = out_dir / 'profile-inputs.json'
    in_fn.write_text(ujson.dumps(stan_inputs(frame, 'Known', 'female')))
    # Dummy-code control model: same structure, using the random bit instead.
    in_fn = out_dir / 'profile-dcode-inputs.json'
    in_fn.write_text(ujson.dumps(stan_inputs(frame, 'dcknown', 'dcyes')))