Import Libraries

In [1]:
import pandas as pd
from sklearn import cross_validation
C:\Users\Oghenemaro Anuyah\AppData\Local\conda\conda\envs\my_root\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

Read Data

A. Children's movie ratings

In [2]:
childrenData = pd.read_csv("../data/input/childrenData/ratings.dat", sep="::", names = ["user", "item", "ratings", "x"])
childrenData = childrenData[["user", "item", "ratings"]]
childrenData["user"] = [1000000 + x for x in childrenData.user]
childrenData.head()
C:\Users\Oghenemaro Anuyah\AppData\Local\conda\conda\envs\my_root\lib\site-packages\ipykernel_launcher.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  """Entry point for launching an IPython kernel.
Out[2]:
user item ratings
0 1000001 4886 5
1 1000002 586 4
2 1000002 1032 3
3 1000002 1394 3
4 1000002 2392 5
In [3]:
children_movies = set([x for x in childrenData.item])
children_users = set([x for x in childrenData.user])
In [4]:
childrenData.describe().style.format({'user': "{:4.0f}", 'item': "{:4.0f}", 'ratings': "{:4.0f}"})
Out[4]:
user item ratings
count 34077 34077 34077
mean 1003778 4984 5
std 3321 1070 1
min 1000001 1 1
25% 1000988 4495 5
50% 1002726 5069 5
75% 1005968 5726 5
max 1012359 6405 5

B. Movie Lens

In [5]:
movie_lens_data = pd.read_csv("../data/input/ML1M/ratings.dat", sep = "::", names = ["user", "item", "ratings", "timestamp"])
C:\Users\Oghenemaro Anuyah\AppData\Local\conda\conda\envs\my_root\lib\site-packages\ipykernel_launcher.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  """Entry point for launching an IPython kernel.
In [6]:
movie_lens_data = movie_lens_data[["user", "item", "ratings"]]
In [7]:
movie_lens_data.describe().style.format({'user': "{:4.0f}", 'item': "{:4.0f}", 'ratings': "{:4.0f}"})
Out[7]:
user item ratings
count 1000209 1000209 1000209
mean 3025 1866 4
std 1728 1096 1
min 1 1 1
25% 1506 1030 3
50% 3070 1835 4
75% 4476 2770 4
max 6040 3952 5

Create train and test sets for childrenData

In [8]:
childrenData_train, childrenData_test, yt, yy = cross_validation.train_test_split(childrenData, childrenData["user"], random_state = 0, test_size = 0.4)
In [9]:
len(childrenData_train), len(childrenData_test)
Out[9]:
(20446, 13631)
In [10]:
len(set(childrenData_train.user)), len(set(childrenData_train.item))
Out[10]:
(8492, 1815)
In [11]:
len(set(childrenData_test.user)), len(set(childrenData_test.item))
Out[11]:
(6671, 1577)
In [12]:
childrenData_train_users = set(childrenData_train.user)
childrenData_test_users = set(childrenData_test.user)

childrenData_train_items = set(childrenData_train.item)
childrenData_test_items = set(childrenData_test.item)
In [13]:
user_in_test_not_train = childrenData_test_users - childrenData_train_users
item_in_test_not_train = childrenData_test_items - childrenData_train_items
In [14]:
# Ensure that all users and items in the test set exist in the train set

childrenData_test = childrenData_test[~(childrenData_test.user.isin(user_in_test_not_train)) & ~(childrenData_test.item.isin(item_in_test_not_train))]
In [15]:
len(childrenData_test)
Out[15]:
10057
In [ ]:
childrenData_test.to_csv("../data/output/childrenData_test.csv", index = False, header = False, sep = ":")

Build dataset for second experiment

- This dataset includes all of movie lens data along with different variations of childrenData (based on the minimum number 
of ratings).
In [16]:
# Get the summary of the total number of items rated by each user in childrenData

childrenData_users_ratings_summary = childrenData_data.groupby(['user'])['ratings'].count().reset_index().rename(columns = {"ratings": "number_of_ratings"})
In [17]:
childrenData_users_ratings_summary = childrenData_users_ratings_summary[childrenData_users_ratings_summary.number_of_ratings > 1].reset_index(drop = True)
In [18]:
childrenData_users_ratings_summary.head()
Out[18]:
user number_of_ratings
0 1000002 54
1 1000003 9
2 1000004 3
3 1000005 7
4 1000006 15

(i) All movie lens data plus childrenData users that rated at least 2 movies

In [19]:
childrenData_users_2_above_ratings = childrenData_users_ratings_summary[childrenData_users_ratings_summary.number_of_ratings >= 2].reset_index(drop = True)

childrenData_train_users_2_above_ratings = childrenData_train[childrenData_train["user"].isin(childrenData_users_2_above_ratings.user)]
childrenData_test_users_2_above_ratings = childrenData_test[childrenData_test["user"].isin(childrenData_users_2_above_ratings.user)]
In [ ]:
childrenData_test_users_2_above_ratings.to_csv("../data/output/childrenData_test_set_2_above_ratings.csv", index = False, header = False, sep = ":")
  • Data used in this experiment for training and testing
In [20]:
movie_lens_data_plus_childrenData_train_2_above_ratings = pd.concat([movie_lens_data, childrenData_train_users_2_above_ratings])

childrenData_test_set_above_2_ratings = childrenData_test[childrenData_test["user"].isin(childrenData_users_2_above_ratings.user)]
In [21]:
len(movie_lens_data_plus_childrenData_train_2_above_ratings)
Out[21]:
1017249
In [ ]:
movie_lens_data_plus_childrenData_train_2_above_ratings.to_csv("../data/output/all_movie_lens_plus_childrenData_users_10_above_ratings.csv", index = False, header = False, sep = ":")

(ii) All movie lens data plus childrenData users that rated at least 10 movies

In [22]:
childrenData_users_10_above_ratings = childrenData_users_ratings_summary[childrenData_users_ratings_summary.number_of_ratings >= 10].reset_index(drop = True)

childrenData_train_users_10_above_ratings = childrenData_train[childrenData_train["user"].isin(childrenData_users_10_above_ratings.user)]
childrenData_test_users_10_above_ratings = childrenData_test[childrenData_test["user"].isin(childrenData_users_10_above_ratings.user)]
In [ ]:
childrenData_test_users_10_above_ratings.to_csv("../data/output/childrenData_test_set_10_above_ratings.csv", index = False, header = False, sep = ":")
  • Data used in this experiment for training and testing
In [23]:
movie_lens_data_plus_childrenData_train_10_above_ratings = pd.concat([movie_lens_data, childrenData_train_users_10_above_ratings])

childrenData_test_set_above_10_ratings = childrenData_test[childrenData_test["user"].isin(childrenData_users_10_above_ratings.user)]
In [24]:
len(movie_lens_data_plus_childrenData_train_10_above_ratings)
Out[24]:
1007014
In [ ]:
movie_lens_data_plus_childrenData_train_10_above_ratings.to_csv("../data/output/all_movie_lens_plus_childrenData_users_10_above_ratings.csv", index = False, header = False, sep = ":")

(iii) All movie lens data plus childrenData users that rated at least 20 movies

In [25]:
childrenData_users_20_above_ratings = childrenData_users_ratings_summary[childrenData_users_ratings_summary.number_of_ratings >= 20].reset_index(drop = True)

childrenData_train_users_20_above_ratings = childrenData_train[childrenData_train["user"].isin(childrenData_users_20_above_ratings.user)]
childrenData_test_users_20_above_ratings = childrenData_test[childrenData_test["user"].isin(childrenData_users_20_above_ratings.user)]
In [ ]:
childrenData_test_users_20_above_ratings.to_csv("../data/output/childrenData_test_set_20_above_ratings.csv", index = False, header = False, sep = ":")
  • Data used in this experiment for training and testing
In [26]:
movie_lens_data_plus_childrenData_train_20_above_ratings = pd.concat([movie_lens_data, childrenData_train_users_20_above_ratings])

childrenData_test_set_above_20_ratings = childrenData_test[childrenData_test["user"].isin(childrenData_users_20_above_ratings.user)]
In [27]:
len(movie_lens_data_plus_childrenData_train_20_above_ratings)
Out[27]:
1003463
In [ ]:
movie_lens_data_plus_childrenData_train_20_above_ratings.to_csv("../data/output/all_movie_lens_plus_childrenData_users_20_above_ratings.csv", index = False, header = False, sep = ":")

Build dataset for third experiment

- This dataset includes movie ratings provided by movie lens users that rated at least two movies for children, along with different varaitions of childrenData (based on the minimum number of ratings).
In [28]:
movie_lens_user_df = movie_lens_data.groupby('user', as_index = False)['item'].aggregate(lambda x: set(x["item"]))

movie_lens_user_df["item"] = [list(x) for x in movie_lens_user_df.item]
In [29]:
movie_lens_user_df["has_rated_child_movies"] = [any(x in children_movies for x in y) for y in movie_lens_user_df.item]
In [30]:
movie_lens_user_df_has_rated_child = movie_lens_user_df[movie_lens_user_df.has_rated_child_movies == True]
In [31]:
movie_lens_user_df_has_rated_child["child_movies"] = [[x for x in y if x in children_movies] for y in movie_lens_user_df_has_rated_child.item]
C:\Users\Oghenemaro Anuyah\AppData\Local\conda\conda\envs\my_root\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
In [32]:
movie_lens_user_df_has_rated_child.head(5)
Out[32]:
user item has_rated_child_movies child_movies
0 1 [1, 2692, 260, 1028, 1287, 1029, 1545, 1035, 5... True [1, 260, 1028, 661, 150, 1566, 1962, 3114, 172...
1 2 [1537, 515, 1544, 2571, 1552, 2067, 21, 3095, ... True [2628, 648, 1196, 1198, 1210, 2236, 1225, 1246...
2 3 [260, 648, 1291, 653, 1431, 1304, 1049, 3868, ... True [260, 648, 1291, 3114, 1580, 1196, 1198, 1210,...
3 4 [2947, 260, 2692, 2951, 3468, 1036, 1954, 1196... True [2947, 260, 1954, 1196, 1198, 1210, 1097, 480,...
4 5 [2560, 515, 6, 3079, 3081, 2058, 3083, 2571, 1... True [34, 39, 1580, 2683, 150, 2716, 1721, 1722, 37...
In [33]:
movie_lens_user_df_has_rated_child["num_child_movies"] = [len(set(x)) for x in movie_lens_user_df_has_rated_child.child_movies]
C:\Users\Oghenemaro Anuyah\AppData\Local\conda\conda\envs\my_root\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
In [34]:
special_movie_lens_users_df = movie_lens_user_df_has_rated_child[["user", "num_child_movies"]]

# We want only users that have rated at least 2 children's movies

special_movie_lens_users_df = special_movie_lens_users_df[special_movie_lens_users_df.num_child_movies >= 2]
In [35]:
special_movie_lens_users_df.head()
Out[35]:
user num_child_movies
0 1 22
1 2 18
2 3 15
3 4 9
4 5 13
In [36]:
movie_lens_special_users_ratings = movie_lens_data[movie_lens_data["user"].isin(special_movie_lens_users_df.user)]

(i) Special group of movie lens users plus childrenData users that rated at least two movies

  • Data used in this experiment for training and testing
In [37]:
movie_lens_special_users_plus_childrenData_train_2_above_ratings = pd.concat([movie_lens_special_users_ratings, dchildrenData_train_users_2_above_ratings])

movie_lens_special_users_plus_childrenData_train_2_above_ratings.to_csv("../data/output/movie_lens_rated_2_kids_plus_childrenData_users_2_above_ratings.csv", index = False, header = False, sep = ":")

# Used the same dataset containing childrenData users that rated at least 2 children movies for testing

(ii) Special group of movie lens users plus childrenData users that rated at least 10 movies

  • Data used in this experiment for training and testing
In [38]:
movie_lens_special_users_plus_childrenData_train_10_above_ratings = pd.concat([movie_lens_special_users_ratings, childrenData_train_users_10_above_ratings])

movie_lens_special_users_plus_childrenData_train_10_above_ratings.to_csv("../data/output/movie_lens_rated_2_kids_plus_childrenData_users_10_above_ratings.csv", index = False, header = False, sep = ":")

# Used the same dataset containing childrenData users that rated at least 10 children movies for testing
In [39]:
len(movie_lens_special_users_plus_childrenData_train_10_above_ratings)
Out[39]:
1003956

(iii) Special group of movie lens users plus childrenData users that rated at least 20 movies

  • Data used in this experiment for training and testing
In [40]:
movie_lens_special_users_plus_childrenData_train_20_above_ratings = pd.concat([movie_lens_special_users_ratings, childrenData_train_users_20_above_ratings])

movie_lens_special_users_plus_childrenData_train_20_above_ratings.to_csv("../data/output/movie_lens_rated_2_kids_plus_childrenData_users_20_above_ratings.csv", index = False, header = False, sep = ":")

# Used the same dataset containing childrenData users that rated at least 20 children movies for testing
In [41]:
len(movie_lens_special_users_plus_childrenData_train_20_above_ratings)
Out[41]:
1000405

Note:

Check /data/output for all exported datasets