import pandas as pd

try:
    # scikit-learn >= 0.18 provides train_test_split in `model_selection`;
    # the legacy `cross_validation` module was removed in 0.20. Alias it so
    # existing `cross_validation.train_test_split(...)` calls keep working.
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation
A. Children's movie ratings
# Load the children's movie ratings. Rows are "::"-delimited with four fields;
# only user, item and ratings are kept (the fourth field is discarded).
# A multi-character separator is only supported by pandas' Python parser, so
# the engine is requested explicitly instead of relying on the implicit
# fallback, which emits a ParserWarning.
childrenData = pd.read_csv(
    "../data/input/childrenData/ratings.dat",
    sep="::",
    names=["user", "item", "ratings", "x"],
    engine="python",
)
childrenData = childrenData[["user", "item", "ratings"]]
# Shift the user ids by a fixed offset (vectorised instead of a Python loop).
# NOTE(review): presumably this keeps children's user ids from colliding with
# MovieLens user ids once the two frames are concatenated -- confirm.
childrenData["user"] = childrenData["user"] + 1000000
childrenData.head()
# Distinct item and user ids present in the children's ratings.
children_movies = set(childrenData.item)
children_users = set(childrenData.user)
childrenData.describe().style.format({'user': "{:4.0f}", 'item': "{:4.0f}", 'ratings': "{:4.0f}"})
B. Movie Lens
# Load the MovieLens 1M ratings ("::"-delimited: user, item, ratings,
# timestamp) and drop the timestamp column.
# The Python parser engine is requested explicitly because multi-character
# separators are not supported by the C engine; this avoids the ParserWarning
# raised by the implicit fallback.
movie_lens_data = pd.read_csv(
    "../data/input/ML1M/ratings.dat",
    sep="::",
    names=["user", "item", "ratings", "timestamp"],
    engine="python",
)
movie_lens_data = movie_lens_data[["user", "item", "ratings"]]
movie_lens_data.describe().style.format({'user': "{:4.0f}", 'item': "{:4.0f}", 'ratings': "{:4.0f}"})
# Hold out 40% of the children's ratings as a test set (fixed seed so the
# split is reproducible). The user column is passed as a second array only
# because the call unpacks four results; yt / yy are its train / test parts.
childrenData_train, childrenData_test, yt, yy = cross_validation.train_test_split(
    childrenData,
    childrenData["user"],
    test_size=0.4,
    random_state=0,
)
len(childrenData_train), len(childrenData_test)
len(set(childrenData_train.user)), len(set(childrenData_train.item))
len(set(childrenData_test.user)), len(set(childrenData_test.item))

# Distinct users / items on each side of the split.
childrenData_train_users = set(childrenData_train.user)
childrenData_test_users = set(childrenData_test.user)
childrenData_train_items = set(childrenData_train.item)
childrenData_test_items = set(childrenData_test.item)

# Users / items that only occur in the test portion.
user_in_test_not_train = childrenData_test_users.difference(childrenData_train_users)
item_in_test_not_train = childrenData_test_items.difference(childrenData_train_items)

# Keep only test rows whose user AND item both occur in the train set, i.e.
# drop every row that mentions a test-only user or a test-only item.
childrenData_test = childrenData_test[
    ~(
        childrenData_test.user.isin(user_in_test_not_train)
        | childrenData_test.item.isin(item_in_test_not_train)
    )
]
len(childrenData_test)
childrenData_test.to_csv(
    "../data/output/childrenData_test.csv",
    sep=":",
    header=False,
    index=False,
)
- This dataset includes all of movie lens data along with different variations of childrenData (based on the minimum number
of ratings).
# Get the summary of the total number of items rated by each user in childrenData.
# The counts are taken over the full childrenData frame (not the train or test
# portion); one row per user with the count in "number_of_ratings".
childrenData_users_ratings_summary = (
    childrenData.groupby(["user"])["ratings"]
    .count()
    .reset_index()
    .rename(columns={"ratings": "number_of_ratings"})
)
# Keep only users with more than one rating.
childrenData_users_ratings_summary = childrenData_users_ratings_summary[
    childrenData_users_ratings_summary.number_of_ratings > 1
].reset_index(drop=True)
childrenData_users_ratings_summary.head()
(i) All movie lens data plus childrenData users that rated at least 2 movies
# Users from the rating summary with at least 2 ratings.
childrenData_users_2_above_ratings = childrenData_users_ratings_summary[
    childrenData_users_ratings_summary.number_of_ratings >= 2
].reset_index(drop=True)

# Restrict the train and test portions to those users.
childrenData_train_users_2_above_ratings = childrenData_train[
    childrenData_train["user"].isin(childrenData_users_2_above_ratings.user)
]
childrenData_test_users_2_above_ratings = childrenData_test[
    childrenData_test["user"].isin(childrenData_users_2_above_ratings.user)
]
childrenData_test_users_2_above_ratings.to_csv(
    "../data/output/childrenData_test_set_2_above_ratings.csv",
    index=False,
    header=False,
    sep=":",
)

# Training set: all MovieLens ratings plus the selected children's train rows.
movie_lens_data_plus_childrenData_train_2_above_ratings = pd.concat(
    [movie_lens_data, childrenData_train_users_2_above_ratings]
)
# Same selection as childrenData_test_users_2_above_ratings, kept under this
# second name as a separate frame.
childrenData_test_set_above_2_ratings = childrenData_test[
    childrenData_test["user"].isin(childrenData_users_2_above_ratings.user)
]
len(movie_lens_data_plus_childrenData_train_2_above_ratings)
# Write to the "2_above" file. The "10_above" path is used by the >= 10
# variant, so writing there would produce a mislabeled file that is later
# overwritten, leaving no export for this variant.
movie_lens_data_plus_childrenData_train_2_above_ratings.to_csv(
    "../data/output/all_movie_lens_plus_childrenData_users_2_above_ratings.csv",
    index=False,
    header=False,
    sep=":",
)
(ii) All movie lens data plus childrenData users that rated at least 10 movies
# Users from the rating summary with at least 10 ratings.
childrenData_users_10_above_ratings = childrenData_users_ratings_summary.loc[
    childrenData_users_ratings_summary.number_of_ratings >= 10
].reset_index(drop=True)

# Train / test rows belonging to those users.
childrenData_train_users_10_above_ratings = childrenData_train.loc[
    childrenData_train.user.isin(childrenData_users_10_above_ratings.user)
]
childrenData_test_users_10_above_ratings = childrenData_test.loc[
    childrenData_test.user.isin(childrenData_users_10_above_ratings.user)
]
childrenData_test_users_10_above_ratings.to_csv(
    "../data/output/childrenData_test_set_10_above_ratings.csv",
    sep=":",
    header=False,
    index=False,
)

# Training set: every MovieLens rating followed by the selected children's
# train rows.
movie_lens_data_plus_childrenData_train_10_above_ratings = pd.concat(
    [
        movie_lens_data,
        childrenData_train_users_10_above_ratings,
    ]
)
# Second, independently built frame with the same test-row selection.
childrenData_test_set_above_10_ratings = childrenData_test.loc[
    childrenData_test.user.isin(childrenData_users_10_above_ratings.user)
]
len(movie_lens_data_plus_childrenData_train_10_above_ratings)
movie_lens_data_plus_childrenData_train_10_above_ratings.to_csv(
    "../data/output/all_movie_lens_plus_childrenData_users_10_above_ratings.csv",
    sep=":",
    header=False,
    index=False,
)
(iii) All movie lens data plus childrenData users that rated at least 20 movies
# Users from the rating summary with at least 20 ratings.
childrenData_users_20_above_ratings = childrenData_users_ratings_summary.loc[
    childrenData_users_ratings_summary.number_of_ratings >= 20
].reset_index(drop=True)

# Train / test rows belonging to those users.
childrenData_train_users_20_above_ratings = childrenData_train.loc[
    childrenData_train.user.isin(childrenData_users_20_above_ratings.user)
]
childrenData_test_users_20_above_ratings = childrenData_test.loc[
    childrenData_test.user.isin(childrenData_users_20_above_ratings.user)
]
childrenData_test_users_20_above_ratings.to_csv(
    "../data/output/childrenData_test_set_20_above_ratings.csv",
    sep=":",
    header=False,
    index=False,
)

# Training set: every MovieLens rating followed by the selected children's
# train rows.
movie_lens_data_plus_childrenData_train_20_above_ratings = pd.concat(
    [
        movie_lens_data,
        childrenData_train_users_20_above_ratings,
    ]
)
# Second, independently built frame with the same test-row selection.
childrenData_test_set_above_20_ratings = childrenData_test.loc[
    childrenData_test.user.isin(childrenData_users_20_above_ratings.user)
]
len(movie_lens_data_plus_childrenData_train_20_above_ratings)
movie_lens_data_plus_childrenData_train_20_above_ratings.to_csv(
    "../data/output/all_movie_lens_plus_childrenData_users_20_above_ratings.csv",
    sep=":",
    header=False,
    index=False,
)
- This dataset includes the movie ratings provided by MovieLens users who rated at least two children's movies, along with different variations of childrenData (based on the minimum number of ratings).
# Collect, for every MovieLens user, the distinct items they rated.
# Grouping with the default as_index=True makes ["item"] a per-group Series,
# so each group can be turned into a set directly; this avoids depending on
# whether pandas hands the callback a DataFrame or a Series when
# as_index=False is combined with a single-column selection.
# reset_index() restores "user" as a regular column.
movie_lens_user_df = (
    movie_lens_data.groupby("user")["item"].apply(set).reset_index()
)
movie_lens_user_df["item"] = [list(x) for x in movie_lens_user_df.item]

# Flag users who rated at least one item that also appears in childrenData.
movie_lens_user_df["has_rated_child_movies"] = [
    not children_movies.isdisjoint(items) for items in movie_lens_user_df.item
]

# .copy() makes the filtered frame independent of movie_lens_user_df, so the
# column assignments below do not raise SettingWithCopyWarning.
movie_lens_user_df_has_rated_child = movie_lens_user_df[
    movie_lens_user_df.has_rated_child_movies
].copy()
movie_lens_user_df_has_rated_child["child_movies"] = [
    [x for x in y if x in children_movies]
    for y in movie_lens_user_df_has_rated_child.item
]
movie_lens_user_df_has_rated_child.head(5)
movie_lens_user_df_has_rated_child["num_child_movies"] = [
    len(set(x)) for x in movie_lens_user_df_has_rated_child.child_movies
]
special_movie_lens_users_df = movie_lens_user_df_has_rated_child[["user", "num_child_movies"]]
# We want only users that have rated at least 2 children's movies
special_movie_lens_users_df = special_movie_lens_users_df[
    special_movie_lens_users_df.num_child_movies >= 2
]
special_movie_lens_users_df.head()
# All MovieLens ratings (any item) made by those selected users.
movie_lens_special_users_ratings = movie_lens_data[
    movie_lens_data["user"].isin(special_movie_lens_users_df.user)
]
(i) Special group of movie lens users plus childrenData users that rated at least two movies
# Training set: ratings of the special MovieLens users followed by the
# children's train rows of users with at least 2 ratings.
movie_lens_special_users_plus_childrenData_train_2_above_ratings = pd.concat(
    [movie_lens_special_users_ratings, childrenData_train_users_2_above_ratings]
)
movie_lens_special_users_plus_childrenData_train_2_above_ratings.to_csv(
    "../data/output/movie_lens_rated_2_kids_plus_childrenData_users_2_above_ratings.csv",
    index=False,
    header=False,
    sep=":",
)
# Used the same dataset containing childrenData users that rated at least 2 children movies for testing
(ii) Special group of movie lens users plus childrenData users that rated at least 10 movies
# Training set: ratings of the special MovieLens users followed by the
# children's train rows of users with at least 10 ratings.
movie_lens_special_users_plus_childrenData_train_10_above_ratings = pd.concat(
    [
        movie_lens_special_users_ratings,
        childrenData_train_users_10_above_ratings,
    ]
)
movie_lens_special_users_plus_childrenData_train_10_above_ratings.to_csv(
    "../data/output/movie_lens_rated_2_kids_plus_childrenData_users_10_above_ratings.csv",
    sep=":",
    header=False,
    index=False,
)
# The matching test set is the childrenData test export for users with at
# least 10 ratings; no separate test file is written here.
len(movie_lens_special_users_plus_childrenData_train_10_above_ratings)
(iii) Special group of movie lens users plus childrenData users that rated at least 20 movies
# Training set: ratings of the special MovieLens users followed by the
# children's train rows of users with at least 20 ratings.
movie_lens_special_users_plus_childrenData_train_20_above_ratings = pd.concat(
    [
        movie_lens_special_users_ratings,
        childrenData_train_users_20_above_ratings,
    ]
)
movie_lens_special_users_plus_childrenData_train_20_above_ratings.to_csv(
    "../data/output/movie_lens_rated_2_kids_plus_childrenData_users_20_above_ratings.csv",
    sep=":",
    header=False,
    index=False,
)
# The matching test set is the childrenData test export for users with at
# least 20 ratings; no separate test file is written here.
len(movie_lens_special_users_plus_childrenData_train_20_above_ratings)
Check /data/output for all exported datasets