Skip to content

Instantly share code, notes, and snippets.

@reachsumit
Created November 7, 2022 02:41
Show Gist options
  • Save reachsumit/a02a83fbb3ae5e293fde4b90e3a319d7 to your computer and use it in GitHub Desktop.
Save reachsumit/a02a83fbb3ae5e293fde4b90e3a319d7 to your computer and use it in GitHub Desktop.
Deep Learning Recommendation Model
Display the source blob
Display the rendered blob
Raw
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import torch\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nimport torch.nn as nn\n\nfrom itertools import combinations\nfrom scipy.sparse import coo_matrix\nfrom scipy.stats import rankdata\nfrom sklearn.preprocessing import StandardScaler","metadata":{"execution":{"iopub.status.busy":"2022-11-05T10:11:43.408638Z","iopub.execute_input":"2022-11-05T10:11:43.409197Z","iopub.status.idle":"2022-11-05T10:11:45.540036Z","shell.execute_reply.started":"2022-11-05T10:11:43.409088Z","shell.execute_reply":"2022-11-05T10:11:45.538996Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"device = 'cuda' if torch.cuda.is_available() else 'cpu'\nPAD_IDX = 0","metadata":{"execution":{"iopub.status.busy":"2022-11-05T10:11:45.542202Z","iopub.execute_input":"2022-11-05T10:11:45.542613Z","iopub.status.idle":"2022-11-05T10:11:45.612188Z","shell.execute_reply.started":"2022-11-05T10:11:45.542586Z","shell.execute_reply":"2022-11-05T10:11:45.610774Z"},"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"code","source":"# purpose: convert target with index of movie to series of all zeros and one in place of index\n# We will use this to compute the expected output of the model to be compared with actual output\ndef idx_to_sparse(idx, sparse_dim):\n sparse = np.zeros(sparse_dim) # vector of 1683 zeroes\n sparse[int(idx)] = 1 # set a given index to 1\n return pd.Series(sparse, dtype=int) # make a pandas series of 0s and 1s\n\n\n# Calculate accuracy (a classification metric)\ndef accuracy_fn(y_true, y_pred):\n correct = torch.eq(y_true, y_pred).sum().item() 
# torch.eq() calculates where two tensors are equal\n acc = (correct / len(y_pred)) * 100 \n return acc\n\n# r,c = get_coo_indexes(dataset['prev movies'].tolist())\n# print(len(r), len(c))\n# 10150406 10150406\n# print(r[:11], c[:11])\n# [0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4] ['168', '168', '172', '168', '172', '165', '168', '172', '165', '156', '168']\n# basically the information that row0 has 168, row1 has 168 and 172, row2 has 168, 172, 165 and so on..\n# note that the length of first list represents number of \"1s\", while zip(first,second) gives row, col indices that should be one\ndef get_coo_indexes(lil):\n rows = []\n cols = []\n for i, el in enumerate(lil):\n if type(el)!=list:\n el = [el]\n for j in el:\n rows.append(i)\n cols.append(j)\n return rows, cols\n\n\n# This function creates a sparse matrix given the \"prev movies\" column\ndef get_sparse_features(series, shape):\n # get row, column pairs such that column value represents the watched movie\n coo_indexes = get_coo_indexes(series.tolist())\n # Create a matrix of 0s and 1s of size original dataset rows and number of movies as columns; then convert it into coord based sparse matrix\n # sparse matrix would be of the size tuple (original rows count x number of movies); matrix starts with 1; we keep one extra column because movie id starts with 1 in the dataset\n # In the tuple, first argument specifies the number of 1s to be put in the sparse matrix, the second item (another tuple) specifies the row and column indexes for the positions where each corresponding value, i.e. 
1 should be placed in the sparse matrix\n sparse_df = coo_matrix((np.ones(len(coo_indexes[0])), (coo_indexes[0], coo_indexes[1])), shape=shape)\n return sparse_df\n\n\n# purpose: convert indexes of previous watched movies to series of films indexes\n# given a sparse matrix input, this function returns a corresponding padded 2D matrix\n# We use this to make binary features for the model training and testing\ndef sparse_to_idx(data, pad_idx=-1, max_length=None):\n # Returns a tuple of arrays (row,col) containing the indices of the non-zero elements of the matrix.\n indexes = data.nonzero()\n # for prev_movies_train, this dataset will be 7957390 rows × 2 columns because of repeating values of rows\n indexes_df = pd.DataFrame()\n indexes_df['rows'] = indexes[0]\n indexes_df['cols'] = indexes[1]\n \n # group by the rows, and make a list of all the corresponding columns\n # rows\n # 0 [255, 286, 298, 185, 173]\n # 1 [255, 286, 298, 185, 173, 772, 108]\n # 2 [255, 286, 298, 185, 173, 772]\n # 3 [255, 286, 298, 185, 173, 772, 108, 288]\n mdf = indexes_df.groupby('rows').apply(lambda x: x['cols'].tolist())\n max_len = mdf.apply(lambda x: len(x)).max() if max_length is None else max_length# longest list is 736 sized\n return max_len, mdf.apply(lambda x: pd.Series(x + [pad_idx] * (max_len - len(x)))).values # pad zeroes in the list upto 736 values; this result is (76228, 736) shaped","metadata":{"execution":{"iopub.status.busy":"2022-11-05T10:11:45.614025Z","iopub.execute_input":"2022-11-05T10:11:45.614614Z","iopub.status.idle":"2022-11-05T10:11:45.629650Z","shell.execute_reply.started":"2022-11-05T10:11:45.614576Z","shell.execute_reply":"2022-11-05T10:11:45.628739Z"},"trusted":true},"execution_count":3,"outputs":[]},{"cell_type":"code","source":"def load_and_process_data_dlrm():\n #Load the Ratings data\n data = pd.read_csv('../input/movielens-100k-dataset/ml-100k/u.data', sep=\"\\t\", header=None)\n data.columns = ['user id', 'movie id', 'rating', 'timestamp']\n #Load the 
User data\n users = pd.read_csv('../input/movielens-100k-dataset/ml-100k/u.user', sep=\"|\", encoding='latin-1', header=None)\n users.columns = ['user id', 'age', 'gender', 'occupation', 'zip code']\n #Load movie data\n items = pd.read_csv('../input/movielens-100k-dataset/ml-100k/u.item', \n sep=\"|\", encoding='latin-1', header=None)\n items.columns = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', \n 'unknown', 'Action', 'Adventure', 'Animation', 'Children\\'s', 'Comedy', \n 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', \n 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']\n GENRES = pd.read_csv('../input/movielens-100k-dataset/ml-100k/u.genre', \n sep=\"|\", header=None, usecols=[0])[0].tolist()\n \n # Sort the dataset by user-id and time\n dataset = data.sort_values(['user id', 'timestamp']).reset_index(drop=True)\n dataset['one'] = 1 # add a column containing all 1s\n dataset['sample_num'] = dataset.groupby('user id')['one'].cumsum() # use the 1s column to create a sample number for each user\n # Create a target column by shifting movie-id for each user-id one step back, effectively this means that we have a column that has id for the next movie the user is going to watch \n # (it is NaN for the row representing the last movie the user watches). We will predict this column.\n dataset['target'] = dataset.groupby('user id')['movie id'].shift(-1)\n # create a column that represents average movie rating given by user till that time (represented by row)\n dataset['mean_rate'] = dataset.groupby('user id')['rating'].cumsum() / dataset['sample_num']\n \n # Create a column that has a list of movies that the user has watched so far. 
We will create sparse vector and embedding vectors from this later on.\n dataset['prev movies'] = dataset['movie id'].apply(lambda x: str(x))\n dataset['prev movies'] = dataset.groupby('user id')['prev movies'].apply(lambda x: (x + ' ').cumsum().str.strip())\n dataset['prev movies'] = dataset['prev movies'].apply(lambda x: x.split())\n \n # do a left join with movies dataframe and bring all the genre representations (0/1 binary values for each movie representing its category) here.\n dataset = dataset.merge(items[['movie id'] + GENRES], on='movie id', how='left')\n \n # For each genre column (19) creates another column (total 19 more). This column represents a given user's mean score (float value) for a given genre till that time (represented by row).\n # Note that we also update the genre columns such that each column now has cumulative sum, i.e. the corresponding number of movies that the user has watched in that genre so far.\n for genre in GENRES:\n dataset[f'{genre}_rate'] = dataset[genre]*dataset['rating']\n dataset[genre] = dataset.groupby('user id')[genre].cumsum()\n dataset[f'{genre}_rate'] = dataset.groupby('user id')[f'{genre}_rate'].cumsum() / dataset[genre]\n \n # Next we normalize the scores for movies in each genre such that we divide it by the number of movies that the user has watched so far.\n dataset[GENRES] = dataset[GENRES].apply(lambda x: x / dataset['sample_num'])\n # do a left-join on users data and get more information on users\n dataset = dataset.merge(users, on='user id', how='left')\n \n occupations_categoricals = dataset['occupation'].unique().tolist()\n\n dataset['gender'] = (dataset['gender'] == 'M').astype(int) # change gender to 0/1 integer\n dataset = pd.concat([dataset.drop(['occupation'], axis=1), pd.get_dummies(dataset[['occupation']], prefix=\"\", prefix_sep=\"\")], axis=1) # get occupation dummy variables and drop occupation column\n dataset.drop('zip code', axis=1, inplace=True)\n \n COLD_START_TRESH = 5 # take the rows AFTER 
each user has watched at least 4 movies\n # filter using threshold and remove null target rows\n filtred_data = dataset[(dataset['sample_num'] >= COLD_START_TRESH) &\n ~(dataset['target'].isna())].sort_values('timestamp')\n \n continuous_cols = ['age', 'gender', 'mean_rate'] + GENRES + [gen+\"_rate\" for gen in GENRES] # 41\n df_deep = filtred_data[continuous_cols] # this and embeddings will be fed to the deep part\n \n scaler = StandardScaler()\n pd.options.mode.chained_assignment = None\n for cc in continuous_cols:\n df_deep[cc] = scaler.fit_transform(df_deep[cc].values.reshape(-1,1))\n \n TEST_SIZE = 0.2 # size of test set\n X_train_deep, X_test_deep = df_deep[:int(len(df_deep)*(1-TEST_SIZE))], df_deep[int(len(df_deep)*(1-TEST_SIZE)):]\n\n filtered_train_data, filtered_test_data = filtred_data[:int(len(filtred_data)*(1-TEST_SIZE))], filtred_data[int(len(filtred_data)*(1-TEST_SIZE)):]\n y_train, y_test = filtered_train_data['target'], filtered_test_data['target']\n \n # create sparse matrix out of prev_movies column for both train and test sets\n prev_movies_train = get_sparse_features(filtered_train_data['prev movies'], (len(filtered_train_data), filtred_data['movie id'].max()+1))\n prev_movies_test = get_sparse_features(filtered_test_data['prev movies'], (len(filtered_test_data), filtred_data['movie id'].max()+1))\n \n # Train part\n # tensor with binary features\n # to get embeddings for sequence of indexes\n # We use this as index for nn Embeddings and feed that along with X_train_tensor to deep part\n max_train_len, movies_train_idx = sparse_to_idx(prev_movies_train, pad_idx=PAD_IDX)\n movies_train_idx = torch.Tensor(movies_train_idx).long().to(device)\n \n _, movies_test_idx = sparse_to_idx(prev_movies_test, pad_idx=PAD_IDX, max_length=max_train_len)\n movies_test_idx = torch.Tensor(movies_test_idx).long().to(device)\n \n # target\n target_train = torch.Tensor(y_train.values).long().to(device)\n target_test = torch.Tensor(y_test.values).long().to(device)\n 
target_test_sparse = y_test.apply(lambda x: idx_to_sparse(x, items['movie id'].nunique() + 1)) # to calculate mean rank over test set during training\n \n # tensor with continuous features\n X_train_deep_tensor = torch.Tensor(X_train_deep.fillna(0).values).to(device)\n X_test_deep_tensor = torch.Tensor(X_test_deep.fillna(0).values).to(device)\n \n return X_train_deep_tensor, X_test_deep_tensor, movies_train_idx, movies_test_idx, target_train, target_test, target_test_sparse, items['movie id'].nunique() + 1\n\nclass DLRM(nn.Module):\n def __init__(self, embed_dim, embed_size, deep_dim, n_fields, n_class, pad_idx=0, interaction_op=\"cat\"):\n super().__init__()\n self.embedding = nn.Embedding(embed_dim, embed_size, padding_idx=pad_idx, device=device)\n \n self.bottom_mlp_stack = nn.Sequential(\n nn.Linear(deep_dim, 1024, device=device),\n nn.ReLU(),\n nn.Linear(1024, 512, device=device),\n nn.ReLU(),\n nn.Linear(512, embed_size, device=device),\n nn.ReLU()\n )\n self.interaction_op = interaction_op # [\"dot\", \"cat\"]\n if self.interaction_op == \"dot\":\n p, q = zip(*list(combinations(range(n_fields), 2)))\n self.field_p = nn.Parameter(torch.LongTensor(p), requires_grad=False)\n self.field_q = nn.Parameter(torch.LongTensor(q), requires_grad=False)\n self.interaction_units = int(n_fields * (n_fields - 1) / 2)\n self.upper_triange_mask = nn.Parameter(torch.triu(torch.ones(n_fields, n_fields), 1).type(torch.ByteTensor),\n requires_grad=False)\n # torchrec style implementation (as an alternative to above)\n # self.triu_indices: torch.Tensor = torch.triu_indices(\n # self.n_fields + 1, self.n_fields + 1, offset=1\n # )\n self.top_input_dim = (n_fields * (n_fields - 1)) // 2 + embed_size\n elif self.interaction_op == \"cat\":\n self.top_input_dim = (n_fields+1) * embed_size\n \n self.top_mlp_stack = nn.Sequential(\n nn.Linear(self.top_input_dim, 1024, device=device),\n nn.ReLU(),\n nn.Linear(1024, 512, device=device),\n nn.ReLU(),\n nn.Linear(512, n_class, 
device=device),\n nn.ReLU()\n )\n\n def forward(self, X_d, X_sparse_idx):\n embed_x = self.embedding(X_sparse_idx) # movies_train_idx\n \n # bottom mlp\n dense_out = self.bottom_mlp_stack(X_d).unsqueeze(1)\n feat_emb = torch.cat([embed_x, dense_out], dim=1)\n \n # interaction\n if self.interaction_op == \"dot\":\n inner_product_matrix = torch.bmm(feat_emb, feat_emb.transpose(1, 2))\n flat_upper_triange = torch.masked_select(inner_product_matrix, self.upper_triange_mask)\n interact_out = flat_upper_triange.view(-1, self.interaction_units)\n # torchrec style implementation (as an alternative to above)\n # interactions = torch.bmm(\n # feat_emb, torch.transpose(feat_emb, 1, 2)\n # )\n # interactions_flat = interactions[:, self.triu_indices[0], self.triu_indices[1]]\n # interact_out = torch.cat((dense_out, interactions_flat), dim=1)\n else:\n interact_out = feat_emb.flatten(start_dim=1) # torch.Size([76228, 11792]) 737*16\n \n # top mlp\n output = self.top_mlp_stack(interact_out)\n return output\n\ndef run_gradient_descent_dlrm(model,\n learning_rate=1e-3,\n weight_decay=0.01,\n num_epochs=10):\n loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX) # the model doesn't need to predict padding index\n optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)\n \n iters, train_losses, test_losses, mean_test_ranks = [], [], [], []\n \n # training\n n = 0 # the number of iterations\n for epoch in range(num_epochs):\n model.train()\n y_logits = model(X_train_deep_tensor, movies_train_idx)\n loss_train = loss_fn(y_logits, target_train)\n\n # Backpropagation\n optimizer.zero_grad() # a clean up step for PyTorch\n loss_train.backward() # compute updates for each parameter\n optimizer.step() # make the updates for each parameter\n\n # save the current training information\n if n%100 == 0:\n pred_train = torch.softmax(y_logits, dim=1).argmax(dim=1)\n acc = accuracy_fn(y_true=target_train, y_pred=pred_train)\n \n model.eval()\n with 
torch.inference_mode():\n test_logits = model(X_test_deep_tensor, movies_test_idx)\n test_pred = torch.softmax(test_logits, dim=1).argmax(dim=1)\n loss_test = loss_fn(test_logits, target_test)\n test_acc = accuracy_fn(y_true=target_test,y_pred=test_pred)\n \n # calculate mean rank on test set\n softmax = nn.Softmax(dim=0)\n preds_wnd = softmax(test_logits.float()).cpu().detach().numpy()\n ranks_wnd = pd.DataFrame(preds_wnd).apply(lambda x: pd.Series(rankdata(-x)), axis=1)\n ranks_target_wnd = (ranks_wnd.values * target_test_sparse).sum(axis=1)\n mean_rank_wnd = ranks_target_wnd.mean()\n \n print(f\"Epoch: {epoch} | Loss: {loss_train:.5f}, Acc: {acc:.2f}% | Test Loss: {loss_test:.5f}, Test Acc: {test_acc:.2f}% Test mean rank: {mean_rank_wnd:.0f}\")\n \n iters.append(n)\n train_losses.append(float(loss_train))\n test_losses.append(float(loss_test))\n mean_test_ranks.append(mean_rank_wnd)\n \n # increment the iteration number\n n += 1\n \n # plotting\n plt.figure(figsize=(12, 8), dpi=100)\n plt.title(f\"Training Curve (lr={learning_rate})\")\n plt.plot(iters, train_losses, label=\"Train Loss\")\n plt.plot(iters, test_losses, label=\"Test Loss\")\n plt.xlabel(\"Iterations\")\n plt.ylabel(\"Loss\")\n plt.legend(loc='best')\n plt.show()\n \n plt.figure(figsize=(12, 8), dpi=100)\n plt.plot(iters, mean_test_ranks, label=\"Test Rank\")\n plt.xlabel(\"Iterations\")\n plt.ylabel(\"Mean Rank on testset\")\n plt.legend(loc='best')\n plt.show()\n \n return model, iters, train_losses, test_losses","metadata":{"execution":{"iopub.status.busy":"2022-11-05T10:11:45.631505Z","iopub.execute_input":"2022-11-05T10:11:45.631947Z","iopub.status.idle":"2022-11-05T10:11:45.676732Z","shell.execute_reply.started":"2022-11-05T10:11:45.631915Z","shell.execute_reply":"2022-11-05T10:11:45.675849Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"X_train_deep_tensor, X_test_deep_tensor, movies_train_idx, movies_test_idx, target_train, target_test, 
target_test_sparse, n_classes = load_and_process_data_dlrm()","metadata":{"execution":{"iopub.status.busy":"2022-11-05T10:11:45.679835Z","iopub.execute_input":"2022-11-05T10:11:45.680539Z","iopub.status.idle":"2022-11-05T10:12:50.244892Z","shell.execute_reply.started":"2022-11-05T10:11:45.680487Z","shell.execute_reply":"2022-11-05T10:12:50.243751Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"dlrm_model = DLRM(\n embed_dim=n_classes,\n embed_size=16,\n deep_dim=X_train_deep_tensor.shape[1],\n n_fields=movies_train_idx.shape[1],\n n_class=n_classes,) # randomly chosen","metadata":{"execution":{"iopub.status.busy":"2022-11-05T10:12:50.246730Z","iopub.execute_input":"2022-11-05T10:12:50.247150Z","iopub.status.idle":"2022-11-05T10:12:50.261991Z","shell.execute_reply.started":"2022-11-05T10:12:50.247109Z","shell.execute_reply":"2022-11-05T10:12:50.260721Z"},"trusted":true},"execution_count":6,"outputs":[]},{"cell_type":"code","source":"dlrm_model_trained, iters, train_losses, test_losses = run_gradient_descent_dlrm(dlrm_model, num_epochs=1000, weight_decay=0, learning_rate=0.03)","metadata":{"execution":{"iopub.status.busy":"2022-11-05T10:12:50.307143Z","iopub.execute_input":"2022-11-05T10:12:50.308132Z","iopub.status.idle":"2022-11-05T10:34:36.763541Z","shell.execute_reply.started":"2022-11-05T10:12:50.308096Z","shell.execute_reply":"2022-11-05T10:34:36.762617Z"},"trusted":true},"execution_count":11,"outputs":[{"name":"stdout","text":"Epoch: 0 | Loss: 7.42922, Acc: 0.08% | Test Loss: 539.31897, Test Acc: 0.44% Test mean rank: 1155\nEpoch: 100 | Loss: 6.56626, Acc: 1.23% | Test Loss: 7.09442, Test Acc: 0.52% Test mean rank: 922\nEpoch: 200 | Loss: 6.50729, Acc: 1.69% | Test Loss: 7.09041, Test Acc: 0.52% Test mean rank: 891\nEpoch: 300 | Loss: 6.47858, Acc: 1.93% | Test Loss: 7.07938, Test Acc: 0.52% Test mean rank: 791\nEpoch: 400 | Loss: 6.45101, Acc: 2.27% | Test Loss: 7.05669, Test Acc: 0.52% Test mean rank: 757\nEpoch: 500 | 
Loss: 6.43888, Acc: 2.59% | Test Loss: 7.06469, Test Acc: 0.52% Test mean rank: 749\nEpoch: 600 | Loss: 6.43282, Acc: 2.76% | Test Loss: 7.05829, Test Acc: 0.52% Test mean rank: 741\nEpoch: 700 | Loss: 6.42656, Acc: 2.93% | Test Loss: 7.06306, Test Acc: 0.52% Test mean rank: 739\nEpoch: 800 | Loss: 6.42380, Acc: 2.97% | Test Loss: 7.05855, Test Acc: 0.52% Test mean rank: 767\nEpoch: 900 | Loss: 6.41983, Acc: 3.33% | Test Loss: 7.06801, Test Acc: 0.52% Test mean rank: 793\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"<Figure size 1200x800 with 1 Axes>","image/png":"\n"},"metadata":{"needs_background":"light"}},{"output_type":"display_data","data":{"text/plain":"<Figure size 1200x800 with 1 Axes>","image/png":"\n"},"metadata":{"needs_background":"light"}}]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment