Last active
December 12, 2021 15:16
-
-
Save ita9naiwa/1999469f0ccbc9e4fef790fa51504b98 to your computer and use it in GitHub Desktop.
alpha-beta-NDCG
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "ee9fbdce", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
def rowwise_norm(arr, norm='mean'):
    """Scale each row of a 2-D array so that its entries sum to (almost) one.

    Parameters
    ----------
    arr : array-like, shape (n_rows, n_cols)
        Values to normalise, one row per entity. Anything accepted by
        ``np.asarray`` works (ndarray, nested lists, ...).
    norm : str, optional
        Accepted for backward compatibility only. The value is ignored:
        every row is always divided by its own sum.

    Returns
    -------
    numpy.ndarray
        ``float32`` array with the same shape as ``arr``. A small epsilon is
        added to every row sum, so an all-zero row stays all zero instead of
        producing NaNs.
    """
    import numpy as np  # local import keeps the function usable on its own

    values = np.asarray(arr, dtype=np.float32)
    # keepdims=True keeps the sums as a column so the division broadcasts per row.
    row_sums = values.sum(axis=1, keepdims=True)
    return (values / (1e-10 + row_sums)).astype(np.float32, copy=False)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "f758b56e", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from tqdm.auto import tqdm\n", | |
"\n", | |
"\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from scipy.sparse import coo_matrix, csr_matrix\n", | |
"from implicit import evaluation" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "538195b2", | |
"metadata": {}, | |
"source": [ | |
"### Data Preparation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "b756a329", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# Read the rating table as integers; the code below uses the first three
# columns as user id, item id and rating.
ratings = pd.read_csv(
    "data/ml-1m/ratings.dat",
    sep='::',
    engine='python',
    encoding='ISO-8859-1',
    header=None,
).to_numpy().astype(int)

# Ids in the file are shifted down by one to become zero-based row/column indices.
user_idx = ratings[:, 0] - 1
item_idx = ratings[:, 1] - 1
rating_values = ratings[:, 2]

# Binarise: only ratings of 4 or more are kept as interactions.
ui_mat = csr_matrix((rating_values, (user_idx, item_idx))) >= 4
ui_mat.eliminate_zeros()
ui_mat.data[:] = 1
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "9eefcb4e", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# Split the interactions 50/50 into train and test. The seed is fixed so that
# a fresh "Restart & Run All" reproduces the same split.
SPLIT_SEED = 42
tr, te = evaluation.train_test_split(ui_mat, train_percentage=0.5, random_state=SPLIT_SEED)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "4904684b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"n_users, n_items = tr.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "8ba29d2d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# Movie metadata: one row per movie with a '|'-separated genre string.
movies = pd.read_csv("data/ml-1m/movies.dat", sep='::', engine='python', encoding='ISO-8859-1', header=None)
movies.columns = ['id', 'title', 'genre']

# movie id -> list of genre names
genres = {movie_id: labels.strip().split('|') for (movie_id, labels) in zip(movies['id'], movies['genre'])}

# Collect every genre name that occurs. `i` is left bound to the last movie id
# once the loop finishes.
unique_genres = set()
for i, g in genres.items():
    unique_genres |= set(g)

# Enumerate the genres in sorted order so that the genre -> column mapping is
# identical on every run (iteration order of a set of strings is not).
genre_map = {name: col for (col, name) in enumerate(sorted(unique_genres))}
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "bf67112b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['Drama', 'Thriller']" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
# Sanity check: show the genre list of the last movie in the table. The id is
# looked up explicitly instead of relying on a loop variable left over from
# another cell.
last_movie_id = list(genres)[-1]
genres[last_movie_id]
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "5a786fc0", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# Item x genre indicator matrix. Row `item_idx` describes the movie whose id is
# `item_idx + 1`; ids that are missing from `genres` keep an all-zero row.
genre_mat = np.zeros((n_items, len(unique_genres)))
for item_idx in range(n_items):
    for genre_name in genres.get(item_idx + 1, ()):
        genre_mat[item_idx, genre_map[genre_name]] = 1
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "bb4a8508", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"num_topics = genre_mat.shape[1]\n", | |
"genre_mat = genre_mat.astype(np.int32)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "7d88b012", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# Matrix product (users x items) @ (items x genres): how many training items of
# each genre every user has. `@` is used because `*` only means a matrix
# product for scipy's `spmatrix` classes, not for the newer sparse arrays.
res = tr @ genre_mat
# Row-normalise the counts into a per-user genre preference distribution.
user_phi_dist = rowwise_norm(res)
genre_csr = csr_matrix(genre_mat)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "55c686bc", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
def get_gain(test_item_list, uid_true, genre_csr, user_pref, alpha=0.05, beta=0.99):
    """Compute the per-position gain of a ranked list.

    Parameters
    ----------
    test_item_list : sequence of int
        Item ids in ranked order.
    uid_true : array-like of int
        Item ids that count as hits for this user.
    genre_csr : scipy.sparse.csr_matrix, shape (n_items, n_topics)
        Item/topic matrix; the stored column indices of a row are the item's topics.
    user_pref : array-like, shape (n_topics,)
        Per-topic weight of the user.
    alpha : float, optional
        Factor used for items that are not hits; every earlier item of a topic
        also discounts that topic by ``(1 - alpha)``.
    beta : float, optional
        Factor used for hits; every earlier hit of a topic also discounts that
        topic by ``(1 - beta)``.

    Returns
    -------
    list of float
        One gain per entry of ``test_item_list``, in the same order. An item
        without any topic gets a gain of 0.
    """
    num_topics = genre_csr.shape[1]
    # A set gives O(1) membership tests instead of scanning the array per item.
    if isinstance(uid_true, np.ndarray):
        relevant = set(uid_true.tolist())
    else:
        relevant = set(uid_true)

    rho = np.zeros(num_topics)  # hits seen so far, per topic
    tau = np.zeros(num_topics)  # items seen so far, per topic
    gains = []
    for iid in test_item_list:
        hit = int(iid in relevant)
        # Depends only on the item, so it is computed once outside the topic loop.
        P_a_u_i = (1 - hit) * alpha + hit * beta
        p = 1.0
        for c in genre_csr[iid].indices:
            p *= (1 - P_a_u_i * user_pref[c] * ((1 - alpha) ** tau[c]) * ((1 - beta) ** rho[c]))
            tau[c] += 1
            rho[c] += hit
        gains.append(1.0 - p)
    return gains
"\n", | |
def get_ideal_order(test_item_list, uid_true, genre_csr, user_pref, topic_penalty=0.0):
    """Greedily reorder candidates: hits first, highest topic affinity first.

    At every step the best remaining item is picked. While hits remain, only
    hits are considered; afterwards all remaining items are considered.
    The score of an item is the dot product of its topic row with
    ``user_pref``, optionally reduced by a penalty for topics already covered
    by earlier picks.

    Parameters
    ----------
    test_item_list : sequence of int
        Candidate item ids.
    uid_true : array-like of int
        Item ids that count as hits for this user.
    genre_csr : scipy.sparse.csr_matrix, shape (n_items, n_topics)
        Item/topic matrix.
    user_pref : array-like, shape (n_topics,)
        Per-topic weight of the user.
    topic_penalty : float, optional
        Weight of the already-covered-topics penalty (divided by the list
        length). The default 0.0 disables the penalty.

    Returns
    -------
    list
        The items of ``test_item_list`` in the greedy order (same length).
    """
    remaining = np.copy(test_item_list).tolist()
    if isinstance(uid_true, np.ndarray):
        relevant = set(uid_true.tolist())
    else:
        relevant = set(uid_true)

    k = len(remaining)
    # Sized from the matrix itself rather than from a notebook-level global.
    topic_array = np.zeros(genre_csr.shape[1])
    ideal_list = []
    while remaining:
        # Prefer hits; fall back to everything once no hit is left.
        target_item_list = [x for x in remaining if x in relevant] or remaining

        scores = []
        for iid in target_item_list:
            row = genre_csr[iid]
            score = row.dot(user_pref).sum()
            if topic_penalty:
                score = score - (topic_penalty / k) * row.dot(topic_array).sum()
            scores.append(score)

        chosen_item = target_item_list[int(np.argmax(scores))]
        ideal_list.append(chosen_item)
        topic_array = topic_array + np.asarray(genre_csr[chosen_item].todense()).ravel()
        # Drop exactly one occurrence so duplicated candidates cannot empty the
        # pool before every position has been filled.
        remaining.remove(chosen_item)
    return ideal_list
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "7dd47079", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
def get_ideal_order_fast(test_item_list, uid_true, genre_csr, user_pref):
    """Sort candidates by a single vectorised score: hits first, then topic affinity.

    Each item is scored as ``100 * is_hit + genre_csr[item] . user_pref`` and the
    items are returned by descending score. Ties keep their input order.

    Parameters
    ----------
    test_item_list : sequence of int
        Candidate item ids.
    uid_true : array-like of int
        Item ids that count as hits for this user.
    genre_csr : scipy.sparse.csr_matrix, shape (n_items, n_topics)
        Item/topic matrix.
    user_pref : array-like, shape (n_topics,)
        Per-topic weight of the user.

    Returns
    -------
    numpy.ndarray
        Reordered copy of ``test_item_list``; empty input yields an empty array.
    """
    items = np.copy(test_item_list)
    if items.size == 0:
        # Nothing to rank; also avoids indexing the matrix with an empty float array.
        return items

    relevant = uid_true if isinstance(uid_true, np.ndarray) else np.asarray(list(uid_true))
    # NOTE(review): the bonus of 100 only guarantees "hits first" while the topic
    # score stays below 100 -- confirm for un-normalised user_pref.
    hit = 100 * np.isin(items, relevant)
    score = hit + genre_csr[items].dot(user_pref)
    # A stable sort makes the order of equally scored items deterministic.
    order = np.argsort(-score, kind="stable")
    return items[order]
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "653b59af", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
def ab_ndcg(uid, test_item_list, te_mat, genre_csr, user_phi_dist):
    """Score one user's ranked list against a reference ordering of the same items.

    The gains of ``test_item_list`` (from ``get_gain``) are discounted by
    ``1 / log2(rank + 1)`` and summed. The same is done for the ordering
    returned by ``get_ideal_order_fast``. The ratio of the two sums is
    returned, capped at 1.

    Parameters
    ----------
    uid : int
        Row index of the user in ``te_mat`` and ``user_phi_dist``.
    test_item_list : sequence of int
        Ranked item ids to evaluate.
    te_mat : sparse matrix
        Row ``uid`` must expose ``.indices``; those are treated as the hits.
    genre_csr : sparse matrix
        Item/topic matrix passed through to the helpers.
    user_phi_dist : array-like
        Row ``uid`` is passed to the helpers as the user's topic weights.
    """
    relevant_items = te_mat[uid].indices
    preference = user_phi_dist[uid]
    # Discount for ranks 1..K, shared by the actual and the reference list.
    discount = 1 / np.log2(1 + np.arange(1, 1 + len(test_item_list)))

    actual_gains = get_gain(test_item_list, relevant_items, genre_csr, preference)
    reference_order = get_ideal_order_fast(test_item_list, relevant_items, genre_csr, preference)
    reference_gains = get_gain(reference_order, relevant_items, genre_csr, preference)

    achieved = np.sum(actual_gains * discount)
    attainable = np.sum(reference_gains * discount)
    # The epsilon guards against a zero denominator.
    return min(1, achieved / (1e-10 + attainable))
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "1bb7e11d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from implicit.als import AlternatingLeastSquares as ALS\n", | |
"from implicit.bpr import BayesianPersonalizedRanking as BPR" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "d1448f0e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"WARNING:root:Intel MKL BLAS detected. Its highly recommend to set the environment variable 'export MKL_NUM_THREADS=1' to disable its internal multithreading\n" | |
] | |
} | |
], | |
"source": [ | |
"model = ALS()\n", | |
"bpr = BPR()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "b847777d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "aa9a1deb3526421385a95e84ce0ee590", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/15 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "b9483b905f09498090bcf571042ba856", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/100 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"model.fit(tr.T * 10, )\n", | |
"bpr.fit(tr.T,)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "527fe5ea", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"n_users, n_items = tr.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "01b0fd48", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "c08b02f378ac4827927a2b18455fdca4", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/6040 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
# Candidate list per user: up to 100 randomly drawn items that are not in the
# user's training row, followed by up to 3 of the user's test items.
# A seeded generator makes the candidate lists reproducible.
rng = np.random.default_rng(42)
user_target_list = []
for u in tqdm(range(n_users)):
    # Slice the sparse rows once per user instead of once per sampled item.
    train_items = set(tr[u].indices.tolist())
    test_items = te[u].indices

    held_out = rng.choice(test_items, min(3, len(test_items)), replace=False).tolist()
    # Also exclude the chosen test items so no id appears twice in the list.
    excluded = train_items.union(held_out)
    sampled = [int(x) for x in rng.choice(n_items, 100, replace=False) if x not in excluded]
    user_target_list.append(sampled + held_out)
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "c9a3d8fb", | |
"metadata": {}, | |
"source": [ | |
"## Random Recommendation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "8ac87c68", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "76763aafc95742ca9f2d876df8af46a8", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/6040 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
from sklearn.utils import shuffle

# Baseline: score a random permutation of every user's candidate list. The
# whole list is evaluated; no top-K cutoff is applied.
ndcgs = [
    ab_ndcg(uid, shuffle(user_target_list[uid]), te, genre_csr, user_phi_dist)
    for uid in tqdm(range(n_users))
]
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"id": "c5022d49", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.38296358741994113" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"np.mean(ndcgs)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "44debbae", | |
"metadata": {}, | |
"source": [ | |
"## ALS" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"id": "bed7001b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "f539f2449c31431284e589715c718ea5", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/6040 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"0.7467418337430999" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
from tqdm.auto import tqdm

# Let the ALS model rank each user's candidate list, then score that ranking.
ndcgs = []
for uid in tqdm(range(n_users)):
    ranked = model.rank_items(uid, tr, user_target_list[uid])
    # Only the first element of every returned entry is used as the item id.
    rec = [entry[0] for entry in ranked]
    ndcgs.append(ab_ndcg(uid, rec, te, genre_csr, user_phi_dist))
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"id": "c8af11e3", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.7467418337430999" | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"np.mean(ndcgs)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "d5d6da52", | |
"metadata": {}, | |
"source": [ | |
"## BPRMF" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "86fa474e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "ba609d3a3c564d1e815fe1b138e75905", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/6040 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
from tqdm.auto import tqdm

# Let the BPR model rank each user's candidate list, then score that ranking.
ndcgs = []
for uid in tqdm(range(n_users)):
    ranked = bpr.rank_items(uid, tr, user_target_list[uid])
    # Only the first element of every returned entry is used as the item id.
    rec = [entry[0] for entry in ranked]
    ndcgs.append(ab_ndcg(uid, rec, te, genre_csr, user_phi_dist))
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "5e4cac4b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"np.mean(ndcgs)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment