Last active
February 14, 2020 00:19
-
-
Save VibhuJawa/4f6a73458a54dd7571e799f4c8a795cb to your computer and use it in GitHub Desktop.
xgboost_working_example.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"env: NCCL_P2P_DISABLE=1\n" | |
] | |
} | |
], | |
"source": [ | |
"%env NCCL_P2P_DISABLE=1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import xgboost \n", | |
"import dask_cudf\n", | |
"from dask import delayed\n", | |
"import dask_xgboost\n", | |
"from dask.distributed import Client, wait\n", | |
"from dask.dataframe import from_delayed\n", | |
"import cudf\n", | |
"import dask\n", | |
"from dask_cuda import LocalCUDACluster" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"_py-xgboost-mutex 2.0 cpu_0 conda-forge\n", | |
"dask-xgboost 0.2.0.dev28 cuda10.0py37_0 rapidsai/label/xgboost\n", | |
"libxgboost 0.90 he1b5a44_2 conda-forge\n", | |
"py-xgboost 0.90 py37he1b5a44_2 conda-forge\n", | |
"xgboost 1.0.0-SNAPSHOT pypi_0 pypi\n" | |
] | |
} | |
], | |
"source": [ | |
"!conda list | grep xgboost" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### xgboost source\n", | |
"!xgboost is instaled from hhttps://xgboost-ci.net/job/xgboost/job/master/lastSuccessfulBuild/artifact/python-package/dist/xgboost-1.0.0_SNAPSHOT-py2.py3-none-any.whl" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"cluster = LocalCUDACluster()\n", | |
"client = Client(cluster)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = dask_cudf.from_cudf(cudf.DataFrame({'x':[1,2]*16,'y':[0,1]*16}), npartitions=8)\n", | |
"df = df.persist()\n", | |
"done = wait(df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"xgb_gpu_params = {\n", | |
" 'nround': 100,\n", | |
" 'max_depth': 8,\n", | |
" 'max_leaves': 2**8,\n", | |
" 'alpha': 0.9,\n", | |
" 'eta': 0.1,\n", | |
" 'gamma': 0.1,\n", | |
" 'learning_rate': 0.1,\n", | |
" 'subsample': 1,\n", | |
" 'reg_lambda': 1,\n", | |
" 'scale_pos_weight': 2,\n", | |
" 'min_child_weight': 30,\n", | |
" 'tree_method': 'gpu_hist',\n", | |
" 'loss': 'ls',\n", | |
" 'objective': 'binary:logistic',\n", | |
" 'max_features': 'auto',\n", | |
" 'criterion': 'friedman_mse',\n", | |
" 'grow_policy': 'lossguide',\n", | |
" 'verbose': True\n", | |
" }\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Training using cudf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<xgboost.core.Booster at 0x7f22248fceb8>" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"model = xgboost.train(xgb_gpu_params,xgboost.DMatrix(df[['x']].compute(),df[['y']].compute()))\n", | |
"model" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Training using dask-cudf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<xgboost.core.Booster at 0x7f212814b4e0>" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"model = dask_xgboost.train(client, xgb_gpu_params, df[['x']], df[['y']], \n", | |
" num_boost_round=xgb_gpu_params['nround'])\n", | |
"model" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Training using dmatrix\n", | |
"##### Dmatrix requires that you have 1 dmatrix per GPU/Worker\n", | |
"* Below i first repartion the df by concatinating frames on the same GPU\n", | |
"* Then create DMatrix for training" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def repartition_cudf_join_on_gpu(df,client):\n", | |
" \"\"\"\n", | |
" This function repartitions the dataframe by joining all \n", | |
" keys that are on the same gpu\n", | |
" This only works with persisted DF\n", | |
" \"\"\"\n", | |
" dataframe_dask_keys = [str(key) for key in df.__dask_keys__() ]\n", | |
" client_dict = client.has_what()\n", | |
" \n", | |
" ### this creates a key map\n", | |
" ## where key is worker_ip\n", | |
" ## and values are the dataframe objects at that key\n", | |
" worker_key_map = {}\n", | |
" for worker_ip,worker_tasks in client_dict.items():\n", | |
" worker_key_map[worker_ip] = [delayed(lambda x:x)(dask_key_name = dask_key) for dask_key in worker_tasks if dask_key in dataframe_dask_keys]\n", | |
" \n", | |
" concatenated_df_task_ls = []\n", | |
" for list_delayed in worker_key_map.values():\n", | |
" concatenated_df_task_ls.append(delayed(cudf.concat)(list_delayed))\n", | |
" \n", | |
" ### persisting the tasks on GPU\n", | |
" concatenated_df_task_ls = [task.persist() for task in concatenated_df_task_ls]\n", | |
" done = wait(concatenated_df_task_ls)\n", | |
"\n", | |
"\n", | |
" return concatenated_df_task_ls\n", | |
"\n", | |
"\n", | |
"def get_dmatrix_from_persisted_df(df,label_col = 'y',non_label_columns=['x']):\n", | |
" gpu_dfs = [(gpu_df[[label_col]], gpu_df[non_label_columns]) for gpu_df in df.to_delayed()]\n", | |
" gpu_dfs = [(gpu_df[0].persist(), gpu_df[1].persist()) for gpu_df in gpu_dfs]\n", | |
"\n", | |
" dmat_ls = [dask.delayed(xgboost.DMatrix)(gpu_df[1], gpu_df[0]) for gpu_df in gpu_dfs]\n", | |
" dmat_ls = [dmat.persist() for dmat in dmat_ls]\n", | |
" wait(dmat_ls)\n", | |
" \n", | |
" return dmat_ls" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"repartitioned_df = dask_cudf.from_delayed(repartition_cudf_join_on_gpu(df,client))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"assert repartitioned_df.npartitions == len(client.scheduler_info()['workers'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dmat = get_dmatrix_from_persisted_df(repartitioned_df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<xgboost.core.Booster at 0x7f20faff2fd0>" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"model = dask_xgboost.train(client, xgb_gpu_params, dmat, None, num_boost_round=xgb_gpu_params['nround'])\n", | |
"model" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment