Skip to content

Instantly share code, notes, and snippets.

@VibhuJawa
Created January 22, 2020 23:35
Show Gist options
  • Save VibhuJawa/236b073b8e1b1243ad33a099503cdc36 to your computer and use it in GitHub Desktop.
Save VibhuJawa/236b073b8e1b1243ad33a099503cdc36 to your computer and use it in GitHub Desktop.
Sorting Performance Comparison CUDF vs Dask vs Dask.map_partitions
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import dask_cudf\n",
"import cudf\n",
"import os\n",
"import time\n",
"import dask.dataframe as dd\n",
"import dask.array as da"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from dask_cuda import LocalCUDACluster\n",
"from dask.distributed import Client,wait"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Start a local dask-cuda cluster and connect a distributed client to it.\n",
"cluster = LocalCUDACluster()\n",
"client = Client(address=cluster)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def create_random_data(n_rows=1_000, n_keys_index_1=50_000, n_keys_index_2=20_000, n_keys_index_3=20_000, seed=None):\n",
"    \"\"\"Build a persisted Dask DataFrame of random data with cudf partitions.\n",
"\n",
"    The frame has one float column ``x`` and three integer key columns\n",
"    ``index_1``, ``index_2`` and ``index_3``; key column ``index_k`` is drawn\n",
"    with ``randint(0, n_keys_index_k)``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    n_rows : int\n",
"        Number of rows to generate.\n",
"    n_keys_index_1, n_keys_index_2, n_keys_index_3 : int\n",
"        Upper bound passed to ``randint`` for the matching key column.\n",
"    seed : int, optional\n",
"        Seed for the ``dask.array.random.RandomState`` that generates every\n",
"        column, so benchmark runs can be fed reproducible input (for the same\n",
"        arguments and chunking). ``None`` (the default) leaves the data\n",
"        unseeded, as before.\n",
"\n",
"    Returns\n",
"    -------\n",
"    The persisted collection obtained by mapping ``cudf.from_pandas`` over\n",
"    the partitions of the generated frame.\n",
"    \"\"\"\n",
"    rng = da.random.RandomState(seed)\n",
"    key_bounds = {\n",
"        'index_1': n_keys_index_1,\n",
"        'index_2': n_keys_index_2,\n",
"        'index_3': n_keys_index_3,\n",
"    }\n",
"\n",
"    # Column order matters to callers: x first, then the keys in order.\n",
"    columns = [rng.random_sample(n_rows).to_dask_dataframe(columns='x')]\n",
"    for name, upper in key_bounds.items():\n",
"        columns.append(\n",
"            rng.randint(0, upper, size=n_rows).to_dask_dataframe(columns=name)\n",
"        )\n",
"\n",
"    df = dd.concat(columns, axis=1).persist()\n",
"    # Convert every partition from pandas to cudf.\n",
"    gdf = df.map_partitions(cudf.from_pandas)\n",
"    return gdf.persist()\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Native Dask"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"time taken = 4.509349584579468 , len of df = 80,000,000\n"
]
}
],
"source": [
"# Build the input and collapse it to a single partition before the timer\n",
"# starts, so the measurement below covers only the sort.\n",
"df = create_random_data(80_000_000)\n",
"df = df.repartition(npartitions=1).persist()\n",
"wait(df)\n",
"\n",
"# perf_counter() is a monotonic, high-resolution clock meant for measuring\n",
"# elapsed intervals; time.time() can jump if the system clock is adjusted.\n",
"st = time.perf_counter()\n",
"df = df.sort_values(by=['index_1', 'index_2', 'index_3']).persist()\n",
"wait(df)\n",
"et = time.perf_counter()\n",
"print(f\"time taken = {et-st} , len of df = {len(df):,}\")\n",
"del df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Map Partitions"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"time taken = 1.7103350162506104 , len of df = 80,000,000\n"
]
}
],
"source": [
"# Build the input and collapse it to a single partition before the timer\n",
"# starts, so the measurement below covers only the sort.\n",
"df = create_random_data(80_000_000)\n",
"df = df.repartition(npartitions=1).persist()\n",
"wait(df)\n",
"\n",
"# perf_counter() is a monotonic, high-resolution clock meant for measuring\n",
"# elapsed intervals; time.time() can jump if the system clock is adjusted.\n",
"st = time.perf_counter()\n",
"# Sort each partition independently; the lambda argument is named `part`\n",
"# so it does not shadow the notebook-level `df`.\n",
"df = df.map_partitions(\n",
"    lambda part: part.sort_values(by=['index_1', 'index_2', 'index_3'])\n",
").persist()\n",
"wait(df)\n",
"et = time.perf_counter()\n",
"print(f\"time taken = {et-st} , len of df = {len(df):,}\")\n",
"del df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Native cuDF"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"time taken = 1.4036340713500977 , len of df = 80,000,000\n"
]
}
],
"source": [
"df = create_random_data(80_000_000)\n",
"# Materialize the Dask collection into one concrete frame before timing.\n",
"df = df.compute()\n",
"\n",
"# perf_counter() is a monotonic, high-resolution clock meant for measuring\n",
"# elapsed intervals; time.time() can jump if the system clock is adjusted.\n",
"st = time.perf_counter()\n",
"df = df.sort_values(by=['index_1', 'index_2', 'index_3'])\n",
"# No wait() here: after compute() `df` is no longer a Dask collection, so it\n",
"# carries no futures for distributed.wait to block on.\n",
"et = time.perf_counter()\n",
"print(f\"time taken = {et-st} , len of df = {len(df):,}\")\n",
"del df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment