Skip to content

Instantly share code, notes, and snippets.

@masdeseiscaracteres
Last active April 5, 2020 16:32
Show Gist options
  • Save masdeseiscaracteres/c8ea34b439168ecdab70eef906b966de to your computer and use it in GitHub Desktop.
Save masdeseiscaracteres/c8ea34b439168ecdab70eef906b966de to your computer and use it in GitHub Desktop.
NumPy groupby (inspired by the implementation in https://github.com/EelcoHoogendoorn/Numpy_arraysetops_EP)
import numpy as np
def groupby(keys, values, func=np.sum, axis=0, stable=True):
    """Group ``values`` by ``keys`` and aggregate each group with ``func``.

    The keys are sorted, runs of equal keys are located, and ``func`` is
    applied to the values belonging to each run.

    Parameters
    ----------
    keys : array_like
        One key per entry of ``values``; expected to be one-dimensional.
    values : array_like
        Values to aggregate; must have the same length as ``keys``.
    func : callable, optional
        Aggregation function. A binary ``numpy.ufunc`` (e.g. ``np.add``) is
        applied in one vectorised ``reduceat`` call. Any other callable
        (including the default ``np.sum``, which is not a ufunc) is called
        once per group on that group's slice of ``values``.
    axis : int, optional
        Only ``0`` is supported.
    stable : bool, optional
        If true (default), sort with ``'mergesort'`` so values keep their
        original relative order inside each group; otherwise ``'quicksort'``.

    Returns
    -------
    numpy.ndarray
        Array with one row per distinct key, in ascending key order:
        column 0 holds the key, column 1 the aggregated value.

    Raises
    ------
    NotImplementedError
        If ``axis`` is not ``0``.
    ValueError
        If ``keys`` and ``values`` differ in length.
    """
    if axis != 0:
        raise NotImplementedError("only axis=0 is supported")

    keys = np.asarray(keys)
    values = np.asarray(values)
    length = len(keys)
    if len(values) != length:
        raise ValueError("keys and values must have the same length")

    # Nothing to group: reduceat and the key lookup below would both index
    # into an empty array, so return an empty (0, 2) result directly.
    if length == 0:
        return np.concatenate((keys[:, None], values[:, None]), axis=1)

    sorter = np.argsort(keys, kind='mergesort' if stable else 'quicksort')
    keys_sorted = keys[sorter]

    # True where an element is the last one of its group.
    is_group_end = keys_sorted[:-1] != keys_sorted[1:]
    # Group i occupies sorted positions bounds[i]:bounds[i + 1].
    bounds = np.concatenate(([0], np.flatnonzero(is_group_end) + 1, [length]))
    starts = bounds[:-1]

    values = np.take(values, sorter, axis=axis)

    if isinstance(func, np.ufunc) and func.nin == 2:
        # Only binary ufuncs provide reduceat.
        agg_values = func.reduceat(values, starts)
    else:
        agg_values = np.array(
            [func(values[lo:hi]) for lo, hi in zip(starts, bounds[1:])]
        )

    unique_keys = keys_sorted[starts]
    return np.concatenate((unique_keys[:, None], agg_values[:, None]), axis=1)
@masdeseiscaracteres
Copy link
Author

masdeseiscaracteres commented Apr 4, 2020

Examples

Small array

A = np.random.randint(0, 10, (1000, 2))
%timeit out_np = groupby(A[:,0], A[:,1], func=np.add)

A_df =  pd.DataFrame(A)
%timeit out_pd = A_df.groupby(0)[1].sum()
51.1 µs ± 2.3 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
860 µs ± 62.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

Large array

A = np.random.randint(0, 10, (int(1e6), 2))
%timeit out_np = groupby(A[:,0], A[:,1], func=np.add)

A_df = pd.DataFrame(A)
%timeit out_pd = A_df.groupby(0)[1].sum()
96.3 ms ± 1.99 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
40.4 ms ± 1.19 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment