Skip to content

Instantly share code, notes, and snippets.

@dschult
Created July 16, 2024 15:43
Show Gist options
  • Save dschult/6b65bd18cd425d07b1a659eb7f67bc58 to your computer and use it in GitHub Desktop.
Save dschult/6b65bd18cd425d07b1a659eb7f67bc58 to your computer and use it in GitHub Desktop.
Proof of concept for nD sparse arrays using 2D storage with NumPy tools and 2D sparsetools
# See the bottom for demo lines that print desired output
#
# key idea: store the nD array as a 2D array by raveling
# some axes into the rows and some axes into the columns.
#
# element-wise operations work nicely with the 2D version of the array
# and can be converted back to nD when needed.
#
# Two important functions here are `get_2d_coords` and `get_nd_coords`
# which convert between nD and 2D coords. The data elements stay in the
# same order. This version stores both nD coords and 2D coords, but this
# could be changed to save memory if that works better.
import itertools
import operator
import math
import numpy as np
import scipy as sp
from scipy.sparse import coo_array
def check_shape(shape, allow_nd=None):
# TODO police the allow_nd case
if allow_nd:
return shape
return sp.sparse._sputils.check_shape(shape)
class ndsparray:
"""Idea: store 2D version of the nD object, and info needed to convert
A._self_2d
A._shape_as_2d
A._coords_2d
A._compressed_axes #(not actually compressed for COO. But used as row in 2D.)
The same system could be used for nD CSR with compressed axes raveled to
rows in the 2D rep. and other axes raveled to columns.
"""
@property
def shape(self):
return self._shape
@property
def _shape_as_2d(self):
s = self._shape
if len(s) ==1:
return (1, s[-1])
if len(s) == 2:
return s
return self._self_2d.shape
def __init__(self,
arg1=None,
*,
shape=None,
dtype=None,
compressed_axes=None,
copy=False,
maxprint=None,
):
#TODO: allow smaller index_dtype
index_dtype = np.int64
if sp.sparse.issparse(arg1):
ndim = arg1.ndim
if ndim == 2:
self._self_2d = arg1.tocoo(copy=False)
self._shape = self._self_2d._shape
self.dtype = self._self_2d.dtype
return
else:
# TODO allow ndsparray input
raise NotImplementError("Todo")
elif arg1 is None:
if shape is None:
raise ValueError('cannot infer shape from size zero coords')
shape = check_shape(shape, allow_nd=True)
ndim = len(shape)
data = np.array([], dtype=dtype)
coords = tuple(np.array([], dtype='i') for _ in shape) # index_dtype??
elif isinstance(arg1, tuple):
try:
data, coords = arg1
except (TypeError, ValueError) as e:
raise TypeError('tuple input arg must be len 2: (data, coords)') from e
if not isinstance(data, np.ndarray):
raise TypeError('first object in the arg1 tuple is not an array')
if not isinstance(coords, tuple):
raise ValueError('2nd input is not a tuple')
if any(not isinstance(c, np.ndarray) for c in coords):
raise ValueError('2nd input is not a tuple of coord arrays')
ndim = len(coords)
if shape is None:
if any(len(idx) == 0 for idx in coords):
raise ValueError('cannot infer shape from size zero coords')
shape = tuple(operator.index(np.max(idx)) + 1 for idx in coords)
else:
shape = check_shape(shape, allow_nd=True)
elif isinstance(arg1, (np.ndarray, list)): # dense
M = np.asarray(arg1)
ndim = M.ndim
coords = M.nonzero()
data = M[coords]
coords = tuple(idx.astype(index_dtype, copy=False) for idx in coords)
if ndim <= 2:
self._self_2d = coo_array((data, coords), shape=shape, dtype=dtype)
self._shape = self._self_2d._shape
#self._row_axes = self._col_axes = None
self._axes_2d = None
self.dtype = self._self_2d.dtype
return
else:
coords = M.nonzero()
data = M[coords]
shape = M.shape
else:
raise ValueError('Could not parse arg1 input')
if ndim <= 2:
self._axes_2d = None
self._self_2d = coo_array((data, coords), shape=shape, dtype=dtype)
else: # ndim > 2:
if compressed_axes is None: # COO
axes_2d = (tuple(range(ndim - 1)), (ndim -1,))
else:
index_axes = [ax for ax in range(ndim) if ax not in compressed_axes]
axes_2d = (compressed_axes, tuple(index_axis))
coords_2d, shape_2d = get_2d_coords(coords, axes_2d, shape)
idx_dtype = sp.sparse._sputils.get_index_dtype(
coords_2d, maxval=max(shape_2d), check_contents=False
)
if idx_dtype != coords_2d[0].dtype or copy:
coords_2d = tuple(idx.astype(idx_dtype) for idx in coords_2d)
self._axes_2d = axes_2d
self._self_2d = coo_array((data, coords_2d), shape=shape_2d, dtype=dtype)
self._shape = shape
self.ndim = ndim
self.data = sp.sparse._sputils.getdata(data, dtype=dtype, copy=copy)
self.coords = coords
self.dtype = self._self_2d.dtype
#TODO: self._nd_check()
def toarray(self):
flat_indices = np.ravel_multi_index(self.coords, self.shape)
dense_result = np.zeros(self.shape, dtype=self.dtype)
dense_result.flat[flat_indices] = self.data
return dense_result
def __add__(self, other):
if isinstance(other, ndsparray):
res = self._self_2d + other._self_2d
else:
res = self._self_2d + other
nd_coords = get_nd_coords(res.tocoo().coords, self._axes_2d, self.shape)
return ndsparray((res.data, nd_coords), shape = self.shape)
def get_2d_coords(coords, axes_2d, shape):
coords_2d = []
shape_as_2d = []
for axes in axes_2d:
ax_shape = tuple(shape[i] for i in axes)
ax_co = tuple(coords[i] for i in axes)
co = np.ravel_multi_index(ax_co, ax_shape)
coords_2d.append(co)
shape_as_2d.append(math.prod(ax_shape))
return tuple(coords_2d), tuple(shape_as_2d)
# same code without loop (easier to read?)
# row_axes, col_axes = axes_2d
#
# row_shape = tuple(shape[i] for i in row_axes)
# rows = tuple(coords[i] for i in row_axes)
# row_coords = np.ravel_multi_index(rows, row_shape)
#
# col_shape = tuple(shape[i] for i in col_axes)
# cols = tuple(coords[i] for i in col_axes)
# col_coords = np.ravel_multi_index(cols, col_shape)
#
# shape_as_2d = (math.prod(row_shape), math.prod(col_shape))
# return (row_coords, col_coords), shape_as_2d
def get_nd_coords(coords_2d, axes_2d, shape):
coords = list(shape) # shape values not used: only for length of list.
for co, ax in zip(coords_2d, axes_2d):
ax_shape = tuple(shape[i] for i in ax)
ax_arr = np.unravel_index(co, ax_shape)
for i, coord in zip(ax, ax_arr):
coords[i] = coord
return tuple(coords)
# same code without loop (easier to read?)
# coords = list(shape)
# row_axes, col_axes = axes_2d
# row_co, col_co = coords_2d
#
# row_shape = tuple(shape[i] for i in row_axes)
# row_arr = np.unravel_index(row_co, row_shape)
# for i, coord in zip(row_axes, row_arr):
# coords[i] = coord
#
# col_shape = tuple(shape[i] for i in col_axes)
# col_arr = np.unravel_index(col_co, col_shape)
# for i, coord in zip(col_axes, col_arr):
# coords[i] = coord
#
# return tuple(coords)
if __name__ == "__main__":
A = ndsparray(shape=(3,4,2))
print(f"{A.shape=}")
print(f"{A.coords=}")
print(f"{A.data=}")
print(f"{A._self_2d=}")
print(A.toarray())
print("##############")
coords = (np.array([2, 3, 0, 0, 0]), np.array([1, 1, 1, 1, 0]), np.array([0, 1, 2, 3, 4]))
data = np.array([3.4, 4, 2, 1, 5])
B = ndsparray((data, coords))
print(f"{B.shape=}")
print(f"{B.coords=}")
print(f"{B.data=}")
print(f"{B._self_2d=}")
print(B.toarray())
print("##############")
BB = B + B
print(f"{BB.shape=}")
print(f"{BB.coords=}")
print(f"{BB.data=}")
print(f"{BB._self_2d=}")
print(BB.toarray())
print("##############")
npC = np.array([[[1, 2], [0, 1], [3, 4]], [[2, 1], [1, 0], [4, 3]]])
C = ndsparray(npC)
print(C.toarray())
print("##############")
print((C + C).toarray())
np.testing.assert_equal((C + C).toarray(), npC + npC)
print("##############")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment