Created
July 16, 2024 15:43
-
-
Save dschult/6b65bd18cd425d07b1a659eb7f67bc58 to your computer and use it in GitHub Desktop.
Proof of concept for nD sparse arrays using 2D storage with NumPy tools and 2D sparsetools
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# See the bottom for demo lines that print desired output | |
# | |
# key idea: store the nD array as a 2D array by raveling | |
# some axes into the rows and some axes into the columns. | |
# | |
# element-wise operations work nicely with the 2D version of the array | |
# and can be converted back to nD when needed. | |
# | |
# Two important functions here are `get_2d_coords` and `get_nd_coords` | |
# which convert between nD and 2D coords. The data elements stay in the | |
# same order. This version stores both nD coords and 2D coords, but this | |
# could be changed to save memory if that works better. | |
import itertools | |
import operator | |
import math | |
import numpy as np | |
import scipy as sp | |
from scipy.sparse import coo_array | |
def check_shape(shape, allow_nd=None): | |
# TODO police the allow_nd case | |
if allow_nd: | |
return shape | |
return sp.sparse._sputils.check_shape(shape) | |
class ndsparray: | |
"""Idea: store 2D version of the nD object, and info needed to convert | |
A._self_2d | |
A._shape_as_2d | |
A._coords_2d | |
A._compressed_axes #(not actually compressed for COO. But used as row in 2D.) | |
The same system could be used for nD CSR with compressed axes raveled to | |
rows in the 2D rep. and other axes raveled to columns. | |
""" | |
@property | |
def shape(self): | |
return self._shape | |
@property | |
def _shape_as_2d(self): | |
s = self._shape | |
if len(s) ==1: | |
return (1, s[-1]) | |
if len(s) == 2: | |
return s | |
return self._self_2d.shape | |
def __init__(self, | |
arg1=None, | |
*, | |
shape=None, | |
dtype=None, | |
compressed_axes=None, | |
copy=False, | |
maxprint=None, | |
): | |
#TODO: allow smaller index_dtype | |
index_dtype = np.int64 | |
if sp.sparse.issparse(arg1): | |
ndim = arg1.ndim | |
if ndim == 2: | |
self._self_2d = arg1.tocoo(copy=False) | |
self._shape = self._self_2d._shape | |
self.dtype = self._self_2d.dtype | |
return | |
else: | |
# TODO allow ndsparray input | |
raise NotImplementError("Todo") | |
elif arg1 is None: | |
if shape is None: | |
raise ValueError('cannot infer shape from size zero coords') | |
shape = check_shape(shape, allow_nd=True) | |
ndim = len(shape) | |
data = np.array([], dtype=dtype) | |
coords = tuple(np.array([], dtype='i') for _ in shape) # index_dtype?? | |
elif isinstance(arg1, tuple): | |
try: | |
data, coords = arg1 | |
except (TypeError, ValueError) as e: | |
raise TypeError('tuple input arg must be len 2: (data, coords)') from e | |
if not isinstance(data, np.ndarray): | |
raise TypeError('first object in the arg1 tuple is not an array') | |
if not isinstance(coords, tuple): | |
raise ValueError('2nd input is not a tuple') | |
if any(not isinstance(c, np.ndarray) for c in coords): | |
raise ValueError('2nd input is not a tuple of coord arrays') | |
ndim = len(coords) | |
if shape is None: | |
if any(len(idx) == 0 for idx in coords): | |
raise ValueError('cannot infer shape from size zero coords') | |
shape = tuple(operator.index(np.max(idx)) + 1 for idx in coords) | |
else: | |
shape = check_shape(shape, allow_nd=True) | |
elif isinstance(arg1, (np.ndarray, list)): # dense | |
M = np.asarray(arg1) | |
ndim = M.ndim | |
coords = M.nonzero() | |
data = M[coords] | |
coords = tuple(idx.astype(index_dtype, copy=False) for idx in coords) | |
if ndim <= 2: | |
self._self_2d = coo_array((data, coords), shape=shape, dtype=dtype) | |
self._shape = self._self_2d._shape | |
#self._row_axes = self._col_axes = None | |
self._axes_2d = None | |
self.dtype = self._self_2d.dtype | |
return | |
else: | |
coords = M.nonzero() | |
data = M[coords] | |
shape = M.shape | |
else: | |
raise ValueError('Could not parse arg1 input') | |
if ndim <= 2: | |
self._axes_2d = None | |
self._self_2d = coo_array((data, coords), shape=shape, dtype=dtype) | |
else: # ndim > 2: | |
if compressed_axes is None: # COO | |
axes_2d = (tuple(range(ndim - 1)), (ndim -1,)) | |
else: | |
index_axes = [ax for ax in range(ndim) if ax not in compressed_axes] | |
axes_2d = (compressed_axes, tuple(index_axis)) | |
coords_2d, shape_2d = get_2d_coords(coords, axes_2d, shape) | |
idx_dtype = sp.sparse._sputils.get_index_dtype( | |
coords_2d, maxval=max(shape_2d), check_contents=False | |
) | |
if idx_dtype != coords_2d[0].dtype or copy: | |
coords_2d = tuple(idx.astype(idx_dtype) for idx in coords_2d) | |
self._axes_2d = axes_2d | |
self._self_2d = coo_array((data, coords_2d), shape=shape_2d, dtype=dtype) | |
self._shape = shape | |
self.ndim = ndim | |
self.data = sp.sparse._sputils.getdata(data, dtype=dtype, copy=copy) | |
self.coords = coords | |
self.dtype = self._self_2d.dtype | |
#TODO: self._nd_check() | |
def toarray(self): | |
flat_indices = np.ravel_multi_index(self.coords, self.shape) | |
dense_result = np.zeros(self.shape, dtype=self.dtype) | |
dense_result.flat[flat_indices] = self.data | |
return dense_result | |
def __add__(self, other): | |
if isinstance(other, ndsparray): | |
res = self._self_2d + other._self_2d | |
else: | |
res = self._self_2d + other | |
nd_coords = get_nd_coords(res.tocoo().coords, self._axes_2d, self.shape) | |
return ndsparray((res.data, nd_coords), shape = self.shape) | |
def get_2d_coords(coords, axes_2d, shape): | |
coords_2d = [] | |
shape_as_2d = [] | |
for axes in axes_2d: | |
ax_shape = tuple(shape[i] for i in axes) | |
ax_co = tuple(coords[i] for i in axes) | |
co = np.ravel_multi_index(ax_co, ax_shape) | |
coords_2d.append(co) | |
shape_as_2d.append(math.prod(ax_shape)) | |
return tuple(coords_2d), tuple(shape_as_2d) | |
# same code without loop (easier to read?) | |
# row_axes, col_axes = axes_2d | |
# | |
# row_shape = tuple(shape[i] for i in row_axes) | |
# rows = tuple(coords[i] for i in row_axes) | |
# row_coords = np.ravel_multi_index(rows, row_shape) | |
# | |
# col_shape = tuple(shape[i] for i in col_axes) | |
# cols = tuple(coords[i] for i in col_axes) | |
# col_coords = np.ravel_multi_index(cols, col_shape) | |
# | |
# shape_as_2d = (math.prod(row_shape), math.prod(col_shape)) | |
# return (row_coords, col_coords), shape_as_2d | |
def get_nd_coords(coords_2d, axes_2d, shape): | |
coords = list(shape) # shape values not used: only for length of list. | |
for co, ax in zip(coords_2d, axes_2d): | |
ax_shape = tuple(shape[i] for i in ax) | |
ax_arr = np.unravel_index(co, ax_shape) | |
for i, coord in zip(ax, ax_arr): | |
coords[i] = coord | |
return tuple(coords) | |
# same code without loop (easier to read?) | |
# coords = list(shape) | |
# row_axes, col_axes = axes_2d | |
# row_co, col_co = coords_2d | |
# | |
# row_shape = tuple(shape[i] for i in row_axes) | |
# row_arr = np.unravel_index(row_co, row_shape) | |
# for i, coord in zip(row_axes, row_arr): | |
# coords[i] = coord | |
# | |
# col_shape = tuple(shape[i] for i in col_axes) | |
# col_arr = np.unravel_index(col_co, col_shape) | |
# for i, coord in zip(col_axes, col_arr): | |
# coords[i] = coord | |
# | |
# return tuple(coords) | |
if __name__ == "__main__": | |
A = ndsparray(shape=(3,4,2)) | |
print(f"{A.shape=}") | |
print(f"{A.coords=}") | |
print(f"{A.data=}") | |
print(f"{A._self_2d=}") | |
print(A.toarray()) | |
print("##############") | |
coords = (np.array([2, 3, 0, 0, 0]), np.array([1, 1, 1, 1, 0]), np.array([0, 1, 2, 3, 4])) | |
data = np.array([3.4, 4, 2, 1, 5]) | |
B = ndsparray((data, coords)) | |
print(f"{B.shape=}") | |
print(f"{B.coords=}") | |
print(f"{B.data=}") | |
print(f"{B._self_2d=}") | |
print(B.toarray()) | |
print("##############") | |
BB = B + B | |
print(f"{BB.shape=}") | |
print(f"{BB.coords=}") | |
print(f"{BB.data=}") | |
print(f"{BB._self_2d=}") | |
print(BB.toarray()) | |
print("##############") | |
npC = np.array([[[1, 2], [0, 1], [3, 4]], [[2, 1], [1, 0], [4, 3]]]) | |
C = ndsparray(npC) | |
print(C.toarray()) | |
print("##############") | |
print((C + C).toarray()) | |
np.testing.assert_equal((C + C).toarray(), npC + npC) | |
print("##############") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment