dschult · July 16, 2024 15:43
diff --git a/ndsparray.py b/ndsparray.py
 # See the bottom for demo lines that print desired output
 #
 # key idea:  store the nD array as a 2D array by raveling
 #            some axes into the rows and some axes into the columns.
 #
 # element-wise operations work nicely with the 2D version of the array
 # and can be converted back to nD when needed.
 # 
 # Two important functions here are `get_2d_coords` and `get_nd_coords`
 # which convert between nD and 2D coords. The data elements stay in the 
 # same order. This version stores both nD coords and 2D coords, but this
 # could be changed to save memory if that works better.

 import itertools
 import operator
 import math

 import numpy as np
 import scipy as sp
 from scipy.sparse import coo_array

 def check_shape(shape, allow_nd=None):
    # TODO police the allow_nd case
    if allow_nd:
        return shape
    return sp.sparse._sputils.check_shape(shape)

 class ndsparray:
    """Idea: store 2D version of the nD object, and info needed to convert
    A._self_2d
    A._shape_as_2d
    A._coords_2d
    A._compressed_axes  #(not actually compressed for COO. But used as row in 2D.)

    The same system could be used for nD CSR with compressed axes raveled to
    rows in the 2D rep. and other axes raveled to columns.
    """

    @property
    def shape(self):
        return self._shape

    @property
    def _shape_as_2d(self):
        s = self._shape
        if len(s) ==1:
            return (1, s[-1])
        if len(s) == 2:
            return s
        return self._self_2d.shape

    def __init__(self,
        arg1=None,
        *,
        shape=None,
        dtype=None,
        compressed_axes=None,
        copy=False,
        maxprint=None,
    ):
        #TODO: allow smaller index_dtype
        index_dtype = np.int64

        if sp.sparse.issparse(arg1):
            ndim = arg1.ndim
            if ndim == 2:
                self._self_2d = arg1.tocoo(copy=False)
                self._shape = self._self_2d._shape
                self.dtype = self._self_2d.dtype
                return
            else:
                # TODO allow ndsparray input
                raise NotImplementError("Todo")
        elif arg1 is None:
            if shape is None:
                raise ValueError('cannot infer shape from size zero coords')

            shape = check_shape(shape, allow_nd=True)
            ndim = len(shape)
            data = np.array([], dtype=dtype)
            coords = tuple(np.array([], dtype='i') for _ in shape)  # index_dtype??
        elif isinstance(arg1, tuple):
            try:
                data, coords = arg1
            except (TypeError, ValueError) as e:
                raise TypeError('tuple input arg must be len 2: (data, coords)') from e
            if not isinstance(data, np.ndarray):
                raise TypeError('first object in the arg1 tuple is not an array')
            if not isinstance(coords, tuple):
                raise ValueError('2nd input is not a tuple')
            if any(not isinstance(c, np.ndarray) for c in coords):
                raise ValueError('2nd input is not a tuple of coord arrays')
            ndim = len(coords)

            if shape is None:
                if any(len(idx) == 0 for idx in coords):
                    raise ValueError('cannot infer shape from size zero coords')
                shape = tuple(operator.index(np.max(idx)) + 1 for idx in coords)
            else:
                shape = check_shape(shape, allow_nd=True)
        elif isinstance(arg1, (np.ndarray, list)):  # dense
            M = np.asarray(arg1)
            ndim = M.ndim
            coords = M.nonzero()
            data = M[coords]
            coords = tuple(idx.astype(index_dtype, copy=False) for idx in coords)
            if ndim <= 2:
                self._self_2d = coo_array((data, coords), shape=shape, dtype=dtype)
                self._shape = self._self_2d._shape
                #self._row_axes = self._col_axes = None
                self._axes_2d = None
                self.dtype = self._self_2d.dtype
                return
            else:
                coords = M.nonzero()
                data = M[coords]
                shape = M.shape
        else:
            raise ValueError('Could not parse arg1 input')

        if ndim <= 2:
            self._axes_2d = None
            self._self_2d = coo_array((data, coords), shape=shape, dtype=dtype)

        else:  # ndim > 2:
            if compressed_axes is None:  # COO
                axes_2d = (tuple(range(ndim - 1)), (ndim -1,))
            else:
                index_axes = [ax for ax in range(ndim) if ax not in compressed_axes]
                axes_2d = (compressed_axes, tuple(index_axis))

            coords_2d, shape_2d = get_2d_coords(coords, axes_2d, shape)
            idx_dtype = sp.sparse._sputils.get_index_dtype(
                coords_2d, maxval=max(shape_2d), check_contents=False
            )
            if idx_dtype != coords_2d[0].dtype or copy:
                coords_2d = tuple(idx.astype(idx_dtype) for idx in coords_2d)

            self._axes_2d = axes_2d
            self._self_2d = coo_array((data, coords_2d), shape=shape_2d, dtype=dtype)

        self._shape = shape
        self.ndim = ndim
        self.data = sp.sparse._sputils.getdata(data, dtype=dtype, copy=copy)
        self.coords = coords
        self.dtype = self._self_2d.dtype

        #TODO:  self._nd_check()

    def toarray(self):
        flat_indices = np.ravel_multi_index(self.coords, self.shape)
        dense_result = np.zeros(self.shape, dtype=self.dtype)
        dense_result.flat[flat_indices] = self.data
        return dense_result

    def __add__(self, other):
        if isinstance(other, ndsparray):
            res = self._self_2d + other._self_2d
        else:
            res = self._self_2d + other
        nd_coords = get_nd_coords(res.tocoo().coords, self._axes_2d, self.shape)
        return ndsparray((res.data, nd_coords), shape = self.shape)

 def get_2d_coords(coords, axes_2d, shape):
    coords_2d = []
    shape_as_2d = []
    for axes in axes_2d:
        ax_shape = tuple(shape[i] for i in axes)
        ax_co = tuple(coords[i] for i in axes)
        co = np.ravel_multi_index(ax_co, ax_shape)
        coords_2d.append(co)
        shape_as_2d.append(math.prod(ax_shape))
    return tuple(coords_2d), tuple(shape_as_2d)

 # same code without loop (easier to read?)
 #    row_axes, col_axes = axes_2d
 #
 #    row_shape = tuple(shape[i] for i in row_axes)
 #    rows = tuple(coords[i] for i in row_axes)
 #    row_coords = np.ravel_multi_index(rows, row_shape)
 #
 #    col_shape = tuple(shape[i] for i in col_axes)
 #    cols = tuple(coords[i] for i in col_axes)
 #    col_coords = np.ravel_multi_index(cols, col_shape)
 #
 #    shape_as_2d = (math.prod(row_shape), math.prod(col_shape))
 #    return (row_coords, col_coords), shape_as_2d


 def get_nd_coords(coords_2d, axes_2d, shape):
    coords = list(shape)  # shape values not used: only for length of list.
    for co, ax in zip(coords_2d, axes_2d):
        ax_shape = tuple(shape[i] for i in ax)
        ax_arr = np.unravel_index(co, ax_shape)
        for i, coord in zip(ax, ax_arr):
            coords[i] = coord
    return tuple(coords)

 # same code without loop (easier to read?)
 #    coords = list(shape)
 #    row_axes, col_axes = axes_2d
 #    row_co, col_co = coords_2d
 #
 #    row_shape = tuple(shape[i] for i in row_axes)
 #    row_arr = np.unravel_index(row_co, row_shape)
 #    for i, coord in zip(row_axes, row_arr):
 #        coords[i] = coord
 #
 #    col_shape = tuple(shape[i] for i in col_axes)
 #    col_arr = np.unravel_index(col_co, col_shape)
 #    for i, coord in zip(col_axes, col_arr):
 #        coords[i] = coord
 #
 #    return tuple(coords)


 if __name__ == "__main__":
    A = ndsparray(shape=(3,4,2))
    print(f"{A.shape=}")
    print(f"{A.coords=}")
    print(f"{A.data=}")
    print(f"{A._self_2d=}")
    print(A.toarray())
    print("##############")

    coords = (np.array([2, 3, 0, 0, 0]), np.array([1, 1, 1, 1, 0]), np.array([0, 1, 2, 3, 4]))
    data = np.array([3.4, 4, 2, 1, 5])
    B = ndsparray((data, coords))
    print(f"{B.shape=}")
    print(f"{B.coords=}")
    print(f"{B.data=}")
    print(f"{B._self_2d=}")
    print(B.toarray())
    print("##############")

    BB = B + B
    print(f"{BB.shape=}")
    print(f"{BB.coords=}")
    print(f"{BB.data=}")
    print(f"{BB._self_2d=}")
    print(BB.toarray())
    print("##############")

    npC = np.array([[[1, 2], [0, 1], [3, 4]], [[2, 1], [1, 0], [4, 3]]])
    C = ndsparray(npC)
    print(C.toarray())
    print("##############")

    print((C + C).toarray())
    np.testing.assert_equal((C + C).toarray(), npC + npC)
    print("##############")
	# See the bottom for demo lines that print desired output
	#
	# key idea: store the nD array as a 2D array by raveling
	# some axes into the rows and some axes into the columns.
	#
	# element-wise operations work nicely with the 2D version of the array
	# and can be converted back to nD when needed.
	#
	# Two important functions here are `get_2d_coords` and `get_nd_coords`
	# which convert between nD and 2D coords. The data elements stay in the
	# same order. This version stores both nD coords and 2D coords, but this
	# could be changed to save memory if that works better.

	import itertools
	import operator
	import math

	import numpy as np
	import scipy as sp
	from scipy.sparse import coo_array

	def check_shape(shape, allow_nd=None):
	# TODO police the allow_nd case
	if allow_nd:
	return shape
	return sp.sparse._sputils.check_shape(shape)

	class ndsparray:
	"""Idea: store 2D version of the nD object, and info needed to convert
	A._self_2d
	A._shape_as_2d
	A._coords_2d
	A._compressed_axes #(not actually compressed for COO. But used as row in 2D.)

	The same system could be used for nD CSR with compressed axes raveled to
	rows in the 2D rep. and other axes raveled to columns.
	"""

	@property
	def shape(self):
	return self._shape

	@property
	def _shape_as_2d(self):
	s = self._shape
	if len(s) ==1:
	return (1, s[-1])
	if len(s) == 2:
	return s
	return self._self_2d.shape

	def __init__(self,
	arg1=None,
	*,
	shape=None,
	dtype=None,
	compressed_axes=None,
	copy=False,
	maxprint=None,
	):
	#TODO: allow smaller index_dtype
	index_dtype = np.int64

	if sp.sparse.issparse(arg1):
	ndim = arg1.ndim
	if ndim == 2:
	self._self_2d = arg1.tocoo(copy=False)
	self._shape = self._self_2d._shape
	self.dtype = self._self_2d.dtype
	return
	else:
	# TODO allow ndsparray input
	raise NotImplementError("Todo")
	elif arg1 is None:
	if shape is None:
	raise ValueError('cannot infer shape from size zero coords')

	shape = check_shape(shape, allow_nd=True)
	ndim = len(shape)
	data = np.array([], dtype=dtype)
	coords = tuple(np.array([], dtype='i') for _ in shape) # index_dtype??
	elif isinstance(arg1, tuple):
	try:
	data, coords = arg1
	except (TypeError, ValueError) as e:
	raise TypeError('tuple input arg must be len 2: (data, coords)') from e
	if not isinstance(data, np.ndarray):
	raise TypeError('first object in the arg1 tuple is not an array')
	if not isinstance(coords, tuple):
	raise ValueError('2nd input is not a tuple')
	if any(not isinstance(c, np.ndarray) for c in coords):
	raise ValueError('2nd input is not a tuple of coord arrays')
	ndim = len(coords)

	if shape is None:
	if any(len(idx) == 0 for idx in coords):
	raise ValueError('cannot infer shape from size zero coords')
	shape = tuple(operator.index(np.max(idx)) + 1 for idx in coords)
	else:
	shape = check_shape(shape, allow_nd=True)
	elif isinstance(arg1, (np.ndarray, list)): # dense
	M = np.asarray(arg1)
	ndim = M.ndim
	coords = M.nonzero()
	data = M[coords]
	coords = tuple(idx.astype(index_dtype, copy=False) for idx in coords)
	if ndim <= 2:
	self._self_2d = coo_array((data, coords), shape=shape, dtype=dtype)
	self._shape = self._self_2d._shape
	#self._row_axes = self._col_axes = None
	self._axes_2d = None
	self.dtype = self._self_2d.dtype
	return
	else:
	coords = M.nonzero()
	data = M[coords]
	shape = M.shape
	else:
	raise ValueError('Could not parse arg1 input')

	if ndim <= 2:
	self._axes_2d = None
	self._self_2d = coo_array((data, coords), shape=shape, dtype=dtype)

	else: # ndim > 2:
	if compressed_axes is None: # COO
	axes_2d = (tuple(range(ndim - 1)), (ndim -1,))
	else:
	index_axes = [ax for ax in range(ndim) if ax not in compressed_axes]
	axes_2d = (compressed_axes, tuple(index_axis))

	coords_2d, shape_2d = get_2d_coords(coords, axes_2d, shape)
	idx_dtype = sp.sparse._sputils.get_index_dtype(
	coords_2d, maxval=max(shape_2d), check_contents=False
	)
	if idx_dtype != coords_2d[0].dtype or copy:
	coords_2d = tuple(idx.astype(idx_dtype) for idx in coords_2d)

	self._axes_2d = axes_2d
	self._self_2d = coo_array((data, coords_2d), shape=shape_2d, dtype=dtype)

	self._shape = shape
	self.ndim = ndim
	self.data = sp.sparse._sputils.getdata(data, dtype=dtype, copy=copy)
	self.coords = coords
	self.dtype = self._self_2d.dtype

	#TODO: self._nd_check()

	def toarray(self):
	flat_indices = np.ravel_multi_index(self.coords, self.shape)
	dense_result = np.zeros(self.shape, dtype=self.dtype)
	dense_result.flat[flat_indices] = self.data
	return dense_result

	def __add__(self, other):
	if isinstance(other, ndsparray):
	res = self._self_2d + other._self_2d
	else:
	res = self._self_2d + other
	nd_coords = get_nd_coords(res.tocoo().coords, self._axes_2d, self.shape)
	return ndsparray((res.data, nd_coords), shape = self.shape)

	def get_2d_coords(coords, axes_2d, shape):
	coords_2d = []
	shape_as_2d = []
	for axes in axes_2d:
	ax_shape = tuple(shape[i] for i in axes)
	ax_co = tuple(coords[i] for i in axes)
	co = np.ravel_multi_index(ax_co, ax_shape)
	coords_2d.append(co)
	shape_as_2d.append(math.prod(ax_shape))
	return tuple(coords_2d), tuple(shape_as_2d)

	# same code without loop (easier to read?)
	# row_axes, col_axes = axes_2d
	#
	# row_shape = tuple(shape[i] for i in row_axes)
	# rows = tuple(coords[i] for i in row_axes)
	# row_coords = np.ravel_multi_index(rows, row_shape)
	#
	# col_shape = tuple(shape[i] for i in col_axes)
	# cols = tuple(coords[i] for i in col_axes)
	# col_coords = np.ravel_multi_index(cols, col_shape)
	#
	# shape_as_2d = (math.prod(row_shape), math.prod(col_shape))
	# return (row_coords, col_coords), shape_as_2d


	def get_nd_coords(coords_2d, axes_2d, shape):
	coords = list(shape) # shape values not used: only for length of list.
	for co, ax in zip(coords_2d, axes_2d):
	ax_shape = tuple(shape[i] for i in ax)
	ax_arr = np.unravel_index(co, ax_shape)
	for i, coord in zip(ax, ax_arr):
	coords[i] = coord
	return tuple(coords)

	# same code without loop (easier to read?)
	# coords = list(shape)
	# row_axes, col_axes = axes_2d
	# row_co, col_co = coords_2d
	#
	# row_shape = tuple(shape[i] for i in row_axes)
	# row_arr = np.unravel_index(row_co, row_shape)
	# for i, coord in zip(row_axes, row_arr):
	# coords[i] = coord
	#
	# col_shape = tuple(shape[i] for i in col_axes)
	# col_arr = np.unravel_index(col_co, col_shape)
	# for i, coord in zip(col_axes, col_arr):
	# coords[i] = coord
	#
	# return tuple(coords)


	if __name__ == "__main__":
	A = ndsparray(shape=(3,4,2))
	print(f"{A.shape=}")
	print(f"{A.coords=}")
	print(f"{A.data=}")
	print(f"{A._self_2d=}")
	print(A.toarray())
	print("##############")

	coords = (np.array([2, 3, 0, 0, 0]), np.array([1, 1, 1, 1, 0]), np.array([0, 1, 2, 3, 4]))
	data = np.array([3.4, 4, 2, 1, 5])
	B = ndsparray((data, coords))
	print(f"{B.shape=}")
	print(f"{B.coords=}")
	print(f"{B.data=}")
	print(f"{B._self_2d=}")
	print(B.toarray())
	print("##############")

	BB = B + B
	print(f"{BB.shape=}")
	print(f"{BB.coords=}")
	print(f"{BB.data=}")
	print(f"{BB._self_2d=}")
	print(BB.toarray())
	print("##############")

	npC = np.array([[[1, 2], [0, 1], [3, 4]], [[2, 1], [1, 0], [4, 3]]])
	C = ndsparray(npC)
	print(C.toarray())
	print("##############")

	print((C + C).toarray())
	np.testing.assert_equal((C + C).toarray(), npC + npC)
	print("##############")