
@Linux-cpp-lisp
Last active October 12, 2017 16:36

Revisions

  1. Linux-cpp-lisp revised this gist Oct 12, 2017. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion boxes.py
    @@ -22,7 +22,7 @@ def memory_optimized_n_eps(traj,
         Params:
         - traj: The n-dimensional trajectory to compute on. Modified IN-PLACE!
    -    - eps: The slide length of the squares/cubes/hypercubes to count.
    +    - eps: The side length of the squares/cubes/hypercubes to count.
         - memory_slot_limit: Approximate hard limit on the size (in bytes) of the box grid matrix used by the algorithm.
           This parameter can be used to control the memory use of the algorithm, however, it has a lower bound
           that depends on epsilon and the data itself. The algorithm's behaviour if the limit is below the lower
  2. Linux-cpp-lisp created this gist Dec 1, 2016.
    125 changes: 125 additions & 0 deletions boxes.py
    @@ -0,0 +1,125 @@
    #boxes.py: Compute N(epsilon) for the "box-counting" or capacity dimension of a data set.
    # Alby Musaelian, 12/2016.

    import numpy as np

    import warnings

    def memory_optimized_n_eps(traj,
                               eps,
                               memory_slot_limit=100*1024*1024,
                               memory_limit_mode='error',
                               save_extra_memory=False,
                               logger=None):
    """Compute N(eps) for a n-dimensional time series.
    Compute the number of squares/cubes/hypercubes of side length `eps`
    needed to cover all points in the time-series `traj`.
    Returns an N(eps) (an integer).
    IMPORTANT: !! -- Modifies the order and values of the trajectory in place. -- !!
    Params:
    - traj: The n-dimensional trajectory to compute on. Modified IN-PLACE!
    - eps: The slide length of the squares/cubes/hypercubes to count.
    - memory_slot_limit: Approximate hard limit on the size (in bytes) of the box grid matrix used by the algorithm.
    This parameter can be used to control the memory use of the algorithm, however, it has a lower bound
    that depends on epsilon and the data itself. The algorithm's behaviour if the limit is below the lower
    bound is controlled by `memory_limit_mode`. Defaults to 100Mb.
    - memory_limit_mode: Either 'error' or 'increase'. When 'error' is set, if the least memory needed to run the computation
    is above `memory_slot_limit`, an exception is raised. In 'increase' mode, the amount of memory used will be
    increased to the lowest amount above the limit needed to run the computation. Defaults to 'error'.
    - save_extra_memory: When true, the algorithm will sort completely in place without creating an extra array of indexes.
    Defaults to False. When this mode is enabled, only C contiguous arrays (meaning, for example, no views or slices)
    can be provided for `traj`.
    - logger: A `Logger` to log information to. Defaults to `None`, or no logging.
    """
        assert memory_limit_mode in ['error', 'increase']

        if not traj.flags.owndata:
            warnings.warn("`memory_optimized_n_eps` modifies `traj` in place; "
                          "the provided `traj` does not own its data and the computation may therefore unintentionally modify data elsewhere")

        if save_extra_memory and not traj.flags.c_contiguous:
            raise ValueError("`traj` must be C contiguous when in `save_extra_memory` mode.\n"
                             "(Generally, this error means a view or slice was passed to `memory_optimized_n_eps`; "
                             "try using `np.ascontiguousarray()` or `.copy(order='C')` or setting `save_extra_memory` to False)")

        mins = np.amin(traj, axis=0)
        maxes = np.amax(traj, axis=0)
        # Number of boxes along each axis; cast to int so it can be used as an array shape.
        orig_shape = list(np.ceil((maxes - mins) / float(eps)).astype(int))
        slicing_dimension = np.argmax(orig_shape)

        if logger:
            logger.info("Slicing Dimension: %d", slicing_dimension)

        # Compute the slice size
        other_dimension_product = np.prod(orig_shape[0:slicing_dimension] + orig_shape[slicing_dimension+1:])
        slice_size = int(np.floor(float(memory_slot_limit) / other_dimension_product))

        if slice_size < 1:
            if memory_limit_mode == 'error':
                error = ("memory_slot_limit must be large enough for a step size of at least 1. "
                         "For these parameters, that implies a memory_slot_limit of at least %s")
                raise ValueError(error % other_dimension_product)
            else:
                slice_size = 1

        # If the slice size is bigger than the data, clamp it to the size of the data.
        if slice_size > orig_shape[slicing_dimension]:
            slice_size = orig_shape[slicing_dimension]

        # Sort trajectory in-place (ascending) along the axis with the most boxes (largest data range)
        if save_extra_memory:  # Sorting hack:
            # Hack from http://stackoverflow.com/questions/2828059/sorting-arrays-in-numpy-by-column
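            # The record-dtype view makes each row a single element, so sorting on field
            # 'f<slicing_dimension>' reorders whole rows in place, keyed on that column,
            # without allocating an argsort index array (hence the C-contiguity requirement).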
            traj.view(','.join([traj.dtype.str] * traj.shape[1])).sort(order=['f%d' % slicing_dimension], axis=0)
        else:
            traj = traj[traj[:, slicing_dimension].argsort()]

        # Move data origin to lower left corner of box grid
        traj -= mins

        # Create the slice grid
        shape = orig_shape[:]
        shape[slicing_dimension] = slice_size
        slice_grid = np.zeros(shape=shape, dtype=bool)

        box_count = 0
        slice_index = 0

        if logger:
            logger.info("Slice Size (boxes): %s", slice_size)
            logger.info("Slice Size (input units): %s", slice_size * eps)
            logger.info("Slice Grid Size (bytes): %s", slice_grid.nbytes)

        lastpt = -np.inf

        for pt in traj:
            # - Check for out-of-order sorts -
            assert not lastpt > pt[slicing_dimension], "Fatal error: out-of-order points were found. If using `save_extra_memory` mode, try turning it off."
            lastpt = pt[slicing_dimension]

            # Get the box index of the point in the overall grid:
            box_index = np.floor(pt / eps).astype(int)
            # Translate that index to be for the current slice in the slicing dimension:
            box_index[slicing_dimension] -= slice_index * slice_size

            # If the index is bigger than the slice size, move to another slice:
            if box_index[slicing_dimension] >= slice_size:
                # Finish the current slice:
                box_count += slice_grid.sum()
                slice_grid.fill(False)
                # Skip enough slices to bring the index within range:
                slices_moved, new_index = divmod(box_index[slicing_dimension], slice_size)
                slice_index += int(slices_moved)
                box_index[slicing_dimension] = new_index

            # Mark the point's box as used:
            slice_grid[tuple(box_index)] |= True

        # Add any remaining boxes to the sum:
        box_count += slice_grid.sum()

        return box_count
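
A minimal usage sketch (not part of the gist; the 2-D trajectory, eps value, and logger setup below are hypothetical, and it assumes boxes.py is importable as a module):

    import logging
    import numpy as np
    from boxes import memory_optimized_n_eps

    logging.basicConfig(level=logging.INFO)

    # Hypothetical 2-D trajectory: a noisy circle sampled at 10,000 points.
    t = np.linspace(0.0, 2.0 * np.pi, 10000)
    traj = np.column_stack([np.cos(t), np.sin(t)]) + 0.01 * np.random.randn(10000, 2)

    # Pass a copy because memory_optimized_n_eps reorders and shifts `traj` in place.
    n_eps = memory_optimized_n_eps(traj.copy(), eps=0.05, logger=logging.getLogger("boxes"))
    print("N(eps) =", n_eps)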