
@Linux-cpp-lisp
Last active October 12, 2017 16:36

Revisions

  1. Linux-cpp-lisp revised this gist Oct 12, 2017. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion boxes.py
    @@ -22,7 +22,7 @@ def memory_optimized_n_eps(traj,
         Params:
         - traj: The n-dimensional trajectory to compute on. Modified IN-PLACE!
    -    - eps: The slide length of the squares/cubes/hypercubes to count.
    +    - eps: The side length of the squares/cubes/hypercubes to count.
         - memory_slot_limit: Approximate hard limit on the size (in bytes) of the box grid matrix used by the algorithm.
           This parameter can be used to control the memory use of the algorithm, however, it has a lower bound
           that depends on epsilon and the data itself. The algorithm's behaviour if the limit is below the lower
  2. Linux-cpp-lisp created this gist Dec 1, 2016.
    125 changes: 125 additions & 0 deletions boxes.py
    @@ -0,0 +1,125 @@
    #boxes.py: Compute N(epsilon) for the "box-counting" or capacity dimension of a data set.
    # Alby Musaelian, 12/2016.

    import numpy as np

    import warnings

    def memory_optimized_n_eps(traj,
                               eps,
                               memory_slot_limit=100*1024*1024,
                               memory_limit_mode='error',
                               save_extra_memory=False,
                               logger=None):
    """Compute N(eps) for a n-dimensional time series.
    Compute the number of squares/cubes/hypercubes of side length `eps`
    needed to cover all points in the time-series `traj`.
    Returns an N(eps) (an integer).
    IMPORTANT: !! -- Modifies the order and values of the trajectory in place. -- !!
    Params:
    - traj: The n-dimensional trajectory to compute on. Modified IN-PLACE!
    - eps: The slide length of the squares/cubes/hypercubes to count.
    - memory_slot_limit: Approximate hard limit on the size (in bytes) of the box grid matrix used by the algorithm.
    This parameter can be used to control the memory use of the algorithm, however, it has a lower bound
    that depends on epsilon and the data itself. The algorithm's behaviour if the limit is below the lower
    bound is controlled by `memory_limit_mode`. Defaults to 100Mb.
    - memory_limit_mode: Either 'error' or 'increase'. When 'error' is set, if the least memory needed to run the computation
    is above `memory_slot_limit`, an exception is raised. In 'increase' mode, the amount of memory used will be
    increased to the lowest amount above the limit needed to run the computation. Defaults to 'error'.
    - save_extra_memory: When true, the algorithm will sort completely in place without creating an extra array of indexes.
    Defaults to False. When this mode is enabled, only C contiguous arrays (meaning, for example, no views or slices)
    can be provided for `traj`.
    - logger: A `Logger` to log information to. Defaults to `None`, or no logging.
    """
        assert memory_limit_mode in ['error', 'increase']

        if not traj.flags.owndata:
            warnings.warn("`memory_optimized_n_eps` modifies `traj` in place; "
                          "the provided `traj` does not own its data and the computation may therefore unintentionally modify data elsewhere")

        if save_extra_memory and not traj.flags.c_contiguous:
            raise ValueError("`traj` must be C contiguous when in `save_extra_memory` mode.\n"
                             "(Generally, this error means a view or slice was passed to `memory_optimized_n_eps`; "
                             "try using `np.ascontiguousarray()` or `.copy(order='C')` or setting `save_extra_memory` to False)")

        mins = np.amin(traj, axis=0)
        maxes = np.amax(traj, axis=0)
        # Number of boxes along each axis; cast to int so it can be used as an array shape.
        orig_shape = list(np.ceil((maxes - mins) / float(eps)).astype(int))
        slicing_dimension = np.argmax(orig_shape)

        if logger:
            logger.info("Slicing Dimension: %d", slicing_dimension)

        # Compute the slice size
        other_dimension_product = np.prod(orig_shape[0:slicing_dimension] + orig_shape[slicing_dimension+1:])
        slice_size = int(np.floor(float(memory_slot_limit) / other_dimension_product))

        if slice_size < 1:
            if memory_limit_mode == 'error':
                error = ("memory_slot_limit must be large enough for a step size of at least 1. "
                         "For these parameters, that implies a memory_slot_limit of at least %s")
                raise ValueError(error % other_dimension_product)
            else:
                slice_size = 1

        # If the slice size is bigger than the data, clamp it to the size of the data.
        if slice_size > orig_shape[slicing_dimension]:
            slice_size = orig_shape[slicing_dimension]

        # Sort trajectory in-place (ascending) along the axis with the most boxes (largest data range)
        if save_extra_memory:  # Sorting hack:
            # Hack from http://stackoverflow.com/questions/2828059/sorting-arrays-in-numpy-by-column
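            # The record-dtype view makes each row a single element, so sorting on field
            # 'f<slicing_dimension>' reorders whole rows in place, keyed on that column,
            # without allocating an argsort index array (hence the C-contiguity requirement).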
            traj.view(','.join([traj.dtype.str] * traj.shape[1])).sort(order=['f%d' % slicing_dimension], axis=0)
        else:
            traj = traj[traj[:, slicing_dimension].argsort()]

        # Move data origin to lower left corner of box grid
        traj -= mins

        # Create the slice grid
        shape = orig_shape[:]
        shape[slicing_dimension] = slice_size
        slice_grid = np.zeros(shape=shape, dtype=bool)

        box_count = 0
        slice_index = 0

        if logger:
            logger.info("Slice Size (boxes): %s", slice_size)
            logger.info("Slice Size (input units): %s", slice_size * eps)
            logger.info("Slice Grid Size (bytes): %s", slice_grid.nbytes)

        lastpt = -np.inf

        for pt in traj:
            # - Check for out-of-order sorts -
            assert not lastpt > pt[slicing_dimension], "Fatal error: out-of-order points were found. If using `save_extra_memory` mode, try turning it off."
            lastpt = pt[slicing_dimension]

            # Get the box index of the point in the overall grid:
            box_index = np.floor(pt / eps).astype(int)
            # Translate that index to be for the current slice in the slicing dimension:
            box_index[slicing_dimension] -= slice_index * slice_size

            # If the index is bigger than the slice size, move to another slice:
            if box_index[slicing_dimension] >= slice_size:
                # Finish the current slice:
                box_count += slice_grid.sum()
                slice_grid.fill(False)
                # Skip enough slices to bring the index within range:
                slices_moved, new_index = divmod(box_index[slicing_dimension], slice_size)
                slice_index += int(slices_moved)
                box_index[slicing_dimension] = new_index

            # Mark the point's box as used:
            slice_grid[tuple(box_index)] |= True

        # Add any remaining boxes to the sum:
        box_count += slice_grid.sum()

        return box_count
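
A minimal usage sketch (not part of the gist; the 2-D trajectory, eps value, and logger setup below are hypothetical, and it assumes boxes.py is importable as a module):

    import logging
    import numpy as np
    from boxes import memory_optimized_n_eps

    logging.basicConfig(level=logging.INFO)

    # Hypothetical 2-D trajectory: a noisy circle sampled at 10,000 points.
    t = np.linspace(0.0, 2.0 * np.pi, 10000)
    traj = np.column_stack([np.cos(t), np.sin(t)]) + 0.01 * np.random.randn(10000, 2)

    # Pass a copy because memory_optimized_n_eps reorders and shifts `traj` in place.
    n_eps = memory_optimized_n_eps(traj.copy(), eps=0.05, logger=logging.getLogger("boxes"))
    print("N(eps) =", n_eps)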