Last active
July 8, 2017 20:55
-
-
Save kevhill/9a38bd382847b86950f421ff4a981cc5 to your computer and use it in GitHub Desktop.
I ran into a barrier trying to cross correlate two matrices with missing values. I had to work our how to do that in a memory/cpu efficient way, and this is the best I could come up with.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import copy | |
def nan_cov(X, preserve_input=True): | |
# we need to mutate input to fill nan's with 0's | |
# if we need to use the input later, we need to use a copy now | |
if preserve_input: | |
X = copy.deepcopy(X) | |
# note the nan values, but then fill them with 0s | |
nanX = np.isnan(X) | |
X[nanX] = 0 | |
# masking matrix indicating where we have good data | |
iX = (~nanX).astype(int) | |
# sum of each row when filtered through valid measurements in each pair | |
# note that if one of the values going into the sum was nan, it is now 0 and | |
# will not contribute to the sum | |
sX = np.dot(X, iX.T) | |
# count of total matches | |
cX = np.dot(iX, iX.T) | |
# mean is just sum / count | |
uX = sX / cX | |
return (np.dot(X, X.T) / cX) - uX * uX.T | |
def nan_cor(X, preserve_input=True): | |
# we need to mutate input to fill nan's with 0's | |
# if we need to use the input later, we need to use a copy now | |
if preserve_input: | |
X = copy.deepcopy(X) | |
# note the nan values, but then fill them with 0s | |
nanX = np.isnan(X) | |
X[nanX] = 0 | |
# masking matrix indicating where we have good data | |
iX = (~nanX).astype(int) | |
# sum of each row when filtered through valid measurements in each pair | |
# note that if one of the values going into the sum was nan, it is now 0 and | |
# will not contribute to the sum | |
sX = np.dot(X, iX.T) | |
# count of total matches | |
cX = np.dot(iX, iX.T) | |
# mean is just sum / count | |
uX = sX / cX | |
# compute covariance matrix | |
cov = ((np.dot(X, X.T) / cX) - uX * uX.T) | |
# Variance for each row when filtered through valid measures in the pair | |
vX = (np.dot(X ** 2, iX.T) / cX - uX ** 2) | |
# normalize xcov by the sqrt of product of the variances | |
return cov / (vX * vX.T) ** .5 | |
def nan_xcov(A, B, preserve_input=True): | |
# we need to mutate A and B to fill nan's with 0's | |
# if we need to use the input later, we need to use a copy now | |
if preserve_input: | |
A = copy.deepcopy(A) | |
B = copy.deepcopy(B) | |
# note the nan values, but then fill them with 0s | |
nanA = np.isnan(A) | |
A[nanA] = 0 | |
nanB = np.isnan(B) | |
B[nanB] = 0 | |
# masking matrices indicating where we have good data | |
iA = (~nanA).astype(int) | |
iB = (~nanB).astype(int) | |
# this is the number of valid matches from A -> B (c.T == matches B -> A) | |
c = np.dot(iA, iB.T) | |
# compute the cross covariance between the two matrices, filtered for good values | |
return (np.dot(A, B.T) - (np.dot(A, iB.T) * np.dot(iA, B.T) / c)) / c | |
def nan_xcor(A, B, preserve_input=True): | |
# we need to mutate A and B to fill nan's with 0's | |
# if we need to use the input later, we need to use a copy now | |
if preserve_input: | |
A = copy.deepcopy(A) | |
B = copy.deepcopy(B) | |
# note the nan values, but then fill them with 0s | |
nanA = np.isnan(A) | |
A[nanA] = 0 | |
nanB = np.isnan(B) | |
B[nanB] = 0 | |
# masking matrices indicating where we have good data | |
iA = (~nanA).astype(int) | |
iB = (~nanB).astype(int) | |
# this is the number of valid matches from A -> B (c.T == matches B -> A) | |
c = np.dot(iA, iB.T) | |
# compute the cross covariance between the two matrices, filtered for good values | |
xcov = (np.dot(A, B.T) - (np.dot(A, iB.T) * np.dot(iA, B.T) / c)) / c | |
# Variance for each matrix when filtered through valid measures in the other | |
# in the case of vB we are really finding vB.t to put it in the same | |
# orientation as vA | |
vA = np.dot(A ** 2, iB.T) / c - (np.dot(A, iB.T) / c) ** 2 | |
vB = np.dot(iA, B.T ** 2) / c - (np.dot(iA, B.T) / c) ** 2 | |
# normalize xcov by the sqrt of product of the variances | |
return xcov / (vA * vB) ** .5 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment