Skip to content

Commit 1f6b460

Browse files
adamklein authored and wesm committed
ENH: continuing #1349, optimize DataFrame.cov for missing data
1 parent dbb5b45 commit 1f6b460

File tree

2 files changed

+8
-19
lines changed

2 files changed

+8
-19
lines changed

pandas/core/frame.py

+5 -16
Original file line numberDiff line numberDiff line change
@@ -3838,27 +3838,16 @@ def cov(self):
38383838
y : DataFrame
38393839
"""
38403840
numeric_df = self._get_numeric_data()
3841-
mat = numeric_df.values.T
38423841
cols = numeric_df.columns
3843-
baseCov = np.cov(mat)
3842+
mat = numeric_df.values
38443843

3845-
for i, j, ac, bc in self._cov_helper(mat):
3846-
c = np.cov(ac, bc)[0, 1]
3847-
baseCov[i, j] = c
3848-
baseCov[j, i] = c
3844+
if notnull(mat).all():
3845+
baseCov = np.cov(mat.T)
3846+
else:
3847+
baseCov = lib.nancorr(mat, cov=True)
38493848

38503849
return self._constructor(baseCov, index=cols, columns=cols)
38513850

3852-
def _cov_helper(self, mat):
3853-
# Get the covariance with items that have NaN values
3854-
mask = np.isfinite(mat)
3855-
for i, A in enumerate(mat):
3856-
if not mask[i].all():
3857-
for j, B in enumerate(mat):
3858-
in_common = mask[i] & mask[j]
3859-
if in_common.any():
3860-
yield i, j, A[in_common], B[in_common]
3861-
38623851
def corrwith(self, other, axis=0, drop=False):
38633852
"""
38643853
Compute pairwise correlation between rows or columns of two DataFrame

pandas/src/moments.pyx

+3 -3
Original file line numberDiff line numberDiff line change
@@ -248,11 +248,11 @@ def ewma(ndarray[double_t] input, double_t com):
248248
return output
249249

250250
#----------------------------------------------------------------------
251-
# Pairwise covariance
251+
# Pairwise correlation/covariance
252252

253253
@cython.boundscheck(False)
254254
@cython.wraparound(False)
255-
def nancorr(ndarray[float64_t, ndim=2] mat):
255+
def nancorr(ndarray[float64_t, ndim=2] mat, cov=False):
256256
cdef:
257257
Py_ssize_t i, j, xi, yi, N, K
258258
ndarray[float64_t, ndim=2] result
@@ -294,7 +294,7 @@ def nancorr(ndarray[float64_t, ndim=2] mat):
294294
sumxx += vx * vx
295295
sumyy += vy * vy
296296

297-
divisor = sqrt(sumxx * sumyy)
297+
divisor = (nobs - 1.0) if cov else sqrt(sumxx * sumyy)
298298

299299
if divisor != 0:
300300
result[xi, yi] = result[yi, xi] = sumx / divisor

0 commit comments

Comments
 (0)