Skip to content

Commit 8569a8d

Browse files
authored
Merge pull request #86 from xyz8983/DDFG-add-randc-function
Ddfg add randc function
2 parents 45412a4 + b1ac20c commit 8569a8d

File tree

3 files changed

+84
-8
lines changed

3 files changed

+84
-8
lines changed

impyute/dataset/base.py

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
""" Shared functions to load/generate data """
22
import numpy as np
3+
import string
4+
import random
5+
import math
6+
import itertools
37
from impyute.dataset.corrupt import Corruptor
8+
from impyute.util import BadInputError
49

510
def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"):
611
""" Return randomly generated dataset of numbers with uniformly
@@ -15,8 +20,8 @@ def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int")
1520
shape:tuple(optional)
1621
Size of the randomly generated data
1722
missingness: ('mcar', 'mar', 'mnar')
18-
Type of missigness you want in your dataset
19-
th: float between [0,1]
23+
Type of missingness you want in your dataset
24+
thr: float between [0,1]
2025
Percentage of missing data in generated data
2126
dtype: ('int','float')
2227
Type of data
@@ -45,8 +50,8 @@ def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"
4550
shape:tuple(optional)
4651
Size of the randomly generated data
4752
missingness: ('mcar', 'mar', 'mnar')
48-
Type of missigness you want in your dataset
49-
th: float between [0,1]
53+
Type of missingness you want in your dataset
54+
thr: float between [0,1]
5055
Percentage of missing data in generated data
5156
dtype: ('int','float')
5257
Type of data
@@ -65,6 +70,51 @@ def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"
6570
raw_data = getattr(corruptor, missingness)()
6671
return raw_data
6772

73+
def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2):
    """ Return randomly generated dataset with uniformly distributed categorical data (alphabetic character)

    Category labels are lowercase alphabetic strings. Single letters are
    used while they suffice; longer strings ('aa', 'ab', ...) are added to
    the label pool only when `nlevels` exceeds the labels available so far.

    Parameters
    ----------
    nlevels: int
        Specify the number of different categories in the dataset
    shape: tuple(optional)
        Size of the randomly generated data
    missingness: string in ('mcar', 'mar', 'mnar')
        Type of missingness you want in your dataset
    thr: float between [0,1]
        Percentage of missing data in generated data

    Returns
    -------
    numpy.ndarray

    Raises
    ------
    BadInputError
        If `nlevels` is smaller than 1, or larger than the number of cells
        described by `shape`.
    """
    if nlevels < 1:
        raise BadInputError("nlevels must be a positive integer")

    # np.prod handles any dimensionality instead of hard-coding shape[0]*shape[1]
    n_cells = int(np.prod(shape))
    if n_cells < nlevels:
        raise BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape")

    # Grow the label pool one string width at a time until it can supply
    # nlevels distinct labels. Counting the pool directly avoids the
    # floating-point logarithm, which is inexact at exact powers of 26.
    alphabet = string.ascii_lowercase
    cat_pool = list(alphabet)
    width = 1
    while len(cat_pool) < nlevels:
        width += 1
        cat_pool.extend(''.join(w) for w in itertools.product(alphabet, repeat=width))

    cat = random.sample(cat_pool, nlevels)

    # Place every category once, fill the remaining cells with uniform draws,
    # then shuffle. This guarantees nlevels distinct categories in one pass;
    # redrawing the whole matrix until all categories happen to appear
    # practically never finishes when nlevels is close to the cell count.
    filler = np.random.choice(cat, n_cells - nlevels, replace=True)
    cells = np.concatenate([np.array(cat), filler])
    np.random.shuffle(cells)
    data = cells.reshape(shape)

    # builtin str replaces the np.str alias, which NumPy 1.24 removed
    corruptor = Corruptor(data, thr=thr, dtype=str)
    raw_data = getattr(corruptor, missingness)()
    return raw_data
116+
117+
68118

69119
def mnist(missingness="mcar", thr=0.2):
70120
""" Loads corrupted MNIST
@@ -84,4 +134,4 @@ def mnist(missingness="mcar", thr=0.2):
84134
dataset = fetch_mldata('MNIST original')
85135
corruptor = Corruptor(dataset.data, thr=thr)
86136
data = getattr(corruptor, missingness)()
87-
return {"X": data, "Y": dataset.target}
137+
return {"X": data, "Y": dataset.target}

impyute/dataset/corrupt.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ class Corruptor:
99
----------
1010
data: np.ndarray
1111
Matrix of values with no NaN's that you want to add NaN's to.
12-
th: float (optional)
12+
thr: float (optional)
1313
The percentage of null values you want in your dataset, a number
1414
between 0 and 1.
1515
@@ -23,10 +23,10 @@ class Corruptor:
2323
Overwrite values with MNAR placed NaN's.
2424
2525
"""
26-
def __init__(self, data, thr=0.2):
26+
def __init__(self, data, thr=0.2, dtype=float):
    """Store the data to corrupt together with its original metadata.

    Parameters
    ----------
    data: np.ndarray
        Matrix of values that missing entries will be written into.
    thr: float (optional)
        The percentage of null values you want in your dataset, a number
        between 0 and 1.
    dtype: data-type (optional)
        Type the stored copy of `data` is cast to. Defaults to the builtin
        `float`, which is what the `np.float` alias (removed in NumPy 1.24)
        referred to.
    """
    # self.dtype records the dtype of the incoming array, not the `dtype`
    # argument; the argument only controls the cast applied to self.data.
    self.dtype = data.dtype
    self.shape = np.shape(data)
    self.data = data.astype(dtype)
    self.thr = thr
3131

3232
def mcar(self):

test/dataset/test_randc.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import numpy as np
2+
import pytest
3+
from impyute.dataset.base import randc
4+
from impyute.util import BadInputError
5+
6+
def test_raise_error_nlevel_exceed_shape():
    """randc must raise BadInputError when the shape holds fewer cells than nlevels."""
    with pytest.raises(BadInputError) as excinfo:
        randc(shape=(2, 2))
    message = str(excinfo.value)
    assert message == "nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape"
11+
12+
@pytest.mark.parametrize("nlevels, shape", [(5, (5, 5)), (9, (3, 4)), (100, (20, 20))])
def test_nlevel_categories(nlevels, shape):
    """The generated matrix holds at most nlevels + 1 distinct values.

    The extra value accounts for the missing-value marker introduced by the
    Corruptor class. The bound is an upper limit rather than an equality
    because the injected missing values may happen to overwrite every
    occurrence of some category, lowering the distinct count.
    """
    generated = randc(nlevels, shape)
    distinct_values = np.unique(generated)
    assert len(distinct_values) <= nlevels + 1
20+
21+
22+
@pytest.mark.parametrize("nlevels, shape", [(5, (5, 5)), (9, (3, 4)), (100, (20, 20))])
def test_dataframe_shape(nlevels, shape):
    """The matrix returned by randc has exactly the requested shape."""
    generated = randc(nlevels, shape)
    actual_shape = generated.shape
    assert actual_shape == shape

0 commit comments

Comments
 (0)