Merge pull request #86 from xyz8983/DDFG-add-randc-function

eltonlaw · web-flow · commit 8569a8dd165d · 2019-08-01T17:47:24.000-04:00
Ddfg add randc function
diff --git a/impyute/dataset/base.py b/impyute/dataset/base.py
@@ -1,6 +1,11 @@
 """ Shared functions to load/generate data """
 import numpy as np
+import string
+import random
+import math
+import itertools
 from impyute.dataset.corrupt import Corruptor
+from impyute.util import BadInputError
 
 def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"):
     """ Return randomly generated dataset of numbers with uniformly
@@ -15,8 +20,8 @@ def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int")
     shape:tuple(optional)
         Size of the randomly generated data
     missingness: ('mcar', 'mar', 'mnar')
-        Type of missigness you want in your dataset
-    th: float between [0,1]
+        Type of missingness you want in your dataset
+    thr: float between [0,1]
         Percentage of missing data in generated data
     dtype: ('int','float')
         Type of data
@@ -45,8 +50,8 @@ def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"
     shape:tuple(optional)
         Size of the randomly generated data
     missingness: ('mcar', 'mar', 'mnar')
-        Type of missigness you want in your dataset
-    th: float between [0,1]
+        Type of missingness you want in your dataset
+    thr: float between [0,1]
         Percentage of missing data in generated data
     dtype: ('int','float')
         Type of data
@@ -65,6 +70,51 @@ def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"
     raw_data = getattr(corruptor, missingness)()
     return raw_data
 
+def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2):
+    """ Return randomly generated dataset with uniformly distributed categorical data (alphabetic character)
+
+    Parameters
+    ----------
+    nlevels: int
+        Number of different categories; every one of them is placed at least once before corruption
+    shape: tuple(optional)
+        Size of the randomly generated data
+    missingness: string in ('mcar', 'mar', 'mnar')
+        Type of missingness you want in your dataset
+    thr: float between [0,1]
+        Percentage of missing data in generated data
+
+    Returns
+    -------
+    numpy.ndarray
+
+    Raises
+    ------
+    BadInputError
+        If nlevels is larger than the number of cells described by shape
+    """
+    size = int(np.prod(shape))
+    if size < nlevels:
+        raise BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape")
+
+    # a-z first; past 26 levels keep adding 2-letter, 3-letter, ... strings until the pool is big enough
+    cat_pool = list(string.ascii_lowercase)
+    width = 2
+    while len(cat_pool) < nlevels:
+        cat_pool.extend(''.join(w) for w in itertools.product(string.ascii_lowercase, repeat=width))
+        width += 1
+    cat = random.sample(cat_pool, nlevels)
+
+    # Place every category once, fill the remaining cells uniformly, then shuffle. Resampling the whole array
+    # until all levels happen to show up would practically never finish when nlevels is close to the size.
+    flat = np.concatenate([np.array(cat), np.random.choice(cat, size - nlevels, replace=True)])
+    np.random.shuffle(flat)
+    data = flat.reshape(shape)
+
+    corruptor = Corruptor(data, thr=thr, dtype=str)  # builtin str: the np.str alias is gone in NumPy >= 1.24
+    return getattr(corruptor, missingness)()
+
+
 
 def mnist(missingness="mcar", thr=0.2):
     """ Loads corrupted MNIST
@@ -84,4 +134,4 @@ def mnist(missingness="mcar", thr=0.2):
     dataset = fetch_mldata('MNIST original')
     corruptor = Corruptor(dataset.data, thr=thr)
     data = getattr(corruptor, missingness)()
-    return {"X": data, "Y": dataset.target}
+    return {"X": data, "Y": dataset.target}
diff --git a/impyute/dataset/corrupt.py b/impyute/dataset/corrupt.py
@@ -9,7 +9,7 @@ class Corruptor:
     ----------
     data: np.ndarray
         Matrix of values with no NaN's that you want to add NaN's to.
-    th: float (optional)
+    thr: float (optional)
         The percentage of null values you want in your dataset, a number
         between 0 and 1.
 
@@ -23,10 +23,10 @@ class Corruptor:
         Overwrite values with MNAR placed NaN's.
 
     """
-    def __init__(self, data, thr=0.2):
+    def __init__(self, data, thr=0.2, dtype=float):  # builtin float: the np.float alias is gone in NumPy >= 1.24
         self.dtype = data.dtype
         self.shape = np.shape(data)
-        self.data = data.astype(np.float)
+        self.data = data.astype(dtype)  # cast to the dtype requested by the caller
         self.thr = thr
 
     def mcar(self):
diff --git a/test/dataset/test_randc.py b/test/dataset/test_randc.py
@@ -0,0 +1,26 @@
+import numpy as np
+import pytest
+from impyute.dataset.base import randc
+from impyute.util import BadInputError
+
+def test_raise_error_nlevel_exceed_shape():
+    """a 2x2 dataset has fewer cells than the default nlevels, so randc has to raise BadInputError"""
+    with pytest.raises(BadInputError) as excinfo:
+        randc(shape=(2, 2))
+    assert "nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape" == str(excinfo.value)
+
+@pytest.mark.parametrize("nlevels, shape", [(5, (5, 5)), (9, (3, 4)), (100, (20, 20))])
+def test_nlevel_categories(nlevels, shape):
+    """The returned matrix can hold at most nlevels + 1 distinct values: the categories plus the np.nan
+       introduced by the Corrupt class. Only an upper bound is checked, because the missing values may
+       happen to replace every occurrence of some category, leaving fewer distinct values behind.
+    """
+    distinct_values = np.unique(randc(nlevels, shape))
+    assert nlevels + 1 >= len(distinct_values)
+
+
+@pytest.mark.parametrize("nlevels, shape", [(5, (5, 5)), (9, (3, 4)), (100, (20, 20))])
+def test_dataframe_shape(nlevels, shape):
+    """randc must hand back a matrix whose shape equals the requested one"""
+    generated = randc(nlevels, shape)
+    assert shape == generated.shape