1
1
""" Shared functions to load/generate data """
2
2
import numpy as np
3
+ import string
4
+ import random
5
+ import math
6
+ import itertools
3
7
from impyute .dataset .corrupt import Corruptor
8
+ from impyute .util import BadInputError
4
9
5
10
def randu (bound = (0 , 10 ), shape = (5 , 5 ), missingness = "mcar" , thr = 0.2 , dtype = "int" ):
6
11
""" Return randomly generated dataset of numbers with uniformly
@@ -15,8 +20,8 @@ def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int")
15
20
shape:tuple(optional)
16
21
Size of the randomly generated data
17
22
missingness: ('mcar', 'mar', 'mnar')
18
- Type of missigness you want in your dataset
19
- th : float between [0,1]
23
+ Type of missingness you want in your dataset
24
+ thr : float between [0,1]
20
25
Percentage of missing data in generated data
21
26
dtype: ('int','float')
22
27
Type of data
@@ -45,8 +50,8 @@ def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"
45
50
shape:tuple(optional)
46
51
Size of the randomly generated data
47
52
missingness: ('mcar', 'mar', 'mnar')
48
- Type of missigness you want in your dataset
49
- th : float between [0,1]
53
+ Type of missingness you want in your dataset
54
+ thr : float between [0,1]
50
55
Percentage of missing data in generated data
51
56
dtype: ('int','float')
52
57
Type of data
@@ -65,6 +70,51 @@ def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"
65
70
raw_data = getattr (corruptor , missingness )()
66
71
return raw_data
67
72
73
def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2):
    """ Return randomly generated dataset with uniformly distributed categorical data (alphabetic character)

    Category labels are lowercase alphabetic strings. Single letters are used
    while they suffice; longer strings ('aa', 'ab', ...) are added to the
    label pool only when `nlevels` exceeds the number of shorter labels.
    Every one of the `nlevels` categories is guaranteed to appear at least
    once in the data before corruption is applied.

    Parameters
    ----------
    nlevels: int
        Specify the number of different categories in the dataset
    shape: tuple(optional)
        Size of the randomly generated data
    missingness: string in ('mcar', 'mar', 'mnar')
        Type of missingness you want in your dataset; used as the name of
        the Corruptor method that is called to corrupt the data
    thr: float between [0,1]
        Percentage of missing data in generated data

    Returns
    -------
    numpy.ndarray

    Raises
    ------
    BadInputError
        If `nlevels` is smaller than 1 or larger than the number of cells
        implied by `shape`.
    """
    if nlevels < 1:
        raise BadInputError("nlevels must be a positive integer")

    # Total number of cells, valid for any dimensionality of `shape`.
    n_cells = int(np.prod(shape))
    if n_cells < nlevels:
        raise BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape")

    # Start with the single letters; when nlevels > 26 the alphabet is used
    # up, so keep adding all strings of the next length until the pool is
    # large enough to draw `nlevels` distinct labels from it.
    alphabet = string.ascii_lowercase
    cat_pool = list(alphabet)
    width = 1
    while len(cat_pool) < nlevels:
        width += 1
        cat_pool.extend(''.join(w) for w in itertools.product(alphabet, repeat=width))

    cat = np.array(random.sample(cat_pool, nlevels))

    # Place every category once, fill the remaining cells with uniform draws
    # and shuffle. This guarantees exactly `nlevels` distinct categories
    # without re-drawing the whole array until it happens by chance, which
    # practically never terminates when nlevels is close to the cell count.
    filler = np.random.choice(cat, n_cells - nlevels, replace=True)
    flat = np.concatenate([cat, filler])
    np.random.shuffle(flat)
    data = flat.reshape(shape)

    # The builtin `str` replaces the `np.str` alias, which was removed from NumPy.
    corruptor = Corruptor(data, thr=thr, dtype=str)
    raw_data = getattr(corruptor, missingness)()
    return raw_data
116
+
117
+
68
118
69
119
def mnist (missingness = "mcar" , thr = 0.2 ):
70
120
""" Loads corrupted MNIST
@@ -84,4 +134,4 @@ def mnist(missingness="mcar", thr=0.2):
84
134
dataset = fetch_mldata ('MNIST original' )
85
135
corruptor = Corruptor (dataset .data , thr = thr )
86
136
data = getattr (corruptor , missingness )()
87
- return {"X" : data , "Y" : dataset .target }
137
+ return {"X" : data , "Y" : dataset .target }
0 commit comments