"""
A `DataSet` is a metadata overlay for data held locally or remotely which is
unopinionated about the underlying storage mechanism.
The data in a `DataSet` has a type which implies an index; the index can be
used to partition the data for processing.
"""
mutable struct DataSet
    project    # AbstractDataProject owning this DataSet
    uuid::UUID # Unique identifier for the dataset. Use uuid4() to create these.
    # The representation `conf` contains "configuration data" read directly from
    # the TOML (or other data project source, eg json API etc)
    conf

    function DataSet(project, conf)
        _validate_dataset_config(conf)
        new(project, UUID(conf["uuid"]), conf)
    end
end
DataSet(conf) = DataSet(nothing, conf)

function _validate_dataset_config(conf)
    _check_keys(conf, DataSet, ["uuid"=>String, "storage"=>Dict, "name"=>String])
    _check_keys(conf["storage"], DataSet, ["driver"=>String])
    _check_optional_keys(conf,
                         "description"=>AbstractString,
                         "tags"=>VectorOf(AbstractString))
    check_dataset_name(conf["name"])
end
function Base.show(io::IO, d::DataSet)
    print(io, DataSet, "(name=$(repr(d.name)), uuid=$(repr(d.uuid)), #= … =#)")
end

function Base.show(io::IO, ::MIME"text/plain", d::DataSet)
    println(io, "DataSet instance:")
    println(io)
    TOML.print(io, d.conf)
end
"""
is_valid_dataset_name(name)
Check whether a dataset name is valid. Valid names include start with a letter
and may contain letters, numbers or `_`. Names may be hieracicial, with pieces
separated with forward slashes. Examples:
my_data
my_data_1
username/data
organization/project/data
"""
is_valid_dataset_name(name::AbstractString) = occursin(DATASET_NAME_REGEX, name)
# DataSet names disallow most punctuation for now, as it may be needed as
# delimiters in data-related syntax (eg, for the data REPL).
const DATASET_NAME_REGEX_STRING = raw"""
[[:alpha:]]
(?:
    [-[:alnum:]_] |
    / (?=[[:alpha:]])
)*
"""
const DATASET_NAME_REGEX = Regex("^\n$(DATASET_NAME_REGEX_STRING)\n\$", "x")
function make_valid_dataset_name(name)
    if !is_valid_dataset_name(name)
        name = replace(name, r"^[^[:alpha:]]+"=>"")
        name = replace(name, '\\'=>'/')
        name = replace(name, r"[^-[:alnum:]_/]"=>"_")
        if !is_valid_dataset_name(name)
            # best-effort fallback
            name = "data"
        end
    end
    return name
end
function check_dataset_name(name::AbstractString)
    if !is_valid_dataset_name(name)
        error("DataSet name \"$name\" is invalid. DataSet names must start with a letter and can contain only letters, numbers, `_` or `/`.")
    end
end
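# Example (illustrative): checking and repairing candidate names with the
# helpers above.
#
#     is_valid_dataset_name("username/data")    # true
#     is_valid_dataset_name("1_bad name")       # false (must start with a letter)
#     make_valid_dataset_name("my data (v2)")   # "my_data__v2_"
#     check_dataset_name("bad name!")           # throws an error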
#-------------------------------------------------------------------------------
# API for DataSet type
function Base.getproperty(d::DataSet, name::Symbol)
    if name === :uuid
        getfield(d, :uuid)
    elseif name === :conf
        getfield(d, :conf)
    else
        getfield(d, :conf)[string(name)]
    end
end

function Base.setproperty!(d::DataSet, name::Symbol, x)
    config!(d; name=>x)
end

Base.getindex(d::DataSet, name::AbstractString) = getindex(d.conf, name)
Base.haskey(d::DataSet, name::AbstractString) = haskey(d.conf, name)

function data_project(dataset::DataSet)
    return getfield(dataset, :project)
end
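# Example (illustrative): property and index access are forwarded to the
# underlying `conf` dictionary, except for `uuid` and `conf` themselves.
#
#     ds.name                      # same as ds.conf["name"]
#     ds["storage"]                # same as ds.conf["storage"]
#     haskey(ds, "description")    # true only if the config sets a description
#     ds.description = "Demo"      # goes through setproperty! and config!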
# Split the fragment section as a '/' separated RelPath
function dataspec_fragment_as_path(d::DataSet)
    if haskey(d, "dataspec")
        fragment = get(d.dataspec, "fragment", nothing)
        if !isnothing(fragment)
            return RelPath(split(fragment, '/'))
        end
    end
    return nothing
end
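# Illustrative note (assumes the config carries a "dataspec" table with a
# "fragment" entry such as "subdir/file.csv"): the fragment is split on '/' into
# a RelPath of path components; with no dataspec or fragment, `nothing` is returned.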
function config!(dataset::DataSet; kws...)
    config!(data_project(dataset), dataset; kws...)
end

# The default case of a dataset config update, when the update is independent of
# the project. (In general, projects may supply extra constraints.)
function config!(::Nothing, dataset::DataSet; kws...)
    for (k,v) in pairs(kws)
        if k in (:uuid, :name)
            error("Cannot modify dataset config with key $k")
        # TODO: elseif k === :storage
        #     Check consistency using storage driver API?
        end
        # TODO: Fold these schema checks in with _validate_dataset_config
        # somehow.
        if k === :description
            if !(v isa AbstractString)
                error("Dataset description must be a string")
            end
        elseif k === :tags
            if !(v isa AbstractVector && all(x isa AbstractString for x in v))
                error("Dataset tags must be a vector of strings")
            end
        end
        dataset.conf[string(k)] = v
    end
    return dataset
end
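# Example (illustrative): updating mutable metadata on a dataset. `uuid` and
# `name` are rejected above; `description` and `tags` are type-checked before
# being written into the config.
#
#     config!(ds; description="Raw sensor dumps", tags=["sensors", "raw"])
#     ds.description = "Raw sensor dumps"   # equivalent, via setproperty!
#     config!(ds; name="other_name")        # error: cannot modify this key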
#-------------------------------------------------------------------------------
# Functions for opening datasets
# do-block form of open()
function Base.open(f::Function, as_type, dataset::DataSet)
    storage_config = dataset.storage
    driver = _find_driver(dataset)
    driver(storage_config, dataset) do storage
        open(f, as_type, storage)
    end
end
# Contexts-based form of open()
@! function Base.open(dataset::DataSet)
    storage_config = dataset.storage
    driver = _find_driver(dataset)
    # Use `enter_do` because drivers don't yet use the ResourceContexts.jl mechanism
    (storage,) = @! enter_do(driver, storage_config, dataset)
    storage
end

@! function Base.open(as_type, dataset::DataSet)
    storage = @! open(dataset)
    @! open(as_type, storage)
end
# TODO:
# Consider making a distinction between open() and load().
# Finalizer-based version of open()
function Base.open(dataset::DataSet)
    @context begin
        result = @! open(dataset)
        @! ResourceContexts.detach_context_cleanup(result)
    end
end

function Base.open(as_type, dataset::DataSet)
    @context begin
        result = @! open(as_type, dataset)
        @! ResourceContexts.detach_context_cleanup(result)
    end
end
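# Example (illustrative sketch): the do-block form scopes the storage lifetime to
# the block, while the plain form detaches cleanup to a finalizer. The dataset
# name and the use of `IO`/`String` as target types are assumptions for
# illustration; the types a dataset supports depend on its storage driver, and
# `dataset(name)` is assumed to resolve a `DataSet` from the active project.
#
#     open(String, dataset("username/my_data")) do str
#         # use the data as a String inside this block
#     end
#
#     io = open(IO, dataset("username/my_data"))   # cleanup handled by a finalizer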