Skip to content

Commit 035b4ae

Browse files
authored
Cleanup: Split most code out of DataSets.jl (#36)
* Move DataSet related code into DataSet.jl * Move DataProject related code into data_project.jl * Move registry of storage drivers into storage_drivers.jl * Remove link_dataset and unlink_dataset internal functions, and a buggy unwanted variant of load_project.
1 parent 9d2132f commit 035b4ae

5 files changed

+628
-641
lines changed

src/DataSet.jl

+165
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
"""
2+
A `DataSet` is a metadata overlay for data held locally or remotely which is
3+
unopinionated about the underlying storage mechanism.
4+
5+
The data in a `DataSet` has a type which implies an index; the index can be
6+
used to partition the data for processing.
7+
"""
8+
struct DataSet
    # Unique identifier for the dataset, parsed from `conf["uuid"]`.
    # Use uuid4() to create these.
    uuid::UUID
    # For now, `conf` holds the configuration exactly as it was read from the
    # TOML. Once the design has settled we might get some explicit fields and
    # do validation.
    conf

    # Build a DataSet from a TOML-style configuration dictionary. The
    # configuration must provide string "uuid" and "name" entries and a
    # "storage" table that names a "driver"; the name must also pass
    # `check_dataset_name`.
    function DataSet(conf)
        required_keys = ["uuid"=>String, "storage"=>Dict, "name"=>String]
        _check_keys(conf, DataSet, required_keys)
        _check_keys(conf["storage"], DataSet, ["driver"=>String])
        check_dataset_name(conf["name"])
        return new(UUID(conf["uuid"]), conf)
    end

    #=
    Sketch of fields which may become explicit in the future:

    name::String # Default name for convenience.
                 # The binding to an actual name is managed by the data
                 # project.
    storage      # Storage config and driver definition
    maps::Vector{DataMap}

    # Generic dictionary of other properties... for now. Required properties
    # will be moved
    _other::Dict{Symbol,Any}

    #storage_id     # unique identifier in storage backend, if it exists
    #owner          # Project or user who owns the data
    #description::String
    #type           # Some representation of the type of data?
    #               # An array, blob, table, tree, etc
    #cachable::Bool # Can the data be cached? It might not for data governance
    #               # reasons or it might change commonly.
    ## A set of identifiers
    #tags::Set{String}
    =#
end
44+
45+
# Return `true` when `config` has an entry for the key of `spec` and the
# stored value is an instance of the type given as the value of `spec`.
function _key_match(config, spec::Pair)
    key, expected_type = spec
    return haskey(config, key) && config[key] isa expected_type
end

# Return `true` when `config` has an entry for `key`, whatever its type.
function _key_match(config, key::String)
    return haskey(config, key)
end
47+
48+
# Check that every entry of `keys` is satisfied by `config` according to
# `_key_match`. If any are not, throw an error which lists them, names
# `context`, and includes `config` rendered as TOML. Returns `nothing` when
# all entries match.
function _check_keys(config, context, keys)
    missed_keys = filter(keys) do k
        !_key_match(config, k)
    end
    isempty(missed_keys) && return nothing
    error("""
          Missing expected keys in $context:
          $missed_keys

          In TOML fragment:
          $(sprint(TOML.print,config))
          """)
end
60+
61+
"""
62+
check_dataset_name(name)
63+
64+
Check whether a dataset name is valid. Valid names start with a letter
65+
and may contain letters, numbers or `_`. Names may be hierarchical, with pieces
66+
separated with forward slashes. Examples:
67+
68+
my_data
69+
my_data_1
70+
username/data
71+
organization/project/data
72+
"""
73+
function check_dataset_name(name::AbstractString)
    # DataSet names disallow most punctuation for now, as it may be needed as
    # delimiters in data-related syntax (eg, for the data REPL).
    #
    # The pattern requires a leading letter, followed by any number of
    # letters, digits or `_`; a `/` is only accepted when it is immediately
    # followed by a letter, so empty path components and a trailing `/` are
    # rejected.
    #
    # The end anchor is `\z` rather than `$`: in PCRE `$` also matches just
    # before a newline at the end of the subject, which would let a name such
    # as "my_data\n" pass validation.
    dataset_name_pattern = r"
        ^
        [[:alpha:]]
        (?:
            [[:alnum:]_] |
            / (?=[[:alpha:]])
        )*
        \z
        "x
    if !occursin(dataset_name_pattern, name)
        error("DataSet name \"$name\" is invalid. DataSet names must start with a letter and can contain only letters, numbers, `_` or `/`.")
    end
    return nothing
end
89+
90+
# Stop-gap until we figure out which fields DataSet should actually have:
# real struct fields are returned directly, and any other property name is
# looked up (as a string key) in the `conf` dictionary.
function Base.getproperty(d::DataSet, name::Symbol)
    name in fieldnames(DataSet) && return getfield(d, name)
    return getfield(d, :conf)[string(name)]
end
98+
99+
# Dictionary-style access: both methods forward to the dataset's `conf`.
function Base.getindex(d::DataSet, name::AbstractString)
    return d.conf[name]
end

function Base.haskey(d::DataSet, name::AbstractString)
    return haskey(d.conf, name)
end
101+
102+
# Return the "fragment" entry of the dataset's "dataspec" section as a
# RelPath built from its '/'-separated components, or `nothing` when the
# dataset has no "dataspec" section or that section has no "fragment".
function dataspec_fragment_as_path(d::DataSet)
    haskey(d, "dataspec") || return nothing
    fragment = get(d.dataspec, "fragment", nothing)
    return isnothing(fragment) ? nothing : RelPath(split(fragment, '/'))
end
112+
113+
# Compact one-line display: the type, followed by the dataset's name and
# uuid, with the remaining configuration elided.
function Base.show(io::IO, d::DataSet)
    name_repr = repr(d.name)
    uuid_repr = repr(d.uuid)
    print(io, DataSet, "(name=", name_repr, ", uuid=", uuid_repr, ", #= … =#)")
end
116+
117+
# Multi-line (text/plain) display: print the dataset's configuration as TOML.
Base.show(io::IO, ::MIME"text/plain", d::DataSet) = TOML.print(io, d.conf)
120+
121+
122+
#-------------------------------------------------------------------------------
123+
# Functions for opening datasets
124+
125+
# do-block form of open()
#
# Looks up the storage driver for `dataset`, calls it with the dataset's
# storage configuration, and within the driver's callback opens the storage
# as `as_type`, passing the result to `f`. Returns whatever the driver call
# returns.
function Base.open(f::Function, as_type, dataset::DataSet)
    config = dataset.storage
    open_storage = _find_driver(dataset)
    with_storage = storage -> open(f, as_type, storage)
    return open_storage(with_storage, config, dataset)
end
133+
134+
# Contexts-based form of open()
#
# Looks up the storage driver for `dataset` and enters it through
# `enter_do`, returning the storage object the driver provides.
@! function Base.open(dataset::DataSet)
    config = dataset.storage
    open_storage = _find_driver(dataset)
    # Use `enter_do` because drivers don't yet use the ResourceContexts.jl mechanism
    (opened,) = @! enter_do(open_storage, config, dataset)
    opened
end
142+
143+
# Contexts-based form of open() with a requested type: open the dataset's
# storage first, then open that storage as `as_type`.
@! function Base.open(as_type, dataset::DataSet)
    opened_storage = @! open(dataset)
    @! open(as_type, opened_storage)
end
147+
148+
# TODO:
149+
# Consider making a distinction between open() and load().
150+
151+
# Finalizer-based version of open()
#
# Opens `dataset` inside a fresh `@context` block and passes the result to
# `ResourceContexts.detach_context_cleanup`, whose return value is returned.
function Base.open(dataset::DataSet)
    @context begin
        opened = @! open(dataset)
        @! ResourceContexts.detach_context_cleanup(opened)
    end
end
158+
159+
# Finalizer-based version of open() with a requested type: opens `dataset`
# as `as_type` inside a fresh `@context` block and passes the result to
# `ResourceContexts.detach_context_cleanup`, whose return value is returned.
function Base.open(as_type, dataset::DataSet)
    @context begin
        opened = @! open(as_type, dataset)
        @! ResourceContexts.detach_context_cleanup(opened)
    end
end
165+

0 commit comments

Comments
 (0)