"""
A `DataSet` is a metadata overlay for data held locally or remotely which is
unopinionated about the underlying storage mechanism.

The data in a `DataSet` has a type which implies an index; the index can be
used to partition the data for processing.
"""
struct DataSet
    # For now, the representation `conf` contains data read directly from the
    # TOML. Once the design has settled we might get some explicit fields and
    # do validation.
    uuid::UUID     # Unique identifier for the dataset. Use uuid4() to create these.
    conf

    function DataSet(conf)
        _check_keys(conf, DataSet, ["uuid"=>String, "storage"=>Dict, "name"=>String])
        _check_keys(conf["storage"], DataSet, ["driver"=>String])
        check_dataset_name(conf["name"])
        new(UUID(conf["uuid"]), conf)
    end

    #=
    name::String     # Default name for convenience.
                     # The binding to an actual name is managed by the data
                     # project.
    storage          # Storage config and driver definition
    maps::Vector{DataMap}

    # Generic dictionary of other properties... for now. Required properties
    # will be moved to explicit fields.
    _other::Dict{Symbol,Any}

    #storage_id     # Unique identifier in the storage backend, if one exists
    #owner          # Project or user who owns the data
    #description::String
    #type           # Some representation of the type of data?
                    # An array, blob, table, tree, etc.
    #cachable::Bool # Can the data be cached? It might not be cacheable for
                    # data governance reasons, or it might change frequently.
    ## A set of identifiers
    #tags::Set{String}
    =#
end
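
# Example (a hedged sketch, not from the package documentation): constructing
# a `DataSet` directly from a configuration dictionary matching the schema
# checked in the constructor above. The "FileSystem" driver name and the path
# are illustrative placeholders.
#
#   conf = Dict(
#       "uuid"    => "b498f769-a7f6-4f67-8d74-40b770398f26",
#       "name"    => "my_data",
#       "storage" => Dict("driver" => "FileSystem", "path" => "/tmp/my_data"),
#   )
#   ds = DataSet(conf)
#   ds.uuid    # UUID parsed from conf["uuid"]
#   ds.name    # "my_data", looked up in `conf` via the getproperty overload below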

_key_match(config, (k,T)::Pair) = haskey(config, k) && config[k] isa T
_key_match(config, k::String) = haskey(config, k)

function _check_keys(config, context, keys)
    missed_keys = filter(k->!_key_match(config, k), keys)
    if !isempty(missed_keys)
        error("""
              Missing expected keys in $context:
              $missed_keys

              In TOML fragment:
              $(sprint(TOML.print,config))
              """)
    end
end
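
# Example (sketch): required keys may be given either as plain strings
# (presence check only) or as `key => Type` pairs (presence and type check).
# Missing or mistyped entries are reported together with the offending TOML
# fragment, roughly like so:
#
#   _check_keys(Dict("name" => "x", "uuid" => 1), DataSet,
#               ["uuid" => String, "name" => String])
#   # ERROR: Missing expected keys in DataSet: ["uuid" => String] ...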

"""
    check_dataset_name(name)

Check whether a dataset name is valid. Valid names start with a letter and may
contain letters, numbers or `_`. Names may be hierarchical, with pieces
separated by forward slashes. Examples:

    my_data
    my_data_1
    username/data
    organization/project/data
"""
function check_dataset_name(name::AbstractString)
    # DataSet names disallow most punctuation for now, as it may be needed as
    # delimiters in data-related syntax (eg, for the data REPL).
    dataset_name_pattern = r"
        ^
        [[:alpha:]]
        (?:
            [[:alnum:]_] |
            / (?=[[:alpha:]])
        )*
        $
        "x
    if !occursin(dataset_name_pattern, name)
        error("DataSet name \"$name\" is invalid. DataSet names must start with a letter and can contain only letters, numbers, `_` or `/`.")
    end
end

# Hacky thing until we figure out which fields DataSet should actually have.
function Base.getproperty(d::DataSet, name::Symbol)
    if name in fieldnames(DataSet)
        return getfield(d, name)
    else
        getfield(d, :conf)[string(name)]
    end
end

Base.getindex(d::DataSet, name::AbstractString) = getindex(d.conf, name)
Base.haskey(d::DataSet, name::AbstractString) = haskey(d.conf, name)
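
# Example (sketch, reusing the hypothetical `ds` from above): both access
# styles read from the underlying `conf` dictionary.
#
#   ds.storage                 # == ds.conf["storage"], via getproperty
#   ds["storage"]["driver"]    # dictionary-style indexing
#   haskey(ds, "description")  # false unless the TOML provided one
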
# Split the fragment section as a '/' separated RelPath
function dataspec_fragment_as_path(d::DataSet)
    if haskey(d, "dataspec")
        fragment = get(d.dataspec, "fragment", nothing)
        if !isnothing(fragment)
            return RelPath(split(fragment, '/'))
        end
    end
    return nothing
end
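
# Example (sketch): a dataset whose config contains a `dataspec` table with a
# `fragment` entry such as "csvset/1.csv" would give a relative path with
# components ["csvset", "1.csv"]; with no fragment, `nothing` is returned.
#
#   dataspec_fragment_as_path(ds)   # e.g. RelPath(["csvset", "1.csv"])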

function Base.show(io::IO, d::DataSet)
    print(io, DataSet, "(name=$(repr(d.name)), uuid=$(repr(d.uuid)), #= … =#)")
end

function Base.show(io::IO, ::MIME"text/plain", d::DataSet)
    TOML.print(io, d.conf)
end


#-------------------------------------------------------------------------------
# Functions for opening datasets

# do-block form of open()
function Base.open(f::Function, as_type, dataset::DataSet)
    storage_config = dataset.storage
    driver = _find_driver(dataset)
    driver(storage_config, dataset) do storage
        open(f, as_type, storage)
    end
end
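
# Example (sketch, assuming a storage driver which exposes the data as a
# simple blob of bytes):
#
#   open(IO, dataset) do io
#       first_line = readline(io)   # stream over the dataset's content
#   end
#
#   open(String, dataset) do str
#       println(length(str))        # whole dataset read into a String
#   end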

# Contexts-based form of open()
@! function Base.open(dataset::DataSet)
    storage_config = dataset.storage
    driver = _find_driver(dataset)
    # Use `enter_do` because drivers don't yet use the ResourceContexts.jl mechanism
    (storage,) = @! enter_do(driver, storage_config, dataset)
    storage
end

@! function Base.open(as_type, dataset::DataSet)
    storage = @! open(dataset)
    @! open(as_type, storage)
end
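
# Example (sketch): with ResourceContexts.jl, `@!` propagates cleanup to an
# enclosing `@context` block, so the underlying storage is released when the
# block exits.
#
#   @context begin
#       io = @! open(IO, dataset)
#       readline(io)
#   end   # storage resources cleaned up here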

# TODO:
# Consider making a distinction between open() and load().

# Finalizer-based version of open()
function Base.open(dataset::DataSet)
    @context begin
        result = @! open(dataset)
        @! ResourceContexts.detach_context_cleanup(result)
    end
end

function Base.open(as_type, dataset::DataSet)
    @context begin
        result = @! open(as_type, dataset)
        @! ResourceContexts.detach_context_cleanup(result)
    end
end
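
# Example (sketch): the non-do-block form returns the opened object directly.
# Cleanup is detached from the context and left to finalizers, which is
# convenient at the REPL but less deterministic than the scoped forms above.
#
#   io = open(IO, dataset)
#   data = read(io, String)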