From 035b4ae455cdecc1dddde68a75bd05809a2c3e71 Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Wed, 16 Mar 2022 17:55:15 +1000 Subject: [PATCH 01/33] Cleanup: Split most code out of DataSets.jl (#36) * Move DataSet related code into DataSet.jl * Move DataProject related code into data_project.jl * Move registry of storage drivers into storage_drivers.jl * Remove link_dataset and unlink_dataset internal functions, and a buggy unwanted variant of load_project. --- src/DataSet.jl | 165 +++++++++ src/DataSets.jl | 681 +++----------------------------------- src/data_project.jl | 329 ++++++++++++++++++ src/file_data_projects.jl | 9 + src/storage_drivers.jl | 85 +++++ 5 files changed, 628 insertions(+), 641 deletions(-) create mode 100644 src/DataSet.jl create mode 100644 src/data_project.jl create mode 100644 src/storage_drivers.jl diff --git a/src/DataSet.jl b/src/DataSet.jl new file mode 100644 index 0000000..e761b6f --- /dev/null +++ b/src/DataSet.jl @@ -0,0 +1,165 @@ +""" +A `DataSet` is a metadata overlay for data held locally or remotely which is +unopinionated about the underlying storage mechanism. + +The data in a `DataSet` has a type which implies an index; the index can be +used to partition the data for processing. +""" +struct DataSet + # For now, the representation `conf` contains data read directly from the + # TOML. Once the design has settled we might get some explicit fields and + # do validation. + uuid::UUID # Unique identifier for the dataset. Use uuid4() to create these. + conf + + function DataSet(conf) + _check_keys(conf, DataSet, ["uuid"=>String, "storage"=>Dict, "name"=>String]) + _check_keys(conf["storage"], DataSet, ["driver"=>String]) + check_dataset_name(conf["name"]) + new(UUID(conf["uuid"]), conf) + end + + #= + name::String # Default name for convenience. + # The binding to an actual name is managed by the data + # project. 
+ storage # Storage config and driver definition + maps::Vector{DataMap} + + # Generic dictionary of other properties... for now. Required properties + # will be moved + _other::Dict{Symbol,Any} + + #storage_id # unique identifier in storage backend, if it exists + #owner # Project or user who owns the data + #description::String + #type # Some representation of the type of data? + # # An array, blob, table, tree, etc + #cachable::Bool # Can the data be cached? It might not for data governance + # # reasons or it might change commonly. + ## A set of identifiers + #tags::Set{String} + =# +end + +_key_match(config, (k,T)::Pair) = haskey(config, k) && config[k] isa T +_key_match(config, k::String) = haskey(config, k) + +function _check_keys(config, context, keys) + missed_keys = filter(k->!_key_match(config, k), keys) + if !isempty(missed_keys) + error(""" + Missing expected keys in $context: + $missed_keys + + In TOML fragment: + $(sprint(TOML.print,config)) + """) + end +end + +""" + check_dataset_name(name) + +Check whether a dataset name is valid. Valid names include start with a letter +and may contain letters, numbers or `_`. Names may be hieracicial, with pieces +separated with forward slashes. Examples: + + my_data + my_data_1 + username/data + organization/project/data +""" +function check_dataset_name(name::AbstractString) + # DataSet names disallow most punctuation for now, as it may be needed as + # delimiters in data-related syntax (eg, for the data REPL). + dataset_name_pattern = r" + ^ + [[:alpha:]] + (?: + [[:alnum:]_] | + / (?=[[:alpha:]]) + )* + $ + "x + if !occursin(dataset_name_pattern, name) + error("DataSet name \"$name\" is invalid. DataSet names must start with a letter and can contain only letters, numbers, `_` or `/`.") + end +end + +# Hacky thing until we figure out which fields DataSet should actually have. 
+function Base.getproperty(d::DataSet, name::Symbol) + if name in fieldnames(DataSet) + return getfield(d, name) + else + getfield(d, :conf)[string(name)] + end +end + +Base.getindex(d::DataSet, name::AbstractString) = getindex(d.conf, name) +Base.haskey(d::DataSet, name::AbstractString) = haskey(d.conf, name) + +# Split the fragment section as a '/' separated RelPath +function dataspec_fragment_as_path(d::DataSet) + if haskey(d, "dataspec") + fragment = get(d.dataspec, "fragment", nothing) + if !isnothing(fragment) + return RelPath(split(fragment, '/')) + end + end + return nothing +end + +function Base.show(io::IO, d::DataSet) + print(io, DataSet, "(name=$(repr(d.name)), uuid=$(repr(d.uuid)), #= … =#)") +end + +function Base.show(io::IO, ::MIME"text/plain", d::DataSet) + TOML.print(io, d.conf) +end + + +#------------------------------------------------------------------------------- +# Functions for opening datasets + +# do-block form of open() +function Base.open(f::Function, as_type, dataset::DataSet) + storage_config = dataset.storage + driver = _find_driver(dataset) + driver(storage_config, dataset) do storage + open(f, as_type, storage) + end +end + +# Contexts-based form of open() +@! function Base.open(dataset::DataSet) + storage_config = dataset.storage + driver = _find_driver(dataset) + # Use `enter_do` because drivers don't yet use the ResourceContexts.jl mechanism + (storage,) = @! enter_do(driver, storage_config, dataset) + storage +end + +@! function Base.open(as_type, dataset::DataSet) + storage = @! open(dataset) + @! open(as_type, storage) +end + +# TODO: +# Consider making a distinction between open() and load(). + +# Finalizer-based version of open() +function Base.open(dataset::DataSet) + @context begin + result = @! open(dataset) + @! ResourceContexts.detach_context_cleanup(result) + end +end + +function Base.open(as_type, dataset::DataSet) + @context begin + result = @! open(as_type, dataset) + @! 
ResourceContexts.detach_context_cleanup(result) + end +end + diff --git a/src/DataSets.jl b/src/DataSets.jl index 37c9cbd..b2d9b8b 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -17,349 +17,6 @@ const PACKAGE_VERSION = let VersionNumber(project["version"]) end -include("paths.jl") - -#------------------------------------------------------------------------------- - -""" -A `DataSet` is a metadata overlay for data held locally or remotely which is -unopinionated about the underlying storage mechanism. - -The data in a `DataSet` has a type which implies an index; the index can be -used to partition the data for processing. -""" -struct DataSet - # For now, the representation `conf` contains data read directly from the - # TOML. Once the design has settled we might get some explicit fields and - # do validation. - uuid::UUID # Unique identifier for the dataset. Use uuid4() to create these. - conf - - function DataSet(conf) - _check_keys(conf, DataSet, ["uuid"=>String, "storage"=>Dict, "name"=>String]) - _check_keys(conf["storage"], DataSet, ["driver"=>String]) - check_dataset_name(conf["name"]) - new(UUID(conf["uuid"]), conf) - end - - #= - name::String # Default name for convenience. - # The binding to an actual name is managed by the data - # project. - storage # Storage config and driver definition - maps::Vector{DataMap} - - # Generic dictionary of other properties... for now. Required properties - # will be moved - _other::Dict{Symbol,Any} - - #storage_id # unique identifier in storage backend, if it exists - #owner # Project or user who owns the data - #description::String - #type # Some representation of the type of data? - # # An array, blob, table, tree, etc - #cachable::Bool # Can the data be cached? It might not for data governance - # # reasons or it might change commonly. 
- ## A set of identifiers - #tags::Set{String} - =# -end - -_key_match(config, (k,T)::Pair) = haskey(config, k) && config[k] isa T -_key_match(config, k::String) = haskey(config, k) - -function _check_keys(config, context, keys) - missed_keys = filter(k->!_key_match(config, k), keys) - if !isempty(missed_keys) - error(""" - Missing expected keys in $context: - $missed_keys - - In TOML fragment: - $(sprint(TOML.print,config)) - """) - end -end - -""" - check_dataset_name(name) - -Check whether a dataset name is valid. Valid names include start with a letter -and may contain letters, numbers or `_`. Names may be hieracicial, with pieces -separated with forward slashes. Examples: - - my_data - my_data_1 - username/data - organization/project/data -""" -function check_dataset_name(name::AbstractString) - # DataSet names disallow most punctuation for now, as it may be needed as - # delimiters in data-related syntax (eg, for the data REPL). - dataset_name_pattern = r" - ^ - [[:alpha:]] - (?: - [[:alnum:]_] | - / (?=[[:alpha:]]) - )* - $ - "x - if !occursin(dataset_name_pattern, name) - error("DataSet name \"$name\" is invalid. DataSet names must start with a letter and can contain only letters, numbers, `_` or `/`.") - end -end - -# Hacky thing until we figure out which fields DataSet should actually have. 
-function Base.getproperty(d::DataSet, name::Symbol) - if name in fieldnames(DataSet) - return getfield(d, name) - else - getfield(d, :conf)[string(name)] - end -end - -Base.getindex(d::DataSet, name::AbstractString) = getindex(d.conf, name) -Base.haskey(d::DataSet, name::AbstractString) = haskey(d.conf, name) - -# Split the fragment section as a '/' separated RelPath -function dataspec_fragment_as_path(d::DataSet) - if haskey(d, "dataspec") - fragment = get(d.dataspec, "fragment", nothing) - if !isnothing(fragment) - return RelPath(split(fragment, '/')) - end - end - return nothing -end - -function Base.show(io::IO, d::DataSet) - print(io, DataSet, "(name=$(repr(d.name)), uuid=$(repr(d.uuid)), #= … =#)") -end - -function Base.show(io::IO, ::MIME"text/plain", d::DataSet) - TOML.print(io, d.conf) -end - - -#------------------------------------------------------------------------------- -""" -Subtypes of `AbstractDataProject` have the interface - -Must implement: - - `Base.get(project, dataset_name, default)` — search - - `Base.keys(project)` - get dataset names - -Optional: - - `Base.iterate()` — default implementation in terms of `keys` and `get` - - `Base.pairs()` — default implementation in terms of `keys` and `get` - - `Base.haskey()` — default implementation in terms of `get` - - `Base.getindex()` — default implementation in terms of `get` - - `DataSets.project_name()` — returns `nothing` by default. - -Provided by AbstractDataProject (should not be overridden): - - `DataSets.dataset()` - implemented in terms of `get` -""" -abstract type AbstractDataProject end - -function Base.getindex(proj::AbstractDataProject, name::AbstractString) - data = get(proj, name, nothing) - data != nothing || error("DataSet $(repr(name)) not found") - data -end - -""" - dataset(name) - dataset(project, name) - -Returns the [`DataSet`](@ref) with the given `name` from `project`. If omitted, -the global data environment [`DataSets.PROJECT`](@ref) will be used. 
- -The `DataSet` is *metadata*, but to use the actual *data* in your program you -need to use the `open` function to access the `DataSet`'s content as a given -Julia type. - -# Example - -To open a dataset named `"a_text_file"` and read the whole content as a String, - -```julia -content = open(String, dataset("a_text_file")) -``` - -To open the same dataset as an `IO` stream and read only the first line, - -```julia -open(IO, dataset("a_text_file")) do io - line = readline(io) - @info "The first line is" line -end -``` - -To open a directory as a browsable tree object, - -```julia -open(BlobTree, dataset("a_tree_example")) -``` -""" -function dataset(proj::AbstractDataProject, spec::AbstractString) - namestr, query, fragmentstr = _split_dataspec(spec) - - if isnothing(namestr) - throw(ArgumentError("Invalid dataset specification: $spec")) - end - - dataset = proj[namestr] - - if isnothing(query) && isnothing(fragmentstr) - return dataset - end - - # Enhance dataset with "dataspec" holding URL-like fragment & query - dataspec = Dict() - if !isnothing(query) - dataspec["query"] = Dict{String,Any}(query) - end - if !isnothing(fragmentstr) - dataspec["fragment"] = fragmentstr - end - - # We need to take care here with copy() to avoid modifying the original - # dataset configuration. - conf = copy(dataset.conf) - conf["dataspec"] = dataspec - - return DataSet(conf) -end - - -# Percent-decode a string according to the URI escaping rules. -# Vendored from URIs.jl for now to avoid depending on that entire package for -# this one function. 
-function _unescapeuri(str) - occursin("%", str) || return str - out = IOBuffer() - i = 1 - io = IOBuffer(str) - while !eof(io) - c = read(io, Char) - if c == '%' - c1 = read(io, Char) - c = read(io, Char) - write(out, parse(UInt8, string(c1, c); base=16)) - else - write(out, c) - end - end - return String(take!(out)) -end - -function _split_dataspec(spec::AbstractString) - # Parse as a suffix of URI syntax - # name/of/dataset?param1=value1¶m2=value2#fragment - m = match(r" - ^ - ((?:[[:alpha:]][[:alnum:]_]*/?)+) # name - a/b/c - (?:\?([^#]*))? # query - a=b&c=d - (?:\#(.*))? # fragment - ... - $"x, - spec) - if isnothing(m) - return nothing, nothing, nothing - end - namestr = m[1] - query = m[2] - fragmentstr = m[3] - - if !isnothing(query) - query = [_unescapeuri(x)=>_unescapeuri(y) for (x,y) in split.(split(query, '&'), '=')] - end - if !isnothing(fragmentstr) - fragmentstr = _unescapeuri(fragmentstr) - end - - namestr, query, fragmentstr -end - -function Base.haskey(proj::AbstractDataProject, name::AbstractString) - get(proj, name, nothing) !== nothing -end - -function Base.iterate(project::AbstractDataProject, state=nothing) - if isnothing(state) - ks = keys(project) - ks_itr = iterate(ks) - else - (ks, ks_state) = state - ks_itr = iterate(ks, ks_state) - end - if isnothing(ks_itr) - return nothing - end - (k, ks_state) = ks_itr - val = get(project, k, nothing) - if isnothing(val) - # val could be `nothing` if entries in the project are updated - # concurrently. (Eg, this might happen for data projects which are - # backed by the filesystem.) - return iterate(project, (ks, ks_state)) - end - (val, (ks, ks_state)) -end - -# Unknown size by default, due to the above get-based implementation of -# iterate, coupled with possible concurrent modification. 
-Base.IteratorSize(::AbstractDataProject) = Base.SizeUnknown() - -function Base.pairs(proj::AbstractDataProject) - ks = keys(proj) - (k=>d for (k,d) in (k=>get(proj, k, nothing) for k in ks) if !isnothing(d)) -end - -""" - project_name(data_project) - -Return the name of the given `data_project`. Ideally this can be used to -uniquely identify the project when modifying the project stack in -`DataSets.PROJECT`. For projects which were generated from -`JULIA_DATASETS_PATH`, this will be the expanded path component. - -Other types of projects will have to return something else. For example, remote -data projects may want to return a URI. For projects which have no obvious -identifier, `nothing` is returned. -""" -project_name(data_project::AbstractDataProject) = nothing - -data_drivers(proj::AbstractDataProject) = [] - -#------------------------------------------------------------------------------- -""" - DataProject - -A concrete data project is a collection of DataSets with associated names. -Names are unique within the project. -""" -struct DataProject <: AbstractDataProject - datasets::Dict{String,DataSet} - drivers::Vector{Dict{String,Any}} -end - -DataProject() = DataProject(Dict{String,DataSet}(), Vector{Dict{String,Any}}()) - -DataProject(project::AbstractDataProject) = DataProject(Dict(pairs(project)), - Vector{Dict{String,Any}}()) - -data_drivers(project::DataProject) = project.drivers - -function _fill_template(toml_path, toml_str) - # Super hacky templating for paths relative to the toml file. - # We really should have something a lot nicer here... - if Sys.iswindows() - toml_path = replace(toml_path, '\\'=>'/') - end - toml_str = replace(toml_str, "@__DIR__"=>toml_path) -end - """ `CURRENT_DATA_CONFIG_VERSION` is the current version of the data configuration format, as reflected in the Data.toml `data_config_version` key. 
This allows old @@ -401,174 +58,14 @@ name="" """ const CURRENT_DATA_CONFIG_VERSION = 1 -""" - load_project(path; auto_update=false) - load_project(config_dict) - -Load a data project from a system `path` referring to a TOML file. If -`auto_update` is true, the returned project will monitor the file for updates -and reload when necessary. - -Alternatively, create a `DataProject` from a an existing dictionary -`config_dict`, which should be in the Data.toml format. - -See also [`load_project!`](@ref). -""" -function load_project(path::AbstractString; auto_update=false) - sys_path = abspath(path) - auto_update ? TomlFileDataProject(sys_path) : - _load_project(read(sys_path,String), dirname(sys_path)) -end - -function load_project(config::AbstractDict; kws...) - _check_keys(config, "Data.toml", ["data_config_version"=>Integer, - "datasets"=>AbstractVector]) - format_ver = config["data_config_version"] - if format_ver > CURRENT_DATA_CONFIG_VERSION - error(""" - data_config_version=$format_ver is newer than supported. - Consider upgrading to a newer version of DataSets.jl - """) - end - proj = DataProject() - for dataset_conf in config["datasets"] - dataset = DataSet(dataset_conf) - link_dataset(proj, dataset.name => dataset) - end - if haskey(config, "drivers") - _check_keys(config, DataProject, ["drivers"=>AbstractVector]) - for driver_conf in config["drivers"] - _check_keys(driver_conf, DataProject, ["type"=>String, "name"=>String, "module"=>Dict]) - _check_keys(driver_conf["module"], DataProject, ["name"=>String, "uuid"=>String]) - push!(proj.drivers, driver_conf) - end - end - proj -end - -# TODO: Deprecate this? -function load_project(path::AbstractPath; kws) - load_project(sys_abspath(abspath(path)); kws...) 
-end - -function link_dataset(proj::DataProject, (name,data)::Pair) - proj.datasets[name] = data -end - -link_dataset(proj::DataProject, d::DataSet) = link_dataset(proj, d.name=>d) - -function unlink_dataset(proj::DataProject, name::AbstractString) - if !haskey(proj.datasets, name) - throw(ArgumentError("No dataset \"$name\" in data project")) - end - d = proj.datasets[name] - delete!(proj.datasets, name) - d -end - -function Base.get(proj::DataProject, name::AbstractString, default) - get(proj.datasets, name, default) -end - -Base.keys(proj::DataProject) = keys(proj.datasets) - -function Base.iterate(proj::DataProject, state=nothing) - # proj.datasets iterates key=>value; need to rejig it to iterate values. - itr = isnothing(state) ? iterate(proj.datasets) : iterate(proj.datasets, state) - isnothing(itr) && return nothing - (x, state) = itr - (x.second, state) -end - -function Base.show(io::IO, ::MIME"text/plain", project::AbstractDataProject) - datasets = collect(pairs(project)) - summary(io, project) - println(io, ":") - if isempty(datasets) - print(io, " (empty)") - return - end - sorted = sort(datasets, by=first) - maxwidth = maximum(textwidth.(first.(sorted))) - for (i, (name, data)) in enumerate(sorted) - pad = maxwidth - textwidth(name) - storagetype = get(data.storage, "type", nothing) - icon = storagetype == "Blob" ? '📄' : - storagetype == "BlobTree" ? 
'📁' : - '❓' - print(io, " ", icon, ' ', name, ' '^pad, " => ", data.uuid) - if i < length(sorted) - println(io) - end - end -end - -function Base.summary(io::IO, project::AbstractDataProject) - print(io, typeof(project)) - name = project_name(project) - if !isnothing(name) - print(io, " [", name, "]") - end -end +include("paths.jl") +include("DataSet.jl") +include("data_project.jl") +include("file_data_projects.jl") +include("storage_drivers.jl") #------------------------------------------------------------------------------- -""" - StackedDataProject() - StackedDataProject(projects) - -Search stack of AbstractDataProjects, where projects are searched from the -first to last element of `projects`. - -Additional projects may be added or removed from the stack with `pushfirst!`, -`push!` and `empty!`. - -See also [`DataSets.PROJECT`](@ref). -""" -struct StackedDataProject <: AbstractDataProject - projects::Vector -end - -StackedDataProject() = StackedDataProject([]) - -data_drivers(stack::StackedDataProject) = vcat(data_drivers.(stack.projects)...) - -function Base.keys(stack::StackedDataProject) - names = [] - for project in stack.projects - append!(names, keys(project)) - end - unique(names) -end - -function Base.get(stack::StackedDataProject, name::AbstractString, default) - for project in stack.projects - d = get(project, name, nothing) - if !isnothing(d) - return d - end - end -end - -# API for manipulating the stack. 
-Base.push!(stack::StackedDataProject, project) = push!(stack.projects, project) -Base.pushfirst!(stack::StackedDataProject, project) = pushfirst!(stack.projects, project) -Base.popfirst!(stack::StackedDataProject) = popfirst!(stack.projects) -Base.pop!(stack::StackedDataProject) = pop!(stack.projects) -Base.empty!(stack::StackedDataProject) = empty!(stack.projects) - -function Base.show(io::IO, mime::MIME"text/plain", stack::StackedDataProject) - summary(io, stack) - println(io, ":") - for (i,project) in enumerate(stack.projects) - # show(io, mime, project) - # indent each project - str = sprint(show, mime, project) - print(io, join(" " .* split(str, "\n"), "\n")) - i != length(stack.projects) && println(io) - end -end - -include("file_data_projects.jl") +# Global datasets configuration for current Julia session function expand_project_path(path) if path == "@" @@ -608,9 +105,6 @@ function create_project_stack(env) StackedDataProject(stack) end -#------------------------------------------------------------------------------- -# Global datasets configuration for current Julia session - # Global stack of data projects, with the top of the stack being searched # first. """ @@ -667,6 +161,40 @@ function __init__() end end +""" + dataset(name) + dataset(project, name) + +Returns the [`DataSet`](@ref) with the given `name` from `project`. If omitted, +the global data environment [`DataSets.PROJECT`](@ref) will be used. + +The `DataSet` is *metadata*, but to use the actual *data* in your program you +need to use the `open` function to access the `DataSet`'s content as a given +Julia type. 
+ +# Example + +To open a dataset named `"a_text_file"` and read the whole content as a String, + +```julia +content = open(String, dataset("a_text_file")) +``` + +To open the same dataset as an `IO` stream and read only the first line, + +```julia +open(IO, dataset("a_text_file")) do io + line = readline(io) + @info "The first line is" line +end +``` + +To open a directory as a browsable tree object, + +```julia +open(BlobTree, dataset("a_tree_example")) +``` +""" dataset(name) = dataset(PROJECT, name) """ @@ -685,135 +213,6 @@ function load_project!(path_or_config) _current_project = DataProject(new_project) end -#------------------------------------------------------------------------------- -# Storage layer and interface - -const _storage_drivers_lock = ReentrantLock() -const _storage_drivers = Dict{String,Any}() - -""" - add_storage_driver(driver_name=>storage_opener) - -Associate DataSet storage driver named `driver_name` with `storage_opener`. -When a `dataset` with `storage.driver == driver_name` is opened, -`storage_opener(user_func, storage_config, dataset)` will be called. Any -existing storage driver registered to `driver_name` will be overwritten. - -As a matter of convention, `storage_opener` should generally take configuration -from `storage_config` which is just `dataset.storage`. But to avoid config -duplication it may also use the content of `dataset`, (for example, dataset.uuid). - -Packages which define new storage drivers should generally call -`add_storage_driver()` within their `__init__()` functions. 
-""" -function add_storage_driver((name,opener)::Pair) - lock(_storage_drivers_lock) do - _storage_drivers[name] = opener - end -end - -function add_storage_driver(project::AbstractDataProject) - for conf in data_drivers(project) - pkgid = PkgId(UUID(conf["module"]["uuid"]), conf["module"]["name"]) - if Base.haskey(Base.package_locks, pkgid) - # Hack: Avoid triggering another call to require() for packages - # which are already in the process of being loaded. (This would - # result in a deadlock!) - # - # Obviously this depends on Base internals... - continue - end - mod = Base.require(pkgid) - #= - # TODO: Improve driver loading invariants. - # - # The difficulty here is that there's two possible ways for drivers to - # work: - # 1. The driver depends explicitly on `using DataSets`, so - # DataSets.__init__ is called *before* the Driver.__init__. - # 2. The driver uses a Requires-like mechanism to support multiple - # incompatible DataSets versions, so Driver.__init__ can occur - # *before* DataSets.__init__. - # - # This makes it hard for DataSets to check which drivers are added by a - # module: In case (2), the following check fails when the driver is - # loaded before DataSets and in case (1) we hit the double-require - # problem, resulting in the Base.package_locks bailout which disables - # the check below. - # - if conf["type"] == "storage" - driver_name = conf["name"] - # `mod` is assumed to run add_storage_driver() inside its __init__, - # unless the symbol mod.datasets_load_hook exists (in which case we - # call this instead). 
- lock(_storage_drivers_lock) do - get(_storage_drivers, driver_name) do - error("Package $pkgid did not provide storage driver $driver_name") - end - end - end - =# - end -end - -function _find_driver(dataset) - storage_config = dataset.storage - driver_name = get(storage_config, "driver") do - error("`storage.driver` configuration not found for dataset $(dataset.name)") - end - driver = lock(_storage_drivers_lock) do - get(_storage_drivers, driver_name) do - error(""" - Storage driver $(repr(driver_name)) not found for dataset $(dataset.name). - Current drivers are $(collect(keys(_storage_drivers))) - """) - end - end -end - -#------------------------------------------------------------------------------- -# Functions for opening datasets - -# do-block form of open() -function Base.open(f::Function, as_type, dataset::DataSet) - storage_config = dataset.storage - driver = _find_driver(dataset) - driver(storage_config, dataset) do storage - open(f, as_type, storage) - end -end - -# Contexts-based form of open() -@! function Base.open(dataset::DataSet) - storage_config = dataset.storage - driver = _find_driver(dataset) - # Use `enter_do` because drivers don't yet use the ResourceContexts.jl mechanism - (storage,) = @! enter_do(driver, storage_config, dataset) - storage -end - -@! function Base.open(as_type, dataset::DataSet) - storage = @! open(dataset) - @! open(as_type, storage) -end - -# TODO: -# Consider making a distinction between open() and load(). - -# Finalizer-based version of open() -function Base.open(dataset::DataSet) - @context begin - result = @! open(dataset) - @! ResourceContexts.detach_context_cleanup(result) - end -end - -function Base.open(as_type, dataset::DataSet) - @context begin - result = @! open(as_type, dataset) - @! 
ResourceContexts.detach_context_cleanup(result) - end -end #------------------------------------------------------------------------------- # Application entry points diff --git a/src/data_project.jl b/src/data_project.jl new file mode 100644 index 0000000..de97786 --- /dev/null +++ b/src/data_project.jl @@ -0,0 +1,329 @@ +# AbstractDataProject and the generic DataProject + +""" +Subtypes of `AbstractDataProject` have the interface + +Must implement: + - `Base.get(project, dataset_name, default)` — search + - `Base.keys(project)` - get dataset names + +Optional: + - `Base.iterate()` — default implementation in terms of `keys` and `get` + - `Base.pairs()` — default implementation in terms of `keys` and `get` + - `Base.haskey()` — default implementation in terms of `get` + - `Base.getindex()` — default implementation in terms of `get` + - `DataSets.project_name()` — returns `nothing` by default. + +Provided by AbstractDataProject (should not be overridden): + - `DataSets.dataset()` - implemented in terms of `get` +""" +abstract type AbstractDataProject end + +function Base.getindex(proj::AbstractDataProject, name::AbstractString) + data = get(proj, name, nothing) + data != nothing || error("DataSet $(repr(name)) not found") + data +end + + +function dataset(proj::AbstractDataProject, spec::AbstractString) + namestr, query, fragmentstr = _split_dataspec(spec) + + if isnothing(namestr) + throw(ArgumentError("Invalid dataset specification: $spec")) + end + + dataset = proj[namestr] + + if isnothing(query) && isnothing(fragmentstr) + return dataset + end + + # Enhance dataset with "dataspec" holding URL-like fragment & query + dataspec = Dict() + if !isnothing(query) + dataspec["query"] = Dict{String,Any}(query) + end + if !isnothing(fragmentstr) + dataspec["fragment"] = fragmentstr + end + + # We need to take care here with copy() to avoid modifying the original + # dataset configuration. 
+ conf = copy(dataset.conf) + conf["dataspec"] = dataspec + + return DataSet(conf) +end + + +# Percent-decode a string according to the URI escaping rules. +# Vendored from URIs.jl for now to avoid depending on that entire package for +# this one function. +function _unescapeuri(str) + occursin("%", str) || return str + out = IOBuffer() + i = 1 + io = IOBuffer(str) + while !eof(io) + c = read(io, Char) + if c == '%' + c1 = read(io, Char) + c = read(io, Char) + write(out, parse(UInt8, string(c1, c); base=16)) + else + write(out, c) + end + end + return String(take!(out)) +end + +function _split_dataspec(spec::AbstractString) + # Parse as a suffix of URI syntax + # name/of/dataset?param1=value1¶m2=value2#fragment + m = match(r" + ^ + ((?:[[:alpha:]][[:alnum:]_]*/?)+) # name - a/b/c + (?:\?([^#]*))? # query - a=b&c=d + (?:\#(.*))? # fragment - ... + $"x, + spec) + if isnothing(m) + return nothing, nothing, nothing + end + namestr = m[1] + query = m[2] + fragmentstr = m[3] + + if !isnothing(query) + query = [_unescapeuri(x)=>_unescapeuri(y) for (x,y) in split.(split(query, '&'), '=')] + end + if !isnothing(fragmentstr) + fragmentstr = _unescapeuri(fragmentstr) + end + + namestr, query, fragmentstr +end + +function Base.haskey(proj::AbstractDataProject, name::AbstractString) + get(proj, name, nothing) !== nothing +end + +function Base.iterate(project::AbstractDataProject, state=nothing) + if isnothing(state) + ks = keys(project) + ks_itr = iterate(ks) + else + (ks, ks_state) = state + ks_itr = iterate(ks, ks_state) + end + if isnothing(ks_itr) + return nothing + end + (k, ks_state) = ks_itr + val = get(project, k, nothing) + if isnothing(val) + # val could be `nothing` if entries in the project are updated + # concurrently. (Eg, this might happen for data projects which are + # backed by the filesystem.) 
+ return iterate(project, (ks, ks_state)) + end + (val, (ks, ks_state)) +end + +# Unknown size by default, due to the above get-based implementation of +# iterate, coupled with possible concurrent modification. +Base.IteratorSize(::AbstractDataProject) = Base.SizeUnknown() + +function Base.pairs(proj::AbstractDataProject) + ks = keys(proj) + (k=>d for (k,d) in (k=>get(proj, k, nothing) for k in ks) if !isnothing(d)) +end + +""" + project_name(data_project) + +Return the name of the given `data_project`. Ideally this can be used to +uniquely identify the project when modifying the project stack in +`DataSets.PROJECT`. For projects which were generated from +`JULIA_DATASETS_PATH`, this will be the expanded path component. + +Other types of projects will have to return something else. For example, remote +data projects may want to return a URI. For projects which have no obvious +identifier, `nothing` is returned. +""" +project_name(data_project::AbstractDataProject) = nothing + +data_drivers(proj::AbstractDataProject) = [] + +function Base.show(io::IO, ::MIME"text/plain", project::AbstractDataProject) + datasets = collect(pairs(project)) + summary(io, project) + println(io, ":") + if isempty(datasets) + print(io, " (empty)") + return + end + sorted = sort(datasets, by=first) + maxwidth = maximum(textwidth.(first.(sorted))) + for (i, (name, data)) in enumerate(sorted) + pad = maxwidth - textwidth(name) + storagetype = get(data.storage, "type", nothing) + icon = storagetype == "Blob" ? '📄' : + storagetype == "BlobTree" ? '📁' : + '❓' + print(io, " ", icon, ' ', name, ' '^pad, " => ", data.uuid) + if i < length(sorted) + println(io) + end + end +end + +function Base.summary(io::IO, project::AbstractDataProject) + print(io, typeof(project)) + name = project_name(project) + if !isnothing(name) + print(io, " [", name, "]") + end +end + +#------------------------------------------------------------------------------- +""" + DataProject + +A in-memory collection of DataSets. 
+""" +struct DataProject <: AbstractDataProject + datasets::Dict{String,DataSet} + drivers::Vector{Dict{String,Any}} +end + +DataProject() = DataProject(Dict{String,DataSet}(), Vector{Dict{String,Any}}()) + +DataProject(project::AbstractDataProject) = DataProject(Dict(pairs(project)), + Vector{Dict{String,Any}}()) + +data_drivers(project::DataProject) = project.drivers + +function Base.get(proj::DataProject, name::AbstractString, default) + get(proj.datasets, name, default) +end + +Base.keys(proj::DataProject) = keys(proj.datasets) + +function Base.iterate(proj::DataProject, state=nothing) + # proj.datasets iterates key=>value; need to rejig it to iterate values. + itr = isnothing(state) ? iterate(proj.datasets) : iterate(proj.datasets, state) + isnothing(itr) && return nothing + (x, state) = itr + (x.second, state) +end + +function Base.setindex!(proj::DataProject, data::DataSet, name::AbstractString) + proj.datasets[name] = data +end + +#------------------------------------------------------------------------------- +""" + StackedDataProject() + StackedDataProject(projects) + +Search stack of AbstractDataProjects, where projects are searched from the +first to last element of `projects`. + +Additional projects may be added or removed from the stack with `pushfirst!`, +`push!` and `empty!`. + +See also [`DataSets.PROJECT`](@ref). +""" +struct StackedDataProject <: AbstractDataProject + projects::Vector +end + +StackedDataProject() = StackedDataProject([]) + +data_drivers(stack::StackedDataProject) = vcat(data_drivers.(stack.projects)...) + +function Base.keys(stack::StackedDataProject) + names = [] + for project in stack.projects + append!(names, keys(project)) + end + unique(names) +end + +function Base.get(stack::StackedDataProject, name::AbstractString, default) + for project in stack.projects + d = get(project, name, nothing) + if !isnothing(d) + return d + end + end +end + +# API for manipulating the stack. 
+Base.push!(stack::StackedDataProject, project) = push!(stack.projects, project) +Base.pushfirst!(stack::StackedDataProject, project) = pushfirst!(stack.projects, project) +Base.popfirst!(stack::StackedDataProject) = popfirst!(stack.projects) +Base.pop!(stack::StackedDataProject) = pop!(stack.projects) +Base.empty!(stack::StackedDataProject) = empty!(stack.projects) + +function Base.show(io::IO, mime::MIME"text/plain", stack::StackedDataProject) + summary(io, stack) + println(io, ":") + for (i,project) in enumerate(stack.projects) + # show(io, mime, project) + # indent each project + str = sprint(show, mime, project) + print(io, join(" " .* split(str, "\n"), "\n")) + i != length(stack.projects) && println(io) + end +end + + +#------------------------------------------------------------------------------- +""" + load_project(path; auto_update=false) + load_project(config_dict) + +Load a data project from a system `path` referring to a TOML file. If +`auto_update` is true, the returned project will monitor the file for updates +and reload when necessary. + +Alternatively, create a `DataProject` from a an existing dictionary +`config_dict`, which should be in the Data.toml format. + +See also [`load_project!`](@ref). +""" +function load_project(path::AbstractString; auto_update=false) + sys_path = abspath(path) + auto_update ? TomlFileDataProject(sys_path) : + _load_project(read(sys_path,String), dirname(sys_path)) +end + +function load_project(config::AbstractDict; kws...) + _check_keys(config, "Data.toml", ["data_config_version"=>Integer, + "datasets"=>AbstractVector]) + format_ver = config["data_config_version"] + if format_ver > CURRENT_DATA_CONFIG_VERSION + error(""" + data_config_version=$format_ver is newer than supported. 
+ Consider upgrading to a newer version of DataSets.jl + """) + end + proj = DataProject() + for dataset_conf in config["datasets"] + dataset = DataSet(dataset_conf) + proj[dataset.name] = dataset + end + if haskey(config, "drivers") + _check_keys(config, DataProject, ["drivers"=>AbstractVector]) + for driver_conf in config["drivers"] + _check_keys(driver_conf, DataProject, ["type"=>String, "name"=>String, "module"=>Dict]) + _check_keys(driver_conf["module"], DataProject, ["name"=>String, "uuid"=>String]) + push!(proj.drivers, driver_conf) + end + end + proj +end + diff --git a/src/file_data_projects.jl b/src/file_data_projects.jl index c484430..b396bd3 100644 --- a/src/file_data_projects.jl +++ b/src/file_data_projects.jl @@ -190,6 +190,15 @@ project_name(::ActiveDataProject) = _active_project_data_toml() #------------------------------------------------------------------------------- +function _fill_template(toml_path, toml_str) + # Super hacky templating for paths relative to the toml file. + # We really should have something a lot nicer here... + if Sys.iswindows() + toml_path = replace(toml_path, '\\'=>'/') + end + toml_str = replace(toml_str, "@__DIR__"=>toml_path) +end + function _load_project(content::AbstractString, sys_data_dir) toml_str = _fill_template(sys_data_dir, content) config = TOML.parse(toml_str) diff --git a/src/storage_drivers.jl b/src/storage_drivers.jl new file mode 100644 index 0000000..6794bbe --- /dev/null +++ b/src/storage_drivers.jl @@ -0,0 +1,85 @@ +# Global record of registered storage drivers + +const _storage_drivers_lock = ReentrantLock() +const _storage_drivers = Dict{String,Any}() + +""" + add_storage_driver(driver_name=>storage_opener) + +Associate DataSet storage driver named `driver_name` with `storage_opener`. +When a `dataset` with `storage.driver == driver_name` is opened, +`storage_opener(user_func, storage_config, dataset)` will be called. Any +existing storage driver registered to `driver_name` will be overwritten. 
+ +As a matter of convention, `storage_opener` should generally take configuration +from `storage_config` which is just `dataset.storage`. But to avoid config +duplication it may also use the content of `dataset`, (for example, dataset.uuid). + +Packages which define new storage drivers should generally call +`add_storage_driver()` within their `__init__()` functions. +""" +function add_storage_driver((name,opener)::Pair) + lock(_storage_drivers_lock) do + _storage_drivers[name] = opener + end +end + +function add_storage_driver(project::AbstractDataProject) + for conf in data_drivers(project) + pkgid = PkgId(UUID(conf["module"]["uuid"]), conf["module"]["name"]) + if Base.haskey(Base.package_locks, pkgid) + # Hack: Avoid triggering another call to require() for packages + # which are already in the process of being loaded. (This would + # result in a deadlock!) + # + # Obviously this depends on Base internals... + continue + end + mod = Base.require(pkgid) + #= + # TODO: Improve driver loading invariants. + # + # The difficulty here is that there's two possible ways for drivers to + # work: + # 1. The driver depends explicitly on `using DataSets`, so + # DataSets.__init__ is called *before* the Driver.__init__. + # 2. The driver uses a Requires-like mechanism to support multiple + # incompatible DataSets versions, so Driver.__init__ can occur + # *before* DataSets.__init__. + # + # This makes it hard for DataSets to check which drivers are added by a + # module: In case (2), the following check fails when the driver is + # loaded before DataSets and in case (1) we hit the double-require + # problem, resulting in the Base.package_locks bailout which disables + # the check below. + # + if conf["type"] == "storage" + driver_name = conf["name"] + # `mod` is assumed to run add_storage_driver() inside its __init__, + # unless the symbol mod.datasets_load_hook exists (in which case we + # call this instead). 
+ lock(_storage_drivers_lock) do + get(_storage_drivers, driver_name) do + error("Package $pkgid did not provide storage driver $driver_name") + end + end + end + =# + end +end + +function _find_driver(dataset) + storage_config = dataset.storage + driver_name = get(storage_config, "driver") do + error("`storage.driver` configuration not found for dataset $(dataset.name)") + end + driver = lock(_storage_drivers_lock) do + get(_storage_drivers, driver_name) do + error(""" + Storage driver $(repr(driver_name)) not found for dataset $(dataset.name). + Current drivers are $(collect(keys(_storage_drivers))) + """) + end + end +end + From fa0ea84f34140b68e3e575adba9af7eb3378553e Mon Sep 17 00:00:00 2001 From: Fons van der Plas Date: Wed, 20 Apr 2022 03:59:16 +0200 Subject: [PATCH 02/33] show method for DataSet: add line at the top (#37) --- src/DataSet.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/DataSet.jl b/src/DataSet.jl index e761b6f..70a01c8 100644 --- a/src/DataSet.jl +++ b/src/DataSet.jl @@ -115,6 +115,8 @@ function Base.show(io::IO, d::DataSet) end function Base.show(io::IO, ::MIME"text/plain", d::DataSet) + println(io, "DataSet instance:") + println(io) TOML.print(io, d.conf) end From fa2e7ddc69faad510b0617e3be030f80b9939ab0 Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Thu, 28 Apr 2022 15:59:38 +1000 Subject: [PATCH 03/33] Data REPL fixes: `ls` shows the stack etc (#39) Also: * `data> ls` shows the stack * Fix completion when a full command is given * Fix `show` when file is empty --- src/repl.jl | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/repl.jl b/src/repl.jl index 0e3d331..2988352 100644 --- a/src/repl.jl +++ b/src/repl.jl @@ -10,7 +10,7 @@ Press `>` to enter the data repl. Press TAB to complete commands. 
| Command | Alias | Action | |:---------- |:--------- | :---------- | | `help` | `?` | Show this message | -| `list` | `ls` | List all datasets by name | +| `list` | `ls` | List stack of projects and datasets by name | | `show $name` | | Preview the content of dataset `$name` | | `stack` | `st` | Manipulate the global data search stack | | `stack list` | `st ls` | List all projects in the global data search stack | @@ -76,7 +76,7 @@ function _show_dataset(out_stream::IO, blob::Blob) end display_lines, _ = displaysize(out_stream) max_lines = max(5, display_lines ÷ 2) - if n_textlike / length(str) > 0.95 + if length(str) == 0 || n_textlike / length(str) > 0.95 # It's approximately UTF-8 encoded text data - print as text lines = split(str, '\n', keepempty=true) nlines = min(lastindex(lines), max_lines) @@ -127,10 +127,6 @@ function complete_command_list(cmd_prefix, commands) completions = String[] for cmdset in commands for cmd in cmdset - if cmd == cmd_prefix - # Space after full length command - return ([" "], "", true) - end if startswith(cmd, cmd_prefix) push!(completions, cmd*" ") break @@ -209,7 +205,7 @@ function parse_data_repl_cmd(cmdstr) popfirst!(tokens) if cmd in ("list", "ls") return quote - $DataSets.DataProject($DataSets.PROJECT) + $DataSets.PROJECT end elseif cmd == "stack" && length(tokens) >= 1 subcmd = popfirst!(tokens) From fcb5666e7e0f72ec13302c518e9a5459877950ee Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Tue, 3 May 2022 15:43:52 +1000 Subject: [PATCH 04/33] Add DataSets.from_path(); allow `-` in dataset names (#40) The from_path function can be used to create a standalone DataSet from data on the local filesystem, inferring the type as file or directory. Also allow `-` in dataset names: this should be harmless enough in terms of our URL-like dataspec syntax, and also in the data REPL where shell-like splitting treats `-` as part of words. This is particularly useful on JuliaHub where we allow user names to contain `-`. 
--- src/DataSet.jl | 25 +++++++++++++++++---- src/file_data_projects.jl | 32 +++++++++++++++++++++++++++ test/runtests.jl | 46 +++++++++++++++++++++++++++++---------- 3 files changed, 88 insertions(+), 15 deletions(-) diff --git a/src/DataSet.jl b/src/DataSet.jl index 70a01c8..14278d7 100644 --- a/src/DataSet.jl +++ b/src/DataSet.jl @@ -59,7 +59,7 @@ function _check_keys(config, context, keys) end """ - check_dataset_name(name) + is_valid_dataset_name(name) Check whether a dataset name is valid. Valid names include start with a letter and may contain letters, numbers or `_`. Names may be hieracicial, with pieces @@ -70,19 +70,36 @@ separated with forward slashes. Examples: username/data organization/project/data """ -function check_dataset_name(name::AbstractString) +function is_valid_dataset_name(name::AbstractString) # DataSet names disallow most punctuation for now, as it may be needed as # delimiters in data-related syntax (eg, for the data REPL). dataset_name_pattern = r" ^ [[:alpha:]] (?: - [[:alnum:]_] | + [-[:alnum:]_] | / (?=[[:alpha:]]) )* $ "x - if !occursin(dataset_name_pattern, name) + return occursin(dataset_name_pattern, name) +end + +function make_valid_dataset_name(name) + if !is_valid_dataset_name(name) + name = replace(name, r"^[^[:alpha:]]+"=>"") + name = replace(name, '\\'=>'/') + name = replace(name, r"[^-[:alnum:]_/]"=>"_") + if !is_valid_dataset_name(name) + # best-effort fallback + name = "data" + end + end + return name +end + +function check_dataset_name(name::AbstractString) + if !is_valid_dataset_name(name) error("DataSet name \"$name\" is invalid. 
DataSet names must start with a letter and can contain only letters, numbers, `_` or `/`.") end end diff --git a/src/file_data_projects.jl b/src/file_data_projects.jl index b396bd3..55c7d5c 100644 --- a/src/file_data_projects.jl +++ b/src/file_data_projects.jl @@ -205,3 +205,35 @@ function _load_project(content::AbstractString, sys_data_dir) load_project(config) end +#------------------------------------------------------------------------------- +""" + from_path(path) + +Create a `DataSet` from a local filesystem path. The type of the dataset is +inferred as a blob or tree based on whether the local path is a file or +directory. +""" +function from_path(path::AbstractString) + dtype = isfile(path) ? "Blob" : + isdir(path) ? "BlobTree" : + nothing + + if isnothing(dtype) + msg = ispath(path) ? + "Unrecognized data at path \"$path\"" : + "Path \"$path\" does not exist" + throw(ArgumentError(msg)) + end + + conf = Dict( + "name"=>make_valid_dataset_name(path), + "uuid"=>string(uuid4()), + "storage"=>Dict( + "driver"=>"FileSystem", + "type"=>dtype, + "path"=>abspath(path), + ) + ) + + DataSet(conf) +end diff --git a/test/runtests.jl b/test/runtests.jl index ba534a2..d4bbf13 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -65,6 +65,17 @@ end end end +#------------------------------------------------------------------------------- +@testset "from_path" begin + file_dataset = DataSets.from_path(joinpath(@__DIR__, "data", "file.txt")) + @test read(open(file_dataset), String) == "Hello world!\n" + + dir_dataset = DataSets.from_path(joinpath(@__DIR__, "data", "csvset")) + + @test open(dir_dataset) isa BlobTree + @test keys(open(dir_dataset)) == ["1.csv", "2.csv"] +end + #------------------------------------------------------------------------------- @testset "open() for Blob and BlobTree" begin blob = Blob(FileSystemRoot("data/file.txt")) @@ -91,20 +102,33 @@ end end #------------------------------------------------------------------------------- -@testset "Data set 
name parsing" begin +@testset "Data set names" begin # Valid names - @test DataSets.check_dataset_name("a_b") === nothing - @test DataSets.check_dataset_name("a1") === nothing - @test DataSets.check_dataset_name("δεδομένα") === nothing - @test DataSets.check_dataset_name("a/b") === nothing - @test DataSets.check_dataset_name("a/b/c") === nothing + @test DataSets.is_valid_dataset_name("a_b") + @test DataSets.is_valid_dataset_name("a-b") + @test DataSets.is_valid_dataset_name("a1") + @test DataSets.is_valid_dataset_name("δεδομένα") + @test DataSets.is_valid_dataset_name("a/b") + @test DataSets.is_valid_dataset_name("a/b/c") # Invalid names + @test !DataSets.is_valid_dataset_name("1") + @test !DataSets.is_valid_dataset_name("a b") + @test !DataSets.is_valid_dataset_name("a.b") + @test !DataSets.is_valid_dataset_name("a/b/") + @test !DataSets.is_valid_dataset_name("a//b") + @test !DataSets.is_valid_dataset_name("/a/b") + # Error message for invalid names @test_throws ErrorException("DataSet name \"a?b\" is invalid. 
DataSet names must start with a letter and can contain only letters, numbers, `_` or `/`.") DataSets.check_dataset_name("a?b") - @test_throws ErrorException DataSets.check_dataset_name("1") - @test_throws ErrorException DataSets.check_dataset_name("a b") - @test_throws ErrorException DataSets.check_dataset_name("a.b") - @test_throws ErrorException DataSets.check_dataset_name("a/b/") - @test_throws ErrorException DataSets.check_dataset_name("/a/b") + + # Making valid names from path-like things + @test DataSets.make_valid_dataset_name("a/b") == "a/b" + @test DataSets.make_valid_dataset_name("a1") == "a1" + @test DataSets.make_valid_dataset_name("1a") == "a" + @test DataSets.make_valid_dataset_name("//a/b") == "a/b" + @test DataSets.make_valid_dataset_name("a..b") == "a__b" + @test DataSets.make_valid_dataset_name("C:\\a\\b") == "C_/a/b" + # fallback + @test DataSets.make_valid_dataset_name("a//b") == "data" end @testset "URL-like dataspec parsing" begin From 0e4aadec7bdccfe19ac1da5a0b614d4ae8f41335 Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Thu, 12 May 2022 18:14:32 +1000 Subject: [PATCH 05/33] Fix file naming inconsistency (#44) I had the TomlDataStorage struct inside DataTomlStorage.jl ?? 
--- src/DataSets.jl | 2 +- src/{DataTomlStorage.jl => TomlDataStorage.jl} | 0 test/{DataTomlStorage.jl => TomlDataStorage.jl} | 0 test/runtests.jl | 2 +- 4 files changed, 2 insertions(+), 2 deletions(-) rename src/{DataTomlStorage.jl => TomlDataStorage.jl} (100%) rename test/{DataTomlStorage.jl => TomlDataStorage.jl} (100%) diff --git a/src/DataSets.jl b/src/DataSets.jl index b2d9b8b..c6737c3 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -223,7 +223,7 @@ include("BlobTree.jl") # Builtin backends include("filesystem.jl") -include("DataTomlStorage.jl") +include("TomlDataStorage.jl") # Backends # include("ZipTree.jl") diff --git a/src/DataTomlStorage.jl b/src/TomlDataStorage.jl similarity index 100% rename from src/DataTomlStorage.jl rename to src/TomlDataStorage.jl diff --git a/test/DataTomlStorage.jl b/test/TomlDataStorage.jl similarity index 100% rename from test/DataTomlStorage.jl rename to test/TomlDataStorage.jl diff --git a/test/runtests.jl b/test/runtests.jl index d4bbf13..bafb3e5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -176,6 +176,6 @@ end include("projects.jl") include("entrypoint.jl") include("repl.jl") -include("DataTomlStorage.jl") +include("TomlDataStorage.jl") include("backend_compat.jl") include("driver_autoload.jl") From 7ffc2d2e25c6d7d4d8241b9a26e05087d7162093 Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Thu, 28 Apr 2022 14:30:47 +1000 Subject: [PATCH 06/33] Big BlobTree API refactor including newfile() / newdir() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor newfile() / newdir() into primarily in-place APIs. A temporary directory can still be created with newdir(). Before this refactor the API looked slightly nicer, but we would never be able to reach the efficiency of the native file APIs. In particular, we might end up moving data across devices as a last step in constructing a directory tree — this seemed bad! 
Various changes to BlobTree API to make it more Dict-like and less reliant on overloading filesystem-like functions. * Further document the BlobTree API * Allow path strings as keys in more places in BlobTree * Deprecations of some filesystem-like functions --- src/BlobTree.jl | 291 +++++++++++++++++++++++++++++++++----------- src/DataSets.jl | 3 + src/data_project.jl | 1 - src/filesystem.jl | 102 ++++++++++------ src/paths.jl | 4 +- 5 files changed, 293 insertions(+), 108 deletions(-) diff --git a/src/BlobTree.jl b/src/BlobTree.jl index 1af898c..24090e8 100644 --- a/src/BlobTree.jl +++ b/src/BlobTree.jl @@ -43,22 +43,6 @@ function Base.iterate(tree::AbstractBlobTree, state=nothing) end end -""" - children(tree::AbstractBlobTree) - -Return an array of the children of `tree`. A child `x` may abstractly either be -another tree (`children(x)` returns a collection) or a file, where `children(x)` -returns `()`. - -Note that this is subtly different from `readdir(path)` which returns relative -paths, or `readdir(path, join=true)` which returns absolute paths. -""" -function children(tree::AbstractBlobTree) - # TODO: Is dispatch to the root a correct default? - children(tree.root, tree.path) -end - - """ showtree([io,], tree) @@ -100,13 +84,12 @@ end function Base.copy!(dst::AbstractBlobTree, src::AbstractBlobTree) for x in src - newpath = joinpath(dst, basename(x)) + xname = basename(x) if isdir(x) - newdir = mkdir(newpath) - copy!(newdir, x) + copy!(newdir(dst, xname), x) else open(x) do io_src - open(newpath, write=true) do io_dst + newfile(dst, xname, write=true) do io_dst write(io_dst, io_src) end end @@ -247,29 +230,59 @@ The tree implements the `AbstracTrees.children()` interface and may be indexed with paths to traverse the hierarchy down to the leaves ("files") which are of type `Blob`. Individual leaves may be `open()`ed as various Julia types. 
+# Operations on BlobTree + +BlobTree has a largely dictionary-like interface: + +* List keys (ie, file and directory names): `keys(tree)` +* List keys and values: `pairs(tree)` +* Query keys: `haskey(tree)` +* Traverse the tree: `tree["path"]` +* Add new content: `newdir(tree, "path")`, `newfile(tree, "path")` +* Delete content: `delete!(tree, "path")` + +Unlike Dict, iteration of BlobTree iterates values (not key value pairs). This +has some benefits - for example, broadcasting processing across files in a +directory. + +* Property access + - `isdir()`, `isfile()` - determine whether a child of tree is a directory or file. + # Example -Normally you'd construct these via the [`dataset`](@ref) function which takes -care of constructing the correct `root` object. However, here's a direct -demonstration: +You can create a new temporary BlobTree via the `newdir()` function: ``` -julia> tree = BlobTree(DataSets.FileSystemRoot(dirname(pathof(DataSets))), path"../test/data") -📂 Tree ../test/data @ /home/chris/.julia/dev/DataSets/src - 📁 csvset - 📄 file.txt - 📄 foo.txt - 📄 people.csv.gz - -julia> tree["csvset"] -📂 Tree ../test/data/csvset @ /home/chris/.julia/dev/DataSets/src - 📄 1.csv - 📄 2.csv - -julia> tree[path"csvset"] -📂 Tree ../test/data/csvset @ /home/chris/.julia/dev/DataSets/src - 📄 1.csv - 📄 2.csv +julia> dir = newdir() + for i = 1:3 + newfile(dir, "\$i/a.txt") do io + println(io, "Content of a") + end + newfile(dir, "b-\$i.txt") do io + println(io, "Content of b") + end + end + dir +📂 Tree @ /tmp/jl_Sp6wMF + 📁 1 + 📁 2 + 📁 3 + 📄 b-1.txt + 📄 b-2.txt + 📄 b-3.txt +``` + +You can also get access to a `BlobTree` by using `DataSets.from_path()` with a +local directory name. For example: + +``` +julia> using Pkg + open(DataSets.from_path(joinpath(Pkg.dir("DataSets"), "src"))) +📂 Tree @ ~/.julia/dev/DataSets/src + 📄 DataSet.jl + 📄 DataSets.jl + 📄 DataTomlStorage.jl + ... 
``` """ mutable struct BlobTree{Root} <: AbstractBlobTree @@ -279,32 +292,31 @@ end BlobTree(root) = BlobTree(root, RelPath()) -function AbstractTrees.printnode(io::IO, tree::BlobTree) - print(io, "📂 ", basename(tree)) -end - -function Base.show(io::IO, ::MIME"text/plain", tree::AbstractBlobTree) +function Base.show(io::IO, ::MIME"text/plain", tree::BlobTree) # TODO: Ideally we'd use # AbstractTrees.print_tree(io, tree, 1) # However, this is hard to use efficiently; we'd need to implement a lazy # `children()` for all our trees. It'd be much easier if # `AbstractTrees.has_children()` was used consistently upstream. - cs = children(tree) println(io, "📂 Tree ", tree.path, " @ ", summary(tree.root)) - for (i, c) in enumerate(cs) - print(io, " ", isdir(c) ? '📁' : '📄', " ", basename(c)) - if i != length(cs) + first = true + for (name,x) in pairs(tree) + if first + first = false + else print(io, '\n') end + print(io, " ", isdir(x) ? '📁' : '📄', " ", name) end end -Base.basename(tree::BlobTree) = basename(tree.path) -Base.abspath(tree::BlobTree) = AbsPath(tree.root, tree.path) +function AbstractTrees.printnode(io::IO, tree::BlobTree) + print(io, "📂 ", basename(tree)) +end # getindex vs joinpath: -# - getindex about indexing the datastrcutre; therefore it looks in the -# filesystem to only return things which exist. +# - getindex is about indexing the datastructure; therefore it looks in the +# storage system to only return things which exist. # - joinpath just makes paths, not knowing whether they exist. function Base.getindex(tree::BlobTree, path::RelPath) relpath = joinpath(tree.path, path) @@ -323,40 +335,129 @@ function Base.getindex(tree::BlobTree, path::RelPath) end function Base.getindex(tree::BlobTree, name::AbstractString) - getindex(tree, joinpath(RelPath(), name)) + getindex(tree, RelPath(name)) end -# We've got a weird mishmash of path vs tree handling here. 
-# TODO: Can we refactor this to cleanly separate the filesystem-like commands -# (which take abstract paths?) from BlobTree and Blob which act as an -# abstraction over the filesystem or other storage mechanisms? -function Base.joinpath(tree::BlobTree, r::RelPath) - AbsPath(tree.root, joinpath(tree.path, r)) -end -function Base.joinpath(tree::BlobTree, s::AbstractString) - AbsPath(tree.root, joinpath(tree.path, s)) +# Keys, values and iteration + +""" + children(tree::BlobTree) + +Return an array of the children of `tree`. A child `x` may abstractly either be +another tree (`children(x)` returns a collection) or a file, where `children(x)` +returns `()`. +""" +function children(tree::BlobTree) + [tree[RelPath([n])] for n in keys(tree)] end -function Base.haskey(tree::BlobTree, name::AbstractString) - ispath(tree.root, joinpath(tree.path, name)) +function Base.haskey(tree::BlobTree, path::AbstractString) + haskey(tree, RelPath(path)) end -function Base.readdir(tree::BlobTree) - readdir(tree.root, tree.path) +function Base.haskey(tree::BlobTree, path::RelPath) + ispath(tree.root, joinpath(tree.path, path)) end function Base.keys(tree::BlobTree) readdir(tree.root, tree.path) end -function Base.rm(tree::BlobTree; kws...) - rm(tree.root, tree.path; kws...) +function Base.pairs(tree::BlobTree) + zip(keys(tree), children(tree)) end -function children(tree::BlobTree) - child_names = readdir(tree) - [tree[c] for c in child_names] +function Base.values(tree::BlobTree) + children(tree) +end + + +# Mutation + +newdir(tree::BlobTree, path::AbstractString; kws...) = + newdir(tree, RelPath(path); kws...) +newfile(tree::BlobTree, path::AbstractString; kws...) = + newfile(tree, RelPath(path); kws...) +newfile(func::Function, tree::BlobTree, path::AbstractString; kws...) = + newfile(func, tree, RelPath(path); kws...) 
+Base.delete!(tree::BlobTree, path::AbstractString) = + delete!(tree, RelPath(path)) + +function _check_writeable(tree) + if !iswriteable(tree.root) + error("Attempt to write into a read-only tree with root $(tree.root)") + end +end + +function _check_new_item(tree, path, overwrite) + _check_writeable(tree) + if haskey(tree, path) && !overwrite + error("Overwriting a path $path which already exists requires the keyword `overwrite=true`") + end +end + +""" + newdir(tree, path; overwrite=false) + +Create a new directory at tree[path] and return it. If `overwrite=true`, remove +any existing directory before creating the new one. + + newdir() + +Create a new temporary `BlobTree` which can have files assigned into it and may +be assigned to a permanent location in a persistent `BlobTree`. If not assigned +to a permanent location, the temporary tree is cleaned up during garbage +collection. +""" +function newdir(tree::BlobTree, path::RelPath; overwrite=false) + _check_new_item(tree, path, overwrite) + p = joinpath(tree.path, RelPath(path)) + newdir(tree.root, p; overwrite=overwrite) + return BlobTree(tree.root, p) +end + +""" + newfile(tree, path; overwrite=false) + newfile(tree, path; overwrite=false) do io ... + +Create a new file object in the `tree` at the given `path`. In the second form, +the open file `io` will be passed to the do block. + + newfile() + +Create a new file which may be later assigned to a permanent location in a +tree. If not assigned to a permanent location, the temporary file is cleaned up +during garbage collection. 
+ +# Example + +``` +newfile(tree, "some/demo/path.txt") do io + println(io, "Hi there!") +end +``` +""" +function newfile(tree::BlobTree, path::RelPath; overwrite=false) + _check_new_item(tree, path, overwrite) + p = joinpath(tree.path, path) + newfile(tree.root, p; overwrite=overwrite) + return Blob(tree.root, p) +end + +function newfile(func::Function, tree::BlobTree, path::RelPath; overwrite=false) + _check_new_item(tree, path, overwrite) + p = joinpath(tree.path, path) + newfile(func, tree.root, p; overwrite=overwrite) + return Blob(tree.root, p) +end + + +function Base.delete!(tree::BlobTree, path::RelPath) + _check_writeable(tree) + relpath = joinpath(tree.path, path) + root = tree.root + delete!(root, relpath) end function Base.open(f::Function, ::Type{BlobTree}, tree::BlobTree) @@ -368,3 +469,51 @@ end end # Base.open(::Type{T}, file::Blob; kws...) where {T} = open(identity, T, file.root, file.path; kws...) + + +#------------------------------------------------------------------------------- +# Path manipulation + +# TODO: Maybe deprecate these? Under the "datastructure-like" model, it seems wrong +# for a blob to know its name in the parent data structure. +Base.basename(tree::BlobTree) = basename(tree.path) +Base.abspath(tree::BlobTree) = AbsPath(tree.root, tree.path) + +function Base.joinpath(tree::BlobTree, r::RelPath) + AbsPath(tree.root, joinpath(tree.path, r)) +end + +function Base.joinpath(tree::BlobTree, s::AbstractString) + AbsPath(tree.root, joinpath(tree.path, s)) +end + + +#------------------------------------------------------------------------------- +# Deprecated +function Base.rm(tree::BlobTree; kws...) + _check_writeable(tree) + rm(tree.root, tree.path; kws...) +end + +function Base.readdir(tree::BlobTree) + readdir(tree.root, tree.path) +end + +# Create files within a temporary directory. +function newdir(tree::BlobTree) + Base.depwarn(""" + `newdir(::BlobTree)` for temporary trees is deprecated. 
+ Use the in-place version `newdir(::BlobTree, dirname)` instead. + """, + :newdir) + newdir(tree.root) +end +function newfile(tree::BlobTree) + Base.depwarn(""" + `newfile(::BlobTree)` for temporary trees is deprecated. + Use the in-place version `newfile(::BlobTree, dirname)` instead. + """, + :newfile) + newfile(tree.root) +end + diff --git a/src/DataSets.jl b/src/DataSets.jl index c6737c3..1c5ba31 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -172,6 +172,9 @@ The `DataSet` is *metadata*, but to use the actual *data* in your program you need to use the `open` function to access the `DataSet`'s content as a given Julia type. +`name` is the name of the dataset, or more generally a "data specification": a +URI-like object of the form `namespace/name?params#fragment`. + # Example To open a dataset named `"a_text_file"` and read the whole content as a String, diff --git a/src/data_project.jl b/src/data_project.jl index de97786..7b545a6 100644 --- a/src/data_project.jl +++ b/src/data_project.jl @@ -25,7 +25,6 @@ function Base.getindex(proj::AbstractDataProject, name::AbstractString) data end - function dataset(proj::AbstractDataProject, spec::AbstractString) namestr, query, fragmentstr = _split_dataspec(spec) diff --git a/src/filesystem.jl b/src/filesystem.jl index 3a46e63..1219632 100644 --- a/src/filesystem.jl +++ b/src/filesystem.jl @@ -41,7 +41,7 @@ Base.readdir(root::AbstractFileSystemRoot, path::RelPath) = readdir(sys_abspath( function Base.mkdir(root::AbstractFileSystemRoot, path::RelPath; kws...) if !iswriteable(root) - error("Cannot make directory in read-only tree root at $(sys_abspath(p.root))") + error("Cannot make directory in read-only tree") end mkdir(sys_abspath(root, path), args...) return BlobTree(root, path) @@ -51,6 +51,13 @@ function Base.rm(root::AbstractFileSystemRoot, path::RelPath; kws...) rm(sys_abspath(root,path); kws...) 
end +function Base.delete!(root::AbstractFileSystemRoot, path::RelPath) + if !iswriteable(root) + error("Cannot delete from read-only tree $root") + end + rm(sys_abspath(root,path); recursive=true) +end + #-------------------------------------------------- # Storage data interface for Blob @@ -96,15 +103,15 @@ For BlobTree: """ struct FileSystemRoot <: AbstractFileSystemRoot path::String - read::Bool write::Bool end -function FileSystemRoot(path::AbstractString; write=false, read=true) +function FileSystemRoot(path::AbstractString; write=false) path = abspath(path) - FileSystemRoot(path, read, write) + FileSystemRoot(path, write) end +iswriteable(root) = false iswriteable(root::FileSystemRoot) = root.write sys_abspath(root::FileSystemRoot) = root.path @@ -114,7 +121,7 @@ function Base.abspath(relpath::RelPath) `abspath(::RelPath)` defaults to using `pwd()` as the root of the path but this leads to fragile code so will be removed in the future""", :abspath) - AbsPath(FileSystemRoot(pwd(); write=true, read=true), relpath) + AbsPath(FileSystemRoot(pwd(); write=true), relpath) end #------------------------------------------------------------------------------- @@ -141,46 +148,73 @@ end iswriteable(root::TempFilesystemRoot) = true sys_abspath(root::TempFilesystemRoot) = root.path -""" - newdir() - -Create a new temporary `BlobTree` which can have files assigned into it and may -be assigned to a permanent location in a persistent `BlobTree`. If not assigned -to a permanent location, the temporary tree is cleaned up during garbage -collection. 
-""" -function newdir(ctx::AbstractFileSystemRoot=FileSystemRoot(tempdir(), write=true)) +function newdir() # cleanup=false: we manage our own cleanup via the finalizer - path = mktempdir(sys_abspath(ctx), cleanup=false) + path = mktempdir(cleanup=false) return BlobTree(TempFilesystemRoot(path)) end -newdir(ctx::BlobTree) = newdir(ctx.root) -function newfile(ctx::AbstractFileSystemRoot=FileSystemRoot(tempdir(), write=true)) - path, io = mktemp(sys_abspath(ctx), cleanup=false) - close(io) - return Blob(TempFilesystemRoot(path)) +function newdir(root::AbstractFileSystemRoot, path::RelPath; overwrite=false) + p = sys_abspath(root, path) + if overwrite + rm(p, force=true, recursive=true) + end + mkpath(p) end -newfile(ctx::BlobTree) = newfile(ctx.root) -""" - newfile(func) - newfile(func, ctx) -Create a new temporary `Blob` object which may be later assigned to a permanent -location in a `BlobTree`. If not assigned to a permanent location, the -temporary file is cleaned up during garbage collection. +function newfile(func=nothing) + path, io = mktemp(cleanup=false) + if func !== nothing + try + func(io) + catch + rm(path) + rethrow() + finally + close(io) + end + end + return Blob(TempFilesystemRoot(path)) +end + +function newfile(f::Function, root::AbstractFileSystemRoot, path::RelPath; kws...) + p = sys_abspath(root, path) + mkpath(dirname(p)) + open(f, p, write=true) +end -# Example +function newfile(root::AbstractFileSystemRoot, path::RelPath; kws...) + newfile(io->nothing, root, path; kws...) +end -``` -tree[path"some/demo/path.txt"] = newfile() do io - println(io, "Hi there!") +#------------------------------------------------------------------------------- +# Deprecated newdir() and newfile() variants +function newdir(ctx::AbstractFileSystemRoot) + Base.depwarn(""" + `newdir(ctx::AbstractFileSystemRoot)` is deprecated. Use the in-place + version `newdir(::BlobTree, path)` instead. 
+ """, :newdir) + path = mktempdir(sys_abspath(ctx), cleanup=false) + return BlobTree(TempFilesystemRoot(path)) end -``` -""" -function newfile(f::Function, ctx=FileSystemRoot(tempdir(), write=true)) + +function newfile(ctx::AbstractFileSystemRoot) + Base.depwarn(""" + `newfile(ctx::AbstractFileSystemRoot)` is deprecated. Use the in-place + version `newfile(::BlobTree, path)` instead. + """, :newfile) path, io = mktemp(sys_abspath(ctx), cleanup=false) + close(io) + return Blob(TempFilesystemRoot(path)) +end + +function newfile(f::Function, root::FileSystemRoot) + Base.depwarn(""" + `newfile(f::Function, ctx::AbstractFileSystemRoot)` is deprecated. + Use newfile() or the in-place version `newfile(::BlobTree, path)` instead. + """, :newfile) + path, io = mktemp(sys_abspath(root), cleanup=false) try f(io) catch diff --git a/src/paths.jl b/src/paths.jl index 0d57aaa..043f541 100644 --- a/src/paths.jl +++ b/src/paths.jl @@ -18,7 +18,7 @@ struct RelPath <: AbstractPath components::Vector{String} end -RelPath(::AbstractString) = error("RelPath(::String) is not defined to avoid ambiguities between operating systems. Use the `path\"...\"` string macro for path literals.") +RelPath(str::AbstractString) = RelPath(split(str, '/')) RelPath(components::AbstractVector=String[]) = RelPath(convert(Vector{String}, components)) # Path manipulation. @@ -73,7 +73,7 @@ end """ An AbsPath is the *key* into a hierarchical tree index, relative to some root. -As a *key*, the resource pointed to by this key may or may not exist. +The path is only a key; the resource pointed to by this key may or may not exist. 
""" struct AbsPath{Root} <: AbstractPath root::Root From 75010a37799062b251752a1fc7d99b141598a1bb Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Mon, 16 May 2022 13:58:59 +1000 Subject: [PATCH 07/33] Rename Blob->File, BlobTree->FileTree * The big rename * Add deprecations for Blob, BlobTree --- docs/src/Data.toml | 6 +- docs/src/reference.md | 10 +- docs/src/tutorial.md | 20 +- src/BlobTree.jl | 190 +++++++++--------- src/DataSets.jl | 7 +- src/GitTree.jl | 10 +- src/TomlDataStorage.jl | 18 +- src/ZipTree.jl | 28 +-- src/data_project.jl | 4 +- src/entrypoint.jl | 4 +- src/file_data_projects.jl | 4 +- src/filesystem.jl | 44 ++-- src/paths.jl | 2 +- src/repl.jl | 4 +- test/Data.toml | 39 +--- test/DataCompat.toml | 93 +++++++++ test/DriverAutoloadData.toml | 2 +- test/TomlDataStorage.jl | 8 +- test/active_project/Data.toml | 2 +- test/backend_compat.jl | 18 +- .../src/DummyStorageBackends.jl | 2 +- test/entrypoint.jl | 4 +- test/projects.jl | 6 +- test/runtests.jl | 26 +-- 24 files changed, 316 insertions(+), 235 deletions(-) create mode 100644 test/DataCompat.toml diff --git a/docs/src/Data.toml b/docs/src/Data.toml index 2602ba7..a9d6176 100644 --- a/docs/src/Data.toml +++ b/docs/src/Data.toml @@ -13,8 +13,8 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" [datasets.storage] # The name of the storage driver. driver="FileSystem" - # Data stored in FileSystem is either Blob (a file) or BlobTree (a directory/folder) - type="Blob" + # Data stored in FileSystem is either File (a file) or FileTree (a directory/folder) + type="File" # Path with posix `/` separators. 
# Use @__DIR__ for paths relative to Data.toml path="@__DIR__/data/file.txt" @@ -27,7 +27,7 @@ uuid="e7fd7080-e346-4a68-9ca9-98593a99266a" [datasets.storage] driver="FileSystem" - type="BlobTree" + type="FileTree" path="@__DIR__/data/csvset" # Further datasets can be added as desired diff --git a/docs/src/reference.md b/docs/src/reference.md index 12a9bd9..7850d06 100644 --- a/docs/src/reference.md +++ b/docs/src/reference.md @@ -48,14 +48,14 @@ DataSets.TomlFileDataProject ## Data Models for files and directories -DataSets provides some builtin data models [`Blob`](@ref) and -[`BlobTree`](@ref) for accessin file- and directory-like data respectively. For +DataSets provides some builtin data models [`File`](@ref) and +[`FileTree`](@ref) for accessin file- and directory-like data respectively. For modifying these, the functions [`newfile`](@ref) and [`newdir`](@ref) can be -used, together with `setindex!` for `BlobTree`. +used, together with `setindex!` for `FileTree`. ```@docs -Blob -BlobTree +File +FileTree newfile newdir ``` diff --git a/docs/src/tutorial.md b/docs/src/tutorial.md index b07ea4d..668ddfe 100644 --- a/docs/src/tutorial.md +++ b/docs/src/tutorial.md @@ -60,7 +60,7 @@ description = "A text file containing the standard greeting" [storage] driver = "FileSystem" -type = "Blob" +type = "File" path = ".../DataSets/docs/src/data/file.txt" ``` @@ -80,14 +80,14 @@ description = "A text file containing the standard greeting" [storage] driver = "FileSystem" -type = "Blob" +type = "File" path = ".../DataSets/docs/src/data/file.txt" ``` ## Loading Data You can call `open()` on a DataSet to inspect the data inside. `open()` will -return the [`Blob`](@ref) and [`BlobTree`](@ref) types for local files and +return the [`File`](@ref) and [`FileTree`](@ref) types for local files and directories on disk. For example, ```jldoctest @@ -100,7 +100,7 @@ julia> open(dataset("a_tree_example")) 📄 2.csv ``` -Use the form `open(T, dataset)` to read the data as a specific type. 
`Blob` +Use the form `open(T, dataset)` to read the data as a specific type. `File` data can be opened as `String`, `IO`, or `Vector{UInt8}`, depending on your needs: @@ -133,13 +133,13 @@ content = "Hello world!\n" ``` Let's look at some tree-like data which is represented on local disk as a -folder or directory. Tree data is opened in Julia as the [`BlobTree`](@ref) -type and can be indexed with path components to get at the file [`Blob`](@ref)s +folder or directory. Tree data is opened in Julia as the [`FileTree`](@ref) +type and can be indexed with path components to get at the file [`File`](@ref)s inside. In turn, we can `open()` one of the file blobs and look at the data contained within. ```jldoctest -julia> tree = open(BlobTree, dataset("a_tree_example")) +julia> tree = open(FileTree, dataset("a_tree_example")) 📂 Tree @ .../DataSets/docs/src/data/csvset 📄 1.csv 📄 2.csv @@ -160,14 +160,14 @@ Rather than manually using the `open()` functions as shown above, the into your program. For example, here we define an entry point called `main` which takes -* DataSet type `Blob`, presenting it as a `String` within the program -* DataSet type `BlobTree`, presenting it as a `BlobTree` within the program +* DataSet type `File`, presenting it as a `String` within the program +* DataSet type `FileTree`, presenting it as a `FileTree` within the program The `@datarun` macro allows you to call such program entry points, extracting named data sets from a given project. 
```jldoctest -julia> @datafunc function main(x::Blob=>String, t::BlobTree=>BlobTree) +julia> @datafunc function main(x::File=>String, t::FileTree=>FileTree) @show x open(String, t["1.csv"]) do csv_data @show csv_data diff --git a/src/BlobTree.jl b/src/BlobTree.jl index 24090e8..563fa98 100644 --- a/src/BlobTree.jl +++ b/src/BlobTree.jl @@ -12,20 +12,20 @@ import AbstractTrees: AbstractTrees, children #------------------------------------------------------------------------------- -abstract type AbstractBlobTree; end +abstract type AbstractFileTree; end # The tree API # TODO: Should we have `istree` separate from `isdir`? -Base.isdir(x::AbstractBlobTree) = true -Base.isfile(tree::AbstractBlobTree) = false -Base.ispath(x::AbstractBlobTree) = true +Base.isdir(x::AbstractFileTree) = true +Base.isfile(tree::AbstractFileTree) = false +Base.ispath(x::AbstractFileTree) = true # Number of children is not known without a (potentially high-latency) call to # an external resource -Base.IteratorSize(tree::AbstractBlobTree) = Base.SizeUnknown() +Base.IteratorSize(tree::AbstractFileTree) = Base.SizeUnknown() -function Base.iterate(tree::AbstractBlobTree, state=nothing) +function Base.iterate(tree::AbstractFileTree, state=nothing) if state == nothing # By default, call `children(tree)` to eagerly get a list of children # for iteration. @@ -48,7 +48,7 @@ end Pretty printing of file trees, in the spirit of the unix `tree` utility. 
""" -function showtree(io::IO, tree::AbstractBlobTree; maxdepth=5) +function showtree(io::IO, tree::AbstractFileTree; maxdepth=5) println(io, "📂 ", tree) _showtree(io, tree, "", maxdepth) end @@ -58,11 +58,11 @@ struct ShownTree end # Use a wrapper rather than defaulting to stdout so that this works in more # functional environments such as Pluto.jl -showtree(tree::AbstractBlobTree) = ShownTree(tree) +showtree(tree::AbstractFileTree) = ShownTree(tree) Base.show(io::IO, s::ShownTree) = showtree(io, s.tree) -function _showtree(io::IO, tree::AbstractBlobTree, prefix, depth) +function _showtree(io::IO, tree::AbstractFileTree, prefix, depth) cs = children(tree) for (i,x) in enumerate(cs) islast = i == lastindex(cs) # TODO: won't work if children() is lazy @@ -82,7 +82,7 @@ function _showtree(io::IO, tree::AbstractBlobTree, prefix, depth) end end -function Base.copy!(dst::AbstractBlobTree, src::AbstractBlobTree) +function Base.copy!(dst::AbstractFileTree, src::AbstractFileTree) for x in src xname = basename(x) if isdir(x) @@ -99,91 +99,91 @@ end #------------------------------------------------------------------------------- """ - Blob(root) - Blob(root, relpath) + File(root) + File(root, relpath) -`Blob` represents the location of a collection of unstructured binary data. The +`File` represents the location of a collection of unstructured binary data. The location is a path `relpath` relative to some `root` data resource. -A `Blob` can naturally be `open()`ed as a `Vector{UInt8}`, but can also be +A `File` can naturally be `open()`ed as a `Vector{UInt8}`, but can also be mapped into the program as an `IO` byte stream, or interpreted as a `String`. -Blobs can be arranged into hierarchies "directories" via the `BlobTree` type. +Files can be arranged into hierarchies "directories" via the `FileTree` type. 
""" -mutable struct Blob{Root} +mutable struct File{Root} root::Root path::RelPath end -Blob(root) = Blob(root, RelPath()) +File(root) = File(root, RelPath()) -Base.basename(file::Blob) = basename(file.path) -Base.abspath(file::Blob) = AbsPath(file.root, file.path) -Base.isdir(file::Blob) = false -Base.isfile(file::Blob) = true -Base.ispath(file::Blob) = true +Base.basename(file::File) = basename(file.path) +Base.abspath(file::File) = AbsPath(file.root, file.path) +Base.isdir(file::File) = false +Base.isfile(file::File) = true +Base.ispath(file::File) = true -function Base.show(io::IO, ::MIME"text/plain", file::Blob) +function Base.show(io::IO, ::MIME"text/plain", file::File) print(io, "📄 ", file.path, " @ ", summary(file.root)) end -function AbstractTrees.printnode(io::IO, file::Blob) +function AbstractTrees.printnode(io::IO, file::File) print(io, "📄 ", basename(file)) end # Opening as Vector{UInt8} or as String defers to IO interface -function Base.open(f::Function, ::Type{Vector{UInt8}}, file::Blob) +function Base.open(f::Function, ::Type{Vector{UInt8}}, file::File) open(IO, file.root, file.path) do io f(read(io)) # TODO: use Mmap? end end -function Base.open(f::Function, ::Type{String}, file::Blob) +function Base.open(f::Function, ::Type{String}, file::File) open(IO, file.root, file.path) do io f(read(io, String)) end end -# Default open-type for Blob is IO -Base.open(f::Function, file::Blob; kws...) = open(f, IO, file.root, file.path; kws...) +# Default open-type for File is IO +Base.open(f::Function, file::File; kws...) = open(f, IO, file.root, file.path; kws...) -# Opening Blob as itself is trivial -function Base.open(f::Function, ::Type{Blob}, file::Blob) +# Opening File as itself is trivial +function Base.open(f::Function, ::Type{File}, file::File) f(file) end # open with other types T defers to the underlying storage system -function Base.open(f::Function, ::Type{T}, file::Blob; kws...) 
where {T} +function Base.open(f::Function, ::Type{T}, file::File; kws...) where {T} open(f, T, file.root, file.path; kws...) end # ResourceContexts.jl - based versions of the above. -@! function Base.open(::Type{Vector{UInt8}}, file::Blob) +@! function Base.open(::Type{Vector{UInt8}}, file::File) @context begin # TODO: use Mmap? read(@! open(IO, file.root, file.path)) end end -@! function Base.open(::Type{String}, file::Blob) +@! function Base.open(::Type{String}, file::File) @context begin read(@!(open(IO, file.root, file.path)), String) end end -# Default open-type for Blob is IO -@! function Base.open(file::Blob; kws...) +# Default open-type for File is IO +@! function Base.open(file::File; kws...) @! open(IO, file.root, file.path; kws...) end -# Opening Blob as itself is trivial -@! function Base.open(::Type{Blob}, file::Blob) +# Opening File as itself is trivial +@! function Base.open(::Type{File}, file::File) file end # open with other types T defers to the underlying storage system -@! function Base.open(::Type{T}, file::Blob; kws...) where {T} +@! function Base.open(::Type{T}, file::File; kws...) where {T} @! open(T, file.root, file.path; kws...) end @@ -196,17 +196,17 @@ end res end -# Unscoped form of open for Blob -function Base.open(::Type{T}, blob::Blob; kws...) where {T} +# Unscoped form of open for File +function Base.open(::Type{T}, blob::File; kws...) where {T} @context begin result = @! open(T, blob; kws...) @! ResourceContexts.detach_context_cleanup(result) end end -# read() is also supported for `Blob`s -Base.read(file::Blob) = read(file.root, file.path) -Base.read(file::Blob, ::Type{T}) where {T} = read(file.root, file.path, T) +# read() is also supported for `File`s +Base.read(file::File) = read(file.root, file.path) +Base.read(file::File, ::Type{T}) where {T} = read(file.root, file.path, T) # Support for opening AbsPath @@ -221,18 +221,18 @@ Base.open(f::Function, path::AbsPath; kws...) 
= open(f, IO, path.root, path.path #------------------------------------------------------------------------------- """ - BlobTree(root) + FileTree(root) -`BlobTree` is a "directory tree" like hierarchy which may have `Blob`s and -`BlobTree`s as children. +`FileTree` is a "directory tree" like hierarchy which may have `File`s and +`FileTree`s as children. The tree implements the `AbstracTrees.children()` interface and may be indexed with paths to traverse the hierarchy down to the leaves ("files") which are of -type `Blob`. Individual leaves may be `open()`ed as various Julia types. +type `File`. Individual leaves may be `open()`ed as various Julia types. -# Operations on BlobTree +# Operations on FileTree -BlobTree has a largely dictionary-like interface: +FileTree has a largely dictionary-like interface: * List keys (ie, file and directory names): `keys(tree)` * List keys and values: `pairs(tree)` @@ -241,7 +241,7 @@ BlobTree has a largely dictionary-like interface: * Add new content: `newdir(tree, "path")`, `newfile(tree, "path")` * Delete content: `delete!(tree, "path")` -Unlike Dict, iteration of BlobTree iterates values (not key value pairs). This +Unlike Dict, iteration of FileTree iterates values (not key value pairs). This has some benefits - for example, broadcasting processing across files in a directory. @@ -250,7 +250,7 @@ directory. # Example -You can create a new temporary BlobTree via the `newdir()` function: +You can create a new temporary FileTree via the `newdir()` function: ``` julia> dir = newdir() @@ -272,7 +272,7 @@ julia> dir = newdir() 📄 b-3.txt ``` -You can also get access to a `BlobTree` by using `DataSets.from_path()` with a +You can also get access to a `FileTree` by using `DataSets.from_path()` with a local directory name. For example: ``` @@ -285,14 +285,14 @@ julia> using Pkg ... 
``` """ -mutable struct BlobTree{Root} <: AbstractBlobTree +mutable struct FileTree{Root} <: AbstractFileTree root::Root path::RelPath end -BlobTree(root) = BlobTree(root, RelPath()) +FileTree(root) = FileTree(root, RelPath()) -function Base.show(io::IO, ::MIME"text/plain", tree::BlobTree) +function Base.show(io::IO, ::MIME"text/plain", tree::FileTree) # TODO: Ideally we'd use # AbstractTrees.print_tree(io, tree, 1) # However, this is hard to use efficiently; we'd need to implement a lazy @@ -310,7 +310,7 @@ function Base.show(io::IO, ::MIME"text/plain", tree::BlobTree) end end -function AbstractTrees.printnode(io::IO, tree::BlobTree) +function AbstractTrees.printnode(io::IO, tree::FileTree) print(io, "📂 ", basename(tree)) end @@ -318,15 +318,15 @@ end # - getindex is about indexing the datastructure; therefore it looks in the # storage system to only return things which exist. # - joinpath just makes paths, not knowing whether they exist. -function Base.getindex(tree::BlobTree, path::RelPath) +function Base.getindex(tree::FileTree, path::RelPath) relpath = joinpath(tree.path, path) root = tree.root # TODO: Make this more efficient by moving this work to the storage backend? # Sort of like an equivalent of `stat`? if isdir(root, relpath) - BlobTree(root, relpath) + FileTree(root, relpath) elseif isfile(root, relpath) - Blob(root, relpath) + File(root, relpath) elseif ispath(root, relpath) AbsPath(root, relpath) # Not great? else @@ -334,7 +334,7 @@ function Base.getindex(tree::BlobTree, path::RelPath) end end -function Base.getindex(tree::BlobTree, name::AbstractString) +function Base.getindex(tree::FileTree, name::AbstractString) getindex(tree, RelPath(name)) end @@ -342,46 +342,46 @@ end # Keys, values and iteration """ - children(tree::BlobTree) + children(tree::FileTree) Return an array of the children of `tree`. A child `x` may abstractly either be another tree (`children(x)` returns a collection) or a file, where `children(x)` returns `()`. 
""" -function children(tree::BlobTree) +function children(tree::FileTree) [tree[RelPath([n])] for n in keys(tree)] end -function Base.haskey(tree::BlobTree, path::AbstractString) +function Base.haskey(tree::FileTree, path::AbstractString) haskey(tree, RelPath(path)) end -function Base.haskey(tree::BlobTree, path::RelPath) +function Base.haskey(tree::FileTree, path::RelPath) ispath(tree.root, joinpath(tree.path, path)) end -function Base.keys(tree::BlobTree) +function Base.keys(tree::FileTree) readdir(tree.root, tree.path) end -function Base.pairs(tree::BlobTree) +function Base.pairs(tree::FileTree) zip(keys(tree), children(tree)) end -function Base.values(tree::BlobTree) +function Base.values(tree::FileTree) children(tree) end # Mutation -newdir(tree::BlobTree, path::AbstractString; kws...) = +newdir(tree::FileTree, path::AbstractString; kws...) = newdir(tree, RelPath(path); kws...) -newfile(tree::BlobTree, path::AbstractString; kws...) = +newfile(tree::FileTree, path::AbstractString; kws...) = newfile(tree, RelPath(path); kws...) -newfile(func::Function, tree::BlobTree, path::AbstractString; kws...) = +newfile(func::Function, tree::FileTree, path::AbstractString; kws...) = newfile(func, tree, RelPath(path); kws...) -Base.delete!(tree::BlobTree, path::AbstractString) = +Base.delete!(tree::FileTree, path::AbstractString) = delete!(tree, RelPath(path)) function _check_writeable(tree) @@ -405,16 +405,16 @@ any existing directory before creating the new one. newdir() -Create a new temporary `BlobTree` which can have files assigned into it and may -be assigned to a permanent location in a persistent `BlobTree`. If not assigned +Create a new temporary `FileTree` which can have files assigned into it and may +be assigned to a permanent location in a persistent `FileTree`. If not assigned to a permanent location, the temporary tree is cleaned up during garbage collection. 
""" -function newdir(tree::BlobTree, path::RelPath; overwrite=false) +function newdir(tree::FileTree, path::RelPath; overwrite=false) _check_new_item(tree, path, overwrite) p = joinpath(tree.path, RelPath(path)) newdir(tree.root, p; overwrite=overwrite) - return BlobTree(tree.root, p) + return FileTree(tree.root, p) end """ @@ -438,37 +438,37 @@ newfile(tree, "some/demo/path.txt") do io end ``` """ -function newfile(tree::BlobTree, path::RelPath; overwrite=false) +function newfile(tree::FileTree, path::RelPath; overwrite=false) _check_new_item(tree, path, overwrite) p = joinpath(tree.path, path) newfile(tree.root, p; overwrite=overwrite) - return Blob(tree.root, p) + return File(tree.root, p) end -function newfile(func::Function, tree::BlobTree, path::RelPath; overwrite=false) +function newfile(func::Function, tree::FileTree, path::RelPath; overwrite=false) _check_new_item(tree, path, overwrite) p = joinpath(tree.path, path) newfile(func, tree.root, p; overwrite=overwrite) - return Blob(tree.root, p) + return File(tree.root, p) end -function Base.delete!(tree::BlobTree, path::RelPath) +function Base.delete!(tree::FileTree, path::RelPath) _check_writeable(tree) relpath = joinpath(tree.path, path) root = tree.root delete!(root, relpath) end -function Base.open(f::Function, ::Type{BlobTree}, tree::BlobTree) +function Base.open(f::Function, ::Type{FileTree}, tree::FileTree) f(tree) end -@! function Base.open(::Type{BlobTree}, tree::BlobTree) +@! function Base.open(::Type{FileTree}, tree::FileTree) tree end -# Base.open(::Type{T}, file::Blob; kws...) where {T} = open(identity, T, file.root, file.path; kws...) +# Base.open(::Type{T}, file::File; kws...) where {T} = open(identity, T, file.root, file.path; kws...) #------------------------------------------------------------------------------- @@ -476,42 +476,42 @@ end # TODO: Maybe deprecate these? Under the "datastructure-like" model, it seems wrong # for a blob to know its name in the parent data structure. 
-Base.basename(tree::BlobTree) = basename(tree.path) -Base.abspath(tree::BlobTree) = AbsPath(tree.root, tree.path) +Base.basename(tree::FileTree) = basename(tree.path) +Base.abspath(tree::FileTree) = AbsPath(tree.root, tree.path) -function Base.joinpath(tree::BlobTree, r::RelPath) +function Base.joinpath(tree::FileTree, r::RelPath) AbsPath(tree.root, joinpath(tree.path, r)) end -function Base.joinpath(tree::BlobTree, s::AbstractString) +function Base.joinpath(tree::FileTree, s::AbstractString) AbsPath(tree.root, joinpath(tree.path, s)) end #------------------------------------------------------------------------------- # Deprecated -function Base.rm(tree::BlobTree; kws...) +function Base.rm(tree::FileTree; kws...) _check_writeable(tree) rm(tree.root, tree.path; kws...) end -function Base.readdir(tree::BlobTree) +function Base.readdir(tree::FileTree) readdir(tree.root, tree.path) end # Create files within a temporary directory. -function newdir(tree::BlobTree) +function newdir(tree::FileTree) Base.depwarn(""" - `newdir(::BlobTree)` for temporary trees is deprecated. - Use the in-place version `newdir(::BlobTree, dirname)` instead. + `newdir(::FileTree)` for temporary trees is deprecated. + Use the in-place version `newdir(::FileTree, dirname)` instead. """, :newdir) newdir(tree.root) end -function newfile(tree::BlobTree) +function newfile(tree::FileTree) Base.depwarn(""" - `newfile(::BlobTree)` for temporary trees is deprecated. - Use the in-place version `newfile(::BlobTree, dirname)` instead. + `newfile(::FileTree)` for temporary trees is deprecated. + Use the in-place version `newfile(::FileTree, dirname)` instead. 
""", :newfile) newfile(tree.root) diff --git a/src/DataSets.jl b/src/DataSets.jl index 1c5ba31..5396a2b 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -7,7 +7,7 @@ using ResourceContexts using Base: PkgId export DataSet, dataset, @datafunc, @datarun -export Blob, BlobTree, newfile, newdir +export File, FileTree, newfile, newdir """ The current DataSets version number @@ -195,7 +195,7 @@ end To open a directory as a browsable tree object, ```julia -open(BlobTree, dataset("a_tree_example")) +open(FileTree, dataset("a_tree_example")) ``` """ dataset(name) = dataset(PROJECT, name) @@ -235,4 +235,7 @@ include("TomlDataStorage.jl") # Application-level stuff include("repl.jl") +Base.@deprecate_binding Blob File +Base.@deprecate_binding BlobTree FileTree + end diff --git a/src/GitTree.jl b/src/GitTree.jl index 9dadc94..6f28616 100644 --- a/src/GitTree.jl +++ b/src/GitTree.jl @@ -19,7 +19,7 @@ function Base.open(f::Function, root::GitTreeRoot) git(subcmd) = setenv(`git $subcmd`, dir=root.path) s = read(git(`status --porcelain`), String) isempty(s) || error("Git working copy is dirty") - result = f(BlobTree(root)) + result = f(FileTree(root)) # FIXME: From the point of view of this code, it seems unnatural to attach # `write` to GitTreeRoot. 
if root.write @@ -30,13 +30,13 @@ function Base.open(f::Function, root::GitTreeRoot) end #------------------------------------------------------------------------------- -# FIXME: Factor together with BlobTreeRoot +# FIXME: Factor together with FileTreeRoot -function Base.haskey(tree::BlobTree{GitTreeRoot}, name::AbstractString) +function Base.haskey(tree::FileTree{GitTreeRoot}, name::AbstractString) ispath(sys_abspath(joinpath(tree,name))) end -function Base.open(func::Function, f::Blob{GitTreeRoot}; write=false, read=!write) +function Base.open(func::Function, f::File{GitTreeRoot}; write=false, read=!write) if !f.root.write && write error("Error writing file at read-only path $f") end @@ -55,6 +55,6 @@ function Base.mkdir(p::AbsPath{GitTreeRoot}, args...) error("Cannot make directory in read-only tree root at $(sys_abspath(p.root))") end mkdir(sys_abspath(p), args...) - return BlobTree(p.root, p.path) + return FileTree(p.root, p.path) end diff --git a/src/TomlDataStorage.jl b/src/TomlDataStorage.jl index 5520bba..5271869 100644 --- a/src/TomlDataStorage.jl +++ b/src/TomlDataStorage.jl @@ -6,19 +6,19 @@ Useful for small amounts of self-contained data. ## Metadata spec -For Blob: +For File: ``` [datasets.storage] driver="TomlDataStorage" - type="Blob" + type="File" data=\$(base64encode(data)) ``` -For BlobTree: +For FileTree: ``` [datasets.storage] driver="TomlDataStorage" - type="BlobTree" + type="FileTree" [datasets.storage.data.\$(dirname1)] "\$(filename1)" = \$(base64encode(data1)) @@ -63,7 +63,7 @@ function Base.readdir(storage::TomlDataStorage, path::RelPath) end #-------------------------------------------------- -# Storage data interface for Blob +# Storage data interface for File function Base.open(func::Function, as_type::Type{IO}, storage::TomlDataStorage, path; kws...) 
@@ -108,16 +108,16 @@ end function connect_toml_data_storage(f, config, dataset) type = config["type"] data = get(config, "data", nothing) - if type == "Blob" + if type in ("File", "Blob") if !(data isa AbstractString) error("TOML data storage requires string data in the \"storage.data\" key") end - f(Blob(TomlDataStorage(dataset, data))) - elseif type == "BlobTree" + f(File(TomlDataStorage(dataset, data))) + elseif type in ("FileTree", "BlobTree") if !(data isa AbstractDict) error("TOML data storage requires a dictionary in the \"storage.data\" key") end - f(BlobTree(TomlDataStorage(dataset, data))) + f(FileTree(TomlDataStorage(dataset, data))) else throw(ArgumentError("DataSet type $type not supported for data embedded in Data.toml")) end diff --git a/src/ZipTree.jl b/src/ZipTree.jl index cfb80c2..1ee1131 100644 --- a/src/ZipTree.jl +++ b/src/ZipTree.jl @@ -27,18 +27,18 @@ end #------------------------------------------------------------------------------- -# FIXME: Factor back together with BlobTree.jl !! +# FIXME: Factor back together with FileTree.jl !! 
-struct ZippedBlobTree <: AbstractBlobTree +struct ZippedFileTree <: AbstractFileTree root::ZipTreeRoot path::RelPath end -ZippedBlobTree(root::ZipTreeRoot) = ZippedBlobTree(root, RelPath()) +ZippedFileTree(root::ZipTreeRoot) = ZippedFileTree(root, RelPath()) -Base.basename(tree::ZippedBlobTree) = basename(tree.path) +Base.basename(tree::ZippedFileTree) = basename(tree.path) -function Base.getindex(tree::ZippedBlobTree, path::RelPath) +function Base.getindex(tree::ZippedFileTree, path::RelPath) newpath = joinpath(tree.path, path) i = findfirst(tree.root.file_info) do info info.path == newpath @@ -46,17 +46,17 @@ function Base.getindex(tree::ZippedBlobTree, path::RelPath) if i == nothing error("Path $newpath doesn't exist in $tree") elseif tree.root.file_info[i].is_dir - ZippedBlobTree(tree.root, newpath) + ZippedFileTree(tree.root, newpath) else - Blob(tree.root, newpath) + File(tree.root, newpath) end end -function Base.getindex(tree::ZippedBlobTree, name::AbstractString) +function Base.getindex(tree::ZippedFileTree, name::AbstractString) getindex(tree, joinpath(RelPath(), name)) end -function _tree_children(tree::ZippedBlobTree) +function _tree_children(tree::ZippedFileTree) children = String[] for (i,info) in enumerate(tree.root.file_info) if dirname(info.path) == tree.path @@ -66,8 +66,8 @@ function _tree_children(tree::ZippedBlobTree) children end -Base.IteratorSize(tree::ZippedBlobTree) = Base.SizeUnknown() -function Base.iterate(tree::ZippedBlobTree, state=nothing) +Base.IteratorSize(tree::ZippedFileTree) = Base.SizeUnknown() +function Base.iterate(tree::ZippedFileTree, state=nothing) if state == nothing children = _tree_children(tree) itr = iterate(children) @@ -83,16 +83,16 @@ function Base.iterate(tree::ZippedBlobTree, state=nothing) end end -function Base.joinpath(tree::ZippedBlobTree, r::RelPath) +function Base.joinpath(tree::ZippedFileTree, r::RelPath) # Should this AbsPath be rooted at `tree` rather than `tree.root`? 
AbsPath(tree.root, joinpath(tree.path, r)) end -function Base.joinpath(tree::ZippedBlobTree, s::AbstractString) +function Base.joinpath(tree::ZippedFileTree, s::AbstractString) AbsPath(tree.root, joinpath(tree.path, s)) end -function Base.open(func::Function, f::Blob{ZipTreeRoot}; write=false, read=!write) +function Base.open(func::Function, f::File{ZipTreeRoot}; write=false, read=!write) if write error("Error writing file at read-only path $f") end diff --git a/src/data_project.jl b/src/data_project.jl index 7b545a6..cf99441 100644 --- a/src/data_project.jl +++ b/src/data_project.jl @@ -168,8 +168,8 @@ function Base.show(io::IO, ::MIME"text/plain", project::AbstractDataProject) for (i, (name, data)) in enumerate(sorted) pad = maxwidth - textwidth(name) storagetype = get(data.storage, "type", nothing) - icon = storagetype == "Blob" ? '📄' : - storagetype == "BlobTree" ? '📁' : + icon = storagetype in ("File", "Blob") ? '📄' : + storagetype in ("FileTree", "BlobTree") ? '📁' : '❓' print(io, " ", icon, ' ', name, ' '^pad, " => ", data.uuid) if i < length(sorted) diff --git a/src/entrypoint.jl b/src/entrypoint.jl index 9cb5fe5..e658c97 100644 --- a/src/entrypoint.jl +++ b/src/entrypoint.jl @@ -93,8 +93,8 @@ Get a string representation of the "DataSet type", which represents the type of the data *outside* Julia. A given DataSet type may be mapped into many different Julia types. For example -consider the "Blob" type which is an array of bytes (commonly held in a file). -When loaded into Julia, this may be represented as a +consider the "File" type which is an array of bytes (commonly held in a file on +disk). 
When loaded into Julia, this may be represented as a * IO — via open()) * String — via open() |> read(_,String) * Vector{UInt8} — via mmap) diff --git a/src/file_data_projects.jl b/src/file_data_projects.jl index 55c7d5c..65a9979 100644 --- a/src/file_data_projects.jl +++ b/src/file_data_projects.jl @@ -214,8 +214,8 @@ inferred as a blob or tree based on whether the local path is a file or directory. """ function from_path(path::AbstractString) - dtype = isfile(path) ? "Blob" : - isdir(path) ? "BlobTree" : + dtype = isfile(path) ? "File" : + isdir(path) ? "FileTree" : nothing if isnothing(dtype) diff --git a/src/filesystem.jl b/src/filesystem.jl index 1219632..eb02219 100644 --- a/src/filesystem.jl +++ b/src/filesystem.jl @@ -14,8 +14,8 @@ end sys_joinpath(path::RelPath) = isempty(path.components) ? "" : joinpath(path.components...) sys_abspath(path::AbsPath) = sys_abspath(path.root, path.path) -sys_abspath(tree::BlobTree) = sys_abspath(tree.root, tree.path) -sys_abspath(file::Blob) = sys_abspath(file.root, file.path) +sys_abspath(tree::FileTree) = sys_abspath(tree.root, tree.path) +sys_abspath(file::File) = sys_abspath(file.root, file.path) #-------------------------------------------------- # Storage data interface for trees @@ -44,7 +44,7 @@ function Base.mkdir(root::AbstractFileSystemRoot, path::RelPath; kws...) error("Cannot make directory in read-only tree") end mkdir(sys_abspath(root, path), args...) - return BlobTree(root, path) + return FileTree(root, path) end function Base.rm(root::AbstractFileSystemRoot, path::RelPath; kws...) 
@@ -59,7 +59,7 @@ function Base.delete!(root::AbstractFileSystemRoot, path::RelPath) end #-------------------------------------------------- -# Storage data interface for Blob +# Storage data interface for File # TODO: Make this the generic implementation for AbstractDataStorage function Base.open(f::Function, as_type::Type{IO}, @@ -85,19 +85,19 @@ Base.read(root::AbstractFileSystemRoot, path::RelPath) = ## Metadata spec -For Blob: +For File: ``` [datasets.storage] driver="FileSystem" - type="Blob" + type="File" path=\$(path_to_file) ``` -For BlobTree: +For FileTree: ``` [datasets.storage] driver="FileSystem" - type="BlobTree" + type="FileTree" path=\$(path_to_directory) ``` """ @@ -151,7 +151,7 @@ sys_abspath(root::TempFilesystemRoot) = root.path function newdir() # cleanup=false: we manage our own cleanup via the finalizer path = mktempdir(cleanup=false) - return BlobTree(TempFilesystemRoot(path)) + return FileTree(TempFilesystemRoot(path)) end function newdir(root::AbstractFileSystemRoot, path::RelPath; overwrite=false) @@ -175,7 +175,7 @@ function newfile(func=nothing) close(io) end end - return Blob(TempFilesystemRoot(path)) + return File(TempFilesystemRoot(path)) end function newfile(f::Function, root::AbstractFileSystemRoot, path::RelPath; kws...) @@ -193,26 +193,26 @@ end function newdir(ctx::AbstractFileSystemRoot) Base.depwarn(""" `newdir(ctx::AbstractFileSystemRoot)` is deprecated. Use the in-place - version `newdir(::BlobTree, path)` instead. + version `newdir(::FileTree, path)` instead. """, :newdir) path = mktempdir(sys_abspath(ctx), cleanup=false) - return BlobTree(TempFilesystemRoot(path)) + return FileTree(TempFilesystemRoot(path)) end function newfile(ctx::AbstractFileSystemRoot) Base.depwarn(""" `newfile(ctx::AbstractFileSystemRoot)` is deprecated. Use the in-place - version `newfile(::BlobTree, path)` instead. + version `newfile(::FileTree, path)` instead. 
""", :newfile) path, io = mktemp(sys_abspath(ctx), cleanup=false) close(io) - return Blob(TempFilesystemRoot(path)) + return File(TempFilesystemRoot(path)) end function newfile(f::Function, root::FileSystemRoot) Base.depwarn(""" `newfile(f::Function, ctx::AbstractFileSystemRoot)` is deprecated. - Use newfile() or the in-place version `newfile(::BlobTree, path)` instead. + Use newfile() or the in-place version `newfile(::FileTree, path)` instead. """, :newfile) path, io = mktemp(sys_abspath(root), cleanup=false) try @@ -223,7 +223,7 @@ function newfile(f::Function, root::FileSystemRoot) finally close(io) end - return Blob(TempFilesystemRoot(path)) + return File(TempFilesystemRoot(path)) end # Move srcpath to destpath, making all attempts to preserve the original @@ -270,8 +270,8 @@ function mv_force_with_dest_rollback(srcpath, destpath, tempdir_parent) end end -function Base.setindex!(tree::BlobTree{<:AbstractFileSystemRoot}, - tmpdata::Union{Blob{TempFilesystemRoot},BlobTree{TempFilesystemRoot}}, +function Base.setindex!(tree::FileTree{<:AbstractFileSystemRoot}, + tmpdata::Union{File{TempFilesystemRoot},FileTree{TempFilesystemRoot}}, name::AbstractString) if !iswriteable(tree.root) error("Attempt to move to a read-only tree $tree") @@ -308,12 +308,12 @@ end function connect_filesystem(f, config, dataset) path = config["path"] type = config["type"] - if type == "Blob" + if type in ("File", "Blob") isfile(path) || throw(ArgumentError("$(repr(path)) should be a file")) - storage = Blob(FileSystemRoot(path)) - elseif type == "BlobTree" + storage = File(FileSystemRoot(path)) + elseif type in ("FileTree", "BlobTree") isdir(path) || throw(ArgumentError("$(repr(path)) should be a directory")) - storage = BlobTree(FileSystemRoot(path)) + storage = FileTree(FileSystemRoot(path)) path = dataspec_fragment_as_path(dataset) if !isnothing(path) storage = storage[path] diff --git a/src/paths.jl b/src/paths.jl index 043f541..8029170 100644 --- a/src/paths.jl +++ b/src/paths.jl @@ 
-11,7 +11,7 @@ A `RelPath` is a *key* into a hierarchical string-indexed tree datastructure, with each component indexing one level of the hierarchy. As a key, the resource referred to by a path may or may not exist. -Conversely, `BlobTree` and `Blob` refer to the actual data stored with a given +Conversely, `FileTree` and `File` refer to the actual data stored with a given key. """ struct RelPath <: AbstractPath diff --git a/src/repl.jl b/src/repl.jl index 2988352..e7f5709 100644 --- a/src/repl.jl +++ b/src/repl.jl @@ -64,7 +64,7 @@ function hexdump(out_stream, buf; groups_per_line=8, group_size=2, max_lines=typ end end -function _show_dataset(out_stream::IO, blob::Blob) +function _show_dataset(out_stream::IO, blob::File) @context begin io = @! open(IO, blob) N = 1024 @@ -97,7 +97,7 @@ function _show_dataset(out_stream::IO, blob::Blob) end end -function _show_dataset(out_stream::IO, tree::BlobTree) +function _show_dataset(out_stream::IO, tree::FileTree) show(out_stream, MIME("text/plain"), tree) end diff --git a/test/Data.toml b/test/Data.toml index f1199c8..d9e85d2 100644 --- a/test/Data.toml +++ b/test/Data.toml @@ -11,7 +11,7 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" [datasets.storage] driver="FileSystem" - type="Blob" + type="File" path="@__DIR__/data/file.txt" # TODO: We'd like a layering abstraction. @@ -30,7 +30,7 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" [datasets.storage] driver="FileSystem" - type="Blob" + type="File" path="@__DIR__/data/file.txt" #-------------------------------------------------- @@ -41,7 +41,7 @@ uuid="2d126588-5f76-4e53-8245-87dc91625bf4" [datasets.storage] driver="FileSystem" - type="Blob" + type="File" path="@__DIR__/data/people.csv.gz" #[[datasets.maps]] @@ -59,7 +59,7 @@ uuid="e7fd7080-e346-4a68-9ca9-98593a99266a" [datasets.storage] driver="FileSystem" - type="BlobTree" + type="FileTree" path="@__DIR__/data/csvset" # TODO: Add data maps here which expose it logically as a single CSV? 
@@ -75,7 +75,7 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" [datasets.storage] driver="TomlDataStorage" - type="Blob" + type="File" data="AAAAAAAARUA=" @@ -86,7 +86,7 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" [datasets.storage] driver="TomlDataStorage" - type="BlobTree" + type="FileTree" # TOML.print(Dict("datasets"=>[Dict("storage"=>Dict("data"=>Dict(["d0$i"=>Dict(["$x.txt"=>base64encode("$i $x content") for x in ("a","b")]...) for i in 1:4]...)))])) @@ -105,30 +105,3 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" [datasets.storage.data.d04] "b.txt" = "NCBiIGNvbnRlbnQ=" "a.txt" = "NCBhIGNvbnRlbnQ=" - -#-------------------------------------------------- -# Old backend API tests - -[[datasets]] -description="Test old storage backend API, Blob" -name="old_backend_blob" -uuid="785b3cdc-428e-426f-a3f7-3f6ae88a9637" - - [datasets.storage] - driver="OldBackendAPI" - type="Blob" - data="eA==" - -[[datasets]] -description="Test old storage backend API, BlobTree" -name="old_backend_tree" -uuid="4af3a8a9-983b-487b-bfd8-804ca50b4a0c" - - [datasets.storage] - driver="OldBackendAPI" - type="BlobTree" - - [datasets.storage.data] - "b.txt" = "Yg==" - "a.txt" = "YQ==" - diff --git a/test/DataCompat.toml b/test/DataCompat.toml new file mode 100644 index 0000000..27c98be --- /dev/null +++ b/test/DataCompat.toml @@ -0,0 +1,93 @@ +# This file contains datasets in older formats, retained for backward +# compatibility. 
+data_config_version=1 + +#-------------------------------------------------- +[[datasets]] +description="A text file" +name="a_text_file" +uuid="b498f769-a7f6-4f67-8d74-40b770398f26" + + [datasets.storage] + driver="FileSystem" + type="Blob" + path="@__DIR__/data/file.txt" + +#-------------------------------------------------- +[[datasets]] +name="a_tree_example" +uuid="e7fd7080-e346-4a68-9ca9-98593a99266a" + + [datasets.storage] + driver="FileSystem" + type="BlobTree" + path="@__DIR__/data/csvset" + + +#-------------------------------------------------- +# Data embedded in the TOML + +[[datasets]] +description="A data blob embedded in the TOML" +name="embedded_blob" +uuid="b498f769-a7f6-4f67-8d74-40b770398f26" + + [datasets.storage] + driver="TomlDataStorage" + type="File" + data="AAAAAAAARUA=" + + +[[datasets]] +description="A data tree embedded in the TOML" +name="embedded_tree" +uuid="b498f769-a7f6-4f67-8d74-40b770398f26" + + [datasets.storage] + driver="TomlDataStorage" + type="FileTree" + +# TOML.print(Dict("datasets"=>[Dict("storage"=>Dict("data"=>Dict(["d0$i"=>Dict(["$x.txt"=>base64encode("$i $x content") for x in ("a","b")]...) 
for i in 1:4]...)))])) + + [datasets.storage.data.d01] + "b.txt" = "MSBiIGNvbnRlbnQ=" + "a.txt" = "MSBhIGNvbnRlbnQ=" + + [datasets.storage.data.d02] + "b.txt" = "MiBiIGNvbnRlbnQ=" + "a.txt" = "MiBhIGNvbnRlbnQ=" + + [datasets.storage.data.d03] + "b.txt" = "MyBiIGNvbnRlbnQ=" + "a.txt" = "MyBhIGNvbnRlbnQ=" + + [datasets.storage.data.d04] + "b.txt" = "NCBiIGNvbnRlbnQ=" + "a.txt" = "NCBhIGNvbnRlbnQ=" + +#-------------------------------------------------- +# Old backend API tests + +[[datasets]] +description="Test old storage backend API, File" +name="old_backend_blob" +uuid="785b3cdc-428e-426f-a3f7-3f6ae88a9637" + + [datasets.storage] + driver="OldBackendAPI" + type="Blob" + data="eA==" + +[[datasets]] +description="Test old storage backend API, FileTree" +name="old_backend_tree" +uuid="4af3a8a9-983b-487b-bfd8-804ca50b4a0c" + + [datasets.storage] + driver="OldBackendAPI" + type="BlobTree" + + [datasets.storage.data] + "b.txt" = "Yg==" + "a.txt" = "YQ==" + diff --git a/test/DriverAutoloadData.toml b/test/DriverAutoloadData.toml index 24bf164..9449587 100644 --- a/test/DriverAutoloadData.toml +++ b/test/DriverAutoloadData.toml @@ -7,7 +7,7 @@ uuid="785b3cdc-428e-426f-a3f7-3f6ae88a9637" [datasets.storage] driver="DummyTomlStorage" - type="Blob" + type="File" data="data_from_dummy_backend" #------------------------------------------------------------------------------- diff --git a/test/TomlDataStorage.jl b/test/TomlDataStorage.jl index 06b3343..cb23c6c 100644 --- a/test/TomlDataStorage.jl +++ b/test/TomlDataStorage.jl @@ -3,7 +3,7 @@ proj = DataSets.load_project("Data.toml") blob_ds = dataset(proj, "embedded_blob") - @test open(blob_ds) isa Blob + @test open(blob_ds) isa File @test open(String, blob_ds) == "\0\0\0\0\0\0E@" @test read(open(blob_ds), Float64) === 42.0 @@ -15,7 +15,7 @@ @test @!(open(String, blob_ds)) == "\0\0\0\0\0\0E@" blob = @! 
open(blob_ds) - @test blob isa Blob + @test blob isa File @test @!(open(String, blob)) == "\0\0\0\0\0\0E@" @test read(blob, Float64) === 42.0 @@ -23,12 +23,12 @@ end tree_ds = dataset(proj, "embedded_tree") - @test open(tree_ds) isa BlobTree + @test open(tree_ds) isa FileTree @test open(String, open(tree_ds)[path"d01/a.txt"]) == "1 a content" @test open(String, open(tree_ds)[path"d02/b.txt"]) == "2 b content" @context begin tree = @! open(tree_ds) - @test tree isa BlobTree + @test tree isa FileTree @test isdir(tree) @test !isfile(tree) diff --git a/test/active_project/Data.toml b/test/active_project/Data.toml index 2fbd893..03158c8 100644 --- a/test/active_project/Data.toml +++ b/test/active_project/Data.toml @@ -7,5 +7,5 @@ uuid="314996ef-12be-40d0-912c-9755af354fdb" [datasets.storage] driver="FileSystem" - type="Blob" + type="File" path="@__DIR__/data/file.txt" diff --git a/test/backend_compat.jl b/test/backend_compat.jl index 55c9872..c2f8335 100644 --- a/test/backend_compat.jl +++ b/test/backend_compat.jl @@ -66,7 +66,7 @@ DataSets.add_storage_driver("OldBackendAPI"=>connect_old_backend) #------------------------------------------------------------------------------- @testset "OldBackendAPI" begin - proj = DataSets.load_project("Data.toml") + proj = DataSets.load_project(joinpath(@__DIR__, "DataCompat.toml")) @test open(IO, dataset(proj, "old_backend_blob")) do io read(io, String) @@ -77,8 +77,22 @@ DataSets.add_storage_driver("OldBackendAPI"=>connect_old_backend) @test read(open(dataset(proj, "old_backend_blob"))) == UInt8['x'] @test readdir(open(dataset(proj, "old_backend_tree"))) == ["a.txt", "b.txt"] - @test open(dataset(proj, "old_backend_tree"))[path"a.txt"] isa Blob + @test open(dataset(proj, "old_backend_tree"))[path"a.txt"] isa File @test read(open(dataset(proj, "old_backend_tree"))[path"a.txt"], String) == "a" @test read(open(dataset(proj, "old_backend_tree"))[path"b.txt"], String) == "b" end +@testset "Compat for renaming Blob->File, 
BlobTree->FileTree" begin + proj = DataSets.load_project(joinpath(@__DIR__, "DataCompat.toml")) + + text_data = dataset(proj, "a_text_file") + @test open(text_data) isa Blob + @test read(open(text_data), String) == "Hello world!\n" + + tree_data = dataset(proj, "a_tree_example") + @context begin + @test @!(open(tree_data)) isa BlobTree + tree = @! open(tree_data) + @test readdir(tree) == ["1.csv", "2.csv"] + end +end diff --git a/test/drivers/DummyStorageBackends/src/DummyStorageBackends.jl b/test/drivers/DummyStorageBackends/src/DummyStorageBackends.jl index 48a5782..92d10c3 100644 --- a/test/drivers/DummyStorageBackends/src/DummyStorageBackends.jl +++ b/test/drivers/DummyStorageBackends/src/DummyStorageBackends.jl @@ -13,7 +13,7 @@ end function connect_dummy_backend(f, config, ds) storage = DummyBackend(config["data"]) - f(Blob(storage)) + f(File(storage)) end function __init__() diff --git a/test/entrypoint.jl b/test/entrypoint.jl index 85a3597..f1b8ad8 100644 --- a/test/entrypoint.jl +++ b/test/entrypoint.jl @@ -1,14 +1,14 @@ # Data entry point functions read_data = nothing -@datafunc function main1(x::Blob=>String, t::BlobTree=>BlobTree) +@datafunc function main1(x::File=>String, t::FileTree=>FileTree) csv_data = open(IO, t["1.csv"]) do io read(io,String) end global read_data = (x_string=x, csv_data=csv_data) end -@datafunc function main1(x::Blob=>IO) +@datafunc function main1(x::File=>IO) x_data = read(x, String) global read_data = (x_data=x_data,) end diff --git a/test/projects.jl b/test/projects.jl index 6d34b54..5ba13e4 100644 --- a/test/projects.jl +++ b/test/projects.jl @@ -13,8 +13,6 @@ test_project_names = ["a_text_file", "embedded_blob", "embedded_tree", "gzipped_table", - "old_backend_blob", - "old_backend_tree", "some_namespace/a_text_file"] @testset "TomlFileDataProject" begin @@ -56,7 +54,7 @@ end [datasets.storage] driver="FileSystem" - type="Blob" + type="File" path="@__DIR__/data/file.txt" """) flush(io) @@ -75,7 +73,7 @@ end [datasets.storage] 
driver="FileSystem" - type="Blob" + type="File" path="@__DIR__/data/file2.txt" """) flush(io) diff --git a/test/runtests.jl b/test/runtests.jl index bafb3e5..101acdd 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -28,7 +28,7 @@ end "storage"=>Dict( "driver"=>"FileSystem", - "type"=>"Blob", + "type"=>"File", "path"=>joinpath(@__DIR__, "data", "file.txt") ) )] @@ -44,22 +44,22 @@ end proj = DataSets.load_project("Data.toml") text_data = dataset(proj, "a_text_file") - @test open(text_data) isa Blob + @test open(text_data) isa File @test read(open(text_data), String) == "Hello world!\n" @context begin @test read(@!(open(text_data)), String) == "Hello world!\n" end tree_data = dataset(proj, "a_tree_example") - @test open(tree_data) isa BlobTree + @test open(tree_data) isa FileTree @context begin - @test @!(open(tree_data)) isa BlobTree + @test @!(open(tree_data)) isa FileTree tree = @! open(tree_data) @test readdir(tree) == ["1.csv", "2.csv"] end blob_in_tree_data = dataset(proj, "a_tree_example#1.csv") - @test open(blob_in_tree_data) isa Blob + @test open(blob_in_tree_data) isa File @context begin @test @!(open(String, blob_in_tree_data)) == """Name,Age\n"Aaron",23\n"Harry",42\n""" end @@ -72,32 +72,32 @@ end dir_dataset = DataSets.from_path(joinpath(@__DIR__, "data", "csvset")) - @test open(dir_dataset) isa BlobTree + @test open(dir_dataset) isa FileTree @test keys(open(dir_dataset)) == ["1.csv", "2.csv"] end #------------------------------------------------------------------------------- -@testset "open() for Blob and BlobTree" begin - blob = Blob(FileSystemRoot("data/file.txt")) +@testset "open() for File and FileTree" begin + blob = File(FileSystemRoot("data/file.txt")) @test open(identity, String, blob) == "Hello world!\n" @test String(open(identity, Vector{UInt8}, blob)) == "Hello world!\n" @test open(io->read(io,String), IO, blob) == "Hello world!\n" - @test open(identity, Blob, blob) === blob + @test open(identity, File, blob) === blob # Unscoped forms 
@test open(String, blob) == "Hello world!\n" @test String(open(Vector{UInt8}, blob)) == "Hello world!\n" @test read(open(IO, blob), String) == "Hello world!\n" - tree = BlobTree(FileSystemRoot("data")) - @test open(identity, BlobTree, tree) === tree + tree = FileTree(FileSystemRoot("data")) + @test open(identity, FileTree, tree) === tree # Context-based forms @context begin @test @!(open(String, blob)) == "Hello world!\n" @test String(@! open(Vector{UInt8}, blob)) == "Hello world!\n" @test read(@!(open(IO, blob)), String) == "Hello world!\n" - @test @!(open(Blob, blob)) === blob - @test @!(open(BlobTree, tree)) === tree + @test @!(open(File, blob)) === blob + @test @!(open(FileTree, tree)) === tree end end From 05234419ee90e88ac91ef95a16a5d1d02b9ad043 Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Mon, 16 May 2022 17:40:03 +1000 Subject: [PATCH 08/33] Add more docs for FileTree Not including documentation on writing to a file tree yet; that will come in a future change. --- docs/src/reference.md | 2 +- docs/src/tutorial.md | 89 +++++++++++++++++++++++++++++++++---------- 2 files changed, 69 insertions(+), 22 deletions(-) diff --git a/docs/src/reference.md b/docs/src/reference.md index 7850d06..a99ac4b 100644 --- a/docs/src/reference.md +++ b/docs/src/reference.md @@ -51,7 +51,7 @@ DataSets.TomlFileDataProject DataSets provides some builtin data models [`File`](@ref) and [`FileTree`](@ref) for accessin file- and directory-like data respectively. For modifying these, the functions [`newfile`](@ref) and [`newdir`](@ref) can be -used, together with `setindex!` for `FileTree`. +used. 
```@docs File diff --git a/docs/src/tutorial.md b/docs/src/tutorial.md index 668ddfe..bc8b53b 100644 --- a/docs/src/tutorial.md +++ b/docs/src/tutorial.md @@ -54,6 +54,8 @@ particular dataset: ```jldoctest julia> dataset("a_text_file") +DataSet instance: + name = "a_text_file" uuid = "b498f769-a7f6-4f67-8d74-40b770398f26" description = "A text file containing the standard greeting" @@ -70,10 +72,12 @@ global configuration this is also possible: ```jldoctest julia> project = DataSets.load_project("src/Data.toml") DataSets.DataProject: - a_text_file => b498f769-a7f6-4f67-8d74-40b770398f26 - a_tree_example => e7fd7080-e346-4a68-9ca9-98593a99266a + 📄 a_text_file => b498f769-a7f6-4f67-8d74-40b770398f26 + 📁 a_tree_example => e7fd7080-e346-4a68-9ca9-98593a99266a julia> dataset(project, "a_text_file") +DataSet instance: + name = "a_text_file" uuid = "b498f769-a7f6-4f67-8d74-40b770398f26" description = "A text file containing the standard greeting" @@ -84,20 +88,15 @@ type = "File" path = ".../DataSets/docs/src/data/file.txt" ``` -## Loading Data +## Working with `File` data -You can call `open()` on a DataSet to inspect the data inside. `open()` will -return the [`File`](@ref) and [`FileTree`](@ref) types for local files and -directories on disk. For example, +The most basic type of dataset is the [`File`](@ref) which is a simple 1D array +of bytes (ie, a `Vector{UInt8}`; a blob). To access the file you can call +`open()` on the corresponding DataSet which will return a `File`. For example, ```jldoctest julia> open(dataset("a_text_file")) 📄 @ .../DataSets/docs/src/data/file.txt - -julia> open(dataset("a_tree_example")) -📂 Tree @ .../DataSets/docs/src/data/csvset - 📄 1.csv - 📄 2.csv ``` Use the form `open(T, dataset)` to read the data as a specific type. 
`File` @@ -121,7 +120,7 @@ julia> open(String, dataset("a_text_file")) ``` To ensure the dataset is closed again in a timely way (freeing any resources -such as file handles), you should use the scoped form, for example: +such as file handles), you can use the scoped form, for example: ```jldoctest julia> open(IO, dataset("a_text_file")) do io @@ -132,11 +131,24 @@ julia> open(IO, dataset("a_text_file")) do io content = "Hello world!\n" ``` +## Working with `FileTree` data + Let's look at some tree-like data which is represented on local disk as a -folder or directory. Tree data is opened in Julia as the [`FileTree`](@ref) -type and can be indexed with path components to get at the file [`File`](@ref)s -inside. In turn, we can `open()` one of the file blobs and look at the data -contained within. +folder or directory. Tree data is represented in Julia as the +[`FileTree`](@ref) type and can be indexed with path components to get at the +[`File`](@ref)s inside. In turn, we can `open()` one of the file blobs and +look at the data contained within. + +```jldoctest +julia> open(dataset("a_tree_example")) +📂 Tree @ .../DataSets/docs/src/data/csvset + 📄 1.csv + 📄 2.csv +``` + +A `FileTree` has a dictionary-like API: it's a map from `String` names to +`File`s or `FileTree` subtrees. Iterating over it yields each child of the tree +in turn. 
For example, to examine the content of all files in a tree: ```jldoctest julia> tree = open(FileTree, dataset("a_tree_example")) @@ -144,15 +156,50 @@ julia> tree = open(FileTree, dataset("a_tree_example")) 📄 1.csv 📄 2.csv +julia> for file in tree + content = open(String, file) + @info "File content" file content + end +┌ Info: File content +│ file = 📄 1.csv @ .../DataSets/docs/src/data/csvset +└ content = "Name,Age\n\"Aaron\",23\n\"Harry\",42\n" +┌ Info: File content +│ file = 📄 2.csv @ .../DataSets/docs/src/data/csvset +└ content = "Name,Age\n\"Rose\",19\n\"Tom\",25\n" +``` + +To list the names of files and subtrees, use `keys()`, or `haskey()` to +determine the presence of a file name + +```jldoctest +julia> tree = open(FileTree, dataset("a_tree_example")); + +julia> keys(tree) +2-element Vector{String}: + "1.csv" + "2.csv" + +julia> haskey(tree, "not_there.csv") +false +``` + +To get a particular file, indexing can be used, and `isfile()` and `isdir()` +can be used to detect whether a child of a tree is a file or a subtree. 
+
+```jldoctest
+julia> tree = open(FileTree, dataset("a_tree_example"));
+
 julia> tree["1.csv"]
-📄 1.csv @ .../DataSets/docs/src/data/csvset
+📄 1.csv @ .../DataSets/docs/src/data/csvset
+
+julia> isfile(tree["1.csv"])
+true
 
-julia> open(String, tree["1.csv"]) |> Text
-Name,Age
-"Aaron",23
-"Harry",42
+julia> isdir(tree)
+true
 ```
 
+
 ## Program Entry Points
 
 Rather than manually using the `open()` functions as shown above, the

From 49b14d910d2db7351ec80e4b8c7e2fec2351d59c Mon Sep 17 00:00:00 2001
From: Chris Foster
Date: Tue, 17 May 2022 15:42:19 +1000
Subject: [PATCH 09/33] Move file BlobTree.jl -> FileTree.jl

---
 src/DataSets.jl                  | 2 +-
 src/{BlobTree.jl => FileTree.jl} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename src/{BlobTree.jl => FileTree.jl} (100%)

diff --git a/src/DataSets.jl b/src/DataSets.jl
index 5396a2b..150f565 100644
--- a/src/DataSets.jl
+++ b/src/DataSets.jl
@@ -222,7 +222,7 @@ end
 include("entrypoint.jl")
 
 # Builtin Data models
-include("BlobTree.jl")
+include("FileTree.jl")
 
 # Builtin backends
 include("filesystem.jl")
diff --git a/src/BlobTree.jl b/src/FileTree.jl
similarity index 100%
rename from src/BlobTree.jl
rename to src/FileTree.jl

From 311ead286df9940f98c501b8ab1e87e409bd4714 Mon Sep 17 00:00:00 2001
From: Chris Foster
Date: Wed, 18 May 2022 19:58:30 +1000
Subject: [PATCH 10/33] WIP: Improved test coverage for FileTree / File

Also some docstring refinement

---
 src/FileTree.jl   |  32 +++++-----
 src/filesystem.jl |   8 +++
 src/paths.jl      |   1 +
 test/FileTree.jl  | 151 ++++++++++++++++++++++++++++++++++++++++++++++
 test/runtests.jl  |  48 +--------------
 5 files changed, 176 insertions(+), 64 deletions(-)
 create mode 100644 test/FileTree.jl

diff --git a/src/FileTree.jl b/src/FileTree.jl
index 563fa98..11eae62 100644
--- a/src/FileTree.jl
+++ b/src/FileTree.jl
@@ -89,14 +89,17 @@ function Base.copy!(dst::AbstractFileTree, src::AbstractFileTree)
             copy!(newdir(dst, xname), x)
         else
             open(x) do io_src
-                newfile(dst, xname, 
write=true) do io_dst + newfile(dst, xname, overwrite=true) do io_dst write(io_dst, io_src) end end end end + return dst end +Base.copy(src::AbstractFileTree) = copy!(newdir(), src) + #------------------------------------------------------------------------------- """ File(root) @@ -122,6 +125,7 @@ Base.abspath(file::File) = AbsPath(file.root, file.path) Base.isdir(file::File) = false Base.isfile(file::File) = true Base.ispath(file::File) = true +Base.filesize(file::File) = filesize(file.root, file.path) function Base.show(io::IO, ::MIME"text/plain", file::File) print(io, "📄 ", file.path, " @ ", summary(file.root)) @@ -197,9 +201,9 @@ end end # Unscoped form of open for File -function Base.open(::Type{T}, blob::File; kws...) where {T} +function Base.open(::Type{T}, file::File; kws...) where {T} @context begin - result = @! open(T, blob; kws...) + result = @! open(T, file; kws...) @! ResourceContexts.detach_context_cleanup(result) end end @@ -247,10 +251,12 @@ directory. * Property access - `isdir()`, `isfile()` - determine whether a child of tree is a directory or file. + - `filesize()` — size of `File` elements in a tree # Example -You can create a new temporary FileTree via the `newdir()` function: +Create a new temporary FileTree via the `newdir()` function and fill it with +files via `newfile()`: ``` julia> dir = newdir() @@ -272,8 +278,7 @@ julia> dir = newdir() 📄 b-3.txt ``` -You can also get access to a `FileTree` by using `DataSets.from_path()` with a -local directory name. For example: +Create a `FileTree` from a local directory with `DataSets.from_path()`: ``` julia> using Pkg @@ -400,19 +405,12 @@ end """ newdir(tree, path; overwrite=false) -Create a new directory at tree[path] and return it. If `overwrite=true`, remove -any existing directory before creating the new one. - - newdir() - -Create a new temporary `FileTree` which can have files assigned into it and may -be assigned to a permanent location in a persistent `FileTree`. 
If not assigned -to a permanent location, the temporary tree is cleaned up during garbage -collection. +Create a new FileTree ("directory") at tree[path] and return it. If +`overwrite=true`, remove any existing tree before creating the new one. """ function newdir(tree::FileTree, path::RelPath; overwrite=false) _check_new_item(tree, path, overwrite) - p = joinpath(tree.path, RelPath(path)) + p = joinpath(tree.path, path) newdir(tree.root, p; overwrite=overwrite) return FileTree(tree.root, p) end @@ -475,7 +473,7 @@ end # Path manipulation # TODO: Maybe deprecate these? Under the "datastructure-like" model, it seems wrong -# for a blob to know its name in the parent data structure. +# for a file to know its name in the parent data structure. Base.basename(tree::FileTree) = basename(tree.path) Base.abspath(tree::FileTree) = AbsPath(tree.root, tree.path) diff --git a/src/filesystem.jl b/src/filesystem.jl index eb02219..82d4f50 100644 --- a/src/filesystem.jl +++ b/src/filesystem.jl @@ -30,6 +30,7 @@ sys_abspath(file::File) = sys_abspath(file.root, file.path) Base.isdir(root::AbstractFileSystemRoot, path::RelPath) = isdir(sys_abspath(root, path)) Base.isfile(root::AbstractFileSystemRoot, path::RelPath) = isfile(sys_abspath(root, path)) Base.ispath(root::AbstractFileSystemRoot, path::RelPath) = ispath(sys_abspath(root, path)) +Base.filesize(root::AbstractFileSystemRoot, path::RelPath) = filesize(sys_abspath(root, path)) Base.summary(io::IO, root::AbstractFileSystemRoot) = print(io, sys_abspath(root)) @@ -148,6 +149,13 @@ end iswriteable(root::TempFilesystemRoot) = true sys_abspath(root::TempFilesystemRoot) = root.path +""" + newdir() + +Create a new `FileTree` on the local temporary directory. If not moved to a +permanent location (for example, with `some_tree["name"] = newdir()`) the +temporary tree will be cleaned up during garbage collection. 
+""" function newdir() # cleanup=false: we manage our own cleanup via the finalizer path = mktempdir(cleanup=false) diff --git a/src/paths.jl b/src/paths.jl index 8029170..54bf4ed 100644 --- a/src/paths.jl +++ b/src/paths.jl @@ -18,6 +18,7 @@ struct RelPath <: AbstractPath components::Vector{String} end +RelPath(path::RelPath) = path RelPath(str::AbstractString) = RelPath(split(str, '/')) RelPath(components::AbstractVector=String[]) = RelPath(convert(Vector{String}, components)) diff --git a/test/FileTree.jl b/test/FileTree.jl new file mode 100644 index 0000000..2223799 --- /dev/null +++ b/test/FileTree.jl @@ -0,0 +1,151 @@ + +@testset "File API" begin + file = File(FileSystemRoot("data/file.txt")) + + @testset "metadata" begin + @test filesize(file) == 13 + @test isfile(file) + @test !isdir(file) + @test ispath(file) + end + + @testset "open()" begin + # Do-block based forms + @test open(identity, String, file) == "Hello world!\n" + @test String(open(identity, Vector{UInt8}, file)) == "Hello world!\n" + @test open(io->read(io,String), IO, file) == "Hello world!\n" + @test open(identity, File, file) === file + + # Unscoped forms + @test open(String, file) == "Hello world!\n" + @test String(open(Vector{UInt8}, file)) == "Hello world!\n" + @test read(open(IO, file), String) == "Hello world!\n" + + # Context-based forms + @context begin + @test @!(open(String, file)) == "Hello world!\n" + @test String(@! 
open(Vector{UInt8}, file)) == "Hello world!\n" + @test read(@!(open(IO, file)), String) == "Hello world!\n" + @test @!(open(File, file)) === file + end + end +end + +@testset "FileTree API" begin + tree = FileTree(FileSystemRoot("data")) + + @testset "metadata" begin + @test !isfile(tree) + @test isdir(tree) + @test ispath(tree) + end + + @test tree["csvset"] isa FileTree + @test tree["csvset/1.csv"] isa File + @test tree["csvset"]["2.csv"] isa File + + @testset "open()" begin + @test open(identity, FileTree, tree) === tree + + # Context-based forms + @context begin + @test @!(open(FileTree, tree)) === tree + end + end + + # TODO: delete! +end + +@testset "newfile() and newdir()" begin + @testset "isolated newfile" begin + @test newfile() isa File + @test read(newfile()) == [] + @test begin + f = newfile() do io + print(io, "content") + end + read(f, String) + end == "content" + end + + tree = newdir() + for j=1:2 + d = newdir(tree, "d$j") + for i=1:2 + newfile(d, "hi_$(j)_$(i).txt") do io + println(io, "hi $j/$i") + end + end + end + @test read(tree["d1/hi_1_1.txt"], String) == "hi 1/1\n" + @test read(tree["d1/hi_1_2.txt"], String) == "hi 1/2\n" + @test read(tree["d2/hi_2_1.txt"], String) == "hi 2/1\n" + @test read(tree["d2/hi_2_2.txt"], String) == "hi 2/2\n" + + @testset "Iteration" begin + # keys + @test keys(tree) == ["d1", "d2"] + @test keys(tree["d1"]) == ["hi_1_1.txt", "hi_1_2.txt"] + @test keys(tree["d2"]) == ["hi_2_1.txt", "hi_2_2.txt"] + # values + for v in tree + @test v isa FileTree + end + for v in values(tree) + @test v isa FileTree + end + for v in tree["d1"] + @test v isa File + end + # pairs + @test first.(pairs(tree["d1"])) == ["hi_1_1.txt", "hi_1_2.txt"] + #@test typeof.(last.(pairs(tree["d1"]))) == [File, File] + end + + @testset "copy / copy! 
for FileTree" begin + tree2 = copy!(newdir(), tree) + @test keys(tree2) == ["d1", "d2"] + @test keys(tree2["d1"]) == ["hi_1_1.txt", "hi_1_2.txt"] + @test keys(tree2["d2"]) == ["hi_2_1.txt", "hi_2_2.txt"] + @test read(tree2["d1/hi_1_1.txt"], String) == "hi 1/1\n" + + @testset "copy! into a subtree" begin + copy!(newdir(tree2, "dst"), tree) + @test keys(tree2["dst"]) == ["d1", "d2"] + @test keys(tree2["dst/d1"]) == ["hi_1_1.txt", "hi_1_2.txt"] + end + + @testset "copy" begin + @test keys(copy(tree)) == ["d1", "d2"] + end + end + + @testset "newdir/newfile with overwrite=true" begin + tree3 = copy!(newdir(), tree) + + @test_throws ErrorException newdir(tree3, "d1") + @test keys(tree3["d1"]) == ["hi_1_1.txt", "hi_1_2.txt"] + newdir(tree3, "d1", overwrite=true) + @test keys(tree3["d1"]) == [] + + # Various forms of newfile + @test newfile(tree3, "empty") isa File + @test open(String, tree3["empty"]) == "" + @test_throws ErrorException newfile(tree3, "empty") + newfile(tree3, "empty", overwrite=true) do io + print(io, "xxx") + end + @test open(String, tree3["empty"]) == "xxx" + # newfile creates directories implicitly + @test newfile(tree3, "a/b/c") isa File + @test tree3["a"]["b"]["c"] isa File + end +end + +#= +#TODO +@testset "FileSystemRoot" begin + # Test that the file is persisted on disk + @test isfile(DataSets.sys_abspath(tree["d1/hi_2.txt"])) +end +=# diff --git a/test/runtests.jl b/test/runtests.jl index 101acdd..a7d65fc 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -76,31 +76,6 @@ end @test keys(open(dir_dataset)) == ["1.csv", "2.csv"] end -#------------------------------------------------------------------------------- -@testset "open() for File and FileTree" begin - blob = File(FileSystemRoot("data/file.txt")) - @test open(identity, String, blob) == "Hello world!\n" - @test String(open(identity, Vector{UInt8}, blob)) == "Hello world!\n" - @test open(io->read(io,String), IO, blob) == "Hello world!\n" - @test open(identity, File, blob) === blob - # 
Unscoped forms - @test open(String, blob) == "Hello world!\n" - @test String(open(Vector{UInt8}, blob)) == "Hello world!\n" - @test read(open(IO, blob), String) == "Hello world!\n" - - tree = FileTree(FileSystemRoot("data")) - @test open(identity, FileTree, tree) === tree - - # Context-based forms - @context begin - @test @!(open(String, blob)) == "Hello world!\n" - @test String(@! open(Vector{UInt8}, blob)) == "Hello world!\n" - @test read(@!(open(IO, blob)), String) == "Hello world!\n" - @test @!(open(File, blob)) === blob - @test @!(open(FileTree, tree)) === tree - end -end - #------------------------------------------------------------------------------- @testset "Data set names" begin # Valid names @@ -151,28 +126,7 @@ end @test dataset(proj, "a_text_file?x=1&yy=2#frag")["dataspec"]["fragment"] == "frag" end -#------------------------------------------------------------------------------- -# Trees -@testset "Temporary trees" begin - function write_dir(j) - d = newdir() - for i=1:2 - d["hi_$i.txt"] = newfile() do io - println(io, "hi $j $i") - end - end - return d - end - - temptree = newdir() - for j=1:3 - temptree["d$j"] = write_dir(j) - end - @test open(io->read(io,String), IO, temptree["d1"]["hi_2.txt"]) == "hi 1 2\n" - @test open(io->read(io,String), IO, temptree["d3"]["hi_1.txt"]) == "hi 3 1\n" - @test isfile(DataSets.sys_abspath(temptree["d1"]["hi_2.txt"])) -end - +include("FileTree.jl") include("projects.jl") include("entrypoint.jl") include("repl.jl") From 4c7480e5bd7142f13caf3e74c070c212f90d07b3 Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Thu, 19 May 2022 19:43:28 +1000 Subject: [PATCH 11/33] Tests + simplify FileSystemRoot * Additional tests for FileTree - the whole API should be covered now. * Remove internal TempFilesystemRoot; replace with a simple flag on FileSystemRoot. * Remove internal AbstractFileSystemRoot; this seems unnecessary now. 
--- src/filesystem.jl | 277 ++++++++++++++++++++++------------------------ test/FileTree.jl | 156 ++++++++++++++++---------- 2 files changed, 228 insertions(+), 205 deletions(-) diff --git a/src/filesystem.jl b/src/filesystem.jl index 82d4f50..f16d5e7 100644 --- a/src/filesystem.jl +++ b/src/filesystem.jl @@ -1,22 +1,64 @@ -# -# Storage Driver implementation for trees which are rooted in the file system -# (in git terminology, there exists a "working copy") -# -abstract type AbstractFileSystemRoot end +""" +Root storage object for trees which are rooted in the file system (in git +terminology, there exists a "working copy") + +## Metadata spec + +For File: +``` + [datasets.storage] + driver="FileSystem" + type="File" + path=\$(path_to_file) +``` + +For FileTree: +``` + [datasets.storage] + driver="FileSystem" + type="FileTree" + path=\$(path_to_directory) +``` +""" +mutable struct FileSystemRoot + path::String + write::Bool + cleanup::Bool +end + +function FileSystemRoot(path::AbstractString; write=false, cleanup=false) + path = abspath(path) + root = FileSystemRoot(path, write, cleanup) + if cleanup + finalizer(root) do r + if r.cleanup + rm(r.path, recursive=true, force=true) + end + end + end + return root +end # These functions sys_abspath and sys_joinpath generate/joins OS-specific # _local filesystem paths_ out of logical paths. They should be defined only # for trees which are rooted in the actual filesystem. -function sys_abspath(root::AbstractFileSystemRoot, path::RelPath) +sys_joinpath(path::RelPath) = isempty(path.components) ? "" : joinpath(path.components...) + +sys_abspath(root::FileSystemRoot) = root.path + +function sys_abspath(root::FileSystemRoot, path::RelPath) rootpath = sys_abspath(root) return isempty(path.components) ? rootpath : joinpath(rootpath, sys_joinpath(path)) end -sys_joinpath(path::RelPath) = isempty(path.components) ? "" : joinpath(path.components...) 
 sys_abspath(path::AbsPath) = sys_abspath(path.root, path.path)
 sys_abspath(tree::FileTree) = sys_abspath(tree.root, tree.path)
 sys_abspath(file::File) = sys_abspath(file.root, file.path)
 
+iswriteable(root::FileSystemRoot) = root.write
+
+
+
 #--------------------------------------------------
 # Storage data interface for trees
 #
@@ -25,22 +67,22 @@ sys_abspath(file::File) = sys_abspath(file.root, file.path)
 ## 1. Query
 
 # TODO: would it be better to express the following dispatch in terms of
-# AbsPath{<:AbstractFileSystemRoot} rather than usin double dispatch?
+# AbsPath{<:FileSystemRoot} rather than using double dispatch?
 
-Base.isdir(root::AbstractFileSystemRoot, path::RelPath) = isdir(sys_abspath(root, path))
-Base.isfile(root::AbstractFileSystemRoot, path::RelPath) = isfile(sys_abspath(root, path))
-Base.ispath(root::AbstractFileSystemRoot, path::RelPath) = ispath(sys_abspath(root, path))
-Base.filesize(root::AbstractFileSystemRoot, path::RelPath) = filesize(sys_abspath(root, path))
+Base.isdir(root::FileSystemRoot, path::RelPath) = isdir(sys_abspath(root, path))
+Base.isfile(root::FileSystemRoot, path::RelPath) = isfile(sys_abspath(root, path))
+Base.ispath(root::FileSystemRoot, path::RelPath) = ispath(sys_abspath(root, path))
+Base.filesize(root::FileSystemRoot, path::RelPath) = filesize(sys_abspath(root, path))
 
-Base.summary(io::IO, root::AbstractFileSystemRoot) = print(io, sys_abspath(root))
+Base.summary(io::IO, root::FileSystemRoot) = print(io, sys_abspath(root))
 
-Base.readdir(root::AbstractFileSystemRoot, path::RelPath) = readdir(sys_abspath(root, path))
+Base.readdir(root::FileSystemRoot, path::RelPath) = readdir(sys_abspath(root, path))
 
 ## 2. Mutation
 #
 # TODO: Likely requires rework!
-function Base.mkdir(root::AbstractFileSystemRoot, path::RelPath; kws...)
+function Base.mkdir(root::FileSystemRoot, path::RelPath; kws...)
if !iswriteable(root) error("Cannot make directory in read-only tree") end @@ -48,15 +90,15 @@ function Base.mkdir(root::AbstractFileSystemRoot, path::RelPath; kws...) return FileTree(root, path) end -function Base.rm(root::AbstractFileSystemRoot, path::RelPath; kws...) +function Base.rm(root::FileSystemRoot, path::RelPath; kws...) rm(sys_abspath(root,path); kws...) end -function Base.delete!(root::AbstractFileSystemRoot, path::RelPath) +function Base.delete!(root::FileSystemRoot, path::RelPath) if !iswriteable(root) error("Cannot delete from read-only tree $root") end - rm(sys_abspath(root,path); recursive=true) + rm(sys_abspath(root, path); recursive=true) end #-------------------------------------------------- @@ -64,11 +106,11 @@ end # TODO: Make this the generic implementation for AbstractDataStorage function Base.open(f::Function, as_type::Type{IO}, - root::AbstractFileSystemRoot, path; kws...) + root::FileSystemRoot, path; kws...) @context f(@! open(as_type, root, path; kws...)) end -@! function Base.open(::Type{IO}, root::AbstractFileSystemRoot, path; +@! function Base.open(::Type{IO}, root::FileSystemRoot, path; write=false, read=!write, kws...) if !iswriteable(root) && write error("Error writing file at read-only path $path") @@ -76,78 +118,14 @@ end @! open(sys_abspath(root, path); read=read, write=write, kws...) 
end -Base.read(root::AbstractFileSystemRoot, path::RelPath, ::Type{T}) where {T} = +Base.read(root::FileSystemRoot, path::RelPath, ::Type{T}) where {T} = read(sys_abspath(root, path), T) -Base.read(root::AbstractFileSystemRoot, path::RelPath) = +Base.read(root::FileSystemRoot, path::RelPath) = read(sys_abspath(root, path)) -#-------------------------------------------------- -""" - -## Metadata spec - -For File: -``` - [datasets.storage] - driver="FileSystem" - type="File" - path=\$(path_to_file) -``` - -For FileTree: -``` - [datasets.storage] - driver="FileSystem" - type="FileTree" - path=\$(path_to_directory) -``` -""" -struct FileSystemRoot <: AbstractFileSystemRoot - path::String - write::Bool -end - -function FileSystemRoot(path::AbstractString; write=false) - path = abspath(path) - FileSystemRoot(path, write) -end - -iswriteable(root) = false -iswriteable(root::FileSystemRoot) = root.write - -sys_abspath(root::FileSystemRoot) = root.path - -function Base.abspath(relpath::RelPath) - Base.depwarn(""" - `abspath(::RelPath)` defaults to using `pwd()` as the root of the path - but this leads to fragile code so will be removed in the future""", - :abspath) - AbsPath(FileSystemRoot(pwd(); write=true), relpath) -end - #------------------------------------------------------------------------------- -# Infrastructure for a somewhat more functional interface for creating file -# trees than the fully mutable version we usually use. - -mutable struct TempFilesystemRoot <: AbstractFileSystemRoot - path::Union{Nothing,String} - function TempFilesystemRoot(path) - root = new(path) - finalizer(root) do r - if !isnothing(r.path) - rm(r.path, recursive=true, force=true) - end - end - return root - end -end - -function Base.readdir(root::TempFilesystemRoot, path::RelPath) - return isnothing(root.path) ? 
[] : readdir(sys_abspath(root, path)) -end - -iswriteable(root::TempFilesystemRoot) = true -sys_abspath(root::TempFilesystemRoot) = root.path +# Mutation via newdir/newfile +_temp_root(path) = FileSystemRoot(path, write=true, cleanup=true) """ newdir() @@ -159,10 +137,10 @@ temporary tree will be cleaned up during garbage collection. function newdir() # cleanup=false: we manage our own cleanup via the finalizer path = mktempdir(cleanup=false) - return FileTree(TempFilesystemRoot(path)) + return FileTree(FileSystemRoot(path, write=true, cleanup=true)) end -function newdir(root::AbstractFileSystemRoot, path::RelPath; overwrite=false) +function newdir(root::FileSystemRoot, path::RelPath; overwrite=false) p = sys_abspath(root, path) if overwrite rm(p, force=true, recursive=true) @@ -170,7 +148,6 @@ function newdir(root::AbstractFileSystemRoot, path::RelPath; overwrite=false) mkpath(p) end - function newfile(func=nothing) path, io = mktemp(cleanup=false) if func !== nothing @@ -183,57 +160,20 @@ function newfile(func=nothing) close(io) end end - return File(TempFilesystemRoot(path)) + return File(_temp_root(path)) end -function newfile(f::Function, root::AbstractFileSystemRoot, path::RelPath; kws...) +function newfile(f::Function, root::FileSystemRoot, path::RelPath; kws...) p = sys_abspath(root, path) mkpath(dirname(p)) open(f, p, write=true) end -function newfile(root::AbstractFileSystemRoot, path::RelPath; kws...) +function newfile(root::FileSystemRoot, path::RelPath; kws...) newfile(io->nothing, root, path; kws...) end #------------------------------------------------------------------------------- -# Deprecated newdir() and newfile() variants -function newdir(ctx::AbstractFileSystemRoot) - Base.depwarn(""" - `newdir(ctx::AbstractFileSystemRoot)` is deprecated. Use the in-place - version `newdir(::FileTree, path)` instead. 
- """, :newdir) - path = mktempdir(sys_abspath(ctx), cleanup=false) - return FileTree(TempFilesystemRoot(path)) -end - -function newfile(ctx::AbstractFileSystemRoot) - Base.depwarn(""" - `newfile(ctx::AbstractFileSystemRoot)` is deprecated. Use the in-place - version `newfile(::FileTree, path)` instead. - """, :newfile) - path, io = mktemp(sys_abspath(ctx), cleanup=false) - close(io) - return File(TempFilesystemRoot(path)) -end - -function newfile(f::Function, root::FileSystemRoot) - Base.depwarn(""" - `newfile(f::Function, ctx::AbstractFileSystemRoot)` is deprecated. - Use newfile() or the in-place version `newfile(::FileTree, path)` instead. - """, :newfile) - path, io = mktemp(sys_abspath(root), cleanup=false) - try - f(io) - catch - rm(path) - rethrow() - finally - close(io) - end - return File(TempFilesystemRoot(path)) -end - # Move srcpath to destpath, making all attempts to preserve the original # content of `destpath` if anything goes wrong. We assume that `srcpath` is # temporary content which doesn't need to be protected. @@ -278,37 +218,31 @@ function mv_force_with_dest_rollback(srcpath, destpath, tempdir_parent) end end -function Base.setindex!(tree::FileTree{<:AbstractFileSystemRoot}, - tmpdata::Union{File{TempFilesystemRoot},FileTree{TempFilesystemRoot}}, - name::AbstractString) +function Base.setindex!(tree::FileTree{FileSystemRoot}, + tmpdata::Union{File{FileSystemRoot},FileTree{FileSystemRoot}}, + path::AbstractString) if !iswriteable(tree.root) error("Attempt to move to a read-only tree $tree") end - if isnothing(tmpdata.root.path) + if !tmpdata.root.cleanup type = isdir(tmpdata) ? "directory" : "file" - error("Attempted to root a temporary $type which has already been moved to $(tree.path)/$name ") + error("Attempted to move $type which is already rooted in $(tmpdata.root)") end if !isempty(tree.path) # Eh, the number of ways the user can misuse this isn't really funny :-/ error("Temporary trees must be moved in full. 
The tree had non-empty path $(tree.path)") end - destpath = sys_abspath(joinpath(tree, name)) + destpath = sys_abspath(joinpath(tree, RelPath(path))) srcpath = sys_abspath(tmpdata) tempdir_parent = sys_abspath(tree) + mkpath(dirname(destpath)) mv_force_with_dest_rollback(srcpath, destpath, tempdir_parent) - # Transfer ownership of the data to `tree`. This is ugly to be sure, as it - # leaves `tmpdata` empty! However, we'll have to live with this wart unless - # we want to be duplicating large amounts of data on disk. - tmpdata.root.path = nothing + # Transfer ownership of the data to `tree`. + tmpdata.root.cleanup = false + tmpdata.root.path = destpath return tree end -# It's interesting to read about the linux VFS interface in regards to how the -# OS actually represents these things. For example -# https://stackoverflow.com/questions/36144807/why-does-linux-use-getdents-on-directories-instead-of-read - - - #-------------------------------------------------- @@ -333,3 +267,52 @@ function connect_filesystem(f, config, dataset) end add_storage_driver("FileSystem"=>connect_filesystem) + + +#------------------------------------------------------------------------------- +# Deprecations +function Base.abspath(relpath::RelPath) + Base.depwarn(""" + `abspath(::RelPath)` defaults to using `pwd()` as the root of the path + but this leads to fragile code so will be removed in the future""", + :abspath) + AbsPath(FileSystemRoot(pwd(); write=true), relpath) +end + +# Deprecated newdir() and newfile() variants +function newdir(ctx::FileSystemRoot) + Base.depwarn(""" + `newdir(ctx::FileSystemRoot)` is deprecated. Use the in-place + version `newdir(::FileTree, path)` instead. + """, :newdir) + path = mktempdir(sys_abspath(ctx), cleanup=false) + return FileTree(_temp_root(path)) +end + +function newfile(ctx::FileSystemRoot) + Base.depwarn(""" + `newfile(ctx::FileSystemRoot)` is deprecated. Use the in-place + version `newfile(::FileTree, path)` instead. 
+ """, :newfile) + path, io = mktemp(sys_abspath(ctx), cleanup=false) + close(io) + return File(_temp_root(path)) +end + +function newfile(f::Function, root::FileSystemRoot) + Base.depwarn(""" + `newfile(f::Function, ctx::FileSystemRoot)` is deprecated. + Use newfile() or the in-place version `newfile(::FileTree, path)` instead. + """, :newfile) + path, io = mktemp(sys_abspath(root), cleanup=false) + try + f(io) + catch + rm(path) + rethrow() + finally + close(io) + end + return File(_temp_root(path)) +end + diff --git a/test/FileTree.jl b/test/FileTree.jl index 2223799..8c92bd2 100644 --- a/test/FileTree.jl +++ b/test/FileTree.jl @@ -1,62 +1,4 @@ - -@testset "File API" begin - file = File(FileSystemRoot("data/file.txt")) - - @testset "metadata" begin - @test filesize(file) == 13 - @test isfile(file) - @test !isdir(file) - @test ispath(file) - end - - @testset "open()" begin - # Do-block based forms - @test open(identity, String, file) == "Hello world!\n" - @test String(open(identity, Vector{UInt8}, file)) == "Hello world!\n" - @test open(io->read(io,String), IO, file) == "Hello world!\n" - @test open(identity, File, file) === file - - # Unscoped forms - @test open(String, file) == "Hello world!\n" - @test String(open(Vector{UInt8}, file)) == "Hello world!\n" - @test read(open(IO, file), String) == "Hello world!\n" - - # Context-based forms - @context begin - @test @!(open(String, file)) == "Hello world!\n" - @test String(@! 
open(Vector{UInt8}, file)) == "Hello world!\n" - @test read(@!(open(IO, file)), String) == "Hello world!\n" - @test @!(open(File, file)) === file - end - end -end - @testset "FileTree API" begin - tree = FileTree(FileSystemRoot("data")) - - @testset "metadata" begin - @test !isfile(tree) - @test isdir(tree) - @test ispath(tree) - end - - @test tree["csvset"] isa FileTree - @test tree["csvset/1.csv"] isa File - @test tree["csvset"]["2.csv"] isa File - - @testset "open()" begin - @test open(identity, FileTree, tree) === tree - - # Context-based forms - @context begin - @test @!(open(FileTree, tree)) === tree - end - end - - # TODO: delete! -end - -@testset "newfile() and newdir()" begin @testset "isolated newfile" begin @test newfile() isa File @test read(newfile()) == [] @@ -82,6 +24,22 @@ end @test read(tree["d2/hi_2_1.txt"], String) == "hi 2/1\n" @test read(tree["d2/hi_2_2.txt"], String) == "hi 2/2\n" + @testset "metadata" begin + f = tree["d1/hi_1_1.txt"] + @test filesize(f) == 7 + @test isfile(f) + @test !isdir(f) + @test ispath(f) + + d = tree["d1"] + @test !isfile(d) + @test isdir(d) + @test ispath(d) + + @test haskey(tree, "d1") + @test !haskey(tree, "x") + end + @testset "Iteration" begin # keys @test keys(tree) == ["d1", "d2"] @@ -140,6 +98,88 @@ end @test newfile(tree3, "a/b/c") isa File @test tree3["a"]["b"]["c"] isa File end + + @testset "setindex!" begin + tree = newdir() + @test keys(tree) == [] + tree["a"] = newfile() + @test tree["a"] isa File + tree["b"] = newdir() + @test tree["b"] isa FileTree + tree["c/d"] = newfile() + @test tree["c"] isa FileTree + @test tree["c/d"] isa File + @test keys(tree) == ["a","b","c"] + d = newdir() + newfile(io->print(io, "E"), d, "e") + newfile(io->print(io, "F"), d, "f") + tree["x"] = d + @test read(tree["x/e"], String) == "E" + @test read(tree["x/f"], String) == "F" + end + + @testset "delete!" 
begin
+        tree = newdir()
+        newfile(tree, "a/b/c")
+        newfile(tree, "a/b/d")
+        @test keys(tree) == ["a"]
+        delete!(tree, "a")
+        @test keys(tree) == []
+        newfile(tree, "x")
+        @test keys(tree) == ["x"]
+        delete!(tree, "x")
+        @test keys(tree) == []
+    end
+
+    @testset "open(::File)" begin
+        file = newfile(io->print(io, "xx"))
+
+        # Do-block based forms
+        @test open(identity, String, file) == "xx"
+        @test String(open(identity, Vector{UInt8}, file)) == "xx"
+        @test open(io->read(io,String), IO, file) == "xx"
+        @test open(identity, File, file) === file
+
+        # Unscoped forms
+        @test open(String, file) == "xx"
+        @test String(open(Vector{UInt8}, file)) == "xx"
+        @test read(open(IO, file), String) == "xx"
+
+        # Context-based forms
+        @context begin
+            @test @!(open(String, file)) == "xx"
+            @test String(@! open(Vector{UInt8}, file)) == "xx"
+            @test read(@!(open(IO, file)), String) == "xx"
+            @test @!(open(File, file)) === file
+        end
+    end
+
+    @testset "open(::FileTree)" begin
+        tree = FileTree(FileSystemRoot("data"))
+
+        @test open(identity, FileTree, tree) === tree
+
+        # Context-based forms
+        @context begin
+            @test @!(open(FileTree, tree)) === tree
+        end
+    end
+end
+
+@testset "newfile / newdir cleanup" begin
+    f = newfile()
+    global sys_file_path = f.root.path
+    GC.@preserve f @test isfile(sys_file_path)
+    d = newdir()
+    global sys_dir_path = d.root.path
+    GC.@preserve d @test isdir(sys_dir_path)
+end
+# Having the following as a separate top level statement ensures that `f` and
+# `d` aren't accidentally still rooted so the GC can clean them up.
+@testset "newfile / newdir cleanup step 2" begin + GC.gc() + @test !ispath(sys_file_path) + @test !ispath(sys_dir_path) end #= From bb0e6e38fd5351eed45f6b9e6d5e84e12a251b01 Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Fri, 20 May 2022 13:14:26 +1000 Subject: [PATCH 12/33] Fix: Close temp files before newfile() returns --- src/filesystem.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/filesystem.jl b/src/filesystem.jl index f16d5e7..199403b 100644 --- a/src/filesystem.jl +++ b/src/filesystem.jl @@ -159,6 +159,8 @@ function newfile(func=nothing) finally close(io) end + else + close(io) end return File(_temp_root(path)) end From a6505b4cae387c80af7754081a4368fe932f9633 Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Fri, 20 May 2022 17:57:31 +1000 Subject: [PATCH 13/33] Documentation tweaks --- src/FileTree.jl | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/FileTree.jl b/src/FileTree.jl index 11eae62..2a354d7 100644 --- a/src/FileTree.jl +++ b/src/FileTree.jl @@ -225,27 +225,30 @@ Base.open(f::Function, path::AbsPath; kws...) = open(f, IO, path.root, path.path #------------------------------------------------------------------------------- """ + newdir() FileTree(root) -`FileTree` is a "directory tree" like hierarchy which may have `File`s and -`FileTree`s as children. +Create a `FileTree` which is a "directory tree" like hierarchy which may have +`File`s and `FileTree`s as children. `newdir()` creates the tree in a +temporary directory on the local filesystem. Alternative `root`s may be +supplied which store the data elsewhere. -The tree implements the `AbstracTrees.children()` interface and may be indexed -with paths to traverse the hierarchy down to the leaves ("files") which are of -type `File`. Individual leaves may be `open()`ed as various Julia types. 
+The tree implements the `AbstractTrees.children()` interface and may be indexed
+with `/`-separated paths to traverse the hierarchy down to the leaves which are
+of type `File`. Individual leaves may be `open()`ed as various Julia types.
 
 # Operations on FileTree
 
-FileTree has a largely dictionary-like interface:
+`FileTree` has a largely dictionary-like interface:
 * List keys (ie, file and directory names): `keys(tree)`
-* List keys and values: `pairs(tree)`
+* List key, value pairs: `pairs(tree)`
 * Query keys: `haskey(tree)`
-* Traverse the tree: `tree["path"]`
+* Traverse the tree: `tree["path"]`, `tree["multi/component/path"]`
 * Add new content: `newdir(tree, "path")`, `newfile(tree, "path")`
 * Delete content: `delete!(tree, "path")`
 
-Unlike Dict, iteration of FileTree iterates values (not key value pairs). This
+Iteration of FileTree iterates values (not key value pairs). This
 has some benefits - for example, broadcasting processing across files in a
 directory.
 
@@ -490,6 +493,9 @@ end
 # Deprecated
 function Base.rm(tree::FileTree; kws...)
     _check_writeable(tree)
+    Base.depwarn("""
+        `rm(::FileTree)` is deprecated. Use `delete!(tree, path)` instead.
+        """, :rm)
     rm(tree.root, tree.path; kws...)
end From 1430ef8e8e8b48b7705d386f7f647379a686b8d0 Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Mon, 23 May 2022 16:43:24 +1000 Subject: [PATCH 14/33] Additional DataSet configuration checking --- src/DataSet.jl | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/DataSet.jl b/src/DataSet.jl index 14278d7..2bd150a 100644 --- a/src/DataSet.jl +++ b/src/DataSet.jl @@ -15,6 +15,9 @@ struct DataSet function DataSet(conf) _check_keys(conf, DataSet, ["uuid"=>String, "storage"=>Dict, "name"=>String]) _check_keys(conf["storage"], DataSet, ["driver"=>String]) + _check_optional_keys(conf, + "description"=>AbstractString, + "tags"=>VectorOf(AbstractString)) check_dataset_name(conf["name"]) new(UUID(conf["uuid"]), conf) end @@ -52,12 +55,30 @@ function _check_keys(config, context, keys) Missing expected keys in $context: $missed_keys - In TOML fragment: + In DataSet fragment: $(sprint(TOML.print,config)) """) end end +struct VectorOf + T +end + +function _check_optional_keys(config, context, keys...) + for (k, check) in keys + if haskey(config, k) + v = config[k] + if check isa Type && !(v isa check) + error("""Invalid DataSet key $k. 
Expected type $check""") + elseif check isa VectorOf && !(v isa AbstractVector && + all(x isa check.T for x in v)) + error("""Invalid DataSet key $k""") + end + end + end +end + """ is_valid_dataset_name(name) From 97d973a5fb221a563714fdb5558e8a62fcddb582 Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Wed, 11 May 2022 14:47:57 +1000 Subject: [PATCH 15/33] Add config() API for configuring datasets --- src/DataSet.jl | 67 +++++++++++++++++++-------------------- src/data_project.jl | 47 ++++++++++++++++++++++++++- src/file_data_projects.jl | 67 +++++++++++++++++++++++++-------------- 3 files changed, 123 insertions(+), 58 deletions(-) diff --git a/src/DataSet.jl b/src/DataSet.jl index 2bd150a..123d00f 100644 --- a/src/DataSet.jl +++ b/src/DataSet.jl @@ -5,46 +5,21 @@ unopinionated about the underlying storage mechanism. The data in a `DataSet` has a type which implies an index; the index can be used to partition the data for processing. """ -struct DataSet - # For now, the representation `conf` contains data read directly from the - # TOML. Once the design has settled we might get some explicit fields and - # do validation. +mutable struct DataSet + project # AbstractDataProject owning this DataSet uuid::UUID # Unique identifier for the dataset. Use uuid4() to create these. + # The representation `conf` contains "configuration data" read directly from + # the TOML (or other data project source, eg json API etc) conf - function DataSet(conf) - _check_keys(conf, DataSet, ["uuid"=>String, "storage"=>Dict, "name"=>String]) - _check_keys(conf["storage"], DataSet, ["driver"=>String]) - _check_optional_keys(conf, - "description"=>AbstractString, - "tags"=>VectorOf(AbstractString)) - check_dataset_name(conf["name"]) - new(UUID(conf["uuid"]), conf) + function DataSet(project, conf) + _validate_dataset_config(conf) + new(project, UUID(conf["uuid"]), conf) end - - #= - name::String # Default name for convenience. 
- # The binding to an actual name is managed by the data - # project. - storage # Storage config and driver definition - maps::Vector{DataMap} - - # Generic dictionary of other properties... for now. Required properties - # will be moved - _other::Dict{Symbol,Any} - - #storage_id # unique identifier in storage backend, if it exists - #owner # Project or user who owns the data - #description::String - #type # Some representation of the type of data? - # # An array, blob, table, tree, etc - #cachable::Bool # Can the data be cached? It might not for data governance - # # reasons or it might change commonly. - ## A set of identifiers - #tags::Set{String} - =# end +DataSet(conf) = DataSet(nothing, conf) + _key_match(config, (k,T)::Pair) = haskey(config, k) && config[k] isa T _key_match(config, k::String) = haskey(config, k) @@ -79,6 +54,15 @@ function _check_optional_keys(config, context, keys...) end end +function _validate_dataset_config(conf) + _check_keys(conf, DataSet, ["uuid"=>String, "storage"=>Dict, "name"=>String]) + _check_keys(conf["storage"], DataSet, ["driver"=>String]) + _check_optional_keys(conf, + "description"=>AbstractString, + "tags"=>VectorOf(AbstractString)) + check_dataset_name(conf["name"]) +end + """ is_valid_dataset_name(name) @@ -158,6 +142,21 @@ function Base.show(io::IO, ::MIME"text/plain", d::DataSet) TOML.print(io, d.conf) end +function config(dataset::DataSet; kws...) + config(dataset.project, dataset; kws...) +end + +# The default case of a dataset config update when the update is independent of +# the project. (In general, projects may supply extra constraints.) +function config(::Nothing, dataset::DataSet; kws...) 
+ for (k,v) in pairs(kws) + if k in (:uuid, :name) + error("Cannot modify dataset config with key $k") + end + dataset.conf[string(k)] = v + end + return dataset +end #------------------------------------------------------------------------------- # Functions for opening datasets diff --git a/src/data_project.jl b/src/data_project.jl index cf99441..b1d655d 100644 --- a/src/data_project.jl +++ b/src/data_project.jl @@ -52,9 +52,37 @@ function dataset(proj::AbstractDataProject, spec::AbstractString) conf = copy(dataset.conf) conf["dataspec"] = dataspec + # This copy is problematic now that datasets can be mutated: how can the + # dataset point back to its project without "dataspec" being updated? return DataSet(conf) end +""" + config(name::AbstractString; kws...) + config(proj::AbstractDataProject, name::AbstractString; kws...) + + config(dataset::DataSet; kws...) + +Update the configuration of `dataset` with the given keyword arguments and +persist it in the dataset's project storage. The versions which take a `name` +use that name to search within the given data project. + +# Examples + +Update the dataset with name "SomeData" in the global project +``` +DataSets.config("SomeData"; description="This is a description") +``` + +Tag the dataset "SomeData" with tags "A" and "B". +``` +ds = dataset("SomeData") +config(ds, tags=["A", "B"]) +``` +""" +function config(project::AbstractDataProject, name::AbstractString; kws...) + config(project[name]; kws...) +end # Percent-decode a string according to the URI escaping rules. # Vendored from URIs.jl for now to avoid depending on that entire package for @@ -312,7 +340,7 @@ function load_project(config::AbstractDict; kws...) end proj = DataProject() for dataset_conf in config["datasets"] - dataset = DataSet(dataset_conf) + dataset = DataSet(proj, dataset_conf) proj[dataset.name] = dataset end if haskey(config, "drivers") @@ -326,3 +354,20 @@ function load_project(config::AbstractDict; kws...) 
proj end +function save_project(path::AbstractString, proj::DataProject) + # TODO: Put this TOML conversion in DataProject ? + conf = Dict( + "data_config_version"=>CURRENT_DATA_CONFIG_VERSION, + "datasets"=>[d.conf for (n,d) in proj.datasets], + "drivers"=>proj.drivers + ) + mktemp(dirname(path)) do tmppath, tmpio + TOML.print(tmpio, conf) + close(tmpio) + mv(tmppath, path, force=true) + end +end + +function config(name::AbstractString; kws...) + config(PROJECT, name; kws...) +end diff --git a/src/file_data_projects.jl b/src/file_data_projects.jl index 65a9979..29861a4 100644 --- a/src/file_data_projects.jl +++ b/src/file_data_projects.jl @@ -4,8 +4,8 @@ A cache of file content, parsed with an arbitrary parser function. This is a modified and generalized version of `Base.CachedTOMLDict`. -Getting the value of the cache with `f[]` will automatically update the parsed -value whenever the file changes. +Getting the value of the cache with `get_cache(f)` will automatically update +the parsed value whenever the file changes. """ mutable struct CachedParsedFile{T} path::String @@ -38,7 +38,7 @@ function CachedParsedFile{T}(parser::Function, path::String) where T ) end -function Base.getindex(f::CachedParsedFile) +function get_cache(f::CachedParsedFile, allow_refresh=true) s = stat(f.path) time_since_cached = time() - f.mtime rough_mtime_granularity = 0.1 # seconds @@ -59,6 +59,9 @@ function Base.getindex(f::CachedParsedFile) f.mtime = s.mtime f.size = s.size f.hash = new_hash + if !allow_refresh + error("The file at $(f.path) was written externally") + end @debug "Cache of file $(repr(f.path)) invalid, reparsing..." return f.d = f.parser(content) end @@ -68,17 +71,21 @@ end function Base.show(io::IO, m::MIME"text/plain", f::CachedParsedFile) println(io, "Cache of file $(repr(f.path)) with value") - show(io, m, f[]) + show(io, m, get_cache(f)) end # Parse Data.toml into DataProject which updates when the file does. 
-function parse_and_cache_project(sys_path::AbstractString) +function parse_and_cache_project(proj, sys_path::AbstractString) sys_data_dir = dirname(sys_path) CachedParsedFile{DataProject}(sys_path) do content if isnothing(content) DataProject() else - _load_project(String(content), sys_data_dir) + inner_proj = _load_project(String(content), sys_data_dir) + for d in inner_proj + d.project = proj + end + inner_proj end end end @@ -87,19 +94,19 @@ end abstract type AbstractTomlFileDataProject <: AbstractDataProject end function Base.get(proj::AbstractTomlFileDataProject, name::AbstractString, default) - get(_get_cached(proj), name, default) + get(get_cache(proj), name, default) end function Base.keys(proj::AbstractTomlFileDataProject) - keys(_get_cached(proj)) + keys(get_cache(proj)) end function Base.iterate(proj::AbstractTomlFileDataProject, state=nothing) # This is a little complex because we want iterate to work even if the # active project changes concurrently, which means wrapping up the initial - # result of _get_cached with the iterator state. + # result of get_cache with the iterator state. if isnothing(state) - cached_values = values(_get_cached(proj)) + cached_values = values(get_cache(proj)) if isnothing(cached_values) return nothing end @@ -116,9 +123,20 @@ function Base.iterate(proj::AbstractTomlFileDataProject, state=nothing) end end -Base.pairs(proj::AbstractTomlFileDataProject) = pairs(_get_cached(proj)) +Base.pairs(proj::AbstractTomlFileDataProject) = pairs(get_cache(proj)) -data_drivers(proj::AbstractTomlFileDataProject) = data_drivers(_get_cached(proj)) +data_drivers(proj::AbstractTomlFileDataProject) = data_drivers(get_cache(proj)) + +function config(proj::AbstractTomlFileDataProject, dataset::DataSet; kws...) + if dataset.project !== proj + error("dataset must belong to project") + end + # Here we accept the update independently of the project - Data.toml should + # be able to manage any dataset config. + config(nothing, dataset; kws...) 
+ save_project(proj.path, get_cache(proj, false)) + return dataset +end #------------------------------------------------------------------------------- """ @@ -128,15 +146,15 @@ filesystem. mutable struct TomlFileDataProject <: AbstractTomlFileDataProject path::String cache::CachedParsedFile{DataProject} + function TomlFileDataProject(path::String) + proj = new(path) + proj.cache = parse_and_cache_project(proj, path) + proj + end end -function TomlFileDataProject(path::String) - cache = parse_and_cache_project(path) - TomlFileDataProject(path, cache) -end - -function _get_cached(proj::TomlFileDataProject) - proj.cache[] +function get_cache(proj::TomlFileDataProject, refresh=true) + get_cache(proj.cache, refresh) end project_name(proj::TomlFileDataProject) = proj.path @@ -160,7 +178,7 @@ end function ActiveDataProject() proj = ActiveDataProject(nothing, DataProject()) - _get_cached(proj) + get_cache(proj) proj end @@ -170,20 +188,23 @@ function _active_project_data_toml(project_path=Base.active_project(false)) joinpath(dirname(project_path), "Data.toml") end -function _get_cached(proj::ActiveDataProject) +function get_cache(proj::ActiveDataProject, allow_refresh=true) active_project = Base.active_project(false) if proj.active_project_path != active_project + if !allow_refresh + error("The current project path was changed") + end # The unusual case: active project has changed. if isnothing(active_project) proj.cache = DataProject() else data_toml = _active_project_data_toml(active_project) # Need to re-cache - proj.cache = parse_and_cache_project(data_toml) + proj.cache = parse_and_cache_project(proj, data_toml) end proj.active_project_path = active_project end - proj.cache isa DataProject ? proj.cache : proj.cache[] + proj.cache isa DataProject ? 
proj.cache : get_cache(proj.cache, allow_refresh) end project_name(::ActiveDataProject) = _active_project_data_toml() From a12ca95743c8a858bd8a9511819b2077ff46002d Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Tue, 24 May 2022 16:43:58 +1000 Subject: [PATCH 16/33] Move config data validation into utils.jl --- src/DataSet.jl | 54 +++++++++---------------------------------------- src/DataSets.jl | 2 ++ src/utils.jl | 38 ++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 44 deletions(-) create mode 100644 src/utils.jl diff --git a/src/DataSet.jl b/src/DataSet.jl index 123d00f..66a0d93 100644 --- a/src/DataSet.jl +++ b/src/DataSet.jl @@ -20,40 +20,6 @@ end DataSet(conf) = DataSet(nothing, conf) -_key_match(config, (k,T)::Pair) = haskey(config, k) && config[k] isa T -_key_match(config, k::String) = haskey(config, k) - -function _check_keys(config, context, keys) - missed_keys = filter(k->!_key_match(config, k), keys) - if !isempty(missed_keys) - error(""" - Missing expected keys in $context: - $missed_keys - - In DataSet fragment: - $(sprint(TOML.print,config)) - """) - end -end - -struct VectorOf - T -end - -function _check_optional_keys(config, context, keys...) - for (k, check) in keys - if haskey(config, k) - v = config[k] - if check isa Type && !(v isa check) - error("""Invalid DataSet key $k. 
Expected type $check""") - elseif check isa VectorOf && !(v isa AbstractVector && - all(x isa check.T for x in v)) - error("""Invalid DataSet key $k""") - end - end - end -end - function _validate_dataset_config(conf) _check_keys(conf, DataSet, ["uuid"=>String, "storage"=>Dict, "name"=>String]) _check_keys(conf["storage"], DataSet, ["driver"=>String]) @@ -63,6 +29,16 @@ function _validate_dataset_config(conf) check_dataset_name(conf["name"]) end +function Base.show(io::IO, d::DataSet) + print(io, DataSet, "(name=$(repr(d.name)), uuid=$(repr(d.uuid)), #= … =#)") +end + +function Base.show(io::IO, ::MIME"text/plain", d::DataSet) + println(io, "DataSet instance:") + println(io) + TOML.print(io, d.conf) +end + """ is_valid_dataset_name(name) @@ -132,16 +108,6 @@ function dataspec_fragment_as_path(d::DataSet) return nothing end -function Base.show(io::IO, d::DataSet) - print(io, DataSet, "(name=$(repr(d.name)), uuid=$(repr(d.uuid)), #= … =#)") -end - -function Base.show(io::IO, ::MIME"text/plain", d::DataSet) - println(io, "DataSet instance:") - println(io) - TOML.print(io, d.conf) -end - function config(dataset::DataSet; kws...) config(dataset.project, dataset; kws...) end diff --git a/src/DataSets.jl b/src/DataSets.jl index 150f565..ca15b19 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -218,6 +218,8 @@ end #------------------------------------------------------------------------------- +include("utils.jl") + # Application entry points include("entrypoint.jl") diff --git a/src/utils.jl b/src/utils.jl new file mode 100644 index 0000000..a2cc260 --- /dev/null +++ b/src/utils.jl @@ -0,0 +1,38 @@ +# Some basic utilities to validate "config-like" data +# +# (Perhaps these could be replaced with the use of JSON schema or some such?) 
+ +_key_match(config, (k,T)::Pair) = haskey(config, k) && config[k] isa T +_key_match(config, k::String) = haskey(config, k) + +function _check_keys(config, context, keys) + missed_keys = filter(k->!_key_match(config, k), keys) + if !isempty(missed_keys) + error(""" + Missing expected keys in $context: + $missed_keys + + In DataSet fragment: + $(sprint(TOML.print,config)) + """) + end +end + +struct VectorOf + T +end + +function _check_optional_keys(config, context, keys...) + for (k, check) in keys + if haskey(config, k) + v = config[k] + if check isa Type && !(v isa check) + error("""Invalid DataSet key $k. Expected type $check""") + elseif check isa VectorOf && !(v isa AbstractVector && + all(x isa check.T for x in v)) + error("""Invalid DataSet key $k""") + end + end + end +end + From 4878c74b367615a292263039b3d9917cb2ebe37d Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Wed, 25 May 2022 17:34:02 +1000 Subject: [PATCH 17/33] Deprecate `@__DIR__` templating for use in Data.toml Templating paths with `@__DIR__` was always a hack because it was unaware of the structure of the TOML file and didn't generalize well. Replace this with querying of a `DataSet`s parent project for its absolute path (or rather, by asking the project to join its root path to the relative path contained in the Data.toml). This allows DataSet instances to be safely saved back to their parent projects. 
--- docs/src/Data.toml | 6 +-- docs/src/tutorial.md | 3 +- src/DataSet.jl | 15 ++++-- src/data_project.jl | 25 ++++----- src/file_data_projects.jl | 29 +++++++++-- src/filesystem.jl | 95 +++++++++++++++++++++++++++-------- test/Data.toml | 28 ++--------- test/active_project/Data.toml | 2 +- test/backend_compat.jl | 2 +- test/projects.jl | 10 +--- 10 files changed, 136 insertions(+), 79 deletions(-) diff --git a/docs/src/Data.toml b/docs/src/Data.toml index a9d6176..372f8d9 100644 --- a/docs/src/Data.toml +++ b/docs/src/Data.toml @@ -16,8 +16,8 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" # Data stored in FileSystem is either File (a file) or FileTree (a directory/folder) type="File" # Path with posix `/` separators. - # Use @__DIR__ for paths relative to Data.toml - path="@__DIR__/data/file.txt" + # Relative paths are relative to the location of Data.toml + path="data/file.txt" # A second example [[datasets]] @@ -28,7 +28,7 @@ uuid="e7fd7080-e346-4a68-9ca9-98593a99266a" [datasets.storage] driver="FileSystem" type="FileTree" - path="@__DIR__/data/csvset" + path="data/csvset" # Further datasets can be added as desired # [[datasets]] diff --git a/docs/src/tutorial.md b/docs/src/tutorial.md index bc8b53b..acac62b 100644 --- a/docs/src/tutorial.md +++ b/docs/src/tutorial.md @@ -12,6 +12,7 @@ DocTestFilters = [ r"path =.*", r"@.*", r"(?<=IOStream\().*", + r"(?<=TomlFileDataProject \[).*", ] ``` @@ -71,7 +72,7 @@ global configuration this is also possible: ```jldoctest julia> project = DataSets.load_project("src/Data.toml") -DataSets.DataProject: +DataSets.TomlFileDataProject [.../DataSets/docs/src/Data.toml]: 📄 a_text_file => b498f769-a7f6-4f67-8d74-40b770398f26 📁 a_tree_example => e7fd7080-e346-4a68-9ca9-98593a99266a diff --git a/src/DataSet.jl b/src/DataSet.jl index 66a0d93..8a9f016 100644 --- a/src/DataSet.jl +++ b/src/DataSet.jl @@ -85,10 +85,13 @@ function check_dataset_name(name::AbstractString) end end -# Hacky thing until we figure out which fields DataSet 
should actually have. +#------------------------------------------------------------------------------- +# API for DataSet type function Base.getproperty(d::DataSet, name::Symbol) - if name in fieldnames(DataSet) - return getfield(d, name) + if name === :uuid + getfield(d, :uuid) + elseif name === :conf + getfield(d, :conf) else getfield(d, :conf)[string(name)] end @@ -97,6 +100,10 @@ end Base.getindex(d::DataSet, name::AbstractString) = getindex(d.conf, name) Base.haskey(d::DataSet, name::AbstractString) = haskey(d.conf, name) +function data_project(dataset::DataSet) + return getfield(dataset, :project) +end + # Split the fragment section as a '/' separated RelPath function dataspec_fragment_as_path(d::DataSet) if haskey(d, "dataspec") @@ -109,7 +116,7 @@ function dataspec_fragment_as_path(d::DataSet) end function config(dataset::DataSet; kws...) - config(dataset.project, dataset; kws...) + config(data_project(dataset), dataset; kws...) end # The default case of a dataset config update when the update is independent of diff --git a/src/data_project.jl b/src/data_project.jl index b1d655d..fe7ec61 100644 --- a/src/data_project.jl +++ b/src/data_project.jl @@ -52,9 +52,10 @@ function dataset(proj::AbstractDataProject, spec::AbstractString) conf = copy(dataset.conf) conf["dataspec"] = dataspec - # This copy is problematic now that datasets can be mutated: how can the - # dataset point back to its project without "dataspec" being updated? - return DataSet(conf) + # FIXME: This copy is problematic now that datasets can be mutated with + # `DataSets.config()` as "dataspec" will infect the dataset when it's + # saved again. + return DataSet(data_project(dataset), conf) end """ @@ -310,22 +311,18 @@ end #------------------------------------------------------------------------------- """ - load_project(path; auto_update=false) - load_project(config_dict) + load_project(path) -Load a data project from a system `path` referring to a TOML file. 
If -`auto_update` is true, the returned project will monitor the file for updates -and reload when necessary. - -Alternatively, create a `DataProject` from a an existing dictionary -`config_dict`, which should be in the Data.toml format. +Load a data project from a system `path` referring to a TOML file. See also [`load_project!`](@ref). """ -function load_project(path::AbstractString; auto_update=false) +function load_project(path::AbstractString; auto_update=true) sys_path = abspath(path) - auto_update ? TomlFileDataProject(sys_path) : - _load_project(read(sys_path,String), dirname(sys_path)) + if !auto_update + Base.depwarn("`auto_update` is deprecated", :load_project) + end + TomlFileDataProject(sys_path) end function load_project(config::AbstractDict; kws...) diff --git a/src/file_data_projects.jl b/src/file_data_projects.jl index 29861a4..2ded53f 100644 --- a/src/file_data_projects.jl +++ b/src/file_data_projects.jl @@ -128,7 +128,7 @@ Base.pairs(proj::AbstractTomlFileDataProject) = pairs(get_cache(proj)) data_drivers(proj::AbstractTomlFileDataProject) = data_drivers(get_cache(proj)) function config(proj::AbstractTomlFileDataProject, dataset::DataSet; kws...) - if dataset.project !== proj + if data_project(dataset) !== proj error("dataset must belong to project") end # Here we accept the update independently of the project - Data.toml should @@ -157,6 +157,10 @@ function get_cache(proj::TomlFileDataProject, refresh=true) get_cache(proj.cache, refresh) end +function local_data_abspath(proj::TomlFileDataProject, relpath) + return joinpath(dirname(proj.path), relpath) +end + project_name(proj::TomlFileDataProject) = proj.path #------------------------------------------------------------------------------ @@ -207,6 +211,13 @@ function get_cache(proj::ActiveDataProject, allow_refresh=true) proj.cache isa DataProject ? 
proj.cache : get_cache(proj.cache, allow_refresh) end +function local_data_abspath(proj::ActiveDataProject, relpath) + if isnothing(proj.active_project_path) + error("No active project") + end + return joinpath(dirname(proj.active_project_path), relpath) +end + project_name(::ActiveDataProject) = _active_project_data_toml() #------------------------------------------------------------------------------- @@ -217,7 +228,15 @@ function _fill_template(toml_path, toml_str) if Sys.iswindows() toml_path = replace(toml_path, '\\'=>'/') end - toml_str = replace(toml_str, "@__DIR__"=>toml_path) + if occursin("@__DIR__", toml_str) + Base.depwarn(""" + Using @__DIR__ in Data.toml is deprecated. Use a '/'-separated + relative path instead.""", + :_fill_template) + return replace(toml_str, "@__DIR__"=>toml_path) + else + return toml_str + end end function _load_project(content::AbstractString, sys_data_dir) @@ -246,13 +265,17 @@ function from_path(path::AbstractString) throw(ArgumentError(msg)) end + path_key = Sys.isunix() ? "unix_path" : + Sys.iswindows() ? 
"windows_path" : + error("Unknown system: cannot determine path type") + conf = Dict( "name"=>make_valid_dataset_name(path), "uuid"=>string(uuid4()), "storage"=>Dict( "driver"=>"FileSystem", "type"=>dtype, - "path"=>abspath(path), + path_key=>abspath(path), ) ) diff --git a/src/filesystem.jl b/src/filesystem.jl index 199403b..73d1803 100644 --- a/src/filesystem.jl +++ b/src/filesystem.jl @@ -1,24 +1,6 @@ """ Root storage object for trees which are rooted in the file system (in git terminology, there exists a "working copy") - -## Metadata spec - -For File: -``` - [datasets.storage] - driver="FileSystem" - type="File" - path=\$(path_to_file) -``` - -For FileTree: -``` - [datasets.storage] - driver="FileSystem" - type="FileTree" - path=\$(path_to_directory) -``` """ mutable struct FileSystemRoot path::String @@ -247,10 +229,83 @@ end #-------------------------------------------------- +# FileSystem storage driver + +""" + local_data_abspath(project, relpath) + +Return the absolute path of data on disk where `relpath` is relative to +`project`. + +This function must be implemented for any `AbstractDataProject` subtype which +intends to support the `FileSystem` data driver. 
+""" +function local_data_abspath +end + + +function local_data_abspath(::Nothing, path) + error("Path must be absolute for DataSets without parent data projects") +end -# Filesystem storage driver + +""" +## Metadata spec + +For File: +``` + [datasets.storage] + driver="FileSystem" + type="File" + \$path_key=\$(path_string) +``` + +For FileTree: +``` + [datasets.storage] + driver="FileSystem" + type="FileTree" + \$path_key=\$(path_string) +``` + +`path_key` should be one of the following forms: +``` + path=\$(relative_slash_separated_path_to_file) + unix_path=\$(absolute_unix_path_to_file) + windows_path=\$(absolute_windows_path_to_file) +``` +""" function connect_filesystem(f, config, dataset) - path = config["path"] + # Paths keys can be in three forms documented above; + if haskey(config, "path") + pathstr = config["path"] + # Local absolute paths are not portable. Previously these were allowed + # in the "path" key, but those are now deprecated in favor of + # system-specific path keys unix_path or windows_path + if isabspath(pathstr) + Base.depwarn(""" + Absolute paths in Data.toml are deprecated. Instead, use relative + paths (separated with `/`) relative to the Data.toml location.""", + :connect_filesystem) + path = pathstr + else + if '\\' in pathstr && Sys.iswindows() + # Heuristic deprecation warning for windows paths in Data.toml + Base.depwarn( + "Relative paths in Data.toml should be separated with '/' characters.", + :connect_filesystem) + pathstr = join(split(pathstr, '\\'), '/') + end + relpath = joinpath(split(pathstr, '/')...) 
+ path = local_data_abspath(data_project(dataset), relpath) + end + elseif haskey(config, "unix_path") && Sys.isunix() + path = config["unix_path"] + elseif haskey(config, "windows_path") && Sys.iswindows() + path = config["windows_path"] + else + error("No \"path\" key found for FileSystem storage driver.") + end type = config["type"] if type in ("File", "Blob") isfile(path) || throw(ArgumentError("$(repr(path)) should be a file")) diff --git a/test/Data.toml b/test/Data.toml index d9e85d2..3ddbdc0 100644 --- a/test/Data.toml +++ b/test/Data.toml @@ -12,16 +12,7 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" [datasets.storage] driver="FileSystem" type="File" - path="@__DIR__/data/file.txt" - - # TODO: We'd like a layering abstraction. - - # [[datasets.maps]] - # type="File" - # - # [[datasets.maps]] - # type="text" - # parameters={encoding="UTF-8"} + path="data/file.txt" [[datasets]] description="A text file with namespace" @@ -31,7 +22,7 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" [datasets.storage] driver="FileSystem" type="File" - path="@__DIR__/data/file.txt" + path="data/file.txt" #-------------------------------------------------- [[datasets]] @@ -42,15 +33,7 @@ uuid="2d126588-5f76-4e53-8245-87dc91625bf4" [datasets.storage] driver="FileSystem" type="File" - path="@__DIR__/data/people.csv.gz" - - #[[datasets.maps]] - #type="GZip" - # - #[[datasets.maps]] - #type="CSV" - #parameters={delim=","} - + path="data/people.csv.gz" #-------------------------------------------------- [[datasets]] @@ -60,10 +43,7 @@ uuid="e7fd7080-e346-4a68-9ca9-98593a99266a" [datasets.storage] driver="FileSystem" type="FileTree" - path="@__DIR__/data/csvset" - - # TODO: Add data maps here which expose it logically as a single CSV? 
- + path="data/csvset" #-------------------------------------------------- # Data embedded in the TOML diff --git a/test/active_project/Data.toml b/test/active_project/Data.toml index 03158c8..6d5fb9d 100644 --- a/test/active_project/Data.toml +++ b/test/active_project/Data.toml @@ -8,4 +8,4 @@ uuid="314996ef-12be-40d0-912c-9755af354fdb" [datasets.storage] driver="FileSystem" type="File" - path="@__DIR__/data/file.txt" + path="data/file.txt" diff --git a/test/backend_compat.jl b/test/backend_compat.jl index c2f8335..27914b8 100644 --- a/test/backend_compat.jl +++ b/test/backend_compat.jl @@ -82,7 +82,7 @@ DataSets.add_storage_driver("OldBackendAPI"=>connect_old_backend) @test read(open(dataset(proj, "old_backend_tree"))[path"b.txt"], String) == "b" end -@testset "Compat for renaming Blob->File, BlobTree->FileTree" begin +@testset "Compat for @__DIR__ and renaming Blob->File, BlobTree->FileTree" begin proj = DataSets.load_project(joinpath(@__DIR__, "DataCompat.toml")) text_data = dataset(proj, "a_text_file") diff --git a/test/projects.jl b/test/projects.jl index 5ba13e4..5e9f7cf 100644 --- a/test/projects.jl +++ b/test/projects.jl @@ -33,12 +33,6 @@ test_project_names = ["a_text_file", # identity @test project_name(proj) == abspath("Data.toml") - - # Test @__DIR__ templating - # Use `cleanpath` as there's currently a mixture of / and \ on windows - # which does work, but is quite ugly. 
- cleanpath(p) = replace(p, '\\'=>'/') - @test cleanpath(proj["a_text_file"].storage["path"]) == cleanpath(joinpath(@__DIR__, "data", "file.txt")) end @testset "TomlFileDataProject live updates" begin @@ -55,7 +49,7 @@ end [datasets.storage] driver="FileSystem" type="File" - path="@__DIR__/data/file.txt" + path="data/file.txt" """) flush(io) @@ -74,7 +68,7 @@ end [datasets.storage] driver="FileSystem" type="File" - path="@__DIR__/data/file2.txt" + path="data/file2.txt" """) flush(io) From 9b4bfd1553ac6ebd1a9d7311b9e5e178b6087413 Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Thu, 26 May 2022 17:46:21 +1000 Subject: [PATCH 18/33] Rename config() -> config!() --- docs/src/reference.md | 22 ++++++++++++++++++---- src/DataSet.jl | 8 +++++--- src/data_project.jl | 20 ++++++++++---------- src/file_data_projects.jl | 4 ++-- test/active_project/Data.toml | 2 +- test/projects.jl | 2 +- test/runtests.jl | 2 +- 7 files changed, 38 insertions(+), 22 deletions(-) diff --git a/docs/src/reference.md b/docs/src/reference.md index a99ac4b..4334183 100644 --- a/docs/src/reference.md +++ b/docs/src/reference.md @@ -3,12 +3,18 @@ ## Using datasets The primary mechanism for loading datasets is the `dataset` function, coupled -with `open()` to open the resulting `DataSet` as some Julia type. In addition, -DataSets.jl provides two macros [`@datafunc`](@ref) and [`@datarun`](@ref) to -help in creating program entry points and running them. +with `open()` to open the resulting `DataSet` as some Julia type. ```@docs dataset +``` + +In addition, DataSets.jl provides two macros [`@datafunc`](@ref) and +[`@datarun`](@ref) to help in creating program entry points and running them. +Note that these APIs aren't fully formed and might be deprecated before +DataSets-1.0. + +```@docs @datafunc @datarun ``` @@ -46,6 +52,14 @@ DataSets.ActiveDataProject DataSets.TomlFileDataProject ``` +### Modifying datasets + +The metadata for a dataset may be updated using `config!` + +```@docs +DataSets.config! 
+``` + ## Data Models for files and directories DataSets provides some builtin data models [`File`](@ref) and @@ -62,7 +76,7 @@ newdir ## Storage Drivers -To add a new kind of data storage backend, implement [`DataSets.add_storage_driver`](@ref) +To add a new kind of data storage backend, call [`DataSets.add_storage_driver`](@ref) ```@docs DataSets.add_storage_driver diff --git a/src/DataSet.jl b/src/DataSet.jl index 8a9f016..d5d756d 100644 --- a/src/DataSet.jl +++ b/src/DataSet.jl @@ -115,16 +115,18 @@ function dataspec_fragment_as_path(d::DataSet) return nothing end -function config(dataset::DataSet; kws...) - config(data_project(dataset), dataset; kws...) +function config!(dataset::DataSet; kws...) + config!(data_project(dataset), dataset; kws...) end # The default case of a dataset config update when the update is independent of # the project. (In general, projects may supply extra constraints.) -function config(::Nothing, dataset::DataSet; kws...) +function config!(::Nothing, dataset::DataSet; kws...) for (k,v) in pairs(kws) if k in (:uuid, :name) error("Cannot modify dataset config with key $k") + # TODO: elseif k === :storage + # Check consistency using storage driver API? end dataset.conf[string(k)] = v end diff --git a/src/data_project.jl b/src/data_project.jl index fe7ec61..05cc468 100644 --- a/src/data_project.jl +++ b/src/data_project.jl @@ -53,16 +53,16 @@ function dataset(proj::AbstractDataProject, spec::AbstractString) conf["dataspec"] = dataspec # FIXME: This copy is problematic now that datasets can be mutated with - # `DataSets.config()` as "dataspec" will infect the dataset when it's + # `DataSets.config!()` as "dataspec" will infect the dataset when it's # saved again. return DataSet(data_project(dataset), conf) end """ - config(name::AbstractString; kws...) - config(proj::AbstractDataProject, name::AbstractString; kws...) + config!(name::AbstractString; kws...) + config!(proj::AbstractDataProject, name::AbstractString; kws...) 
- config(dataset::DataSet; kws...) + config!(dataset::DataSet; kws...) Update the configuration of `dataset` with the given keyword arguments and persist it in the dataset's project storage. The versions which take a `name` @@ -72,17 +72,17 @@ use that name to search within the given data project. Update the dataset with name "SomeData" in the global project ``` -DataSets.config("SomeData"; description="This is a description") +DataSets.config!("SomeData"; description="This is a description") ``` Tag the dataset "SomeData" with tags "A" and "B". ``` ds = dataset("SomeData") -config(ds, tags=["A", "B"]) +DataSets.config!(ds, tags=["A", "B"]) ``` """ -function config(project::AbstractDataProject, name::AbstractString; kws...) - config(project[name]; kws...) +function config!(project::AbstractDataProject, name::AbstractString; kws...) + config!(project[name]; kws...) end # Percent-decode a string according to the URI escaping rules. @@ -365,6 +365,6 @@ function save_project(path::AbstractString, proj::DataProject) end end -function config(name::AbstractString; kws...) - config(PROJECT, name; kws...) +function config!(name::AbstractString; kws...) + config!(PROJECT, name; kws...) end diff --git a/src/file_data_projects.jl b/src/file_data_projects.jl index 2ded53f..6eb8879 100644 --- a/src/file_data_projects.jl +++ b/src/file_data_projects.jl @@ -127,13 +127,13 @@ Base.pairs(proj::AbstractTomlFileDataProject) = pairs(get_cache(proj)) data_drivers(proj::AbstractTomlFileDataProject) = data_drivers(get_cache(proj)) -function config(proj::AbstractTomlFileDataProject, dataset::DataSet; kws...) +function config!(proj::AbstractTomlFileDataProject, dataset::DataSet; kws...) if data_project(dataset) !== proj error("dataset must belong to project") end # Here we accept the update independently of the project - Data.toml should # be able to manage any dataset config. - config(nothing, dataset; kws...) + config!(nothing, dataset; kws...) 
save_project(proj.path, get_cache(proj, false)) return dataset end diff --git a/test/active_project/Data.toml b/test/active_project/Data.toml index 6d5fb9d..64f90df 100644 --- a/test/active_project/Data.toml +++ b/test/active_project/Data.toml @@ -1,4 +1,4 @@ -data_config_version=0 +data_config_version=1 [[datasets]] description="A text file" diff --git a/test/projects.jl b/test/projects.jl index 5e9f7cf..cd448b6 100644 --- a/test/projects.jl +++ b/test/projects.jl @@ -39,7 +39,7 @@ end # Test live updating when the file is rewritten mktemp() do path,io write(io, """ - data_config_version=0 + data_config_version=1 [[datasets]] description="A text file" diff --git a/test/runtests.jl b/test/runtests.jl index a7d65fc..0b2925d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -20,7 +20,7 @@ end @testset "DataSet config from Dict" begin config = Dict( - "data_config_version"=>0, + "data_config_version"=>1, "datasets"=>[Dict( "description"=>"A text file", "name"=>"a_text_file", From 10140a92a829fc26102d8827c017981c99b0d977 Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Thu, 26 May 2022 17:46:31 +1000 Subject: [PATCH 19/33] Test + docs for DataSets.config!() --- src/DataSet.jl | 15 ++++++++++ src/data_project.jl | 17 +++++------ src/file_data_projects.jl | 32 ++++++++++++++++++-- test/projects.jl | 63 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 114 insertions(+), 13 deletions(-) diff --git a/src/DataSet.jl b/src/DataSet.jl index d5d756d..8a1843d 100644 --- a/src/DataSet.jl +++ b/src/DataSet.jl @@ -97,6 +97,10 @@ function Base.getproperty(d::DataSet, name::Symbol) end end +function Base.setproperty!(d::DataSet, name::Symbol, x) + config!(d; name=>x) +end + Base.getindex(d::DataSet, name::AbstractString) = getindex(d.conf, name) Base.haskey(d::DataSet, name::AbstractString) = haskey(d.conf, name) @@ -128,6 +132,17 @@ function config!(::Nothing, dataset::DataSet; kws...) # TODO: elseif k === :storage # Check consistency using storage driver API? 
end + # TODO: Fold these schema checks in with _validate_dataset_config + # somehow. + if k === :description + if !(v isa AbstractString) + error("Dataset description must be a string") + end + elseif k === :tags + if !(v isa AbstractVector && all(x isa AbstractString for x in v)) + error("Dataset tags must be a vector of strings") + end + end dataset.conf[string(k)] = v end return dataset diff --git a/src/data_project.jl b/src/data_project.jl index 05cc468..9458c82 100644 --- a/src/data_project.jl +++ b/src/data_project.jl @@ -70,15 +70,16 @@ use that name to search within the given data project. # Examples -Update the dataset with name "SomeData" in the global project +Update the description of the dataset named `"SomeData"` in the global project: ``` DataSets.config!("SomeData"; description="This is a description") ``` -Tag the dataset "SomeData" with tags "A" and "B". +Alternatively, setting `DataSet` properties can be used to update metadata. For +example, to tag the dataset "SomeData" with tags `"A"` and `"B"`. ``` ds = dataset("SomeData") -DataSets.config!(ds, tags=["A", "B"]) +ds.tags = ["A", "B"] ``` """ function config!(project::AbstractDataProject, name::AbstractString; kws...) @@ -351,18 +352,14 @@ function load_project(config::AbstractDict; kws...) proj end -function save_project(path::AbstractString, proj::DataProject) - # TODO: Put this TOML conversion in DataProject ? +function project_toml(proj::DataProject) + # FIXME: Preserve other unknown keys here for forward compatibility. conf = Dict( "data_config_version"=>CURRENT_DATA_CONFIG_VERSION, "datasets"=>[d.conf for (n,d) in proj.datasets], "drivers"=>proj.drivers ) - mktemp(dirname(path)) do tmppath, tmpio - TOML.print(tmpio, conf) - close(tmpio) - mv(tmppath, path, force=true) - end + return sprint(TOML.print, conf) end function config!(name::AbstractString; kws...) 
diff --git a/src/file_data_projects.jl b/src/file_data_projects.jl index 6eb8879..1f0e194 100644 --- a/src/file_data_projects.jl +++ b/src/file_data_projects.jl @@ -69,6 +69,20 @@ function get_cache(f::CachedParsedFile, allow_refresh=true) return f.d end +function set_cache(f::CachedParsedFile, content::AbstractString) + mktemp(dirname(f.path)) do tmppath, tmpio + write(tmpio, content) + close(tmpio) + # Uses mktemp() + mv() to atomically overwrite the file + mv(tmppath, f.path, force=true) + end + s = stat(f.path) + f.inode = s.inode + f.mtime = s.mtime + f.size = s.size + f.hash = sha1(content) +end + function Base.show(io::IO, m::MIME"text/plain", f::CachedParsedFile) println(io, "Cache of file $(repr(f.path)) with value") show(io, m, get_cache(f)) @@ -83,7 +97,9 @@ function parse_and_cache_project(proj, sys_path::AbstractString) else inner_proj = _load_project(String(content), sys_data_dir) for d in inner_proj - d.project = proj + # Hack; we steal ownership from the DataProject here. + # What's a better way to do this? + setfield!(d, :project, proj) end inner_proj end @@ -134,7 +150,7 @@ function config!(proj::AbstractTomlFileDataProject, dataset::DataSet; kws...) # Here we accept the update independently of the project - Data.toml should # be able to manage any dataset config. config!(nothing, dataset; kws...) - save_project(proj.path, get_cache(proj, false)) + set_cache(proj, project_toml(get_cache(proj, false))) return dataset end @@ -157,6 +173,10 @@ function get_cache(proj::TomlFileDataProject, refresh=true) get_cache(proj.cache, refresh) end +function set_cache(proj::TomlFileDataProject, content::AbstractString) + set_cache(proj.cache, content) +end + function local_data_abspath(proj::TomlFileDataProject, relpath) return joinpath(dirname(proj.path), relpath) end @@ -211,6 +231,14 @@ function get_cache(proj::ActiveDataProject, allow_refresh=true) proj.cache isa DataProject ? 
proj.cache : get_cache(proj.cache, allow_refresh) end +function set_cache(proj::ActiveDataProject, content::AbstractString) + if proj.cache isa DataProject + error("No current active project") + else + set_cache(proj.cache, content) + end +end + function local_data_abspath(proj::ActiveDataProject, relpath) if isnothing(proj.active_project_path) error("No active project") diff --git a/test/projects.jl b/test/projects.jl index cd448b6..61a49f8 100644 --- a/test/projects.jl +++ b/test/projects.jl @@ -6,7 +6,8 @@ using DataSets: TomlFileDataProject, ActiveDataProject, StackedDataProject, - project_name + project_name, + config! test_project_names = ["a_text_file", "a_tree_example", @@ -130,3 +131,63 @@ end @test project_name(DataSets.PROJECT.projects[1]) == joinpath(@__DIR__, "Data.toml") end +@testset "config!() metadata update" begin + # Test live updating when the file is rewritten + mktempdir() do tmppath + data_toml_path = joinpath(tmppath, "Data.toml") + open(data_toml_path, write=true) do io + write(io, """ + data_config_version=1 + + [[datasets]] + description="A" + name="a_text_file" + uuid="b498f769-a7f6-4f67-8d74-40b770398f26" + + [datasets.storage] + driver="FileSystem" + type="File" + path="data/file.txt" + """) + end + + proj = TomlFileDataProject(data_toml_path) + @testset "config!(proj, ...)" begin + @test dataset(proj, "a_text_file").description == "A" + config!(proj, "a_text_file", description="B") + config!(proj, "a_text_file", tags=Any["xx", "yy"]) + @test dataset(proj, "a_text_file").description == "B" + @test dataset(proj, "a_text_file").tags == ["xx", "yy"] + end + + @testset "Persistence on disk" begin + proj2 = TomlFileDataProject(data_toml_path) + @test dataset(proj2, "a_text_file").description == "B" + @test dataset(proj2, "a_text_file").tags == ["xx", "yy"] + end + + @testset "config! 
via DataSet instances" begin + ds = dataset(proj, "a_text_file") + config!(ds, description = "C") + @test dataset(proj, "a_text_file").description == "C" + ds.description = "D" + @test dataset(proj, "a_text_file").description == "D" + end + + @testset "description and tags validation" begin + ds = dataset(proj, "a_text_file") + @test_throws Exception config!(ds, description = 1) + @test_throws Exception config!(ds, tags = "hi") + end + + @testset "global config! methods" begin + empty!(DataSets.PROJECT) + pushfirst!(DataSets.PROJECT, TomlFileDataProject(data_toml_path)) + + config!("a_text_file", description="X") + @test dataset("a_text_file").description == "X" + + empty!(DataSets.PROJECT) + end + end +end From bf6b2951841d59059c4e4aab6415e57f0f6344f0 Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Fri, 27 May 2022 11:56:57 +1000 Subject: [PATCH 20/33] Replace templated @__DIR__ with "." This improves path handling for data projects which will be saved back to disk. --- src/file_data_projects.jl | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/file_data_projects.jl b/src/file_data_projects.jl index 1f0e194..79f3773 100644 --- a/src/file_data_projects.jl +++ b/src/file_data_projects.jl @@ -250,25 +250,20 @@ project_name(::ActiveDataProject) = _active_project_data_toml() #------------------------------------------------------------------------------- -function _fill_template(toml_path, toml_str) - # Super hacky templating for paths relative to the toml file. - # We really should have something a lot nicer here... - if Sys.iswindows() - toml_path = replace(toml_path, '\\'=>'/') - end +function _fill_template(toml_str) if occursin("@__DIR__", toml_str) Base.depwarn(""" Using @__DIR__ in Data.toml is deprecated. 
Use a '/'-separated relative path instead.""", :_fill_template) - return replace(toml_str, "@__DIR__"=>toml_path) + return replace(toml_str, "@__DIR__"=>".") else return toml_str end end function _load_project(content::AbstractString, sys_data_dir) - toml_str = _fill_template(sys_data_dir, content) + toml_str = _fill_template(content) config = TOML.parse(toml_str) load_project(config) end From ad3c9a9f42b2de275b04bb7ddb3a3ea69791424a Mon Sep 17 00:00:00 2001 From: Morten Piibeleht Date: Tue, 1 Nov 2022 09:33:29 +1300 Subject: [PATCH 21/33] Fix nightly tests (#50) * Use Pkg.develop instead of LOAD_PATH * Expand CI test matrix to include 1.5, 1.7 1.5 is the lowest version we support here. --- .github/workflows/ci.yml | 2 ++ test/driver_autoload.jl | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e362dd7..aed7db1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,6 +13,8 @@ jobs: strategy: matrix: version: + - '1.5' + - '1.7' - '1' - 'nightly' os: diff --git a/test/driver_autoload.jl b/test/driver_autoload.jl index 18442e2..1f30236 100644 --- a/test/driver_autoload.jl +++ b/test/driver_autoload.jl @@ -1,6 +1,6 @@ @testset "Automatic code loading for drivers" begin empty!(DataSets.PROJECT) - pushfirst!(LOAD_PATH, abspath("drivers")) + Pkg.develop(path=joinpath(@__DIR__, "drivers", "DummyStorageBackends")) ENV["JULIA_DATASETS_PATH"] = joinpath(@__DIR__, "DriverAutoloadData.toml") DataSets.__init__() @test haskey(DataSets._storage_drivers, "DummyTomlStorage") From 812d71d70ebfda043696176192e0c449841cce88 Mon Sep 17 00:00:00 2001 From: Morten Piibeleht Date: Fri, 4 Nov 2022 07:40:21 +1300 Subject: [PATCH 22/33] Remove DataSets._current_project (#53) --- src/DataSets.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/DataSets.jl b/src/DataSets.jl index ca15b19..b4b43d2 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -139,9 +139,6 @@ Additional projects may be 
added or removed from the stack with `pushfirst!`, """ PROJECT = StackedDataProject() -# deprecated. TODO: Remove dependency on this from JuliaHub -_current_project = DataProject() - _isprecompiling() = ccall(:jl_generating_output, Cint, ()) == 1 function __init__() From b8192dace14c3576883823e99d53616d3c96633d Mon Sep 17 00:00:00 2001 From: Morten Piibeleht Date: Fri, 4 Nov 2022 07:40:33 +1300 Subject: [PATCH 23/33] Remove manifest, gitignore generated docs files (#51) --- .github/workflows/ci.yml | 2 +- .gitignore | 2 + docs/Manifest.toml | 147 --------------------------------------- docs/Project.toml | 3 + 4 files changed, 6 insertions(+), 148 deletions(-) delete mode 100644 docs/Manifest.toml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index aed7db1..2a6568f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -48,7 +48,7 @@ jobs: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@latest with: - version: '1.6' + version: '1' - run: julia --project=docs -e ' using Pkg; Pkg.develop(PackageSpec(; path=pwd())); diff --git a/.gitignore b/.gitignore index ba39cc5..2ece7b6 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ Manifest.toml +/docs/build +/docs/Manifest.toml diff --git a/docs/Manifest.toml b/docs/Manifest.toml deleted file mode 100644 index c822d6d..0000000 --- a/docs/Manifest.toml +++ /dev/null @@ -1,147 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -[[ArgTools]] -uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" - -[[Artifacts]] -uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" - -[[Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[DocStringExtensions]] -deps = ["LibGit2", "Markdown", "Pkg", "Test"] -git-tree-sha1 = "9d4f64f79012636741cf01133158a54b24924c32" -uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.4" - -[[Documenter]] -deps = ["Base64", "Dates", 
"DocStringExtensions", "IOCapture", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] -git-tree-sha1 = "3ebb967819b284dc1e3c0422229b58a40a255649" -uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -version = "0.26.3" - -[[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] -uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" - -[[IOCapture]] -deps = ["Logging"] -git-tree-sha1 = "377252859f740c217b936cebcd918a44f9b53b59" -uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89" -version = "0.1.1" - -[[InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[JSON]] -deps = ["Dates", "Mmap", "Parsers", "Unicode"] -git-tree-sha1 = "81690084b6198a2e1da36fcfda16eeca9f9f24e4" -uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -version = "0.21.1" - -[[LibCURL]] -deps = ["LibCURL_jll", "MozillaCACerts_jll"] -uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" - -[[LibCURL_jll]] -deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] -uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" - -[[LibGit2]] -deps = ["Base64", "NetworkOptions", "Printf", "SHA"] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[LibSSH2_jll]] -deps = ["Artifacts", "Libdl", "MbedTLS_jll"] -uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" - -[[Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[MbedTLS_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" - -[[Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[MozillaCACerts_jll]] -uuid = "14a3606d-f60d-562e-9121-12d972cd8159" - -[[NetworkOptions]] -uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" - -[[Parsers]] -deps = ["Dates"] -git-tree-sha1 = "c8abc88faa3f7a3950832ac5d6e690881590d6dc" -uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "1.1.0" - -[[Pkg]] -deps = ["Artifacts", 
"Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" - -[[Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[Random]] -deps = ["Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" - -[[Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[Sockets]] -uuid = "6462fe0b-24de-5631-8697-dd941f90decc" - -[[TOML]] -deps = ["Dates"] -uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" - -[[Tar]] -deps = ["ArgTools", "SHA"] -uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" - -[[Test]] -deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" - -[[Zlib_jll]] -deps = ["Libdl"] -uuid = "83775a58-1f1d-513f-b197-d71354ab007a" - -[[nghttp2_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" - -[[p7zip_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/docs/Project.toml b/docs/Project.toml index dfa65cd..3a52a5d 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,2 +1,5 @@ [deps] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" + +[compat] +Documenter = "0.27" From 705492e1c27f233684fef2c2be98f70b5be47fb4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 4 Nov 2022 07:40:55 +1300 Subject: [PATCH 24/33] CompatHelper: bump compat for AbstractTrees to 0.4 (#47) --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml 
b/Project.toml
index fcce2f4..14ebcaf 100644
--- a/Project.toml
+++ b/Project.toml
@@ -15,7 +15,7 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 
 [compat]
-AbstractTrees = "0.3"
+AbstractTrees = "0.4"
 ReplMaker = "0.2"
 ResourceContexts = "0.1,0.2"
 TOML = "1"

From bd8444e2762e84b4835c8475b268f862a4969595 Mon Sep 17 00:00:00 2001
From: Morten Piibeleht
Date: Fri, 4 Nov 2022 13:24:36 +1300
Subject: [PATCH 25/33] Determine data repo from DEPOT_PATH (#52)

---
 src/DataSets.jl  | 54 ++++++++++++++++++++++++++++++++----------------
 src/repl.jl      |  2 +-
 test/projects.jl | 10 ++++++++-
 3 files changed, 46 insertions(+), 20 deletions(-)

diff --git a/src/DataSets.jl b/src/DataSets.jl
index b4b43d2..5214879 100644
--- a/src/DataSets.jl
+++ b/src/DataSets.jl
@@ -67,29 +67,44 @@ include("storage_drivers.jl")
 #-------------------------------------------------------------------------------
 # Global datasets configuration for current Julia session
-function expand_project_path(path)
+function data_project_from_path(path; depot_paths)
     if path == "@"
-        return path
+        ActiveDataProject()
     elseif path == ""
-        return joinpath(homedir(), ".julia", "datasets", "Data.toml")
+        # We will not throw an error here because this gets called in __init__, and we
+        # do not want to interrupt the loading of the package. Instead, we omit this
+        # project.
+        if isempty(depot_paths)
+            @warn "Julia depot data project (for an empty dataset path) can not be constructed because DEPOT_PATH is empty."
+            return nothing
+        end
+        depot = first(depot_paths)
+        # Julia is perfectly happy with DEPOT_PATHs that are not absolute, and hence their
+        # interpretation changes when the user cd-s around in their session.
+        #
+        # https://github.com/JuliaLang/julia/issues/44958
+        #
+        # To offer a little bit more reliability here for the user, we absolutize the
+        # path when DataSets gets loaded, so that things would not be affected by the
+        # user changing directories.
+        if !isabspath(depot)
+            depot = abspath(expanduser(depot))
+            @warn "Julia depot path ($(first(depot_paths))) not absolute. Fixing data project path relative to current working directory." depot
+        end
+        TomlFileDataProject(joinpath(depot, "datasets", "Data.toml"))
     else
+        # In other cases, we expect a reasonable absolute (or relative) path from
+        # the user, which can either point directly to a file, or be an existing
+        # directory.
         path = abspath(expanduser(path))
         if isdir(path)
             path = joinpath(path, "Data.toml")
         end
-    end
-    path
-end
-
-function data_project_from_path(path)
-    if path == "@"
-        ActiveDataProject()
-    else
-        TomlFileDataProject(expand_project_path(path))
+        TomlFileDataProject(path)
     end
 end
 
-function create_project_stack(env)
+function create_project_stack(env, depot_paths)
     stack = []
     env_search_path = get(env, "JULIA_DATASETS_PATH", nothing)
     if isnothing(env_search_path)
@@ -99,7 +114,8 @@
         split(env_search_path, Sys.iswindows() ? ';' : ':')
     end
     for path in paths
-        project = data_project_from_path(path)
+        project = data_project_from_path(path; depot_paths)
+        isnothing(project) && continue
         push!(stack, project)
     end
     StackedDataProject(stack)
@@ -124,11 +140,13 @@ interpreted as follows:
 
   For directories, the filename "Data.toml" is implicitly appended.
   `expanduser()` is used to expand the user's home directory.
 - As in `DEPOT_PATH`, an *empty* path component means the user's default
-  Julia home directory, `joinpath(homedir(), ".julia", "datasets")`
+  Julia depot (e.g. `~/.julia/datasets`), determined by the first element
+  of `DEPOT_PATH`.
 
-This simplified version of the code loading rules (LOAD_PATH/DEPOT_PATH) is
+This simplified version of the code loading rules (`LOAD_PATH`/`DEPOT_PATH`) is
 used as it seems unlikely that we'll want data location to be version-
-dependent in the same way that that code is.
+dependent in the same way that that code is. 
Note that any changes to `DEPOT_PATH` +after `DataSets` has been loaded do not affect `DataSets.PROJECT`. Unlike `LOAD_PATH`, `JULIA_DATASETS_PATH` is represented inside the program as a `StackedDataProject`, and users can add custom projects by defining their own @@ -146,7 +164,7 @@ function __init__() # be unnecessary and can cause problems if those driver modules use # Requires-like code loading. if !_isprecompiling() - global PROJECT = create_project_stack(ENV) + global PROJECT = create_project_stack(ENV, DEPOT_PATH) for proj in PROJECT.projects try add_storage_driver(proj) diff --git a/src/repl.jl b/src/repl.jl index e7f5709..7c7a8b6 100644 --- a/src/repl.jl +++ b/src/repl.jl @@ -212,7 +212,7 @@ function parse_data_repl_cmd(cmdstr) if subcmd == "push" path = popfirst!(tokens) return quote - proj = $DataSets.data_project_from_path($path) + proj = $DataSets.data_project_from_path($path; depot_paths=DEPOT_PATH) stack = $DataSets.PROJECT pushfirst!(stack, proj) stack diff --git a/test/projects.jl b/test/projects.jl index 61a49f8..2ac994c 100644 --- a/test/projects.jl +++ b/test/projects.jl @@ -114,7 +114,7 @@ end joinpath(@__DIR__, "Data.toml"), ""], paths_sep) fake_env = Dict("JULIA_DATASETS_PATH"=>datasets_paths) - proj = DataSets.create_project_stack(fake_env) + proj = DataSets.create_project_stack(fake_env, [joinpath(homedir(), ".julia"), joinpath("root", "julia")]) @test proj.projects[1] isa ActiveDataProject @test proj.projects[2] isa TomlFileDataProject @@ -129,6 +129,14 @@ end DataSets.__init__() @test DataSets.PROJECT.projects[1] isa TomlFileDataProject @test project_name(DataSets.PROJECT.projects[1]) == joinpath(@__DIR__, "Data.toml") + + # Test a few edge cases too: + @test_logs ( + :warn, "Julia depot data project (for an empty dataset path) can not be constructed because DEPOT_PATH is empty." 
+ ) DataSets.create_project_stack(Dict("JULIA_DATASETS_PATH"=>"foo$(paths_sep)"), []) + @test_logs ( + :warn, "Julia depot path (relative/depot/path) not absolute. Fixing data project path relative to current working directory." + ) DataSets.create_project_stack(Dict("JULIA_DATASETS_PATH"=>"$(paths_sep)/foo"), ["relative/depot/path"]) end @testset "config!() metadata update" begin From 2b31808e649b77ee0aafad5ade038e88bc485d6c Mon Sep 17 00:00:00 2001 From: Morten Piibeleht Date: Thu, 24 Nov 2022 08:47:47 +1300 Subject: [PATCH 26/33] Handle hyphens in dataset() (#57) --- src/DataSet.jl | 26 +++++++++++--------------- src/data_project.jl | 22 +++++++++++++--------- test/runtests.jl | 43 +++++++++++++++++++++++++++++-------------- 3 files changed, 53 insertions(+), 38 deletions(-) diff --git a/src/DataSet.jl b/src/DataSet.jl index 8a1843d..bfcafa0 100644 --- a/src/DataSet.jl +++ b/src/DataSet.jl @@ -51,20 +51,17 @@ separated with forward slashes. Examples: username/data organization/project/data """ -function is_valid_dataset_name(name::AbstractString) - # DataSet names disallow most punctuation for now, as it may be needed as - # delimiters in data-related syntax (eg, for the data REPL). - dataset_name_pattern = r" - ^ - [[:alpha:]] - (?: - [-[:alnum:]_] | - / (?=[[:alpha:]]) - )* - $ - "x - return occursin(dataset_name_pattern, name) -end +is_valid_dataset_name(name::AbstractString) = occursin(DATASET_NAME_REGEX, name) +# DataSet names disallow most punctuation for now, as it may be needed as +# delimiters in data-related syntax (eg, for the data REPL). +const DATASET_NAME_REGEX_STRING = raw""" +[[:alpha:]] +(?: + [-[:alnum:]_] | + / (?=[[:alpha:]]) +)* +""" +const DATASET_NAME_REGEX = Regex("^\n$(DATASET_NAME_REGEX_STRING)\n\$", "x") function make_valid_dataset_name(name) if !is_valid_dataset_name(name) @@ -191,4 +188,3 @@ function Base.open(as_type, dataset::DataSet) @! 
ResourceContexts.detach_context_cleanup(result) end end - diff --git a/src/data_project.jl b/src/data_project.jl index 9458c82..0ee6f95 100644 --- a/src/data_project.jl +++ b/src/data_project.jl @@ -107,16 +107,20 @@ function _unescapeuri(str) return String(take!(out)) end +# Parse as a suffix of URI syntax +# name/of/dataset?param1=value1¶m2=value2#fragment +const DATASET_SPEC_REGEX = Regex( + """ + ^ + ($(DATASET_NAME_REGEX_STRING)) + (?:\\?([^#]*))? # query - a=b&c=d + (?:\\#(.*))? # fragment - ... + \$ + """, + "x", +) function _split_dataspec(spec::AbstractString) - # Parse as a suffix of URI syntax - # name/of/dataset?param1=value1¶m2=value2#fragment - m = match(r" - ^ - ((?:[[:alpha:]][[:alnum:]_]*/?)+) # name - a/b/c - (?:\?([^#]*))? # query - a=b&c=d - (?:\#(.*))? # fragment - ... - $"x, - spec) + m = match(DATASET_SPEC_REGEX, spec) if isnothing(m) return nothing, nothing, nothing end diff --git a/test/runtests.jl b/test/runtests.jl index 0b2925d..b5c7b1b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -78,20 +78,20 @@ end #------------------------------------------------------------------------------- @testset "Data set names" begin - # Valid names - @test DataSets.is_valid_dataset_name("a_b") - @test DataSets.is_valid_dataset_name("a-b") - @test DataSets.is_valid_dataset_name("a1") - @test DataSets.is_valid_dataset_name("δεδομένα") - @test DataSets.is_valid_dataset_name("a/b") - @test DataSets.is_valid_dataset_name("a/b/c") - # Invalid names - @test !DataSets.is_valid_dataset_name("1") - @test !DataSets.is_valid_dataset_name("a b") - @test !DataSets.is_valid_dataset_name("a.b") - @test !DataSets.is_valid_dataset_name("a/b/") - @test !DataSets.is_valid_dataset_name("a//b") - @test !DataSets.is_valid_dataset_name("/a/b") + @testset "Valid name: $name" for name in ( + "a_b", "a-b", "a1", "δεδομένα", "a/b", "a/b/c", "a-", "b_", + ) + @test DataSets.is_valid_dataset_name(name) + @test DataSets._split_dataspec(name) == (name, nothing, nothing) + end + 
+ @testset "Invalid name: $name" for name in ( + "1", "a b", "a.b", "a/b/", "a//b", "/a/b", "a/-", "a/1", "a/ _/b" + ) + @test !DataSets.is_valid_dataset_name(name) + @test DataSets._split_dataspec(name) == (nothing, nothing, nothing) + end + # Error message for invalid names @test_throws ErrorException("DataSet name \"a?b\" is invalid. DataSet names must start with a letter and can contain only letters, numbers, `_` or `/`.") DataSets.check_dataset_name("a?b") @@ -107,6 +107,21 @@ end end @testset "URL-like dataspec parsing" begin + # Valid dataspecs + DataSets._split_dataspec("foo?x=1#f") == ("foo", ["x" => "1"], "f") + DataSets._split_dataspec("foo#f") == ("foo", nothing, "f") + DataSets._split_dataspec("foo?x=1") == ("foo", ["x" => "1"], nothing) + DataSets._split_dataspec("foo?x=1") == ("foo", ["x" => "1"], nothing) + # Invalid dataspecs + DataSets._split_dataspec("foo ?x=1") == (nothing, nothing, nothing) + DataSets._split_dataspec("foo\n?x=1") == (nothing, nothing, nothing) + DataSets._split_dataspec("foo\nbar?x=1") == (nothing, nothing, nothing) + DataSets._split_dataspec(" foo?x=1") == (nothing, nothing, nothing) + DataSets._split_dataspec("1?x=1") == (nothing, nothing, nothing) + DataSets._split_dataspec("foo-?x=1") == (nothing, nothing, nothing) + DataSets._split_dataspec("foo #f") == (nothing, nothing, nothing) + DataSets._split_dataspec("@?x=1") == (nothing, nothing, nothing) + proj = DataSets.load_project("Data.toml") @test !haskey(dataset(proj, "a_text_file"), "dataspec") From bdaa1ffef440941df9c62e8cb6fb1fef23b7f080 Mon Sep 17 00:00:00 2001 From: Morten Piibeleht Date: Thu, 12 Oct 2023 13:39:33 +1300 Subject: [PATCH 27/33] feat: add way to register post-__init__ callbacks (#66) (cherry picked from commit 7ef6c567f9a7aec33be1d7a704694735ef48ee4c) --- src/DataSets.jl | 50 ++++++++++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 9 ++++++++- 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/src/DataSets.jl b/src/DataSets.jl index 
5214879..46bbe83 100644
--- a/src/DataSets.jl
+++ b/src/DataSets.jl
@@ -173,6 +173,56 @@ function __init__()
                     =# project=proj exception=(exc,catch_backtrace())
             end
         end
+        # Call any post-__init__() callbacks that were registered before __init__() was called,
+        # or had a chance to finish.
+        lock(_PROJECT_INIT_LOCK) do
+            _PROJECT_INITIALIZED[] = true
+            for f in _PROJECT_INIT_CALLBACKS
+                _invoke_init_cb(f)
+            end
+            # No need to keep the callbacks around, and maybe the GC can free some memory.
+            empty!(_PROJECT_INIT_CALLBACKS)
+        end
+    end
+end
+
+# The register_post_init_callback() can be used to add a callback that will get called
+# when DataSets.__init__() has run. Note: if f() throws an error, it does not cause a crash.
+#
+# This is useful for sysimages where the module is already loaded (in Base.loaded_modules),
+# but __init__() has not been called yet. In particular, this means that other packages' __init__
+# functions can be sure that when they call initialization code that affects DataSets (in particular,
+# DataSets.PROJECT), then that code runs after __init__() has run.
+#
+# In the non-sysimage case, DataSets.__init__() would normally have already been called by the
+# time register_post_init_callback() becomes available, and so in those situations, the callback
+# gets called immediately. However, in a system image, DataSets may have to queue up (FIFO) the
+# callback functions and wait until DataSets.__init__() has finished.
+#
+# Since the __init__() functions in sysimages can run in parallel, we use a lock just in case,
+const _PROJECT_INIT_LOCK = ReentrantLock() +const _PROJECT_INITIALIZED = Ref{Bool}(false) +const _PROJECT_INIT_CALLBACKS = Base.Callable[] +function register_post_init_callback(f::Base.Callable) + invoke = lock(_PROJECT_INIT_LOCK) do + if _PROJECT_INITIALIZED[] + return true + end + push!(_PROJECT_INIT_CALLBACKS, f) + return false + end + # We'll invoke outside of the lock, so that a long-running f() call + # wouldn't block other calls to register_post_init_callback() + invoke && _invoke_init_cb(f) + return nothing +end + +function _invoke_init_cb(f::Base.Callable) + try + Base.invokelatest(f) + catch e + @error "Failed to run init callback: $f" exception = (e, catch_backtrace()) end end diff --git a/test/runtests.jl b/test/runtests.jl index b5c7b1b..4e34625 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,7 +7,14 @@ using ResourceContexts using DataSets: FileSystemRoot -#------------------------------------------------------------------------------- +@testset "register_post_init_callback" begin + init_was_called = Ref(false) + DataSets.register_post_init_callback() do + init_was_called[] = true + end + @test init_was_called[] +end + @testset "DataSet config" begin proj = DataSets.load_project("Data.toml") From 29589af87644dc429ef7a64f79778b54337ca8cd Mon Sep 17 00:00:00 2001 From: Morten Piibeleht Date: Mon, 19 Feb 2024 22:34:11 +1300 Subject: [PATCH 28/33] Add compat entries to standard libraries (#71) * chore: add standard library compats to Project.toml * loosen a bit * fix compats --- Project.toml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 4b87646..ac2e005 100644 --- a/Project.toml +++ b/Project.toml @@ -16,9 +16,14 @@ UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [compat] AbstractTrees = "0.3,0.4" +Base64 = "<0.0.1, 1" +Markdown = "<0.0.1,1" +REPL = "<0.0.1, 1" ReplMaker = "0.2" ResourceContexts = "0.1,0.2" -TOML = "1" +SHA = "<0.0.1, 0.7, 1" +TOML = "<0.0.1, 1" +UUIDs = "<0.0.1, 1" 
julia = "1.5" [extras] From 27047b25564fa89857b0216e6dee344b9bf65cea Mon Sep 17 00:00:00 2001 From: Morten Piibeleht Date: Mon, 19 Feb 2024 22:34:21 +1300 Subject: [PATCH 29/33] feat: allow dataset names to start with numbers (#70) --- Project.toml | 2 +- docs/src/design.md | 3 +-- src/DataSets.jl | 15 +++++++++------ test/runtests.jl | 4 +++- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/Project.toml b/Project.toml index ac2e005..45047da 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "DataSets" uuid = "c9661210-8a83-48f0-b833-72e62abce419" authors = ["Chris Foster and contributors"] -version = "0.2.10" +version = "0.2.11" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" diff --git a/docs/src/design.md b/docs/src/design.md index f060db9..7500881 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -93,7 +93,7 @@ names to `DataSet`s. Perhaps it also maintains the serialized `DataSet` information as well for those datasets which are not registered. It might be stored in a Data.toml, in analogy to Project.toml. -Maintaince of the data project should occur via a data REPL. +Maintenance of the data project should occur via a data REPL. ## Data Registries @@ -277,4 +277,3 @@ array of strings) is restricted to tabular data, but seems similar in spirit to DataSets.jl. * [FileTrees.jl](http://shashi.biz/FileTrees.jl) provides tools for representing and processing tree-structured data lazily and in parallel. - diff --git a/src/DataSets.jl b/src/DataSets.jl index ed7c5d0..6e4ee01 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -84,14 +84,17 @@ end """ check_dataset_name(name) -Check whether a dataset name is valid. Valid names include start with a letter -and may contain letters, numbers or `_`. Names may be hieracicial, with pieces -separated with forward slashes. Examples: +Check whether a dataset name is valid. 
+ +Valid names must start with a letter or a number, the rest of the name can also contain `-` +and `_` characters. The names can also be hieracicial, with segments separated by forward +slashes (`/`). Each segment must also start with either a letter or a number. For example: my_data my_data_1 username/data - organization-dataset_name/project/data + organization_name/project-name/data + 123user/456dataset--name """ function check_dataset_name(name::AbstractString) if !occursin(DATASET_NAME_REGEX, name) @@ -101,10 +104,10 @@ end # DataSet names disallow most punctuation for now, as it may be needed as # delimiters in data-related syntax (eg, for the data REPL). const DATASET_NAME_REGEX_STRING = raw""" -[[:alpha:]] +[[:alnum:]] (?: [-[:alnum:]_] | - / (?=[[:alpha:]]) + / (?=[[:alnum:]]) )* """ const DATASET_NAME_REGEX = Regex("^\n$(DATASET_NAME_REGEX_STRING)\n\$", "x") diff --git a/test/runtests.jl b/test/runtests.jl index fabe54a..98a7966 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -101,13 +101,15 @@ end @testset "Data set name parsing" begin @testset "Valid name: $name" for name in ( "a_b", "a-b", "a1", "δεδομένα", "a/b", "a/b/c", "a-", "b_", + "1", "a/1", "123", "12ab/34cd", "1/2/3", "1-2-3", "x_-__", "a---", ) @test DataSets.check_dataset_name(name) === nothing @test DataSets._split_dataspec(name) == (name, nothing, nothing) end @testset "Invalid name: $name" for name in ( - "1", "a b", "a.b", "a/b/", "a//b", "/a/b", "a/-", "a/1", "a/ _/b" + "a b", "a.b", "a/b/", "a//b", "/a/b", "a/-", "a/ _/b", + "a/-a", "a/-1", ) @test_throws ErrorException DataSets.check_dataset_name(name) @test DataSets._split_dataspec(name) == (nothing, nothing, nothing) From bcc6e6a0c603d0896e2526381f52797e093b4d58 Mon Sep 17 00:00:00 2001 From: tan Date: Wed, 29 Jan 2025 12:07:20 +0530 Subject: [PATCH 30/33] fix: handle symbolic links while comparing paths Update test to handle differences in paths that may crop up due to symbolic links. 
Compare paths by convertig both to `realpath`s. --- test/projects.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/projects.jl b/test/projects.jl index 6d34b54..5cad524 100644 --- a/test/projects.jl +++ b/test/projects.jl @@ -39,7 +39,8 @@ test_project_names = ["a_text_file", # Test @__DIR__ templating # Use `cleanpath` as there's currently a mixture of / and \ on windows # which does work, but is quite ugly. - cleanpath(p) = replace(p, '\\'=>'/') + # Also use realpath to resolve any differences due to symbolic links. + cleanpath(p) = realpath(replace(p, '\\'=>'/')) @test cleanpath(proj["a_text_file"].storage["path"]) == cleanpath(joinpath(@__DIR__, "data", "file.txt")) end From 9bceb6fd8fe815a704e1d3dfde8ae41f86c2cbc2 Mon Sep 17 00:00:00 2001 From: Morten Piibeleht Date: Thu, 30 Jan 2025 16:04:14 +1300 Subject: [PATCH 31/33] test: load dataset names from files --- test/dataset-names-invalid.txt | 9 +++++++++ test/dataset-names-valid.txt | 16 ++++++++++++++++ test/runtests.jl | 30 ++++++++++++++++++------------ 3 files changed, 43 insertions(+), 12 deletions(-) create mode 100644 test/dataset-names-invalid.txt create mode 100644 test/dataset-names-valid.txt diff --git a/test/dataset-names-invalid.txt b/test/dataset-names-invalid.txt new file mode 100644 index 0000000..589f600 --- /dev/null +++ b/test/dataset-names-invalid.txt @@ -0,0 +1,9 @@ +a b +a.b +a/b/ +a//b +/a/b +a/- +a/ _/b +a/-a +a/-1 diff --git a/test/dataset-names-valid.txt b/test/dataset-names-valid.txt new file mode 100644 index 0000000..9c2440a --- /dev/null +++ b/test/dataset-names-valid.txt @@ -0,0 +1,16 @@ +a_b +a-b +a1 +δεδομένα +a/b +a/b/c +a- +b_ +1 +a/1 +123 +12ab/34cd +1/2/3 +1-2-3 +x_-__ +a--- diff --git a/test/runtests.jl b/test/runtests.jl index 98a7966..8c6b42f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -98,21 +98,27 @@ end end #------------------------------------------------------------------------------- +function load_list(filename) + lines = 
eachline(joinpath(@__DIR__, filename)) + filter(!isempty, strip.(lines)) +end @testset "Data set name parsing" begin - @testset "Valid name: $name" for name in ( - "a_b", "a-b", "a1", "δεδομένα", "a/b", "a/b/c", "a-", "b_", - "1", "a/1", "123", "12ab/34cd", "1/2/3", "1-2-3", "x_-__", "a---", - ) - @test DataSets.check_dataset_name(name) === nothing - @test DataSets._split_dataspec(name) == (name, nothing, nothing) + @testset "Valid names" begin + valid_names = load_list("dataset-names-valid.txt") + @test length(valid_names) == 16 + @testset "Valid name: $name" for name in valid_names + @test DataSets.check_dataset_name(name) === nothing + @test DataSets._split_dataspec(name) == (name, nothing, nothing) + end end - @testset "Invalid name: $name" for name in ( - "a b", "a.b", "a/b/", "a//b", "/a/b", "a/-", "a/ _/b", - "a/-a", "a/-1", - ) - @test_throws ErrorException DataSets.check_dataset_name(name) - @test DataSets._split_dataspec(name) == (nothing, nothing, nothing) + @testset "Invalid names" begin + invalid_names = load_list("dataset-names-invalid.txt") + @test length(invalid_names) == 9 + @testset "Invalid name: $name" for name in invalid_names + @test_throws ErrorException DataSets.check_dataset_name(name) + @test DataSets._split_dataspec(name) == (nothing, nothing, nothing) + end end end From 26e2fc16c1c57941d30d511857598fdf7c180cab Mon Sep 17 00:00:00 2001 From: Morten Piibeleht Date: Thu, 30 Jan 2025 16:17:43 +1300 Subject: [PATCH 32/33] feat: expand the dataset name regex to allow periods Co-authored-by: Sebastian Pfitzner --- src/DataSets.jl | 32 ++++++++++++++--- test/dataset-names-invalid.txt | 9 ----- test/runtests.jl | 36 ++++++++++++++++--- test/testnames-invalid.txt | 25 +++++++++++++ ...et-names-valid.txt => testnames-valid.txt} | 8 +++++ 5 files changed, 92 insertions(+), 18 deletions(-) delete mode 100644 test/dataset-names-invalid.txt create mode 100644 test/testnames-invalid.txt rename test/{dataset-names-valid.txt => testnames-valid.txt} (56%) 
diff --git a/src/DataSets.jl b/src/DataSets.jl index 6e4ee01..22dacb9 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -87,18 +87,39 @@ end Check whether a dataset name is valid. Valid names must start with a letter or a number, the rest of the name can also contain `-` -and `_` characters. The names can also be hieracicial, with segments separated by forward -slashes (`/`). Each segment must also start with either a letter or a number. For example: +and `_` characters. The names can also be hierarchical, with segments separated by forward +slashes (`/`) or (`.`). Each segment must also start with either a letter or a number. + +For example, the following dataset names are valid: my_data my_data_1 username/data organization_name/project-name/data 123user/456dataset--name + username/my_table.csv + dataset/v0.1.2 + +whereas names like this are invalid: + + __mydata__ + username/.git + my...dataset + +!!! note "Segment separators" + + In dataset names, both `/` and `.` are considered segment separators from a syntax + perspective. While DataSets.jl does not impose any specific interpretation on the + dataset name, it is recommended to use `/` to separate segments from a semantic + perspective, and to interpret each forward-slash-separated segment as a path separator. + Periods would conventionally be used to separate file extensions within a segment. + + E.g. use `username/my-project-data/population.csv`, rather than + `username.my-project-data.population.csv` or something like that. """ function check_dataset_name(name::AbstractString) if !occursin(DATASET_NAME_REGEX, name) - error("DataSet name \"$name\" is invalid. DataSet names must start with a letter and can contain only letters, numbers, `-`, `_` or `/`.") + error("DataSet name \"$name\" is invalid. 
DataSet names must start with a letter or a number, and can contain only letters, numbers, `-` and `_`, or `/` and `.` as segment separators.") end end # DataSet names disallow most punctuation for now, as it may be needed as @@ -106,8 +127,9 @@ end const DATASET_NAME_REGEX_STRING = raw""" [[:alnum:]] (?: - [-[:alnum:]_] | - / (?=[[:alnum:]]) + [-[:alnum:]_] | + \.(?=[[:alnum:]]) | + \/ (?=[[:alnum:]]) )* """ const DATASET_NAME_REGEX = Regex("^\n$(DATASET_NAME_REGEX_STRING)\n\$", "x") diff --git a/test/dataset-names-invalid.txt b/test/dataset-names-invalid.txt deleted file mode 100644 index 589f600..0000000 --- a/test/dataset-names-invalid.txt +++ /dev/null @@ -1,9 +0,0 @@ -a b -a.b -a/b/ -a//b -/a/b -a/- -a/ _/b -a/-a -a/-1 diff --git a/test/runtests.jl b/test/runtests.jl index 8c6b42f..4c48815 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -104,20 +104,48 @@ function load_list(filename) end @testset "Data set name parsing" begin @testset "Valid names" begin - valid_names = load_list("dataset-names-valid.txt") - @test length(valid_names) == 16 + valid_names = load_list("testnames-valid.txt") + @test !isempty(valid_names) @testset "Valid name: $name" for name in valid_names @test DataSets.check_dataset_name(name) === nothing @test DataSets._split_dataspec(name) == (name, nothing, nothing) + # Also test that the name is still valid when it appears as part of + # a path elements. 
+ let path_name = "foo/$(name)" + @test DataSets.check_dataset_name(path_name) === nothing + @test DataSets._split_dataspec(path_name) == (path_name, nothing, nothing) + end + let path_name = "$(name)/foo" + @test DataSets.check_dataset_name(path_name) === nothing + @test DataSets._split_dataspec(path_name) == (path_name, nothing, nothing) + end + let path_name = "foo/$(name)/bar" + @test DataSets.check_dataset_name(path_name) === nothing + @test DataSets._split_dataspec(path_name) == (path_name, nothing, nothing) + end end end @testset "Invalid names" begin - invalid_names = load_list("dataset-names-invalid.txt") - @test length(invalid_names) == 9 + invalid_names = load_list("testnames-invalid.txt") + @test !isempty(invalid_names) @testset "Invalid name: $name" for name in invalid_names @test_throws ErrorException DataSets.check_dataset_name(name) @test DataSets._split_dataspec(name) == (nothing, nothing, nothing) + # Also test that the name is still invalid when it appears as part of + # a path elements. + let path_name = "foo/$(name)" + @test_throws ErrorException DataSets.check_dataset_name(path_name) === nothing + @test DataSets._split_dataspec(path_name) == (nothing, nothing, nothing) + end + let path_name = "$(name)/foo" + @test_throws ErrorException DataSets.check_dataset_name(path_name) === nothing + @test DataSets._split_dataspec(path_name) == (nothing, nothing, nothing) + end + let path_name = "foo/$(name)/bar" + @test_throws ErrorException DataSets.check_dataset_name(path_name) === nothing + @test DataSets._split_dataspec(path_name) == (nothing, nothing, nothing) + end end end end diff --git a/test/testnames-invalid.txt b/test/testnames-invalid.txt new file mode 100644 index 0000000..bf78155 --- /dev/null +++ b/test/testnames-invalid.txt @@ -0,0 +1,25 @@ +a b +a/b/ +a//b +/a/b +a/- +a/ _/b +a/-a +a/-1 +.a +..a +a. +a.. +.a. +a..b +.abc +abc. +abc/.def +abc/def. 
+a./b +a.- +_._ +a._b +a.-b +./a +b/../a diff --git a/test/dataset-names-valid.txt b/test/testnames-valid.txt similarity index 56% rename from test/dataset-names-valid.txt rename to test/testnames-valid.txt index 9c2440a..53c7bbb 100644 --- a/test/dataset-names-valid.txt +++ b/test/testnames-valid.txt @@ -14,3 +14,11 @@ a/1 1-2-3 x_-__ a--- +a.b +a.b +abc.def +abc/def.ghi +abc-def.ghi_jkl +a.b.c +a_.c +foo__-.csv From 004b35b889daba099beb8bfc4494e58a027763d3 Mon Sep 17 00:00:00 2001 From: Morten Piibeleht Date: Tue, 4 Feb 2025 14:33:17 +1300 Subject: [PATCH 33/33] Set version to 0.2.12 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 45047da..409bf9e 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "DataSets" uuid = "c9661210-8a83-48f0-b833-72e62abce419" authors = ["Chris Foster and contributors"] -version = "0.2.11" +version = "0.2.12" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"