From 2208e4ffe5bfadd1525306689aa10fc1f4760e0a Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Wed, 20 Apr 2022 17:24:45 +1000 Subject: [PATCH 1/3] DataSet create/delete + storage driver API changes * For data projects: - create() to create datasets - setindex!() to add existing datasets - delete() to delete datasets - Implementations for StackedDataProject, AbstractTOMLDataProject and TOMLDataProject * Concrete save_project() API to persist a DataProject to a file as TOML * For storage drivers: - AbstractDataDriver and implementation for FileSystemDriver - open_dataset to do what the current function-based API does - create_storage to initialize storage - delete_storage to remove storage - These ideas seem a bit half-baked * Refactoring open() to add write=true keyword --- src/DataSet.jl | 73 +++++++++++++++++++++-------- src/DataSets.jl | 9 ++-- src/FileTree.jl | 20 ++++---- src/TomlDataStorage.jl | 96 ++++++++++++++++++++++++++------------- src/data_project.jl | 56 +++++++++++++++++++++-- src/file_data_projects.jl | 76 +++++++++++++++++++++++++++++++ src/filesystem.jl | 82 ++++++++++++++++++++++++++++++--- src/storage_drivers.jl | 22 ++++++--- 8 files changed, 355 insertions(+), 79 deletions(-) diff --git a/src/DataSet.jl b/src/DataSet.jl index 8a1843d..dfd5da9 100644 --- a/src/DataSet.jl +++ b/src/DataSet.jl @@ -103,6 +103,7 @@ end Base.getindex(d::DataSet, name::AbstractString) = getindex(d.conf, name) Base.haskey(d::DataSet, name::AbstractString) = haskey(d.conf, name) +Base.get(d::DataSet, name::AbstractString, default) = get(d.conf, name, default) function data_project(dataset::DataSet) return getfield(dataset, :project) @@ -150,44 +151,80 @@ end #------------------------------------------------------------------------------- # Functions for opening datasets +# +# In principle we've got the following six variants: +# +# Scoped forms: +# +# open(dataset; kws...) do ... MISSING!! +# open(T, dataset; kws...) do ... +# +# Context manager: +# +# x = open(ctx, dataset; kws...) +# x = open(ctx, T, dataset; kws...) +# +# Finalizer-based: +# x = open(dataset; kws...) +# x = open(T, dataset; kws...) + # do-block form of open() -function Base.open(f::Function, as_type, dataset::DataSet) - storage_config = dataset.storage +function Base.open(f::Function, as_type, dataset::DataSet; write=false) driver = _find_driver(dataset) - driver(storage_config, dataset) do storage - open(f, as_type, storage) + if driver isa AbstractDataDriver + storage = open_dataset(driver, dataset, write) + try + open(f, as_type, storage, write=write) + close_dataset(storage) + catch exc + close_dataset(storage, exc) + rethrow() + end + else + # Old deprecated API + storage_config = dataset.storage + driver = _find_driver(dataset) + driver(storage_config, dataset) do storage + open(f, as_type, storage) + end end end # Contexts-based form of open() -@! function Base.open(dataset::DataSet) - storage_config = dataset.storage +@! function Base.open(dataset::DataSet; write=false) driver = _find_driver(dataset) - # Use `enter_do` because drivers don't yet use the ResourceContexts.jl mechanism - (storage,) = @! enter_do(driver, storage_config, dataset) + if driver isa AbstractDataDriver + storage = open_dataset(driver, dataset, write) + @defer close_dataset(storage) + else + # Old deprecated API + # Use `enter_do` because drivers are just functions + if write + error("Cannot use `write=true` with the old storage API.") + end + storage_config = dataset.storage + (storage,) = @! 
enter_do(driver, storage_config, dataset) + end storage end -@! function Base.open(as_type, dataset::DataSet) - storage = @! open(dataset) - @! open(as_type, storage) +@! function Base.open(as_type, dataset::DataSet; write=false) + storage = @! open(dataset; write=write) + @! open(as_type, storage; write=write) end -# TODO: -# Consider making a distinction between open() and load(). - # Finalizer-based version of open() -function Base.open(dataset::DataSet) +function Base.open(dataset::DataSet; write=false) @context begin - result = @! open(dataset) + result = @! open(dataset; write=write) @! ResourceContexts.detach_context_cleanup(result) end end -function Base.open(as_type, dataset::DataSet) +function Base.open(as_type, dataset::DataSet; write=false) @context begin - result = @! open(as_type, dataset) + result = @! open(as_type, dataset; write=write) @! ResourceContexts.detach_context_cleanup(result) end end diff --git a/src/DataSets.jl b/src/DataSets.jl index ca15b19..cf77d82 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -58,6 +58,8 @@ name="" """ const CURRENT_DATA_CONFIG_VERSION = 1 +const ConfigDict = Dict{String,Any} + include("paths.jl") include("DataSet.jl") include("data_project.jl") @@ -226,14 +228,15 @@ include("entrypoint.jl") # Builtin Data models include("FileTree.jl") -# Builtin backends +# Builtin data drivers include("filesystem.jl") include("TomlDataStorage.jl") - -# Backends # include("ZipTree.jl") # include("GitTree.jl") +add_storage_driver("FileSystem"=>FileSystemDriver()) +add_storage_driver("TomlDataStorage"=>TomlDataDriver()) + # Application-level stuff include("repl.jl") diff --git a/src/FileTree.jl b/src/FileTree.jl index 2a354d7..45cb124 100644 --- a/src/FileTree.jl +++ b/src/FileTree.jl @@ -1,13 +1,12 @@ -# Many datasets have tree-like indices. Examples: -# -# Index Data -# -# * OS: directories files -# * Git: trees blobs -# * S3: prefixes blobs -# * HDF5 group typed data -# * Zip flattend directory(?) blobs +# Many storage systems have tree-like indices. Examples: # +# Storage Index Data +# ------- ----------- ---------- +# OS filesystem files +# Git trees blobs +# S3 keys blobs +# HDF5 groups typed data +# Zip keys blobs import AbstractTrees: AbstractTrees, children @@ -471,6 +470,9 @@ end # Base.open(::Type{T}, file::File; kws...) where {T} = open(identity, T, file.root, file.path; kws...) +function close_dataset(storage::Union{File,FileTree}, exc=nothing) + close_dataset(storage.root) +end #------------------------------------------------------------------------------- # Path manipulation diff --git a/src/TomlDataStorage.jl b/src/TomlDataStorage.jl index 5271869..3015b28 100644 --- a/src/TomlDataStorage.jl +++ b/src/TomlDataStorage.jl @@ -28,17 +28,34 @@ For FileTree: ... 
``` """ -struct TomlDataStorage +struct TomlDataRoot dataset::DataSet - data::Union{String,Dict{String,Any}} + data::Union{Vector{UInt8},ConfigDict} + write::Bool +end + +_data_strings_to_buffers(data::String) = base64decode(data) +function _data_strings_to_buffers(data::Dict) + ConfigDict(k=>_data_strings_to_buffers(v) for (k,v) in pairs(data)) +end +_data_strings_to_buffers(data) = error("Unexpected embedded data: expected a string or dictionary") + +_data_buffers_to_strings(data::Vector{UInt8}) = base64encode(data) +function _data_buffers_to_strings(data::Dict) + ConfigDict(k=>_data_buffers_to_strings(v) for (k,v) in pairs(data)) end # Get TOML data at `path`, returning nothing if not present -function _getpath(storage::TomlDataStorage, path::RelPath) +function _getpath(storage::TomlDataRoot, path::RelPath) x = storage.data for c in path.components + if !(x isa AbstractDict) + return nothing + end x = get(x, c, nothing) - !isnothing(x) || return nothing + if isnothing(x) + return nothing + end end x end @@ -46,13 +63,13 @@ end #-------------------------------------------------- # Storage data interface for trees -Base.isdir(storage::TomlDataStorage, path::RelPath) = _getpath(storage, path) isa Dict -Base.isfile(storage::TomlDataStorage, path::RelPath) = _getpath(storage, path) isa String -Base.ispath(storage::TomlDataStorage, path::RelPath) = !isnothing(_getpath(storage, path)) +Base.isdir(storage::TomlDataRoot, path::RelPath) = _getpath(storage, path) isa Dict +Base.isfile(storage::TomlDataRoot, path::RelPath) = _getpath(storage, path) isa String +Base.ispath(storage::TomlDataRoot, path::RelPath) = !isnothing(_getpath(storage, path)) -Base.summary(io::IO, storage::TomlDataStorage) = print(io, "Data.toml") +Base.summary(io::IO, storage::TomlDataRoot) = print(io, "Data.toml") -function Base.readdir(storage::TomlDataStorage, path::RelPath) +function Base.readdir(storage::TomlDataRoot, path::RelPath) try tree = _getpath(storage, path) !isnothing(tree) || KeyError(path) @@ -66,62 +83,77 @@ end # Storage data interface for File function Base.open(func::Function, as_type::Type{IO}, - storage::TomlDataStorage, path; kws...) - @context func(@! open(as_type, storage, path; kws...)) + storage::TomlDataRoot, path; write=false, kws...) + @context func(@! open(as_type, storage, path; write=false, kws...)) end -@! function Base.open(::Type{Vector{UInt8}}, storage::TomlDataStorage, path; - write=false, read=!write, kws...) - if write - error("Embedded data is read-only from within the DataSets interface") - end +@! function Base.open(::Type{Vector{UInt8}}, storage::TomlDataRoot, path; + write=false) try - str = _getpath(storage, path) - !isnothing(str) || KeyError(path) - base64decode(str::AbstractString) + buf = _getpath(storage, path) + !isnothing(buf) || KeyError(path) + return buf catch error("TOML storage requires data to be as base64 encoded strings") end end -@! function Base.open(::Type{IO}, storage::TomlDataStorage, path; kws...) - buf = @! open(Vector{UInt8}, storage, path; kws...) - IOBuffer(buf) +@! function Base.open(::Type{IO}, storage::TomlDataRoot, path; write=false) + buf = @! open(Vector{UInt8}, storage, path; write=write) + if write + # For consistency with filesystem version of open() + resize!(buf,0) + end + return IOBuffer(buf, write=write) +end + +@! function Base.open(::Type{String}, storage::TomlDataRoot, path; write=false) + buf = @! 
open(Vector{UInt8}, storage, path; write=write) + return String(copy(buf)) end +function close_dataset(storage::TomlDataRoot, exc=nothing) + if storage.write + encoded_data = _data_buffers_to_strings(storage.data) + # Force writing of dataset to project + conf = copy(storage.dataset.storage) + conf["data"] = encoded_data + config(storage.dataset; storage=conf) + end +end # TODO: The following should be factored out and implemented generically -function Base.read(storage::TomlDataStorage, path::RelPath, ::Type{T}) where {T} +function Base.read(storage::TomlDataRoot, path::RelPath, ::Type{T}) where {T} @context begin io = @! open(IO, storage, path) read(io, T) end end -function Base.read(storage::TomlDataStorage, path::RelPath) +function Base.read(storage::TomlDataRoot, path::RelPath) @context @! open(Vector{UInt8}, storage, path) end #------------------------------------------------------------------------------- -# Connect storage backend -function connect_toml_data_storage(f, config, dataset) - type = config["type"] - data = get(config, "data", nothing) +struct TomlDataDriver <: AbstractDataDriver +end + +function open_dataset(driver::TomlDataDriver, dataset, write) + type = dataset.storage["type"] + data = get(dataset.storage, "data", nothing) if type in ("File", "Blob") if !(data isa AbstractString) error("TOML data storage requires string data in the \"storage.data\" key") end - f(File(TomlDataStorage(dataset, data))) + return File(TomlDataRoot(dataset, _data_strings_to_buffers(data), write)) elseif type in ("FileTree", "BlobTree") if !(data isa AbstractDict) error("TOML data storage requires a dictionary in the \"storage.data\" key") end - f(FileTree(TomlDataStorage(dataset, data))) + return FileTree(TomlDataRoot(dataset, _data_strings_to_buffers(data), write)) else throw(ArgumentError("DataSet type $type not supported for data embedded in Data.toml")) end end -add_storage_driver("TomlDataStorage"=>connect_toml_data_storage) - diff --git a/src/data_project.jl b/src/data_project.jl index 9458c82..a149c4f 100644 --- a/src/data_project.jl +++ b/src/data_project.jl @@ -41,7 +41,7 @@ function dataset(proj::AbstractDataProject, spec::AbstractString) # Enhance dataset with "dataspec" holding URL-like fragment & query dataspec = Dict() if !isnothing(query) - dataspec["query"] = Dict{String,Any}(query) + dataspec["query"] = ConfigDict(query) end if !isnothing(fragmentstr) dataspec["fragment"] = fragmentstr @@ -224,13 +224,13 @@ A in-memory collection of DataSets. """ struct DataProject <: AbstractDataProject datasets::Dict{String,DataSet} - drivers::Vector{Dict{String,Any}} + drivers::Vector{ConfigDict} end -DataProject() = DataProject(Dict{String,DataSet}(), Vector{Dict{String,Any}}()) +DataProject() = DataProject(Dict{String,DataSet}(), Vector{ConfigDict}()) DataProject(project::AbstractDataProject) = DataProject(Dict(pairs(project)), - Vector{Dict{String,Any}}()) + Vector{ConfigDict}()) data_drivers(project::DataProject) = project.drivers @@ -249,9 +249,21 @@ function Base.iterate(proj::DataProject, state=nothing) end function Base.setindex!(proj::DataProject, data::DataSet, name::AbstractString) + if haskey(proj, name) && proj[name] !== data + throw(ArgumentError("Cannot replace existing dataset with name \"$name\". 
Try DataSets.delete() first.")) + end + if isnothing(data_project(data)) + setfield!(data, :project, proj) + elseif data_project(data) !== proj + throw(ArgumentError("DataSet is already owned by a different project")) + end proj.datasets[name] = data end +function delete(proj::DataProject, name::AbstractString) + delete!(proj.datasets, name) +end + #------------------------------------------------------------------------------- """ StackedDataProject() @@ -309,6 +321,25 @@ function Base.show(io::IO, mime::MIME"text/plain", stack::StackedDataProject) end end +function create(stack::StackedDataProject, name; kws...) + for proj in stack.projects + ds = create(proj, name; kws...) + if !isnothing(ds) + return ds + end + end + return nothing +end + +function delete(stack::StackedDataProject, name) + for proj in stack.projects + if haskey(proj, name) + delete(proj, name) + return + end + end + throw(ArgumentError("Could not find dataset \"$name\" in project")) +end #------------------------------------------------------------------------------- """ @@ -362,6 +393,23 @@ function project_toml(proj::DataProject) return sprint(TOML.print, conf) end +#------------------------------------------------------------------------------- +# Global versions of the dataset metadata manipulation functions which act on +# the global dataset PROJECT object. + function config!(name::AbstractString; kws...) config!(PROJECT, name; kws...) end + +function create(name::AbstractString; kws...) + ds = create(PROJECT, name; kws...) + if isnothing(ds) + error("Could not create dataset in any available data project") + end + return ds +end + +function delete(name::AbstractString) + delete(PROJECT, name) +end + diff --git a/src/file_data_projects.jl b/src/file_data_projects.jl index 79f3773..10a6bf3 100644 --- a/src/file_data_projects.jl +++ b/src/file_data_projects.jl @@ -154,6 +154,50 @@ function config!(proj::AbstractTomlFileDataProject, dataset::DataSet; kws...) return dataset end +function create(proj::AbstractTomlFileDataProject, name; + # One of the following required + source::Union{Nothing,DataSet}=nothing, + driver::Union{Nothing,AbstractString}=nothing, + linked = false, + # Descriptive metadata + kws... + ) + + if isnothing(project_name(proj)) + return nothing + end + + if haskey(proj, name) + throw(ArgumentError("DataSet named \"$name\" already exists in project.")) + end + + driver = linked && !isnothing(source) ? _find_driver(source) : + !isnothing(driver) ? _find_driver(driver) : + default_driver(proj) + + if linked + storage = deepcopy(source.storage) + else + storage = create_storage(proj, driver, name; source=source, kws...) + end + + conf = Dict( + "name"=>name, + "uuid"=>string(uuid4()), + "storage"=>storage + ) + + conf["linked"] = linked + for (k,v) in kws + conf[string(k)] = v + end + + ds = DataSet(conf) + proj[ds.name] = ds + return ds +end + + #------------------------------------------------------------------------------- """ Data project which automatically updates based on a TOML file on the local @@ -183,6 +227,38 @@ end project_name(proj::TomlFileDataProject) = proj.path +function Base.setindex!(proj::TomlFileDataProject, data::DataSet, name::AbstractString) + p = get_cache(proj) + p[name] = data + save_project(proj.path, p) +end + +function delete(proj::TomlFileDataProject, name::AbstractString) + # FIXME: Make this safe for concurrent use in-process + # (or better, between processes?) 
+ p = get_cache(proj) + + ds = dataset(p, name) + # Assume all datasets which don't have the "linked" property are linked. + # This prevents us accidentally deleting data. + if get(ds, "linked", true) + @info "Linked dataset is preserved on data storage" name + else + driver = _find_driver(ds) + delete_storage(proj, driver, ds) + end + + delete(p, name) + save_project(proj.path, p) +end + +#------------------------------------------------------------------------------- +default_driver(proj::AbstractTomlFileDataProject) = FileSystemDriver() + +project_root_path(proj) = error("No local path for data project type $(typeof(proj))") +project_root_path(proj::TomlFileDataProject) = dirname(proj.path) + + #------------------------------------------------------------------------------ """ Data project, based on the location of the current explicitly selected Julia diff --git a/src/filesystem.jl b/src/filesystem.jl index 73d1803..fcf9faa 100644 --- a/src/filesystem.jl +++ b/src/filesystem.jl @@ -100,6 +100,11 @@ end @! open(sys_abspath(root, path); read=read, write=write, kws...) end +# FIXME: close() vs close_dataset() ? +function close_dataset(storage::FileSystemRoot, exc=nothing) + # Nothing to do here — data is already on the filesystem. +end + Base.read(root::FileSystemRoot, path::RelPath, ::Type{T}) where {T} = read(sys_abspath(root, path), T) Base.read(root::FileSystemRoot, path::RelPath) = @@ -227,9 +232,10 @@ function Base.setindex!(tree::FileTree{FileSystemRoot}, return tree end - #-------------------------------------------------- # FileSystem storage driver +struct FileSystemDriver <: AbstractDataDriver +end """ local_data_abspath(project, relpath) @@ -275,7 +281,8 @@ For FileTree: windows_path=\$(absolute_windows_path_to_file) ``` """ -function connect_filesystem(f, config, dataset) +function open_dataset(driver::FileSystemDriver, dataset, write) + config = dataset.storage # Paths keys can be in three forms documented above; if haskey(config, "path") pathstr = config["path"] @@ -309,10 +316,10 @@ function connect_filesystem(f, config, dataset) type = config["type"] if type in ("File", "Blob") isfile(path) || throw(ArgumentError("$(repr(path)) should be a file")) - storage = File(FileSystemRoot(path)) + storage = File(FileSystemRoot(path; write=write)) elseif type in ("FileTree", "BlobTree") isdir(path) || throw(ArgumentError("$(repr(path)) should be a directory")) - storage = FileTree(FileSystemRoot(path)) + storage = FileTree(FileSystemRoot(path; write=write)) path = dataspec_fragment_as_path(dataset) if !isnothing(path) storage = storage[path] @@ -320,9 +327,72 @@ function connect_filesystem(f, config, dataset) else throw(ArgumentError("DataSet type $type not supported on the filesystem")) end - f(storage) + return storage end -add_storage_driver("FileSystem"=>connect_filesystem) + +function create_storage(proj, driver::FileSystemDriver, + name::AbstractString; + source::Union{Nothing,DataSet}=nothing, + dtype::Union{Nothing,AbstractString}=nothing, + kws...) + + # For other cases here, we're not linking to the original source, but + # rather making a copy. + if !isnothing(source) + dtype = source.storage["type"] + else + if isnothing(dtype) + throw(ArgumentError("Must provide one of `source` or `dtype`.")) + end + end + + has_slashes = '/' in name + local_name = has_slashes ? 
+ joinpath(split(name, '/')) : + name + + # project_root_path() will fail + data_path = joinpath(project_root_path(proj), local_name) + if ispath(data_path) + error("Local path already exists: $data_path") + end + + if !(dtype in ("Blob", "BlobTree")) + error("Unknown storage type for FileSystemDriver: $dtype") + end + + # Create file or directory + if has_slashes + mkpath(dirname(data_path)) + end + if dtype == "Blob" + touch(data_path) + elseif dtype == "BlobTree" + mkdir(data_path) + end + + Dict( + "driver"=>"FileSystem", + "type"=>dtype, + "path"=>"@__DIR__/$name", + ) +end + + +function delete_storage(proj, driver::FileSystemDriver, ds::DataSet) + path = ds.storage["path"] + type = ds.storage["type"] + if type == "Blob" + isfile(path) || throw(ArgumentError("$(repr(path)) should be a file")) + rm(path) + elseif type == "BlobTree" + isdir(path) || throw(ArgumentError("$(repr(path)) should be a directory")) + rm(path, recursive=true) + else + throw(ArgumentError("DataSet type $type not supported on the filesystem")) + end +end + diff --git a/src/storage_drivers.jl b/src/storage_drivers.jl index 6794bbe..a934dbc 100644 --- a/src/storage_drivers.jl +++ b/src/storage_drivers.jl @@ -1,5 +1,7 @@ # Global record of registered storage drivers +abstract type AbstractDataDriver end + const _storage_drivers_lock = ReentrantLock() const _storage_drivers = Dict{String,Any}() @@ -68,18 +70,24 @@ function add_storage_driver(project::AbstractDataProject) end end -function _find_driver(dataset) +function _find_driver(dataset::DataSet) storage_config = dataset.storage driver_name = get(storage_config, "driver") do error("`storage.driver` configuration not found for dataset $(dataset.name)") end + driver = _find_driver(driver_name) + if isnothing(driver) + error(""" + Storage driver $(repr(driver_name)) not found for dataset $(dataset.name). + Current drivers are $(collect(keys(_storage_drivers))) + """) + end + return driver +end + +function _find_driver(driver_name::AbstractString) driver = lock(_storage_drivers_lock) do - get(_storage_drivers, driver_name) do - error(""" - Storage driver $(repr(driver_name)) not found for dataset $(dataset.name). - Current drivers are $(collect(keys(_storage_drivers))) - """) - end + get(_storage_drivers, driver_name, nothing) end end From c3769be3dbac49569fe8c7615ff6585dce0f44e8 Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Mon, 30 May 2022 15:14:13 +1000 Subject: [PATCH 2/3] Rename delete -> delete! and create -> create! Fix save_project for delete and create --- src/FileTree.jl | 4 +-- src/data_project.jl | 28 ++++++++++++--------- src/file_data_projects.jl | 51 +++++++++++++++++++++------------------ 3 files changed, 47 insertions(+), 36 deletions(-) diff --git a/src/FileTree.jl b/src/FileTree.jl index 45cb124..c8d50c0 100644 --- a/src/FileTree.jl +++ b/src/FileTree.jl @@ -389,7 +389,7 @@ newfile(tree::FileTree, path::AbstractString; kws...) = newfile(func::Function, tree::FileTree, path::AbstractString; kws...) = newfile(func, tree, RelPath(path); kws...) 
Base.delete!(tree::FileTree, path::AbstractString) = - delete!(tree, RelPath(path)) + Base.delete!(tree, RelPath(path)) function _check_writeable(tree) if !iswriteable(tree.root) @@ -457,7 +457,7 @@ function Base.delete!(tree::FileTree, path::RelPath) _check_writeable(tree) relpath = joinpath(tree.path, path) root = tree.root - delete!(root, relpath) + Base.delete!(root, relpath) end function Base.open(f::Function, ::Type{FileTree}, tree::FileTree) diff --git a/src/data_project.jl b/src/data_project.jl index a149c4f..9f107fc 100644 --- a/src/data_project.jl +++ b/src/data_project.jl @@ -250,7 +250,7 @@ end function Base.setindex!(proj::DataProject, data::DataSet, name::AbstractString) if haskey(proj, name) && proj[name] !== data - throw(ArgumentError("Cannot replace existing dataset with name \"$name\". Try DataSets.delete() first.")) + throw(ArgumentError("Cannot replace existing dataset with name \"$name\". Try delete!() first.")) end if isnothing(data_project(data)) setfield!(data, :project, proj) @@ -260,8 +260,8 @@ function Base.setindex!(proj::DataProject, data::DataSet, name::AbstractString) proj.datasets[name] = data end -function delete(proj::DataProject, name::AbstractString) - delete!(proj.datasets, name) +function Base.delete!(proj::DataProject, name::AbstractString) + Base.delete!(proj.datasets, name) end #------------------------------------------------------------------------------- @@ -321,9 +321,9 @@ function Base.show(io::IO, mime::MIME"text/plain", stack::StackedDataProject) end end -function create(stack::StackedDataProject, name; kws...) +function create!(stack::StackedDataProject, name; kws...) for proj in stack.projects - ds = create(proj, name; kws...) + ds = create!(proj, name; kws...) if !isnothing(ds) return ds end @@ -331,10 +331,10 @@ function create(stack::StackedDataProject, name; kws...) return nothing end -function delete(stack::StackedDataProject, name) +function Base.delete!(stack::StackedDataProject, name) for proj in stack.projects if haskey(proj, name) - delete(proj, name) + Base.delete!(proj, name) return end end @@ -401,15 +401,21 @@ function config!(name::AbstractString; kws...) config!(PROJECT, name; kws...) end -function create(name::AbstractString; kws...) - ds = create(PROJECT, name; kws...) +function create!(name::AbstractString; kws...) + ds = create!(PROJECT, name; kws...) if isnothing(ds) error("Could not create dataset in any available data project") end return ds end -function delete(name::AbstractString) - delete(PROJECT, name) +# Version of `delete!` for acting on the global data project. +# +# Unfortunately this can't be a method of `Base.delete!`, as that'd be type +# piracy... +function delete!(name::AbstractString) + Base.delete!(PROJECT, name) end +# DataSets.delete! is just Base.delete! in other cases +delete!(xs...) = Base.delete!(xs...) diff --git a/src/file_data_projects.jl b/src/file_data_projects.jl index 10a6bf3..7e069a1 100644 --- a/src/file_data_projects.jl +++ b/src/file_data_projects.jl @@ -147,14 +147,21 @@ function config!(proj::AbstractTomlFileDataProject, dataset::DataSet; kws...) if data_project(dataset) !== proj error("dataset must belong to project") end - # Here we accept the update independently of the project - Data.toml should - # be able to manage any dataset config. - config!(nothing, dataset; kws...) - set_cache(proj, project_toml(get_cache(proj, false))) + _save_project(proj) do inner_proj + # Here we accept the update independently of the project - Data.toml + # should be able to manage any dataset config. 
+ config!(nothing, dataset; kws...) + inner_proj + end return dataset end -function create(proj::AbstractTomlFileDataProject, name; +function _save_project(f::Function, proj::AbstractTomlFileDataProject) + p = f(get_cache(proj, false)) + set_cache(proj, project_toml(p)) +end + +function create!(proj::AbstractTomlFileDataProject, name; # One of the following required source::Union{Nothing,DataSet}=nothing, driver::Union{Nothing,AbstractString}=nothing, @@ -228,28 +235,26 @@ end project_name(proj::TomlFileDataProject) = proj.path function Base.setindex!(proj::TomlFileDataProject, data::DataSet, name::AbstractString) - p = get_cache(proj) - p[name] = data - save_project(proj.path, p) + _save_project(proj) do inner_proj + inner_proj[name] = data + inner_proj + end end -function delete(proj::TomlFileDataProject, name::AbstractString) - # FIXME: Make this safe for concurrent use in-process - # (or better, between processes?) - p = get_cache(proj) +function Base.delete!(proj::TomlFileDataProject, name::AbstractString) + _save_project(proj) do inner_proj + ds = dataset(inner_proj, name) + # Assume all datasets which don't have the "linked" property are linked. + # This prevents us accidentally deleting data. + if get(ds, "linked", true) + @info "Linked dataset is preserved on data storage" name + else + driver = _find_driver(ds) + delete_storage(proj, driver, ds) + end - ds = dataset(p, name) - # Assume all datasets which don't have the "linked" property are linked. - # This prevents us accidentally deleting data. - if get(ds, "linked", true) - @info "Linked dataset is preserved on data storage" name - else - driver = _find_driver(ds) - delete_storage(proj, driver, ds) + Base.delete!(inner_proj, name) end - - delete(p, name) - save_project(proj.path, p) end #------------------------------------------------------------------------------- From c895b4f745e078d433314d439cfec00412120daf Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Mon, 30 May 2022 15:57:38 +1000 Subject: [PATCH 3/3] fixup! 
DataSet create/delete + storage driver API changes --- src/FileTree.jl | 4 ++ src/TomlDataStorage.jl | 4 +- src/data_project.jl | 29 +++++++++- src/file_data_projects.jl | 6 +-- src/filesystem.jl | 109 +++++++++++++++++++------------------- 5 files changed, 89 insertions(+), 63 deletions(-) diff --git a/src/FileTree.jl b/src/FileTree.jl index c8d50c0..ff3e157 100644 --- a/src/FileTree.jl +++ b/src/FileTree.jl @@ -474,6 +474,10 @@ function close_dataset(storage::Union{File,FileTree}, exc=nothing) close_dataset(storage.root) end +# Utility functions +is_File_dtype(dtype) = (dtype == "File" || dtype == "Blob") +is_FileTree_dtype(dtype) = (dtype == "FileTree" || dtype == "BlobTree") + #------------------------------------------------------------------------------- # Path manipulation diff --git a/src/TomlDataStorage.jl b/src/TomlDataStorage.jl index 3015b28..681f30e 100644 --- a/src/TomlDataStorage.jl +++ b/src/TomlDataStorage.jl @@ -142,12 +142,12 @@ end function open_dataset(driver::TomlDataDriver, dataset, write) type = dataset.storage["type"] data = get(dataset.storage, "data", nothing) - if type in ("File", "Blob") + if is_File_dtype(type) if !(data isa AbstractString) error("TOML data storage requires string data in the \"storage.data\" key") end return File(TomlDataRoot(dataset, _data_strings_to_buffers(data), write)) - elseif type in ("FileTree", "BlobTree") + elseif is_FileTree_dtype(type) if !(data isa AbstractDict) error("TOML data storage requires a dictionary in the \"storage.data\" key") end diff --git a/src/data_project.jl b/src/data_project.jl index 9f107fc..2503835 100644 --- a/src/data_project.jl +++ b/src/data_project.jl @@ -198,8 +198,8 @@ function Base.show(io::IO, ::MIME"text/plain", project::AbstractDataProject) for (i, (name, data)) in enumerate(sorted) pad = maxwidth - textwidth(name) storagetype = get(data.storage, "type", nothing) - icon = storagetype in ("File", "Blob") ? '📄' : - storagetype in ("FileTree", "BlobTree") ? '📁' : + icon = is_File_dtype(storagetype) ? '📄' : + is_FileTree_dtype(storagetype) ? '📁' : '❓' print(io, " ", icon, ' ', name, ' '^pad, " => ", data.uuid) if i < length(sorted) @@ -321,6 +321,31 @@ function Base.show(io::IO, mime::MIME"text/plain", stack::StackedDataProject) end end +""" + create!(name; [type=..., driver=..., description=..., tags=..., kws...]) + +Create a dataset named `name` in the global data project. + + create!(proj, name; kws...) + +Create a dataset in the given data project. The keyword arguments `kws` are the +same as the global version. + + +# Examples + +To create a `File` named "SomeFile" on the local filesystem, + +``` +DataSets.create!("SomeFile", type="File", driver="FileSystem") +``` + +To create a `FileTree` named "Namespace/SomeDir" on the local filesystem, + +``` +DataSets.create!("Namespace/SomeDir", type="FileTree", driver="FileSystem") +``` +""" function create!(stack::StackedDataProject, name; kws...) for proj in stack.projects ds = create!(proj, name; kws...) 
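For reference, the end-to-end flow that the `create!` docstring above documents looks roughly like the following. This is a minimal sketch, assuming a writable TOML-backed data project is active in `DataSets.PROJECT` and that the `File`-level `open` methods forward the `write` keyword as in the filesystem driver changes in this series; the dataset name mirrors the docstring example and the written content is illustrative only.

```
using DataSets

# Register a new File dataset backed by an empty file created next to Data.toml.
ds = DataSets.create!("SomeFile", type="File", driver="FileSystem")

# Open the underlying file writable and fill in some content.
open(IO, ds, write=true) do io
    println(io, "some example content")
end

# Remove the project entry; datasets created this way are not "linked",
# so their on-disk storage is deleted as well.
DataSets.delete!("SomeFile")
```
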
diff --git a/src/file_data_projects.jl b/src/file_data_projects.jl index 7e069a1..d4a6f42 100644 --- a/src/file_data_projects.jl +++ b/src/file_data_projects.jl @@ -201,6 +201,7 @@ function create!(proj::AbstractTomlFileDataProject, name; ds = DataSet(conf) proj[ds.name] = ds + setfield!(ds, :project, proj) return ds end @@ -252,17 +253,14 @@ function Base.delete!(proj::TomlFileDataProject, name::AbstractString) driver = _find_driver(ds) delete_storage(proj, driver, ds) end - Base.delete!(inner_proj, name) + inner_proj end end #------------------------------------------------------------------------------- default_driver(proj::AbstractTomlFileDataProject) = FileSystemDriver() -project_root_path(proj) = error("No local path for data project type $(typeof(proj))") -project_root_path(proj::TomlFileDataProject) = dirname(proj.path) - #------------------------------------------------------------------------------ """ diff --git a/src/filesystem.jl b/src/filesystem.jl index fcf9faa..fb8b807 100644 --- a/src/filesystem.jl +++ b/src/filesystem.jl @@ -255,6 +255,40 @@ function local_data_abspath(::Nothing, path) end +# Return local filesystem absolute path for the dataset, as a string +function _filesystem_dataset_abspath(dataset) + config = dataset.storage + if haskey(config, "path") + pathstr = config["path"] + # Local absolute paths are not portable. Previously these were allowed + # in the "path" key, but those are now deprecated in favor of + # system-specific path keys unix_path or windows_path + if isabspath(pathstr) + Base.depwarn(""" + Absolute paths in Data.toml are deprecated. Instead, use relative + paths (separated with `/`) relative to the Data.toml location.""", + :connect_filesystem) + return pathstr + else + if '\\' in pathstr && Sys.iswindows() + # Heuristic deprecation warning for windows paths in Data.toml + Base.depwarn( + "Relative paths in Data.toml should be separated with '/' characters.", + :connect_filesystem) + pathstr = join(split(pathstr, '\\'), '/') + end + relpath = joinpath(split(pathstr, '/')...) + return local_data_abspath(data_project(dataset), relpath) + end + elseif haskey(config, "unix_path") && Sys.isunix() + return config["unix_path"] + elseif haskey(config, "windows_path") && Sys.iswindows() + return config["windows_path"] + else + error("No \"path\" key found for FileSystem storage driver.") + end +end + """ ## Metadata spec @@ -284,40 +318,12 @@ For FileTree: function open_dataset(driver::FileSystemDriver, dataset, write) config = dataset.storage # Paths keys can be in three forms documented above; - if haskey(config, "path") - pathstr = config["path"] - # Local absolute paths are not portable. Previously these were allowed - # in the "path" key, but those are now deprecated in favor of - # system-specific path keys unix_path or windows_path - if isabspath(pathstr) - Base.depwarn(""" - Absolute paths in Data.toml are deprecated. Instead, use relative - paths (separated with `/`) relative to the Data.toml location.""", - :connect_filesystem) - path = pathstr - else - if '\\' in pathstr && Sys.iswindows() - # Heuristic deprecation warning for windows paths in Data.toml - Base.depwarn( - "Relative paths in Data.toml should be separated with '/' characters.", - :connect_filesystem) - pathstr = join(split(pathstr, '\\'), '/') - end - relpath = joinpath(split(pathstr, '/')...) 
- path = local_data_abspath(data_project(dataset), relpath) - end - elseif haskey(config, "unix_path") && Sys.isunix() - path = config["unix_path"] - elseif haskey(config, "windows_path") && Sys.iswindows() - path = config["windows_path"] - else - error("No \"path\" key found for FileSystem storage driver.") - end type = config["type"] - if type in ("File", "Blob") + path = _filesystem_dataset_abspath(dataset) + if is_File_dtype(type) isfile(path) || throw(ArgumentError("$(repr(path)) should be a file")) storage = File(FileSystemRoot(path; write=write)) - elseif type in ("FileTree", "BlobTree") + elseif is_FileTree_dtype(type) isdir(path) || throw(ArgumentError("$(repr(path)) should be a directory")) storage = FileTree(FileSystemRoot(path; write=write)) path = dataspec_fragment_as_path(dataset) @@ -333,59 +339,54 @@ end function create_storage(proj, driver::FileSystemDriver, name::AbstractString; source::Union{Nothing,DataSet}=nothing, - dtype::Union{Nothing,AbstractString}=nothing, + type::Union{Nothing,AbstractString}=nothing, kws...) - # For other cases here, we're not linking to the original source, but # rather making a copy. if !isnothing(source) - dtype = source.storage["type"] + type = source.storage["type"] else - if isnothing(dtype) - throw(ArgumentError("Must provide one of `source` or `dtype`.")) + if isnothing(type) + throw(ArgumentError("Must provide one of `source` or `type`.")) end end has_slashes = '/' in name - local_name = has_slashes ? - joinpath(split(name, '/')) : - name + relpath = has_slashes ? joinpath(split(name, '/')) : name - # project_root_path() will fail - data_path = joinpath(project_root_path(proj), local_name) + data_path = local_data_abspath(proj, relpath) if ispath(data_path) error("Local path already exists: $data_path") end - if !(dtype in ("Blob", "BlobTree")) - error("Unknown storage type for FileSystemDriver: $dtype") + if !(type in ("File", "FileTree")) + error("Unknown storage type for FileSystemDriver: $type") end # Create file or directory if has_slashes mkpath(dirname(data_path)) end - if dtype == "Blob" + if is_File_dtype(type) touch(data_path) - elseif dtype == "BlobTree" + elseif is_FileTree_dtype(type) mkdir(data_path) end Dict( "driver"=>"FileSystem", - "type"=>dtype, - "path"=>"@__DIR__/$name", + "type"=>type, + "path"=>relpath, ) end - -function delete_storage(proj, driver::FileSystemDriver, ds::DataSet) - path = ds.storage["path"] - type = ds.storage["type"] - if type == "Blob" +function delete_storage(proj, driver::FileSystemDriver, dataset::DataSet) + path = _filesystem_dataset_abspath(dataset) + type = dataset.storage["type"] + if is_File_dtype(type) isfile(path) || throw(ArgumentError("$(repr(path)) should be a file")) rm(path) - elseif type == "BlobTree" + elseif is_FileTree_dtype(type) isdir(path) || throw(ArgumentError("$(repr(path)) should be a directory")) rm(path, recursive=true) else @@ -394,8 +395,6 @@ function delete_storage(proj, driver::FileSystemDriver, ds::DataSet) end - - #------------------------------------------------------------------------------- # Deprecations function Base.abspath(relpath::RelPath)
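
Taken together, these patches give storage drivers a small method surface: `open_dataset`, `close_dataset`, `create_storage` and `delete_storage`, registered with `add_storage_driver`. The sketch below shows how a hypothetical third-party driver could plug into that surface; the `"InMemory"` name, the `MemoryDriver` type and the `"key"` storage field are invented for illustration, and the returned `IOBuffer` only stands in for a proper `File`/`FileTree` storage object.

```
using DataSets

# Hypothetical driver that keeps each dataset's bytes in an in-process Dict.
struct MemoryDriver <: DataSets.AbstractDataDriver
    buffers::Dict{String,Vector{UInt8}}
end

# Called from open(dataset; write=...) and friends to produce a storage object.
function DataSets.open_dataset(driver::MemoryDriver, dataset::DataSet, write::Bool)
    buf = get!(driver.buffers, dataset.storage["key"], UInt8[])
    return IOBuffer(buf, read=true, write=write)
end

# Called when the enclosing open() scope ends (exc is set when an error occurred).
DataSets.close_dataset(storage::IOBuffer, exc=nothing) = close(storage)

# Called from create!(); returns the "storage" section for the new dataset config.
function DataSets.create_storage(proj, driver::MemoryDriver, name::AbstractString;
                                 source=nothing, type=nothing, kws...)
    driver.buffers[name] = UInt8[]
    return Dict("driver"=>"InMemory", "type"=>"File", "key"=>name)
end

# Called from delete!() for datasets that are not "linked".
function DataSets.delete_storage(proj, driver::MemoryDriver, ds::DataSet)
    Base.delete!(driver.buffers, ds.storage["key"])
end

DataSets.add_storage_driver("InMemory"=>MemoryDriver(Dict{String,Vector{UInt8}}()))
```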