diff --git a/Project.toml b/Project.toml index 6bf4305..4a06e6e 100644 --- a/Project.toml +++ b/Project.toml @@ -1,12 +1,14 @@ name = "DataSets" uuid = "c9661210-8a83-48f0-b833-72e62abce419" authors = ["Chris Foster and contributors"] -version = "0.2.2" +version = "0.2.3" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" ReplMaker = "b873ce64-0db9-51f5-a568-4457d8e49576" +ResourceContexts = "8d208092-d35c-4dd3-a0d7-8325f9cce6b4" SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" @@ -14,6 +16,7 @@ UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [compat] AbstractTrees = "0.3" ReplMaker = "0.2" +ResourceContexts = "0.1" TOML = "1" julia = "1.5" diff --git a/docs/dev.jl b/docs/dev.jl new file mode 100644 index 0000000..0df06d9 --- /dev/null +++ b/docs/dev.jl @@ -0,0 +1,2 @@ +using LiveServer +servedocs(doc_env=true, foldername=@__DIR__) diff --git a/docs/src/tutorial.md b/docs/src/tutorial.md index 15c830a..b07ea4d 100644 --- a/docs/src/tutorial.md +++ b/docs/src/tutorial.md @@ -10,7 +10,8 @@ end DocTestFilters = [ r"(?<=Project: \[).*$", r"path =.*", - r"@.*" + r"@.*", + r"(?<=IOStream\().*", ] ``` @@ -85,17 +86,42 @@ path = ".../DataSets/docs/src/data/file.txt" ## Loading Data -To load data, call the `open()` function on the `DataSet` and pass the desired -Julia type which will be returned. For example, to read the dataset named -`"a_text_file"` as a `String`, +You can call `open()` on a DataSet to inspect the data inside. `open()` will +return the [`Blob`](@ref) and [`BlobTree`](@ref) types for local files and +directories on disk. 
For example, ```jldoctest +julia> open(dataset("a_text_file")) +📄 @ .../DataSets/docs/src/data/file.txt + +julia> open(dataset("a_tree_example")) +📂 Tree @ .../DataSets/docs/src/data/csvset + 📄 1.csv + 📄 2.csv +``` + +Use the form `open(T, dataset)` to read the data as a specific type. `Blob` +data can be opened as `String`, `IO`, or `Vector{UInt8}`, depending on your +needs: + +```jldoctest +julia> io = open(IO, dataset("a_text_file")) +IOStream() + +julia> read(io, String) +"Hello world!\n" + +julia> buf = open(Vector{UInt8}, dataset("a_text_file")); + +julia> String(buf) +"Hello world!\n" + julia> open(String, dataset("a_text_file")) "Hello world!\n" ``` -It's also possible to open this data as an `IO` stream, in which case the do -block form should be used: +To ensure the dataset is closed again in a timely way (freeing any resources +such as file handles), you should use the scoped form, for example: ```jldoctest julia> open(IO, dataset("a_text_file")) do io @@ -106,10 +132,11 @@ julia> open(IO, dataset("a_text_file")) do io content = "Hello world!\n" ``` -Let's also inspect the tree example using the tree data type -[`BlobTree`](@ref). Such data trees can be indexed with path components to get -at the file [`Blob`](@ref)s inside, which in turn can be `open`ed to retrieve -the data. +Let's look at some tree-like data which is represented on local disk as a +folder or directory. Tree data is opened in Julia as the [`BlobTree`](@ref) +type and can be indexed with path components to get at the file [`Blob`](@ref)s +inside. In turn, we can `open()` one of the file blobs and look at the data +contained within. 
```jldoctest julia> tree = open(BlobTree, dataset("a_tree_example")) @@ -118,9 +145,9 @@ julia> tree = open(BlobTree, dataset("a_tree_example")) 📄 2.csv julia> tree["1.csv"] -📄 1.csv @ .../DataSets/test/data/csvset +📄 1.csv @ .../DataSets/docs/src/data/csvset -julia> Text(open(String, tree["1.csv"])) +julia> open(String, tree["1.csv"]) |> Text Name,Age "Aaron",23 "Harry",42 diff --git a/src/BlobTree.jl b/src/BlobTree.jl index 74741ad..1af898c 100644 --- a/src/BlobTree.jl +++ b/src/BlobTree.jl @@ -127,7 +127,7 @@ mapped into the program as an `IO` byte stream, or interpreted as a `String`. Blobs can be arranged into hierarchies "directories" via the `BlobTree` type. """ -struct Blob{Root} +mutable struct Blob{Root} root::Root path::RelPath end @@ -148,7 +148,7 @@ function AbstractTrees.printnode(io::IO, file::Blob) print(io, "📄 ", basename(file)) end -# Opening as Vector{UInt8} or as String uses IO interface +# Opening as Vector{UInt8} or as String defers to IO interface function Base.open(f::Function, ::Type{Vector{UInt8}}, file::Blob) open(IO, file.root, file.path) do io f(read(io)) # TODO: use Mmap? @@ -174,9 +174,51 @@ function Base.open(f::Function, ::Type{T}, file::Blob; kws...) where {T} open(f, T, file.root, file.path; kws...) end -# Unscoped form of open -function Base.open(::Type{T}, file::Blob; kws...) where {T} - open(identity, T, file; kws...) +# ResourceContexts.jl - based versions of the above. + +@! function Base.open(::Type{Vector{UInt8}}, file::Blob) + @context begin + # TODO: use Mmap? + read(@! open(IO, file.root, file.path)) + end +end + +@! function Base.open(::Type{String}, file::Blob) + @context begin + read(@!(open(IO, file.root, file.path)), String) + end +end + +# Default open-type for Blob is IO +@! function Base.open(file::Blob; kws...) + @! open(IO, file.root, file.path; kws...) +end + +# Opening Blob as itself is trivial +@! 
function Base.open(::Type{Blob}, file::Blob) + file +end + +# open with other types T defers to the underlying storage system +@! function Base.open(::Type{T}, file::Blob; kws...) where {T} + @! open(T, file.root, file.path; kws...) +end + +# Fallback implementation of `@! open(T, root, path)` based on enter_do. +# +# TODO: Update other backends to avoid calling this; using enter_do is pretty +# inefficient. +@! function Base.open(::Type{T}, root, path; kws...) where {T} + (res,) = @! enter_do(open, T, root, path; kws...) + res +end + +# Unscoped form of open for Blob +function Base.open(::Type{T}, blob::Blob; kws...) where {T} + @context begin + result = @! open(T, blob; kws...) + @! ResourceContexts.detach_context_cleanup(result) + end end # read() is also supported for `Blob`s @@ -230,7 +272,7 @@ julia> tree[path"csvset"] 📄 2.csv ``` """ -struct BlobTree{Root} <: AbstractBlobTree +mutable struct BlobTree{Root} <: AbstractBlobTree root::Root path::RelPath end @@ -267,6 +309,8 @@ Base.abspath(tree::BlobTree) = AbsPath(tree.root, tree.path) function Base.getindex(tree::BlobTree, path::RelPath) relpath = joinpath(tree.path, path) root = tree.root + # TODO: Make this more efficient by moving this work to the storage backend? + # Sort of like an equivalent of `stat`? if isdir(root, relpath) BlobTree(root, relpath) elseif isfile(root, relpath) @@ -283,9 +327,9 @@ function Base.getindex(tree::BlobTree, name::AbstractString) end # We've got a weird mishmash of path vs tree handling here. -# TODO: Can we refactor this to cleanly separate the filesystem commands (which -# take abstract paths?) from BlobTree and Blob which act as an abstraction over -# the filesystem or other storage mechanisms? +# TODO: Can we refactor this to cleanly separate the filesystem-like commands +# (which take abstract paths?) from BlobTree and Blob which act as an +# abstraction over the filesystem or other storage mechanisms? 
function Base.joinpath(tree::BlobTree, r::RelPath) AbsPath(tree.root, joinpath(tree.path, r)) end @@ -302,6 +346,10 @@ function Base.readdir(tree::BlobTree) readdir(tree.root, tree.path) end +function Base.keys(tree::BlobTree) + readdir(tree.root, tree.path) +end + function Base.rm(tree::BlobTree; kws...) rm(tree.root, tree.path; kws...) end @@ -315,4 +363,8 @@ function Base.open(f::Function, ::Type{BlobTree}, tree::BlobTree) f(tree) end +@! function Base.open(::Type{BlobTree}, tree::BlobTree) + tree +end + # Base.open(::Type{T}, file::Blob; kws...) where {T} = open(identity, T, file.root, file.path; kws...) diff --git a/src/DataSets.jl b/src/DataSets.jl index d11f926..52064e2 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -3,6 +3,7 @@ module DataSets using UUIDs using TOML using SHA +using ResourceContexts export DataSet, dataset, @datafunc, @datarun export Blob, BlobTree, newfile, newdir @@ -460,7 +461,7 @@ Additional projects may be added or removed from the stack with `pushfirst!`, """ PROJECT = StackedDataProject() -# deprecated. +# deprecated. TODO: Remove dependency on this from JuliaHub _current_project = DataProject() function __init__() @@ -487,7 +488,8 @@ end #------------------------------------------------------------------------------- # Storage layer and interface -_drivers = Dict{String,Any}() +const _storage_drivers_lock = ReentrantLock() +const _storage_drivers = Dict{String,Any}() """ add_storage_driver(driver_name=>storage_opener) @@ -505,54 +507,61 @@ Packages which define new storage drivers should generally call `add_storage_driver()` within their `__init__()` functions. 
""" function add_storage_driver((name,opener)::Pair) - _drivers[name] = opener + lock(_storage_drivers_lock) do + _storage_drivers[name] = opener + end end +#------------------------------------------------------------------------------- +# Functions for opening datasets + +# do-block form of open() function Base.open(f::Function, as_type, dataset::DataSet) storage_config = dataset.storage - driver = _drivers[storage_config["driver"]] + driver = lock(_storage_drivers_lock) do + _storage_drivers[storage_config["driver"]] + end driver(storage_config, dataset) do storage open(f, as_type, storage) end end -# For convenience, this non-scoped open() just returns the data handle as -# opened. See check_scoped_open for a way to help users avoid errors when using -# this (ie, if `identity` is not a valid argument to open() because resources -# would be closed before it returns). -# -# FIXME: Consider removing this. It should likely be replaced with `load()`, in -# analogy to FileIO.jl's load operation: -# * `load()` is "load the entire file into memory as such-and-such type" -# * `open()` is "open this resource, and run some function while it's open" -Base.open(as_type, conf::DataSet) = open(identity, as_type, conf) - -""" - check_scoped_open(func, as_type) +# Contexts-based form of open() +@! function Base.open(dataset::DataSet) + storage_config = dataset.storage + driver_name = storage_config["driver"] + driver = lock(_storage_drivers_lock) do + _storage_drivers[driver_name] + end + # Use `enter_do` because drivers don't yet use the ResourceContexts.jl mechanism + (storage,) = @! enter_do(driver, storage_config, dataset) + storage +end -Call `check_scoped_open(func, as_type) in your implementation of `open(func, -as_type, data)` if you clean up or `close()` resources by the time `open()` -returns. +@! function Base.open(as_type, dataset::DataSet) + storage = @! open(dataset) + @! 
open(as_type, storage) +end -That is, if the unscoped form `use(open(AsType, data))` is invalid and the -following scoped form required: +# TODO: +# Consider making a distinction between open() and load(). -``` -open(AsType, data) do x - use(x) +# Finalizer-based version of open() +function Base.open(dataset::DataSet) + @context begin + result = @! open(dataset) + @! ResourceContexts.detach_context_cleanup(result) + end end -``` -The dicotomy of resource handling techniques in `open()` are due to an -unresolved language design problem of how resource handling and cleanup should -work (see https://github.com/JuliaLang/julia/issues/7721). -""" -check_scoped_open(func, as_type) = nothing - -function check_scoped_open(func::typeof(identity), as_type) - throw(ArgumentError("You must use the scoped form `open(your_function, AsType, data)` to open as type $as_type")) +function Base.open(as_type, dataset::DataSet) + @context begin + result = @! open(as_type, dataset) + @! ResourceContexts.detach_context_cleanup(result) + end end +#------------------------------------------------------------------------------- # Application entry points include("entrypoint.jl") @@ -561,6 +570,7 @@ include("BlobTree.jl") # Builtin backends include("filesystem.jl") +include("DataTomlStorage.jl") # Backends # include("ZipTree.jl") diff --git a/src/DataTomlStorage.jl b/src/DataTomlStorage.jl new file mode 100644 index 0000000..5520bba --- /dev/null +++ b/src/DataTomlStorage.jl @@ -0,0 +1,127 @@ +using Base64 + +""" +Storage driver which keeps the data embedded within the TOML file itself. +Useful for small amounts of self-contained data. 
+ +## Metadata spec + +For Blob: +``` + [datasets.storage] + driver="TomlDataStorage" + type="Blob" + data=\$(base64encode(data)) +``` + +For BlobTree: +``` + [datasets.storage] + driver="TomlDataStorage" + type="BlobTree" + + [datasets.storage.data.\$(dirname1)] + "\$(filename1)" = \$(base64encode(data1)) + "\$(filename2)" = \$(base64encode(data2)) + + [datasets.storage.data.\$(dirname2)] + ... +``` +""" +struct TomlDataStorage + dataset::DataSet + data::Union{String,Dict{String,Any}} +end + +# Get TOML data at `path`, returning nothing if not present +function _getpath(storage::TomlDataStorage, path::RelPath) + x = storage.data + for c in path.components + x = get(x, c, nothing) + !isnothing(x) || return nothing + end + x +end + +#-------------------------------------------------- +# Storage data interface for trees + +Base.isdir(storage::TomlDataStorage, path::RelPath) = _getpath(storage, path) isa Dict +Base.isfile(storage::TomlDataStorage, path::RelPath) = _getpath(storage, path) isa String +Base.ispath(storage::TomlDataStorage, path::RelPath) = !isnothing(_getpath(storage, path)) + +Base.summary(io::IO, storage::TomlDataStorage) = print(io, "Data.toml") + +function Base.readdir(storage::TomlDataStorage, path::RelPath) + try + tree = _getpath(storage, path) + !isnothing(tree) || throw(KeyError(path)) # reported by the catch below + sort!(collect(keys(tree::AbstractDict))) + catch + error("TOML storage requires trees to be as TOML dictionaries") + end +end + +#-------------------------------------------------- +# Storage data interface for Blob + +function Base.open(func::Function, as_type::Type{IO}, + storage::TomlDataStorage, path; kws...) + @context func(@! open(as_type, storage, path; kws...)) +end + +@! function Base.open(::Type{Vector{UInt8}}, storage::TomlDataStorage, path; + write=false, read=!write, kws...) 
+ if write + error("Embedded data is read-only from within the DataSets interface") + end + try + str = _getpath(storage, path) + !isnothing(str) || throw(KeyError(path)) # reported by the catch below + base64decode(str::AbstractString) + catch + error("TOML storage requires data to be as base64 encoded strings") + end +end + +@! function Base.open(::Type{IO}, storage::TomlDataStorage, path; kws...) + buf = @! open(Vector{UInt8}, storage, path; kws...) + IOBuffer(buf) +end + + +# TODO: The following should be factored out and implemented generically +function Base.read(storage::TomlDataStorage, path::RelPath, ::Type{T}) where {T} + @context begin + io = @! open(IO, storage, path) + read(io, T) + end +end + +function Base.read(storage::TomlDataStorage, path::RelPath) + @context @! open(Vector{UInt8}, storage, path) +end + + +#------------------------------------------------------------------------------- +# Connect storage backend +function connect_toml_data_storage(f, config, dataset) + type = config["type"] + data = get(config, "data", nothing) + if type == "Blob" + if !(data isa AbstractString) + error("TOML data storage requires string data in the \"storage.data\" key") + end + f(Blob(TomlDataStorage(dataset, data))) + elseif type == "BlobTree" + if !(data isa AbstractDict) + error("TOML data storage requires a dictionary in the \"storage.data\" key") + end + f(BlobTree(TomlDataStorage(dataset, data))) + else + throw(ArgumentError("DataSet type $type not supported for data embedded in Data.toml")) + end +end + +add_storage_driver("TomlDataStorage"=>connect_toml_data_storage) + diff --git a/src/filesystem.jl b/src/filesystem.jl index 5b528e8..1b899eb 100644 --- a/src/filesystem.jl +++ b/src/filesystem.jl @@ -4,7 +4,7 @@ # abstract type AbstractFileSystemRoot end -# These underscore functions sys_abspath and sys_joinpath generate/joins OS-specific +# These functions sys_abspath and sys_joinpath generate/joins OS-specific # _local filesystem paths_ out of logical paths. 
They should be defined only # for trees which are rooted in the actual filesystem. function sys_abspath(root::AbstractFileSystemRoot, path::RelPath) @@ -17,26 +17,27 @@ sys_abspath(path::AbsPath) = sys_abspath(path.root, path.path) sys_abspath(tree::BlobTree) = sys_abspath(tree.root, tree.path) sys_abspath(file::Blob) = sys_abspath(file.root, file.path) +#-------------------------------------------------- +# Storage data interface for trees +# +# TODO: Formalize this interface! + +## 1. Query + # TODO: would it be better to express the following dispatch in terms of # AbsPath{<:AbstractFileSystemRoot} rather than usin double dispatch? + Base.isdir(root::AbstractFileSystemRoot, path::RelPath) = isdir(sys_abspath(root, path)) Base.isfile(root::AbstractFileSystemRoot, path::RelPath) = isfile(sys_abspath(root, path)) Base.ispath(root::AbstractFileSystemRoot, path::RelPath) = ispath(sys_abspath(root, path)) -Base.read(root::AbstractFileSystemRoot, path::RelPath, ::Type{T}) where {T} = - read(sys_abspath(root, path), T) -Base.read(root::AbstractFileSystemRoot, path::RelPath) where {T} = - read(sys_abspath(root, path)) Base.summary(io::IO, root::AbstractFileSystemRoot) = print(io, sys_abspath(root)) -function Base.open(f::Function, ::Type{IO}, root::AbstractFileSystemRoot, path; - write=false, read=!write, kws...) - if !iswriteable(root) && write - error("Error writing file at read-only path $path") - end - check_scoped_open(f, IO) - open(f, sys_abspath(root, path); read=read, write=write, kws...) -end +Base.readdir(root::AbstractFileSystemRoot, path::RelPath) = readdir(sys_abspath(root, path)) + +## 2. Mutation +# +# TODO: Likely requires rework! function Base.mkdir(root::AbstractFileSystemRoot, path::RelPath; kws...) if !iswriteable(root) @@ -50,9 +51,49 @@ function Base.rm(root::AbstractFileSystemRoot, path::RelPath; kws...) rm(sys_abspath(root,path); kws...) 
end -Base.readdir(root::AbstractFileSystemRoot, path::RelPath) = readdir(sys_abspath(root, path)) +#-------------------------------------------------- +# Storage data interface for Blob + +# TODO: Make this the generic implementation for AbstractDataStorage +function Base.open(f::Function, as_type::Type{IO}, + root::AbstractFileSystemRoot, path; kws...) + @context f(@! open(as_type, root, path; kws...)) +end + +@! function Base.open(::Type{IO}, root::AbstractFileSystemRoot, path; + write=false, read=!write, kws...) + if !iswriteable(root) && write + error("Error writing file at read-only path $path") + end + @! open(sys_abspath(root, path); read=read, write=write, kws...) +end + +Base.read(root::AbstractFileSystemRoot, path::RelPath, ::Type{T}) where {T} = + read(sys_abspath(root, path), T) +Base.read(root::AbstractFileSystemRoot, path::RelPath) = + read(sys_abspath(root, path)) #-------------------------------------------------- +""" + +## Metadata spec + +For Blob: +``` + [datasets.storage] + driver="FileSystem" + type="Blob" + path=\$(path_to_file) +``` + +For BlobTree: +``` + [datasets.storage] + driver="FileSystem" + type="BlobTree" + path=\$(path_to_directory) +``` +""" struct FileSystemRoot <: AbstractFileSystemRoot path::String read::Bool @@ -68,12 +109,15 @@ iswriteable(root::FileSystemRoot) = root.write sys_abspath(root::FileSystemRoot) = root.path -# For use outside DataSets, we will assume the special case that abspath() with -# a RelPath refers to the current working directory on the local system. 
-Base.abspath(relpath::RelPath) = +function Base.abspath(relpath::RelPath) + Base.depwarn(""" + `abspath(::RelPath)` defaults to using `pwd()` as the root of the path + but this leads to fragile code so will be removed in the future""", + :abspath) AbsPath(FileSystemRoot(pwd(); write=true, read=true), relpath) +end -#-------------------------------------------------- +#------------------------------------------------------------------------------- # Infrastructure for a somewhat more functional interface for creating file # trees than the fully mutable version we usually use. diff --git a/test/Data.toml b/test/Data.toml index a357ac0..c313528 100644 --- a/test/Data.toml +++ b/test/Data.toml @@ -23,7 +23,6 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" # type="text" # parameters={encoding="UTF-8"} - #-------------------------------------------------- [[datasets]] description="Gzipped CSV example" @@ -55,3 +54,67 @@ uuid="e7fd7080-e346-4a68-9ca9-98593a99266a" # TODO: Add data maps here which expose it logically as a single CSV? + +#-------------------------------------------------- +[[datasets]] +description="A data blob embedded in the TOML" +name="embedded_blob" +uuid="b498f769-a7f6-4f67-8d74-40b770398f26" + + [datasets.storage] + driver="TomlDataStorage" + type="Blob" + data="AAAAAAAARUA=" + + +[[datasets]] +description="A data tree embedded in the TOML" +name="embedded_tree" +uuid="b498f769-a7f6-4f67-8d74-40b770398f26" + + [datasets.storage] + driver="TomlDataStorage" + type="BlobTree" + +# TOML.print(Dict("datasets"=>[Dict("storage"=>Dict("data"=>Dict(["d0$i"=>Dict(["$x.txt"=>base64encode("$i $x content") for x in ("a","b")]...) 
for i in 1:4]...)))])) + + [datasets.storage.data.d01] + "b.txt" = "MSBiIGNvbnRlbnQ=" + "a.txt" = "MSBhIGNvbnRlbnQ=" + + [datasets.storage.data.d02] + "b.txt" = "MiBiIGNvbnRlbnQ=" + "a.txt" = "MiBhIGNvbnRlbnQ=" + + [datasets.storage.data.d03] + "b.txt" = "MyBiIGNvbnRlbnQ=" + "a.txt" = "MyBhIGNvbnRlbnQ=" + + [datasets.storage.data.d04] + "b.txt" = "NCBiIGNvbnRlbnQ=" + "a.txt" = "NCBhIGNvbnRlbnQ=" + +#-------------------------------------------------- +[[datasets]] +description="Test old storage backend API, Blob" +name="old_backend_blob" +uuid="785b3cdc-428e-426f-a3f7-3f6ae88a9637" + + [datasets.storage] + driver="OldBackendAPI" + type="Blob" + data="eA==" + +[[datasets]] +description="Test old storage backend API, BlobTree" +name="old_backend_tree" +uuid="4af3a8a9-983b-487b-bfd8-804ca50b4a0c" + + [datasets.storage] + driver="OldBackendAPI" + type="BlobTree" + + [datasets.storage.data] + "b.txt" = "Yg==" + "a.txt" = "YQ==" + diff --git a/test/DataTomlStorage.jl b/test/DataTomlStorage.jl new file mode 100644 index 0000000..06b3343 --- /dev/null +++ b/test/DataTomlStorage.jl @@ -0,0 +1,49 @@ + +@testset "open() for DataSet" begin + proj = DataSets.load_project("Data.toml") + + blob_ds = dataset(proj, "embedded_blob") + @test open(blob_ds) isa Blob + @test open(String, blob_ds) == "\0\0\0\0\0\0E@" + @test read(open(blob_ds), Float64) === 42.0 + + @test open(IO, blob_ds) do io + read(io, String) + end == "\0\0\0\0\0\0E@" + + @context begin + @test @!(open(String, blob_ds)) == "\0\0\0\0\0\0E@" + + blob = @! open(blob_ds) + @test blob isa Blob + @test @!(open(String, blob)) == "\0\0\0\0\0\0E@" + + @test read(blob, Float64) === 42.0 + @test read(blob) == UInt8[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x45, 0x40] + end + + tree_ds = dataset(proj, "embedded_tree") + @test open(tree_ds) isa BlobTree + @test open(String, open(tree_ds)[path"d01/a.txt"]) == "1 a content" + @test open(String, open(tree_ds)[path"d02/b.txt"]) == "2 b content" + @context begin + tree = @! 
open(tree_ds) + @test tree isa BlobTree + + @test isdir(tree) + @test !isfile(tree) + + @test readdir(tree) == ["d01", "d02", "d03", "d04"] + @test readdir(tree["d01"]) == ["a.txt", "b.txt"] + + @test !isdir(tree[path"d01/a.txt"]) + @test isfile(tree[path"d01/a.txt"]) + + @test_throws ErrorException tree[path"nonexistent/a/b"] + @test_throws ErrorException tree["nonexistent"] + + @test @!(open(String, tree[path"d01/a.txt"])) == "1 a content" + @test @!(open(String, tree[path"d02/b.txt"])) == "2 b content" + end +end + diff --git a/test/backend_compat.jl b/test/backend_compat.jl new file mode 100644 index 0000000..8cfff10 --- /dev/null +++ b/test/backend_compat.jl @@ -0,0 +1,82 @@ +using Base64 + +# Compatibility for data backends which implement DataSets 0.2.0 storage +# interface. + +#------------------------------------------------------------------------------- +struct OldBackendAPI + data +end + +function _lookup_path(storage::OldBackendAPI, path) + # For backends which try to avoid depending on DataSets types + path = string(path) + isempty(path) ? storage.data : get(storage.data, path, nothing) +end + +function _get_data(storage::OldBackendAPI, path) + base64decode(_lookup_path(storage, path)) +end + +function Base.open(f::Function, ::Type{IO}, storage::OldBackendAPI, path; kws...) 
+ f(IOBuffer(_get_data(storage, path))) +end + +function Base.read(storage::OldBackendAPI, path, ::Type{T}) where {T} + read(IOBuffer(_get_data(storage, path)), T) +end + +function Base.read(storage::OldBackendAPI, path) + _get_data(storage, path) +end + +function Base.readdir(storage::OldBackendAPI, path) + if isempty(path) + sort!(collect(keys(storage.data))) + else + [] + end +end + +function Base.isdir(storage::OldBackendAPI, path) + path = string(path) + @assert storage.data isa Dict + isempty(path) +end + +function Base.isfile(storage::OldBackendAPI, path) + _lookup_path(storage, path) isa String +end + +function Base.ispath(storage::OldBackendAPI, path) + !isnothing(_lookup_path(storage, path)) +end + + +function connect_old_backend(f, config, ds) + storage = OldBackendAPI(config["data"]) + if config["type"] == "Blob" + f(Blob(storage)) + else + f(BlobTree(storage)) + end +end + +DataSets.add_storage_driver("OldBackendAPI"=>connect_old_backend) + +#------------------------------------------------------------------------------- +@testset "OldBackendAPI" begin + @test open(IO, dataset("old_backend_blob")) do io + read(io, String) + end == "x" + @test String(open(read, IO, dataset("old_backend_blob"))) == "x" + @test open(Vector{UInt8}, dataset("old_backend_blob")) == UInt8['x'] + @test read(open(dataset("old_backend_blob")), String) == "x" + @test read(open(dataset("old_backend_blob"))) == UInt8['x'] + + @test readdir(open(dataset("old_backend_tree"))) == ["a.txt", "b.txt"] + @test open(dataset("old_backend_tree"))[path"a.txt"] isa Blob + @test read(open(dataset("old_backend_tree"))[path"a.txt"], String) == "a" + @test read(open(dataset("old_backend_tree"))[path"b.txt"], String) == "b" +end + diff --git a/test/projects.jl b/test/projects.jl index f7477ab..f2bce97 100644 --- a/test/projects.jl +++ b/test/projects.jl @@ -16,13 +16,13 @@ using DataSets: @test isnothing(get(proj, "nonexistent_data", nothing)) # keys - @test sort(collect(keys(proj))) == 
["a_table", "a_text_file", "a_tree_example"] + @test sort(collect(keys(proj))) == ["a_table", "a_text_file", "a_tree_example", "embedded_blob", "embedded_tree", "old_backend_blob", "old_backend_tree"] @test haskey(proj, "a_text_file") @test !haskey(proj, "nonexistent_data") # iteration - @test sort(getproperty.(collect(proj), :name)) == ["a_table", "a_text_file", "a_tree_example"] - @test sort(first.(pairs(proj))) == ["a_table", "a_text_file", "a_tree_example"] + @test sort(getproperty.(collect(proj), :name)) == ["a_table", "a_text_file", "a_tree_example", "embedded_blob", "embedded_tree", "old_backend_blob", "old_backend_tree"] + @test sort(first.(pairs(proj))) == ["a_table", "a_text_file", "a_tree_example", "embedded_blob", "embedded_tree", "old_backend_blob", "old_backend_tree"] # identity @test project_name(proj) == abspath("Data.toml") @@ -99,7 +99,7 @@ end push!(proj, TomlFileDataProject(joinpath(@__DIR__, "active_project", "Data.toml"))) push!(proj, TomlFileDataProject(joinpath(@__DIR__, "Data.toml"))) - @test sort(collect(keys(proj))) == ["a_table", "a_text_file", "a_tree_example"] + @test sort(collect(keys(proj))) == ["a_table", "a_text_file", "a_tree_example", "embedded_blob", "embedded_tree", "old_backend_blob", "old_backend_tree"] # Data "a_text_file" should be found in the first project in the stack, # overriding the data of the same name in the second project. 
@test proj["a_text_file"].uuid == UUID("314996ef-12be-40d0-912c-9755af354fdb") diff --git a/test/runtests.jl b/test/runtests.jl index 1288117..9c50bf3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,7 @@ using DataSets using Test using UUIDs +using ResourceContexts using DataSets: FileSystemRoot @@ -37,21 +38,48 @@ end @test ds.uuid == UUID("b498f769-a7f6-4f67-8d74-40b770398f26") end +@testset "open() for DataSet" begin + proj = DataSets.load_project("Data.toml") + + text_data = dataset(proj, "a_text_file") + @test open(text_data) isa Blob + @test read(open(text_data), String) == "Hello world!\n" + @context begin + @test read(@!(open(text_data)), String) == "Hello world!\n" + end + + tree_data = dataset(proj, "a_tree_example") + @test open(tree_data) isa BlobTree + @context begin + @test @!(open(tree_data)) isa BlobTree + tree = @! open(tree_data) + @test readdir(tree) == ["1.csv", "2.csv"] + end +end + #------------------------------------------------------------------------------- -@testset "open() functions" begin +@testset "open() for Blob and BlobTree" begin blob = Blob(FileSystemRoot("data/file.txt")) @test open(identity, String, blob) == "Hello world!\n" @test String(open(identity, Vector{UInt8}, blob)) == "Hello world!\n" @test open(io->read(io,String), IO, blob) == "Hello world!\n" - @test open(io->read(io,String), IO, blob) == "Hello world!\n" @test open(identity, Blob, blob) === blob - # Unscoped form for types which support it. + # Unscoped forms @test open(String, blob) == "Hello world!\n" @test String(open(Vector{UInt8}, blob)) == "Hello world!\n" - @test_throws ArgumentError("You must use the scoped form `open(your_function, AsType, data)` to open as type IO") open(IO, blob) + @test read(open(IO, blob), String) == "Hello world!\n" tree = BlobTree(FileSystemRoot("data")) @test open(identity, BlobTree, tree) === tree + + # Context-based forms + @context begin + @test @!(open(String, blob)) == "Hello world!\n" + @test String(@! 
open(Vector{UInt8}, blob)) == "Hello world!\n" + @test read(@!(open(IO, blob)), String) == "Hello world!\n" + @test @!(open(Blob, blob)) === blob + @test @!(open(BlobTree, tree)) === tree + end end #------------------------------------------------------------------------------- @@ -134,3 +162,5 @@ end end include("projects.jl") +include("DataTomlStorage.jl") +include("backend_compat.jl")