diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2a6568f..c66200a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,6 +3,7 @@ on: push: branches: - master + - release-* tags: '*' pull_request: jobs: @@ -48,7 +49,7 @@ jobs: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@latest with: - version: '1' + version: '1.6' - run: julia --project=docs -e ' using Pkg; Pkg.develop(PackageSpec(; path=pwd())); diff --git a/.gitignore b/.gitignore index 2ece7b6..ba39cc5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1 @@ Manifest.toml -/docs/build -/docs/Manifest.toml diff --git a/Project.toml b/Project.toml index 14ebcaf..409bf9e 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "DataSets" uuid = "c9661210-8a83-48f0-b833-72e62abce419" authors = ["Chris Foster and contributors"] -version = "0.2.6" +version = "0.2.12" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" @@ -15,10 +15,15 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [compat] -AbstractTrees = "0.4" +AbstractTrees = "0.3,0.4" +Base64 = "<0.0.1, 1" +Markdown = "<0.0.1,1" +REPL = "<0.0.1, 1" ReplMaker = "0.2" ResourceContexts = "0.1,0.2" -TOML = "1" +SHA = "<0.0.1, 0.7, 1" +TOML = "<0.0.1, 1" +UUIDs = "<0.0.1, 1" julia = "1.5" [extras] diff --git a/docs/Manifest.toml b/docs/Manifest.toml new file mode 100644 index 0000000..c822d6d --- /dev/null +++ b/docs/Manifest.toml @@ -0,0 +1,147 @@ +# This file is machine-generated - editing it directly is not advised + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DocStringExtensions]] +deps = ["LibGit2", "Markdown", "Pkg", "Test"] +git-tree-sha1 = "9d4f64f79012636741cf01133158a54b24924c32" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.4" + +[[Documenter]] +deps = ["Base64", "Dates", "DocStringExtensions", "IOCapture", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] +git-tree-sha1 = "3ebb967819b284dc1e3c0422229b58a40a255649" +uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +version = "0.26.3" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[IOCapture]] +deps = ["Logging"] +git-tree-sha1 = "377252859f740c217b936cebcd918a44f9b53b59" +uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89" +version = "0.1.1" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[JSON]] +deps = ["Dates", "Mmap", "Parsers", "Unicode"] +git-tree-sha1 = "81690084b6198a2e1da36fcfda16eeca9f9f24e4" +uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +version = "0.21.1" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = 
"d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[Parsers]] +deps = ["Dates"] +git-tree-sha1 = "c8abc88faa3f7a3950832ac5d6e690881590d6dc" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "1.1.0" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/docs/Project.toml b/docs/Project.toml index 3a52a5d..dfa65cd 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,2 @@ [deps] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" - -[compat] -Documenter = "0.27" diff --git a/docs/src/Data.toml b/docs/src/Data.toml index 372f8d9..2602ba7 100644 --- a/docs/src/Data.toml +++ b/docs/src/Data.toml @@ -13,11 +13,11 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" [datasets.storage] # The name of the storage driver. driver="FileSystem" - # Data stored in FileSystem is either File (a file) or FileTree (a directory/folder) - type="File" + # Data stored in FileSystem is either Blob (a file) or BlobTree (a directory/folder) + type="Blob" # Path with posix `/` separators. - # Relative paths are relative to the location of Data.toml - path="data/file.txt" + # Use @__DIR__ for paths relative to Data.toml + path="@__DIR__/data/file.txt" # A second example [[datasets]] @@ -27,8 +27,8 @@ uuid="e7fd7080-e346-4a68-9ca9-98593a99266a" [datasets.storage] driver="FileSystem" - type="FileTree" - path="data/csvset" + type="BlobTree" + path="@__DIR__/data/csvset" # Further datasets can be added as desired # [[datasets]] diff --git a/docs/src/design.md b/docs/src/design.md index f060db9..7500881 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -93,7 +93,7 @@ names to `DataSet`s. Perhaps it also maintains the serialized `DataSet` information as well for those datasets which are not registered. It might be stored in a Data.toml, in analogy to Project.toml. -Maintaince of the data project should occur via a data REPL. 
+Maintenance of the data project should occur via a data REPL. ## Data Registries @@ -277,4 +277,3 @@ array of strings) is restricted to tabular data, but seems similar in spirit to DataSets.jl. * [FileTrees.jl](http://shashi.biz/FileTrees.jl) provides tools for representing and processing tree-structured data lazily and in parallel. - diff --git a/docs/src/reference.md b/docs/src/reference.md index 4334183..12a9bd9 100644 --- a/docs/src/reference.md +++ b/docs/src/reference.md @@ -3,18 +3,12 @@ ## Using datasets The primary mechanism for loading datasets is the `dataset` function, coupled -with `open()` to open the resulting `DataSet` as some Julia type. +with `open()` to open the resulting `DataSet` as some Julia type. In addition, +DataSets.jl provides two macros [`@datafunc`](@ref) and +[`@datarun`](@ref) to help in creating program entry points and running them. ```@docs dataset -``` - -In addition, DataSets.jl provides two macros [`@datafunc`](@ref) and -[`@datarun`](@ref) to help in creating program entry points and running them. -Note that these APIs aren't fully formed and might be deprecated before -DataSets-1.0. - -```@docs @datafunc @datarun ``` @@ -52,31 +46,23 @@ DataSets.ActiveDataProject DataSets.TomlFileDataProject ``` -### Modifying datasets - -The metadata for a dataset may be updated using `config!` - -```@docs -DataSets.config! -``` - ## Data Models for files and directories -DataSets provides some builtin data models [`File`](@ref) and -[`FileTree`](@ref) for accessin file- and directory-like data respectively. For +DataSets provides some builtin data models [`Blob`](@ref) and +[`BlobTree`](@ref) for accessing file- and directory-like data respectively. For modifying these, the functions [`newfile`](@ref) and [`newdir`](@ref) can be -used. +used, together with `setindex!` for `BlobTree`.
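+
+For example, a new tree can be built up and populated like this (a minimal
+sketch; it assumes `newfile` accepts a writer function, and the file name is
+illustrative):
+
+```julia
+dir = newdir()                      # a fresh, empty directory tree
+dir["greeting.txt"] = newfile() do io
+    println(io, "Hello world!")    # fill the new file via an IO stream
+end
+```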
```@docs -File -FileTree +Blob +BlobTree newfile newdir ``` ## Storage Drivers -To add a new kind of data storage backend, call [`DataSets.add_storage_driver`](@ref) +To add a new kind of data storage backend, implement a storage driver and +register it by calling [`DataSets.add_storage_driver`](@ref) ```@docs DataSets.add_storage_driver diff --git a/docs/src/tutorial.md b/docs/src/tutorial.md index acac62b..b07ea4d 100644 --- a/docs/src/tutorial.md +++ b/docs/src/tutorial.md @@ -12,7 +12,6 @@ DocTestFilters = [ r"path =.*", r"@.*", r"(?<=IOStream\().*", - r"(?<=TomlFileDataProject \[).*", ] ``` @@ -55,15 +54,13 @@ particular dataset: ```jldoctest julia> dataset("a_text_file") -DataSet instance: - name = "a_text_file" uuid = "b498f769-a7f6-4f67-8d74-40b770398f26" description = "A text file containing the standard greeting" [storage] driver = "FileSystem" -type = "File" +type = "Blob" path = ".../DataSets/docs/src/data/file.txt" ``` @@ -72,35 +69,38 @@ global configuration this is also possible: ```jldoctest julia> project = DataSets.load_project("src/Data.toml") -DataSets.TomlFileDataProject [.../DataSets/docs/src/Data.toml]: - ๐Ÿ“„ a_text_file => b498f769-a7f6-4f67-8d74-40b770398f26 - ๐Ÿ“ a_tree_example => e7fd7080-e346-4a68-9ca9-98593a99266a +DataSets.DataProject: + a_text_file => b498f769-a7f6-4f67-8d74-40b770398f26 + a_tree_example => e7fd7080-e346-4a68-9ca9-98593a99266a julia> dataset(project, "a_text_file") -DataSet instance: - name = "a_text_file" uuid = "b498f769-a7f6-4f67-8d74-40b770398f26" description = "A text file containing the standard greeting" [storage] driver = "FileSystem" -type = "File" +type = "Blob" path = ".../DataSets/docs/src/data/file.txt" ``` -## Working with `File` data +## Loading Data -The most basic type of dataset is the [`File`](@ref) which is a simple 1D array -of bytes (ie, a `Vector{UInt8}`; a blob). To access the file you can call -`open()` on the corresponding DataSet which will return a `File`. For example, +You can call `open()` on a DataSet to inspect the data inside. `open()` will +return the [`Blob`](@ref) and [`BlobTree`](@ref) types for local files and +directories on disk. For example, ```jldoctest julia> open(dataset("a_text_file")) ๐Ÿ“„ @ .../DataSets/docs/src/data/file.txt + +julia> open(dataset("a_tree_example")) +๐Ÿ“‚ Tree @ .../DataSets/docs/src/data/csvset + ๐Ÿ“„ 1.csv + ๐Ÿ“„ 2.csv ``` -Use the form `open(T, dataset)` to read the data as a specific type. `File` +Use the form `open(T, dataset)` to read the data as a specific type. `Blob` data can be opened as `String`, `IO`, or `Vector{UInt8}`, depending on your needs: @@ -121,7 +121,7 @@ julia> open(String, dataset("a_text_file")) ``` To ensure the dataset is closed again in a timely way (freeing any resources -such as file handles), you can use the scoped form, for example: +such as file handles), you should use the scoped form, for example: ```jldoctest julia> open(IO, dataset("a_text_file")) do io @@ -132,75 +132,27 @@ julia> open(IO, dataset("a_text_file")) do io content = "Hello world!\n" ``` -## Working with `FileTree` data - Let's look at some tree-like data which is represented on local disk as a -folder or directory. Tree data is represented in Julia as the -[`FileTree`](@ref) type and can be indexed with path components to get at the -[`File`](@ref)s inside. In turn, we can `open()` one of the file blobs and -look at the data contained within.
- -```jldoctest -julia> open(dataset("a_tree_example")) -๐Ÿ“‚ Tree @ .../DataSets/docs/src/data/csvset - ๐Ÿ“„ 1.csv - ๐Ÿ“„ 2.csv -``` - -A `FileTree` has a dictionary-like API: it's a map from `String` names to -`File`s or `FileTree` subtrees. Iterating over it yields each child of the tree -in turn. For example, to examine the content of all files in a tree: +folder or directory. Tree data is opened in Julia as the [`BlobTree`](@ref) +type and can be indexed with path components to get at the file [`Blob`](@ref)s +inside. In turn, we can `open()` one of the file blobs and look at the data +contained within. ```jldoctest -julia> tree = open(FileTree, dataset("a_tree_example")) +julia> tree = open(BlobTree, dataset("a_tree_example")) ๐Ÿ“‚ Tree @ .../DataSets/docs/src/data/csvset ๐Ÿ“„ 1.csv ๐Ÿ“„ 2.csv -julia> for file in tree - content = open(String, file) - @info "File content" file content - end -โ”Œ Info: File content -โ”‚ file = ๐Ÿ“„ 1.csv @ .../DataSets/docs/src/data/csvset -โ”” content = "Name,Age\n\"Aaron\",23\n\"Harry\",42\n" -โ”Œ Info: File content -โ”‚ file = ๐Ÿ“„ 2.csv @ .../DataSets/docs/src/data/csvset -โ”” content = "Name,Age\n\"Rose\",19\n\"Tom\",25\n" -``` - -To list the names of files and subtrees, use `keys()`, or `haskey()` to -determine the presence of a file name - -```jldoctest -julia> tree = open(FileTree, dataset("a_tree_example")); - -julia> keys(tree) -2-element Vector{String}: - "1.csv" - "2.csv" - -julia> haskey(tree, "not_there.csv") -false -``` - -To get a particular file, indexing can be used, and `isfile()` and `isdir()` -can be used to detect whether a child of a tree is a file or a subtree. - -```jldoctest -julia> tree = open(FileTree, dataset("a_tree_example")); - julia> tree["1.csv"] -๐Ÿ“„ 1.csv @ /home/chris/.julia/dev/DataSets/docs/src/data/csvset - -julia> isfile(tree["1.csv"]) -true +๐Ÿ“„ 1.csv @ .../DataSets/docs/src/data/csvset -julia> isdir(tree) -true +julia> open(String, tree["1.csv"]) |> Text +Name,Age +"Aaron",23 +"Harry",42 ``` - ## Program Entry Points Rather than manually using the `open()` functions as shown above, the @@ -208,14 +160,14 @@ Rather than manually using the `open()` functions as shown above, the into your program. For example, here we define an entry point called `main` which takes -* DataSet type `File`, presenting it as a `String` within the program -* DataSet type `FileTree`, presenting it as a `FileTree` within the program +* DataSet type `Blob`, presenting it as a `String` within the program +* DataSet type `BlobTree`, presenting it as a `BlobTree` within the program The `@datarun` macro allows you to call such program entry points, extracting named data sets from a given project. ```jldoctest -julia> @datafunc function main(x::File=>String, t::FileTree=>FileTree) +julia> @datafunc function main(x::Blob=>String, t::BlobTree=>BlobTree) @show x open(String, t["1.csv"]) do csv_data @show csv_data diff --git a/src/BlobTree.jl b/src/BlobTree.jl new file mode 100644 index 0000000..1af898c --- /dev/null +++ b/src/BlobTree.jl @@ -0,0 +1,370 @@ +# Many datasets have tree-like indices. Examples: +# +# Index Data +# +# * OS: directories files +# * Git: trees blobs +# * S3: prefixes blobs +# * HDF5 group typed data +# * Zip flattend directory(?) blobs +# + +import AbstractTrees: AbstractTrees, children + +#------------------------------------------------------------------------------- +abstract type AbstractBlobTree; end + +# The tree API + +# TODO: Should we have `istree` separate from `isdir`? 
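+#
+# A sketch of how generic code might use this API (the names `visit_tree` and
+# `visit_blob` are illustrative):
+#
+#     for x in tree
+#         isdir(x) ? visit_tree(x) : visit_blob(x)
+#     end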
+Base.isdir(x::AbstractBlobTree) = true +Base.isfile(tree::AbstractBlobTree) = false +Base.ispath(x::AbstractBlobTree) = true + +# Number of children is not known without a (potentially high-latency) call to +# an external resource +Base.IteratorSize(tree::AbstractBlobTree) = Base.SizeUnknown() + +function Base.iterate(tree::AbstractBlobTree, state=nothing) + if state == nothing + # By default, call `children(tree)` to eagerly get a list of children + # for iteration. + cs = children(tree) + itr = iterate(cs) + else + (cs, cstate) = state + itr = iterate(cs, cstate) + end + if itr == nothing + return nothing + else + (c, cstate) = itr + (c, (cs, cstate)) + end +end + +""" + children(tree::AbstractBlobTree) + +Return an array of the children of `tree`. A child `x` may abstractly either be +another tree (`children(x)` returns a collection) or a file, where `children(x)` +returns `()`. + +Note that this is subtly different from `readdir(path)` which returns relative +paths, or `readdir(path, join=true)` which returns absolute paths. +""" +function children(tree::AbstractBlobTree) + # TODO: Is dispatch to the root a correct default? + children(tree.root, tree.path) +end + + +""" + showtree([io,], tree) + +Pretty printing of file trees, in the spirit of the unix `tree` utility. +""" +function showtree(io::IO, tree::AbstractBlobTree; maxdepth=5) + println(io, "๐Ÿ“‚ ", tree) + _showtree(io, tree, "", maxdepth) +end + +struct ShownTree + tree +end +# Use a wrapper rather than defaulting to stdout so that this works in more +# functional environments such as Pluto.jl +showtree(tree::AbstractBlobTree) = ShownTree(tree) + +Base.show(io::IO, s::ShownTree) = showtree(io, s.tree) + +function _showtree(io::IO, tree::AbstractBlobTree, prefix, depth) + cs = children(tree) + for (i,x) in enumerate(cs) + islast = i == lastindex(cs) # TODO: won't work if children() is lazy + first_prefix = prefix * (islast ? "โ””โ”€โ”€" : "โ”œโ”€โ”€") + other_prefix = prefix * (islast ? " " : "โ”‚ย ย ") + if isdir(x) + print(io, first_prefix, "๐Ÿ“‚ ") + printstyled(io, basename(x), "\n", color=:light_blue, bold=true) + if depth > 1 + _showtree(io, x, other_prefix, depth-1) + else + print(io, other_prefix, 'โ‹ฎ') + end + else + println(io, first_prefix, " ", basename(x)) + end + end +end + +function Base.copy!(dst::AbstractBlobTree, src::AbstractBlobTree) + for x in src + newpath = joinpath(dst, basename(x)) + if isdir(x) + newdir = mkdir(newpath) + copy!(newdir, x) + else + open(x) do io_src + open(newpath, write=true) do io_dst + write(io_dst, io_src) + end + end + end + end +end + +#------------------------------------------------------------------------------- +""" + Blob(root) + Blob(root, relpath) + +`Blob` represents the location of a collection of unstructured binary data. The +location is a path `relpath` relative to some `root` data resource. + +A `Blob` can naturally be `open()`ed as a `Vector{UInt8}`, but can also be +mapped into the program as an `IO` byte stream, or interpreted as a `String`. + +Blobs can be arranged into hierarchies "directories" via the `BlobTree` type. 
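+
+# Example
+
+A minimal sketch, assuming the `"a_text_file"` dataset from the documentation:
+
+```
+file = open(Blob, dataset("a_text_file"))
+content = open(String, file)    # read the blob's bytes as a String
+```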
+""" +mutable struct Blob{Root} + root::Root + path::RelPath +end + +Blob(root) = Blob(root, RelPath()) + +Base.basename(file::Blob) = basename(file.path) +Base.abspath(file::Blob) = AbsPath(file.root, file.path) +Base.isdir(file::Blob) = false +Base.isfile(file::Blob) = true +Base.ispath(file::Blob) = true + +function Base.show(io::IO, ::MIME"text/plain", file::Blob) + print(io, "๐Ÿ“„ ", file.path, " @ ", summary(file.root)) +end + +function AbstractTrees.printnode(io::IO, file::Blob) + print(io, "๐Ÿ“„ ", basename(file)) +end + +# Opening as Vector{UInt8} or as String defers to IO interface +function Base.open(f::Function, ::Type{Vector{UInt8}}, file::Blob) + open(IO, file.root, file.path) do io + f(read(io)) # TODO: use Mmap? + end +end + +function Base.open(f::Function, ::Type{String}, file::Blob) + open(IO, file.root, file.path) do io + f(read(io, String)) + end +end + +# Default open-type for Blob is IO +Base.open(f::Function, file::Blob; kws...) = open(f, IO, file.root, file.path; kws...) + +# Opening Blob as itself is trivial +function Base.open(f::Function, ::Type{Blob}, file::Blob) + f(file) +end + +# open with other types T defers to the underlying storage system +function Base.open(f::Function, ::Type{T}, file::Blob; kws...) where {T} + open(f, T, file.root, file.path; kws...) +end + +# ResourceContexts.jl - based versions of the above. + +@! function Base.open(::Type{Vector{UInt8}}, file::Blob) + @context begin + # TODO: use Mmap? + read(@! open(IO, file.root, file.path)) + end +end + +@! function Base.open(::Type{String}, file::Blob) + @context begin + read(@!(open(IO, file.root, file.path)), String) + end +end + +# Default open-type for Blob is IO +@! function Base.open(file::Blob; kws...) + @! open(IO, file.root, file.path; kws...) +end + +# Opening Blob as itself is trivial +@! function Base.open(::Type{Blob}, file::Blob) + file +end + +# open with other types T defers to the underlying storage system +@! function Base.open(::Type{T}, file::Blob; kws...) where {T} + @! open(T, file.root, file.path; kws...) +end + +# Fallback implementation of `@! open(T, root, path)` based on enter_do. +# +# TODO: Update other backends to avoid calling this; using enter_do is pretty +# inefficient. +@! function Base.open(::Type{T}, root, path; kws...) where {T} + (res,) = @! enter_do(open, T, root, path; kws...) + res +end + +# Unscoped form of open for Blob +function Base.open(::Type{T}, blob::Blob; kws...) where {T} + @context begin + result = @! open(T, blob; kws...) + @! ResourceContexts.detach_context_cleanup(result) + end +end + +# read() is also supported for `Blob`s +Base.read(file::Blob) = read(file.root, file.path) +Base.read(file::Blob, ::Type{T}) where {T} = read(file.root, file.path, T) + + +# Support for opening AbsPath +# +# TODO: Put this elsewhere? +function Base.open(f::Function, ::Type{T}, path::AbsPath; kws...) where {T} + open(f, T, path.root, path.path; kws...) +end + +Base.open(f::Function, path::AbsPath; kws...) = open(f, IO, path.root, path.path; kws...) + + +#------------------------------------------------------------------------------- +""" + BlobTree(root) + +`BlobTree` is a "directory tree" like hierarchy which may have `Blob`s and +`BlobTree`s as children. + +The tree implements the `AbstracTrees.children()` interface and may be indexed +with paths to traverse the hierarchy down to the leaves ("files") which are of +type `Blob`. Individual leaves may be `open()`ed as various Julia types. 
+ +# Example + +Normally you'd construct these via the [`dataset`](@ref) function which takes +care of constructing the correct `root` object. However, here's a direct +demonstration: + +``` +julia> tree = BlobTree(DataSets.FileSystemRoot(dirname(pathof(DataSets))), path"../test/data") +๐Ÿ“‚ Tree ../test/data @ /home/chris/.julia/dev/DataSets/src + ๐Ÿ“ csvset + ๐Ÿ“„ file.txt + ๐Ÿ“„ foo.txt + ๐Ÿ“„ people.csv.gz + +julia> tree["csvset"] +๐Ÿ“‚ Tree ../test/data/csvset @ /home/chris/.julia/dev/DataSets/src + ๐Ÿ“„ 1.csv + ๐Ÿ“„ 2.csv + +julia> tree[path"csvset"] +๐Ÿ“‚ Tree ../test/data/csvset @ /home/chris/.julia/dev/DataSets/src + ๐Ÿ“„ 1.csv + ๐Ÿ“„ 2.csv +``` +""" +mutable struct BlobTree{Root} <: AbstractBlobTree + root::Root + path::RelPath +end + +BlobTree(root) = BlobTree(root, RelPath()) + +function AbstractTrees.printnode(io::IO, tree::BlobTree) + print(io, "๐Ÿ“‚ ", basename(tree)) +end + +function Base.show(io::IO, ::MIME"text/plain", tree::AbstractBlobTree) + # TODO: Ideally we'd use + # AbstractTrees.print_tree(io, tree, 1) + # However, this is hard to use efficiently; we'd need to implement a lazy + # `children()` for all our trees. It'd be much easier if + # `AbstractTrees.has_children()` was used consistently upstream. + cs = children(tree) + println(io, "๐Ÿ“‚ Tree ", tree.path, " @ ", summary(tree.root)) + for (i, c) in enumerate(cs) + print(io, " ", isdir(c) ? '๐Ÿ“' : '๐Ÿ“„', " ", basename(c)) + if i != length(cs) + print(io, '\n') + end + end +end + +Base.basename(tree::BlobTree) = basename(tree.path) +Base.abspath(tree::BlobTree) = AbsPath(tree.root, tree.path) + +# getindex vs joinpath: +# - getindex about indexing the datastrcutre; therefore it looks in the +# filesystem to only return things which exist. +# - joinpath just makes paths, not knowing whether they exist. +function Base.getindex(tree::BlobTree, path::RelPath) + relpath = joinpath(tree.path, path) + root = tree.root + # TODO: Make this more efficient by moving this work to the storage backend? + # Sort of like an equivalent of `stat`? + if isdir(root, relpath) + BlobTree(root, relpath) + elseif isfile(root, relpath) + Blob(root, relpath) + elseif ispath(root, relpath) + AbsPath(root, relpath) # Not great? + else + error("Path $relpath @ $root doesn't exist") + end +end + +function Base.getindex(tree::BlobTree, name::AbstractString) + getindex(tree, joinpath(RelPath(), name)) +end + +# We've got a weird mishmash of path vs tree handling here. +# TODO: Can we refactor this to cleanly separate the filesystem-like commands +# (which take abstract paths?) from BlobTree and Blob which act as an +# abstraction over the filesystem or other storage mechanisms? +function Base.joinpath(tree::BlobTree, r::RelPath) + AbsPath(tree.root, joinpath(tree.path, r)) +end + +function Base.joinpath(tree::BlobTree, s::AbstractString) + AbsPath(tree.root, joinpath(tree.path, s)) +end + +function Base.haskey(tree::BlobTree, name::AbstractString) + ispath(tree.root, joinpath(tree.path, name)) +end + +function Base.readdir(tree::BlobTree) + readdir(tree.root, tree.path) +end + +function Base.keys(tree::BlobTree) + readdir(tree.root, tree.path) +end + +function Base.rm(tree::BlobTree; kws...) + rm(tree.root, tree.path; kws...) +end + +function children(tree::BlobTree) + child_names = readdir(tree) + [tree[c] for c in child_names] +end + +function Base.open(f::Function, ::Type{BlobTree}, tree::BlobTree) + f(tree) +end + +@! function Base.open(::Type{BlobTree}, tree::BlobTree) + tree +end + +# Base.open(::Type{T}, file::Blob; kws...) 
where {T} = open(identity, T, file.root, file.path; kws...) diff --git a/src/DataSet.jl b/src/DataSet.jl deleted file mode 100644 index bfcafa0..0000000 --- a/src/DataSet.jl +++ /dev/null @@ -1,190 +0,0 @@ -""" -A `DataSet` is a metadata overlay for data held locally or remotely which is -unopinionated about the underlying storage mechanism. - -The data in a `DataSet` has a type which implies an index; the index can be -used to partition the data for processing. -""" -mutable struct DataSet - project # AbstractDataProject owning this DataSet - uuid::UUID # Unique identifier for the dataset. Use uuid4() to create these. - # The representation `conf` contains "configuration data" read directly from - # the TOML (or other data project source, eg json API etc) - conf - - function DataSet(project, conf) - _validate_dataset_config(conf) - new(project, UUID(conf["uuid"]), conf) - end -end - -DataSet(conf) = DataSet(nothing, conf) - -function _validate_dataset_config(conf) - _check_keys(conf, DataSet, ["uuid"=>String, "storage"=>Dict, "name"=>String]) - _check_keys(conf["storage"], DataSet, ["driver"=>String]) - _check_optional_keys(conf, - "description"=>AbstractString, - "tags"=>VectorOf(AbstractString)) - check_dataset_name(conf["name"]) -end - -function Base.show(io::IO, d::DataSet) - print(io, DataSet, "(name=$(repr(d.name)), uuid=$(repr(d.uuid)), #= โ€ฆ =#)") -end - -function Base.show(io::IO, ::MIME"text/plain", d::DataSet) - println(io, "DataSet instance:") - println(io) - TOML.print(io, d.conf) -end - -""" - is_valid_dataset_name(name) - -Check whether a dataset name is valid. Valid names include start with a letter -and may contain letters, numbers or `_`. Names may be hieracicial, with pieces -separated with forward slashes. Examples: - - my_data - my_data_1 - username/data - organization/project/data -""" -is_valid_dataset_name(name::AbstractString) = occursin(DATASET_NAME_REGEX, name) -# DataSet names disallow most punctuation for now, as it may be needed as -# delimiters in data-related syntax (eg, for the data REPL). -const DATASET_NAME_REGEX_STRING = raw""" -[[:alpha:]] -(?: - [-[:alnum:]_] | - / (?=[[:alpha:]]) -)* -""" -const DATASET_NAME_REGEX = Regex("^\n$(DATASET_NAME_REGEX_STRING)\n\$", "x") - -function make_valid_dataset_name(name) - if !is_valid_dataset_name(name) - name = replace(name, r"^[^[:alpha:]]+"=>"") - name = replace(name, '\\'=>'/') - name = replace(name, r"[^-[:alnum:]_/]"=>"_") - if !is_valid_dataset_name(name) - # best-effort fallback - name = "data" - end - end - return name -end - -function check_dataset_name(name::AbstractString) - if !is_valid_dataset_name(name) - error("DataSet name \"$name\" is invalid. 
DataSet names must start with a letter and can contain only letters, numbers, `_` or `/`.") - end -end - -#------------------------------------------------------------------------------- -# API for DataSet type -function Base.getproperty(d::DataSet, name::Symbol) - if name === :uuid - getfield(d, :uuid) - elseif name === :conf - getfield(d, :conf) - else - getfield(d, :conf)[string(name)] - end -end - -function Base.setproperty!(d::DataSet, name::Symbol, x) - config!(d; name=>x) -end - -Base.getindex(d::DataSet, name::AbstractString) = getindex(d.conf, name) -Base.haskey(d::DataSet, name::AbstractString) = haskey(d.conf, name) - -function data_project(dataset::DataSet) - return getfield(dataset, :project) -end - -# Split the fragment section as a '/' separated RelPath -function dataspec_fragment_as_path(d::DataSet) - if haskey(d, "dataspec") - fragment = get(d.dataspec, "fragment", nothing) - if !isnothing(fragment) - return RelPath(split(fragment, '/')) - end - end - return nothing -end - -function config!(dataset::DataSet; kws...) - config!(data_project(dataset), dataset; kws...) -end - -# The default case of a dataset config update when the update is independent of -# the project. (In general, projects may supply extra constraints.) -function config!(::Nothing, dataset::DataSet; kws...) - for (k,v) in pairs(kws) - if k in (:uuid, :name) - error("Cannot modify dataset config with key $k") - # TODO: elseif k === :storage - # Check consistency using storage driver API? - end - # TODO: Fold these schema checks in with _validate_dataset_config - # somehow. - if k === :description - if !(v isa AbstractString) - error("Dataset description must be a string") - end - elseif k === :tags - if !(v isa AbstractVector && all(x isa AbstractString for x in v)) - error("Dataset tags must be a vector of strings") - end - end - dataset.conf[string(k)] = v - end - return dataset -end - -#------------------------------------------------------------------------------- -# Functions for opening datasets - -# do-block form of open() -function Base.open(f::Function, as_type, dataset::DataSet) - storage_config = dataset.storage - driver = _find_driver(dataset) - driver(storage_config, dataset) do storage - open(f, as_type, storage) - end -end - -# Contexts-based form of open() -@! function Base.open(dataset::DataSet) - storage_config = dataset.storage - driver = _find_driver(dataset) - # Use `enter_do` because drivers don't yet use the ResourceContexts.jl mechanism - (storage,) = @! enter_do(driver, storage_config, dataset) - storage -end - -@! function Base.open(as_type, dataset::DataSet) - storage = @! open(dataset) - @! open(as_type, storage) -end - -# TODO: -# Consider making a distinction between open() and load(). - -# Finalizer-based version of open() -function Base.open(dataset::DataSet) - @context begin - result = @! open(dataset) - @! ResourceContexts.detach_context_cleanup(result) - end -end - -function Base.open(as_type, dataset::DataSet) - @context begin - result = @! open(as_type, dataset) - @! 
ResourceContexts.detach_context_cleanup(result) - end -end diff --git a/src/DataSets.jl b/src/DataSets.jl index 46bbe83..22dacb9 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -7,7 +7,7 @@ using ResourceContexts using Base: PkgId export DataSet, dataset, @datafunc, @datarun -export File, FileTree, newfile, newdir +export Blob, BlobTree, newfile, newdir """ The current DataSets version number @@ -17,6 +17,379 @@ const PACKAGE_VERSION = let VersionNumber(project["version"]) end +include("paths.jl") + +#------------------------------------------------------------------------------- + +""" +A `DataSet` is a metadata overlay for data held locally or remotely which is +unopinionated about the underlying storage mechanism. + +The data in a `DataSet` has a type which implies an index; the index can be +used to partition the data for processing. +""" +struct DataSet + # For now, the representation `conf` contains data read directly from the + # TOML. Once the design has settled we might get some explicit fields and + # do validation. + uuid::UUID # Unique identifier for the dataset. Use uuid4() to create these. + conf + + function DataSet(conf) + _check_keys(conf, DataSet, ["uuid"=>String, "storage"=>Dict, "name"=>String]) + _check_keys(conf["storage"], DataSet, ["driver"=>String]) + check_dataset_name(conf["name"]) + new(UUID(conf["uuid"]), conf) + end + + #= + name::String # Default name for convenience. + # The binding to an actual name is managed by the data + # project. + storage # Storage config and driver definition + maps::Vector{DataMap} + + # Generic dictionary of other properties... for now. Required properties + # will be moved + _other::Dict{Symbol,Any} + + #storage_id # unique identifier in storage backend, if it exists + #owner # Project or user who owns the data + #description::String + #type # Some representation of the type of data? + # # An array, blob, table, tree, etc + #cachable::Bool # Can the data be cached? It might not for data governance + # # reasons or it might change commonly. + ## A set of identifiers + #tags::Set{String} + =# +end + +_key_match(config, (k,T)::Pair) = haskey(config, k) && config[k] isa T +_key_match(config, k::String) = haskey(config, k) + +function _check_keys(config, context, keys) + missed_keys = filter(k->!_key_match(config, k), keys) + if !isempty(missed_keys) + error(""" + Missing expected keys in $context: + $missed_keys + + In TOML fragment: + $(sprint(TOML.print,config)) + """) + end +end + +""" + check_dataset_name(name) + +Check whether a dataset name is valid. + +Valid names must start with a letter or a number, the rest of the name can also contain `-` +and `_` characters. The names can also be hierarchical, with segments separated by forward +slashes (`/`) or (`.`). Each segment must also start with either a letter or a number. + +For example, the following dataset names are valid: + + my_data + my_data_1 + username/data + organization_name/project-name/data + 123user/456dataset--name + username/my_table.csv + dataset/v0.1.2 + +whereas names like this are invalid: + + __mydata__ + username/.git + my...dataset + +!!! note "Segment separators" + + In dataset names, both `/` and `.` are considered segment separators from a syntax + perspective. While DataSets.jl does not impose any specific interpretation on the + dataset name, it is recommended to use `/` to separate segments from a semantic + perspective, and to interpret each forward-slash-separated segment as a path separator. 
+ Periods would conventionally be used to separate file extensions within a segment. + + E.g. use `username/my-project-data/population.csv`, rather than + `username.my-project-data.population.csv` or something like that. +""" +function check_dataset_name(name::AbstractString) + if !occursin(DATASET_NAME_REGEX, name) + error("DataSet name \"$name\" is invalid. DataSet names must start with a letter or a number, and can contain only letters, numbers, `-` and `_`, or `/` and `.` as segment separators.") + end +end +# DataSet names disallow most punctuation for now, as it may be needed as +# delimiters in data-related syntax (eg, for the data REPL). +const DATASET_NAME_REGEX_STRING = raw""" +[[:alnum:]] +(?: + [-[:alnum:]_] | + \.(?=[[:alnum:]]) | + \/ (?=[[:alnum:]]) +)* +""" +const DATASET_NAME_REGEX = Regex("^\n$(DATASET_NAME_REGEX_STRING)\n\$", "x") + +# Hacky thing until we figure out which fields DataSet should actually have. +function Base.getproperty(d::DataSet, name::Symbol) + if name in fieldnames(DataSet) + return getfield(d, name) + else + getfield(d, :conf)[string(name)] + end +end + +Base.getindex(d::DataSet, name::AbstractString) = getindex(d.conf, name) +Base.haskey(d::DataSet, name::AbstractString) = haskey(d.conf, name) + +# Split the fragment section as a '/' separated RelPath +function dataspec_fragment_as_path(d::DataSet) + if haskey(d, "dataspec") + fragment = get(d.dataspec, "fragment", nothing) + if !isnothing(fragment) + return RelPath(split(fragment, '/')) + end + end + return nothing +end + +function Base.show(io::IO, d::DataSet) + print(io, DataSet, "(name=$(repr(d.name)), uuid=$(repr(d.uuid)), #= โ€ฆ =#)") +end + +function Base.show(io::IO, ::MIME"text/plain", d::DataSet) + TOML.print(io, d.conf) +end + + +#------------------------------------------------------------------------------- +""" +Subtypes of `AbstractDataProject` have the interface + +Must implement: + - `Base.get(project, dataset_name, default)` โ€” search + - `Base.keys(project)` - get dataset names + +Optional: + - `Base.iterate()` โ€” default implementation in terms of `keys` and `get` + - `Base.pairs()` โ€” default implementation in terms of `keys` and `get` + - `Base.haskey()` โ€” default implementation in terms of `get` + - `Base.getindex()` โ€” default implementation in terms of `get` + - `DataSets.project_name()` โ€” returns `nothing` by default. + +Provided by AbstractDataProject (should not be overridden): + - `DataSets.dataset()` - implemented in terms of `get` +""" +abstract type AbstractDataProject end + +function Base.getindex(proj::AbstractDataProject, name::AbstractString) + data = get(proj, name, nothing) + data != nothing || error("DataSet $(repr(name)) not found") + data +end + +""" + dataset(name) + dataset(project, name) + +Returns the [`DataSet`](@ref) with the given `name` from `project`. If omitted, +the global data environment [`DataSets.PROJECT`](@ref) will be used. + +The `DataSet` is *metadata*, but to use the actual *data* in your program you +need to use the `open` function to access the `DataSet`'s content as a given +Julia type. 
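+
+`name` is the name of the dataset, or more generally a "data specification": a
+URI-like object of the form `namespace/name?params#fragment`.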
+ +# Example + +To open a dataset named `"a_text_file"` and read the whole content as a String, + +```julia +content = open(String, dataset("a_text_file")) +``` + +To open the same dataset as an `IO` stream and read only the first line, + +```julia +open(IO, dataset("a_text_file")) do io + line = readline(io) + @info "The first line is" line +end +``` + +To open a directory as a browsable tree object, + +```julia +open(BlobTree, dataset("a_tree_example")) +``` +""" +function dataset(proj::AbstractDataProject, spec::AbstractString) + namestr, query, fragmentstr = _split_dataspec(spec) + + if isnothing(namestr) + throw(ArgumentError("Invalid dataset specification: $spec")) + end + + dataset = proj[namestr] + + if isnothing(query) && isnothing(fragmentstr) + return dataset + end + + # Enhance dataset with "dataspec" holding URL-like fragment & query + dataspec = Dict() + if !isnothing(query) + dataspec["query"] = Dict{String,Any}(query) + end + if !isnothing(fragmentstr) + dataspec["fragment"] = fragmentstr + end + + # We need to take care here with copy() to avoid modifying the original + # dataset configuration. + conf = copy(dataset.conf) + conf["dataspec"] = dataspec + + return DataSet(conf) +end + + +# Percent-decode a string according to the URI escaping rules. +# Vendored from URIs.jl for now to avoid depending on that entire package for +# this one function. +function _unescapeuri(str) + occursin("%", str) || return str + out = IOBuffer() + i = 1 + io = IOBuffer(str) + while !eof(io) + c = read(io, Char) + if c == '%' + c1 = read(io, Char) + c = read(io, Char) + write(out, parse(UInt8, string(c1, c); base=16)) + else + write(out, c) + end + end + return String(take!(out)) +end + +# Parse as a suffix of URI syntax +# name/of/dataset?param1=value1¶m2=value2#fragment +const DATASET_SPEC_REGEX = Regex( + """ + ^ + ($(DATASET_NAME_REGEX_STRING)) + (?:\\?([^#]*))? # query - a=b&c=d + (?:\\#(.*))? # fragment - ... + \$ + """, + "x", +) +function _split_dataspec(spec::AbstractString) + # Parse as a suffix of URI syntax + # name/of/dataset?param1=value1¶m2=value2#fragment + m = match(DATASET_SPEC_REGEX, spec) + if isnothing(m) + return nothing, nothing, nothing + end + namestr = m[1] + query = m[2] + fragmentstr = m[3] + + if !isnothing(query) + query = [_unescapeuri(x)=>_unescapeuri(y) for (x,y) in split.(split(query, '&'), '=')] + end + if !isnothing(fragmentstr) + fragmentstr = _unescapeuri(fragmentstr) + end + + namestr, query, fragmentstr +end + +function Base.haskey(proj::AbstractDataProject, name::AbstractString) + get(proj, name, nothing) !== nothing +end + +function Base.iterate(project::AbstractDataProject, state=nothing) + if isnothing(state) + ks = keys(project) + ks_itr = iterate(ks) + else + (ks, ks_state) = state + ks_itr = iterate(ks, ks_state) + end + if isnothing(ks_itr) + return nothing + end + (k, ks_state) = ks_itr + val = get(project, k, nothing) + if isnothing(val) + # val could be `nothing` if entries in the project are updated + # concurrently. (Eg, this might happen for data projects which are + # backed by the filesystem.) + return iterate(project, (ks, ks_state)) + end + (val, (ks, ks_state)) +end + +# Unknown size by default, due to the above get-based implementation of +# iterate, coupled with possible concurrent modification. 
+Base.IteratorSize(::AbstractDataProject) = Base.SizeUnknown() + +function Base.pairs(proj::AbstractDataProject) + ks = keys(proj) + (k=>d for (k,d) in (k=>get(proj, k, nothing) for k in ks) if !isnothing(d)) +end + +""" + project_name(data_project) + +Return the name of the given `data_project`. Ideally this can be used to +uniquely identify the project when modifying the project stack in +`DataSets.PROJECT`. For projects which were generated from +`JULIA_DATASETS_PATH`, this will be the expanded path component. + +Other types of projects will have to return something else. For example, remote +data projects may want to return a URI. For projects which have no obvious +identifier, `nothing` is returned. +""" +project_name(data_project::AbstractDataProject) = nothing + +data_drivers(proj::AbstractDataProject) = [] + +#------------------------------------------------------------------------------- +""" + DataProject + +A concrete data project is a collection of DataSets with associated names. +Names are unique within the project. +""" +struct DataProject <: AbstractDataProject + datasets::Dict{String,DataSet} + drivers::Vector{Dict{String,Any}} +end + +DataProject() = DataProject(Dict{String,DataSet}(), Vector{Dict{String,Any}}()) + +DataProject(project::AbstractDataProject) = DataProject(Dict(pairs(project)), + Vector{Dict{String,Any}}()) + +data_drivers(project::DataProject) = project.drivers + +function _fill_template(toml_path, toml_str) + # Super hacky templating for paths relative to the toml file. + # We really should have something a lot nicer here... + if Sys.iswindows() + toml_path = replace(toml_path, '\\'=>'/') + end + toml_str = replace(toml_str, "@__DIR__"=>toml_path) +end + """ `CURRENT_DATA_CONFIG_VERSION` is the current version of the data configuration format, as reflected in the Data.toml `data_config_version` key. This allows old @@ -58,53 +431,198 @@ name="" """ const CURRENT_DATA_CONFIG_VERSION = 1 -include("paths.jl") -include("DataSet.jl") -include("data_project.jl") -include("file_data_projects.jl") -include("storage_drivers.jl") +""" + load_project(path; auto_update=false) + load_project(config_dict) + +Load a data project from a system `path` referring to a TOML file. If +`auto_update` is true, the returned project will monitor the file for updates +and reload when necessary. + +Alternatively, create a `DataProject` from a an existing dictionary +`config_dict`, which should be in the Data.toml format. + +See also [`load_project!`](@ref). +""" +function load_project(path::AbstractString; auto_update=false) + sys_path = abspath(path) + auto_update ? TomlFileDataProject(sys_path) : + _load_project(read(sys_path,String), dirname(sys_path)) +end + +function load_project(config::AbstractDict; kws...) + _check_keys(config, "Data.toml", ["data_config_version"=>Integer, + "datasets"=>AbstractVector]) + format_ver = config["data_config_version"] + if format_ver > CURRENT_DATA_CONFIG_VERSION + error(""" + data_config_version=$format_ver is newer than supported. 
+ Consider upgrading to a newer version of DataSets.jl + """) + end + proj = DataProject() + for dataset_conf in config["datasets"] + dataset = DataSet(dataset_conf) + link_dataset(proj, dataset.name => dataset) + end + if haskey(config, "drivers") + _check_keys(config, DataProject, ["drivers"=>AbstractVector]) + for driver_conf in config["drivers"] + _check_keys(driver_conf, DataProject, ["type"=>String, "name"=>String, "module"=>Dict]) + _check_keys(driver_conf["module"], DataProject, ["name"=>String, "uuid"=>String]) + push!(proj.drivers, driver_conf) + end + end + proj +end + +# TODO: Deprecate this? +function load_project(path::AbstractPath; kws) + load_project(sys_abspath(abspath(path)); kws...) +end + +function link_dataset(proj::DataProject, (name,data)::Pair) + proj.datasets[name] = data +end + +link_dataset(proj::DataProject, d::DataSet) = link_dataset(proj, d.name=>d) + +function unlink_dataset(proj::DataProject, name::AbstractString) + if !haskey(proj.datasets, name) + throw(ArgumentError("No dataset \"$name\" in data project")) + end + d = proj.datasets[name] + delete!(proj.datasets, name) + d +end + +function Base.get(proj::DataProject, name::AbstractString, default) + get(proj.datasets, name, default) +end + +Base.keys(proj::DataProject) = keys(proj.datasets) + +function Base.iterate(proj::DataProject, state=nothing) + # proj.datasets iterates key=>value; need to rejig it to iterate values. + itr = isnothing(state) ? iterate(proj.datasets) : iterate(proj.datasets, state) + isnothing(itr) && return nothing + (x, state) = itr + (x.second, state) +end + +function Base.show(io::IO, ::MIME"text/plain", project::AbstractDataProject) + datasets = collect(pairs(project)) + summary(io, project) + println(io, ":") + if isempty(datasets) + print(io, " (empty)") + return + end + sorted = sort(datasets, by=first) + maxwidth = maximum(textwidth.(first.(sorted))) + for (i, (name, data)) in enumerate(sorted) + pad = maxwidth - textwidth(name) + storagetype = get(data.storage, "type", nothing) + icon = storagetype == "Blob" ? '๐Ÿ“„' : + storagetype == "BlobTree" ? '๐Ÿ“' : + 'โ“' + print(io, " ", icon, ' ', name, ' '^pad, " => ", data.uuid) + if i < length(sorted) + println(io) + end + end +end + +function Base.summary(io::IO, project::AbstractDataProject) + print(io, typeof(project)) + name = project_name(project) + if !isnothing(name) + print(io, " [", name, "]") + end +end #------------------------------------------------------------------------------- -# Global datasets configuration for current Julia session +""" + StackedDataProject() + StackedDataProject(projects) + +Search stack of AbstractDataProjects, where projects are searched from the +first to last element of `projects`. + +Additional projects may be added or removed from the stack with `pushfirst!`, +`push!` and `empty!`. + +See also [`DataSets.PROJECT`](@ref). +""" +struct StackedDataProject <: AbstractDataProject + projects::Vector +end + +StackedDataProject() = StackedDataProject([]) + +data_drivers(stack::StackedDataProject) = vcat(data_drivers.(stack.projects)...) + +function Base.keys(stack::StackedDataProject) + names = [] + for project in stack.projects + append!(names, keys(project)) + end + unique(names) +end + +function Base.get(stack::StackedDataProject, name::AbstractString, default) + for project in stack.projects + d = get(project, name, nothing) + if !isnothing(d) + return d + end + end +end + +# API for manipulating the stack. 
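+#
+# For example, a custom project can be put at the front of the global search
+# stack, so that its datasets shadow later entries (a sketch using the
+# `load_project` function defined above):
+#
+#     pushfirst!(DataSets.PROJECT, DataSets.load_project("Data.toml"))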
+Base.push!(stack::StackedDataProject, project) = push!(stack.projects, project) +Base.pushfirst!(stack::StackedDataProject, project) = pushfirst!(stack.projects, project) +Base.popfirst!(stack::StackedDataProject) = popfirst!(stack.projects) +Base.pop!(stack::StackedDataProject) = pop!(stack.projects) +Base.empty!(stack::StackedDataProject) = empty!(stack.projects) + +function Base.show(io::IO, mime::MIME"text/plain", stack::StackedDataProject) + summary(io, stack) + println(io, ":") + for (i,project) in enumerate(stack.projects) + # show(io, mime, project) + # indent each project + str = sprint(show, mime, project) + print(io, join(" " .* split(str, "\n"), "\n")) + i != length(stack.projects) && println(io) + end +end + +include("file_data_projects.jl") -function data_project_from_path(path; depot_paths) +function expand_project_path(path) if path == "@" - ActiveDataProject() + return path elseif path == "" - # We will not throw an error here because this gets call in __init__, and we - # do not want to interrupt the loading of the package. Instead, we omit this - # project. - if isempty(depot_paths) - @warn "Julia depot data project (for an empty dataset path) can not be constructed because DEPOT_PATH is empty." - return nothing - end - depot = first(depot_paths) - # Julia is perfectly happy with DEPOT_PATHs that are not absolute, and hence their - # interpretation changes when the user cd-s around in their session. - # - # https://github.com/JuliaLang/julia/issues/44958 - # - # To offer a little bit more reliability here for the user, we absolutize the - # path when DataSets gets loaded, so that things would not be affected by the - # user changing directories. - if !isabspath(depot) - depot = abspath(expanduser(depot)) - @warn "Julia depot path ($(first(depot_paths))) not absolute. Fixing data project path relative to current working directory." depot - end - TomlFileDataProject(joinpath(depot, "datasets", "Data.toml")) + return joinpath(homedir(), ".julia", "datasets", "Data.toml") else - # In other cases, we expect a reasonable absolute (or relative) path from - # the user, which can either points directly to a file, unless it is an existing - # directory. path = abspath(expanduser(path)) if isdir(path) path = joinpath(path, "Data.toml") end - TomlFileDataProject(path) + end + path +end + +function data_project_from_path(path) + if path == "@" + ActiveDataProject() + else + TomlFileDataProject(expand_project_path(path)) end end -function create_project_stack(env, depot_paths) +function create_project_stack(env) stack = [] env_search_path = get(env, "JULIA_DATASETS_PATH", nothing) if isnothing(env_search_path) @@ -114,13 +632,15 @@ function create_project_stack(env, depot_paths) split(env_search_path, Sys.iswindows() ? ';' : ':') end for path in paths - project = data_project_from_path(path; depot_paths) - isnothing(project) && continue + project = data_project_from_path(path) push!(stack, project) end StackedDataProject(stack) end +#------------------------------------------------------------------------------- +# Global datasets configuration for current Julia session + # Global stack of data projects, with the top of the stack being searched # first. """ @@ -140,13 +660,11 @@ interpreted as follows: For directories, the filename "Data.toml" is implicitly appended. `expanduser()` is used to expand the user's home directory. - As in `DEPOT_PATH`, an *empty* path component means the user's default - Julia depot (e.g. 
`~/.julia/datasets`), determined by the first element - of `DEPOT_PATH`. + Julia home directory, `joinpath(homedir(), ".julia", "datasets")` -This simplified version of the code loading rules (`LOAD_PATH`/`DEPOT_PATH``) is +This simplified version of the code loading rules (LOAD_PATH/DEPOT_PATH) is used as it seems unlikely that we'll want data location to be version- -dependent in the same way that that code is. Note that any changes to `DEPOT_PATH` -after `DataSets` has been loaded do not affect `DataSets.PROJECT`. +dependent in the same way that that code is. Unlike `LOAD_PATH`, `JULIA_DATASETS_PATH` is represented inside the program as a `StackedDataProject`, and users can add custom projects by defining their own @@ -157,6 +675,9 @@ Additional projects may be added or removed from the stack with `pushfirst!`, """ PROJECT = StackedDataProject() +# deprecated. TODO: Remove dependency on this from JuliaHub +_current_project = DataProject() + _isprecompiling() = ccall(:jl_generating_output, Cint, ()) == 1 function __init__() @@ -164,7 +685,7 @@ function __init__() # be unnecessary and can cause problems if those driver modules use # Requires-like code loading. if !_isprecompiling() - global PROJECT = create_project_stack(ENV, DEPOT_PATH) + global PROJECT = create_project_stack(ENV) for proj in PROJECT.projects try add_storage_driver(proj) @@ -226,43 +747,6 @@ function _invoke_init_cb(f::Base.Callable) end end -""" - dataset(name) - dataset(project, name) - -Returns the [`DataSet`](@ref) with the given `name` from `project`. If omitted, -the global data environment [`DataSets.PROJECT`](@ref) will be used. - -The `DataSet` is *metadata*, but to use the actual *data* in your program you -need to use the `open` function to access the `DataSet`'s content as a given -Julia type. - -`name` is the name of the dataset, or more generally a "data specification": a -URI-like object of the form `namespace/name?params#fragment`. - -# Example - -To open a dataset named `"a_text_file"` and read the whole content as a String, - -```julia -content = open(String, dataset("a_text_file")) -``` - -To open the same dataset as an `IO` stream and read only the first line, - -```julia -open(IO, dataset("a_text_file")) do io - line = readline(io) - @info "The first line is" line -end -``` - -To open a directory as a browsable tree object, - -```julia -open(FileTree, dataset("a_tree_example")) -``` -""" dataset(name) = dataset(PROJECT, name) """ @@ -281,19 +765,146 @@ function load_project!(path_or_config) _current_project = DataProject(new_project) end +#------------------------------------------------------------------------------- +# Storage layer and interface + +const _storage_drivers_lock = ReentrantLock() +const _storage_drivers = Dict{String,Any}() + +""" + add_storage_driver(driver_name=>storage_opener) + +Associate DataSet storage driver named `driver_name` with `storage_opener`. +When a `dataset` with `storage.driver == driver_name` is opened, +`storage_opener(user_func, storage_config, dataset)` will be called. Any +existing storage driver registered to `driver_name` will be overwritten. + +As a matter of convention, `storage_opener` should generally take configuration +from `storage_config` which is just `dataset.storage`. But to avoid config +duplication it may also use the content of `dataset`, (for example, dataset.uuid). + +Packages which define new storage drivers should generally call +`add_storage_driver()` within their `__init__()` functions. 
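+
+# Example
+
+A minimal sketch of a driver package registering itself; the driver name and
+the `open_my_storage` function are illustrative, not part of DataSets:
+
+```
+function open_my_storage(user_func, storage_config, dataset)
+    storage = nothing  # ... connect to the backend using storage_config ...
+    user_func(storage)
+end
+
+function __init__()
+    DataSets.add_storage_driver("MyStorage"=>open_my_storage)
+end
+```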
+""" +function add_storage_driver((name,opener)::Pair) + lock(_storage_drivers_lock) do + _storage_drivers[name] = opener + end +end + +function add_storage_driver(project::AbstractDataProject) + for conf in data_drivers(project) + pkgid = PkgId(UUID(conf["module"]["uuid"]), conf["module"]["name"]) + if Base.haskey(Base.package_locks, pkgid) + # Hack: Avoid triggering another call to require() for packages + # which are already in the process of being loaded. (This would + # result in a deadlock!) + # + # Obviously this depends on Base internals... + continue + end + mod = Base.require(pkgid) + #= + # TODO: Improve driver loading invariants. + # + # The difficulty here is that there's two possible ways for drivers to + # work: + # 1. The driver depends explicitly on `using DataSets`, so + # DataSets.__init__ is called *before* the Driver.__init__. + # 2. The driver uses a Requires-like mechanism to support multiple + # incompatible DataSets versions, so Driver.__init__ can occur + # *before* DataSets.__init__. + # + # This makes it hard for DataSets to check which drivers are added by a + # module: In case (2), the following check fails when the driver is + # loaded before DataSets and in case (1) we hit the double-require + # problem, resulting in the Base.package_locks bailout which disables + # the check below. + # + if conf["type"] == "storage" + driver_name = conf["name"] + # `mod` is assumed to run add_storage_driver() inside its __init__, + # unless the symbol mod.datasets_load_hook exists (in which case we + # call this instead). + lock(_storage_drivers_lock) do + get(_storage_drivers, driver_name) do + error("Package $pkgid did not provide storage driver $driver_name") + end + end + end + =# + end +end + +function _find_driver(dataset) + storage_config = dataset.storage + driver_name = get(storage_config, "driver") do + error("`storage.driver` configuration not found for dataset $(dataset.name)") + end + driver = lock(_storage_drivers_lock) do + get(_storage_drivers, driver_name) do + error(""" + Storage driver $(repr(driver_name)) not found for dataset $(dataset.name). + Current drivers are $(collect(keys(_storage_drivers))) + """) + end + end +end #------------------------------------------------------------------------------- -include("utils.jl") +# Functions for opening datasets + +# do-block form of open() +function Base.open(f::Function, as_type, dataset::DataSet) + storage_config = dataset.storage + driver = _find_driver(dataset) + driver(storage_config, dataset) do storage + open(f, as_type, storage) + end +end + +# Contexts-based form of open() +@! function Base.open(dataset::DataSet) + storage_config = dataset.storage + driver = _find_driver(dataset) + # Use `enter_do` because drivers don't yet use the ResourceContexts.jl mechanism + (storage,) = @! enter_do(driver, storage_config, dataset) + storage +end + +@! function Base.open(as_type, dataset::DataSet) + storage = @! open(dataset) + @! open(as_type, storage) +end + +# TODO: +# Consider making a distinction between open() and load(). +# Finalizer-based version of open() +function Base.open(dataset::DataSet) + @context begin + result = @! open(dataset) + @! ResourceContexts.detach_context_cleanup(result) + end +end + +function Base.open(as_type, dataset::DataSet) + @context begin + result = @! open(as_type, dataset) + @! 
ResourceContexts.detach_context_cleanup(result) + end +end + +#------------------------------------------------------------------------------- # Application entry points include("entrypoint.jl") # Builtin Data models -include("FileTree.jl") +include("BlobTree.jl") # Builtin backends include("filesystem.jl") -include("TomlDataStorage.jl") +include("DataTomlStorage.jl") # Backends # include("ZipTree.jl") @@ -302,7 +913,4 @@ include("TomlDataStorage.jl") # Application-level stuff include("repl.jl") -Base.@deprecate_binding Blob File -Base.@deprecate_binding BlobTree FileTree - end diff --git a/src/TomlDataStorage.jl b/src/DataTomlStorage.jl similarity index 92% rename from src/TomlDataStorage.jl rename to src/DataTomlStorage.jl index 5271869..5520bba 100644 --- a/src/TomlDataStorage.jl +++ b/src/DataTomlStorage.jl @@ -6,19 +6,19 @@ Useful for small amounts of self-contained data. ## Metadata spec -For File: +For Blob: ``` [datasets.storage] driver="TomlDataStorage" - type="File" + type="Blob" data=\$(base64encode(data)) ``` -For FileTree: +For BlobTree: ``` [datasets.storage] driver="TomlDataStorage" - type="FileTree" + type="BlobTree" [datasets.storage.data.\$(dirname1)] "\$(filename1)" = \$(base64encode(data1)) @@ -63,7 +63,7 @@ function Base.readdir(storage::TomlDataStorage, path::RelPath) end #-------------------------------------------------- -# Storage data interface for File +# Storage data interface for Blob function Base.open(func::Function, as_type::Type{IO}, storage::TomlDataStorage, path; kws...) @@ -108,16 +108,16 @@ end function connect_toml_data_storage(f, config, dataset) type = config["type"] data = get(config, "data", nothing) - if type in ("File", "Blob") + if type == "Blob" if !(data isa AbstractString) error("TOML data storage requires string data in the \"storage.data\" key") end - f(File(TomlDataStorage(dataset, data))) - elseif type in ("FileTree", "BlobTree") + f(Blob(TomlDataStorage(dataset, data))) + elseif type == "BlobTree" if !(data isa AbstractDict) error("TOML data storage requires a dictionary in the \"storage.data\" key") end - f(FileTree(TomlDataStorage(dataset, data))) + f(BlobTree(TomlDataStorage(dataset, data))) else throw(ArgumentError("DataSet type $type not supported for data embedded in Data.toml")) end diff --git a/src/FileTree.jl b/src/FileTree.jl deleted file mode 100644 index 2a354d7..0000000 --- a/src/FileTree.jl +++ /dev/null @@ -1,523 +0,0 @@ -# Many datasets have tree-like indices. Examples: -# -# Index Data -# -# * OS: directories files -# * Git: trees blobs -# * S3: prefixes blobs -# * HDF5 group typed data -# * Zip flattend directory(?) blobs -# - -import AbstractTrees: AbstractTrees, children - -#------------------------------------------------------------------------------- -abstract type AbstractFileTree; end - -# The tree API - -# TODO: Should we have `istree` separate from `isdir`? -Base.isdir(x::AbstractFileTree) = true -Base.isfile(tree::AbstractFileTree) = false -Base.ispath(x::AbstractFileTree) = true - -# Number of children is not known without a (potentially high-latency) call to -# an external resource -Base.IteratorSize(tree::AbstractFileTree) = Base.SizeUnknown() - -function Base.iterate(tree::AbstractFileTree, state=nothing) - if state == nothing - # By default, call `children(tree)` to eagerly get a list of children - # for iteration. 
- cs = children(tree) - itr = iterate(cs) - else - (cs, cstate) = state - itr = iterate(cs, cstate) - end - if itr == nothing - return nothing - else - (c, cstate) = itr - (c, (cs, cstate)) - end -end - -""" - showtree([io,], tree) - -Pretty printing of file trees, in the spirit of the unix `tree` utility. -""" -function showtree(io::IO, tree::AbstractFileTree; maxdepth=5) - println(io, "๐Ÿ“‚ ", tree) - _showtree(io, tree, "", maxdepth) -end - -struct ShownTree - tree -end -# Use a wrapper rather than defaulting to stdout so that this works in more -# functional environments such as Pluto.jl -showtree(tree::AbstractFileTree) = ShownTree(tree) - -Base.show(io::IO, s::ShownTree) = showtree(io, s.tree) - -function _showtree(io::IO, tree::AbstractFileTree, prefix, depth) - cs = children(tree) - for (i,x) in enumerate(cs) - islast = i == lastindex(cs) # TODO: won't work if children() is lazy - first_prefix = prefix * (islast ? "โ””โ”€โ”€" : "โ”œโ”€โ”€") - other_prefix = prefix * (islast ? " " : "โ”‚ย ย ") - if isdir(x) - print(io, first_prefix, "๐Ÿ“‚ ") - printstyled(io, basename(x), "\n", color=:light_blue, bold=true) - if depth > 1 - _showtree(io, x, other_prefix, depth-1) - else - print(io, other_prefix, 'โ‹ฎ') - end - else - println(io, first_prefix, " ", basename(x)) - end - end -end - -function Base.copy!(dst::AbstractFileTree, src::AbstractFileTree) - for x in src - xname = basename(x) - if isdir(x) - copy!(newdir(dst, xname), x) - else - open(x) do io_src - newfile(dst, xname, overwrite=true) do io_dst - write(io_dst, io_src) - end - end - end - end - return dst -end - -Base.copy(src::AbstractFileTree) = copy!(newdir(), src) - -#------------------------------------------------------------------------------- -""" - File(root) - File(root, relpath) - -`File` represents the location of a collection of unstructured binary data. The -location is a path `relpath` relative to some `root` data resource. - -A `File` can naturally be `open()`ed as a `Vector{UInt8}`, but can also be -mapped into the program as an `IO` byte stream, or interpreted as a `String`. - -Files can be arranged into hierarchies "directories" via the `FileTree` type. -""" -mutable struct File{Root} - root::Root - path::RelPath -end - -File(root) = File(root, RelPath()) - -Base.basename(file::File) = basename(file.path) -Base.abspath(file::File) = AbsPath(file.root, file.path) -Base.isdir(file::File) = false -Base.isfile(file::File) = true -Base.ispath(file::File) = true -Base.filesize(file::File) = filesize(file.root, file.path) - -function Base.show(io::IO, ::MIME"text/plain", file::File) - print(io, "๐Ÿ“„ ", file.path, " @ ", summary(file.root)) -end - -function AbstractTrees.printnode(io::IO, file::File) - print(io, "๐Ÿ“„ ", basename(file)) -end - -# Opening as Vector{UInt8} or as String defers to IO interface -function Base.open(f::Function, ::Type{Vector{UInt8}}, file::File) - open(IO, file.root, file.path) do io - f(read(io)) # TODO: use Mmap? - end -end - -function Base.open(f::Function, ::Type{String}, file::File) - open(IO, file.root, file.path) do io - f(read(io, String)) - end -end - -# Default open-type for File is IO -Base.open(f::Function, file::File; kws...) = open(f, IO, file.root, file.path; kws...) - -# Opening File as itself is trivial -function Base.open(f::Function, ::Type{File}, file::File) - f(file) -end - -# open with other types T defers to the underlying storage system -function Base.open(f::Function, ::Type{T}, file::File; kws...) where {T} - open(f, T, file.root, file.path; kws...) 
-end - -# ResourceContexts.jl - based versions of the above. - -@! function Base.open(::Type{Vector{UInt8}}, file::File) - @context begin - # TODO: use Mmap? - read(@! open(IO, file.root, file.path)) - end -end - -@! function Base.open(::Type{String}, file::File) - @context begin - read(@!(open(IO, file.root, file.path)), String) - end -end - -# Default open-type for File is IO -@! function Base.open(file::File; kws...) - @! open(IO, file.root, file.path; kws...) -end - -# Opening File as itself is trivial -@! function Base.open(::Type{File}, file::File) - file -end - -# open with other types T defers to the underlying storage system -@! function Base.open(::Type{T}, file::File; kws...) where {T} - @! open(T, file.root, file.path; kws...) -end - -# Fallback implementation of `@! open(T, root, path)` based on enter_do. -# -# TODO: Update other backends to avoid calling this; using enter_do is pretty -# inefficient. -@! function Base.open(::Type{T}, root, path; kws...) where {T} - (res,) = @! enter_do(open, T, root, path; kws...) - res -end - -# Unscoped form of open for File -function Base.open(::Type{T}, file::File; kws...) where {T} - @context begin - result = @! open(T, file; kws...) - @! ResourceContexts.detach_context_cleanup(result) - end -end - -# read() is also supported for `File`s -Base.read(file::File) = read(file.root, file.path) -Base.read(file::File, ::Type{T}) where {T} = read(file.root, file.path, T) - - -# Support for opening AbsPath -# -# TODO: Put this elsewhere? -function Base.open(f::Function, ::Type{T}, path::AbsPath; kws...) where {T} - open(f, T, path.root, path.path; kws...) -end - -Base.open(f::Function, path::AbsPath; kws...) = open(f, IO, path.root, path.path; kws...) - - -#------------------------------------------------------------------------------- -""" - newdir() - FileTree(root) - -Create a `FileTree` which is a "directory tree" like hierarchy which may have -`File`s and `FileTree`s as children. `newdir()` creates the tree in a -temporary directory on the local filesystem. Alternative `root`s may be -supplied which store the data elsewhere. - -The tree implements the `AbstractTrees.children()` interface and may be indexed -with `/`-separated paths to traverse the hierarchy down to the leaves which are -of type `File`. Individual leaves may be `open()`ed as various Julia types. - -# Operations on FileTree - -`FileTree` has a largely dictionary-like interface: - -* List keys (ie, file and directory names): `keys(tree)` -* List keys,value pairs: `pairs(tree)` -* Query keys: `haskey(tree)` -* Traverse the tree: `tree["path"]`, `tree["multi/component/path"]` -* Add new content: `newdir(tree, "path")`, `newfile(tree, "path")` -* Delete content: `delete!(tree, "path")` - -Iteration of FileTree iterates values (not key value pairs). This -has some benefits - for example, broadcasting processing across files in a -directory. - -* Property access - - `isdir()`, `isfile()` - determine whether a child of tree is a directory or file. 
- - `filesize()` โ€” size of `File` elements in a tree - -# Example - -Create a new temporary FileTree via the `newdir()` function and fill it with -files via `newfile()`: - -``` -julia> dir = newdir() - for i = 1:3 - newfile(dir, "\$i/a.txt") do io - println(io, "Content of a") - end - newfile(dir, "b-\$i.txt") do io - println(io, "Content of b") - end - end - dir -๐Ÿ“‚ Tree @ /tmp/jl_Sp6wMF - ๐Ÿ“ 1 - ๐Ÿ“ 2 - ๐Ÿ“ 3 - ๐Ÿ“„ b-1.txt - ๐Ÿ“„ b-2.txt - ๐Ÿ“„ b-3.txt -``` - -Create a `FileTree` from a local directory with `DataSets.from_path()`: - -``` -julia> using Pkg - open(DataSets.from_path(joinpath(Pkg.dir("DataSets"), "src"))) -๐Ÿ“‚ Tree @ ~/.julia/dev/DataSets/src - ๐Ÿ“„ DataSet.jl - ๐Ÿ“„ DataSets.jl - ๐Ÿ“„ DataTomlStorage.jl - ... -``` -""" -mutable struct FileTree{Root} <: AbstractFileTree - root::Root - path::RelPath -end - -FileTree(root) = FileTree(root, RelPath()) - -function Base.show(io::IO, ::MIME"text/plain", tree::FileTree) - # TODO: Ideally we'd use - # AbstractTrees.print_tree(io, tree, 1) - # However, this is hard to use efficiently; we'd need to implement a lazy - # `children()` for all our trees. It'd be much easier if - # `AbstractTrees.has_children()` was used consistently upstream. - println(io, "๐Ÿ“‚ Tree ", tree.path, " @ ", summary(tree.root)) - first = true - for (name,x) in pairs(tree) - if first - first = false - else - print(io, '\n') - end - print(io, " ", isdir(x) ? '๐Ÿ“' : '๐Ÿ“„', " ", name) - end -end - -function AbstractTrees.printnode(io::IO, tree::FileTree) - print(io, "๐Ÿ“‚ ", basename(tree)) -end - -# getindex vs joinpath: -# - getindex is about indexing the datastructure; therefore it looks in the -# storage system to only return things which exist. -# - joinpath just makes paths, not knowing whether they exist. -function Base.getindex(tree::FileTree, path::RelPath) - relpath = joinpath(tree.path, path) - root = tree.root - # TODO: Make this more efficient by moving this work to the storage backend? - # Sort of like an equivalent of `stat`? - if isdir(root, relpath) - FileTree(root, relpath) - elseif isfile(root, relpath) - File(root, relpath) - elseif ispath(root, relpath) - AbsPath(root, relpath) # Not great? - else - error("Path $relpath @ $root doesn't exist") - end -end - -function Base.getindex(tree::FileTree, name::AbstractString) - getindex(tree, RelPath(name)) -end - - -# Keys, values and iteration - -""" - children(tree::FileTree) - -Return an array of the children of `tree`. A child `x` may abstractly either be -another tree (`children(x)` returns a collection) or a file, where `children(x)` -returns `()`. -""" -function children(tree::FileTree) - [tree[RelPath([n])] for n in keys(tree)] -end - -function Base.haskey(tree::FileTree, path::AbstractString) - haskey(tree, RelPath(path)) -end - -function Base.haskey(tree::FileTree, path::RelPath) - ispath(tree.root, joinpath(tree.path, path)) -end - -function Base.keys(tree::FileTree) - readdir(tree.root, tree.path) -end - -function Base.pairs(tree::FileTree) - zip(keys(tree), children(tree)) -end - -function Base.values(tree::FileTree) - children(tree) -end - - -# Mutation - -newdir(tree::FileTree, path::AbstractString; kws...) = - newdir(tree, RelPath(path); kws...) -newfile(tree::FileTree, path::AbstractString; kws...) = - newfile(tree, RelPath(path); kws...) -newfile(func::Function, tree::FileTree, path::AbstractString; kws...) = - newfile(func, tree, RelPath(path); kws...) 
-Base.delete!(tree::FileTree, path::AbstractString) = - delete!(tree, RelPath(path)) - -function _check_writeable(tree) - if !iswriteable(tree.root) - error("Attempt to write into a read-only tree with root $(tree.root)") - end -end - -function _check_new_item(tree, path, overwrite) - _check_writeable(tree) - if haskey(tree, path) && !overwrite - error("Overwriting a path $path which already exists requires the keyword `overwrite=true`") - end -end - -""" - newdir(tree, path; overwrite=false) - -Create a new FileTree ("directory") at tree[path] and return it. If -`overwrite=true`, remove any existing tree before creating the new one. -""" -function newdir(tree::FileTree, path::RelPath; overwrite=false) - _check_new_item(tree, path, overwrite) - p = joinpath(tree.path, path) - newdir(tree.root, p; overwrite=overwrite) - return FileTree(tree.root, p) -end - -""" - newfile(tree, path; overwrite=false) - newfile(tree, path; overwrite=false) do io ... - -Create a new file object in the `tree` at the given `path`. In the second form, -the open file `io` will be passed to the do block. - - newfile() - -Create a new file which may be later assigned to a permanent location in a -tree. If not assigned to a permanent location, the temporary file is cleaned up -during garbage collection. - -# Example - -``` -newfile(tree, "some/demo/path.txt") do io - println(io, "Hi there!") -end -``` -""" -function newfile(tree::FileTree, path::RelPath; overwrite=false) - _check_new_item(tree, path, overwrite) - p = joinpath(tree.path, path) - newfile(tree.root, p; overwrite=overwrite) - return File(tree.root, p) -end - -function newfile(func::Function, tree::FileTree, path::RelPath; overwrite=false) - _check_new_item(tree, path, overwrite) - p = joinpath(tree.path, path) - newfile(func, tree.root, p; overwrite=overwrite) - return File(tree.root, p) -end - - -function Base.delete!(tree::FileTree, path::RelPath) - _check_writeable(tree) - relpath = joinpath(tree.path, path) - root = tree.root - delete!(root, relpath) -end - -function Base.open(f::Function, ::Type{FileTree}, tree::FileTree) - f(tree) -end - -@! function Base.open(::Type{FileTree}, tree::FileTree) - tree -end - -# Base.open(::Type{T}, file::File; kws...) where {T} = open(identity, T, file.root, file.path; kws...) - - -#------------------------------------------------------------------------------- -# Path manipulation - -# TODO: Maybe deprecate these? Under the "datastructure-like" model, it seems wrong -# for a file to know its name in the parent data structure. -Base.basename(tree::FileTree) = basename(tree.path) -Base.abspath(tree::FileTree) = AbsPath(tree.root, tree.path) - -function Base.joinpath(tree::FileTree, r::RelPath) - AbsPath(tree.root, joinpath(tree.path, r)) -end - -function Base.joinpath(tree::FileTree, s::AbstractString) - AbsPath(tree.root, joinpath(tree.path, s)) -end - - -#------------------------------------------------------------------------------- -# Deprecated -function Base.rm(tree::FileTree; kws...) - _check_writeable(tree) - Base.depwarn(""" - `rm(::FileTree)` is deprecated. Use `delete!(tree, path)` instead. - """, :rm) - rm(tree.root, tree.path; kws...) -end - -function Base.readdir(tree::FileTree) - readdir(tree.root, tree.path) -end - -# Create files within a temporary directory. -function newdir(tree::FileTree) - Base.depwarn(""" - `newdir(::FileTree)` for temporary trees is deprecated. - Use the in-place version `newdir(::FileTree, dirname)` instead. 
- """, - :newdir) - newdir(tree.root) -end -function newfile(tree::FileTree) - Base.depwarn(""" - `newfile(::FileTree)` for temporary trees is deprecated. - Use the in-place version `newfile(::FileTree, dirname)` instead. - """, - :newfile) - newfile(tree.root) -end - diff --git a/src/GitTree.jl b/src/GitTree.jl index 6f28616..9dadc94 100644 --- a/src/GitTree.jl +++ b/src/GitTree.jl @@ -19,7 +19,7 @@ function Base.open(f::Function, root::GitTreeRoot) git(subcmd) = setenv(`git $subcmd`, dir=root.path) s = read(git(`status --porcelain`), String) isempty(s) || error("Git working copy is dirty") - result = f(FileTree(root)) + result = f(BlobTree(root)) # FIXME: From the point of view of this code, it seems unnatural to attach # `write` to GitTreeRoot. if root.write @@ -30,13 +30,13 @@ function Base.open(f::Function, root::GitTreeRoot) end #------------------------------------------------------------------------------- -# FIXME: Factor together with FileTreeRoot +# FIXME: Factor together with BlobTreeRoot -function Base.haskey(tree::FileTree{GitTreeRoot}, name::AbstractString) +function Base.haskey(tree::BlobTree{GitTreeRoot}, name::AbstractString) ispath(sys_abspath(joinpath(tree,name))) end -function Base.open(func::Function, f::File{GitTreeRoot}; write=false, read=!write) +function Base.open(func::Function, f::Blob{GitTreeRoot}; write=false, read=!write) if !f.root.write && write error("Error writing file at read-only path $f") end @@ -55,6 +55,6 @@ function Base.mkdir(p::AbsPath{GitTreeRoot}, args...) error("Cannot make directory in read-only tree root at $(sys_abspath(p.root))") end mkdir(sys_abspath(p), args...) - return FileTree(p.root, p.path) + return BlobTree(p.root, p.path) end diff --git a/src/ZipTree.jl b/src/ZipTree.jl index 1ee1131..cfb80c2 100644 --- a/src/ZipTree.jl +++ b/src/ZipTree.jl @@ -27,18 +27,18 @@ end #------------------------------------------------------------------------------- -# FIXME: Factor back together with FileTree.jl !! +# FIXME: Factor back together with BlobTree.jl !! 
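+# Sketch of the indexing behaviour defined below, assuming a ZipTreeRoot
+# built from an archive as above (the archive name is hypothetical):
+#
+#   tree = ZippedBlobTree(ZipTreeRoot("archive.zip"))
+#   tree["dir"]             # -> ZippedBlobTree for a directory entry
+#   tree["dir"]["file.txt"] # -> Blob for a file entry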
-struct ZippedFileTree <: AbstractFileTree +struct ZippedBlobTree <: AbstractBlobTree root::ZipTreeRoot path::RelPath end -ZippedFileTree(root::ZipTreeRoot) = ZippedFileTree(root, RelPath()) +ZippedBlobTree(root::ZipTreeRoot) = ZippedBlobTree(root, RelPath()) -Base.basename(tree::ZippedFileTree) = basename(tree.path) +Base.basename(tree::ZippedBlobTree) = basename(tree.path) -function Base.getindex(tree::ZippedFileTree, path::RelPath) +function Base.getindex(tree::ZippedBlobTree, path::RelPath) newpath = joinpath(tree.path, path) i = findfirst(tree.root.file_info) do info info.path == newpath @@ -46,17 +46,17 @@ function Base.getindex(tree::ZippedFileTree, path::RelPath) if i == nothing error("Path $newpath doesn't exist in $tree") elseif tree.root.file_info[i].is_dir - ZippedFileTree(tree.root, newpath) + ZippedBlobTree(tree.root, newpath) else - File(tree.root, newpath) + Blob(tree.root, newpath) end end -function Base.getindex(tree::ZippedFileTree, name::AbstractString) +function Base.getindex(tree::ZippedBlobTree, name::AbstractString) getindex(tree, joinpath(RelPath(), name)) end -function _tree_children(tree::ZippedFileTree) +function _tree_children(tree::ZippedBlobTree) children = String[] for (i,info) in enumerate(tree.root.file_info) if dirname(info.path) == tree.path @@ -66,8 +66,8 @@ function _tree_children(tree::ZippedFileTree) children end -Base.IteratorSize(tree::ZippedFileTree) = Base.SizeUnknown() -function Base.iterate(tree::ZippedFileTree, state=nothing) +Base.IteratorSize(tree::ZippedBlobTree) = Base.SizeUnknown() +function Base.iterate(tree::ZippedBlobTree, state=nothing) if state == nothing children = _tree_children(tree) itr = iterate(children) @@ -83,16 +83,16 @@ function Base.iterate(tree::ZippedFileTree, state=nothing) end end -function Base.joinpath(tree::ZippedFileTree, r::RelPath) +function Base.joinpath(tree::ZippedBlobTree, r::RelPath) # Should this AbsPath be rooted at `tree` rather than `tree.root`? AbsPath(tree.root, joinpath(tree.path, r)) end -function Base.joinpath(tree::ZippedFileTree, s::AbstractString) +function Base.joinpath(tree::ZippedBlobTree, s::AbstractString) AbsPath(tree.root, joinpath(tree.path, s)) end -function Base.open(func::Function, f::File{ZipTreeRoot}; write=false, read=!write) +function Base.open(func::Function, f::Blob{ZipTreeRoot}; write=false, read=!write) if write error("Error writing file at read-only path $f") end diff --git a/src/data_project.jl b/src/data_project.jl deleted file mode 100644 index 0ee6f95..0000000 --- a/src/data_project.jl +++ /dev/null @@ -1,371 +0,0 @@ -# AbstractDataProject and the generic DataProject - -""" -Subtypes of `AbstractDataProject` have the interface - -Must implement: - - `Base.get(project, dataset_name, default)` โ€” search - - `Base.keys(project)` - get dataset names - -Optional: - - `Base.iterate()` โ€” default implementation in terms of `keys` and `get` - - `Base.pairs()` โ€” default implementation in terms of `keys` and `get` - - `Base.haskey()` โ€” default implementation in terms of `get` - - `Base.getindex()` โ€” default implementation in terms of `get` - - `DataSets.project_name()` โ€” returns `nothing` by default. 
- -Provided by AbstractDataProject (should not be overridden): - - `DataSets.dataset()` - implemented in terms of `get` -""" -abstract type AbstractDataProject end - -function Base.getindex(proj::AbstractDataProject, name::AbstractString) - data = get(proj, name, nothing) - data != nothing || error("DataSet $(repr(name)) not found") - data -end - -function dataset(proj::AbstractDataProject, spec::AbstractString) - namestr, query, fragmentstr = _split_dataspec(spec) - - if isnothing(namestr) - throw(ArgumentError("Invalid dataset specification: $spec")) - end - - dataset = proj[namestr] - - if isnothing(query) && isnothing(fragmentstr) - return dataset - end - - # Enhance dataset with "dataspec" holding URL-like fragment & query - dataspec = Dict() - if !isnothing(query) - dataspec["query"] = Dict{String,Any}(query) - end - if !isnothing(fragmentstr) - dataspec["fragment"] = fragmentstr - end - - # We need to take care here with copy() to avoid modifying the original - # dataset configuration. - conf = copy(dataset.conf) - conf["dataspec"] = dataspec - - # FIXME: This copy is problematic now that datasets can be mutated with - # `DataSets.config!()` as "dataspec" will infect the dataset when it's - # saved again. - return DataSet(data_project(dataset), conf) -end - -""" - config!(name::AbstractString; kws...) - config!(proj::AbstractDataProject, name::AbstractString; kws...) - - config!(dataset::DataSet; kws...) - -Update the configuration of `dataset` with the given keyword arguments and -persist it in the dataset's project storage. The versions which take a `name` -use that name to search within the given data project. - -# Examples - -Update the description of the dataset named `"SomeData"` in the global project: -``` -DataSets.config!("SomeData"; description="This is a description") -``` - -Alternatively, setting `DataSet` properties can be used to update metadata. For -example, to tag the dataset "SomeData" with tags `"A"` and `"B"`. -``` -ds = dataset("SomeData") -ds.tags = ["A", "B"] -``` -""" -function config!(project::AbstractDataProject, name::AbstractString; kws...) - config!(project[name]; kws...) -end - -# Percent-decode a string according to the URI escaping rules. -# Vendored from URIs.jl for now to avoid depending on that entire package for -# this one function. -function _unescapeuri(str) - occursin("%", str) || return str - out = IOBuffer() - i = 1 - io = IOBuffer(str) - while !eof(io) - c = read(io, Char) - if c == '%' - c1 = read(io, Char) - c = read(io, Char) - write(out, parse(UInt8, string(c1, c); base=16)) - else - write(out, c) - end - end - return String(take!(out)) -end - -# Parse as a suffix of URI syntax -# name/of/dataset?param1=value1¶m2=value2#fragment -const DATASET_SPEC_REGEX = Regex( - """ - ^ - ($(DATASET_NAME_REGEX_STRING)) - (?:\\?([^#]*))? # query - a=b&c=d - (?:\\#(.*))? # fragment - ... 
- \$ - """, - "x", -) -function _split_dataspec(spec::AbstractString) - m = match(DATASET_SPEC_REGEX, spec) - if isnothing(m) - return nothing, nothing, nothing - end - namestr = m[1] - query = m[2] - fragmentstr = m[3] - - if !isnothing(query) - query = [_unescapeuri(x)=>_unescapeuri(y) for (x,y) in split.(split(query, '&'), '=')] - end - if !isnothing(fragmentstr) - fragmentstr = _unescapeuri(fragmentstr) - end - - namestr, query, fragmentstr -end - -function Base.haskey(proj::AbstractDataProject, name::AbstractString) - get(proj, name, nothing) !== nothing -end - -function Base.iterate(project::AbstractDataProject, state=nothing) - if isnothing(state) - ks = keys(project) - ks_itr = iterate(ks) - else - (ks, ks_state) = state - ks_itr = iterate(ks, ks_state) - end - if isnothing(ks_itr) - return nothing - end - (k, ks_state) = ks_itr - val = get(project, k, nothing) - if isnothing(val) - # val could be `nothing` if entries in the project are updated - # concurrently. (Eg, this might happen for data projects which are - # backed by the filesystem.) - return iterate(project, (ks, ks_state)) - end - (val, (ks, ks_state)) -end - -# Unknown size by default, due to the above get-based implementation of -# iterate, coupled with possible concurrent modification. -Base.IteratorSize(::AbstractDataProject) = Base.SizeUnknown() - -function Base.pairs(proj::AbstractDataProject) - ks = keys(proj) - (k=>d for (k,d) in (k=>get(proj, k, nothing) for k in ks) if !isnothing(d)) -end - -""" - project_name(data_project) - -Return the name of the given `data_project`. Ideally this can be used to -uniquely identify the project when modifying the project stack in -`DataSets.PROJECT`. For projects which were generated from -`JULIA_DATASETS_PATH`, this will be the expanded path component. - -Other types of projects will have to return something else. For example, remote -data projects may want to return a URI. For projects which have no obvious -identifier, `nothing` is returned. -""" -project_name(data_project::AbstractDataProject) = nothing - -data_drivers(proj::AbstractDataProject) = [] - -function Base.show(io::IO, ::MIME"text/plain", project::AbstractDataProject) - datasets = collect(pairs(project)) - summary(io, project) - println(io, ":") - if isempty(datasets) - print(io, " (empty)") - return - end - sorted = sort(datasets, by=first) - maxwidth = maximum(textwidth.(first.(sorted))) - for (i, (name, data)) in enumerate(sorted) - pad = maxwidth - textwidth(name) - storagetype = get(data.storage, "type", nothing) - icon = storagetype in ("File", "Blob") ? '๐Ÿ“„' : - storagetype in ("FileTree", "BlobTree") ? '๐Ÿ“' : - 'โ“' - print(io, " ", icon, ' ', name, ' '^pad, " => ", data.uuid) - if i < length(sorted) - println(io) - end - end -end - -function Base.summary(io::IO, project::AbstractDataProject) - print(io, typeof(project)) - name = project_name(project) - if !isnothing(name) - print(io, " [", name, "]") - end -end - -#------------------------------------------------------------------------------- -""" - DataProject - -A in-memory collection of DataSets. 
-""" -struct DataProject <: AbstractDataProject - datasets::Dict{String,DataSet} - drivers::Vector{Dict{String,Any}} -end - -DataProject() = DataProject(Dict{String,DataSet}(), Vector{Dict{String,Any}}()) - -DataProject(project::AbstractDataProject) = DataProject(Dict(pairs(project)), - Vector{Dict{String,Any}}()) - -data_drivers(project::DataProject) = project.drivers - -function Base.get(proj::DataProject, name::AbstractString, default) - get(proj.datasets, name, default) -end - -Base.keys(proj::DataProject) = keys(proj.datasets) - -function Base.iterate(proj::DataProject, state=nothing) - # proj.datasets iterates key=>value; need to rejig it to iterate values. - itr = isnothing(state) ? iterate(proj.datasets) : iterate(proj.datasets, state) - isnothing(itr) && return nothing - (x, state) = itr - (x.second, state) -end - -function Base.setindex!(proj::DataProject, data::DataSet, name::AbstractString) - proj.datasets[name] = data -end - -#------------------------------------------------------------------------------- -""" - StackedDataProject() - StackedDataProject(projects) - -Search stack of AbstractDataProjects, where projects are searched from the -first to last element of `projects`. - -Additional projects may be added or removed from the stack with `pushfirst!`, -`push!` and `empty!`. - -See also [`DataSets.PROJECT`](@ref). -""" -struct StackedDataProject <: AbstractDataProject - projects::Vector -end - -StackedDataProject() = StackedDataProject([]) - -data_drivers(stack::StackedDataProject) = vcat(data_drivers.(stack.projects)...) - -function Base.keys(stack::StackedDataProject) - names = [] - for project in stack.projects - append!(names, keys(project)) - end - unique(names) -end - -function Base.get(stack::StackedDataProject, name::AbstractString, default) - for project in stack.projects - d = get(project, name, nothing) - if !isnothing(d) - return d - end - end -end - -# API for manipulating the stack. -Base.push!(stack::StackedDataProject, project) = push!(stack.projects, project) -Base.pushfirst!(stack::StackedDataProject, project) = pushfirst!(stack.projects, project) -Base.popfirst!(stack::StackedDataProject) = popfirst!(stack.projects) -Base.pop!(stack::StackedDataProject) = pop!(stack.projects) -Base.empty!(stack::StackedDataProject) = empty!(stack.projects) - -function Base.show(io::IO, mime::MIME"text/plain", stack::StackedDataProject) - summary(io, stack) - println(io, ":") - for (i,project) in enumerate(stack.projects) - # show(io, mime, project) - # indent each project - str = sprint(show, mime, project) - print(io, join(" " .* split(str, "\n"), "\n")) - i != length(stack.projects) && println(io) - end -end - - -#------------------------------------------------------------------------------- -""" - load_project(path) - -Load a data project from a system `path` referring to a TOML file. - -See also [`load_project!`](@ref). -""" -function load_project(path::AbstractString; auto_update=true) - sys_path = abspath(path) - if !auto_update - Base.depwarn("`auto_update` is deprecated", :load_project) - end - TomlFileDataProject(sys_path) -end - -function load_project(config::AbstractDict; kws...) - _check_keys(config, "Data.toml", ["data_config_version"=>Integer, - "datasets"=>AbstractVector]) - format_ver = config["data_config_version"] - if format_ver > CURRENT_DATA_CONFIG_VERSION - error(""" - data_config_version=$format_ver is newer than supported. 
- Consider upgrading to a newer version of DataSets.jl - """) - end - proj = DataProject() - for dataset_conf in config["datasets"] - dataset = DataSet(proj, dataset_conf) - proj[dataset.name] = dataset - end - if haskey(config, "drivers") - _check_keys(config, DataProject, ["drivers"=>AbstractVector]) - for driver_conf in config["drivers"] - _check_keys(driver_conf, DataProject, ["type"=>String, "name"=>String, "module"=>Dict]) - _check_keys(driver_conf["module"], DataProject, ["name"=>String, "uuid"=>String]) - push!(proj.drivers, driver_conf) - end - end - proj -end - -function project_toml(proj::DataProject) - # FIXME: Preserve other unknown keys here for forward compatibility. - conf = Dict( - "data_config_version"=>CURRENT_DATA_CONFIG_VERSION, - "datasets"=>[d.conf for (n,d) in proj.datasets], - "drivers"=>proj.drivers - ) - return sprint(TOML.print, conf) -end - -function config!(name::AbstractString; kws...) - config!(PROJECT, name; kws...) -end diff --git a/src/entrypoint.jl b/src/entrypoint.jl index e658c97..9cb5fe5 100644 --- a/src/entrypoint.jl +++ b/src/entrypoint.jl @@ -93,8 +93,8 @@ Get a string representation of the "DataSet type", which represents the type of the data *outside* Julia. A given DataSet type may be mapped into many different Julia types. For example -consider the "File" type which is an array of bytes (commonly held in a file on -disk). When loaded into Julia, this may be represented as a +consider the "Blob" type which is an array of bytes (commonly held in a file). +When loaded into Julia, this may be represented as a * IO โ€” via open()) * String โ€” via open() |> read(_,String) * Vector{UInt8} โ€” via mmap) diff --git a/src/file_data_projects.jl b/src/file_data_projects.jl index 79f3773..c484430 100644 --- a/src/file_data_projects.jl +++ b/src/file_data_projects.jl @@ -4,8 +4,8 @@ A cache of file content, parsed with an arbitrary parser function. This is a modified and generalized version of `Base.CachedTOMLDict`. -Getting the value of the cache with `get_cache(f)` will automatically update -the parsed value whenever the file changes. +Getting the value of the cache with `f[]` will automatically update the parsed +value whenever the file changes. """ mutable struct CachedParsedFile{T} path::String @@ -38,7 +38,7 @@ function CachedParsedFile{T}(parser::Function, path::String) where T ) end -function get_cache(f::CachedParsedFile, allow_refresh=true) +function Base.getindex(f::CachedParsedFile) s = stat(f.path) time_since_cached = time() - f.mtime rough_mtime_granularity = 0.1 # seconds @@ -59,9 +59,6 @@ function get_cache(f::CachedParsedFile, allow_refresh=true) f.mtime = s.mtime f.size = s.size f.hash = new_hash - if !allow_refresh - error("The file at $(f.path) was written externally") - end @debug "Cache of file $(repr(f.path)) invalid, reparsing..." 
return f.d = f.parser(content) end @@ -69,39 +66,19 @@ function get_cache(f::CachedParsedFile, allow_refresh=true) return f.d end -function set_cache(f::CachedParsedFile, content::AbstractString) - mktemp(dirname(f.path)) do tmppath, tmpio - write(tmpio, content) - close(tmpio) - # Uses mktemp() + mv() to atomically overwrite the file - mv(tmppath, f.path, force=true) - end - s = stat(f.path) - f.inode = s.inode - f.mtime = s.mtime - f.size = s.size - f.hash = sha1(content) -end - function Base.show(io::IO, m::MIME"text/plain", f::CachedParsedFile) println(io, "Cache of file $(repr(f.path)) with value") - show(io, m, get_cache(f)) + show(io, m, f[]) end # Parse Data.toml into DataProject which updates when the file does. -function parse_and_cache_project(proj, sys_path::AbstractString) +function parse_and_cache_project(sys_path::AbstractString) sys_data_dir = dirname(sys_path) CachedParsedFile{DataProject}(sys_path) do content if isnothing(content) DataProject() else - inner_proj = _load_project(String(content), sys_data_dir) - for d in inner_proj - # Hack; we steal ownership from the DataProject here. - # What's a better way to do this? - setfield!(d, :project, proj) - end - inner_proj + _load_project(String(content), sys_data_dir) end end end @@ -110,19 +87,19 @@ end abstract type AbstractTomlFileDataProject <: AbstractDataProject end function Base.get(proj::AbstractTomlFileDataProject, name::AbstractString, default) - get(get_cache(proj), name, default) + get(_get_cached(proj), name, default) end function Base.keys(proj::AbstractTomlFileDataProject) - keys(get_cache(proj)) + keys(_get_cached(proj)) end function Base.iterate(proj::AbstractTomlFileDataProject, state=nothing) # This is a little complex because we want iterate to work even if the # active project changes concurrently, which means wrapping up the initial - # result of get_cache with the iterator state. + # result of _get_cached with the iterator state. if isnothing(state) - cached_values = values(get_cache(proj)) + cached_values = values(_get_cached(proj)) if isnothing(cached_values) return nothing end @@ -139,20 +116,9 @@ function Base.iterate(proj::AbstractTomlFileDataProject, state=nothing) end end -Base.pairs(proj::AbstractTomlFileDataProject) = pairs(get_cache(proj)) +Base.pairs(proj::AbstractTomlFileDataProject) = pairs(_get_cached(proj)) -data_drivers(proj::AbstractTomlFileDataProject) = data_drivers(get_cache(proj)) - -function config!(proj::AbstractTomlFileDataProject, dataset::DataSet; kws...) - if data_project(dataset) !== proj - error("dataset must belong to project") - end - # Here we accept the update independently of the project - Data.toml should - # be able to manage any dataset config. - config!(nothing, dataset; kws...) - set_cache(proj, project_toml(get_cache(proj, false))) - return dataset -end +data_drivers(proj::AbstractTomlFileDataProject) = data_drivers(_get_cached(proj)) #------------------------------------------------------------------------------- """ @@ -162,23 +128,15 @@ filesystem. 
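+# Illustrative sketch of the caching behaviour above (the path here is
+# hypothetical):
+#
+#   proj = TomlFileDataProject(abspath("Data.toml"))
+#   keys(proj)   # parses Data.toml on first access
+#   # ... Data.toml is edited on disk ...
+#   keys(proj)   # the stat()-based check in getindex notices and reparses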
mutable struct TomlFileDataProject <: AbstractTomlFileDataProject path::String cache::CachedParsedFile{DataProject} - function TomlFileDataProject(path::String) - proj = new(path) - proj.cache = parse_and_cache_project(proj, path) - proj - end end -function get_cache(proj::TomlFileDataProject, refresh=true) - get_cache(proj.cache, refresh) +function TomlFileDataProject(path::String) + cache = parse_and_cache_project(path) + TomlFileDataProject(path, cache) end -function set_cache(proj::TomlFileDataProject, content::AbstractString) - set_cache(proj.cache, content) -end - -function local_data_abspath(proj::TomlFileDataProject, relpath) - return joinpath(dirname(proj.path), relpath) +function _get_cached(proj::TomlFileDataProject) + proj.cache[] end project_name(proj::TomlFileDataProject) = proj.path @@ -202,7 +160,7 @@ end function ActiveDataProject() proj = ActiveDataProject(nothing, DataProject()) - get_cache(proj) + _get_cached(proj) proj end @@ -212,95 +170,29 @@ function _active_project_data_toml(project_path=Base.active_project(false)) joinpath(dirname(project_path), "Data.toml") end -function get_cache(proj::ActiveDataProject, allow_refresh=true) +function _get_cached(proj::ActiveDataProject) active_project = Base.active_project(false) if proj.active_project_path != active_project - if !allow_refresh - error("The current project path was changed") - end # The unusual case: active project has changed. if isnothing(active_project) proj.cache = DataProject() else data_toml = _active_project_data_toml(active_project) # Need to re-cache - proj.cache = parse_and_cache_project(proj, data_toml) + proj.cache = parse_and_cache_project(data_toml) end proj.active_project_path = active_project end - proj.cache isa DataProject ? proj.cache : get_cache(proj.cache, allow_refresh) -end - -function set_cache(proj::ActiveDataProject, content::AbstractString) - if proj.cache isa DataProject - error("No current active project") - else - set_cache(proj.cache, content) - end -end - -function local_data_abspath(proj::ActiveDataProject, relpath) - if isnothing(proj.active_project_path) - error("No active project") - end - return joinpath(dirname(proj.active_project_path), relpath) + proj.cache isa DataProject ? proj.cache : proj.cache[] end project_name(::ActiveDataProject) = _active_project_data_toml() #------------------------------------------------------------------------------- -function _fill_template(toml_str) - if occursin("@__DIR__", toml_str) - Base.depwarn(""" - Using @__DIR__ in Data.toml is deprecated. Use a '/'-separated - relative path instead.""", - :_fill_template) - return replace(toml_str, "@__DIR__"=>".") - else - return toml_str - end -end - function _load_project(content::AbstractString, sys_data_dir) - toml_str = _fill_template(content) + toml_str = _fill_template(sys_data_dir, content) config = TOML.parse(toml_str) load_project(config) end -#------------------------------------------------------------------------------- -""" - from_path(path) - -Create a `DataSet` from a local filesystem path. The type of the dataset is -inferred as a blob or tree based on whether the local path is a file or -directory. -""" -function from_path(path::AbstractString) - dtype = isfile(path) ? "File" : - isdir(path) ? "FileTree" : - nothing - - if isnothing(dtype) - msg = ispath(path) ? - "Unrecognized data at path \"$path\"" : - "Path \"$path\" does not exist" - throw(ArgumentError(msg)) - end - - path_key = Sys.isunix() ? "unix_path" : - Sys.iswindows() ? 
"windows_path" : - error("Unknown system: cannot determine path type") - - conf = Dict( - "name"=>make_valid_dataset_name(path), - "uuid"=>string(uuid4()), - "storage"=>Dict( - "driver"=>"FileSystem", - "type"=>dtype, - path_key=>abspath(path), - ) - ) - - DataSet(conf) -end diff --git a/src/filesystem.jl b/src/filesystem.jl index 73d1803..3a46e63 100644 --- a/src/filesystem.jl +++ b/src/filesystem.jl @@ -1,45 +1,21 @@ -""" -Root storage object for trees which are rooted in the file system (in git -terminology, there exists a "working copy") -""" -mutable struct FileSystemRoot - path::String - write::Bool - cleanup::Bool -end - -function FileSystemRoot(path::AbstractString; write=false, cleanup=false) - path = abspath(path) - root = FileSystemRoot(path, write, cleanup) - if cleanup - finalizer(root) do r - if r.cleanup - rm(r.path, recursive=true, force=true) - end - end - end - return root -end +# +# Storage Driver implementation for trees which are rooted in the file system +# (in git terminology, there exists a "working copy") +# +abstract type AbstractFileSystemRoot end # These functions sys_abspath and sys_joinpath generate/joins OS-specific # _local filesystem paths_ out of logical paths. They should be defined only # for trees which are rooted in the actual filesystem. -sys_joinpath(path::RelPath) = isempty(path.components) ? "" : joinpath(path.components...) - -sys_abspath(root::FileSystemRoot) = root.path - -function sys_abspath(root::FileSystemRoot, path::RelPath) +function sys_abspath(root::AbstractFileSystemRoot, path::RelPath) rootpath = sys_abspath(root) return isempty(path.components) ? rootpath : joinpath(rootpath, sys_joinpath(path)) end +sys_joinpath(path::RelPath) = isempty(path.components) ? "" : joinpath(path.components...) sys_abspath(path::AbsPath) = sys_abspath(path.root, path.path) -sys_abspath(tree::FileTree) = sys_abspath(tree.root, tree.path) -sys_abspath(file::File) = sys_abspath(file.root, file.path) - -iswriteable(root::FileSystemRoot) = root.write - - +sys_abspath(tree::BlobTree) = sys_abspath(tree.root, tree.path) +sys_abspath(file::Blob) = sys_abspath(file.root, file.path) #-------------------------------------------------- # Storage data interface for trees @@ -49,50 +25,42 @@ iswriteable(root::FileSystemRoot) = root.write ## 1. Query # TODO: would it be better to express the following dispatch in terms of -# AbsPath{<:FileSystemRoot} rather than usin double dispatch? +# AbsPath{<:AbstractFileSystemRoot} rather than usin double dispatch? -Base.isdir(root::FileSystemRoot, path::RelPath) = isdir(sys_abspath(root, path)) -Base.isfile(root::FileSystemRoot, path::RelPath) = isfile(sys_abspath(root, path)) -Base.ispath(root::FileSystemRoot, path::RelPath) = ispath(sys_abspath(root, path)) -Base.filesize(root::FileSystemRoot, path::RelPath) = filesize(sys_abspath(root, path)) +Base.isdir(root::AbstractFileSystemRoot, path::RelPath) = isdir(sys_abspath(root, path)) +Base.isfile(root::AbstractFileSystemRoot, path::RelPath) = isfile(sys_abspath(root, path)) +Base.ispath(root::AbstractFileSystemRoot, path::RelPath) = ispath(sys_abspath(root, path)) -Base.summary(io::IO, root::FileSystemRoot) = print(io, sys_abspath(root)) +Base.summary(io::IO, root::AbstractFileSystemRoot) = print(io, sys_abspath(root)) -Base.readdir(root::FileSystemRoot, path::RelPath) = readdir(sys_abspath(root, path)) +Base.readdir(root::AbstractFileSystemRoot, path::RelPath) = readdir(sys_abspath(root, path)) ## 2. Mutation # # TODO: Likely requires rework! 
-function Base.mkdir(root::FileSystemRoot, path::RelPath; kws...)
+function Base.mkdir(root::AbstractFileSystemRoot, path::RelPath; kws...)
     if !iswriteable(root)
-        error("Cannot make directory in read-only tree")
+        error("Cannot make directory in read-only tree root at $(sys_abspath(root))")
     end
     mkdir(sys_abspath(root, path); kws...)
-    return FileTree(root, path)
+    return BlobTree(root, path)
 end
 
-function Base.rm(root::FileSystemRoot, path::RelPath; kws...)
+function Base.rm(root::AbstractFileSystemRoot, path::RelPath; kws...)
     rm(sys_abspath(root,path); kws...)
 end
 
-function Base.delete!(root::FileSystemRoot, path::RelPath)
-    if !iswriteable(root)
-        error("Cannot delete from read-only tree $root")
-    end
-    rm(sys_abspath(root, path); recursive=true)
-end
-
 #--------------------------------------------------
-# Storage data interface for File
+# Storage data interface for Blob
 
 # TODO: Make this the generic implementation for AbstractDataStorage
 function Base.open(f::Function, as_type::Type{IO},
-                   root::FileSystemRoot, path; kws...)
+                   root::AbstractFileSystemRoot, path; kws...)
     @context f(@! open(as_type, root, path; kws...))
 end
 
-@! function Base.open(::Type{IO}, root::FileSystemRoot, path;
+@! function Base.open(::Type{IO}, root::AbstractFileSystemRoot, path;
                       write=false, read=!write, kws...)
     if !iswriteable(root) && write
         error("Error writing file at read-only path $path")
@@ -100,64 +68,130 @@ end
     @! open(sys_abspath(root, path); read=read, write=write, kws...)
 end
 
-Base.read(root::FileSystemRoot, path::RelPath, ::Type{T}) where {T} =
+Base.read(root::AbstractFileSystemRoot, path::RelPath, ::Type{T}) where {T} =
     read(sys_abspath(root, path), T)
-Base.read(root::FileSystemRoot, path::RelPath) =
+Base.read(root::AbstractFileSystemRoot, path::RelPath) =
     read(sys_abspath(root, path))
 
-#-------------------------------------------------------------------------------
-# Mutation via newdir/newfile
-_temp_root(path) = FileSystemRoot(path, write=true, cleanup=true)
-
+#--------------------------------------------------
 """
-    newdir()
-Create a new `FileTree` on the local temporary directory. If not moved to a
-permanent location (for example, with `some_tree["name"] = newdir()`) the
-temporary tree will be cleaned up during garbage collection.
+## Metadata spec + +For Blob: +``` + [datasets.storage] + driver="FileSystem" + type="Blob" + path=\$(path_to_file) +``` + +For BlobTree: +``` + [datasets.storage] + driver="FileSystem" + type="BlobTree" + path=\$(path_to_directory) +``` """ -function newdir() - # cleanup=false: we manage our own cleanup via the finalizer - path = mktempdir(cleanup=false) - return FileTree(FileSystemRoot(path, write=true, cleanup=true)) +struct FileSystemRoot <: AbstractFileSystemRoot + path::String + read::Bool + write::Bool end -function newdir(root::FileSystemRoot, path::RelPath; overwrite=false) - p = sys_abspath(root, path) - if overwrite - rm(p, force=true, recursive=true) - end - mkpath(p) +function FileSystemRoot(path::AbstractString; write=false, read=true) + path = abspath(path) + FileSystemRoot(path, read, write) end -function newfile(func=nothing) - path, io = mktemp(cleanup=false) - if func !== nothing - try - func(io) - catch - rm(path) - rethrow() - finally - close(io) +iswriteable(root::FileSystemRoot) = root.write + +sys_abspath(root::FileSystemRoot) = root.path + +function Base.abspath(relpath::RelPath) + Base.depwarn(""" + `abspath(::RelPath)` defaults to using `pwd()` as the root of the path + but this leads to fragile code so will be removed in the future""", + :abspath) + AbsPath(FileSystemRoot(pwd(); write=true, read=true), relpath) +end + +#------------------------------------------------------------------------------- +# Infrastructure for a somewhat more functional interface for creating file +# trees than the fully mutable version we usually use. + +mutable struct TempFilesystemRoot <: AbstractFileSystemRoot + path::Union{Nothing,String} + function TempFilesystemRoot(path) + root = new(path) + finalizer(root) do r + if !isnothing(r.path) + rm(r.path, recursive=true, force=true) + end end - else - close(io) + return root end - return File(_temp_root(path)) end -function newfile(f::Function, root::FileSystemRoot, path::RelPath; kws...) - p = sys_abspath(root, path) - mkpath(dirname(p)) - open(f, p, write=true) +function Base.readdir(root::TempFilesystemRoot, path::RelPath) + return isnothing(root.path) ? [] : readdir(sys_abspath(root, path)) end -function newfile(root::FileSystemRoot, path::RelPath; kws...) - newfile(io->nothing, root, path; kws...) +iswriteable(root::TempFilesystemRoot) = true +sys_abspath(root::TempFilesystemRoot) = root.path + +""" + newdir() + +Create a new temporary `BlobTree` which can have files assigned into it and may +be assigned to a permanent location in a persistent `BlobTree`. If not assigned +to a permanent location, the temporary tree is cleaned up during garbage +collection. +""" +function newdir(ctx::AbstractFileSystemRoot=FileSystemRoot(tempdir(), write=true)) + # cleanup=false: we manage our own cleanup via the finalizer + path = mktempdir(sys_abspath(ctx), cleanup=false) + return BlobTree(TempFilesystemRoot(path)) +end +newdir(ctx::BlobTree) = newdir(ctx.root) + +function newfile(ctx::AbstractFileSystemRoot=FileSystemRoot(tempdir(), write=true)) + path, io = mktemp(sys_abspath(ctx), cleanup=false) + close(io) + return Blob(TempFilesystemRoot(path)) +end +newfile(ctx::BlobTree) = newfile(ctx.root) + +""" + newfile(func) + newfile(func, ctx) + +Create a new temporary `Blob` object which may be later assigned to a permanent +location in a `BlobTree`. If not assigned to a permanent location, the +temporary file is cleaned up during garbage collection. 
+ +# Example + +``` +tree[path"some/demo/path.txt"] = newfile() do io + println(io, "Hi there!") +end +``` +""" +function newfile(f::Function, ctx=FileSystemRoot(tempdir(), write=true)) + path, io = mktemp(sys_abspath(ctx), cleanup=false) + try + f(io) + catch + rm(path) + rethrow() + finally + close(io) + end + return Blob(TempFilesystemRoot(path)) end -#------------------------------------------------------------------------------- # Move srcpath to destpath, making all attempts to preserve the original # content of `destpath` if anything goes wrong. We assume that `srcpath` is # temporary content which doesn't need to be protected. @@ -202,117 +236,50 @@ function mv_force_with_dest_rollback(srcpath, destpath, tempdir_parent) end end -function Base.setindex!(tree::FileTree{FileSystemRoot}, - tmpdata::Union{File{FileSystemRoot},FileTree{FileSystemRoot}}, - path::AbstractString) +function Base.setindex!(tree::BlobTree{<:AbstractFileSystemRoot}, + tmpdata::Union{Blob{TempFilesystemRoot},BlobTree{TempFilesystemRoot}}, + name::AbstractString) if !iswriteable(tree.root) error("Attempt to move to a read-only tree $tree") end - if !tmpdata.root.cleanup + if isnothing(tmpdata.root.path) type = isdir(tmpdata) ? "directory" : "file" - error("Attempted to move $type which is already rooted in $(tmpdata.root)") + error("Attempted to root a temporary $type which has already been moved to $(tree.path)/$name ") end if !isempty(tree.path) # Eh, the number of ways the user can misuse this isn't really funny :-/ error("Temporary trees must be moved in full. The tree had non-empty path $(tree.path)") end - destpath = sys_abspath(joinpath(tree, RelPath(path))) + destpath = sys_abspath(joinpath(tree, name)) srcpath = sys_abspath(tmpdata) tempdir_parent = sys_abspath(tree) - mkpath(dirname(destpath)) mv_force_with_dest_rollback(srcpath, destpath, tempdir_parent) - # Transfer ownership of the data to `tree`. - tmpdata.root.cleanup = false - tmpdata.root.path = destpath + # Transfer ownership of the data to `tree`. This is ugly to be sure, as it + # leaves `tmpdata` empty! However, we'll have to live with this wart unless + # we want to be duplicating large amounts of data on disk. + tmpdata.root.path = nothing return tree end +# It's interesting to read about the linux VFS interface in regards to how the +# OS actually represents these things. For example +# https://stackoverflow.com/questions/36144807/why-does-linux-use-getdents-on-directories-instead-of-read -#-------------------------------------------------- -# FileSystem storage driver - -""" - local_data_abspath(project, relpath) - -Return the absolute path of data on disk where `relpath` is relative to -`project`. - -This function must be implemented for any `AbstractDataProject` subtype which -intends to support the `FileSystem` data driver. 
-""" -function local_data_abspath -end - - -function local_data_abspath(::Nothing, path) - error("Path must be absolute for DataSets without parent data projects") -end -""" -## Metadata spec - -For File: -``` - [datasets.storage] - driver="FileSystem" - type="File" - \$path_key=\$(path_string) -``` -For FileTree: -``` - [datasets.storage] - driver="FileSystem" - type="FileTree" - \$path_key=\$(path_string) -``` +#-------------------------------------------------- -`path_key` should be one of the following forms: -``` - path=\$(relative_slash_separated_path_to_file) - unix_path=\$(absolute_unix_path_to_file) - windows_path=\$(absolute_windows_path_to_file) -``` -""" +# Filesystem storage driver function connect_filesystem(f, config, dataset) - # Paths keys can be in three forms documented above; - if haskey(config, "path") - pathstr = config["path"] - # Local absolute paths are not portable. Previously these were allowed - # in the "path" key, but those are now deprecated in favor of - # system-specific path keys unix_path or windows_path - if isabspath(pathstr) - Base.depwarn(""" - Absolute paths in Data.toml are deprecated. Instead, use relative - paths (separated with `/`) relative to the Data.toml location.""", - :connect_filesystem) - path = pathstr - else - if '\\' in pathstr && Sys.iswindows() - # Heuristic deprecation warning for windows paths in Data.toml - Base.depwarn( - "Relative paths in Data.toml should be separated with '/' characters.", - :connect_filesystem) - pathstr = join(split(pathstr, '\\'), '/') - end - relpath = joinpath(split(pathstr, '/')...) - path = local_data_abspath(data_project(dataset), relpath) - end - elseif haskey(config, "unix_path") && Sys.isunix() - path = config["unix_path"] - elseif haskey(config, "windows_path") && Sys.iswindows() - path = config["windows_path"] - else - error("No \"path\" key found for FileSystem storage driver.") - end + path = config["path"] type = config["type"] - if type in ("File", "Blob") + if type == "Blob" isfile(path) || throw(ArgumentError("$(repr(path)) should be a file")) - storage = File(FileSystemRoot(path)) - elseif type in ("FileTree", "BlobTree") + storage = Blob(FileSystemRoot(path)) + elseif type == "BlobTree" isdir(path) || throw(ArgumentError("$(repr(path)) should be a directory")) - storage = FileTree(FileSystemRoot(path)) + storage = BlobTree(FileSystemRoot(path)) path = dataspec_fragment_as_path(dataset) if !isnothing(path) storage = storage[path] @@ -324,52 +291,3 @@ function connect_filesystem(f, config, dataset) end add_storage_driver("FileSystem"=>connect_filesystem) - - -#------------------------------------------------------------------------------- -# Deprecations -function Base.abspath(relpath::RelPath) - Base.depwarn(""" - `abspath(::RelPath)` defaults to using `pwd()` as the root of the path - but this leads to fragile code so will be removed in the future""", - :abspath) - AbsPath(FileSystemRoot(pwd(); write=true), relpath) -end - -# Deprecated newdir() and newfile() variants -function newdir(ctx::FileSystemRoot) - Base.depwarn(""" - `newdir(ctx::FileSystemRoot)` is deprecated. Use the in-place - version `newdir(::FileTree, path)` instead. - """, :newdir) - path = mktempdir(sys_abspath(ctx), cleanup=false) - return FileTree(_temp_root(path)) -end - -function newfile(ctx::FileSystemRoot) - Base.depwarn(""" - `newfile(ctx::FileSystemRoot)` is deprecated. Use the in-place - version `newfile(::FileTree, path)` instead. 
- """, :newfile) - path, io = mktemp(sys_abspath(ctx), cleanup=false) - close(io) - return File(_temp_root(path)) -end - -function newfile(f::Function, root::FileSystemRoot) - Base.depwarn(""" - `newfile(f::Function, ctx::FileSystemRoot)` is deprecated. - Use newfile() or the in-place version `newfile(::FileTree, path)` instead. - """, :newfile) - path, io = mktemp(sys_abspath(root), cleanup=false) - try - f(io) - catch - rm(path) - rethrow() - finally - close(io) - end - return File(_temp_root(path)) -end - diff --git a/src/paths.jl b/src/paths.jl index 54bf4ed..0d57aaa 100644 --- a/src/paths.jl +++ b/src/paths.jl @@ -11,15 +11,14 @@ A `RelPath` is a *key* into a hierarchical string-indexed tree datastructure, with each component indexing one level of the hierarchy. As a key, the resource referred to by a path may or may not exist. -Conversely, `FileTree` and `File` refer to the actual data stored with a given +Conversely, `BlobTree` and `Blob` refer to the actual data stored with a given key. """ struct RelPath <: AbstractPath components::Vector{String} end -RelPath(path::RelPath) = path -RelPath(str::AbstractString) = RelPath(split(str, '/')) +RelPath(::AbstractString) = error("RelPath(::String) is not defined to avoid ambiguities between operating systems. Use the `path\"...\"` string macro for path literals.") RelPath(components::AbstractVector=String[]) = RelPath(convert(Vector{String}, components)) # Path manipulation. @@ -74,7 +73,7 @@ end """ An AbsPath is the *key* into a hierarchical tree index, relative to some root. -The path is only a key; the resource pointed to by this key may or may not exist. +As a *key*, the resource pointed to by this key may or may not exist. """ struct AbsPath{Root} <: AbstractPath root::Root diff --git a/src/repl.jl b/src/repl.jl index 7c7a8b6..0e3d331 100644 --- a/src/repl.jl +++ b/src/repl.jl @@ -10,7 +10,7 @@ Press `>` to enter the data repl. Press TAB to complete commands. | Command | Alias | Action | |:---------- |:--------- | :---------- | | `help` | `?` | Show this message | -| `list` | `ls` | List stack of projects and datasets by name | +| `list` | `ls` | List all datasets by name | | `show $name` | | Preview the content of dataset `$name` | | `stack` | `st` | Manipulate the global data search stack | | `stack list` | `st ls` | List all projects in the global data search stack | @@ -64,7 +64,7 @@ function hexdump(out_stream, buf; groups_per_line=8, group_size=2, max_lines=typ end end -function _show_dataset(out_stream::IO, blob::File) +function _show_dataset(out_stream::IO, blob::Blob) @context begin io = @! 
open(IO, blob)
         N = 1024
@@ -76,7 +76,7 @@ function _show_dataset(out_stream::IO, blob::File)
         end
         display_lines, _ = displaysize(out_stream)
         max_lines = max(5, display_lines ÷ 2)
-        if length(str) == 0 || n_textlike / length(str) > 0.95
+        if n_textlike / length(str) > 0.95
             # It's approximately UTF-8 encoded text data - print as text
             lines = split(str, '\n', keepempty=true)
             nlines = min(lastindex(lines), max_lines)
@@ -97,7 +97,7 @@ end
     end
 end
 
-function _show_dataset(out_stream::IO, tree::FileTree)
+function _show_dataset(out_stream::IO, tree::BlobTree)
     show(out_stream, MIME("text/plain"), tree)
 end
 
@@ -127,6 +127,10 @@ function complete_command_list(cmd_prefix, commands)
     completions = String[]
     for cmdset in commands
         for cmd in cmdset
+            if cmd == cmd_prefix
+                # Space after full length command
+                return ([" "], "", true)
+            end
             if startswith(cmd, cmd_prefix)
                 push!(completions, cmd*" ")
                 break
@@ -205,14 +209,14 @@ function parse_data_repl_cmd(cmdstr)
     popfirst!(tokens)
     if cmd in ("list", "ls")
         return quote
-            $DataSets.PROJECT
+            $DataSets.DataProject($DataSets.PROJECT)
         end
     elseif cmd == "stack" && length(tokens) >= 1
         subcmd = popfirst!(tokens)
         if subcmd == "push"
             path = popfirst!(tokens)
             return quote
-                proj = $DataSets.data_project_from_path($path; depot_paths=DEPOT_PATH)
+                proj = $DataSets.data_project_from_path($path)
                 stack = $DataSets.PROJECT
                 pushfirst!(stack, proj)
                 stack
diff --git a/src/storage_drivers.jl b/src/storage_drivers.jl
deleted file mode 100644
index 6794bbe..0000000
--- a/src/storage_drivers.jl
+++ /dev/null
@@ -1,85 +0,0 @@
-# Global record of registered storage drivers
-
-const _storage_drivers_lock = ReentrantLock()
-const _storage_drivers = Dict{String,Any}()
-
-"""
-    add_storage_driver(driver_name=>storage_opener)
-
-Associate DataSet storage driver named `driver_name` with `storage_opener`.
-When a `dataset` with `storage.driver == driver_name` is opened,
-`storage_opener(user_func, storage_config, dataset)` will be called. Any
-existing storage driver registered to `driver_name` will be overwritten.
-
-As a matter of convention, `storage_opener` should generally take configuration
-from `storage_config` which is just `dataset.storage`. But to avoid config
-duplication it may also use the content of `dataset`, (for example, dataset.uuid).
-
-Packages which define new storage drivers should generally call
-`add_storage_driver()` within their `__init__()` functions.
-"""
-function add_storage_driver((name,opener)::Pair)
-    lock(_storage_drivers_lock) do
-        _storage_drivers[name] = opener
-    end
-end
-
-function add_storage_driver(project::AbstractDataProject)
-    for conf in data_drivers(project)
-        pkgid = PkgId(UUID(conf["module"]["uuid"]), conf["module"]["name"])
-        if Base.haskey(Base.package_locks, pkgid)
-            # Hack: Avoid triggering another call to require() for packages
-            # which are already in the process of being loaded. (This would
-            # result in a deadlock!)
-            #
-            # Obviously this depends on Base internals...
-            continue
-        end
-        mod = Base.require(pkgid)
-        #=
-        # TODO: Improve driver loading invariants.
-        #
-        # The difficulty here is that there's two possible ways for drivers to
-        # work:
-        # 1. The driver depends explicitly on `using DataSets`, so
-        #    DataSets.__init__ is called *before* the Driver.__init__.
-        # 2. The driver uses a Requires-like mechanism to support multiple
-        #    incompatible DataSets versions, so Driver.__init__ can occur
-        #    *before* DataSets.__init__. 
- # - # This makes it hard for DataSets to check which drivers are added by a - # module: In case (2), the following check fails when the driver is - # loaded before DataSets and in case (1) we hit the double-require - # problem, resulting in the Base.package_locks bailout which disables - # the check below. - # - if conf["type"] == "storage" - driver_name = conf["name"] - # `mod` is assumed to run add_storage_driver() inside its __init__, - # unless the symbol mod.datasets_load_hook exists (in which case we - # call this instead). - lock(_storage_drivers_lock) do - get(_storage_drivers, driver_name) do - error("Package $pkgid did not provide storage driver $driver_name") - end - end - end - =# - end -end - -function _find_driver(dataset) - storage_config = dataset.storage - driver_name = get(storage_config, "driver") do - error("`storage.driver` configuration not found for dataset $(dataset.name)") - end - driver = lock(_storage_drivers_lock) do - get(_storage_drivers, driver_name) do - error(""" - Storage driver $(repr(driver_name)) not found for dataset $(dataset.name). - Current drivers are $(collect(keys(_storage_drivers))) - """) - end - end -end - diff --git a/src/utils.jl b/src/utils.jl deleted file mode 100644 index a2cc260..0000000 --- a/src/utils.jl +++ /dev/null @@ -1,38 +0,0 @@ -# Some basic utilities to validate "config-like" data -# -# (Perhaps these could be replaced with the use of JSON schema or some such?) - -_key_match(config, (k,T)::Pair) = haskey(config, k) && config[k] isa T -_key_match(config, k::String) = haskey(config, k) - -function _check_keys(config, context, keys) - missed_keys = filter(k->!_key_match(config, k), keys) - if !isempty(missed_keys) - error(""" - Missing expected keys in $context: - $missed_keys - - In DataSet fragment: - $(sprint(TOML.print,config)) - """) - end -end - -struct VectorOf - T -end - -function _check_optional_keys(config, context, keys...) - for (k, check) in keys - if haskey(config, k) - v = config[k] - if check isa Type && !(v isa check) - error("""Invalid DataSet key $k. Expected type $check""") - elseif check isa VectorOf && !(v isa AbstractVector && - all(x isa check.T for x in v)) - error("""Invalid DataSet key $k""") - end - end - end -end - diff --git a/test/Data.toml b/test/Data.toml index 3ddbdc0..f1199c8 100644 --- a/test/Data.toml +++ b/test/Data.toml @@ -11,8 +11,17 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" [datasets.storage] driver="FileSystem" - type="File" - path="data/file.txt" + type="Blob" + path="@__DIR__/data/file.txt" + + # TODO: We'd like a layering abstraction. 
+ + # [[datasets.maps]] + # type="File" + # + # [[datasets.maps]] + # type="text" + # parameters={encoding="UTF-8"} [[datasets]] description="A text file with namespace" @@ -21,8 +30,8 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" [datasets.storage] driver="FileSystem" - type="File" - path="data/file.txt" + type="Blob" + path="@__DIR__/data/file.txt" #-------------------------------------------------- [[datasets]] @@ -32,8 +41,16 @@ uuid="2d126588-5f76-4e53-8245-87dc91625bf4" [datasets.storage] driver="FileSystem" - type="File" - path="data/people.csv.gz" + type="Blob" + path="@__DIR__/data/people.csv.gz" + + #[[datasets.maps]] + #type="GZip" + # + #[[datasets.maps]] + #type="CSV" + #parameters={delim=","} + #-------------------------------------------------- [[datasets]] @@ -42,8 +59,11 @@ uuid="e7fd7080-e346-4a68-9ca9-98593a99266a" [datasets.storage] driver="FileSystem" - type="FileTree" - path="data/csvset" + type="BlobTree" + path="@__DIR__/data/csvset" + + # TODO: Add data maps here which expose it logically as a single CSV? + #-------------------------------------------------- # Data embedded in the TOML @@ -55,7 +75,7 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" [datasets.storage] driver="TomlDataStorage" - type="File" + type="Blob" data="AAAAAAAARUA=" @@ -66,7 +86,7 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" [datasets.storage] driver="TomlDataStorage" - type="FileTree" + type="BlobTree" # TOML.print(Dict("datasets"=>[Dict("storage"=>Dict("data"=>Dict(["d0$i"=>Dict(["$x.txt"=>base64encode("$i $x content") for x in ("a","b")]...) for i in 1:4]...)))])) @@ -85,3 +105,30 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" [datasets.storage.data.d04] "b.txt" = "NCBiIGNvbnRlbnQ=" "a.txt" = "NCBhIGNvbnRlbnQ=" + +#-------------------------------------------------- +# Old backend API tests + +[[datasets]] +description="Test old storage backend API, Blob" +name="old_backend_blob" +uuid="785b3cdc-428e-426f-a3f7-3f6ae88a9637" + + [datasets.storage] + driver="OldBackendAPI" + type="Blob" + data="eA==" + +[[datasets]] +description="Test old storage backend API, BlobTree" +name="old_backend_tree" +uuid="4af3a8a9-983b-487b-bfd8-804ca50b4a0c" + + [datasets.storage] + driver="OldBackendAPI" + type="BlobTree" + + [datasets.storage.data] + "b.txt" = "Yg==" + "a.txt" = "YQ==" + diff --git a/test/DataCompat.toml b/test/DataCompat.toml deleted file mode 100644 index 27c98be..0000000 --- a/test/DataCompat.toml +++ /dev/null @@ -1,93 +0,0 @@ -# This file contains datasets in older formats, retained for backward -# compatibility. 
-data_config_version=1 - -#-------------------------------------------------- -[[datasets]] -description="A text file" -name="a_text_file" -uuid="b498f769-a7f6-4f67-8d74-40b770398f26" - - [datasets.storage] - driver="FileSystem" - type="Blob" - path="@__DIR__/data/file.txt" - -#-------------------------------------------------- -[[datasets]] -name="a_tree_example" -uuid="e7fd7080-e346-4a68-9ca9-98593a99266a" - - [datasets.storage] - driver="FileSystem" - type="BlobTree" - path="@__DIR__/data/csvset" - - -#-------------------------------------------------- -# Data embedded in the TOML - -[[datasets]] -description="A data blob embedded in the TOML" -name="embedded_blob" -uuid="b498f769-a7f6-4f67-8d74-40b770398f26" - - [datasets.storage] - driver="TomlDataStorage" - type="File" - data="AAAAAAAARUA=" - - -[[datasets]] -description="A data tree embedded in the TOML" -name="embedded_tree" -uuid="b498f769-a7f6-4f67-8d74-40b770398f26" - - [datasets.storage] - driver="TomlDataStorage" - type="FileTree" - -# TOML.print(Dict("datasets"=>[Dict("storage"=>Dict("data"=>Dict(["d0$i"=>Dict(["$x.txt"=>base64encode("$i $x content") for x in ("a","b")]...) for i in 1:4]...)))])) - - [datasets.storage.data.d01] - "b.txt" = "MSBiIGNvbnRlbnQ=" - "a.txt" = "MSBhIGNvbnRlbnQ=" - - [datasets.storage.data.d02] - "b.txt" = "MiBiIGNvbnRlbnQ=" - "a.txt" = "MiBhIGNvbnRlbnQ=" - - [datasets.storage.data.d03] - "b.txt" = "MyBiIGNvbnRlbnQ=" - "a.txt" = "MyBhIGNvbnRlbnQ=" - - [datasets.storage.data.d04] - "b.txt" = "NCBiIGNvbnRlbnQ=" - "a.txt" = "NCBhIGNvbnRlbnQ=" - -#-------------------------------------------------- -# Old backend API tests - -[[datasets]] -description="Test old storage backend API, File" -name="old_backend_blob" -uuid="785b3cdc-428e-426f-a3f7-3f6ae88a9637" - - [datasets.storage] - driver="OldBackendAPI" - type="Blob" - data="eA==" - -[[datasets]] -description="Test old storage backend API, FileTree" -name="old_backend_tree" -uuid="4af3a8a9-983b-487b-bfd8-804ca50b4a0c" - - [datasets.storage] - driver="OldBackendAPI" - type="BlobTree" - - [datasets.storage.data] - "b.txt" = "Yg==" - "a.txt" = "YQ==" - diff --git a/test/TomlDataStorage.jl b/test/DataTomlStorage.jl similarity index 91% rename from test/TomlDataStorage.jl rename to test/DataTomlStorage.jl index cb23c6c..06b3343 100644 --- a/test/TomlDataStorage.jl +++ b/test/DataTomlStorage.jl @@ -3,7 +3,7 @@ proj = DataSets.load_project("Data.toml") blob_ds = dataset(proj, "embedded_blob") - @test open(blob_ds) isa File + @test open(blob_ds) isa Blob @test open(String, blob_ds) == "\0\0\0\0\0\0E@" @test read(open(blob_ds), Float64) === 42.0 @@ -15,7 +15,7 @@ @test @!(open(String, blob_ds)) == "\0\0\0\0\0\0E@" blob = @! open(blob_ds) - @test blob isa File + @test blob isa Blob @test @!(open(String, blob)) == "\0\0\0\0\0\0E@" @test read(blob, Float64) === 42.0 @@ -23,12 +23,12 @@ end tree_ds = dataset(proj, "embedded_tree") - @test open(tree_ds) isa FileTree + @test open(tree_ds) isa BlobTree @test open(String, open(tree_ds)[path"d01/a.txt"]) == "1 a content" @test open(String, open(tree_ds)[path"d02/b.txt"]) == "2 b content" @context begin tree = @! 
open(tree_ds) - @test tree isa FileTree + @test tree isa BlobTree @test isdir(tree) @test !isfile(tree) diff --git a/test/DriverAutoloadData.toml b/test/DriverAutoloadData.toml index 9449587..24bf164 100644 --- a/test/DriverAutoloadData.toml +++ b/test/DriverAutoloadData.toml @@ -7,7 +7,7 @@ uuid="785b3cdc-428e-426f-a3f7-3f6ae88a9637" [datasets.storage] driver="DummyTomlStorage" - type="File" + type="Blob" data="data_from_dummy_backend" #------------------------------------------------------------------------------- diff --git a/test/FileTree.jl b/test/FileTree.jl deleted file mode 100644 index 8c92bd2..0000000 --- a/test/FileTree.jl +++ /dev/null @@ -1,191 +0,0 @@ -@testset "FileTree API" begin - @testset "isolated newfile" begin - @test newfile() isa File - @test read(newfile()) == [] - @test begin - f = newfile() do io - print(io, "content") - end - read(f, String) - end == "content" - end - - tree = newdir() - for j=1:2 - d = newdir(tree, "d$j") - for i=1:2 - newfile(d, "hi_$(j)_$(i).txt") do io - println(io, "hi $j/$i") - end - end - end - @test read(tree["d1/hi_1_1.txt"], String) == "hi 1/1\n" - @test read(tree["d1/hi_1_2.txt"], String) == "hi 1/2\n" - @test read(tree["d2/hi_2_1.txt"], String) == "hi 2/1\n" - @test read(tree["d2/hi_2_2.txt"], String) == "hi 2/2\n" - - @testset "metadata" begin - f = tree["d1/hi_1_1.txt"] - @test filesize(f) == 7 - @test isfile(f) - @test !isdir(f) - @test ispath(f) - - d = tree["d1"] - @test !isfile(d) - @test isdir(d) - @test ispath(d) - - @test haskey(tree, "d1") - @test !haskey(tree, "x") - end - - @testset "Iteration" begin - # keys - @test keys(tree) == ["d1", "d2"] - @test keys(tree["d1"]) == ["hi_1_1.txt", "hi_1_2.txt"] - @test keys(tree["d2"]) == ["hi_2_1.txt", "hi_2_2.txt"] - # values - for v in tree - @test v isa FileTree - end - for v in values(tree) - @test v isa FileTree - end - for v in tree["d1"] - @test v isa File - end - # pairs - @test first.(pairs(tree["d1"])) == ["hi_1_1.txt", "hi_1_2.txt"] - #@test typeof.(last.(pairs(tree["d1"]))) == [File, File] - end - - @testset "copy / copy! for FileTree" begin - tree2 = copy!(newdir(), tree) - @test keys(tree2) == ["d1", "d2"] - @test keys(tree2["d1"]) == ["hi_1_1.txt", "hi_1_2.txt"] - @test keys(tree2["d2"]) == ["hi_2_1.txt", "hi_2_2.txt"] - @test read(tree2["d1/hi_1_1.txt"], String) == "hi 1/1\n" - - @testset "copy! into a subtree" begin - copy!(newdir(tree2, "dst"), tree) - @test keys(tree2["dst"]) == ["d1", "d2"] - @test keys(tree2["dst/d1"]) == ["hi_1_1.txt", "hi_1_2.txt"] - end - - @testset "copy" begin - @test keys(copy(tree)) == ["d1", "d2"] - end - end - - @testset "newdir/newfile with overwrite=true" begin - tree3 = copy!(newdir(), tree) - - @test_throws ErrorException newdir(tree3, "d1") - @test keys(tree3["d1"]) == ["hi_1_1.txt", "hi_1_2.txt"] - newdir(tree3, "d1", overwrite=true) - @test keys(tree3["d1"]) == [] - - # Various forms of newfile - @test newfile(tree3, "empty") isa File - @test open(String, tree3["empty"]) == "" - @test_throws ErrorException newfile(tree3, "empty") - newfile(tree3, "empty", overwrite=true) do io - print(io, "xxx") - end - @test open(String, tree3["empty"]) == "xxx" - # newfile creates directories implicitly - @test newfile(tree3, "a/b/c") isa File - @test tree3["a"]["b"]["c"] isa File - end - - @testset "setindex!" 
begin - tree = newdir() - @test keys(tree) == [] - tree["a"] = newfile() - @test tree["a"] isa File - tree["b"] = newdir() - @test tree["b"] isa FileTree - tree["c/d"] = newfile() - @test tree["c"] isa FileTree - @test tree["c/d"] isa File - @test keys(tree) == ["a","b","c"] - d = newdir() - newfile(io->print(io, "E"), d, "e") - newfile(io->print(io, "F"), d, "f") - tree["x"] = d - @test read(tree["x/e"], String) == "E" - @test read(tree["x/f"], String) == "F" - end - - @testset "delete!" begin - tree = newdir() - newfile(tree, "a/b/c") - newfile(tree, "a/b/d") - @test keys(tree) == ["a"] - delete!(tree, "a") - @test keys(tree) == [] - newfile(tree, "x") - @test keys(tree) == ["x"] - delete!(tree, "x") - @test keys(tree) == [] - end - - @testset "open(::File)" begin - file = newfile(io->print(io, "xx")) - - # Do-block based forms - @test open(identity, String, file) == "xx" - @test String(open(identity, Vector{UInt8}, file)) == "xx" - @test open(io->read(io,String), IO, file) == "xx" - @test open(identity, File, file) === file - - # Unscoped forms - @test open(String, file) == "xx" - @test String(open(Vector{UInt8}, file)) == "xx" - @test read(open(IO, file), String) == "xx" - - # Context-based forms - @context begin - @test @!(open(String, file)) == "xx" - @test String(@! open(Vector{UInt8}, file)) == "xx" - @test read(@!(open(IO, file)), String) == "xx" - @test @!(open(File, file)) === file - end - end - - @testset "open(::FileTree)" begin - tree = FileTree(FileSystemRoot("data")) - - @test open(identity, FileTree, tree) === tree - - # Context-based forms - @context begin - @test @!(open(FileTree, tree)) === tree - end - end -end - -@testset "newfile / newdir cleanup" begin - f = newfile() - global sys_file_path = f.root.path - GC.@preserve f @test isfile(sys_file_path) - d = newdir() - global sys_dir_path = d.root.path - GC.@preserve d @test isdir(sys_dir_path) -end -# Having the following as a separate top level statement ensures that `f` and -# `d` aren't accidentally still rooted so the the GC can clean them up. 
-@testset "newfile / newdir cleanup step 2" begin - GC.gc() - @test !ispath(sys_file_path) - @test !ispath(sys_dir_path) -end - -#= -#TODO -@testset "FileSystemRoot" begin - # Test that the file is persisted on disk - @test isfile(DataSets.sys_abspath(tree["d1/hi_2.txt"])) -end -=# diff --git a/test/active_project/Data.toml b/test/active_project/Data.toml index 64f90df..2fbd893 100644 --- a/test/active_project/Data.toml +++ b/test/active_project/Data.toml @@ -1,4 +1,4 @@ -data_config_version=1 +data_config_version=0 [[datasets]] description="A text file" @@ -7,5 +7,5 @@ uuid="314996ef-12be-40d0-912c-9755af354fdb" [datasets.storage] driver="FileSystem" - type="File" - path="data/file.txt" + type="Blob" + path="@__DIR__/data/file.txt" diff --git a/test/backend_compat.jl b/test/backend_compat.jl index 27914b8..55c9872 100644 --- a/test/backend_compat.jl +++ b/test/backend_compat.jl @@ -66,7 +66,7 @@ DataSets.add_storage_driver("OldBackendAPI"=>connect_old_backend) #------------------------------------------------------------------------------- @testset "OldBackendAPI" begin - proj = DataSets.load_project(joinpath(@__DIR__, "DataCompat.toml")) + proj = DataSets.load_project("Data.toml") @test open(IO, dataset(proj, "old_backend_blob")) do io read(io, String) @@ -77,22 +77,8 @@ DataSets.add_storage_driver("OldBackendAPI"=>connect_old_backend) @test read(open(dataset(proj, "old_backend_blob"))) == UInt8['x'] @test readdir(open(dataset(proj, "old_backend_tree"))) == ["a.txt", "b.txt"] - @test open(dataset(proj, "old_backend_tree"))[path"a.txt"] isa File + @test open(dataset(proj, "old_backend_tree"))[path"a.txt"] isa Blob @test read(open(dataset(proj, "old_backend_tree"))[path"a.txt"], String) == "a" @test read(open(dataset(proj, "old_backend_tree"))[path"b.txt"], String) == "b" end -@testset "Compat for @__DIR__ and renaming Blob->File, BlobTree->FileTree" begin - proj = DataSets.load_project(joinpath(@__DIR__, "DataCompat.toml")) - - text_data = dataset(proj, "a_text_file") - @test open(text_data) isa Blob - @test read(open(text_data), String) == "Hello world!\n" - - tree_data = dataset(proj, "a_tree_example") - @context begin - @test @!(open(tree_data)) isa BlobTree - tree = @! 
open(tree_data) - @test readdir(tree) == ["1.csv", "2.csv"] - end -end diff --git a/test/drivers/DummyStorageBackends/src/DummyStorageBackends.jl b/test/drivers/DummyStorageBackends/src/DummyStorageBackends.jl index 92d10c3..48a5782 100644 --- a/test/drivers/DummyStorageBackends/src/DummyStorageBackends.jl +++ b/test/drivers/DummyStorageBackends/src/DummyStorageBackends.jl @@ -13,7 +13,7 @@ end function connect_dummy_backend(f, config, ds) storage = DummyBackend(config["data"]) - f(File(storage)) + f(Blob(storage)) end function __init__() diff --git a/test/entrypoint.jl b/test/entrypoint.jl index f1b8ad8..85a3597 100644 --- a/test/entrypoint.jl +++ b/test/entrypoint.jl @@ -1,14 +1,14 @@ # Data entry point functions read_data = nothing -@datafunc function main1(x::File=>String, t::FileTree=>FileTree) +@datafunc function main1(x::Blob=>String, t::BlobTree=>BlobTree) csv_data = open(IO, t["1.csv"]) do io read(io,String) end global read_data = (x_string=x, csv_data=csv_data) end -@datafunc function main1(x::File=>IO) +@datafunc function main1(x::Blob=>IO) x_data = read(x, String) global read_data = (x_data=x_data,) end diff --git a/test/projects.jl b/test/projects.jl index 2ac994c..5cad524 100644 --- a/test/projects.jl +++ b/test/projects.jl @@ -6,14 +6,15 @@ using DataSets: TomlFileDataProject, ActiveDataProject, StackedDataProject, - project_name, - config! + project_name test_project_names = ["a_text_file", "a_tree_example", "embedded_blob", "embedded_tree", "gzipped_table", + "old_backend_blob", + "old_backend_tree", "some_namespace/a_text_file"] @testset "TomlFileDataProject" begin @@ -34,13 +35,20 @@ test_project_names = ["a_text_file", # identity @test project_name(proj) == abspath("Data.toml") + + # Test @__DIR__ templating + # Use `cleanpath` as there's currently a mixture of / and \ on windows + # which does work, but is quite ugly. + # Also use realpath to resolve any differences due to symbolic links. + cleanpath(p) = realpath(replace(p, '\\'=>'/')) + @test cleanpath(proj["a_text_file"].storage["path"]) == cleanpath(joinpath(@__DIR__, "data", "file.txt")) end @testset "TomlFileDataProject live updates" begin # Test live updating when the file is rewritten mktemp() do path,io write(io, """ - data_config_version=1 + data_config_version=0 [[datasets]] description="A text file" @@ -49,8 +57,8 @@ end [datasets.storage] driver="FileSystem" - type="File" - path="data/file.txt" + type="Blob" + path="@__DIR__/data/file.txt" """) flush(io) @@ -68,8 +76,8 @@ end [datasets.storage] driver="FileSystem" - type="File" - path="data/file2.txt" + type="Blob" + path="@__DIR__/data/file2.txt" """) flush(io) @@ -114,7 +122,7 @@ end joinpath(@__DIR__, "Data.toml"), ""], paths_sep) fake_env = Dict("JULIA_DATASETS_PATH"=>datasets_paths) - proj = DataSets.create_project_stack(fake_env, [joinpath(homedir(), ".julia"), joinpath("root", "julia")]) + proj = DataSets.create_project_stack(fake_env) @test proj.projects[1] isa ActiveDataProject @test proj.projects[2] isa TomlFileDataProject @@ -129,73 +137,5 @@ end DataSets.__init__() @test DataSets.PROJECT.projects[1] isa TomlFileDataProject @test project_name(DataSets.PROJECT.projects[1]) == joinpath(@__DIR__, "Data.toml") - - # Test a few edge cases too: - @test_logs ( - :warn, "Julia depot data project (for an empty dataset path) can not be constructed because DEPOT_PATH is empty." - ) DataSets.create_project_stack(Dict("JULIA_DATASETS_PATH"=>"foo$(paths_sep)"), []) - @test_logs ( - :warn, "Julia depot path (relative/depot/path) not absolute. 
Fixing data project path relative to current working directory." - ) DataSets.create_project_stack(Dict("JULIA_DATASETS_PATH"=>"$(paths_sep)/foo"), ["relative/depot/path"]) end -@testset "config!() metadata update" begin - # Test live updating when the file is rewritten - mktempdir() do tmppath - data_toml_path = joinpath(tmppath, "Data.toml") - open(data_toml_path, write=true) do io - write(io, """ - data_config_version=1 - - [[datasets]] - description="A" - name="a_text_file" - uuid="b498f769-a7f6-4f67-8d74-40b770398f26" - - [datasets.storage] - driver="FileSystem" - type="File" - path="data/file.txt" - """) - end - - proj = TomlFileDataProject(data_toml_path) - @testset "config!(proj, ...)" begin - @test dataset(proj, "a_text_file").description == "A" - config!(proj, "a_text_file", description="B") - config!(proj, "a_text_file", tags=Any["xx", "yy"]) - @test dataset(proj, "a_text_file").description == "B" - @test dataset(proj, "a_text_file").tags == ["xx", "yy"] - end - - @testset "Persistence on disk" begin - proj2 = TomlFileDataProject(data_toml_path) - @test dataset(proj2, "a_text_file").description == "B" - @test dataset(proj2, "a_text_file").tags == ["xx", "yy"] - end - - @testset "config! via DataSet instances" begin - ds = dataset(proj, "a_text_file") - config!(ds, description = "C") - @test dataset(proj, "a_text_file").description == "C" - ds.description = "D" - @test dataset(proj, "a_text_file").description == "D" - end - - @testset "description and tags validation" begin - ds = dataset(proj, "a_text_file") - @test_throws Exception config!(ds, description = 1) - @test_throws Exception config!(ds, tags = "hi") - end - - @testset "global config! methods" begin - empty!(DataSets.PROJECT) - pushfirst!(DataSets.PROJECT, TomlFileDataProject(data_toml_path)) - - config!("a_text_file", description="X") - @test dataset("a_text_file").description == "X" - - empty!(DataSets.PROJECT) - end - end -end diff --git a/test/runtests.jl b/test/runtests.jl index 4e34625..4c48815 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -27,7 +27,7 @@ end @testset "DataSet config from Dict" begin config = Dict( - "data_config_version"=>1, + "data_config_version"=>0, "datasets"=>[Dict( "description"=>"A text file", "name"=>"a_text_file", @@ -35,7 +35,7 @@ end "storage"=>Dict( "driver"=>"FileSystem", - "type"=>"File", + "type"=>"Blob", "path"=>joinpath(@__DIR__, "data", "file.txt") ) )] @@ -51,66 +51,103 @@ end proj = DataSets.load_project("Data.toml") text_data = dataset(proj, "a_text_file") - @test open(text_data) isa File + @test open(text_data) isa Blob @test read(open(text_data), String) == "Hello world!\n" @context begin @test read(@!(open(text_data)), String) == "Hello world!\n" end tree_data = dataset(proj, "a_tree_example") - @test open(tree_data) isa FileTree + @test open(tree_data) isa BlobTree @context begin - @test @!(open(tree_data)) isa FileTree + @test @!(open(tree_data)) isa BlobTree tree = @! 
open(tree_data)
         @test readdir(tree) == ["1.csv", "2.csv"]
     end
 
     blob_in_tree_data = dataset(proj, "a_tree_example#1.csv")
-    @test open(blob_in_tree_data) isa File
+    @test open(blob_in_tree_data) isa Blob
     @context begin
         @test @!(open(String, blob_in_tree_data)) == """Name,Age\n"Aaron",23\n"Harry",42\n"""
     end
 end
 
 #-------------------------------------------------------------------------------
-@testset "from_path" begin
-    file_dataset = DataSets.from_path(joinpath(@__DIR__, "data", "file.txt"))
-    @test read(open(file_dataset), String) == "Hello world!\n"
-
-    dir_dataset = DataSets.from_path(joinpath(@__DIR__, "data", "csvset"))
-
-    @test open(dir_dataset) isa FileTree
-    @test keys(open(dir_dataset)) == ["1.csv", "2.csv"]
+@testset "open() for Blob and BlobTree" begin
+    blob = Blob(FileSystemRoot("data/file.txt"))
+    @test open(identity, String, blob) == "Hello world!\n"
+    @test String(open(identity, Vector{UInt8}, blob)) == "Hello world!\n"
+    @test open(io->read(io,String), IO, blob) == "Hello world!\n"
+    @test open(identity, Blob, blob) === blob
+    # Unscoped forms
+    @test open(String, blob) == "Hello world!\n"
+    @test String(open(Vector{UInt8}, blob)) == "Hello world!\n"
+    @test read(open(IO, blob), String) == "Hello world!\n"
+
+    tree = BlobTree(FileSystemRoot("data"))
+    @test open(identity, BlobTree, tree) === tree
+
+    # Context-based forms
+    @context begin
+        @test @!(open(String, blob)) == "Hello world!\n"
+        @test String(@! open(Vector{UInt8}, blob)) == "Hello world!\n"
+        @test read(@!(open(IO, blob)), String) == "Hello world!\n"
+        @test @!(open(Blob, blob)) === blob
+        @test @!(open(BlobTree, tree)) === tree
+    end
 end
 
 #-------------------------------------------------------------------------------
-@testset "Data set names" begin
-    @testset "Valid name: $name" for name in (
-        "a_b", "a-b", "a1", "δεδομένα", "a/b", "a/b/c", "a-", "b_",
-    )
-        @test DataSets.is_valid_dataset_name(name)
-        @test DataSets._split_dataspec(name) == (name, nothing, nothing)
+function load_list(filename)
+    lines = eachline(joinpath(@__DIR__, filename))
+    filter(!isempty, strip.(lines))
+end
+
+@testset "Data set name parsing" begin
+    @testset "Valid names" begin
+        valid_names = load_list("testnames-valid.txt")
+        @test !isempty(valid_names)
+        @testset "Valid name: $name" for name in valid_names
+            @test DataSets.check_dataset_name(name) === nothing
+            @test DataSets._split_dataspec(name) == (name, nothing, nothing)
+            # Also test that the name is still valid when it appears as part of
+            # a longer path.
+            let path_name = "foo/$(name)"
+                @test DataSets.check_dataset_name(path_name) === nothing
+                @test DataSets._split_dataspec(path_name) == (path_name, nothing, nothing)
+            end
+            let path_name = "$(name)/foo"
+                @test DataSets.check_dataset_name(path_name) === nothing
+                @test DataSets._split_dataspec(path_name) == (path_name, nothing, nothing)
+            end
+            let path_name = "foo/$(name)/bar"
+                @test DataSets.check_dataset_name(path_name) === nothing
+                @test DataSets._split_dataspec(path_name) == (path_name, nothing, nothing)
+            end
+        end
     end
-    @testset "Invalid name: $name" for name in (
-        "1", "a b", "a.b", "a/b/", "a//b", "/a/b", "a/-", "a/1", "a/ _/b"
-    )
-        @test !DataSets.is_valid_dataset_name(name)
-        @test DataSets._split_dataspec(name) == (nothing, nothing, nothing)
+    @testset "Invalid names" begin
+        invalid_names = load_list("testnames-invalid.txt")
+        @test !isempty(invalid_names)
+        @testset "Invalid name: $name" for name in invalid_names
+            @test_throws ErrorException DataSets.check_dataset_name(name)
+            @test DataSets._split_dataspec(name) == (nothing, nothing, nothing)
+            # Also test that the name is still invalid when it appears as part of
+            # a longer path.
+            let path_name = "foo/$(name)"
+                @test_throws ErrorException DataSets.check_dataset_name(path_name)
+                @test DataSets._split_dataspec(path_name) == (nothing, nothing, nothing)
+            end
+            let path_name = "$(name)/foo"
+                @test_throws ErrorException DataSets.check_dataset_name(path_name)
+                @test DataSets._split_dataspec(path_name) == (nothing, nothing, nothing)
+            end
+            let path_name = "foo/$(name)/bar"
+                @test_throws ErrorException DataSets.check_dataset_name(path_name)
+                @test DataSets._split_dataspec(path_name) == (nothing, nothing, nothing)
+            end
+        end
     end
-
-    # Error message for invalid names
-    @test_throws ErrorException("DataSet name \"a?b\" is invalid. 
DataSet names must start with a letter and can contain only letters, numbers, `_` or `/`.") DataSets.check_dataset_name("a?b")
-
-    # Making valid names from path-like things
-    @test DataSets.make_valid_dataset_name("a/b") == "a/b"
-    @test DataSets.make_valid_dataset_name("a1") == "a1"
-    @test DataSets.make_valid_dataset_name("1a") == "a"
-    @test DataSets.make_valid_dataset_name("//a/b") == "a/b"
-    @test DataSets.make_valid_dataset_name("a..b") == "a__b"
-    @test DataSets.make_valid_dataset_name("C:\\a\\b") == "C_/a/b"
-    # fallback
-    @test DataSets.make_valid_dataset_name("a//b") == "data"
 end
 
 @testset "URL-like dataspec parsing" begin
@@ -148,10 +185,31 @@ end
     @test dataset(proj, "a_text_file?x=1&yy=2#frag")["dataspec"]["fragment"] == "frag"
 end
 
-include("FileTree.jl")
+#-------------------------------------------------------------------------------
+# Trees
+@testset "Temporary trees" begin
+    function write_dir(j)
+        d = newdir()
+        for i=1:2
+            d["hi_$i.txt"] = newfile() do io
+                println(io, "hi $j $i")
+            end
+        end
+        return d
+    end
+
+    temptree = newdir()
+    for j=1:3
+        temptree["d$j"] = write_dir(j)
+    end
+    @test open(io->read(io,String), IO, temptree["d1"]["hi_2.txt"]) == "hi 1 2\n"
+    @test open(io->read(io,String), IO, temptree["d3"]["hi_1.txt"]) == "hi 3 1\n"
+    @test isfile(DataSets.sys_abspath(temptree["d1"]["hi_2.txt"]))
+end
+
 include("projects.jl")
 include("entrypoint.jl")
 include("repl.jl")
-include("TomlDataStorage.jl")
+include("DataTomlStorage.jl")
 include("backend_compat.jl")
 include("driver_autoload.jl")
diff --git a/test/testnames-invalid.txt b/test/testnames-invalid.txt
new file mode 100644
index 0000000..bf78155
--- /dev/null
+++ b/test/testnames-invalid.txt
@@ -0,0 +1,25 @@
+a b
+a/b/
+a//b
+/a/b
+a/-
+a/ _/b
+a/-a
+a/-1
+.a
+..a
+a.
+a..
+.a.
+a..b
+.abc
+abc.
+abc/.def
+abc/def.
+a./b
+a.-
+_._
+a._b
+a.-b
+./a
+b/../a
diff --git a/test/testnames-valid.txt b/test/testnames-valid.txt
new file mode 100644
index 0000000..53c7bbb
--- /dev/null
+++ b/test/testnames-valid.txt
@@ -0,0 +1,24 @@
+a_b
+a-b
+a1
+δεδομένα
+a/b
+a/b/c
+a-
+b_
+1
+a/1
+123
+12ab/34cd
+1/2/3
+1-2-3
+x_-__
+a---
+a.b
+a.b
+abc.def
+abc/def.ghi
+abc-def.ghi_jkl
+a.b.c
+a_.c
+foo__-.csv
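
Note: below is a minimal usage sketch (not part of the patch) of the API exercised by the tests above. The dataset names, the Data.toml layout, and the open() forms are taken from test/runtests.jl and test/Data.toml in this diff; the sketch assumes the test fixtures (data/file.txt and data/csvset/) sit next to the Data.toml being loaded.

```julia
using DataSets

proj = DataSets.load_project("Data.toml")

# "@__DIR__" in a storage path expands relative to the Data.toml location,
# so a project can be relocated without editing its dataset paths.
text_data = dataset(proj, "a_text_file")        # type="Blob": a single file
@assert open(text_data) isa Blob
@assert read(open(text_data), String) == "Hello world!\n"

# Scoped form: the IO handle is cleaned up when the do-block exits.
content = open(IO, text_data) do io
    read(io, String)
end

tree = open(dataset(proj, "a_tree_example"))    # type="BlobTree": a directory
@assert tree isa BlobTree
@assert readdir(tree) == ["1.csv", "2.csv"]
@assert open(String, tree[path"1.csv"]) == "Name,Age\n\"Aaron\",23\n\"Harry\",42\n"
```

The same check_dataset_name behaviour drives the new fixture files: it returns nothing for every line of testnames-valid.txt and throws an ErrorException for every line of testnames-invalid.txt.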