Skip to content

Commit 20bd7e5

Browse files
authored
Storage driver for data embedded in Data.toml (#15)
* Storage driver for data embedded in Data.toml * Blobs are stored as base64 encoded strings * Trees are stored as dictionaries While we're at it, start to clean up the storage backend interface. Also deprecate `abspath(::RelPath)` depending on `pwd()`, as the assumptions that this refers to the filesystem, and that depending on the global `pwd()` is actually reasonable, seem pretty dubious in hindsight.
1 parent d78528e commit 20bd7e5

9 files changed

+297
-32
lines changed

Project.toml

+3-2
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,19 @@ version = "0.2.3"
55

66
[deps]
77
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
8-
ResourceContexts = "8d208092-d35c-4dd3-a0d7-8325f9cce6b4"
8+
Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
99
REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
1010
ReplMaker = "b873ce64-0db9-51f5-a568-4457d8e49576"
11+
ResourceContexts = "8d208092-d35c-4dd3-a0d7-8325f9cce6b4"
1112
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
1213
TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
1314
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
1415

1516
[compat]
1617
AbstractTrees = "0.3"
1718
ReplMaker = "0.2"
18-
TOML = "1"
1919
ResourceContexts = "0.1"
20+
TOML = "1"
2021
julia = "1.5"
2122

2223
[extras]

src/BlobTree.jl

+9-3
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,8 @@ Base.abspath(tree::BlobTree) = AbsPath(tree.root, tree.path)
300300
function Base.getindex(tree::BlobTree, path::RelPath)
301301
relpath = joinpath(tree.path, path)
302302
root = tree.root
303+
# TODO: Make this more efficient by moving this work to the storage backend?
304+
# Sort of like an equivalent of `stat`?
303305
if isdir(root, relpath)
304306
BlobTree(root, relpath)
305307
elseif isfile(root, relpath)
@@ -316,9 +318,9 @@ function Base.getindex(tree::BlobTree, name::AbstractString)
316318
end
317319

318320
# We've got a weird mishmash of path vs tree handling here.
319-
# TODO: Can we refactor this to cleanly separate the filesystem commands (which
320-
# take abstract paths?) from BlobTree and Blob which act as an abstraction over
321-
# the filesystem or other storage mechanisms?
321+
# TODO: Can we refactor this to cleanly separate the filesystem-like commands
322+
# (which take abstract paths?) from BlobTree and Blob which act as an
323+
# abstraction over the filesystem or other storage mechanisms?
322324
# Join the relative path `r` onto the location of `tree`, giving an `AbsPath`
# which shares the root of `tree`.
Base.joinpath(tree::BlobTree, r::RelPath) = AbsPath(tree.root, joinpath(tree.path, r))
@@ -335,6 +337,10 @@ function Base.readdir(tree::BlobTree)
335337
readdir(tree.root, tree.path)
336338
end
337339

340+
# The keys of a tree are the names of its entries, as listed by `readdir` on
# the tree's storage root at the tree's path.
Base.keys(tree::BlobTree) = readdir(tree.root, tree.path)
343+
338344
# Remove `tree` by forwarding to `rm` on its storage root at the tree's path.
# All keyword arguments are passed through unchanged.
Base.rm(tree::BlobTree; kws...) = rm(tree.root, tree.path; kws...)

src/DataSets.jl

+2-1
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,7 @@ Additional projects may be added or removed from the stack with `pushfirst!`,
461461
"""
462462
PROJECT = StackedDataProject()
463463

464-
# deprecated.
464+
# deprecated. TODO: Remove dependency on this from JuliaHub
465465
_current_project = DataProject()
466466

467467
function __init__()
@@ -562,6 +562,7 @@ include("BlobTree.jl")
562562

563563
# Builtin backends
564564
include("filesystem.jl")
565+
include("DataTomlStorage.jl")
565566

566567
# Backends
567568
# include("ZipTree.jl")

src/DataTomlStorage.jl

+127
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
using Base64
2+
3+
"""
4+
Storage driver which keeps the data embedded within the TOML file itself.
5+
Useful for small amounts of self-contained data.
6+
7+
## Metadata spec
8+
9+
For Blob:
10+
```
11+
[datasets.storage]
12+
driver="TomlDataStorage"
13+
type="Blob"
14+
data=\$(base64encode(data))
15+
```
16+
17+
For BlobTree:
18+
```
19+
[datasets.storage]
20+
driver="TomlDataStorage"
21+
type="BlobTree"
22+
23+
[datasets.storage.data.\$(dirname1)]
24+
"\$(filename1)" = \$(base64encode(data1))
25+
"\$(filename2)" = \$(base64encode(data2))
26+
27+
[datasets.storage.data.\$(dirname2)]
28+
...
29+
```
30+
"""
31+
struct TomlDataStorage
    # The dataset which this embedded storage was connected for.
    dataset::DataSet
    # The embedded content: a base64 encoded `String` when the storage holds
    # a single blob, or a dictionary when it holds a tree.
    data::Union{String,Dict{String,Any}}
end
35+
36+
# Get the TOML data at `path`, descending one path component at a time from
# the root of the embedded data. Returns `nothing` if nothing is present at
# `path`, including when `path` tries to descend *through* a blob (a string)
# rather than a tree (a dictionary).
function _getpath(storage::TomlDataStorage, path::RelPath)
    x = storage.data
    for c in path.components
        # Only dictionaries have children. Without this guard `get` would be
        # called on a `String`, which is not a key lookup and would throw
        # rather than report the path as absent.
        x isa AbstractDict || return nothing
        x = get(x, c, nothing)
        isnothing(x) && return nothing
    end
    return x
end
45+
46+
#--------------------------------------------------
47+
# Storage data interface for trees
48+
49+
# A path is a directory when the TOML data found there is a `Dict`.
function Base.isdir(storage::TomlDataStorage, path::RelPath)
    return _getpath(storage, path) isa Dict
end

# A path is a file when the TOML data found there is a `String`.
function Base.isfile(storage::TomlDataStorage, path::RelPath)
    return _getpath(storage, path) isa String
end

# A path exists when `_getpath` finds any data there at all.
function Base.ispath(storage::TomlDataStorage, path::RelPath)
    return _getpath(storage, path) !== nothing
end
52+
53+
# Embedded storage is always summarized as the fixed text "Data.toml".
function Base.summary(io::IO, storage::TomlDataStorage)
    print(io, "Data.toml")
end
54+
55+
# List the sorted entry names of the tree found at `path` in the embedded
# TOML data.
#
# Throws an `ErrorException` when nothing exists at `path`, or when the data
# found there is not a dictionary. Both conditions are checked explicitly,
# rather than wrapping the whole body in a blanket `try`/`catch`, so that
# unrelated failures are no longer reported with a misleading message.
function Base.readdir(storage::TomlDataStorage, path::RelPath)
    tree = _getpath(storage, path)
    if isnothing(tree)
        error("No tree found at path $path in TOML storage")
    elseif !(tree isa AbstractDict)
        error("TOML storage requires trees to be as TOML dictionaries")
    end
    return sort!(collect(keys(tree)))
end
64+
65+
#--------------------------------------------------
66+
# Storage data interface for Blob
67+
68+
# `open(func, IO, storage, path)` form: open the data within a fresh resource
# context, pass the opened stream to `func` and return the result of `func`.
function Base.open(func::Function, as_type::Type{IO},
                   storage::TomlDataStorage, path; kws...)
    @context begin
        stream = @! open(as_type, storage, path; kws...)
        func(stream)
    end
end
72+
73+
# Open the blob at `path` as a `Vector{UInt8}` by base64 decoding the string
# embedded in the TOML data.
#
# Throws an `ErrorException` if `write=true` (embedded data is read-only), if
# nothing exists at `path`, if the data found there is not a string, or if
# the string is not valid base64. Each case is checked explicitly so that
# unrelated failures are not hidden behind a blanket `catch`.
@! function Base.open(::Type{Vector{UInt8}}, storage::TomlDataStorage, path;
                      write=false, read=!write, kws...)
    if write
        error("Embedded data is read-only from within the DataSets interface")
    end
    str = _getpath(storage, path)
    if isnothing(str)
        error("No data found at path $path in TOML storage")
    elseif !(str isa AbstractString)
        error("TOML storage requires data to be as base64 encoded strings")
    end
    try
        base64decode(str)
    catch exc
        # Malformed base64 is reported by Base64 as an `ArgumentError`;
        # anything else is unexpected and is passed on untouched.
        exc isa ArgumentError || rethrow()
        error("TOML storage requires data to be as base64 encoded strings")
    end
end
86+
87+
# Open the blob at `path` as an `IO` stream: the bytes are obtained via the
# `Vector{UInt8}` method of `open` and wrapped in an `IOBuffer`.
@! function Base.open(::Type{IO}, storage::TomlDataStorage, path; kws...)
    IOBuffer(@! open(Vector{UInt8}, storage, path; kws...))
end
91+
92+
93+
# TODO: The following should be factored out and implemented generically
94+
# Read a value of type `T` from the blob at `path`, opening it as an `IO`
# stream within a temporary resource context.
function Base.read(storage::TomlDataStorage, path::RelPath, ::Type{T}) where {T}
    @context read((@! open(IO, storage, path)), T)
end
100+
101+
# Read the whole blob at `path` as a `Vector{UInt8}`, opening it within a
# temporary resource context.
function Base.read(storage::TomlDataStorage, path::RelPath)
    @context begin
        bytes = @! open(Vector{UInt8}, storage, path)
        bytes
    end
end
104+
105+
106+
#-------------------------------------------------------------------------------
107+
# Connect storage backend
108+
# Connect to data embedded in the `storage` section of a dataset.
#
# `config["type"]` selects whether the embedded `"data"` entry is exposed as
# a `Blob` (requires string data) or as a `BlobTree` (requires a dictionary).
# The resulting object is passed to `f`, and the value of `f` is returned.
function connect_toml_data_storage(f, config, dataset)
    type = config["type"]
    data = get(config, "data", nothing)
    if type == "Blob"
        data isa AbstractString ||
            error("TOML data storage requires string data in the \"storage.data\" key")
        return f(Blob(TomlDataStorage(dataset, data)))
    end
    if type == "BlobTree"
        data isa AbstractDict ||
            error("TOML data storage requires a dictionary in the \"storage.data\" key")
        return f(BlobTree(TomlDataStorage(dataset, data)))
    end
    throw(ArgumentError(
        "DataSet type $type not supported for data embedded in Data.toml"))
end
125+
126+
add_storage_driver("TomlDataStorage"=>connect_toml_data_storage)
127+

src/filesystem.jl

+61-21
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#
55
abstract type AbstractFileSystemRoot end
66

7-
# These underscore functions sys_abspath and sys_joinpath generate/joins OS-specific
7+
# The functions sys_abspath and sys_joinpath generate/join OS-specific
88
# _local filesystem paths_ out of logical paths. They should be defined only
99
# for trees which are rooted in the actual filesystem.
1010
function sys_abspath(root::AbstractFileSystemRoot, path::RelPath)
@@ -17,30 +17,27 @@ sys_abspath(path::AbsPath) = sys_abspath(path.root, path.path)
1717
sys_abspath(tree::BlobTree) = sys_abspath(tree.root, tree.path)
1818
sys_abspath(file::Blob) = sys_abspath(file.root, file.path)
1919

20+
#--------------------------------------------------
21+
# Storage data interface for trees
22+
#
23+
# TODO: Formalize this interface!
24+
25+
## 1. Query
26+
2027
# TODO: would it be better to express the following dispatch in terms of
2128
# AbsPath{<:AbstractFileSystemRoot} rather than using double dispatch?
29+
2230
# Each query converts the logical (root, path) pair into a local filesystem
# path with `sys_abspath` and forwards to the corresponding Base function.
function Base.isdir(root::AbstractFileSystemRoot, path::RelPath)
    return isdir(sys_abspath(root, path))
end

function Base.isfile(root::AbstractFileSystemRoot, path::RelPath)
    return isfile(sys_abspath(root, path))
end

function Base.ispath(root::AbstractFileSystemRoot, path::RelPath)
    return ispath(sys_abspath(root, path))
end
25-
Base.read(root::AbstractFileSystemRoot, path::RelPath, ::Type{T}) where {T} =
26-
read(sys_abspath(root, path), T)
27-
Base.read(root::AbstractFileSystemRoot, path::RelPath) where {T} =
28-
read(sys_abspath(root, path))
2933

3034
# Summarize a filesystem root by printing its local filesystem path.
function Base.summary(io::IO, root::AbstractFileSystemRoot)
    print(io, sys_abspath(root))
end
3135

32-
function Base.open(f::Function, as_type::Type{IO}, root::AbstractFileSystemRoot, path;
33-
kws...)
34-
@context f(@! open(as_type, root, path; kws...))
35-
end
36+
# List the directory at `path` by forwarding to `readdir` on the
# corresponding local filesystem path.
function Base.readdir(root::AbstractFileSystemRoot, path::RelPath)
    return readdir(sys_abspath(root, path))
end
3637

37-
@! function Base.open(::Type{IO}, root::AbstractFileSystemRoot, path;
38-
write=false, read=!write, kws...)
39-
if !iswriteable(root) && write
40-
error("Error writing file at read-only path $path")
41-
end
42-
@! open(sys_abspath(root, path); read=read, write=write, kws...)
43-
end
38+
## 2. Mutation
39+
#
40+
# TODO: Likely requires rework!
4441

4542
function Base.mkdir(root::AbstractFileSystemRoot, path::RelPath; kws...)
4643
if !iswriteable(root)
@@ -54,9 +51,49 @@ function Base.rm(root::AbstractFileSystemRoot, path::RelPath; kws...)
5451
rm(sys_abspath(root,path); kws...)
5552
end
5653

57-
Base.readdir(root::AbstractFileSystemRoot, path::RelPath) = readdir(sys_abspath(root, path))
54+
#--------------------------------------------------
55+
# Storage data interface for Blob
56+
57+
# TODO: Make this the generic implementation for AbstractDataStorage
58+
# `open(f, IO, root, path)` form: open the file within a fresh resource
# context, pass the opened stream to `f` and return the result of `f`.
function Base.open(f::Function, as_type::Type{IO},
                   root::AbstractFileSystemRoot, path; kws...)
    @context begin
        stream = @! open(as_type, root, path; kws...)
        f(stream)
    end
end
62+
63+
# Open the file at `path` under `root` as an `IO` stream, refusing with an
# error when write access is requested on a root which is not writeable.
@! function Base.open(::Type{IO}, root::AbstractFileSystemRoot, path;
                      write=false, read=!write, kws...)
    (!iswriteable(root) && write) &&
        error("Error writing file at read-only path $path")
    @! open(sys_abspath(root, path); read=read, write=write, kws...)
end
70+
71+
# Read a value of type `T` from the file at `path` under `root`.
function Base.read(root::AbstractFileSystemRoot, path::RelPath, ::Type{T}) where {T}
    return read(sys_abspath(root, path), T)
end

# Read the whole content of the file at `path` under `root`.
function Base.read(root::AbstractFileSystemRoot, path::RelPath)
    return read(sys_abspath(root, path))
end
5875

5976
#--------------------------------------------------
77+
"""
78+
79+
## Metadata spec
80+
81+
For Blob:
82+
```
83+
[datasets.storage]
84+
driver="FileSystem"
85+
type="Blob"
86+
path=\$(path_to_file)
87+
```
88+
89+
For BlobTree:
90+
```
91+
[datasets.storage]
92+
driver="FileSystem"
93+
type="BlobTree"
94+
path=\$(path_to_directory)
95+
```
96+
"""
6097
struct FileSystemRoot <: AbstractFileSystemRoot
6198
path::String
6299
read::Bool
@@ -72,12 +109,15 @@ iswriteable(root::FileSystemRoot) = root.write
72109

73110
sys_abspath(root::FileSystemRoot) = root.path
74111

75-
# For use outside DataSets, we will assume the special case that abspath() with
76-
# a RelPath refers to the current working directory on the local system.
77-
Base.abspath(relpath::RelPath) =
112+
# Deprecated: interpret `relpath` relative to the current working directory
# of the local filesystem. Emits a deprecation warning on use.
function Base.abspath(relpath::RelPath)
    msg = """
        `abspath(::RelPath)` defaults to using `pwd()` as the root of the path
        but this leads to fragile code so will be removed in the future"""
    Base.depwarn(msg, :abspath)
    cwd_root = FileSystemRoot(pwd(); write=true, read=true)
    return AbsPath(cwd_root, relpath)
end
79119

80-
#--------------------------------------------------
120+
#-------------------------------------------------------------------------------
81121
# Infrastructure for a somewhat more functional interface for creating file
82122
# trees than the fully mutable version we usually use.
83123

test/Data.toml

+40-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26"
2323
# type="text"
2424
# parameters={encoding="UTF-8"}
2525

26-
2726
#--------------------------------------------------
2827
[[datasets]]
2928
description="Gzipped CSV example"
@@ -55,3 +54,43 @@ uuid="e7fd7080-e346-4a68-9ca9-98593a99266a"
5554

5655
# TODO: Add data maps here which expose it logically as a single CSV?
5756

57+
58+
#--------------------------------------------------
59+
[[datasets]]
60+
description="A data blob embedded in the TOML"
61+
name="embedded_blob"
62+
uuid="b498f769-a7f6-4f67-8d74-40b770398f26"
63+
64+
[datasets.storage]
65+
driver="TomlDataStorage"
66+
type="Blob"
67+
data="AAAAAAAARUA="
68+
69+
70+
[[datasets]]
71+
description="A data tree embedded in the TOML"
72+
name="embedded_tree"
73+
uuid="b498f769-a7f6-4f67-8d74-40b770398f26"
74+
75+
[datasets.storage]
76+
driver="TomlDataStorage"
77+
type="BlobTree"
78+
79+
# TOML.print(Dict("datasets"=>[Dict("storage"=>Dict("data"=>Dict(["d0$i"=>Dict(["$x.txt"=>base64encode("$i $x content") for x in ("a","b")]...) for i in 1:4]...)))]))
80+
81+
[datasets.storage.data.d01]
82+
"b.txt" = "MSBiIGNvbnRlbnQ="
83+
"a.txt" = "MSBhIGNvbnRlbnQ="
84+
85+
[datasets.storage.data.d02]
86+
"b.txt" = "MiBiIGNvbnRlbnQ="
87+
"a.txt" = "MiBhIGNvbnRlbnQ="
88+
89+
[datasets.storage.data.d03]
90+
"b.txt" = "MyBiIGNvbnRlbnQ="
91+
"a.txt" = "MyBhIGNvbnRlbnQ="
92+
93+
[datasets.storage.data.d04]
94+
"b.txt" = "NCBiIGNvbnRlbnQ="
95+
"a.txt" = "NCBhIGNvbnRlbnQ="
96+

0 commit comments

Comments
 (0)