
Commit d918afa

Add drivers section to Data.toml for autoloading (#20)
This allows modules which provide data storage drivers to be automatically loaded when DataSets itself is loaded, providing a declarative data environment workflow while bypassing world age issues we'd get from loading these on demand. It seems somewhat unclear whether interfering with code loading like this is a good idea, but I think only time and some experience with this mechanism will tell...
1 parent 9b582f2 commit d918afa
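For orientation, here is a rough sketch of the mechanism this commit introduces, condensed from the diff below. The driver name, package name, and UUID are taken from this commit's test fixtures, not from a real storage backend: a Data.toml gains a [[drivers]] section naming the package that provides each storage driver, and DataSets loads that package eagerly when the project itself is loaded.

    # Sketch only: how a [[drivers]] entry is declared and resolved.
    #
    # In Data.toml (dummy names from this commit's test fixtures):
    #
    #   [[drivers]]
    #   type="storage"
    #   name="DummyTomlStorage"
    #
    #   [drivers.module]
    #   name="DummyStorageBackends"
    #   uuid="89b7a33a-382e-4698-a931-421b088d35a2"
    #
    # DataSets turns each entry into a PkgId and loads it at project-load time,
    # sidestepping the world age issues of on-demand loading:

    using UUIDs
    using Base: PkgId

    driver_conf = Dict(
        "type"   => "storage",
        "name"   => "DummyTomlStorage",
        "module" => Dict("name" => "DummyStorageBackends",
                         "uuid" => "89b7a33a-382e-4698-a931-421b088d35a2"),
    )

    pkgid = PkgId(UUID(driver_conf["module"]["uuid"]), driver_conf["module"]["name"])
    # Base.require(pkgid)  # the loaded package is expected to register itself via
    #                      # DataSets.add_storage_driver("DummyTomlStorage"=>opener) in its __init__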

7 files changed: +124 −3 lines changed


src/DataSets.jl  (+63 −3)

@@ -4,6 +4,7 @@ using UUIDs
 using TOML
 using SHA
 using ResourceContexts
+using Base: PkgId

 export DataSet, dataset, @datafunc, @datarun
 export Blob, BlobTree, newfile, newdir
@@ -215,6 +216,8 @@ identifier, `nothing` is returned.
 """
 project_name(data_project::AbstractDataProject) = nothing

+data_drivers(proj::AbstractDataProject) = []
+
 #-------------------------------------------------------------------------------
 """
     DataProject
@@ -224,11 +227,15 @@ Names are unique within the project.
 """
 struct DataProject <: AbstractDataProject
     datasets::Dict{String,DataSet}
+    drivers::Vector{Dict{String,Any}}
 end

-DataProject() = DataProject(Dict{String,DataSet}())
+DataProject() = DataProject(Dict{String,DataSet}(), Vector{Dict{String,Any}}())
+
+DataProject(project::AbstractDataProject) = DataProject(Dict(pairs(project)),
+                                                        Vector{Dict{String,Any}}())

-DataProject(project::AbstractDataProject) = DataProject(Dict(pairs(project)))
+data_drivers(project::DataProject) = project.drivers

 function _fill_template(toml_path, toml_str)
     # Super hacky templating for paths relative to the toml file.
@@ -276,6 +283,14 @@ function load_project(config::AbstractDict; kws...)
         dataset = DataSet(dataset_conf)
         link_dataset(proj, dataset.name => dataset)
     end
+    if haskey(config, "drivers")
+        _check_keys(config, DataProject, ["drivers"=>AbstractVector])
+        for driver_conf in config["drivers"]
+            _check_keys(driver_conf, DataProject, ["type"=>String, "name"=>String, "module"=>Dict])
+            _check_keys(driver_conf["module"], DataProject, ["name"=>String, "uuid"=>String])
+            push!(proj.drivers, driver_conf)
+        end
+    end
     proj
 end

@@ -363,6 +378,8 @@ end

 StackedDataProject() = StackedDataProject([])

+data_drivers(stack::StackedDataProject) = vcat(data_drivers.(stack.projects)...)
+
 function Base.keys(stack::StackedDataProject)
     names = []
     for project in stack.projects
@@ -479,8 +496,23 @@ PROJECT = StackedDataProject()
 # deprecated. TODO: Remove dependency on this from JuliaHub
 _current_project = DataProject()

+_isprecompiling() = ccall(:jl_generating_output, Cint, ()) == 1
+
 function __init__()
-    global PROJECT = create_project_stack(ENV)
+    # Triggering Base.require for storage drivers during precompilation should
+    # be unnecessary and can cause problems if those driver modules use
+    # Requires-like code loading.
+    if !_isprecompiling()
+        global PROJECT = create_project_stack(ENV)
+        for proj in PROJECT.projects
+            try
+                add_storage_driver(proj)
+            catch exc
+                @error "Could not load storage drivers from data project" #=
+                    =# project=proj exception=(exc,catch_backtrace())
+            end
+        end
+    end
 end

 dataset(name) = dataset(PROJECT, name)
@@ -494,6 +526,7 @@ May be renamed in a future version.
 """
 function load_project!(path_or_config)
     new_project = load_project(path_or_config, auto_update=true)
+    add_storage_driver(new_project)
     pushfirst!(PROJECT, new_project)
     # deprecated: _current_project reflects only the initial version of the
     # project on *top* of the stack.
@@ -527,6 +560,33 @@ function add_storage_driver((name,opener)::Pair)
     end
 end

+function add_storage_driver(project::AbstractDataProject)
+    for conf in data_drivers(project)
+        if conf["type"] != "storage"
+            # Anticipate there might be layer drivers too
+            continue
+        end
+        pkgid = PkgId(UUID(conf["module"]["uuid"]), conf["module"]["name"])
+        if Base.haskey(Base.package_locks, pkgid)
+            # Hack: Avoid triggering another call to require() for packages
+            # which are already in the process of being loaded. (This would
+            # result in a deadlock!)
+            #
+            # Obviously this depends on Base internals...
+            continue
+        end
+        mod = Base.require(pkgid)
+        driver_name = conf["name"]
+        # Module itself does add_storage_driver() inside its __init__
+        # TODO: Is this a good workflow?
+        lock(_storage_drivers_lock) do
+            get(_storage_drivers, driver_name) do
+                error("Package $pkgid did not provide storage driver $driver_name")
+            end
+        end
+    end
+end
+
 function _find_driver(dataset)
     storage_config = dataset.storage
     driver_name = get(storage_config, "driver") do
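To see how this fits together in practice, here is a rough usage sketch mirroring the new test/driver_autoload.jl further down in this diff; the paths and names refer to the test fixtures added in this commit, so treat it as an illustration rather than a standalone example:

    using DataSets

    # Make the dummy driver package findable and point DataSets at the fixture
    # project, exactly as the new test does:
    pushfirst!(LOAD_PATH, abspath("drivers"))
    ENV["JULIA_DATASETS_PATH"] = joinpath(@__DIR__, "DriverAutoloadData.toml")
    DataSets.__init__()   # rebuilds the project stack; drivers are required here

    # No explicit `using DummyStorageBackends` is needed: the [[drivers]] entry
    # triggered its loading, and its __init__ registered the storage driver.
    open(String, dataset("dummy_storage_blob"))   # == "data_from_dummy_backend"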

src/file_data_projects.jl  (+2)

@@ -118,6 +118,8 @@ end

 Base.pairs(proj::AbstractTomlFileDataProject) = pairs(_get_cached(proj))

+data_drivers(proj::AbstractTomlFileDataProject) = data_drivers(_get_cached(proj))
+
 #-------------------------------------------------------------------------------
 """
 Data project which automatically updates based on a TOML file on the local

test/DriverAutoloadData.toml  (new file, +21)

@@ -0,0 +1,21 @@
+data_config_version = 0
+
+[[datasets]]
+description="Test dynamic loading of drivers"
+name="dummy_storage_blob"
+uuid="785b3cdc-428e-426f-a3f7-3f6ae88a9637"
+
+[datasets.storage]
+driver="DummyTomlStorage"
+type="Blob"
+data="data_from_dummy_backend"
+
+#-------------------------------------------------------------------------------
+
+[[drivers]]
+type="storage"
+name="DummyTomlStorage"
+
+[drivers.module]
+name="DummyStorageBackends"
+uuid="89b7a33a-382e-4698-a931-421b088d35a2"

test/driver_autoload.jl  (new file, +9)

@@ -0,0 +1,9 @@
+@testset "Automatic code loading for drivers" begin
+    empty!(DataSets.PROJECT)
+    pushfirst!(LOAD_PATH, abspath("drivers"))
+    ENV["JULIA_DATASETS_PATH"] = joinpath(@__DIR__, "DriverAutoloadData.toml")
+    DataSets.__init__()
+    @test haskey(DataSets._storage_drivers, "DummyTomlStorage")
+
+    @test open(String, dataset("dummy_storage_blob")) == "data_from_dummy_backend"
+end

test/drivers/DummyStorageBackends/Project.toml  (new file, +5)

@@ -0,0 +1,5 @@
+name="DummyStorageBackends"
+uuid="89b7a33a-382e-4698-a931-421b088d35a2"
+
+[deps]
+DataSets = "c9661210-8a83-48f0-b833-72e62abce419"

test/drivers/DummyStorageBackends/src/DummyStorageBackends.jl  (new file, +23)

@@ -0,0 +1,23 @@
+module DummyStorageBackends
+
+using DataSets
+
+struct DummyBackend
+    data
+end
+
+function Base.open(f::Function, ::Type{IO}, storage::DummyBackend, path; kws...) where {T}
+    @assert isempty(path)
+    f(IOBuffer(storage.data))
+end
+
+function connect_dummy_backend(f, config, ds)
+    storage = DummyBackend(config["data"])
+    f(Blob(storage))
+end
+
+function __init__()
+    DataSets.add_storage_driver("DummyTomlStorage"=>connect_dummy_backend)
+end
+
+end

test/runtests.jl  (+1)

@@ -125,3 +125,4 @@ include("entrypoint.jl")
 include("repl.jl")
 include("DataTomlStorage.jl")
 include("backend_compat.jl")
+include("driver_autoload.jl")
