From e8dce9507c2f0824ab4690b348ac11705dd368e5 Mon Sep 17 00:00:00 2001 From: Chris Foster <chris42f@gmail.com> Date: Sat, 5 Jun 2021 14:04:43 +1000 Subject: [PATCH 01/10] Improve links in README --- README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1e71d95..29069f7 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # DataSets -[](https://github.com/JuliaComputing/DataSets.jl/actions) -[](https://juliacomputing.github.io/DataSets.jl/dev) +[](https://juliahub.com/ui/Packages/DataSets/4adr3) +[](https://juliahub.com/docs/DataSets) DataSets.jl exists to help manage data and reduce the amount of data wrangling code you need to write. It's annoying to write @@ -19,5 +19,9 @@ to make it easy to *relocate* an algorithm between different data environments without code changes. For example from your laptop to the cloud, to another user's machine, or to an HPC system. -Read [**the documentation**](https://juliacomputing.github.io/DataSets.jl/dev/) -for more information. +Read [**the latest documentation**](https://juliahub.com/docs/DataSets) more information. + +### Development + +[](https://juliacomputing.github.io/DataSets.jl/dev) +[](https://github.com/JuliaComputing/DataSets.jl/actions) From 9cca7545c8e382ec805c6c0701c7421729b3d0d3 Mon Sep 17 00:00:00 2001 From: Chris Foster <chris42f@gmail.com> Date: Fri, 11 Jun 2021 22:44:05 +1000 Subject: [PATCH 02/10] Revive data REPL for listing and showing datasets (#19) Support for: * Listing datasets in the global data project stack with `ls`,`list` * Showing the content of datasets with `show` * Manipulate the global data project stack with `stack list`, `stack push`, `stack pop` * Help with `help`/`?` Includes REPL completions, help text and some tests. Doesn't yet include support for adding new datasets to existing projects. 
--- Project.toml | 1 + src/DataSets.jl | 24 +++- src/repl.jl | 327 ++++++++++++++++++++++++++++++++++------------- test/repl.jl | 35 +++++ test/runtests.jl | 1 + 5 files changed, 291 insertions(+), 97 deletions(-) create mode 100644 test/repl.jl diff --git a/Project.toml b/Project.toml index 4a06e6e..692f315 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "0.2.3" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" +Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" ReplMaker = "b873ce64-0db9-51f5-a568-4457d8e49576" ResourceContexts = "8d208092-d35c-4dd3-a0d7-8325f9cce6b4" diff --git a/src/DataSets.jl b/src/DataSets.jl index 52064e2..746da72 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -321,7 +321,11 @@ function Base.show(io::IO, ::MIME"text/plain", project::AbstractDataProject) maxwidth = maximum(textwidth.(first.(sorted))) for (i, (name, data)) in enumerate(sorted) pad = maxwidth - textwidth(name) - print(io, " ", name, ' '^pad, " => ", data.uuid) + storagetype = get(data.storage, "type", nothing) + icon = storagetype == "Blob" ? '📄' : + storagetype == "BlobTree" ? '📁' : + '❓' + print(io, " ", icon, ' ', name, ' '^pad, " => ", data.uuid) if i < length(sorted) println(io) end @@ -375,6 +379,8 @@ end # API for manipulating the stack. 
Base.push!(stack::StackedDataProject, project) = push!(stack.projects, project) Base.pushfirst!(stack::StackedDataProject, project) = pushfirst!(stack.projects, project) +Base.popfirst!(stack::StackedDataProject) = popfirst!(stack.projects) +Base.pop!(stack::StackedDataProject) = pop!(stack.projects) Base.empty!(stack::StackedDataProject) = empty!(stack.projects) function Base.show(io::IO, mime::MIME"text/plain", stack::StackedDataProject) @@ -405,6 +411,14 @@ function expand_project_path(path) path end +function data_project_from_path(path) + if path == "@" + project = ActiveDataProject() + else + project = TomlFileDataProject(expand_project_path(path)) + end +end + function create_project_stack(env) stack = [] env_search_path = get(env, "JULIA_DATASETS_PATH", nothing) @@ -414,11 +428,7 @@ function create_project_stack(env) paths = split(env_search_path, Sys.iswindows() ? ';' : ':') end for path in paths - if path == "@" - project = ActiveDataProject() - else - project = TomlFileDataProject(expand_project_path(path)) - end + project = data_project_from_path(path) push!(stack, project) end StackedDataProject(stack) @@ -577,6 +587,6 @@ include("DataTomlStorage.jl") # include("GitTree.jl") # Application-level stuff -# include("repl.jl") +include("repl.jl") end diff --git a/src/repl.jl b/src/repl.jl index 165d461..789f5cb 100644 --- a/src/repl.jl +++ b/src/repl.jl @@ -1,125 +1,272 @@ -""" - `DataSets.DataApp` contains all `DataSets` "application level" code and state. +module DataREPL + +using Markdown -This includes the builtin REPL utilities and the default data project for use -within a julia session. 
+_data_repl_help = md""" +## DataSets Data REPL + +| Command | Alias | Action | +|:---------- |:--------- | :---------- | +| `help` | `?` | Show this message | +| `list` | `ls` | List all datasets by name | +| `show $name` | | Show the content of dataset `$name` | +| `stack` | `st` | Manipulate the global data search stack | +| `stack list` | `st ls` | List all projects in the global data search stack | +| `stack push $path` | `st push` | Add data project `$path` to front of the search stack | +| `stack pop` | `st pop` | Remove data project from front of the search stack | -In contrast, the main `DataSets` module includes only core data structures and -operations which don't depend on global state. """ -module DataApp using ..DataSets -import ..DataSets: DataSet, DataProject, link_dataset, load_project -using REPL: LineEdit -using URIs +using ResourceContexts + +using REPL using ReplMaker +# using URIs -# Possible REPL verbs -# -# Adding -# add link - Create a new association between external data and the project -# add new - Add a new dataset *within* the project -# -# Removing -# rm [link] - Unlink the dataset from the project -# rm ! <name> - Remove the dataset from the project, and remove the actual -# data too if it's embedded. -# -# open - REPL data viewer (?) -# list - +#------------------------------------------------------------------------------- +# Utilities for browsing dataset content +function show_dataset(name) + out_stream = stdout + @context begin + data = @! open(dataset(name)) + _show_dataset(out_stream, data) + end +end -# Translate `data>` REPL syntax into an Expr to be evaluated in the REPL -# backend. 
-function make_data_repl_command(cmdstr) - # Use shell tokenization rules for familiarity - cmd_tokens = Base.shell_split(cmdstr) - cmdname = cmd_tokens[1] - if cmdname in ("ln", "link") - # FIXME: Test :incomplete - if length(cmd_tokens) < 3 - return Expr(:incomplete, "Needs name and location") +# hex dump in xxd format +function hexdump(out_stream, buf; groups_per_line=8, group_size=2) + linesize = groups_per_line*group_size + for line = 1:div(length(buf), linesize, RoundUp) + linebuf = buf[(line-1)*linesize+1 : min(line*linesize,end)] + address = (line-1)*linesize + print(out_stream, string(address, base=16, pad=4), ": ") + for group = 1:groups_per_line + for i=1:group_size + j = (group-1)*group_size+i + if j <= length(linebuf) + print(out_stream, string(linebuf[j], base=16, pad=2)) + else + print(out_stream, " ") + end + end + print(out_stream, ' ') end - name = cmd_tokens[2] - location = cmd_tokens[3] - toks = cmd_tokens[4:end] - if any(toks[1:2:end] .!= "|") - error("Expected '|' separated layers after $location. Got $toks.") + print(out_stream, ' ') + for j = 1:linesize + c = Char(j <= length(linebuf) ? linebuf[j] : ' ') + print(out_stream, isprint(c) ? c : '.') end - layers = toks[2:2:end] - return quote - name = $name - location = DataSets.DataApp.expand_location($location) - layers = DataSets.DataApp.expand_layer.($layers) - d = DataSets.DataSet(default_name=name, location=location, layers=layers) - DataSets.link_dataset(DataSets.DataApp._current_project, name=>d) - d + print(out_stream, '\n') + end +end + +function _show_dataset(out_stream::IO, blob::Blob) + @context begin + io = @! 
open(IO, blob) + N = 1024 + buf = read(io, N) + str = String(copy(buf)) + n_textlike = count(str) do c + isvalid(c) || return false + isprint(c) || c in ('\n', '\r', '\t') end - elseif cmdname == "unlink" - name = cmd_tokens[2] - return quote - DataSets.unlink_dataset(DataSets.DataApp._current_project, $name) - nothing + if n_textlike / length(str) > 0.95 + # It's approximately UTF-8 encoded text data. + print(out_stream, str) + else + # It's something else, perhaps binary or another text + # encoding. Do a hex dump instead. + hexdump(out_stream, buf) end - elseif cmdname in ("ls", "list") - return quote - # Will be `show()`n by the REPL - DataSets.DataApp._current_project + if !eof(io) + println(out_stream, "…") end - elseif cmdname == "show" - error("Not implemented") - # Idea here could be to open a browser for the data. - else - error("Invalid data REPL syntax: \"$cmdstr\"") end end -function init_repl(; start_key = ">") - ReplMaker.initrepl(make_data_repl_command, - repl = Base.active_repl, - # valid_input_checker = Support for multiple lines syntax? 
- prompt_text = "data> ", - prompt_color = :red, - start_key = start_key, - sticky_mode=true, - mode_name = "Data_Manager", - startup_text=false) - nothing +function _show_dataset(out_stream::IO, tree::BlobTree) + show(out_stream, MIME("text/plain"), tree) +end + +function _show_dataset(out_stream::IO, x) + show(out_stream, MIME("text/plain"), x) end -link_dataset(name_and_data::Pair) = link_dataset(_current_project, name_and_data) -function repl_link_dataset(name, accessors) - @assert length(accessors) == 1 - uri = URI(accessors[1]) - d = DataSet(name=name, location=uri) - link_dataset(d) +#------------------------------------------------------------------------------- +# REPL command handling and completions + +# function split_command(str) +# token_ranges = [] +# i = 1 +# while true +# rng = findnext(r"[^\s]+", full, i) +# !isnothing(rng) || break +# push!(token_ranges, rng) +# i = last(rng)+1 +# end +# tokens = getindex.(full, token_ranges) +# token_ranges, tokens +# end + +function complete_command_list(cmd_prefix, commands) + # Completions for basic commands + completions = String[] + for cmdset in commands + for cmd in cmdset + if cmd == cmd_prefix + # Space after full length command + return ([" "], "", true) + end + if startswith(cmd, cmd_prefix) + push!(completions, cmd*" ") + break + end + end + end + return completions +end + +function path_str(path_completion) + path = REPL.REPLCompletions.completion_text(path_completion) + if Sys.iswindows() + # On windows, REPLCompletions.complete_path() adds extra escapes for + # use within a normal string in the Juila REPL but we don't need those. 
+ path = replace(path, "\\\\"=>'\\') + end + return path end -function list_datasets(io, proj::DataProject) - for (name,d) in proj.datasets - println(io, name, " @ ", d.location) +function complete(str_to_complete) + tokens = split(str_to_complete, r" +", keepempty=true) + cmd = popfirst!(tokens) + if isempty(tokens) + # Completions for basic commands + completions = complete_command_list(cmd, [ + ("list","ls"), + ("show",), + ("stack",), + ("help","?") + ]) + # Empty completion => return anyway to show user their prefix is wrong + return (completions, cmd, !isempty(completions)) + end + if cmd == "show" + if length(tokens) <= 1 + name_prefix = isempty(tokens) ? "" : tokens[1] + completions = String[] + ks = sort!(collect(keys(DataSets.PROJECT))) + for k in ks + if startswith(k, name_prefix) && k != name_prefix + push!(completions, k) + end + end + return (completions, name_prefix, !isempty(completions)) + end + elseif cmd == "stack" + if length(tokens) <= 1 + subcmd_prefix = isempty(tokens) ? "" : tokens[1] + # Completions for project stack subcommands + completions = complete_command_list(subcmd_prefix, [ + ("push",), + ("pop",), + ("list","ls",) + ]) + return (completions, tokens[1], !isempty(completions)) + elseif length(tokens) == 2 + subcmd = popfirst!(tokens) + if subcmd == "push" + path_prefix = isempty(tokens) ? "" : tokens[1] + (path_completions, range, should_complete) = + REPL.REPLCompletions.complete_path(path_prefix, length(path_prefix)) + completions = [path_str(c) for c in path_completions] + return (completions, path_prefix[range], should_complete) + end + end end + return ([], "", false) end -function expand_location(location) - path = abspath(location) - if ispath(path) - uri = URI("file", "", 0, path) - else - uri = URI(location) +# Translate `data>` REPL syntax into an Expr to be evaluated in the REPL +# backend. 
+function parse_data_repl_cmd(cmdstr) + # Use shell tokenization rules for familiarity + tokens = Base.shell_split(cmdstr) + cmd = tokens[1] + popfirst!(tokens) + if cmd in ("list", "ls") + return quote + $DataSets.DataProject(Dict(pairs($DataSets.PROJECT))) + end + elseif cmd == "stack" && length(tokens) >= 1 + subcmd = popfirst!(tokens) + if subcmd == "push" + path = popfirst!(tokens) + return quote + proj = $DataSets.data_project_from_path($path) + stack = $DataSets.PROJECT + pushfirst!(stack, proj) + stack + end + elseif subcmd == "pop" + return quote + stack = $DataSets.PROJECT + popfirst!(stack) + stack + end + elseif subcmd in ("list", "ls") + return quote + $DataSets.PROJECT + end + end + elseif cmd == "show" + name = tokens[1] + return quote + $DataREPL.show_dataset($name) + end + elseif cmd in ("help", "?") + return _data_repl_help end + error("Invalid data REPL syntax: \"$cmdstr\"") end -function expand_layer(layer) - # TODO: expand the short REPL syntax into layer objects? - layer + +#------------------------------------------------------------------------------- +# Integration with REPL / ReplMaker +struct DataCompletionProvider <: REPL.LineEdit.CompletionProvider end +function REPL.complete_line(provider::DataCompletionProvider, + state::REPL.LineEdit.PromptState):: + Tuple{Vector{String},String,Bool} + # See REPL.jl complete_line(c::REPLCompletionProvider, s::PromptState) + partial = REPL.beforecursor(state.input_buffer) + full = REPL.LineEdit.input_string(state) + if partial != full + # For now, only complete at end of line + return ([], "", false) + end + complete(full) +end + +function init_data_repl(; start_key = ">") + ReplMaker.initrepl(parse_data_repl_cmd, + repl = Base.active_repl, + # valid_input_checker = Support for multiple lines syntax? 
+ prompt_text = "data> ", + prompt_color = :red, + start_key = start_key, + sticky_mode = true, + mode_name = "DataSets", + completion_provider = DataCompletionProvider(), + startup_text = true) + nothing +end function __init__() - init_repl() + isinteractive() && init_data_repl() end end diff --git a/test/repl.jl b/test/repl.jl new file mode 100644 index 0000000..3d17205 --- /dev/null +++ b/test/repl.jl @@ -0,0 +1,35 @@ +using DataSets.DataREPL: complete, parse_data_repl_cmd + +@testset "repl completions" begin + @test complete("") == (["list ", "show ", "stack ", "help "], "", true) + @test complete("s") == (["show ", "stack "], "s", true) + @test complete("stack ") == (["push ", "pop ", "list "], "", true) + + cd(@__DIR__) do + if Sys.iswindows() + @test complete("stack push da") == (["data\\"], "da", true) + else + @test complete("stack push da") == (["data/"], "da", true) + end + end +end + +@testset "repl commands" begin + @test eval(parse_data_repl_cmd("help")) === DataSets.DataREPL._data_repl_help + @test eval(parse_data_repl_cmd("?")) === DataSets.DataREPL._data_repl_help + empty!(DataSets.PROJECT) + @test eval(parse_data_repl_cmd("stack push $(@__DIR__)")) === DataSets.PROJECT + @test length(DataSets.PROJECT.projects) == 1 + @test eval(parse_data_repl_cmd("stack pop")) === DataSets.PROJECT + @test isempty(DataSets.PROJECT.projects) +end + +@testset "data show utils" begin + @test sprint(DataSets.DataREPL.hexdump, UInt8.(0:70)) == raw""" + 0000: 0001 0203 0405 0607 0809 0a0b 0c0d 0e0f ................ + 0010: 1011 1213 1415 1617 1819 1a1b 1c1d 1e1f ................ + 0020: 2021 2223 2425 2627 2829 2a2b 2c2d 2e2f !"#$%&'()*+,-./ + 0030: 3031 3233 3435 3637 3839 3a3b 3c3d 3e3f 0123456789:;<=>? 
+ 0040: 4041 4243 4445 46 @ABCDEF + """ +end diff --git a/test/runtests.jl b/test/runtests.jl index 9c50bf3..167e5e0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -161,6 +161,7 @@ end @test isfile(DataSets.sys_abspath(temptree["d1"]["hi_2.txt"])) end +include("repl.jl") include("projects.jl") include("DataTomlStorage.jl") include("backend_compat.jl") From 425c1b3d4ca057dc348d9cc6232b7d0c352e62fd Mon Sep 17 00:00:00 2001 From: Chris Foster <chris42f@gmail.com> Date: Sat, 5 Jun 2021 16:16:39 +1000 Subject: [PATCH 03/10] Improved error message for missing drivers --- src/DataSets.jl | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/DataSets.jl b/src/DataSets.jl index 746da72..5638f98 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -522,15 +522,28 @@ function add_storage_driver((name,opener)::Pair) end end +function _find_driver(dataset) + storage_config = dataset.storage + driver_name = get(storage_config, "driver") do + error("`storage.driver` configuration not found for dataset $(dataset.name)") + end + driver = lock(_storage_drivers_lock) do + get(_storage_drivers, driver_name) do + error(""" + Storage driver $(repr(driver_name)) not found for dataset $(dataset.name). + Current drivers are $(collect(keys(_storage_drivers))) + """) + end + end +end + #------------------------------------------------------------------------------- # Functions for opening datasets # do-block form of open() function Base.open(f::Function, as_type, dataset::DataSet) storage_config = dataset.storage - driver = lock(_storage_drivers_lock) do - _storage_drivers[storage_config["driver"]] - end + driver = _find_driver(dataset) driver(storage_config, dataset) do storage open(f, as_type, storage) end @@ -539,10 +552,7 @@ end # Contexts-based form of open() @! 
function Base.open(dataset::DataSet) storage_config = dataset.storage - driver_name = storage_config["driver"] - driver = lock(_storage_drivers_lock) do - _storage_drivers[driver_name] - end + driver = _find_driver(dataset) # Use `enter_do` because drivers don't yet use the ResourceContexts.jl mechanism (storage,) = @! enter_do(driver, storage_config, dataset) storage From de0a3482ddeaaed224d539bee3d724551e87cede Mon Sep 17 00:00:00 2001 From: Chris Foster <chris42f@gmail.com> Date: Mon, 14 Jun 2021 14:46:33 +1000 Subject: [PATCH 04/10] Factor out list of dataset names in tests Now only need to update one place when the list of test datasets changes. --- src/ZipTree.jl | 2 +- test/projects.jl | 23 +++++++++++++++-------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/ZipTree.jl b/src/ZipTree.jl index 2db62af..cfb80c2 100644 --- a/src/ZipTree.jl +++ b/src/ZipTree.jl @@ -122,7 +122,7 @@ close(r) #------------------------------------------------------------------------------- # Fixes for ZipFile.jl -# TODO: Upstream these! +# TODO: Remove these once https://github.com/fhs/ZipFile.jl/pull/75 is in a release. # It appears that ZipFile.jl just doesn't have a way to rewind to the start of # one of the embedded files. 
diff --git a/test/projects.jl b/test/projects.jl index f2bce97..850960c 100644 --- a/test/projects.jl +++ b/test/projects.jl @@ -8,6 +8,14 @@ using DataSets: StackedDataProject, project_name +test_project_names = ["a_table", + "a_text_file", + "a_tree_example", + "embedded_blob", + "embedded_tree", + "old_backend_blob", + "old_backend_tree"] + @testset "TomlFileDataProject" begin proj = TomlFileDataProject(abspath("Data.toml")) # getindex, get @@ -16,13 +24,13 @@ using DataSets: @test isnothing(get(proj, "nonexistent_data", nothing)) # keys - @test sort(collect(keys(proj))) == ["a_table", "a_text_file", "a_tree_example", "embedded_blob", "embedded_tree", "old_backend_blob", "old_backend_tree"] + @test sort(collect(keys(proj))) == test_project_names @test haskey(proj, "a_text_file") @test !haskey(proj, "nonexistent_data") # iteration - @test sort(getproperty.(collect(proj), :name)) == ["a_table", "a_text_file", "a_tree_example", "embedded_blob", "embedded_tree", "old_backend_blob", "old_backend_tree"] - @test sort(first.(pairs(proj))) == ["a_table", "a_text_file", "a_tree_example", "embedded_blob", "embedded_tree", "old_backend_blob", "old_backend_tree"] + @test sort(getproperty.(collect(proj), :name)) == test_project_names + @test sort(first.(pairs(proj))) == test_project_names # identity @test project_name(proj) == abspath("Data.toml") @@ -99,7 +107,7 @@ end push!(proj, TomlFileDataProject(joinpath(@__DIR__, "active_project", "Data.toml"))) push!(proj, TomlFileDataProject(joinpath(@__DIR__, "Data.toml"))) - @test sort(collect(keys(proj))) == ["a_table", "a_text_file", "a_tree_example", "embedded_blob", "embedded_tree", "old_backend_blob", "old_backend_tree"] + @test sort(collect(keys(proj))) == test_project_names # Data "a_text_file" should be found in the first project in the stack, # overriding the data of the same name in the second project. 
@test proj["a_text_file"].uuid == UUID("314996ef-12be-40d0-912c-9755af354fdb") @@ -123,10 +131,9 @@ end # Test that __init__ takes global DataSets.PROJECT from ENV empty!(DataSets.PROJECT) - ENV["JULIA_DATASETS_PATH"] = datasets_paths + ENV["JULIA_DATASETS_PATH"] = @__DIR__ DataSets.__init__() - @test DataSets.PROJECT.projects[1] isa ActiveDataProject - @test DataSets.PROJECT.projects[2] isa TomlFileDataProject - @test DataSets.PROJECT.projects[3] isa TomlFileDataProject + @test DataSets.PROJECT.projects[1] isa TomlFileDataProject + @test project_name(DataSets.PROJECT.projects[1]) == joinpath(@__DIR__, "Data.toml") end From 253aa99e7752cb2003c4065e232e72960bc037d3 Mon Sep 17 00:00:00 2001 From: Chris Foster <chris42f@gmail.com> Date: Mon, 14 Jun 2021 16:18:52 +1000 Subject: [PATCH 05/10] Disable data REPL startup text While it's helpful for direct users, this can show up in unexpected places when DataSets is loaded by another module. So probably best to disable this. --- src/repl.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/repl.jl b/src/repl.jl index 789f5cb..b3dee5e 100644 --- a/src/repl.jl +++ b/src/repl.jl @@ -261,7 +261,7 @@ function init_data_repl(; start_key = ">") sticky_mode = true, mode_name = "DataSets", completion_provider = DataCompletionProvider(), - startup_text = true) + startup_text = false) nothing end From 93c7905fc6dbd313a2668b7433d9e186e01b8167 Mon Sep 17 00:00:00 2001 From: Chris Foster <chris42f@gmail.com> Date: Mon, 14 Jun 2021 16:21:16 +1000 Subject: [PATCH 06/10] Move entry point tests into their own file --- test/Data.toml | 4 ++++ test/backend_compat.jl | 22 ++++++++++--------- test/entrypoint.jl | 42 ++++++++++++++++++++++++++++++++++++ test/runtests.jl | 48 +++--------------------------------------- 4 files changed, 61 insertions(+), 55 deletions(-) create mode 100644 test/entrypoint.jl diff --git a/test/Data.toml b/test/Data.toml index c313528..819cd6b 100644 --- a/test/Data.toml +++ b/test/Data.toml @@ 
-56,6 +56,8 @@ uuid="e7fd7080-e346-4a68-9ca9-98593a99266a" #-------------------------------------------------- +# Data embedded in the TOML + [[datasets]] description="A data blob embedded in the TOML" name="embedded_blob" @@ -95,6 +97,8 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" "a.txt" = "NCBhIGNvbnRlbnQ=" #-------------------------------------------------- +# Old backend API tests + [[datasets]] description="Test old storage backend API, Blob" name="old_backend_blob" diff --git a/test/backend_compat.jl b/test/backend_compat.jl index 8cfff10..55c9872 100644 --- a/test/backend_compat.jl +++ b/test/backend_compat.jl @@ -66,17 +66,19 @@ DataSets.add_storage_driver("OldBackendAPI"=>connect_old_backend) #------------------------------------------------------------------------------- @testset "OldBackendAPI" begin - @test open(IO, dataset("old_backend_blob")) do io + proj = DataSets.load_project("Data.toml") + + @test open(IO, dataset(proj, "old_backend_blob")) do io read(io, String) end == "x" - @test String(open(read, IO, dataset("old_backend_blob"))) == "x" - @test open(Vector{UInt8}, dataset("old_backend_blob")) == UInt8['x'] - @test read(open(dataset("old_backend_blob")), String) == "x" - @test read(open(dataset("old_backend_blob"))) == UInt8['x'] - - @test readdir(open(dataset("old_backend_tree"))) == ["a.txt", "b.txt"] - @test open(dataset("old_backend_tree"))[path"a.txt"] isa Blob - @test read(open(dataset("old_backend_tree"))[path"a.txt"], String) == "a" - @test read(open(dataset("old_backend_tree"))[path"b.txt"], String) == "b" + @test String(open(read, IO, dataset(proj, "old_backend_blob"))) == "x" + @test open(Vector{UInt8}, dataset(proj, "old_backend_blob")) == UInt8['x'] + @test read(open(dataset(proj, "old_backend_blob")), String) == "x" + @test read(open(dataset(proj, "old_backend_blob"))) == UInt8['x'] + + @test readdir(open(dataset(proj, "old_backend_tree"))) == ["a.txt", "b.txt"] + @test open(dataset(proj, "old_backend_tree"))[path"a.txt"] isa 
Blob + @test read(open(dataset(proj, "old_backend_tree"))[path"a.txt"], String) == "a" + @test read(open(dataset(proj, "old_backend_tree"))[path"b.txt"], String) == "b" end diff --git a/test/entrypoint.jl b/test/entrypoint.jl new file mode 100644 index 0000000..85a3597 --- /dev/null +++ b/test/entrypoint.jl @@ -0,0 +1,42 @@ +# Data entry point functions +read_data = nothing + +@datafunc function main1(x::Blob=>String, t::BlobTree=>BlobTree) + csv_data = open(IO, t["1.csv"]) do io + read(io,String) + end + global read_data = (x_string=x, csv_data=csv_data) +end + +@datafunc function main1(x::Blob=>IO) + x_data = read(x, String) + global read_data = (x_data=x_data,) +end + + +@testset "@datafunc and @datarun" begin + proj = DataSets.load_project("Data.toml") + + @datarun proj main1("a_text_file", "a_tree_example") + + @test read_data == (x_string="Hello world!\n", + csv_data="Name,Age\n\"Aaron\",23\n\"Harry\",42\n") + + @datarun proj main1("a_text_file") + @test read_data == (x_data="Hello world!\n",) + + # No match for a single tree + @test_throws ArgumentError @datarun proj main1("a_tree_example") +end + +@testset "@datarun with DataSet.PROJECT" begin + empty!(DataSets.PROJECT) + DataSets.load_project!("Data.toml") + + @test dataset("a_text_file").uuid == UUID("b498f769-a7f6-4f67-8d74-40b770398f26") + + global read_data = nothing + @datarun main1("a_text_file") + @test read_data == (x_data="Hello world!\n",) +end + diff --git a/test/runtests.jl b/test/runtests.jl index 167e5e0..c6ef5e8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -83,50 +83,7 @@ end end #------------------------------------------------------------------------------- -# Data entry points -read_data = nothing - -@datafunc function main1(x::Blob=>String, t::BlobTree=>BlobTree) - csv_data = open(IO, t["1.csv"]) do io - read(io,String) - end - global read_data = (x_string=x, csv_data=csv_data) -end - -@datafunc function main1(x::Blob=>IO) - x_data = read(x, String) - global read_data = 
(x_data=x_data,) -end - - -@testset "@datafunc and @datarun" begin - proj = DataSets.load_project("Data.toml") - - @datarun proj main1("a_text_file", "a_tree_example") - - @test read_data == (x_string="Hello world!\n", - csv_data="Name,Age\n\"Aaron\",23\n\"Harry\",42\n") - - @datarun proj main1("a_text_file") - @test read_data == (x_data="Hello world!\n",) - - # No match for a single tree - @test_throws ArgumentError @datarun proj main1("a_tree_example") -end - -@testset "@datarun with DataSet.PROJECT" begin - empty!(DataSets.PROJECT) - DataSets.load_project!("Data.toml") - - @test dataset("a_text_file").uuid == UUID("b498f769-a7f6-4f67-8d74-40b770398f26") - - global read_data = nothing - @datarun main1("a_text_file") - @test read_data == (x_data="Hello world!\n",) -end - -#------------------------------------------------------------------------------- -@testset "Data set parsing" begin +@testset "Data set name parsing" begin # Valid names @test DataSets.check_dataset_name("a_b") === nothing @test DataSets.check_dataset_name("a1") === nothing @@ -161,7 +118,8 @@ end @test isfile(DataSets.sys_abspath(temptree["d1"]["hi_2.txt"])) end -include("repl.jl") include("projects.jl") +include("entrypoint.jl") +include("repl.jl") include("DataTomlStorage.jl") include("backend_compat.jl") From 9b582f265459d93926c41857997b79e789f5bf98 Mon Sep 17 00:00:00 2001 From: Chris Foster <chris42f@gmail.com> Date: Tue, 15 Jun 2021 22:21:06 +1000 Subject: [PATCH 07/10] More Data.toml schema validation and other minor cleanup (#22) --- src/DataSets.jl | 37 +++++++++++++++++++++---------------- src/repl.jl | 2 +- test/runtests.jl | 2 ++ 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/DataSets.jl b/src/DataSets.jl index 5638f98..3971caa 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -27,7 +27,8 @@ struct DataSet conf function DataSet(conf) - _check_keys(conf, DataSet, ["uuid", "storage", "name"]) + _check_keys(conf, DataSet, ["uuid"=>String, "storage"=>Dict, 
"name"=>String]) + _check_keys(conf["storage"], DataSet, ["driver"=>String]) check_dataset_name(conf["name"]) new(UUID(conf["uuid"]), conf) end @@ -55,15 +56,18 @@ struct DataSet =# end -function _check_keys(toml, context, keys) - missed_keys = filter(k->!haskey(toml, k), keys) +_key_match(config, (k,T)::Pair) = haskey(config, k) && config[k] isa T +_key_match(config, k::String) = haskey(config, k) + +function _check_keys(config, context, keys) + missed_keys = filter(k->!_key_match(config, k), keys) if !isempty(missed_keys) error(""" - Missing expected keys: + Missing expected keys in $context: $missed_keys In TOML fragment: - $(sprint(TOML.print,toml)) + $(sprint(TOML.print,config)) """) end end @@ -76,10 +80,6 @@ function check_dataset_name(name::AbstractString) end end -function read_toml(::Type{DataSet}, toml) - DataSet(toml) -end - # Hacky thing until we figure out which fields DataSet should actually have. function Base.getproperty(d::DataSet, name::Symbol) if name in fieldnames(DataSet) @@ -228,6 +228,8 @@ end DataProject() = DataProject(Dict{String,DataSet}()) +DataProject(project::AbstractDataProject) = DataProject(Dict(pairs(project))) + function _fill_template(toml_path, toml_str) # Super hacky templating for paths relative to the toml file. # We really should have something a lot nicer here... @@ -263,13 +265,15 @@ function load_project(path::AbstractString; auto_update=false) end function load_project(config::AbstractDict; kws...) 
+ _check_keys(config, "Data.toml", ["data_config_version"=>Integer, + "datasets"=>AbstractVector]) format_ver = config["data_config_version"] if format_ver > CURRENT_DATA_CONFIG_VERSION error("data_config_version=$format_ver is newer than supported") end proj = DataProject() - for data_toml in config["datasets"] - dataset = read_toml(DataSet, data_toml) + for dataset_conf in config["datasets"] + dataset = DataSet(dataset_conf) link_dataset(proj, dataset.name => dataset) end proj @@ -413,19 +417,20 @@ end function data_project_from_path(path) if path == "@" - project = ActiveDataProject() + ActiveDataProject() else - project = TomlFileDataProject(expand_project_path(path)) + TomlFileDataProject(expand_project_path(path)) end end function create_project_stack(env) stack = [] env_search_path = get(env, "JULIA_DATASETS_PATH", nothing) - if isnothing(env_search_path) || env_search_path == "" + if isnothing(env_search_path) paths = ["@", ""] else - paths = split(env_search_path, Sys.iswindows() ? ';' : ':') + paths = isempty(env_search_path) ? String[] : + split(env_search_path, Sys.iswindows() ? ';' : ':') end for path in paths project = data_project_from_path(path) @@ -492,7 +497,7 @@ function load_project!(path_or_config) pushfirst!(PROJECT, new_project) # deprecated: _current_project reflects only the initial version of the # project on *top* of the stack. 
- _current_project = DataProject(Dict(pairs(new_project))) + _current_project = DataProject(new_project) end #------------------------------------------------------------------------------- diff --git a/src/repl.jl b/src/repl.jl index b3dee5e..94720bc 100644 --- a/src/repl.jl +++ b/src/repl.jl @@ -198,7 +198,7 @@ function parse_data_repl_cmd(cmdstr) popfirst!(tokens) if cmd in ("list", "ls") return quote - $DataSets.DataProject(Dict(pairs($DataSets.PROJECT))) + $DataSets.DataProject($DataSets.PROJECT) end elseif cmd == "stack" && length(tokens) >= 1 subcmd = popfirst!(tokens) diff --git a/test/runtests.jl b/test/runtests.jl index c6ef5e8..1fc5aa6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,3 +1,5 @@ +ENV["JULIA_DATASETS_PATH"] = "" + using DataSets using Test using UUIDs From d918afacfb7060b82f634f0b7905954ca0710a34 Mon Sep 17 00:00:00 2001 From: Chris Foster <chris42f@gmail.com> Date: Wed, 23 Jun 2021 16:14:15 +1000 Subject: [PATCH 08/10] Add drivers section to Data.toml for autoloading (#20) This allows modules which provide data storage drivers to be automatically loaded when DataSets itself is loaded, providing a declarative data environment workflow while bypassing world age issues we'd get from loading these on demand. It seems somewhat unclear whether interfering with code loading like this is a good idea, but I think only time and some experience with this mechanism will tell... 
--- src/DataSets.jl | 66 ++++++++++++++++++- src/file_data_projects.jl | 2 + test/DriverAutoloadData.toml | 21 ++++++ test/driver_autoload.jl | 9 +++ .../drivers/DummyStorageBackends/Project.toml | 5 ++ .../src/DummyStorageBackends.jl | 23 +++++++ test/runtests.jl | 1 + 7 files changed, 124 insertions(+), 3 deletions(-) create mode 100644 test/DriverAutoloadData.toml create mode 100644 test/driver_autoload.jl create mode 100644 test/drivers/DummyStorageBackends/Project.toml create mode 100644 test/drivers/DummyStorageBackends/src/DummyStorageBackends.jl diff --git a/src/DataSets.jl b/src/DataSets.jl index 3971caa..437872c 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -4,6 +4,7 @@ using UUIDs using TOML using SHA using ResourceContexts +using Base: PkgId export DataSet, dataset, @datafunc, @datarun export Blob, BlobTree, newfile, newdir @@ -215,6 +216,8 @@ identifier, `nothing` is returned. """ project_name(data_project::AbstractDataProject) = nothing +data_drivers(proj::AbstractDataProject) = [] + #------------------------------------------------------------------------------- """ DataProject @@ -224,11 +227,15 @@ Names are unique within the project. """ struct DataProject <: AbstractDataProject datasets::Dict{String,DataSet} + drivers::Vector{Dict{String,Any}} end -DataProject() = DataProject(Dict{String,DataSet}()) +DataProject() = DataProject(Dict{String,DataSet}(), Vector{Dict{String,Any}}()) + +DataProject(project::AbstractDataProject) = DataProject(Dict(pairs(project)), + Vector{Dict{String,Any}}()) -DataProject(project::AbstractDataProject) = DataProject(Dict(pairs(project))) +data_drivers(project::DataProject) = project.drivers function _fill_template(toml_path, toml_str) # Super hacky templating for paths relative to the toml file. @@ -276,6 +283,14 @@ function load_project(config::AbstractDict; kws...) 
dataset = DataSet(dataset_conf) link_dataset(proj, dataset.name => dataset) end + if haskey(config, "drivers") + _check_keys(config, DataProject, ["drivers"=>AbstractVector]) + for driver_conf in config["drivers"] + _check_keys(driver_conf, DataProject, ["type"=>String, "name"=>String, "module"=>Dict]) + _check_keys(driver_conf["module"], DataProject, ["name"=>String, "uuid"=>String]) + push!(proj.drivers, driver_conf) + end + end proj end @@ -363,6 +378,8 @@ end StackedDataProject() = StackedDataProject([]) +data_drivers(stack::StackedDataProject) = vcat(data_drivers.(stack.projects)...) + function Base.keys(stack::StackedDataProject) names = [] for project in stack.projects @@ -479,8 +496,23 @@ PROJECT = StackedDataProject() # deprecated. TODO: Remove dependency on this from JuliaHub _current_project = DataProject() +_isprecompiling() = ccall(:jl_generating_output, Cint, ()) == 1 + function __init__() - global PROJECT = create_project_stack(ENV) + # Triggering Base.require for storage drivers during precompilation should + # be unnecessary and can cause problems if those driver modules use + # Requires-like code loading. + if !_isprecompiling() + global PROJECT = create_project_stack(ENV) + for proj in PROJECT.projects + try + add_storage_driver(proj) + catch exc + @error "Could not load storage drivers from data project" #= + =# project=proj exception=(exc,catch_backtrace()) + end + end + end end dataset(name) = dataset(PROJECT, name) @@ -494,6 +526,7 @@ May be renamed in a future version. """ function load_project!(path_or_config) new_project = load_project(path_or_config, auto_update=true) + add_storage_driver(new_project) pushfirst!(PROJECT, new_project) # deprecated: _current_project reflects only the initial version of the # project on *top* of the stack. 
@@ -527,6 +560,33 @@ function add_storage_driver((name,opener)::Pair) end end +function add_storage_driver(project::AbstractDataProject) + for conf in data_drivers(project) + if conf["type"] != "storage" + # Anticipate there might be layer drivers too + continue + end + pkgid = PkgId(UUID(conf["module"]["uuid"]), conf["module"]["name"]) + if Base.haskey(Base.package_locks, pkgid) + # Hack: Avoid triggering another call to require() for packages + # which are already in the process of being loaded. (This would + # result in a deadlock!) + # + # Obviously this depends on Base internals... + continue + end + mod = Base.require(pkgid) + driver_name = conf["name"] + # Module itself does add_storage_driver() inside its __init__ + # TODO: Is this a good workflow? + lock(_storage_drivers_lock) do + get(_storage_drivers, driver_name) do + error("Package $pkgid did not provide storage driver $driver_name") + end + end + end +end + function _find_driver(dataset) storage_config = dataset.storage driver_name = get(storage_config, "driver") do diff --git a/src/file_data_projects.jl b/src/file_data_projects.jl index c5702b4..c484430 100644 --- a/src/file_data_projects.jl +++ b/src/file_data_projects.jl @@ -118,6 +118,8 @@ end Base.pairs(proj::AbstractTomlFileDataProject) = pairs(_get_cached(proj)) +data_drivers(proj::AbstractTomlFileDataProject) = data_drivers(_get_cached(proj)) + #------------------------------------------------------------------------------- """ Data project which automatically updates based on a TOML file on the local diff --git a/test/DriverAutoloadData.toml b/test/DriverAutoloadData.toml new file mode 100644 index 0000000..47cd9ca --- /dev/null +++ b/test/DriverAutoloadData.toml @@ -0,0 +1,21 @@ +data_config_version = 0 + +[[datasets]] +description="Test dynamic loading of drivers" +name="dummy_storage_blob" +uuid="785b3cdc-428e-426f-a3f7-3f6ae88a9637" + + [datasets.storage] + driver="DummyTomlStorage" + type="Blob" + data="data_from_dummy_backend" + 
+#------------------------------------------------------------------------------- + +[[drivers]] +type="storage" +name="DummyTomlStorage" + + [drivers.module] + name="DummyStorageBackends" + uuid="89b7a33a-382e-4698-a931-421b088d35a2" diff --git a/test/driver_autoload.jl b/test/driver_autoload.jl new file mode 100644 index 0000000..18442e2 --- /dev/null +++ b/test/driver_autoload.jl @@ -0,0 +1,9 @@ +@testset "Automatic code loading for drivers" begin + empty!(DataSets.PROJECT) + pushfirst!(LOAD_PATH, abspath("drivers")) + ENV["JULIA_DATASETS_PATH"] = joinpath(@__DIR__, "DriverAutoloadData.toml") + DataSets.__init__() + @test haskey(DataSets._storage_drivers, "DummyTomlStorage") + + @test open(String, dataset("dummy_storage_blob")) == "data_from_dummy_backend" +end diff --git a/test/drivers/DummyStorageBackends/Project.toml b/test/drivers/DummyStorageBackends/Project.toml new file mode 100644 index 0000000..664b302 --- /dev/null +++ b/test/drivers/DummyStorageBackends/Project.toml @@ -0,0 +1,5 @@ +name="DummyStorageBackends" +uuid="89b7a33a-382e-4698-a931-421b088d35a2" + +[deps] +DataSets = "c9661210-8a83-48f0-b833-72e62abce419" diff --git a/test/drivers/DummyStorageBackends/src/DummyStorageBackends.jl b/test/drivers/DummyStorageBackends/src/DummyStorageBackends.jl new file mode 100644 index 0000000..48a5782 --- /dev/null +++ b/test/drivers/DummyStorageBackends/src/DummyStorageBackends.jl @@ -0,0 +1,23 @@ +module DummyStorageBackends + +using DataSets + +struct DummyBackend + data +end + +function Base.open(f::Function, ::Type{IO}, storage::DummyBackend, path; kws...) 
where {T} + @assert isempty(path) + f(IOBuffer(storage.data)) +end + +function connect_dummy_backend(f, config, ds) + storage = DummyBackend(config["data"]) + f(Blob(storage)) +end + +function __init__() + DataSets.add_storage_driver("DummyTomlStorage"=>connect_dummy_backend) +end + +end diff --git a/test/runtests.jl b/test/runtests.jl index 1fc5aa6..79e6280 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -125,3 +125,4 @@ include("entrypoint.jl") include("repl.jl") include("DataTomlStorage.jl") include("backend_compat.jl") +include("driver_autoload.jl") From 0a95d2137b62075e1ac82321fbc514c03e90643c Mon Sep 17 00:00:00 2001 From: Chris Foster <chris42f@gmail.com> Date: Thu, 24 Jun 2021 18:57:17 +1000 Subject: [PATCH 09/10] Use `/` as separator for dataset namespaces (#23) This is the obvious hierarchical separator, and consistent with having URNs as an expanded form for dataset names (see RFC8141 https://datatracker.ietf.org/doc/html/rfc8141) The only real downside is that it uses some syntax which might be nice for a path component, but we could potentially use something like namespace/name:a/b/c. --- docs/src/Data.toml | 2 +- src/DataSets.jl | 84 ++++++++++++++++++++++++++++++++---- test/Data.toml | 12 +++++- test/DriverAutoloadData.toml | 2 +- test/projects.jl | 3 +- test/runtests.jl | 7 ++- 6 files changed, 96 insertions(+), 14 deletions(-) diff --git a/docs/src/Data.toml b/docs/src/Data.toml index c8a24d5..2602ba7 100644 --- a/docs/src/Data.toml +++ b/docs/src/Data.toml @@ -1,5 +1,5 @@ # Version of the data TOML format. -data_config_version=0 +data_config_version=1 [[datasets]] # Some alphanumeric name (can include spaces and underscores) diff --git a/src/DataSets.jl b/src/DataSets.jl index 437872c..6144388 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -73,11 +73,32 @@ function _check_keys(config, context, keys) end end +""" + check_dataset_name(name) + +Check whether a dataset name is valid. 
Valid names start with a letter
+and may contain letters, numbers or `_`. Names may be hierarchical, with pieces
+separated with forward slashes. Examples:
+
+    my_data
+    my_data_1
+    username/data
+    organization/project/data
+"""
 function check_dataset_name(name::AbstractString)
-    # Disallow punctuation in DataSet names for now, as it may be needed as
+    # DataSet names disallow most punctuation for now, as it may be needed as
     # delimiters in data-related syntax (eg, for the data REPL).
-    if !occursin(r"^[[:alpha:]][[:alnum:]_]*$", name)
-        error("DataSet name must start with a letter, and can only contain letters, numbers or underscores; got \"$name\"")
+    dataset_name_pattern = r"
+        ^
+        [[:alpha:]]
+        (?:
+            [[:alnum:]_] |
+            / (?=[[:alpha:]])
+        )*
+        $
+        "x
+    if !occursin(dataset_name_pattern, name)
+        error("DataSet name \"$name\" is invalid. DataSet names must start with a letter and can contain only letters, numbers, `_` or `/`.")
     end
 end
 
@@ -164,7 +185,16 @@ function dataset(proj::AbstractDataProject, name::AbstractString)
     # In the future, we can consider parsing `name` into a dataset prefix and a
     # data selector / resource section. Eg a path for BlobTree which gives us a
     # SubDataSet
-    proj[name]
+    #
+    # The URN RFC8141 has some good design inspiration here, in particular the
+    # distinction between r-component and q-component seems relevant:
+    # * Some parameters may need to be passed to the "resolver" (ie, the data
+    #   storage backend)
+    # * Some parameters may need to be passed to the dataset itself (eg, a
+    #   relative path within the dataset)
+    #
+    # See https://datatracker.ietf.org/doc/html/rfc8141#page-12
+    return proj[name]
 end
 
 function Base.haskey(proj::AbstractDataProject, name::AbstractString)
@@ -247,10 +277,45 @@ function _fill_template(toml_path, toml_str)
 end
 
 """
-Current version of the data configuration format, as reflected in the
-Data.toml data_config_version key.
+`CURRENT_DATA_CONFIG_VERSION` is the current version of the data configuration +format, as reflected in the Data.toml `data_config_version` key. This allows old +versions of DataSets.jl to detect when the Data.toml schema has changed. + +New versions of DataSets.jl should always try to parse old versions of +Data.toml where possible. + +### Version 0 (DataSets <= 0.2.3): + +Required structure: + +```toml +data_config_version = 0 + +[[datasets]] +name = "alphnumeric and underscore chars" +uuid = "a uuid" + +[datasets.storage] + driver = "StorageDriver" +``` + +### Version 1 (DataSets 0.2.4): + +Same as version 0 with additions +* Allows the `/` character in dataset names to serve as a namespace separator. +* Adds a new `[[drivers]]` section with the format + +```toml +[[drivers]] +type="storage" +name="<driver name>" + + [drivers.module] + name="<module name>" + uuid="<module uuid>" +``` """ -const CURRENT_DATA_CONFIG_VERSION = 0 +const CURRENT_DATA_CONFIG_VERSION = 1 """ load_project(path; auto_update=false) @@ -276,7 +341,10 @@ function load_project(config::AbstractDict; kws...) "datasets"=>AbstractVector]) format_ver = config["data_config_version"] if format_ver > CURRENT_DATA_CONFIG_VERSION - error("data_config_version=$format_ver is newer than supported") + error(""" + data_config_version=$format_ver is newer than supported. + Consider upgrading to a newer version of DataSets.jl + """) end proj = DataProject() for dataset_conf in config["datasets"] diff --git a/test/Data.toml b/test/Data.toml index 819cd6b..3437868 100644 --- a/test/Data.toml +++ b/test/Data.toml @@ -1,5 +1,5 @@ # This specifies the version of the Data.toml configuration -data_config_version=0 +data_config_version=1 # The following is an array of the actual `DataSet`s. 
@@ -23,6 +23,16 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" # type="text" # parameters={encoding="UTF-8"} +[[datasets]] +description="A text file with namespace" +name="some_namespace/a_text_file" +uuid="b498f769-a7f6-4f67-8d74-40b770398f26" + + [datasets.storage] + driver="FileSystem" + type="Blob" + path="@__DIR__/data/file.txt" + #-------------------------------------------------- [[datasets]] description="Gzipped CSV example" diff --git a/test/DriverAutoloadData.toml b/test/DriverAutoloadData.toml index 47cd9ca..24bf164 100644 --- a/test/DriverAutoloadData.toml +++ b/test/DriverAutoloadData.toml @@ -1,4 +1,4 @@ -data_config_version = 0 +data_config_version = 1 [[datasets]] description="Test dynamic loading of drivers" diff --git a/test/projects.jl b/test/projects.jl index 850960c..2bb1f2d 100644 --- a/test/projects.jl +++ b/test/projects.jl @@ -14,7 +14,8 @@ test_project_names = ["a_table", "embedded_blob", "embedded_tree", "old_backend_blob", - "old_backend_tree"] + "old_backend_tree", + "some_namespace/a_text_file"] @testset "TomlFileDataProject" begin proj = TomlFileDataProject(abspath("Data.toml")) diff --git a/test/runtests.jl b/test/runtests.jl index 79e6280..6767695 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -90,12 +90,15 @@ end @test DataSets.check_dataset_name("a_b") === nothing @test DataSets.check_dataset_name("a1") === nothing @test DataSets.check_dataset_name("δεδομένα") === nothing + @test DataSets.check_dataset_name("a/b") === nothing + @test DataSets.check_dataset_name("a/b/c") === nothing # Invalid names - @test_throws ErrorException("DataSet name must start with a letter, and can only contain letters, numbers or underscores; got \"a/b\"") DataSets.check_dataset_name("a/b") + @test_throws ErrorException("DataSet name \"a?b\" is invalid. 
DataSet names must start with a letter and can contain only letters, numbers, `_` or `/`.") DataSets.check_dataset_name("a?b") @test_throws ErrorException DataSets.check_dataset_name("1") @test_throws ErrorException DataSets.check_dataset_name("a b") @test_throws ErrorException DataSets.check_dataset_name("a.b") - @test_throws ErrorException DataSets.check_dataset_name("a:b") + @test_throws ErrorException DataSets.check_dataset_name("a/b/") + @test_throws ErrorException DataSets.check_dataset_name("/a/b") end #------------------------------------------------------------------------------- From dddf952230c9aebb134bfe0b1f1cb7bc41c6eb6b Mon Sep 17 00:00:00 2001 From: Chris Foster <chris42f@gmail.com> Date: Thu, 24 Jun 2021 19:00:38 +1000 Subject: [PATCH 10/10] Bump version 0.2.4 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 692f315..84afdc0 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "DataSets" uuid = "c9661210-8a83-48f0-b833-72e62abce419" authors = ["Chris Foster <chris42f@gmail.com> and contributors"] -version = "0.2.3" +version = "0.2.4" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"