feat: allow dataset names to start with numbers (#70)

mortenpi · web-flow · commit 27047b25564f · 2024-02-19T22:34:21.000+13:00
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "DataSets"
 uuid = "c9661210-8a83-48f0-b833-72e62abce419"
 authors = ["Chris Foster <chris42f@gmail.com> and contributors"]
-version = "0.2.10"
+version = "0.2.11"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
diff --git a/docs/src/design.md b/docs/src/design.md
@@ -93,7 +93,7 @@ names to `DataSet`s. Perhaps it also maintains the serialized `DataSet`
 information as well for those datasets which are not registered. It might be
 stored in a Data.toml, in analogy to Project.toml.
 
-Maintaince of the data project should occur via a data REPL.
+Maintenance of the data project should occur via a data REPL.
 
 ## Data Registries
 
@@ -277,4 +277,3 @@ array of strings)
   is restricted to tabular data, but seems similar in spirit to DataSets.jl.
 * [FileTrees.jl](http://shashi.biz/FileTrees.jl) provides tools for
   representing and processing tree-structured data lazily and in parallel.
-
diff --git a/src/DataSets.jl b/src/DataSets.jl
@@ -84,14 +84,17 @@ end
 """
     check_dataset_name(name)
 
-Check whether a dataset name is valid. Valid names include start with a letter
-and may contain letters, numbers or `_`. Names may be hieracicial, with pieces
-separated with forward slashes. Examples:
+Check whether a dataset name is valid.
+
+Valid names must start with a letter or a number; the rest of the name can also contain `-`
+and `_` characters. The names can also be hierarchical, with segments separated by forward
+slashes (`/`). Each segment must also start with either a letter or a number. For example:
 
     my_data
     my_data_1
     username/data
-    organization-dataset_name/project/data
+    organization_name/project-name/data
+    123user/456dataset--name
 """
 function check_dataset_name(name::AbstractString)
     if !occursin(DATASET_NAME_REGEX, name)
@@ -101,10 +104,10 @@ end
 # DataSet names disallow most punctuation for now, as it may be needed as
 # delimiters in data-related syntax (eg, for the data REPL).
 const DATASET_NAME_REGEX_STRING = raw"""
-[[:alpha:]]
+[[:alnum:]]
 (?:
     [-[:alnum:]_]     |
-    / (?=[[:alpha:]])
+    / (?=[[:alnum:]])
 )*
 """
 const DATASET_NAME_REGEX = Regex("^\n$(DATASET_NAME_REGEX_STRING)\n\$", "x")
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -101,13 +101,15 @@ end
 @testset "Data set name parsing" begin
     @testset "Valid name: $name" for name in (
         "a_b", "a-b", "a1", "δεδομένα", "a/b", "a/b/c", "a-", "b_",
+        "1", "a/1", "123", "12ab/34cd", "1/2/3", "1-2-3", "x_-__", "a---",
     )
         @test DataSets.check_dataset_name(name) === nothing
         @test DataSets._split_dataspec(name) == (name, nothing, nothing)
     end
 
     @testset "Invalid name: $name" for name in (
-        "1", "a b", "a.b", "a/b/", "a//b", "/a/b", "a/-", "a/1", "a/ _/b"
+        "a b", "a.b", "a/b/", "a//b", "/a/b", "a/-", "a/ _/b",
+        "a/-a", "a/-1",
     )
         @test_throws ErrorException DataSets.check_dataset_name(name)
         @test DataSets._split_dataspec(name) == (nothing, nothing, nothing)