Skip to content

Commit 26e2fc1

Browse files
mortenpi and pfitzseb committed
feat: expand the dataset name regex to allow periods
Co-authored-by: Sebastian Pfitzner <pfitzseb@gmail.com>
1 parent 9bceb6f commit 26e2fc1

5 files changed

+92
-18
lines changed

src/DataSets.jl

+27-5
Original file line numberDiff line numberDiff line change
@@ -87,27 +87,49 @@ end
8787
Check whether a dataset name is valid.
8888
8989
Valid names must start with a letter or a number, the rest of the name can also contain `-`
90-
and `_` characters. The names can also be hieracicial, with segments separated by forward
91-
slashes (`/`). Each segment must also start with either a letter or a number. For example:
90+
and `_` characters. The names can also be hierarchical, with segments separated by forward
91+
slashes (`/`) or periods (`.`). Each segment must also start with either a letter or a number.
92+
93+
For example, the following dataset names are valid:
9294
9395
my_data
9496
my_data_1
9597
username/data
9698
organization_name/project-name/data
9799
123user/456dataset--name
100+
username/my_table.csv
101+
dataset/v0.1.2
102+
103+
whereas names like this are invalid:
104+
105+
__mydata__
106+
username/.git
107+
my...dataset
108+
109+
!!! note "Segment separators"
110+
111+
In dataset names, both `/` and `.` are considered segment separators from a syntax
112+
perspective. While DataSets.jl does not impose any specific interpretation on the
113+
dataset name, it is recommended to use `/` to separate segments from a semantic
114+
perspective, and to interpret each forward-slash-separated segment as a path element.
115+
Periods would conventionally be used to separate file extensions within a segment.
116+
117+
E.g. use `username/my-project-data/population.csv`, rather than
118+
`username.my-project-data.population.csv` or something like that.
98119
"""
99120
function check_dataset_name(name::AbstractString)
100121
if !occursin(DATASET_NAME_REGEX, name)
101-
error("DataSet name \"$name\" is invalid. DataSet names must start with a letter and can contain only letters, numbers, `-`, `_` or `/`.")
122+
error("DataSet name \"$name\" is invalid. DataSet names must start with a letter or a number, and can contain only letters, numbers, `-` and `_`, or `/` and `.` as segment separators.")
102123
end
103124
end
104125
# DataSet names disallow most punctuation for now, as it may be needed as
105126
# delimiters in data-related syntax (eg, for the data REPL).
106127
const DATASET_NAME_REGEX_STRING = raw"""
107128
[[:alnum:]]
108129
(?:
109-
[-[:alnum:]_] |
110-
/ (?=[[:alnum:]])
130+
[-[:alnum:]_] |
131+
\.(?=[[:alnum:]]) |
132+
\/ (?=[[:alnum:]])
111133
)*
112134
"""
113135
const DATASET_NAME_REGEX = Regex("^\n$(DATASET_NAME_REGEX_STRING)\n\$", "x")

test/dataset-names-invalid.txt

-9
This file was deleted.

test/runtests.jl

+32-4
Original file line numberDiff line numberDiff line change
@@ -104,20 +104,48 @@ function load_list(filename)
104104
end
105105
@testset "Data set name parsing" begin
106106
@testset "Valid names" begin
107-
valid_names = load_list("dataset-names-valid.txt")
108-
@test length(valid_names) == 16
107+
valid_names = load_list("testnames-valid.txt")
108+
@test !isempty(valid_names)
109109
@testset "Valid name: $name" for name in valid_names
110110
@test DataSets.check_dataset_name(name) === nothing
111111
@test DataSets._split_dataspec(name) == (name, nothing, nothing)
112+
# Also test that the name is still valid when it appears as part of
113+
# a path element.
114+
let path_name = "foo/$(name)"
115+
@test DataSets.check_dataset_name(path_name) === nothing
116+
@test DataSets._split_dataspec(path_name) == (path_name, nothing, nothing)
117+
end
118+
let path_name = "$(name)/foo"
119+
@test DataSets.check_dataset_name(path_name) === nothing
120+
@test DataSets._split_dataspec(path_name) == (path_name, nothing, nothing)
121+
end
122+
let path_name = "foo/$(name)/bar"
123+
@test DataSets.check_dataset_name(path_name) === nothing
124+
@test DataSets._split_dataspec(path_name) == (path_name, nothing, nothing)
125+
end
112126
end
113127
end
114128

115129
@testset "Invalid names" begin
116-
invalid_names = load_list("dataset-names-invalid.txt")
117-
@test length(invalid_names) == 9
130+
invalid_names = load_list("testnames-invalid.txt")
131+
@test !isempty(invalid_names)
118132
@testset "Invalid name: $name" for name in invalid_names
119133
@test_throws ErrorException DataSets.check_dataset_name(name)
120134
@test DataSets._split_dataspec(name) == (nothing, nothing, nothing)
135+
# Also test that the name is still invalid when it appears as part of
136+
# a path element.
137+
let path_name = "foo/$(name)"
138+
@test_throws ErrorException DataSets.check_dataset_name(path_name) === nothing
139+
@test DataSets._split_dataspec(path_name) == (nothing, nothing, nothing)
140+
end
141+
let path_name = "$(name)/foo"
142+
@test_throws ErrorException DataSets.check_dataset_name(path_name) === nothing
143+
@test DataSets._split_dataspec(path_name) == (nothing, nothing, nothing)
144+
end
145+
let path_name = "foo/$(name)/bar"
146+
@test_throws ErrorException DataSets.check_dataset_name(path_name) === nothing
147+
@test DataSets._split_dataspec(path_name) == (nothing, nothing, nothing)
148+
end
121149
end
122150
end
123151
end

test/testnames-invalid.txt

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
a b
2+
a/b/
3+
a//b
4+
/a/b
5+
a/-
6+
a/ _/b
7+
a/-a
8+
a/-1
9+
.a
10+
..a
11+
a.
12+
a..
13+
.a.
14+
a..b
15+
.abc
16+
abc.
17+
abc/.def
18+
abc/def.
19+
a./b
20+
a.-
21+
_._
22+
a._b
23+
a.-b
24+
./a
25+
b/../a

test/dataset-names-valid.txt test/testnames-valid.txt

+8
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,11 @@ a/1
1414
1-2-3
1515
x_-__
1616
a---
17+
a.b
18+
a.b
19+
abc.def
20+
abc/def.ghi
21+
abc-def.ghi_jkl
22+
a.b.c
23+
a_.c
24+
foo__-.csv

0 commit comments

Comments
 (0)