diff --git a/Project.toml b/Project.toml index 84afdc0..c96e94e 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "DataSets" uuid = "c9661210-8a83-48f0-b833-72e62abce419" authors = ["Chris Foster and contributors"] -version = "0.2.4" +version = "0.2.5" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" diff --git a/README.md b/README.md index 29069f7..e689360 100644 --- a/README.md +++ b/README.md @@ -3,23 +3,19 @@ [![Version](https://juliahub.com/docs/DataSets/version.svg)](https://juliahub.com/ui/Packages/DataSets/4adr3) [![docs latest](https://img.shields.io/badge/docs-latest-blue.svg)](https://juliahub.com/docs/DataSets) -DataSets.jl exists to help manage data and reduce the amount of data wrangling -code you need to write. It's annoying to write -* Command line wrappers which deal with paths to data storage -* Code to load and save from various *data storage systems* (eg, local - filesystem data; local git data, downloaders for remote data over various - protocols, etc) -* Code to load the same data model from various serializations (eg, text: - plain/compressed, property tree: toml/json/msgpack/bson/... - tabular: csv/csv.gz/parquet/sqlite/...) -* Code to deal with data lifecycle; versions, provenance, etc +DataSets helps make data wrangling code more reusable. -DataSets provides scaffolding to make this kind of code more reusable. We want -to make it easy to *relocate* an algorithm between different data environments -without code changes. For example from your laptop to the cloud, to another -user's machine, or to an HPC system. +* We want to make it easy to *relocate* an algorithm between different **data + storage environments** without code changes. For example from your laptop to + the cloud, to another user's machine, or to an HPC system. +* We want to reduce coupling between data and code, by storing **rich type + information** in metadata. 
Metadata bridges the gap between the ad hoc + implicit type system of data outside your program and the Julia data + structures within your program. -Read [**the latest documentation**](https://juliahub.com/docs/DataSets) more information. +Watch the [**DataSets.jl talk**](https://www.youtube.com/watch?v=PJkf0CO5APs) from +JuliaCon 2021, or read [**the latest documentation**](https://juliahub.com/docs/DataSets) +for more information. ### Development diff --git a/src/DataSets.jl b/src/DataSets.jl index 6144388..2bd51a3 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -9,6 +9,14 @@ using Base: PkgId export DataSet, dataset, @datafunc, @datarun export Blob, BlobTree, newfile, newdir +""" +The current DataSets version number +""" +const PACKAGE_VERSION = let + project = TOML.parsefile(joinpath(pkgdir(DataSets), "Project.toml")) + VersionNumber(project["version"]) +end + include("paths.jl") #------------------------------------------------------------------------------- diff --git a/src/repl.jl b/src/repl.jl index 94720bc..0e3d331 100644 --- a/src/repl.jl +++ b/src/repl.jl @@ -5,11 +5,13 @@ using Markdown _data_repl_help = md""" ## DataSets Data REPL +Press `>` to enter the data repl. Press TAB to complete commands. 
+ | Command | Alias | Action | |:---------- |:--------- | :---------- | | `help` | `?` | Show this message | | `list` | `ls` | List all datasets by name | -| `show $name` | | Show the content of dataset `$name` | +| `show $name` | | Preview the content of dataset `$name` | | `stack` | `st` | Manipulate the global data search stack | | `stack list` | `st ls` | List all projects in the global data search stack | | `stack push $path` | `st push` | Add data project `$path` to front of the search stack | @@ -36,9 +38,9 @@ function show_dataset(name) end # hex dump in xxd format -function hexdump(out_stream, buf; groups_per_line=8, group_size=2) +function hexdump(out_stream, buf; groups_per_line=8, group_size=2, max_lines=typemax(Int)) linesize = groups_per_line*group_size - for line = 1:div(length(buf), linesize, RoundUp) + for line = 1:min(max_lines, div(length(buf), linesize, RoundUp)) linebuf = buf[(line-1)*linesize+1 : min(line*linesize,end)] address = (line-1)*linesize print(out_stream, string(address, base=16, pad=4), ": ") @@ -72,16 +74,25 @@ function _show_dataset(out_stream::IO, blob::Blob) isvalid(c) || return false isprint(c) || c in ('\n', '\r', '\t') end + display_lines, _ = displaysize(out_stream) + max_lines = max(5, display_lines ÷ 2) if n_textlike / length(str) > 0.95 - # It's approximately UTF-8 encoded text data. - print(out_stream, str) + # It's approximately UTF-8 encoded text data - print as text + lines = split(str, '\n', keepempty=true) + nlines = min(lastindex(lines), max_lines) + print(out_stream, join(lines[1:nlines], '\n')) + println(out_stream) + if !eof(io) || nlines < length(lines) + println(out_stream, "⋮") + end else # It's something else, perhaps binary or another text # encoding. Do a hex dump instead. 
- hexdump(out_stream, buf) - end - if !eof(io) - println(out_stream, "…") + println(out_stream, "Binary data:") + hexdump(out_stream, buf; max_lines=max_lines) + if !eof(io) + println(out_stream, "⋮") + end end end end diff --git a/test/Data.toml b/test/Data.toml index 3437868..f1199c8 100644 --- a/test/Data.toml +++ b/test/Data.toml @@ -36,7 +36,7 @@ uuid="b498f769-a7f6-4f67-8d74-40b770398f26" #-------------------------------------------------- [[datasets]] description="Gzipped CSV example" -name="a_table" +name="gzipped_table" uuid="2d126588-5f76-4e53-8245-87dc91625bf4" [datasets.storage] diff --git a/test/projects.jl b/test/projects.jl index 2bb1f2d..6d34b54 100644 --- a/test/projects.jl +++ b/test/projects.jl @@ -8,11 +8,11 @@ using DataSets: StackedDataProject, project_name -test_project_names = ["a_table", - "a_text_file", +test_project_names = ["a_text_file", "a_tree_example", "embedded_blob", "embedded_tree", + "gzipped_table", "old_backend_blob", "old_backend_tree", "some_namespace/a_text_file"]