diff --git a/.travis.yml b/.travis.yml index 728648f..142b791 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,6 @@ notifications: jobs: allow_failures: - - julia: 1.4 - julia: nightly include: @@ -20,15 +19,15 @@ jobs: - stage: "Unit testing" julia: 1.3 + + - stage: + julia: 1.4 after_success: # Code coverage - julia -e 'using Pkg; Pkg.add("Coverage");' - julia -e 'using Coverage; Coveralls.submit(Coveralls.process_folder());' - julia -e 'using Coverage; Codecov.submit(Codecov.process_folder());' - - stage: - julia: 1.4 - - stage: julia: nightly @@ -37,7 +36,7 @@ jobs: ######################################################## - stage: "Documentation" - julia: 1.3 + julia: 1.4 install: - sudo apt-get update - sudo apt-get install -y python3.7 python3-pip python3-setuptools diff --git a/Manifest.toml b/Manifest.toml index 4aab482..c587f6b 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -14,29 +14,29 @@ version = "0.4.0" [[Arpack_jll]] deps = ["Libdl", "OpenBLAS_jll", "Pkg"] -git-tree-sha1 = "68a90a692ddc0eb72d69a6993ca26e2a923bf195" +git-tree-sha1 = "e214a9b9bd1b4e1b4f15b22c0994862b66af7ff7" uuid = "68821587-b530-5797-8361-c406ea357684" -version = "3.5.0+2" +version = "3.5.0+3" [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -[[BinaryProvider]] -deps = ["Libdl", "SHA"] -git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c" -uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" -version = "0.5.8" +[[CompilerSupportLibraries_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "7c4f882c41faa72118841185afc58a2eb00ef612" +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "0.3.3+0" [[DataAPI]] -git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" +git-tree-sha1 = "00612b2fbe534a539dc7f70106c71e3a943d9b98" uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" -version = "1.1.0" +version = "1.2.0" [[DataStructures]] deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "b7720de347734f4716d1815b00ce5664ed6bbfd4" +git-tree-sha1 = 
"9faa13be79557bf4c5713fb912b0e3c5aa33d046" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.9" +version = "0.17.13" [[Dates]] deps = ["Printf"] @@ -48,9 +48,9 @@ uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" [[Distributions]] deps = ["FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns"] -git-tree-sha1 = "6b19601c0e98de3a8964ed33ad73e130c7165b1d" +git-tree-sha1 = "c4ed10355637fcb0725dc6a27060f74df24f13cd" uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" -version = "0.22.4" +version = "0.23.2" [[DocStringExtensions]] deps = ["LibGit2", "Markdown", "Pkg", "Test"] @@ -60,9 +60,9 @@ version = "0.8.1" [[Documenter]] deps = ["Base64", "Dates", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] -git-tree-sha1 = "d497bcc45bb98a1fbe19445a774cfafeabc6c6df" +git-tree-sha1 = "bc99c157ff2957c058a1067061d16c2c83d1ec42" uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -version = "0.24.5" +version = "0.24.9" [[FFTW]] deps = ["AbstractFFTs", "FFTW_jll", "IntelOpenMP_jll", "Libdl", "LinearAlgebra", "MKL_jll", "Reexport"] @@ -72,15 +72,15 @@ version = "1.2.0" [[FFTW_jll]] deps = ["Libdl", "Pkg"] -git-tree-sha1 = "05674f209a6e3387dd103a945b0113eeb64b1a58" +git-tree-sha1 = "6c975cd606128d45d1df432fb812d6eb10fee00b" uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a" -version = "3.3.9+3" +version = "3.3.9+5" [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "fec413d4fc547992eb62a5c544cedb6d7853c1f5" +git-tree-sha1 = "51cc2f9bc4eb9c6c0e81ec2f779d1085583cc956" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.8.4" +version = "0.8.7" [[IntelOpenMP_jll]] deps = ["Libdl", "Pkg"] @@ -99,6 +99,7 @@ uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" version = "0.21.0" [[LibGit2]] +deps = ["Printf"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" [[Libdl]] @@ -131,16 +132,16 @@ version = "0.4.3" uuid = 
"a63ad114-7e13-5084-954f-fe012c677804" [[OpenBLAS_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "e2551d7c25d52f35b76d86a50917a3ba8988f519" +deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"] +git-tree-sha1 = "2ee3e636e94b9fd95fa8364d5cba2e20dae16609" uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" -version = "0.3.7+5" +version = "0.3.9+2" [[OpenSpecFun_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90" +deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"] +git-tree-sha1 = "d51c416559217d974a1113522d5919235ae67a87" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.3+1" +version = "0.5.3+3" [[OrderedCollections]] deps = ["Random", "Serialization", "Test"] @@ -150,18 +151,18 @@ version = "1.1.0" [[PDMats]] deps = ["Arpack", "LinearAlgebra", "SparseArrays", "SuiteSparse", "Test"] -git-tree-sha1 = "5f303510529486bb02ac4d70da8295da38302194" +git-tree-sha1 = "2fc6f50ddd959e462f0a2dbc802ddf2a539c6e35" uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" -version = "0.9.11" +version = "0.9.12" [[Parsers]] deps = ["Dates", "Test"] -git-tree-sha1 = "d112c19ccca00924d5d3a38b11ae2b4b268dda39" +git-tree-sha1 = "f8f5d2d4b4b07342e5811d2b6428e45524e241df" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "0.3.11" +version = "1.0.2" [[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[Printf]] @@ -189,10 +190,16 @@ uuid = "189a3867-3050-52da-a836-e630ba90ab69" version = "0.2.0" [[Rmath]] -deps = ["BinaryProvider", "Libdl", "Random", "Statistics"] -git-tree-sha1 = "2bbddcb984a1d08612d0c4abb5b4774883f6fa98" +deps = ["Random", "Rmath_jll"] +git-tree-sha1 = "86c5647b565873641538d8f812c04e4c9dbeb370" uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" -version = "0.6.0" +version = "0.6.1" + +[[Rmath_jll]] +deps = 
["Libdl", "Pkg"] +git-tree-sha1 = "1660f8fefbf5ab9c67560513131d4e933012fc4b" +uuid = "f50d1b31-88e8-58de-be2c-1cc44531875f" +version = "0.2.2+0" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" @@ -215,9 +222,9 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [[SpecialFunctions]] deps = ["OpenSpecFun_jll"] -git-tree-sha1 = "268052ee908b2c086cc0011f528694f02f3e2408" +git-tree-sha1 = "e19b98acb182567bcb7b75bb5d9eedf3a3b5ec6c" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "0.9.0" +version = "0.10.0" [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] @@ -225,15 +232,15 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [[StatsBase]] deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] -git-tree-sha1 = "c53e809e63fe5cf5de13632090bc3520649c9950" +git-tree-sha1 = "a6102b1f364befdb05746f386b67c6b7e3262c45" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.32.0" +version = "0.33.0" [[StatsFuns]] deps = ["Rmath", "SpecialFunctions"] -git-tree-sha1 = "79982835d2ff3970685cb704500909c94189bde9" +git-tree-sha1 = "f290ddd5fdedeadd10e961eb3f4d3340f09d030a" uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c" -version = "0.9.3" +version = "0.9.4" [[SuiteSparse]] deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"] diff --git a/Project.toml b/Project.toml index b08d5d9..a1cd0f4 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "LSHFunctions" uuid = "5134c85a-a9db-11e9-340f-8514dff59a31" authors = ["Will Shand "] -version = "0.1.0" +version = "0.1.1" [deps] Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" @@ -11,11 +11,12 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" QuadGK = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Test = 
"8dfed614-e22c-5e08-85e1-65c5234f0b40" [compat] -Distributions = "0.22" +Distributions = "0.22, 0.23" Documenter = "0.24" FFTW = "1.2" QuadGK = "2.3" diff --git a/docs/Manifest.toml b/docs/Manifest.toml index 2ec3b6d..07ba923 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -11,42 +11,32 @@ version = "0.5.8" [[ColorTypes]] deps = ["FixedPointNumbers", "Random"] -git-tree-sha1 = "b9de8dc6106e09c79f3f776c27c62360d30e5eb8" +git-tree-sha1 = "c4c1cca28748906265ed62c788d6fe6f0134d264" uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" -version = "0.9.1" +version = "0.10.0" [[Colors]] -deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"] -git-tree-sha1 = "177d8b959d3c103a6d57574c38ee79c81059c31b" +deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Reexport"] +git-tree-sha1 = "2fdeb981ebcf52cd800ddb6a0aa5eac34153552d" uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" -version = "0.11.2" - -[[Compat]] -deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "ed2c4abadf84c53d9e58510b5fc48912c2336fbb" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "2.2.0" +version = "0.12.0" [[Conda]] deps = ["JSON", "VersionParsing"] -git-tree-sha1 = "9a11d428dcdc425072af4aea19ab1e8c3e01c032" +git-tree-sha1 = "7a58bb32ce5d85f8bf7559aa7c2842f9aecf52fc" uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d" -version = "1.3.0" +version = "1.4.1" [[DataStructures]] deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "b7720de347734f4716d1815b00ce5664ed6bbfd4" +git-tree-sha1 = "5a431d46abf2ef2a4d5d00bd0ae61f651cf854c8" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.9" +version = "0.17.10" [[Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" -[[DelimitedFiles]] -deps = 
["Mmap"] -uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" - [[Distributed]] deps = ["Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" @@ -59,23 +49,23 @@ version = "0.8.1" [[Documenter]] deps = ["Base64", "Dates", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] -git-tree-sha1 = "885467cebde4639a3d81953652cc53ff5a73cb87" +git-tree-sha1 = "3bacd94d853a6bccaee1d0104d8b06d29a7506ac" uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -version = "0.24.3" +version = "0.24.6" [[DocumenterTools]] deps = ["Base64", "DocStringExtensions", "Documenter", "FileWatching", "LibGit2", "Sass"] -git-tree-sha1 = "e3b2a338e7e1803713ae42e556643bf9ba004395" +git-tree-sha1 = "aa1a30dfa07a778b5ce8a448436725502e2913d0" uuid = "35a29f4d-8980-5a13-9543-d66fff28ecb8" -version = "0.1.4" +version = "0.1.5" [[FileWatching]] uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" [[FixedPointNumbers]] -git-tree-sha1 = "4aaea64dd0c30ad79037084f8ca2b94348e65eaa" +git-tree-sha1 = "3ba9ea634d4c8b289d590403b4a06f8e227a6238" uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" -version = "0.7.1" +version = "0.8.0" [[InteractiveUtils]] deps = ["Markdown"] @@ -88,10 +78,9 @@ uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" version = "0.21.0" [[LaTeXStrings]] -deps = ["Compat"] -git-tree-sha1 = "7ab9b8788cfab2bdde22adf9004bda7ad9954b6c" +git-tree-sha1 = "de44b395389b84fd681394d4e8d39ef14e3a2ea8" uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" -version = "1.0.3" +version = "1.1.0" [[LibGit2]] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" @@ -108,9 +97,9 @@ uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" [[MacroTools]] deps = ["DataStructures", "Markdown", "Random"] -git-tree-sha1 = "e2fc7a55bb2224e203bbd8b59f72b91323233458" +git-tree-sha1 = "07ee65e03e28ca88bc9a338a3726ae0c3efaa94b" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.3" +version = "0.5.4" [[Markdown]] deps = ["Base64"] @@ -127,9 +116,9 @@ version = "1.1.0" 
[[Parsers]] deps = ["Dates", "Test"] -git-tree-sha1 = "0139ba59ce9bc680e2925aec5b7db79065d60556" +git-tree-sha1 = "0c16b3179190d3046c073440d94172cfc3bb0553" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "0.3.10" +version = "0.3.12" [[Pkg]] deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] @@ -140,10 +129,10 @@ deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" [[PyCall]] -deps = ["Conda", "Dates", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Serialization", "Statistics", "Test", "VersionParsing"] -git-tree-sha1 = "6e5bac1b1faf3575731a6a5b76f638f2389561d3" +deps = ["Conda", "Dates", "Libdl", "LinearAlgebra", "MacroTools", "Serialization", "VersionParsing"] +git-tree-sha1 = "3a3fdb9000d35958c9ba2323ca7c4958901f115d" uuid = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" -version = "1.91.2" +version = "1.91.4" [[PyPlot]] deps = ["Colors", "LaTeXStrings", "PyCall", "Sockets", "Test", "VersionParsing"] @@ -177,21 +166,9 @@ version = "0.1.0" [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" -[[SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" - [[Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" -[[SparseArrays]] -deps = ["LinearAlgebra", "Random"] -uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" - -[[Statistics]] -deps = ["LinearAlgebra", "SparseArrays"] -uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" - [[Test]] deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/docs/src/full_api.md b/docs/src/full_api.md index d6a1695..eb5e8dc 100644 --- a/docs/src/full_api.md +++ b/docs/src/full_api.md @@ -43,6 +43,12 @@ ChebHash ## Miscellaneous +```@autodocs +Modules = [LSHFunctions] +Private = false +Pages = [joinpath("utils", "hash_compression.jl")] +``` + ```@docs @interval ``` diff --git a/docs/src/function_hashing.md 
b/docs/src/function_hashing.md index d259870..0a7be29 100644 --- a/docs/src/function_hashing.md +++ b/docs/src/function_hashing.md @@ -131,5 +131,5 @@ julia> length(hashfn.sample_points) 512 ``` -## Footnotes - +## References +- Shand, William and Becker, Stephen. *Locality-sensitive hashing in function spaces*. [arXiv:2002.03909](https://arxiv.org/abs/2002.03909). diff --git a/docs/src/lshfunction_api.md b/docs/src/lshfunction_api.md index 6256503..6acf733 100644 --- a/docs/src/lshfunction_api.md +++ b/docs/src/lshfunction_api.md @@ -156,6 +156,3 @@ LSHFunctions.jl provides a few common utility functions that you can use across collision_probability(hashfn, sim; n_hashes=1)^5 true ``` - -## References -- Shand, William and Becker, Stephen. *Locality-sensitive hashing in function spaces*. [arXiv:2002.03909](https://arxiv.org/abs/2002.03909). diff --git a/src/LSHBase.jl b/src/LSHBase.jl index 886a542..dbf312b 100644 --- a/src/LSHBase.jl +++ b/src/LSHBase.jl @@ -58,8 +58,11 @@ function LSHFunction end function lsh_family end @doc """ - collision_probability(hashfn::H, sim; - n_hashes::Union{Symbol,Integer}=:auto) where {H <: LSHFunction} + collision_probability( + hashfn::H, + sim; + n_hashes::Union{Symbol,Integer}=:auto + ) where {H <: LSHFunction} Compute the probability of hash collision between two inputs with similarity `sim` for an [`LSHFunction`](@ref) of type `H`. This function returns the probability that `n_hashes` hashes simultaneously collide. @@ -342,7 +345,7 @@ N_HASHES_DOCSTR(; default = DEFAULT_N_HASHES) = """ `n_hashes::Integer` (default: `$(default)`): the number of hash functions to generate.""" DTYPE_DOCSTR(hashfn; default = DEFAULT_DTYPE) = """ -`dtype::DataType` (default: `$(default)`): the data type to use in the $(hashfn) internals. For performance reasons you should pick `dtype` to match the type of the data you're hashing.""" +`dtype::Type` (default: `$(default)`): the data type to use in the $(hashfn) internals. 
For performance reasons you should pick `dtype` to match the type of the data you're hashing.""" RESIZE_POW2_DOCSTR(hashfn; default = DEFAULT_RESIZE_POW2) = """ `resize_pow2::Bool` (default: `$(default)`): affects the way in which the returned `$(hashfn)` resizes to hash inputs of different sizes. If you think you'll be hashing inputs of many different sizes, it's more efficient to set `resize_pow2 = true`.""" diff --git a/src/LSHFunctions.jl b/src/LSHFunctions.jl index 2ff4244..992baae 100644 --- a/src/LSHFunctions.jl +++ b/src/LSHFunctions.jl @@ -3,10 +3,11 @@ module LSHFunctions using Distributions, LinearAlgebra, SparseArrays #======================== -Common types/utilities used through the LSH module +Common types/utilities used throughout the module ========================# -include("utils.jl") +include(joinpath("utils", "hash_compression.jl")) +include(joinpath("utils", "vecops.jl")) include("LSHBase.jl") include("intervals.jl") include("similarities.jl") @@ -47,6 +48,6 @@ export SimHash, L1Hash, L2Hash, MIPSHash, SignALSH, MinHash, # Helper / utility functions for LSHFunctions export index_hash, query_hash, n_hashes, hashtype, similarity, lsh_family, - embedded_similarity, collision_probability, @interval + embedded_similarity, collision_probability, @interval, HashCompressor end # module diff --git a/src/hashes/lphash.jl b/src/hashes/lphash.jl index 45e35b2..ea7c307 100644 --- a/src/hashes/lphash.jl +++ b/src/hashes/lphash.jl @@ -81,7 +81,7 @@ L1Hash(args...; kws...) where {T} = LpHash(args...; power = 1, kws...) L2Hash(args...; kws...) where {T} = LpHash(args...; power = 2, kws...) -LpHash(args...; dtype::DataType = DEFAULT_DTYPE, kws...) = +LpHash(args...; dtype::Type = DEFAULT_DTYPE, kws...) = LpHash{dtype}(args...; kws...) 
### Documentation for L1Hash and L2Hash @@ -95,7 +95,7 @@ for (hashfn, power) in zip((:L1Hash, :L2Hash), (1, 2)) @doc """ $($hashfn)( n_hashes::Integer = $(DEFAULT_N_HASHES); - dtype::DataType = $(DEFAULT_DTYPE), + dtype::Type = $(DEFAULT_DTYPE), r::Real = 1.0, resize_pow2::Bool = $(DEFAULT_RESIZE_POW2) ) diff --git a/src/hashes/minhash.jl b/src/hashes/minhash.jl index 17083b9..c9ca326 100644 --- a/src/hashes/minhash.jl +++ b/src/hashes/minhash.jl @@ -25,7 +25,7 @@ end """ MinHash(n_hashes::Integer = $(DEFAULT_N_HASHES); - dtype::DataType = Any, + dtype::Type = Any, symbols::Union{Vector,Set} = Set()) Construct a locality-sensitive hash function for Jaccard similarity. @@ -34,7 +34,7 @@ Construct a locality-sensitive hash function for Jaccard similarity. - $(N_HASHES_DOCSTR()) # Keyword parameters -- `dtype::DataType` (default: `Any`): the type of symbols in the sets you're hashing. This is overriden by the data type contained in `symbols` when `symbols` is non-empty. +- `dtype::Type` (default: `Any`): the type of symbols in the sets you're hashing. This is overridden by the data type contained in `symbols` when `symbols` is non-empty. - `symbols::Union{Vector,Set}`: a `Vector` or `Set` containing all of the possible elements ("symbols") of the sets that you will be hashing. If left empty, `MinHash` will instead expand its dictionary when it sees new symbols (at small additional computational expense). 
# Examples @@ -80,7 +80,7 @@ julia> hashfn(Set(["a", "b", "c"])); See also: [`jaccard`](@ref) """ function MinHash(args...; - dtype::DataType = Any, + dtype::Type = Any, symbols::C = Set{Any}()) where {T, C <: Union{Vector{T},Set{T}}} if length(symbols) > 0 diff --git a/src/hashes/sign_alsh.jl b/src/hashes/sign_alsh.jl index 750bbf8..73b9242 100644 --- a/src/hashes/sign_alsh.jl +++ b/src/hashes/sign_alsh.jl @@ -29,7 +29,7 @@ end @doc """ SignALSH(n_hashes::Integer = $(DEFAULT_N_HASHES), - dtype::DataType = $(DEFAULT_DTYPE), + dtype::Type = $(DEFAULT_DTYPE), maxnorm::Union{Nothing,Real} = nothing, m::Integer = 3, resize_pow2::Bool = $(DEFAULT_RESIZE_POW2)) diff --git a/src/hashes/simhash.jl b/src/hashes/simhash.jl index 8d540b0..71e935e 100644 --- a/src/hashes/simhash.jl +++ b/src/hashes/simhash.jl @@ -37,7 +37,7 @@ SimHash(args...; dtype = DEFAULT_DTYPE, kws...) = @doc """ SimHash(n_hashes::Integer = $(DEFAULT_N_HASHES); - dtype::DataType = $(DEFAULT_DTYPE), + dtype::Type = $(DEFAULT_DTYPE), resize_pow2::Bool = $(DEFAULT_RESIZE_POW2)) Creates a locality-sensitive hash function for cosine similarity. diff --git a/src/similarities.jl b/src/similarities.jl index 0e91a1f..cdf002d 100644 --- a/src/similarities.jl +++ b/src/similarities.jl @@ -212,10 +212,7 @@ Computes the Jaccard similarity between sets ``A`` and ``B``, which is defined a ``\text{Jaccard}(A,B) = \frac{\left|A \cap B\right|}{\left|A \cup B\right|}`` # Arguments -- `A::Set`, `B::Set`: the two sets with which to compute Jaccard similarity. - -# Returns -`Float64`: the Jaccard similarity between sets `A` and `B`, which is between `0` and `1`. +- `A::Set`, `B::Set`: two sets whose Jaccard similarity we would like to compute. 
# Examples ```jldoctest; setup = :(using LSHFunctions) @@ -231,20 +228,154 @@ true See also: [`MinHash`](@ref) """ function jaccard(A::Set, B::Set) :: Float64 - # To avoid corner cases where A and B are both empty if isempty(A) + # Use the convention that if A = B = ∅, their Jaccard + # similarity is zero. Float64(0) else length(A ∩ B) / length(A ∪ B) end end +@doc raw""" + function jaccard(x::BitArray{1}, y::BitArray{1}) + +Computes the Jaccard similarity between a pair of binary vectors: + +``J(x, y) = \frac{\sum_{i} \min{(x_i,y_i)}}{\sum_{i} \max{(x_i,y_i)}}`` + +# Arguments +- `x::BitArray{1}`, `y::BitArray{1}`: two binary vectors, in the form of `BitArray`s. + +# Examples +```jldoctest; setup = :(using LSHFunctions) +julia> x = BitArray([true, false, true, true, false]); + +julia> y = BitArray([false, false, true, true, true]); + +julia> jaccard(x,y) +0.5 +``` +""" +function jaccard(x::BitArray{1}, y::BitArray{1}) :: Float64 + union = sum(x .| y) + if union == 0 + # Use the convention that if x and y are full of zeros, their Jaccard + # similarity is zero. + Float64(0) + else + intersection = sum(x .& y) + intersection / union + end +end + +@doc raw""" + function jaccard(x::AbstractVector{<:Real}, y::AbstractVector{<:Real}) + +Computes the Jaccard similarity between a pair of vectors of real numbers: + +``J(x, y) = \frac{\sum_{i} \min{(x_i,y_i)}}{\sum_{i} \max{(x_i,y_i)}}`` + +# Arguments +- `x::AbstractVector{<:Real}`, `y::AbstractVector{<:Real}`: a pair of vectors containing real numbers (subtypes of `Real`). 
+ +# Examples +```jldoctest; setup = :(using LSHFunctions) +julia> x = [0.8, 0.1, 0.3, 0.4, 0.1]; + +julia> y = [1.0, 0.6, 0.0, 0.4, 0.5]; + +julia> jaccard(x,y) +0.5 +``` +""" +function jaccard(x::AbstractVector{T}, + y::AbstractVector) :: Float64 where {T <: Real} + if length(x) != length(y) + DimensionMismatch("dimensions must match") |> throw + end + + intersection = T(0) + union = T(0) + + @inbounds @simd for ii = eachindex(x, y) # shared valid indices; 1:length(x) is unsafe under @inbounds for non-1-based vectors + if 0 ≤ x[ii] ≤ y[ii] + intersection += x[ii] + union += y[ii] + elseif 0 ≤ y[ii] < x[ii] + intersection += y[ii] + union += x[ii] + else + ErrorException("vectors must have non-negative elements") |> throw + end + end + + if union == T(0) + # Use the convention that if x and y are full of zeros, their Jaccard + # similarity is zero. + Float64(0) + else + Float64(intersection / union) + end +end + +jaccard(x::AbstractVector{<:Integer}, y::AbstractVector{<:AbstractFloat}) = + jaccard(y, x) + +@doc raw""" + function jaccard(A::Set{<:K}, + B::Set{<:K}, + weights::Dict{K,V}) where {K,V<:Real} + +Computes the weighted Jaccard similarity between two sets: + +``J(A, B) = \frac{\sum_{x\in A\cap B} w_x}{\sum_{y\in A\cup B} w_y}`` + +# Arguments +- `A::Set`, `B::Set`: two sets whose Jaccard similarity we would like to compute. +- `weights::Dict`: a dictionary mapping symbols in the sets `A` and `B` to numerical weights. These weights must be non-negative. 
+ +# Examples +```jldoctest; setup = :(using LSHFunctions) +julia> A = Set(["a", "b", "c"]); + +julia> B = Set(["b", "c", "d"]); + +julia> W = Dict("a" => 0.2, "b" => 2.4, "c" => 0.6, "d" => 1.8); + +julia> jaccard(A,B,W) +0.6 +``` +""" +function jaccard(A::Set{<:K}, + B::Set{<:K}, + weights::Dict{K,V}) :: Float64 where {K,V<:Real} + + union_weight = V(0) + + for el in A ∪ B + w = weights[el] + if w < 0 + ErrorException("weights must be non-negative") |> throw + end + union_weight += w + end + + intersection_weight = mapreduce(el -> weights[el], +, A ∩ B; init = V(0)) # `init` keeps an empty intersection (disjoint or empty sets) from throwing + + # By convention, if A = B = ∅, their Jaccard similarity is zero + if union_weight == V(0) + Float64(0) + else + Float64(intersection_weight / union_weight) + end +end + #==================== Inner product and norms ====================# ### Inner products -# TODO: docs @doc raw""" inner_prod(x::AbstractVector, y::AbstractVector) diff --git a/src/utils.jl b/src/utils.jl deleted file mode 100644 index 45fcd6b..0000000 --- a/src/utils.jl +++ /dev/null @@ -1,28 +0,0 @@ -#================================================================ - -Common helper functions shared between multiple routines in the LSH module. 
- -================================================================# - -# Compute the norms of vectors and columns of matrices -col_norms(x::Union{AbstractVector,AbstractMatrix}) = - map(norm, eachcol(x)) - -col_norms(x::Union{Vector,Matrix}) = - map(BLAS.nrm2, eachcol(x)) - -col_norms(x::SparseVector) = - [BLAS.nrm2(x.nzval)] - -col_norms(x::SparseMatrixCSC{T}) where {T} = begin - output = Vector{T}(undef, size(x,2)) - @inbounds for ii = 1:size(x,2) - result = T(0) - start_idx, end_idx = x.colptr[ii], x.colptr[ii+1]-1 - @simd for idx = start_idx:end_idx - result += x.nzval[idx].^2 - end - output[ii] = √result - end - return output -end diff --git a/src/utils/hash_compression.jl b/src/utils/hash_compression.jl new file mode 100644 index 0000000..f5328fb --- /dev/null +++ b/src/utils/hash_compression.jl @@ -0,0 +1,89 @@ +#================================================================ + +Utilities for compressing hashes into fixed-size hashes. + +================================================================# + +using SHA + +#======================== +HashCompressor definition and constructors +========================# + +@doc """ + struct HashCompressor + +A compressor for converting variable-width hashes generated by LSHFunctions +into fixed-width hashes. HashCompressor works by taking an array of hashes +generated by an LSHFunction, and using SHA-256 to convert it into a fixed-width +hash. +""" +struct HashCompressor + n_bytes :: Int64 + salt :: Vector{UInt8} +end + +@doc """ + function HashCompressor(; + n_bytes :: Integer = 32, + salt :: Vector{UInt8} = Vector{UInt8}(undef,0) + ) + +Construct a new `HashCompressor`. The created `HashCompressor` will compress hashes +into `n_bytes` bytes, and use the provided salt during hash compression. + +# Keyword arguments +- `n_bytes::Integer` (default: `32`): the number of bytes to compress hashes into. Must satisfy `0 <= n_bytes <= 32`. 
+- `salt::Union{Nothing,Vector{UInt8}}` (default: `Vector{UInt8}(undef,0)`): a salt to prepend to hashes before compression using SHA-256. Passing `nothing` is equivalent to an empty salt. + +# Examples +```jldoctest; setup = :(using LSHFunctions) +julia> compressor = HashCompressor(n_bytes=4); + +julia> compressor([1, 4, 2, 9, 5, 5]) +4-element Array{UInt8,1}: + 0xf3 + 0x91 + 0x55 + 0x2e +``` +""" +function HashCompressor( + ; + n_bytes :: Integer = 32, + salt :: Union{Nothing,Vector{UInt8}} = Vector{UInt8}(undef,0) +) + if !(0 <= n_bytes <= 32) + "n_bytes must satisfy 0 <= n_bytes <= 32" |> + ErrorException |> + throw + end + + HashCompressor(Int64(n_bytes), salt === nothing ? UInt8[] : salt) # the struct field is a Vector{UInt8}, so map `nothing` to an empty salt +end + +#======================== +Compression functions +========================# +(compressor::HashCompressor)(hashes::BitArray{1}) = + reinterpret(UInt8, hashes.chunks) |> compressor + +(compressor::HashCompressor)(hashes::AbstractVector{I}) where {I <: Integer} = + reinterpret(UInt8, hashes) |> compressor + +function (compressor::HashCompressor)(hashes::AbstractVector{UInt8}) + hashes = begin + if length(compressor.salt) == 0 + sha2_256(hashes) + else + sha2_256([compressor.salt; hashes]) + end + end + + if compressor.n_bytes < 32 + hashes[1:compressor.n_bytes] + else + hashes + end +end + diff --git a/src/utils/vecops.jl b/src/utils/vecops.jl new file mode 100644 index 0000000..ed10787 --- /dev/null +++ b/src/utils/vecops.jl @@ -0,0 +1,29 @@ +#================================================================ + +Common matrix and vector operations used in multiple locations +throughout the module. 
+ +================================================================# + +# Compute the norms of vectors and columns of matrices +col_norms(x::Union{AbstractVector,AbstractMatrix}) = + map(norm, eachcol(x)) + +col_norms(x::Union{Vector,Matrix}) = + map(BLAS.nrm2, eachcol(x)) + +col_norms(x::SparseVector) = + [BLAS.nrm2(x.nzval)] + +col_norms(x::SparseMatrixCSC{T}) where {T} = begin + output = Vector{T}(undef, size(x,2)) + @inbounds for ii = 1:size(x,2) + result = T(0) + start_idx, end_idx = x.colptr[ii], x.colptr[ii+1]-1 + @simd for idx = start_idx:end_idx + result += x.nzval[idx].^2 + end + output[ii] = √result + end + return output +end diff --git a/test/runtests.jl b/test/runtests.jl index 30da869..a198c16 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -28,3 +28,5 @@ include(joinpath("hashes", "test_lshfunction.jl")) include(joinpath("function_hashing", "test_monte_carlo.jl")) include(joinpath("function_hashing", "test_chebhash.jl")) + +include(joinpath("utils", "test_hash_compression.jl")) diff --git a/test/test_similarities.jl b/test/test_similarities.jl index 70d17c4..d23fe97 100644 --- a/test/test_similarities.jl +++ b/test/test_similarities.jl @@ -232,6 +232,61 @@ end # Convention used in this module @test jaccard(Set(), Set()) == 0 end + + @testset "Compute Jaccard similarity between binary vectors" begin + x = BitArray([true, false, true, true, false]) + y = BitArray([false, false, true, true, true]) + + @test jaccard(x, y) == jaccard(y, x) == 2 / 4 + + # When x and y are both full of zero bits, we define the + # Jaccard similarity between them to be zero. 
+ x = falses(5) + y = falses(5) + @test jaccard(x, y) == 0 + end + + @testset "Compute weighted Jaccard similarity between Real vectors" begin + x = [0.8, 0.1, 0.3, 0.4, 0.1] + y = [1.0, 0.6, 0.0, 0.4, 0.5] + + @test jaccard(x, y) == + jaccard(y, x) == + (0.8+0.1+0.0+0.4+0.1) / (1.0+0.6+0.3+0.4+0.5) + + # Test Jaccard similarity between vectors with different dtypes + x = mod.(rand(Int32, 20), 10) + y = mod.(rand(Int64, 20), 10) + @test jaccard(Float64.(x), Float64.(y)) ≈ jaccard(x, y) + @test isapprox(jaccard(Float64.(x), Float64.(y)), jaccard(Float32.(x), y), atol=1e-8) + @test isapprox(jaccard(Float64.(x), Float64.(y)), jaccard(x, Float32.(y)), atol=1e-8) + @test jaccard(Float64.(x), Float64.(y)) ≈ jaccard(Float32.(x), Float64.(y)) + + # Define the Jaccard similarity between pairs of Real vectors + # to be zero. + x = zeros(10) + y = zeros(10) + @test jaccard(x, y) == 0 + + # Throw an error when any of the elements are negative, or when the + # two vectors have different lengths. + @test_throws(DimensionMismatch, jaccard(rand(5), rand(6))) + @test_throws(ErrorException, jaccard(-ones(3), ones(3))) + end + + @testset "Compute weighted Jaccard similarity between Sets" begin + A = Set(["a", "b", "c"]) + B = Set(["b", "c", "d"]) + W = Dict("a" => 0.2, "b" => 2.4, "c" => 0.6, "d" => 1.8) + + @test jaccard(A, B, W) ≈ + jaccard(B, A, W) ≈ + (2.4 + 0.6) / (0.2 + 2.4 + 0.6 + 1.8) + + # We should throw an error when any of the weights are negative + W["a"] = -1.0 + @test_throws(ErrorException, jaccard(A, B, W)) + end end @testset "Inner product similarity tests" begin diff --git a/test/utils/test_hash_compression.jl b/test/utils/test_hash_compression.jl new file mode 100644 index 0000000..6ca6e62 --- /dev/null +++ b/test/utils/test_hash_compression.jl @@ -0,0 +1,35 @@ +using Test, LSHFunctions + +#================== +Tests +==================# +@testset "HashCompressor tests" begin + @testset "Can compress Vector{UInt8} hashes" begin + compressor = 
HashCompressor(n_bytes=6) + hashes = UInt8[0x01, 0x04, 0x02, 0x08, 0x06, 0x07, 0x08, 0x04] + + @test compressor(hashes) == UInt8[0xce, 0xd8, 0x24, 0x1c, 0xc0, 0x48] + end + + @testset "Can compress Vector{Integer} hashes" begin + compressor = HashCompressor(n_bytes=4) + hashes = [-1, 8, -6, 3, -5, -9, 9, 0] + + @test compressor(hashes) == UInt8[0xb2, 0x7f, 0x8e, 0xb4] + end + + @testset "Can compress BitArray{1} hashes" begin + compressor = HashCompressor(n_bytes=5) + hashes = BitArray([1, 1, 1, 0, 0, 1, 0, 0, 1, 0]) + + @test compressor(hashes) == UInt8[0xa2, 0x99, 0xd7, 0x9f, 0x67] + end + + @testset "Can salt hashes" begin + salt = UInt8[0xcb, 0xe7, 0x12] + compressor = HashCompressor(n_bytes=6, salt=salt) + hashes = [-1, 8, -6, 3, -5, -9, 9, 0] + + @test compressor(hashes) == UInt8[0x9f, 0x5c, 0xf4, 0x3a, 0x29, 0x22] + end +end