diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 01e6847..caadcc0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,53 +14,51 @@ concurrency: # Cancel intermediate builds: only pull request builds group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref != 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release-') || github.run_number }} cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} + jobs: test: - name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} + name: julia -t${{ matrix.threads}} - ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} runs-on: ${{ matrix.os }} + timeout-minutes: 30 strategy: fail-fast: false matrix: - version: - - 'nightly' - os: - - ubuntu-latest - - macOS-latest - - windows-latest - arch: - - x64 - - x86 + threads: + # - '1' + - '4,4' + version: [nightly] + os: [ubuntu-latest, windows-latest, macOS-latest] + arch: [x64, x86, aarch64] exclude: + - os: ubuntu-latest + arch: aarch64 + - os: windows-latest + arch: aarch64 + - os: macOS-latest + arch: x64 - os: macOS-latest arch: x86 steps: - - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - - uses: actions/cache@v1 - env: - cache-name: cache-artifacts - with: - path: ~/.julia/artifacts - key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }} - restore-keys: | - ${{ runner.os }}-test-${{ env.cache-name }}- - ${{ runner.os }}-test-${{ matrix.os }} - ${{ runner.os }}- - - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/cache@v2 - uses: julia-actions/julia-runtest@v1 env: JULIA_DISTRIBUTED_TESTING_STANDALONE: 1 + JULIA_NUM_THREADS: '${{ matrix.threads}}' - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v1 + - uses: codecov/codecov-action@v5 with: - file: lcov.info + files: lcov.info + token: ${{ secrets.CODECOV_TOKEN }} + docs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@latest with: # version: '1.6' diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..df02284 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +Manifest.toml +*.swp diff --git a/Project.toml b/Project.toml index bb30760..382b1ab 100644 --- a/Project.toml +++ b/Project.toml @@ -8,8 +8,18 @@ Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" Sockets = "6462fe0b-24de-5631-8697-dd941f90decc" [extras] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["LinearAlgebra", "Test"] +test = ["Aqua", "LinearAlgebra", "Test"] + +[compat] +Aqua = "0.8.10" +LinearAlgebra = "1" +Random = "1" +Serialization = "1" +Sockets = "1" +Test = "1" +julia = "1" diff --git a/README.md b/README.md index 76f6355..845347a 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,57 @@ -# Distributed - -The `Distributed` package provides functionality for creating and controlling multiple Julia processes remotely, and for performing distributed and parallel computing. 
It uses network sockets or other supported interfaces to communicate between Julia processes, and relies on Julia's `Serialization` stdlib package to transform Julia objects into a format that can be transferred between processes efficiently. It provides a full set of utilities to create and destroy new Julia processes and add them to a "cluster" (a collection of Julia processes connected together), as well as functions to perform Remote Procedure Calls (RPC) between the processes within a cluster. See [`API`](@ref) for details.
-
+# Distributed (with a multiscale parallelism extension)
+
+The `Distributed` package provides functionality for creating and controlling
+multiple Julia processes remotely, and for performing distributed and parallel
+computing. It uses network sockets or other supported interfaces to communicate
+between Julia processes, and relies on Julia's `Serialization` stdlib package to
+transform Julia objects into a format that can be transferred between processes
+efficiently. It provides a full set of utilities to create and destroy new Julia
+processes and add them to a "cluster" (a collection of Julia processes connected
+together), as well as functions to perform Remote Procedure Calls (RPC) between
+the processes within a cluster. See the `API` section for details.

This package ships as part of the Julia stdlib.

+> [!NOTE]
+> This repository is a fork of the original [`Distributed`](https://github.com/JuliaLang/Distributed.jl) package for exploring support for _multiscale parallelism_ in Julia. In broad terms, this extension allows worker processes to execute the `addprocs` operation, so that a worker process may also play the role of a master process with respect to a set of worker processes it creates by invoking `addprocs`. For that, all `Distributed` operations listed below are extended with a keyword parameter `role`, with three possible values: `:default` (default argument), `:master`, and `:worker`. So, a worker that created processes by means of `addprocs` may execute operations as:
+> * a ***worker process*** by using `role = :worker`, for interacting with the master process that created it, as well as with other workers; or
+> * a ***master process*** by using `role = :master`, for interacting with the workers it created.
+>
+> It is important to note that these modifications to the API do not affect existing `Distributed` programs.
+>
+> Multiscale parallelism may help programmers in at least two scenarios:
+> * to deploy _multicluster computations_, i.e. parallel computations employing multiple clusters, where the parallel programming patterns and tools at the multicluster and cluster levels are distinct;
+> * to better support _multilevel parallel programming_ patterns.
+>
+> We are working on the implementation of case studies. A short usage sketch of the `role` keyword is included below.
+
## Using development versions of this package

To use a newer version of this package, you need to build Julia from scratch. The build process is the same as any other build except that you need to change the commit used in `stdlib/Distributed.version`. It's also possible to load a development version of the package using [the trick used in the Section named "Using the development version of Pkg.jl" in the `Pkg.jl` repo](https://github.com/JuliaLang/Pkg.jl#using-the-development-version-of-pkgjl), but the capabilities are limited as all other packages will depend on the stdlib version of the package and will not work with the modified package.
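+
+As a brief illustration of the `role` keyword described in the note above, here
+is a minimal sketch. It assumes a plain local setup and is illustrative only;
+the exact behavior may evolve as this fork develops.
+
+```julia
+using Distributed
+
+addprocs(2)    # process 1 (the top-level master) creates workers 2 and 3
+
+# Ask worker 2 to create workers of its own, making it a master for them.
+remotecall_fetch(2) do
+    addprocs(2)                    # worker 2 now also acts as a master
+    (nworkers(role = :master),     # number of workers created by worker 2 itself
+     myid(role = :worker))         # worker 2's id in the cluster that created it
+end
+```
+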
+### On Julia 1.11+ +In Julia 1.11 Distributed was excised from the default system image and became +more of an independent package. As such, to use a different version it's enough +to just `dev` it explicitly: +```julia-repl +pkg> dev https://github.com/JuliaLang/Distributed.jl.git +``` +### On older Julia versions +To use a newer version of this package on older Julia versions, you need to build +Julia from scratch. The build process is the same as any other build except that +you need to change the commit used in `stdlib/Distributed.version`. +It's also possible to load a development version of the package using [the trick +used in the Section named "Using the development version of Pkg.jl" in the +`Pkg.jl` +repo](https://github.com/JuliaLang/Pkg.jl#using-the-development-version-of-pkgjl), +but the capabilities are limited as all other packages will depend on the stdlib +version of the package and will not work with the modified package. + ## API -The public API of `Distributed` consists of a variety of functions for various tasks; for creating and destroying processes within a cluster: +The public API of `Distributed` consists of a variety of functions for various +tasks; for creating and destroying processes within a cluster: - `addprocs` - create one or more Julia processes and connect them to the cluster - `rmprocs` - shutdown and remove one or more Julia processes from the cluster @@ -20,7 +59,9 @@ The public API of `Distributed` consists of a variety of functions for various t For controlling other processes via RPC: - `remotecall` - call a function on another process and return a `Future` referencing the result of that call -- `Future` - an object that references the result of a `remotecall` that hasn't yet completed - use `fetch` to return the call's result, or `wait` to just wait for the remote call to finish +- `Future` - an object that references the result of a `remotecall` that hasn't + yet completed - use `fetch` to return the call's result, or `wait` to just + wait for the remote call to finish. - `remotecall_fetch` - the same as `fetch(remotecall(...))` - `remotecall_wait` - the same as `wait(remotecall(...))` - `remote_do` - like `remotecall`, but does not provide a way to access the result of the call @@ -49,7 +90,15 @@ For controlling multiple processes at once: ### Process Identifiers -Julia processes connected with `Distributed` are all assigned a cluster-unique `Int` identifier, starting from `1`. The first Julia process within a cluster is given ID `1`, while other processes added via `addprocs` get incrementing IDs (`2`, `3`, etc.). Functions and macros which communicate from one process to another usually take one or more identifiers to determine which process they target - for example, `remotecall_fetch(myid, 2)` calls `myid()` on process 2. - -!!! note - Only process 1 (often called the "head", "primary", or "master") may add or remove processes, and manages the rest of the cluster. Other processes (called "workers" or "worker processes") may still call functions on each other and send and receive data, but `addprocs`/`rmprocs` on worker processes will fail with an error. +Julia processes connected with `Distributed` are all assigned a cluster-unique +`Int` identifier, starting from `1`. The first Julia process within a cluster is +given ID `1`, while other processes added via `addprocs` get incrementing IDs +(`2`, `3`, etc.). 
Functions and macros which communicate from one process to +another usually take one or more identifiers to determine which process they +target - for example, `remotecall_fetch(myid, 2)` calls `myid()` on process 2. + +**Note:** Only process 1 (often called the "head", "primary", or "master") may +add or remove processes, and manages the rest of the cluster. Other processes +(called "workers" or "worker processes") may still call functions on each other +and send and receive data, but `addprocs`/`rmprocs` on worker processes will +fail with an error. \ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md deleted file mode 100644 index 22d63ce..0000000 --- a/docs/src/index.md +++ /dev/null @@ -1,71 +0,0 @@ -# [Distributed Computing](@id man-distributed) - -```@docs -Distributed -Distributed.addprocs -Distributed.nprocs -Distributed.nworkers -Distributed.procs() -Distributed.procs(::Integer) -Distributed.workers -Distributed.rmprocs -Distributed.interrupt -Distributed.myid -Distributed.pmap -Distributed.RemoteException -Distributed.ProcessExitedException -Distributed.Future -Distributed.RemoteChannel -Distributed.fetch(::Distributed.Future) -Distributed.fetch(::RemoteChannel) -Distributed.remotecall(::Any, ::Integer, ::Any...) -Distributed.remotecall_wait(::Any, ::Integer, ::Any...) -Distributed.remotecall_fetch(::Any, ::Integer, ::Any...) -Distributed.remote_do(::Any, ::Integer, ::Any...) -Distributed.put!(::RemoteChannel, ::Any...) -Distributed.put!(::Distributed.Future, ::Any) -Distributed.take!(::RemoteChannel, ::Any...) -Distributed.isready(::RemoteChannel, ::Any...) -Distributed.isready(::Distributed.Future) -Distributed.AbstractWorkerPool -Distributed.WorkerPool -Distributed.CachingPool -Distributed.default_worker_pool -Distributed.clear! -Distributed.remote -Distributed.remotecall(::Any, ::AbstractWorkerPool, ::Any...) -Distributed.remotecall_wait(::Any, ::AbstractWorkerPool, ::Any...) -Distributed.remotecall_fetch(::Any, ::AbstractWorkerPool, ::Any...) -Distributed.remote_do(::Any, ::AbstractWorkerPool, ::Any...) -Distributed.@spawn -Distributed.@spawnat -Distributed.@fetch -Distributed.@fetchfrom -Distributed.@distributed -Distributed.@everywhere -Distributed.remoteref_id -Distributed.channel_from_id -Distributed.worker_id_from_socket -Distributed.cluster_cookie() -Distributed.cluster_cookie(::Any) -``` - -## Cluster Manager Interface - -This interface provides a mechanism to launch and manage Julia workers on different cluster environments. -There are two types of managers present in Base: `LocalManager`, for launching additional workers on the -same host, and `SSHManager`, for launching on remote hosts via `ssh`. TCP/IP sockets are used to connect -and transport messages between processes. It is possible for Cluster Managers to provide a different transport. 
- -```@docs -Distributed.ClusterManager -Distributed.WorkerConfig -Distributed.launch -Distributed.manage -Distributed.kill(::ClusterManager, ::Int, ::WorkerConfig) -Distributed.connect(::ClusterManager, ::Int, ::WorkerConfig) -Distributed.init_worker -Distributed.start_worker -Distributed.process_messages -Distributed.default_addprocs_params -``` diff --git a/src/Distributed.jl b/src/Distributed.jl index a7c5b17..4a44266 100644 --- a/src/Distributed.jl +++ b/src/Distributed.jl @@ -15,7 +15,7 @@ using Base: Process, Semaphore, JLOptions, buffer_writes, @async_unwrap, julia_cmd, AsyncGenerator, acquire, release, invokelatest, shell_escape_posixly, shell_escape_csh, shell_escape_wincmd, escape_microsoft_c_args, - uv_error, something, notnothing, isbuffered, mapany + uv_error, something, notnothing, isbuffered, mapany, SizeUnknown using Base.Threads: Event using Serialization, Sockets @@ -49,6 +49,7 @@ export procs, remote, remotecall, + remotecall_eval, remotecall_fetch, remotecall_wait, remote_do, @@ -72,14 +73,15 @@ export check_same_host function _require_callback(mod::Base.PkgId) - if Base.toplevel_load[] && myid() == 1 && nprocs() > 1 + if Base.toplevel_load[] && nprocs(role=:master) > 1 # broadcast top-level (e.g. from Main) import/using from node 1 (only) - @sync for p in procs() + @sync for p in procs(role = :master) + #@info "require callback", p p == 1 && continue # Extensions are already loaded on workers by their triggers being loaded # so no need to fire the callback upon extension being loaded on master. Base.loading_extension && continue - @async_unwrap remotecall_wait(p) do + @async_unwrap remotecall_wait(p; role = :master) do Base.require(mod) nothing end @@ -94,7 +96,7 @@ struct RRID whence::Int id::Int - RRID() = RRID(myid(), next_ref_id()) + RRID(;role= :default) = RRID(myid(role=role), next_ref_id()) RRID(whence, id) = new(whence, id) end diff --git a/src/cluster.jl b/src/cluster.jl index 2444695..f291b66 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -99,10 +99,10 @@ mutable struct Worker del_msgs::Array{Any,1} # XXX: Could del_msgs and add_msgs be Channels? 
add_msgs::Array{Any,1} @atomic gcflag::Bool - state::WorkerState - c_state::Condition # wait for state changes - ct_time::Float64 # creation time - conn_func::Any # used to setup connections lazily + @atomic state::WorkerState + c_state::Threads.Condition # wait for state changes, lock for state + ct_time::Float64 # creation time + conn_func::Any # used to setup connections lazily r_stream::IO w_stream::IO @@ -115,8 +115,8 @@ mutable struct Worker function Worker(id::Int, r_stream::IO, w_stream::IO, manager::ClusterManager; version::Union{VersionNumber, Nothing}=nothing, - config::WorkerConfig=WorkerConfig()) - w = Worker(id) + config::WorkerConfig=WorkerConfig(), role= :default) + w = Worker(id; role = role) w.r_stream = r_stream w.w_stream = buffer_writes(w_stream) w.w_serializer = ClusterSerializer(w.w_stream) @@ -128,56 +128,63 @@ mutable struct Worker w end - Worker(id::Int) = Worker(id, nothing) - function Worker(id::Int, conn_func) + Worker(id::Int; role= :default) = Worker(id, nothing; role = role) + function Worker(id::Int, conn_func; role= :default) @assert id > 0 + map_pid_wrkr = Map_pid_wrkr(role = role) if haskey(map_pid_wrkr, id) return map_pid_wrkr[id] end - w=new(id, Threads.ReentrantLock(), [], [], false, W_CREATED, Condition(), time(), conn_func) + w=new(id, Threads.ReentrantLock(), [], [], false, W_CREATED, Threads.Condition(), time(), conn_func) w.initialized = Event() - register_worker(w) + register_worker(w; role = role) w end - Worker() = Worker(get_next_pid()) + Worker(;role= :default) = Worker(get_next_pid(); role = role) end +wid(w::Worker; role= :default) = w.id + function set_worker_state(w, state) - w.state = state - notify(w.c_state; all=true) + lock(w.c_state) do + @atomic w.state = state + notify(w.c_state; all=true) + end end -function check_worker_state(w::Worker) - if w.state === W_CREATED - if !isclusterlazy() - if PGRP.topology === :all_to_all +function check_worker_state(w::Worker; role= :default) + if (@atomic w.state) === W_CREATED + if !isclusterlazy(role = role) + pg = PGRP(role = role) + if pg.topology === :all_to_all # Since higher pids connect with lower pids, the remote worker # may not have connected to us yet. Wait for some time. - wait_for_conn(w) + wait_for_conn(w; role=role) else - error("peer $(w.id) is not connected to $(myid()). Topology : " * string(PGRP.topology)) + error("peer $(wid(w, role=role)) is not connected to $(myid(role=role)). Topology : " * string(pg.topology)) end else w.ct_time = time() - if myid() > w.id - t = @async exec_conn_func(w) + if myid(role=role) > wid(w, role=role) + t = @async exec_conn_func(w; role=role) else # route request via node 1 - t = @async remotecall_fetch((p,to_id) -> remotecall_fetch(exec_conn_func, p, to_id), 1, w.id, myid()) + t = @async remotecall_fetch((p,to_id) -> remotecall_fetch((to_id, role2) -> exec_conn_func(to_id; role = role2), p, to_id, p == 1 ? :master : :worker; role = :master), 1, wid(w, role=role), myid(role=role); role=role) end errormonitor(t) - wait_for_conn(w) + wait_for_conn(w; role=role) end end + return nothing end -exec_conn_func(id::Int) = exec_conn_func(worker_from_id(id)::Worker) -function exec_conn_func(w::Worker) +exec_conn_func(id::Int; role= :default) = exec_conn_func(worker_from_id(id; role = role)::Worker; role = role) +function exec_conn_func(w::Worker; role= :default) try f = notnothing(w.conn_func) # Will be called if some other task tries to connect at the same time. 
- w.conn_func = () -> wait_for_conn(w) + w.conn_func = () -> wait_for_conn(w; role=role) f() catch e w.conn_func = () -> throw(e) @@ -186,14 +193,16 @@ function exec_conn_func(w::Worker) nothing end -function wait_for_conn(w) - if w.state === W_CREATED +function wait_for_conn(w; role=:defaut) + if (@atomic w.state) === W_CREATED timeout = worker_timeout() - (time() - w.ct_time) - timeout <= 0 && error("peer $(w.id) has not connected to $(myid())") + timeout <= 0 && error("peer $(wid(w, role=role)) has not connected to $(myid(role=role))") - @async (sleep(timeout); notify(w.c_state; all=true)) - wait(w.c_state) - w.state === W_CREATED && error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds") + if timedwait(() -> (@atomic w.state) === W_CONNECTED, timeout) === :timed_out + # Notify any waiters on the state and throw + @lock w.c_state notify(w.c_state) + error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds") + end end nothing end @@ -201,11 +210,29 @@ end ## process group creation ## mutable struct LocalProcess - id::Int + id0::Int + id1::Int bind_addr::String + bind_addr_2::String bind_port::UInt16 cookie::String - LocalProcess() = new(1) + LocalProcess() = new(1,1) +end + +function wid(lp::LocalProcess; role= :default) + if role == :master + return lp.id1 + elseif role == :worker + return lp.id0 + elseif role == :default && myrole() == :master + return lp.id1 # as :master + elseif role == :default && myrole() == :worker + return lp.id0 # as :worker + else + return lp.id1 # as :master + #throw("unexpected use of role=:default (wid)") + end + end worker_timeout() = parse(Float64, get(ENV, "JULIA_WORKER_TIMEOUT", "60.0")) @@ -230,6 +257,7 @@ It does not return. """ start_worker(cookie::AbstractString=readline(stdin); kwargs...) = start_worker(stdout, cookie; kwargs...) function start_worker(out::IO, cookie::AbstractString=readline(stdin); close_stdin::Bool=true, stderr_to_stdout::Bool=true) + init_multi() if close_stdin # workers will not use it @@ -249,12 +277,9 @@ function start_worker(out::IO, cookie::AbstractString=readline(stdin); close_std end errormonitor(@async while isopen(sock) client = accept(sock) - process_messages(client, client, true) + process_messages(client, client, true; role = :worker) end) - print(out, "julia_worker:") # print header - print(out, "$(string(LPROC.bind_port))#") # print port - print(out, LPROC.bind_addr) - print(out, '\n') + println(out, "julia_worker:$(string(LPROC.bind_port))#$(LPROC.bind_addr_2)\n") # print header flush(out) Sockets.nagle(sock, false) @@ -270,7 +295,7 @@ function start_worker(out::IO, cookie::AbstractString=readline(stdin); close_std check_master_connect() while true; wait(); end catch err - print(stderr, "unhandled exception on $(myid()): $(err)\nexiting.\n") + print(stderr, "unhandled exception on $(myid(role = :worker)): $(err)\nexiting.\n") end close(sock) @@ -330,6 +355,8 @@ function read_worker_host_port(io::IO) throw(LaunchWorkerError("Unable to read host:port string from worker. Launch command exited with error?")) end + #@info "conninfo: $conninfo" + ntries -= 1 bind_addr, port = parse_connection_info(conninfo) if !isempty(bind_addr) @@ -379,12 +406,12 @@ function init_worker(cookie::AbstractString, manager::ClusterManager=DefaultClus # Since our pid has yet to be set, ensure no RemoteChannel / Future have been created or addprocs() called. 
@assert nprocs() <= 1 - @assert isempty(PGRP.refs) + @assert isempty(PGRP(role = :worker).refs) @assert isempty(client_refs) # System is started in head node mode, cleanup related entries - empty!(PGRP.workers) - empty!(map_pid_wrkr) + empty!(PGRP(role = :worker).workers) + empty!(Map_pid_wrkr(role = :worker)) cluster_cookie(cookie) nothing @@ -443,10 +470,16 @@ end function addprocs(manager::ClusterManager; kwargs...) init_multi() - cluster_mgmt_from_master_check() +# cluster_mgmt_from_master_check() lock(worker_lock) try + + if myrole() == :worker + myrole!(:master_worker) + end + PGRP(role=:master).level = PGRP(role=:worker).level + 1 + addprocs_locked(manager::ClusterManager; kwargs...) finally unlock(worker_lock) @@ -455,16 +488,18 @@ end function addprocs_locked(manager::ClusterManager; kwargs...) params = merge(default_addprocs_params(manager), Dict{Symbol,Any}(kwargs)) - topology(Symbol(params[:topology])) + topology(Symbol(params[:topology]); role = :master) - if PGRP.topology !== :all_to_all + pgm = PGRP(role = :master) + + if pgm.topology !== :all_to_all params[:lazy] = false end - if PGRP.lazy === nothing || nprocs() == 1 - PGRP.lazy = params[:lazy] - elseif isclusterlazy() != params[:lazy] - throw(ArgumentError(string("Active workers with lazy=", isclusterlazy(), + if pgm.lazy === nothing || nprocs() == 1 + pgm.lazy = params[:lazy] + elseif isclusterlazy(role = :master) != params[:lazy] + throw(ArgumentError(string("Active workers with lazy=", isclusterlazy(role = :master), ". Cannot set lazy=", params[:lazy]))) end @@ -485,19 +520,24 @@ function addprocs_locked(manager::ClusterManager; kwargs...) # call manager's `launch` is a separate task. This allows the master # process initiate the connection setup process as and when workers come # online + # NOTE: Must be `@async`. See FIXME above t_launch = @async launch(manager, params, launched, launch_ntfy) @sync begin while true if isempty(launched) istaskdone(t_launch) && break - @async (sleep(1); notify(launch_ntfy)) + @async begin # NOTE: Must be `@async`. See FIXME above + sleep(1) + notify(launch_ntfy) + end wait(launch_ntfy) end if !isempty(launched) wconfig = popfirst!(launched) let wconfig=wconfig + # NOTE: Must be `@async`. See FIXME above @async setup_launched_worker(manager, wconfig, launched_q) end end @@ -509,17 +549,17 @@ function addprocs_locked(manager::ClusterManager; kwargs...) # Since all worker-to-worker setups may not have completed by the time this # function returns to the caller, send the complete list to all workers. # Useful for nprocs(), nworkers(), etc to return valid values on the workers. - all_w = workers() + all_w = workers(role = :master) for pid in all_w - remote_do(set_valid_processes, pid, all_w) + remote_do((all_w, role) -> set_valid_processes(all_w, role = role), pid, all_w, pid == 1 ? 
:master : :worker; role = :master) end sort!(launched_q) end -function set_valid_processes(plist::Array{Int}) +function set_valid_processes(plist::Array{Int}; role= :default) for pid in setdiff(plist, workers()) - myid() != pid && Worker(pid) + myid(role=role) != pid && Worker(pid; role = role) end end @@ -566,7 +606,7 @@ function launch_n_additional_processes(manager, frompid, fromconfig, cnt, launch exeflags = something(fromconfig.exeflags, ``) cmd = `$exename $exeflags` - new_addresses = remotecall_fetch(launch_additional, frompid, cnt, cmd) + new_addresses = remotecall_fetch(launch_additional, frompid, cnt, cmd; role = :master) for address in new_addresses (bind_addr, port) = address @@ -580,7 +620,7 @@ function launch_n_additional_processes(manager, frompid, fromconfig, cnt, launch let wconfig=wconfig @async begin pid = create_worker(manager, wconfig) - remote_do(redirect_output_from_additional_worker, frompid, pid, port) + remote_do(redirect_output_from_additional_worker, frompid, pid, port; role = :master) push!(launched_q, pid) end end @@ -589,40 +629,42 @@ function launch_n_additional_processes(manager, frompid, fromconfig, cnt, launch end function create_worker(manager, wconfig) + role = :master + # only node 1 can add new nodes, since nobody else has the full list of address:port - @assert LPROC.id == 1 + @assert myid(role=role) == 1 timeout = worker_timeout() # initiate a connect. Does not wait for connection completion in case of TCP. - w = Worker() + w = Worker(role = role) local r_s, w_s try - (r_s, w_s) = connect(manager, w.id, wconfig) + (r_s, w_s) = connect(manager, wid(w, role=role), wconfig) catch ex try - deregister_worker(w.id) - kill(manager, w.id, wconfig) + deregister_worker(wid(w, role=role), role = role) + kill(manager, wid(w, role=role), wconfig) finally rethrow(ex) end end - w = Worker(w.id, r_s, w_s, manager; config=wconfig) + w = Worker(wid(w, role=role), r_s, w_s, manager; config=wconfig, role = role) # install a finalizer to perform cleanup if necessary finalizer(w) do w - if myid() == 1 - manage(w.manager, w.id, w.config, :finalize) + if myid(role=role) == 1 + manage(w.manager, wid(w, role=role), w.config, :finalize) end end # set when the new worker has finished connections with all other workers - ntfy_oid = RRID() - rr_ntfy_join = lookup_ref(ntfy_oid) - rr_ntfy_join.waitingfor = myid() + ntfy_oid = RRID(role = role) + rr_ntfy_join = lookup_ref(ntfy_oid; role = role) + rr_ntfy_join.waitingfor = myid(role=role) # Start a new task to handle inbound messages from connected worker in master. # Also calls `wait_connected` on TCP streams. - process_messages(w.r_stream, w.w_stream, false) + process_messages(w.r_stream, w.w_stream, false; role = :master) # send address information of all workers to the new worker. # Cluster managers set the address of each worker in `WorkerConfig.connect_at`. 
@@ -639,23 +681,29 @@ function create_worker(manager, wconfig) # - On master, receiving a JoinCompleteMsg triggers rr_ntfy_join (signifies that worker setup is complete) join_list = [] - if PGRP.topology === :all_to_all + pgm = PGRP(role = role) + if pgm.topology === :all_to_all # need to wait for lower worker pids to have completed connecting, since the numerical value # of pids is relevant to the connection process, i.e., higher pids connect to lower pids and they # require the value of config.connect_at which is set only upon connection completion - for jw in PGRP.workers - if (jw.id != 1) && (jw.id < w.id) - (jw.state === W_CREATED) && wait(jw.c_state) + for jw in pgm.workers + if (wid(jw, role=role) != 1) && (wid(jw, role=role) < wid(w, role=role)) + # wait for wl to join + if (@atomic jw.state) === W_CREATED + lock(jw.c_state) do + wait(jw.c_state) + end + end push!(join_list, jw) end end - elseif PGRP.topology === :custom + elseif pgm.topology === :custom # wait for requested workers to be up before connecting to them. - filterfunc(x) = (x.id != 1) && isdefined(x, :config) && + filterfunc(x) = (wid(x, role=role) != 1) && isdefined(x, :config) && (notnothing(x.config.ident) in something(wconfig.connect_idents, [])) - wlist = filter(filterfunc, PGRP.workers) + wlist = filter(filterfunc, pgm.workers) waittime = 0 while wconfig.connect_idents !== nothing && length(wlist) < length(wconfig.connect_idents) @@ -664,37 +712,40 @@ function create_worker(manager, wconfig) end sleep(1.0) waittime += 1 - wlist = filter(filterfunc, PGRP.workers) + wlist = filter(filterfunc, pgm.workers) end for wl in wlist - (wl.state === W_CREATED) && wait(wl.c_state) + lock(wl.c_state) do + if (@atomic wl.state) === W_CREATED + # wait for wl to join + wait(wl.c_state) + end + end push!(join_list, wl) end end all_locs = mapany(x -> isa(x, Worker) ? 
- (something(x.config.connect_at, ()), x.id) : - ((), x.id, true), + (something(x.config.connect_at, ()), wid(x, role=role)) : + ((), wid(x, role=role), true), join_list) send_connection_hdr(w, true) enable_threaded_blas = something(wconfig.enable_threaded_blas, false) - join_message = JoinPGRPMsg(w.id, all_locs, PGRP.topology, enable_threaded_blas, isclusterlazy()) - send_msg_now(w, MsgHeader(RRID(0,0), ntfy_oid), join_message) + join_message = JoinPGRPMsg(wid(w, role=role), all_locs, pgm.topology, enable_threaded_blas, isclusterlazy(role = role)) + send_msg_now(w, MsgHeader(RRID(0,0), ntfy_oid), join_message; role = role) - @async manage(w.manager, w.id, w.config, :register) + @async manage(w.manager, wid(w, role=role), w.config, :register) + # wait for rr_ntfy_join with timeout - timedout = false - @async (sleep($timeout); timedout = true; put!(rr_ntfy_join, 1)) - wait(rr_ntfy_join) - if timedout + if timedwait(() -> isready(rr_ntfy_join), timeout) === :timed_out error("worker did not connect within $timeout seconds") end lock(client_refs) do - delete!(PGRP.refs, ntfy_oid) + delete!(pgm.refs, ntfy_oid) end - return w.id + return wid(w, role=role) end @@ -729,23 +780,21 @@ function redirect_output_from_additional_worker(pid, port) end function check_master_connect() - timeout = worker_timeout() * 1e9 # If we do not have at least process 1 connect to us within timeout # we log an error and exit, unless we're running on valgrind if ccall(:jl_running_on_valgrind,Cint,()) != 0 return end - @async begin - start = time_ns() - while !haskey(map_pid_wrkr, 1) && (time_ns() - start) < timeout - sleep(1.0) - end - - if !haskey(map_pid_wrkr, 1) - print(stderr, "Master process (id 1) could not connect within $(timeout/1e9) seconds.\nexiting.\n") - exit(1) - end + errormonitor( + @async begin + map_pid_wrkr = Map_pid_wrkr(role = :worker) + timeout = worker_timeout() + if timedwait(() -> haskey(map_pid_wrkr, 1), timeout) === :timed_out + print(stderr, "Master process (id 1) could not connect within $(timeout) seconds.\nexiting.\n") + exit(1) + end end + ) end @@ -784,34 +833,60 @@ let next_pid = 2 # 1 is reserved for the client (always) end mutable struct ProcessGroup + level::Integer name::String workers::Array{Any,1} refs::Dict{RRID,Any} # global references topology::Symbol lazy::Union{Bool, Nothing} - ProcessGroup(w::Array{Any,1}) = new("pg-default", w, Dict(), :all_to_all, nothing) + ProcessGroup(w::Array{Any,1}) = new(0, "pg-default", w, Dict(), :all_to_all, nothing) +end + +const _PGRP0 = ProcessGroup([]) +const _PGRP1 = ProcessGroup([]) + +function PGRP(;role= :default) + if role == :master +# @info "$(role) / PGRP1 !" + return _PGRP1 + elseif role == :worker +# @info "$(role) / PGRP0 ! -- worker" + return _PGRP0 +# elseif role == :default && _PGRP0.level == 0 + elseif role == :default && myrole() == :master +# @info "$(role) / PGRP1 !" + return _PGRP1 # as :master +# elseif role == :default && _PGRP0.level > 0 + elseif role == :default && myrole() == :worker +# @info "$(role) / PGRP0 !" 
+ return _PGRP0 # as :worker + else + return _PGRP1 # as :master + # throw("unexpected use of role = $role (PGRP) - $(myrole())") + end end -const PGRP = ProcessGroup([]) -function topology(t) +function topology(t; role= :default) @assert t in [:all_to_all, :master_worker, :custom] - if (PGRP.topology==t) || ((myid()==1) && (nprocs()==1)) || (myid() > 1) - PGRP.topology = t + pg = PGRP(role = role) + if (pg.topology==t) || ((myid(role=role)==1) && (nprocs()==1)) || (myid(role=role) > 1) + pg.topology = t else - error("Workers with Topology $(PGRP.topology) already exist. Requested Topology $(t) cannot be set.") + error("Workers with Topology $(pg.topology) already exist. Requested Topology $(t) cannot be set.") end t end -isclusterlazy() = something(PGRP.lazy, false) +isclusterlazy(; role= :default) = something(PGRP(role = role).lazy, false) -get_bind_addr(pid::Integer) = get_bind_addr(worker_from_id(pid)) -get_bind_addr(w::LocalProcess) = LPROC.bind_addr -function get_bind_addr(w::Worker) +get_bind_addr(pid::Integer) = get_bind_addr(worker_from_id(pid; role = :master)) # always called as manager +get_bind_addr(w::LocalProcess) = LPROC.bind_addr # always called as manager +function get_bind_addr(w::Worker) + role = :worker # always called as worker if w.config.bind_addr === nothing - if w.id != myid() - w.config.bind_addr = remotecall_fetch(get_bind_addr, w.id, w.id) + if wid(w, role=role) != myid(role=role) + w.config.bind_addr = remotecall_fetch(get_bind_addr, wid(w, role=role), wid(w, role=role), role = role) end end w.config.bind_addr @@ -822,10 +897,33 @@ const LPROC = LocalProcess() const LPROCROLE = Ref{Symbol}(:master) const HDR_VERSION_LEN=16 const HDR_COOKIE_LEN=16 -const map_pid_wrkr = Dict{Int, Union{Worker, LocalProcess}}() +const _map_pid_wrkr_0 = Dict{Int, Union{Worker, LocalProcess}}() +const _map_pid_wrkr_1 = Dict{Int, Union{Worker, LocalProcess}}() const map_sock_wrkr = IdDict() const map_del_wrkr = Set{Int}() +function Map_pid_wrkr(;role= :default) + # @info ("_map_pid_wrkr_0", _map_pid_wrkr_0, "end") + # @info ("_map_pid_wrkr_1", _map_pid_wrkr_1, "end") + pg = PGRP(role = role) + if role == :master + # @info "Map_pid_wrkr_1 ", role + return _map_pid_wrkr_1 + elseif role == :worker + # @info "Map_pid_wrkr_0 ", role + return _map_pid_wrkr_0 + elseif role == :default && myrole() == :master + # @info "Map_pid_wrkr_1 ", role, pg.level + return _map_pid_wrkr_1 # as :master + elseif role == :default && myrole() == :worker + # @info "Map_pid_wrkr_0 ", role, pg.level + return _map_pid_wrkr_0 # as :worker + else + return _map_pid_wrkr_1 # as :master + # throw("unexpected use of role = :default (Map_pid_wrkr)") + end +end + # whether process is a master or worker in a distributed setup myrole() = LPROCROLE[] function myrole!(proctype::Symbol) @@ -847,7 +945,38 @@ julia> remotecall_fetch(() -> myid(), 4) 4 ``` """ -myid() = LPROC.id +function myid(;role= :default) + if role == :master + return LPROC.id1 + elseif role == :worker + return LPROC.id0 + elseif role == :default && myrole() == :master + return LPROC.id1 # as :master + elseif role == :default && myrole() == :worker + return LPROC.id0 # as :worker + else + return LPROC.id1 # as :master + #throw("unexpected use of role := default (myid) - $(myrole())") + end + +end + +function myid!(id;role= :default) + if role == :master + LPROC.id1 = id + elseif role == :worker + LPROC.id0 = id + elseif role == :default && myrole() == :master + LPROC.id1 = id # as :master + elseif role == :default && myrole() == :worker + LPROC.id0 = id # 
as :worker + else + LPROC.id1 = id # as :master + #throw("unexpected use of role := default (myid!)") + end + +end + """ nprocs() @@ -865,18 +994,19 @@ julia> workers() 3 ``` """ -function nprocs() - if myid() == 1 || (PGRP.topology === :all_to_all && !isclusterlazy()) - n = length(PGRP.workers) +function nprocs(; role= :default) + pg = PGRP(role = role) + if myid(role=role) == 1 || (pg.topology === :all_to_all && !isclusterlazy(role = role)) + n = length(pg.workers) # filter out workers in the process of being setup/shutdown. - for jw in PGRP.workers - if !isa(jw, LocalProcess) && (jw.state !== W_CONNECTED) + for jw in pg.workers + if !isa(jw, LocalProcess) && ((@atomic jw.state) !== W_CONNECTED) n = n - 1 end end return n else - return length(PGRP.workers) + return length(pg.workers) end end @@ -897,8 +1027,8 @@ julia> nworkers() 2 ``` """ -function nworkers() - n = nprocs() +function nworkers(;role= :default) + n = nprocs(role = role) n == 1 ? 1 : n-1 end @@ -918,25 +1048,27 @@ julia> procs() 3 ``` """ -function procs() - if myid() == 1 || (PGRP.topology === :all_to_all && !isclusterlazy()) +function procs(; role= :default) + pg = PGRP(role = role) + if myid(role=role) == 1 || (pg.topology === :all_to_all && !isclusterlazy(role = role)) # filter out workers in the process of being setup/shutdown. - return Int[x.id for x in PGRP.workers if isa(x, LocalProcess) || (x.state === W_CONNECTED)] + return Int[wid(x, role=role) for x in pg.workers if isa(x, LocalProcess) || ((@atomic x.state) === W_CONNECTED)] else - return Int[x.id for x in PGRP.workers] + return Int[wid(x, role=role) for x in pg.workers] end end -function id_in_procs(id) # faster version of `id in procs()` - if myid() == 1 || (PGRP.topology === :all_to_all && !isclusterlazy()) - for x in PGRP.workers - if (x.id::Int) == id && (isa(x, LocalProcess) || (x::Worker).state === W_CONNECTED) +function id_in_procs(id0; role= :default) # faster version of `id in procs()` + pg = PGRP(role = role) + if myid(role=role) == 1 || (pg.topology === :all_to_all && !isclusterlazy(role = role)) + for x in pg.workers + if (wid(x, role=role)::Int) == id0 && (isa(x, LocalProcess) || (@atomic (x::Worker).state) === W_CONNECTED) return true end end else - for x in PGRP.workers - if (x.id::Int) == id + for x in pg.workers + if (wid(x, role=role)::Int) == id0 return true end end @@ -950,17 +1082,18 @@ end Return a list of all process identifiers on the same physical node. Specifically all workers bound to the same ip-address as `pid` are returned. 
""" -function procs(pid::Integer) - if myid() == 1 - all_workers = [x for x in PGRP.workers if isa(x, LocalProcess) || (x.state === W_CONNECTED)] +function procs(pid::Integer; role= :default) + if myid(role = role) == 1 + map_pid_wrkr = Map_pid_wrkr(role = role) + all_workers = [x for x in PGRP(role = role).workers if isa(x, LocalProcess) || ((@atomic x.state) === W_CONNECTED)] if (pid == 1) || (isa(map_pid_wrkr[pid].manager, LocalManager)) - Int[x.id for x in filter(w -> (w.id==1) || (isa(w.manager, LocalManager)), all_workers)] + Int[wid(x, role=role) for x in filter(w -> (wid(w, role=role)==1) || (isa(w.manager, LocalManager)), all_workers)] else ipatpid = get_bind_addr(pid) - Int[x.id for x in filter(w -> get_bind_addr(w) == ipatpid, all_workers)] + Int[wid(x, role=role) for x in filter(w -> get_bind_addr(w) == ipatpid, all_workers)] end else - remotecall_fetch(procs, 1, pid) + remotecall_fetch(pid -> procs(pid, role = :master), 1; role = role) end end @@ -972,15 +1105,15 @@ Return a list of all worker process identifiers. # Examples ```julia-repl \$ julia -p 2 - +, pid julia> workers() 2-element Array{Int64,1}: 2 3 ``` """ -function workers() - allp = procs() +function workers(; role= :default) + allp = procs(role = role) if length(allp) == 1 allp else @@ -988,11 +1121,11 @@ function workers() end end -function cluster_mgmt_from_master_check() - if myid() != 1 - throw(ErrorException("Only process 1 can add and remove workers")) - end -end +#function cluster_mgmt_from_master_check() +# if myid() != 1 +# throw(ErrorException("Only process 1 can add and remove workers")) +# end +#end """ rmprocs(pids...; waitfor=typemax(Int)) @@ -1025,22 +1158,22 @@ julia> workers() 6 ``` """ -function rmprocs(pids...; waitfor=typemax(Int)) - cluster_mgmt_from_master_check() +function rmprocs(pids...; role = :default, waitfor=typemax(Int)) # supposed to be called always as :master +# cluster_mgmt_from_master_check() pids = vcat(pids...) if waitfor == 0 - t = @async _rmprocs(pids, typemax(Int)) + t = @async _rmprocs(pids, role, typemax(Int)) yield() return t else - _rmprocs(pids, waitfor) + _rmprocs(pids, role, waitfor) # return a dummy task object that user code can wait on. 
return @async nothing end end -function _rmprocs(pids, waitfor) +function _rmprocs(pids, role, waitfor) lock(worker_lock) try rmprocset = Union{LocalProcess, Worker}[] @@ -1048,6 +1181,7 @@ function _rmprocs(pids, waitfor) if p == 1 @warn "rmprocs: process 1 not removed" else + map_pid_wrkr = Map_pid_wrkr(role = role) if haskey(map_pid_wrkr, p) w = map_pid_wrkr[p] set_worker_state(w, W_TERMINATING) @@ -1059,11 +1193,11 @@ function _rmprocs(pids, waitfor) start = time_ns() while (time_ns() - start) < waitfor*1e9 - all(w -> w.state === W_TERMINATED, rmprocset) && break + all(w -> (@atomic w.state) === W_TERMINATED, rmprocset) && break sleep(min(0.1, waitfor - (time_ns() - start)/1e9)) end - unremoved = [wrkr.id for wrkr in filter(w -> w.state !== W_TERMINATED, rmprocset)] + unremoved = [wid(wrkr, role=role) for wrkr in filter(w -> (@atomic w.state) !== W_TERMINATED, rmprocset)] if length(unremoved) > 0 estr = string("rmprocs: pids ", unremoved, " not terminated after ", waitfor, " seconds.") throw(ErrorException(estr)) @@ -1087,17 +1221,18 @@ end # No-arg constructor added for compatibility with Julia 1.0 & 1.1, should be deprecated in the future ProcessExitedException() = ProcessExitedException(-1) -worker_from_id(i) = worker_from_id(PGRP, i) -function worker_from_id(pg::ProcessGroup, i) +worker_from_id(i; role= :default) = worker_from_id(PGRP(role = role), i; role = role) +function worker_from_id(pg::ProcessGroup, i; role= :default) if !isempty(map_del_wrkr) && in(i, map_del_wrkr) throw(ProcessExitedException(i)) end + map_pid_wrkr = Map_pid_wrkr(role = role) w = get(map_pid_wrkr, i, nothing) if w === nothing - if myid() == 1 - error("no process with id $i exists") + if myid(role=role) == 1 + error("no process with id $i exists ($role)") end - w = Worker(i) + w = Worker(i; role = role) map_pid_wrkr[i] = w else w = w::Union{Worker, LocalProcess} @@ -1113,25 +1248,26 @@ returns the `pid` of the worker it is connected to. This is useful when writing custom [`serialize`](@ref) methods for a type, which optimizes the data written out depending on the receiving process id. 
""" -function worker_id_from_socket(s) +function worker_id_from_socket(s; role= :default) w = get(map_sock_wrkr, s, nothing) if isa(w,Worker) if s === w.r_stream || s === w.w_stream - return w.id + return wid(w, role=role) end end if isa(s,IOStream) && fd(s)==-1 # serializing to a local buffer - return myid() + return myid(role=role) end return -1 end -register_worker(w) = register_worker(PGRP, w) -function register_worker(pg, w) +register_worker(w; role= :default) = register_worker(PGRP(role = role), w; role = role) +function register_worker(pg, w; role= :default) push!(pg.workers, w) - map_pid_wrkr[w.id] = w + map_pid_wrkr = Map_pid_wrkr(role = role) + map_pid_wrkr[wid(w, role=role)] = w end function register_worker_streams(w) @@ -1139,9 +1275,10 @@ function register_worker_streams(w) map_sock_wrkr[w.w_stream] = w end -deregister_worker(pid) = deregister_worker(PGRP, pid) -function deregister_worker(pg, pid) - pg.workers = filter(x -> !(x.id == pid), pg.workers) +deregister_worker(pid; role= :default) = deregister_worker(PGRP(role = role), pid, role=role) +function deregister_worker(pg, pid; role= :default) + pg.workers = filter(x -> !(wid(x, role=role) == pid), pg.workers) + map_pid_wrkr = Map_pid_wrkr(role = role) w = pop!(map_pid_wrkr, pid, nothing) if isa(w, Worker) if isdefined(w, :r_stream) @@ -1151,13 +1288,13 @@ function deregister_worker(pg, pid) end end - if myid() == 1 && (myrole() === :master) && isdefined(w, :config) + if myid(role=role) == 1 && #=role === :master &&=# isdefined(w, :config) # Notify the cluster manager of this workers death - manage(w.manager, w.id, w.config, :deregister) - if PGRP.topology !== :all_to_all || isclusterlazy() - for rpid in workers() + manage(w.manager, wid(w, role=role), w.config, :deregister) + if pg.topology !== :all_to_all || isclusterlazy(role = role) + for rpid in workers(role=role) try - remote_do(deregister_worker, rpid, pid) + remote_do((pid,role) -> deregister_worker(pid, role=role), rpid, pid, rpid == 1 ? :master : :worker; role = role) catch end end @@ -1192,11 +1329,12 @@ function deregister_worker(pg, pid) end -function interrupt(pid::Integer) - @assert myid() == 1 +function interrupt(pid::Integer) + @assert myid(role = :master) == 1 + map_pid_wrkr = Map_pid_wrkr(role = :master) w = map_pid_wrkr[pid] if isa(w, Worker) - manage(w.manager, w.id, w.config, :interrupt) + manage(w.manager, wid(w, role=:master), w.config, :interrupt) end return end @@ -1215,8 +1353,8 @@ interrupt(pids::Integer...) = interrupt([pids...]) Interrupt the current executing task on the specified workers. This is equivalent to pressing Ctrl-C on the local machine. If no arguments are given, all workers are interrupted. """ -function interrupt(pids::AbstractVector=workers()) - @assert myid() == 1 +function interrupt(pids::AbstractVector=workers(role = :master)) + @assert myid(role = :master) == 1 @sync begin for pid in pids @async interrupt(pid) @@ -1227,13 +1365,14 @@ end wp_bind_addr(p::LocalProcess) = p.bind_addr wp_bind_addr(p) = p.config.bind_addr -function check_same_host(pids) - if myid() != 1 - return remotecall_fetch(check_same_host, 1, pids) +function check_same_host(pids; role= :default) + if myid(role = role) != 1 + return remotecall_fetch(pids -> check_same_host(pids, role = :master), 1, pids; role = role) else # We checkfirst if all test pids have been started using the local manager, # else we check for the same bind_to addr. 
This handles the special case # where the local ip address may change - as during a system sleep/awake + map_pid_wrkr = Map_pid_wrkr(role = role) if all(p -> (p==1) || (isa(map_pid_wrkr[p].manager, LocalManager)), pids) return true else @@ -1243,18 +1382,18 @@ function check_same_host(pids) end end -function terminate_all_workers() - myid() != 1 && return +function terminate_all_workers(;role= :default) + myid(role = role) != 1 && return - if nprocs() > 1 + if nprocs(role = role) > 1 try - rmprocs(workers(); waitfor=5.0) + rmprocs(workers(role = role); role = role, waitfor=5.0) catch _ex @warn "Forcibly interrupting busy workers" exception=_ex # Might be computation bound, interrupt them and try again - interrupt(workers()) + interrupt(workers(role = role)) try - rmprocs(workers(); waitfor=5.0) + rmprocs(workers(role = role); role = role, waitfor=5.0) catch _ex2 @error "Unable to terminate all workers" exception=_ex2,catch_backtrace() end @@ -1263,6 +1402,73 @@ function terminate_all_workers() end # initialize the local proc network address / port +#=function init_bind_addr() + opts = JLOptions() + if opts.bindto != C_NULL + bind_to = split(unsafe_string(opts.bindto), ":") + @info "A1: $bind_to" + bind_addr = string(parse(IPAddr, bind_to[1])) + if length(bind_to) > 1 + bind_port = parse(Int,bind_to[2]) + else + bind_port = 0 + end + else + bind_port = 0 + try + ips = getipaddrs(IPv4; loopback = false) + n = length(ips) + bind_addr = string(ips[n]) + catch + # All networking is unavailable, initialize bind_addr to the loopback address + # Will cause an exception to be raised only when used. + bind_addr = "127.0.0.1" + end + end + global LPROC + LPROC.bind_addr = bind_addr + LPROC.bind_port = UInt16(bind_port) +end +=# + +#=function init_bind_addr() + opts = JLOptions() + + @info "A2: $(getipaddrs(IPv4; loopback = false))" + bind_port = 0 + bind_addr = "" + try + ips = getipaddrs(IPv4; loopback = false) + n = length(ips) + bind_addr = string(ips[n]) + @info "ADDR: $ips --- $ips" + catch + # All networking is unavailable, initialize bind_addr to the loopback address + # Will cause an exception to be raised only when used. 
+ bind_addr = "127.0.0.1" + end + + if opts.bindto != C_NULL + bind_to = split(unsafe_string(opts.bindto), ":") + @info "A1: $bind_to" + bind_addr_2 = string(parse(IPAddr, bind_to[1])) + if length(bind_to) > 1 + bind_port = parse(Int,bind_to[2]) + else + bind_port = 0 + end + else + bind_addr_2 = bind_addr + end + + global LPROC + @info "bind_addr=$bind_addr / bind_addr_2=$bind_addr_2" + LPROC.bind_addr = bind_addr + LPROC.bind_addr_2 = bind_addr_2 + LPROC.bind_port = UInt16(bind_port) +end +=# + function init_bind_addr() opts = JLOptions() if opts.bindto != C_NULL @@ -1285,34 +1491,37 @@ function init_bind_addr() end global LPROC LPROC.bind_addr = bind_addr + LPROC.bind_addr_2 = bind_addr LPROC.bind_port = UInt16(bind_port) end using Random: randstring -let inited = false - # do initialization that's only needed when there is more than 1 processor - global function init_multi() - if !inited - inited = true - push!(Base.package_callbacks, _require_callback) - atexit(terminate_all_workers) - init_bind_addr() - cluster_cookie(randstring(HDR_COOKIE_LEN)) - end - return nothing +# do initialization that's only needed when there is more than 1 processor +const inited = Threads.Atomic{Bool}(false) +function init_multi() + if !Threads.atomic_cas!(inited, false, true) + push!(Base.package_callbacks, _require_callback) + atexit(terminate_all_workers) + init_bind_addr() + cluster_cookie(randstring(HDR_COOKIE_LEN)) end + return nothing end function init_parallel() - start_gc_msgs_task() + start_gc_msgs_task(role = :master) # TO CHECK + start_gc_msgs_task(role = :worker) # TO CHECK # start in "head node" mode, if worker, will override later. - global PGRP + #global PGRP global LPROC - LPROC.id = 1 - @assert isempty(PGRP.workers) - register_worker(LPROC) + LPROC.id0 = 0 + LPROC.id1 = 1 + @assert isempty(PGRP(role = :master).workers) # TO CHECK + @assert isempty(PGRP(role = :worker).workers) # TO CHECK + register_worker(LPROC; role = :master) # TO CHECK + register_worker(LPROC; role = :worker) # TO CHECK end write_cookie(io::IO) = print(io.in, string(cluster_cookie(), "\n")) diff --git a/src/clusterserialize.jl b/src/clusterserialize.jl index 0acd4ce..bdd82b8 100644 --- a/src/clusterserialize.jl +++ b/src/clusterserialize.jl @@ -167,10 +167,17 @@ function deserialize_global_from_main(s::ClusterSerializer, sym) return nothing end end + Core.eval(Main, Expr(:global, sym)) if sym_isconst - ccall(:jl_set_const, Cvoid, (Any, Any, Any), Main, sym, v) + # Note that the post-lowering const form is not allowed in value + # position, so there needs to be a dummy `nothing` argument to drop the + # return value. + Core.eval(Main, Expr(:block, + Expr(:const, GlobalRef(Main, sym), v), + nothing)) else - setglobal!(Main, sym, v) + Core.eval(Main, Expr(:global, sym)) + invokelatest(setglobal!, Main, sym, v) end return nothing end @@ -241,14 +248,14 @@ reinitialized. Only those names found to be defined under `mod` are cleared. An exception is raised if a global constant is requested to be cleared. 
""" -function clear!(syms, pids=workers(); mod=Main) +function clear!(syms, pids=workers(); mod=Main, role= :default) @sync for p in pids - @async_unwrap remotecall_wait(clear_impl!, p, syms, mod) + @async_unwrap remotecall_wait(clear_impl!, p, syms, mod; role = role) end end -clear!(sym::Symbol, pid::Int; mod=Main) = clear!([sym], [pid]; mod=mod) -clear!(sym::Symbol, pids=workers(); mod=Main) = clear!([sym], pids; mod=mod) -clear!(syms, pid::Int; mod=Main) = clear!(syms, [pid]; mod=mod) +clear!(sym::Symbol, pid::Int; mod=Main, role= :default) = clear!([sym], [pid]; mod=mod, role = role) +clear!(sym::Symbol, pids=workers(); mod=Main, role= :default) = clear!([sym], pids; mod=mod, role = role) +clear!(syms, pid::Int; mod=Main, role= :default) = clear!(syms, [pid]; mod=mod, role = role) clear_impl!(syms, mod::Module) = foreach(x->clear_impl!(x,mod), syms) clear_impl!(sym::Symbol, mod::Module) = isdefined(mod, sym) && @eval(mod, global $sym = nothing) diff --git a/src/macros.jl b/src/macros.jl index a767c7a..aeb9084 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -2,15 +2,15 @@ let nextidx = Threads.Atomic{Int}(0) global nextproc - function nextproc() + function nextproc(;role= :default) idx = Threads.atomic_add!(nextidx, 1) - return workers()[(idx % nworkers()) + 1] + return workers(role = role)[(idx % nworkers(role = role)) + 1] end end -spawnat(p, thunk) = remotecall(thunk, p) +spawnat(p, thunk; role= :default) = remotecall(thunk, p; role = role) -spawn_somewhere(thunk) = spawnat(nextproc(),thunk) +spawn_somewhere(thunk; role= :default) = spawnat(nextproc(role = role),thunk; role = role) """ @spawn expr @@ -39,11 +39,31 @@ julia> fetch(f) !!! compat "Julia 1.3" As of Julia 1.3 this macro is deprecated. Use `@spawnat :any` instead. """ -macro spawn(expr) + + +#macro spawn(expr, role = :(:default)) + +function check_args_2(args...) + na = length(args) + if na==1 + role = Expr(:kw, :role, :(:defaut)) #:(role = :default) + expr = args[1] + elseif na==2 + role = args[1] + expr = args[2] + else + throw(ArgumentError("wrong number of arguments to spawn")) + end + return role, expr +end + +macro spawn(args...) + rolearg, expr = check_args_2(args...) + thunk = esc(:(()->($expr))) var = esc(Base.sync_varname) quote - local ref = spawn_somewhere($thunk) + local ref = spawn_somewhere($thunk; $(esc(rolearg))) if $(Expr(:islocal, var)) put!($var, ref) end @@ -51,13 +71,17 @@ macro spawn(expr) end end + """ @spawnat p expr Create a closure around an expression and run the closure asynchronously on process `p`. Return a [`Future`](@ref) to the result. + If `p` is the quoted literal symbol `:any`, then the system will pick a -processor to use automatically. +processor to use automatically. Using `:any` will not apply any form of +load-balancing, consider using a [`WorkerPool`](@ref) and [`remotecall(f, +::WorkerPool)`](@ref) if you need load-balancing. # Examples ```julia-repl @@ -79,15 +103,36 @@ julia> fetch(f) !!! compat "Julia 1.3" The `:any` argument is available as of Julia 1.3. """ -macro spawnat(p, expr) - thunk = esc(:(()->($expr))) - var = esc(Base.sync_varname) - if p === QuoteNode(:any) - spawncall = :(spawn_somewhere($thunk)) + +function check_args_3a(args...) 
+ na = length(args) + if na==2 + role = Expr(:kw, :role, :(:defaut)) #:(role = :default) + p = args[1] + expr = args[2] + elseif na==3 + role = args[1] + p = args[2] + expr = args[3] else - spawncall = :(spawnat($(esc(p)), $thunk)) + throw(ArgumentError("wrong number of arguments to spawnat")) end - quote + return role, p, expr +end + +macro spawnat(args...) + rolearg, p, expr = check_args_3a(args...) + + #@info rolearg, typeof(rolearg) + + thunk = esc(:(()->($expr))) + var = esc(Base.sync_varname) + if p === QuoteNode(:any) + spawncall = :(spawn_somewhere($thunk; $(esc(rolearg)))) + else + spawncall = :(spawnat($(esc(p)), $thunk; $(esc(rolearg)))) + end + quote local ref = $spawncall if $(Expr(:islocal, var)) put!($var, ref) @@ -96,6 +141,7 @@ macro spawnat(p, expr) end end + """ @fetch expr @@ -119,9 +165,13 @@ julia> @fetch myid() 2 ``` """ -macro fetch(expr) + +macro fetch(args...) + + rolearg, expr = check_args_2(args...) + thunk = esc(:(()->($expr))) - :(remotecall_fetch($thunk, nextproc())) + :(remotecall_fetch($thunk, nextproc(); $(esc(rolearg)))) end """ @@ -141,9 +191,12 @@ julia> @fetchfrom 4 myid() 4 ``` """ -macro fetchfrom(p, expr) + + +macro fetchfrom(args...) + rolearg, p, expr = check_args_3a(args...) thunk = esc(:(()->($expr))) - :(remotecall_fetch($thunk, $(esc(p)))) + :(remotecall_fetch($thunk, $(esc(p)); $(esc(rolearg)))) end # extract a list of modules to import from an expression @@ -185,24 +238,58 @@ processes to have execute the expression. Similar to calling `remotecall_eval(Main, procs, expr)`, but with two extra features: - - `using` and `import` statements run on the calling process first, to ensure - packages are precompiled. - - The current source file path used by `include` is propagated to other processes. +- `using` and `import` statements run on the calling process first, to ensure + packages are precompiled. +- The current source file path used by `include` is propagated to other processes. """ -macro everywhere(ex) - procs = GlobalRef(@__MODULE__, :procs) - return esc(:($(Distributed).@everywhere $procs() $ex)) + +function check_args_3b(args...) + + na = length(args) + if na==1 + rolearg = Expr(:kw, :role, :(:defaut)) #:(role = :default) + reducer = nothing + loop = args[1] + elseif na==2 + if isa(args[1], Expr) && args[1].head == :(=) && args[1].args[1] === :role + rolearg = args[1] + reducer = nothing + loop = args[2] + else + rolearg = Expr(:kw, :role, :(:defaut)) #:(role = :default) + reducer = args[1] + loop = args[2] + end + elseif na==3 + rolearg = args[1] + reducer = args[2] + loop = args[3] + else + throw(ArgumentError("wrong number of arguments to @distributed")) + end + + return rolearg, reducer, loop end -macro everywhere(procs, ex) - imps = extract_imports(ex) - return quote - $(isempty(imps) ? nothing : Expr(:toplevel, imps...)) # run imports locally first - let ex = Expr(:toplevel, :(task_local_storage()[:SOURCE_PATH] = $(get(task_local_storage(), :SOURCE_PATH, nothing))), $(esc(Expr(:quote, ex)))), - procs = $(esc(procs)) - remotecall_eval(Main, procs, ex) +macro everywhere(args...) + + rolearg, procs, ex = check_args_3b(args...) + + if isnothing(procs) + procs = GlobalRef(@__MODULE__, :procs) + return esc(:($(Distributed).@everywhere $rolearg $procs(;$rolearg) $ex)) + else + imps = extract_imports(ex) + return quote + $(isempty(imps) ? 
nothing : Expr(:toplevel, imps...)) # run imports locally first + let ex = Expr(:toplevel, :(task_local_storage()[:SOURCE_PATH] = $(get(task_local_storage(), :SOURCE_PATH, nothing))), $(esc(Expr(:quote, ex)))), + procs = $(esc(procs)) + remotecall_eval(Main, procs, ex; $(esc(rolearg))) + end end + end + end """ @@ -215,14 +302,14 @@ Errors on any of the processes are collected into a See also [`@everywhere`](@ref). """ -function remotecall_eval(m::Module, procs, ex) +function remotecall_eval(m::Module, procs, ex; role=:default) @sync begin run_locally = 0 for pid in procs - if pid == myid() + if pid == myid(role=role) run_locally += 1 else - @async_unwrap remotecall_wait(Core.eval, pid, m, ex) + @async_unwrap remotecall_wait(Core.eval, pid, m, ex; role=role) end end yield() # ensure that the remotecalls have had a chance to start @@ -238,8 +325,8 @@ end # optimized version of remotecall_eval for a single pid # and which also fetches the return value -function remotecall_eval(m::Module, pid::Int, ex) - return remotecall_fetch(Core.eval, pid, m, ex) +function remotecall_eval(m::Module, pid::Int, ex; role=:default) + return remotecall_fetch(Core.eval, pid, m, ex; role=role) end @@ -261,22 +348,22 @@ function splitrange(firstIndex::Int, lastIndex::Int, np::Int) return chunks end -function preduce(reducer, f, R) - chunks = splitrange(Int(firstindex(R)), Int(lastindex(R)), nworkers()) - all_w = workers()[1:length(chunks)] +function preduce(reducer, f, R; role = :default) + chunks = splitrange(Int(firstindex(R)), Int(lastindex(R)), nworkers(role=role)) + all_w = workers(role=role)[1:length(chunks)] w_exec = Task[] for (idx,pid) in enumerate(all_w) - t = Task(()->remotecall_fetch(f, pid, reducer, R, first(chunks[idx]), last(chunks[idx]))) + t = Task(()->remotecall_fetch(f, pid, reducer, R, first(chunks[idx]), last(chunks[idx]), role=role)) schedule(t) push!(w_exec, t) end reduce(reducer, Any[fetch(t) for t in w_exec]) end -function pfor(f, R) - t = @async @sync for c in splitrange(Int(firstindex(R)), Int(lastindex(R)), nworkers()) - @spawnat :any f(R, first(c), last(c)) +function pfor(f, R; role = :default) + t = @async @sync for c in splitrange(Int(firstindex(R)), Int(lastindex(R)), nworkers(role=role)) + @spawnat role=role :any f(R, first(c), last(c)) end errormonitor(t) end @@ -328,15 +415,9 @@ completion. To wait for completion, prefix the call with [`@sync`](@ref), like : end """ macro distributed(args...) - na = length(args) - if na==1 - loop = args[1] - elseif na==2 - reducer = args[1] - loop = args[2] - else - throw(ArgumentError("wrong number of arguments to @distributed")) - end + + rolearg, reducer, loop = check_args_3b(args...) + if !isa(loop,Expr) || loop.head !== :for error("malformed @distributed loop") end @@ -346,16 +427,16 @@ macro distributed(args...) if Meta.isexpr(body, :block) && body.args[end] isa LineNumberNode resize!(body.args, length(body.args) - 1) end - if na==1 + if isnothing(reducer) syncvar = esc(Base.sync_varname) return quote - local ref = pfor($(make_pfor_body(var, body)), $(esc(r))) + local ref = pfor($(make_pfor_body(var, body)), $(esc(r)); $(esc(rolearg))) if $(Expr(:islocal, syncvar)) put!($syncvar, ref) end ref end else - return :(preduce($(esc(reducer)), $(make_preduce_body(var, body)), $(esc(r)))) + return :(preduce($(esc(reducer)), $(make_preduce_body(var, body)), $(esc(r)); $(esc(rolearg)))) # TO CHECK (role ?) 
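        # A rough usage sketch for the role-aware forms, assuming this process has
        # itself called addprocs and therefore owns a nested set of workers
        # (everything below is illustrative, not part of the patch):
        #
        #   @everywhere role=:master println("hello from a nested worker")
        #   s = @distributed role=:master (+) for i in 1:100
        #           i^2
        #       end            # with a reducer the reduced value is returned;
        #                      # without one the loop runs asynchronously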
end end diff --git a/src/managers.jl b/src/managers.jl index b2b655a..658c98a 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -111,7 +111,9 @@ addprocs([ version is used on all remote machines because serialization and code distribution might fail otherwise. -* `exeflags`: additional flags passed to the worker processes. +* `exeflags`: additional flags passed to the worker processes. It can either be a `Cmd`, a `String` + holding one flag, or a collection of strings, with one element per flag. + E.g. `\`--threads=auto project=.\``, `"--compile-trace=stderr"` or `["--threads=auto", "--compile=all"]`. * `topology`: Specifies how the workers connect to each other. Sending a message between unconnected workers results in an error. @@ -169,14 +171,16 @@ default_addprocs_params(::SSHManager) = :env => [], :tunnel => false, :multiplex => false, - :max_parallel => 10)) + :max_parallel => 10, + :ident => nothing, + :connect_idents => nothing)) function launch(manager::SSHManager, params::Dict, launched::Array, launch_ntfy::Condition) # Launch one worker on each unique host in parallel. Additional workers are launched later. # Wait for all launches to complete. @sync for (i, (machine, cnt)) in enumerate(manager.machines) let machine=machine, cnt=cnt - @async try + @async try launch_on_machine(manager, $machine, $cnt, params, launched, launch_ntfy) catch e print(stderr, "exception launching on machine $(machine) : $(e)\n") @@ -228,6 +232,7 @@ function parse_machine(machine::AbstractString) end function launch_on_machine(manager::SSHManager, machine::AbstractString, cnt, params::Dict, launched::Array, launch_ntfy::Condition) + shell = params[:shell] ssh = params[:ssh] dir = params[:dir] @@ -361,7 +366,15 @@ function launch_on_machine(manager::SSHManager, machine::AbstractString, cnt, pa wconfig.count = cnt wconfig.max_parallel = params[:max_parallel] wconfig.enable_threaded_blas = params[:enable_threaded_blas] - + #@info "will test connect_idents -- $(wconfig.ident)" + if haskey(params,:connect_idents) && !isnothing(params[:connect_idents]) + wconfig.connect_idents = Vector(params[:connect_idents]) + # @info "connect_idents = $(wconfig.connect_idents)" + end + if haskey(params, :ident) && !isnothing(params[:ident]) + wconfig.ident = params[:ident] + # @info "-------------- $(wconfig.ident)" + end push!(launched, wconfig) notify(launch_ntfy) @@ -572,16 +585,26 @@ workers. function connect(manager::ClusterManager, pid::Int, config::WorkerConfig) if config.connect_at !== nothing # this is a worker-to-worker setup call. 
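        # connect_at is only set when this call is made on behalf of a
        # worker-to-worker connection, so the manager-specific handshake below is
        # skipped and connect_w2w dials the advertised address directly.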
+ #(rhost, rport) = notnothing(config.connect_at)::Tuple{String, Int} + #config.host = rhost + #config.port = rport + #config.connect_at = nothing return connect_w2w(pid, config) + #return connect(manager, pid, config) end + #@info "CONNECT W1 " + # master connecting to workers if config.io !== nothing (bind_addr, port::Int) = read_worker_host_port(config.io) + # @info "CONNECT W2 $bind_addr $port $(config.host) $(config.bind_addr)" pubhost = something(config.host, bind_addr) + # @info "CONNECT W21 $pubhost" config.host = pubhost config.port = port else + #@info "CONNECT W3" pubhost = notnothing(config.host) port = notnothing(config.port) bind_addr = something(config.bind_addr, pubhost) @@ -619,6 +642,7 @@ function connect(manager::ClusterManager, pid::Int, config::WorkerConfig) release(sem) end else +# (s, bind_addr) = connect_to_worker(#=bind_addr=# pubhost, port) (s, bind_addr) = connect_to_worker(bind_addr, port) end @@ -681,6 +705,9 @@ function bind_client_port(sock::TCPSocket, iptype) end function connect_to_worker(host::AbstractString, port::Integer) + +# @info "--------- CONNECT TO WORKER $host $port" + # Avoid calling getaddrinfo if possible - involves a DNS lookup # host may be a stringified ipv4 / ipv6 address or a dns name bind_addr = nothing @@ -690,6 +717,7 @@ function connect_to_worker(host::AbstractString, port::Integer) bind_addr = getaddrinfo(host) end + iptype = typeof(bind_addr) sock = socket_reuse_port(iptype) connect(sock, bind_addr, UInt16(port)) @@ -699,6 +727,9 @@ end function connect_to_worker_with_tunnel(host::AbstractString, bind_addr::AbstractString, port::Integer, tunnel_user::AbstractString, sshflags, multiplex) + + # @info "++++++++ CONNECT TO WORKER WITH TUNNEL host=$host port=$port bind_addr=$bind_addr tunnel_user=$tunnel_user sshflags=$sshflags multiplex=$multiplex" + localport = ssh_tunnel(tunnel_user, host, bind_addr, UInt16(port), sshflags, multiplex) s = connect("localhost", localport) forward = "$localport:$bind_addr:$port" @@ -728,31 +759,39 @@ It should cause the remote worker specified by `pid` to exit. on `pid`. """ function kill(manager::ClusterManager, pid::Int, config::WorkerConfig) - remote_do(exit, pid) + remote_do(exit, pid; role = :master) nothing end function kill(manager::SSHManager, pid::Int, config::WorkerConfig) - remote_do(exit, pid) + remote_do(exit, pid; role = :master) cancel_ssh_tunnel(config) nothing end -function kill(manager::LocalManager, pid::Int, config::WorkerConfig; exit_timeout = 15, term_timeout = 15) +function kill(manager::LocalManager, pid::Int, config::WorkerConfig; profile_wait = 6, exit_timeout = 15, term_timeout = 15) + # profile_wait = 6 is 1s for profile, 5s for the report to show # First, try sending `exit()` to the remote over the usual control channels - remote_do(exit, pid) + remote_do(exit, pid; role = :master) timer_task = @async begin sleep(exit_timeout) # Check to see if our child exited, and if not, send an actual kill signal if !process_exited(config.process) - @warn("Failed to gracefully kill worker $(pid), sending SIGTERM") - kill(config.process, Base.SIGTERM) + @warn "Failed to gracefully kill worker $(pid)" + profile_sig = Sys.iswindows() ? nothing : Sys.isbsd() ? 
("SIGINFO", 29) : ("SIGUSR1" , 10) + if profile_sig !== nothing + @warn("Sending profile $(profile_sig[1]) to worker $(pid)") + kill(config.process, profile_sig[2]) + sleep(profile_wait) + end + @warn("Sending SIGQUIT to worker $(pid)") + kill(config.process, Base.SIGQUIT) sleep(term_timeout) if !process_exited(config.process) - @warn("Worker $(pid) ignored SIGTERM, sending SIGKILL") + @warn("Worker $(pid) ignored SIGQUIT, sending SIGKILL") kill(config.process, Base.SIGKILL) end end diff --git a/src/messages.jl b/src/messages.jl index fe3e5ab..92afe8b 100644 --- a/src/messages.jl +++ b/src/messages.jl @@ -99,30 +99,30 @@ function send_msg_unknown(s::IO, header, msg) error("attempt to send to unknown socket") end -function send_msg(s::IO, header, msg) - id = worker_id_from_socket(s) +function send_msg(s::IO, header, msg; role= :default) + id = worker_id_from_socket(s; role = role) if id > -1 - return send_msg(worker_from_id(id), header, msg) + return send_msg(worker_from_id(id, role=role), header, msg; role = role) end send_msg_unknown(s, header, msg) end -function send_msg_now(s::IO, header, msg::AbstractMsg) - id = worker_id_from_socket(s) +function send_msg_now(s::IO, header, msg::AbstractMsg; role= :default) + id = worker_id_from_socket(s; role = role) if id > -1 - return send_msg_now(worker_from_id(id), header, msg) + return send_msg_now(worker_from_id(id; role=role), header, msg; role = role) end send_msg_unknown(s, header, msg) end -function send_msg_now(w::Worker, header, msg) - send_msg_(w, header, msg, true) +function send_msg_now(w::Worker, header, msg; role= :default) + send_msg_(w, header, msg, true; role = role) end -function send_msg(w::Worker, header, msg) - send_msg_(w, header, msg, false) +function send_msg(w::Worker, header, msg; role= :default) + send_msg_(w, header, msg, false; role = role) end -function flush_gc_msgs(w::Worker) +function flush_gc_msgs(w::Worker; role= :default) if !isdefined(w, :w_stream) return end @@ -144,10 +144,10 @@ function flush_gc_msgs(w::Worker) end end if add_msgs !== nothing - remote_do(add_clients, w, add_msgs) + remote_do((add_msgs, role) -> add_clients(add_msgs, role = role), w, add_msgs, wid(w,role=role) == 1 ? :master : :worker; role = role) end if del_msgs !== nothing - remote_do(del_clients, w, del_msgs) + remote_do((del_msgs, role) -> del_clients(del_msgs, role = role), w, del_msgs, wid(w,role=role) == 1 ? 
:master : :worker; role = role) end return end @@ -168,9 +168,9 @@ function deserialize_hdr_raw(io) return MsgHeader(RRID(data[1], data[2]), RRID(data[3], data[4])) end -function send_msg_(w::Worker, header, msg, now::Bool) - check_worker_state(w) - if myid() != 1 && !isa(msg, IdentifySocketMsg) && !isa(msg, IdentifySocketAckMsg) +function send_msg_(w::Worker, header, msg, now::Bool; role= :default) + check_worker_state(w; role = role) + if myid(role=role) != 1 && !isa(msg, IdentifySocketMsg) && !isa(msg, IdentifySocketAckMsg) wait(w.initialized) end io = w.w_stream @@ -182,7 +182,7 @@ function send_msg_(w::Worker, header, msg, now::Bool) write(io, MSG_BOUNDARY) if !now && w.gcflag - flush_gc_msgs(w) + flush_gc_msgs(w; role = role) else flush(io) end @@ -191,11 +191,11 @@ function send_msg_(w::Worker, header, msg, now::Bool) end end -function flush_gc_msgs() +function flush_gc_msgs(; role= :default) try - for w in (PGRP::ProcessGroup).workers - if isa(w,Worker) && (w.state == W_CONNECTED) && w.gcflag - flush_gc_msgs(w) + for w in (PGRP(role = role)::ProcessGroup).workers + if isa(w,Worker) && ((@atomic w.state) == W_CONNECTED) && w.gcflag + flush_gc_msgs(w; role = role) end end catch e diff --git a/src/pmap.jl b/src/pmap.jl index 39acc4d..225c9ad 100644 --- a/src/pmap.jl +++ b/src/pmap.jl @@ -18,16 +18,16 @@ Note that `f` must be made available to all worker processes; see [Code Availability and Loading Packages](@ref code-availability) for details. """ -function pgenerate(p::AbstractWorkerPool, f, c) +function pgenerate(p::AbstractWorkerPool, f, c; role= :default) if length(p) == 0 - return AsyncGenerator(f, c; ntasks=()->nworkers(p)) + return AsyncGenerator(f, c; ntasks=()->nworkers(p; role = role)) end batches = batchsplit(c, min_batch_count = length(p) * 3) - return Iterators.flatten(AsyncGenerator(remote(p, b -> asyncmap(f, b)), batches)) + return Iterators.flatten(AsyncGenerator(remote(p, b -> asyncmap(f, b); role = role), batches)) end -pgenerate(p::AbstractWorkerPool, f, c1, c...) = pgenerate(p, a->f(a...), zip(c1, c...)) -pgenerate(f, c) = pgenerate(default_worker_pool(), f, c) -pgenerate(f, c1, c...) = pgenerate(a->f(a...), zip(c1, c...)) +pgenerate(p::AbstractWorkerPool, f, c1, c...; role= :default) = pgenerate(p, a->f(a...), zip(c1, c...); role = role) +pgenerate(f, c; role= :default) = pgenerate(default_worker_pool(role=role), f, c; role = role) +pgenerate(f, c1, c...; role= :default) = pgenerate(a->f(a...), zip(c1, c...); role = role) """ pmap(f, [::AbstractWorkerPool], c...; distributed=true, batch_size=1, on_error=nothing, retry_delays=[], retry_check=nothing) -> collection @@ -97,10 +97,10 @@ pmap(f, c; on_error = e->(isa(e, InexactError) ? NaN : rethrow()), retry_delays ``` """ function pmap(f, p::AbstractWorkerPool, c; distributed=true, batch_size=1, on_error=nothing, - retry_delays=[], retry_check=nothing) + retry_delays=[], retry_check=nothing, role= :default) f_orig = f # Don't do remote calls if there are no workers. 
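    # The role keyword is threaded through to myid/nworkers/remote below, so a worker
    # acting as a nested master maps the work over the workers it created rather than
    # over the cluster it belongs to. Illustrative usage (process ids are made up):
    #
    #   pmap(x -> x^2, 1:100; role = :master)
    #   pmap(x -> x^2, WorkerPool([2, 3]; role = :master), 1:100; role = :master)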
- if (length(p) == 0) || (length(p) == 1 && fetch(p.channel) == myid()) + if (length(p) == 0) || (length(p) == 1 && fetch(p.channel) == myid(role = role)) distributed = false end @@ -116,14 +116,14 @@ function pmap(f, p::AbstractWorkerPool, c; distributed=true, batch_size=1, on_er end if distributed - f = remote(p, f) + f = remote(p, f; role=role) end if length(retry_delays) > 0 f = wrap_retry(f, retry_delays, retry_check) end - return asyncmap(f, c; ntasks=()->nworkers(p)) + return asyncmap(f, c; ntasks=()->nworkers(p; role = role)) else # During batch processing, We need to ensure that if on_error is set, it is called # for each element in error, and that we return as many elements as the original list. @@ -140,12 +140,12 @@ function pmap(f, p::AbstractWorkerPool, c; distributed=true, batch_size=1, on_er f = wrap_on_error(f, (x,e)->BatchProcessingError(x,e); capture_data=true) end - f = wrap_batch(f, p, handle_errors) - results = asyncmap(f, c; ntasks=()->nworkers(p), batch_size=batch_size) + f = wrap_batch(f, p, handle_errors; role=role) + results = asyncmap(f, c; ntasks=()->nworkers(p; role = role), batch_size=batch_size) # process errors if any. if handle_errors - process_batch_errors!(p, f_orig, results, on_error, retry_delays, retry_check) + process_batch_errors!(p, f_orig, results, on_error, retry_delays, retry_check; role = role) end return results @@ -153,7 +153,7 @@ function pmap(f, p::AbstractWorkerPool, c; distributed=true, batch_size=1, on_er end pmap(f, p::AbstractWorkerPool, c1, c...; kwargs...) = pmap(a->f(a...), p, zip(c1, c...); kwargs...) -pmap(f, c; kwargs...) = pmap(f, CachingPool(workers()), c; kwargs...) +pmap(f, c; role = :default, kwargs...) = pmap(f, CachingPool(workers(role = role)), c; role = role, kwargs...) pmap(f, c1, c...; kwargs...) = pmap(a->f(a...), zip(c1, c...); kwargs...) function wrap_on_error(f, on_error; capture_data=false) @@ -180,11 +180,11 @@ function wrap_retry(f, retry_delays, retry_check) end end -function wrap_batch(f, p, handle_errors) +function wrap_batch(f, p, handle_errors; role= :default) f = asyncmap_batch(f) return batch -> begin try - remotecall_fetch(f, p, batch) + remotecall_fetch(f, p, batch; role=role) catch e if handle_errors return Any[BatchProcessingError(b, e) for b in batch] @@ -199,7 +199,7 @@ asyncmap_batch(f) = batch -> asyncmap(x->f(x...), batch) extract_exception(e) = isa(e, RemoteException) ? 
e.captured.ex : e -function process_batch_errors!(p, f, results, on_error, retry_delays, retry_check) +function process_batch_errors!(p, f, results, on_error, retry_delays, retry_check; role= :default) # Handle all the ones in error in another pmap, with batch size set to 1 reprocess = Tuple{Int,BatchProcessingError}[] for (idx, v) in enumerate(results) @@ -211,14 +211,14 @@ function process_batch_errors!(p, f, results, on_error, retry_delays, retry_chec if length(reprocess) > 0 errors = [x[2] for x in reprocess] exceptions = Any[x.ex for x in errors] - state = iterate(retry_delays) + state = iterate(retry_delays#=; role = role=#) state !== nothing && (state = state[2]) error_processed = let state=state if (length(retry_delays)::Int > 0) && (retry_check === nothing || all([retry_check(state,ex)[2] for ex in exceptions])) # BatchProcessingError.data is a tuple of original args pmap(x->f(x...), p, Any[x.data for x in errors]; - on_error = on_error, retry_delays = collect(retry_delays)[2:end::Int], retry_check = retry_check) + on_error = on_error, retry_delays = collect(retry_delays)[2:end::Int], retry_check = retry_check, role = role) elseif on_error !== nothing map(on_error, exceptions) else diff --git a/src/process_messages.jl b/src/process_messages.jl index 3032917..b21d3ea 100644 --- a/src/process_messages.jl +++ b/src/process_messages.jl @@ -58,70 +58,70 @@ Exceptions on remote computations are captured and rethrown locally. A `RemoteE wraps the `pid` of the worker and a captured exception. A `CapturedException` captures the remote exception and a serializable form of the call stack when the exception was raised. """ -RemoteException(captured) = RemoteException(myid(), captured) -function showerror(io::IO, re::RemoteException) - (re.pid != myid()) && print(io, "On worker ", re.pid, ":\n") - showerror(io, re.captured) +RemoteException(captured; role= :default) = RemoteException(myid(role=role), captured) +function showerror(io::IO, re::RemoteException#=; role= :default=#) + (re.pid != myid(#=role = role=#)) && print(io, "On worker ", re.pid, ":\n") + showerror(io, re.captured#=; role = role=#) end -function run_work_thunk(thunk::Function, print_error::Bool) +function run_work_thunk(thunk::Function, print_error::Bool; role=:default) local result try result = thunk() catch err ce = CapturedException(err, catch_backtrace()) - result = RemoteException(ce) - print_error && showerror(stderr, ce) + result = RemoteException(ce; role=role) + print_error && showerror(stderr, ce#=; role = role=#) end return result end -function run_work_thunk(rv::RemoteValue, thunk) - put!(rv, run_work_thunk(thunk, false)) +function run_work_thunk(rv::RemoteValue, thunk; role= :default) + put!(rv, run_work_thunk(thunk, false; role=role)) nothing end -function schedule_call(rid, thunk) +function schedule_call(rid, thunk; role= :default) return lock(client_refs) do rv = RemoteValue(def_rv_channel()) - (PGRP::ProcessGroup).refs[rid] = rv + (PGRP(role = role)::ProcessGroup).refs[rid] = rv push!(rv.clientset, rid.whence) - errormonitor(@async run_work_thunk(rv, thunk)) + errormonitor(@async run_work_thunk(rv, thunk; role=role)) return rv end end -function deliver_result(sock::IO, msg, oid, value) - #print("$(myid()) sending result $oid\n") +function deliver_result(sock::IO, msg, oid, value; role= :default) + #print("$(myid(role=role)) sending result $oid\n") if msg === :call_fetch || isa(value, RemoteException) val = value else val = :OK end try - send_msg_now(sock, MsgHeader(oid), ResultMsg(val)) + send_msg_now(sock, 
MsgHeader(oid), ResultMsg(val); role = role) catch e # terminate connection in case of serialization error # otherwise the reading end would hang - @error "Fatal error on process $(myid())" exception=e,catch_backtrace() - wid = worker_id_from_socket(sock) + @error "Fatal error on process $(myid(role=role))" exception=e,catch_backtrace() + wid = worker_id_from_socket(sock; role = role) close(sock) - if myid()==1 + if myid(role=role)==1 rmprocs(wid) elseif wid == 1 exit(1) else - remote_do(rmprocs, 1, wid) + remote_do(rmprocs, 1, wid; role = role) end end end ## message event handlers ## -function process_messages(r_stream::TCPSocket, w_stream::TCPSocket, incoming::Bool=true) - errormonitor(@async process_tcp_streams(r_stream, w_stream, incoming)) +function process_messages(r_stream::TCPSocket, w_stream::TCPSocket, incoming::Bool=true; role= :default) + errormonitor(@async process_tcp_streams(r_stream, w_stream, incoming; role = role)) end -function process_tcp_streams(r_stream::TCPSocket, w_stream::TCPSocket, incoming::Bool) +function process_tcp_streams(r_stream::TCPSocket, w_stream::TCPSocket, incoming::Bool; role= :default) Sockets.nagle(r_stream, false) Sockets.quickack(r_stream, true) wait_connected(r_stream) @@ -130,7 +130,7 @@ function process_tcp_streams(r_stream::TCPSocket, w_stream::TCPSocket, incoming: Sockets.quickack(w_stream, true) wait_connected(w_stream) end - message_handler_loop(r_stream, w_stream, incoming) + message_handler_loop(r_stream, w_stream, incoming; role = role) end """ @@ -147,22 +147,22 @@ Julia version number to perform the authentication handshake. See also [`cluster_cookie`](@ref). """ -function process_messages(r_stream::IO, w_stream::IO, incoming::Bool=true) - errormonitor(@async message_handler_loop(r_stream, w_stream, incoming)) +function process_messages(r_stream::IO, w_stream::IO, incoming::Bool=true; role= :default) + errormonitor(@async message_handler_loop(r_stream, w_stream, incoming; role = role)) end -function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) +function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool; role= :default) wpid=0 # the worker r_stream is connected to. boundary = similar(MSG_BOUNDARY) try - version = process_hdr(r_stream, incoming) + version = process_hdr(r_stream, incoming; role = role) serializer = ClusterSerializer(r_stream) # The first message will associate wpid with r_stream header = deserialize_hdr_raw(r_stream) msg = deserialize_msg(serializer) - handle_msg(msg, header, r_stream, w_stream, version) - wpid = worker_id_from_socket(r_stream) + handle_msg(msg, header, r_stream, w_stream, version; role = role) + wpid = worker_id_from_socket(r_stream; role = role) @assert wpid > 0 readbytes!(r_stream, boundary, length(MSG_BOUNDARY)) @@ -170,11 +170,12 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) while true reset_state(serializer) header = deserialize_hdr_raw(r_stream) - # println("header: ", header) + #println("header: ", header) try msg = invokelatest(deserialize_msg, serializer) catch e + #println("*************************************************") # Deserialization error; discard bytes in stream until boundary found boundary_idx = 1 while true @@ -193,42 +194,42 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) # remotecalls only rethrow RemoteExceptions. Any other exception is treated as # data to be returned. Wrap this exception in a RemoteException. 
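                # The header determines how the failure is routed: a non-null
                # response_oid means a local ref is waiting for this value, while a
                # non-null notify_oid means the remote caller is blocked in a fetch
                # and must be answered over w_stream.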
- remote_err = RemoteException(myid(), CapturedException(e, catch_backtrace())) + remote_err = RemoteException(myid(role=role), CapturedException(e, catch_backtrace())) # println("Deserialization error. ", remote_err) if !null_id(header.response_oid) - ref = lookup_ref(header.response_oid) + ref = lookup_ref(header.response_oid; role = role) put!(ref, remote_err) end if !null_id(header.notify_oid) - deliver_result(w_stream, :call_fetch, header.notify_oid, remote_err) + deliver_result(w_stream, :call_fetch, header.notify_oid, remote_err; role = role) end continue end readbytes!(r_stream, boundary, length(MSG_BOUNDARY)) - # println("got msg: ", typeof(msg)) - handle_msg(msg, header, r_stream, w_stream, version) + #println("got msg: ", typeof(msg)) + handle_msg(msg, header, r_stream, w_stream, version; role = role) end catch e oldstate = W_UNKNOWN_STATE # Check again as it may have been set in a message handler but not propagated to the calling block above if wpid < 1 - wpid = worker_id_from_socket(r_stream) + wpid = worker_id_from_socket(r_stream; role = role) end if wpid < 1 println(stderr, e, CapturedException(e, catch_backtrace())) - println(stderr, "Process($(myid())) - Unknown remote, closing connection.") + println(stderr, "Process($(myid(role=role))) - Unknown remote, closing connection.") elseif !(wpid in map_del_wrkr) werr = worker_from_id(wpid) - oldstate = werr.state + oldstate = @atomic werr.state set_worker_state(werr, W_TERMINATED) # If unhandleable error occurred talking to pid 1, exit if wpid == 1 if isopen(w_stream) - @error "Fatal error on process $(myid())" exception=e,catch_backtrace() + @error "Fatal error on process $(myid(role=role))" exception=e,catch_backtrace() end exit(1) end @@ -236,13 +237,13 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) # Will treat any exception as death of node and cleanup # since currently we do not have a mechanism for workers to reconnect # to each other on unhandled errors - deregister_worker(wpid) + deregister_worker(wpid; role = role) end close(r_stream) close(w_stream) - if (myid() == 1) && (wpid > 1) + if (myid(role=role) == 1) && (wpid > 1) if oldstate != W_TERMINATING println(stderr, "Worker $wpid terminated.") rethrow() @@ -253,7 +254,7 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) end end -function process_hdr(s, validate_cookie) +function process_hdr(s, validate_cookie; role= :default) if validate_cookie cookie = read(s, HDR_COOKIE_LEN) if length(cookie) < HDR_COOKIE_LEN @@ -263,7 +264,7 @@ function process_hdr(s, validate_cookie) self_cookie = cluster_cookie() for i in 1:HDR_COOKIE_LEN if UInt8(self_cookie[i]) != cookie[i] - error("Process($(myid())) - Invalid connection credentials sent by remote.") + error("Process($(myid(role = role))) - Invalid connection credentials sent by remote.") end end end @@ -279,67 +280,69 @@ function process_hdr(s, validate_cookie) return VersionNumber(strip(String(version))) end -function handle_msg(msg::CallMsg{:call}, header, r_stream, w_stream, version) - schedule_call(header.response_oid, ()->invokelatest(msg.f, msg.args...; msg.kwargs...)) +function handle_msg(msg::CallMsg{:call}, header, r_stream, w_stream, version; role= :default) + schedule_call(header.response_oid, ()->invokelatest(msg.f, msg.args...; msg.kwargs...); role = role) end -function handle_msg(msg::CallMsg{:call_fetch}, header, r_stream, w_stream, version) +function handle_msg(msg::CallMsg{:call_fetch}, header, r_stream, w_stream, version; role= :default) + #@info 
"handle ", msg errormonitor(@async begin - v = run_work_thunk(()->invokelatest(msg.f, msg.args...; msg.kwargs...), false) + v = run_work_thunk(()->invokelatest(msg.f, msg.args...; msg.kwargs...), false; role=role) if isa(v, SyncTake) try - deliver_result(w_stream, :call_fetch, header.notify_oid, v.v) + deliver_result(w_stream, :call_fetch, header.notify_oid, v.v; role = role) finally unlock(v.rv.synctake) end else - deliver_result(w_stream, :call_fetch, header.notify_oid, v) + deliver_result(w_stream, :call_fetch, header.notify_oid, v; role = role) end nothing end) end -function handle_msg(msg::CallWaitMsg, header, r_stream, w_stream, version) +function handle_msg(msg::CallWaitMsg, header, r_stream, w_stream, version; role= :default) errormonitor(@async begin - rv = schedule_call(header.response_oid, ()->invokelatest(msg.f, msg.args...; msg.kwargs...)) - deliver_result(w_stream, :call_wait, header.notify_oid, fetch(rv.c)) + rv = schedule_call(header.response_oid, ()->invokelatest(msg.f, msg.args...; msg.kwargs...); role = role) + deliver_result(w_stream, :call_wait, header.notify_oid, fetch(rv.c); role = role) nothing end) end -function handle_msg(msg::RemoteDoMsg, header, r_stream, w_stream, version) - errormonitor(@async run_work_thunk(()->invokelatest(msg.f, msg.args...; msg.kwargs...), true)) +function handle_msg(msg::RemoteDoMsg, header, r_stream, w_stream, version; role= :default) + errormonitor(@async run_work_thunk(()->invokelatest(msg.f, msg.args...; msg.kwargs...), true; role=role)) end -function handle_msg(msg::ResultMsg, header, r_stream, w_stream, version) - put!(lookup_ref(header.response_oid), msg.value) +function handle_msg(msg::ResultMsg, header, r_stream, w_stream, version; role= :default) + put!(lookup_ref(header.response_oid; role = role), msg.value) end -function handle_msg(msg::IdentifySocketMsg, header, r_stream, w_stream, version) +function handle_msg(msg::IdentifySocketMsg, header, r_stream, w_stream, version; role= :default) # register a new peer worker connection - w = Worker(msg.from_pid, r_stream, w_stream, cluster_manager; version=version) + w = Worker(msg.from_pid, r_stream, w_stream, cluster_manager; version=version, role = role) send_connection_hdr(w, false) - send_msg_now(w, MsgHeader(), IdentifySocketAckMsg()) + send_msg_now(w, MsgHeader(), IdentifySocketAckMsg(); role = role) notify(w.initialized) end -function handle_msg(msg::IdentifySocketAckMsg, header, r_stream, w_stream, version) +function handle_msg(msg::IdentifySocketAckMsg, header, r_stream, w_stream, version; role= :default) w = map_sock_wrkr[r_stream] w.version = version end -function handle_msg(msg::JoinPGRPMsg, header, r_stream, w_stream, version) - LPROC.id = msg.self_pid - controller = Worker(1, r_stream, w_stream, cluster_manager; version=version) +function handle_msg(msg::JoinPGRPMsg, header, r_stream, w_stream, version; role= :default) + #LPROC.id = msg.self_pid + myid!(msg.self_pid, role=role) + controller = Worker(1, r_stream, w_stream, cluster_manager; version=version, role = role) notify(controller.initialized) register_worker(LPROC) - topology(msg.topology) + topology(msg.topology; role=role) if !msg.enable_threaded_blas Base.disable_library_threading() end lazy = msg.lazy - PGRP.lazy = lazy + PGRP(role = role).lazy = lazy @sync for (connect_at, rpid) in msg.other_workers wconfig = WorkerConfig() @@ -348,32 +351,32 @@ function handle_msg(msg::JoinPGRPMsg, header, r_stream, w_stream, version) let rpid=rpid, wconfig=wconfig if lazy # The constructor registers the object with a global 
registry. - Worker(rpid, ()->connect_to_peer(cluster_manager, rpid, wconfig)) + Worker(rpid, ()->connect_to_peer(cluster_manager, rpid, wconfig; role = role); role = role) else - @async connect_to_peer(cluster_manager, rpid, wconfig) + @async connect_to_peer(cluster_manager, rpid, wconfig; role = role) end end end send_connection_hdr(controller, false) - send_msg_now(controller, MsgHeader(RRID(0,0), header.notify_oid), JoinCompleteMsg(Sys.CPU_THREADS, getpid())) + send_msg_now(controller, MsgHeader(RRID(0,0), header.notify_oid), JoinCompleteMsg(Sys.CPU_THREADS, getpid()); role = role) end -function connect_to_peer(manager::ClusterManager, rpid::Int, wconfig::WorkerConfig) +function connect_to_peer(manager::ClusterManager, rpid::Int, wconfig::WorkerConfig; role= :default) try (r_s, w_s) = connect(manager, rpid, wconfig) - w = Worker(rpid, r_s, w_s, manager; config=wconfig) - process_messages(w.r_stream, w.w_stream, false) + w = Worker(rpid, r_s, w_s, manager; config=wconfig, role = role) + process_messages(w.r_stream, w.w_stream, false; role = role) send_connection_hdr(w, true) - send_msg_now(w, MsgHeader(), IdentifySocketMsg(myid())) + send_msg_now(w, MsgHeader(), IdentifySocketMsg(myid(role=role)), role = role) notify(w.initialized) catch e - @error "Error on $(myid()) while connecting to peer $rpid, exiting" exception=e,catch_backtrace() + @error "Error on $(myid(role=role)) while connecting to peer $rpid, exiting" exception=e,catch_backtrace() exit(1) end end -function handle_msg(msg::JoinCompleteMsg, header, r_stream, w_stream, version) +function handle_msg(msg::JoinCompleteMsg, header, r_stream, w_stream, version; role= :default) w = map_sock_wrkr[r_stream] environ = something(w.config.environ, Dict()) environ[:cpu_threads] = msg.cpu_threads @@ -381,8 +384,8 @@ function handle_msg(msg::JoinCompleteMsg, header, r_stream, w_stream, version) w.config.ospid = msg.ospid w.version = version - ntfy_channel = lookup_ref(header.notify_oid) - put!(ntfy_channel, w.id) + ntfy_channel = lookup_ref(header.notify_oid; role = role) + put!(ntfy_channel, wid(w,role=role)) - push!(default_worker_pool(), w.id) + push!(default_worker_pool(role=role), wid(w,role=role), role = role) end diff --git a/src/remotecall.jl b/src/remotecall.jl index 0b1143d..38ca131 100644 --- a/src/remotecall.jl +++ b/src/remotecall.jl @@ -29,8 +29,8 @@ mutable struct Future <: AbstractRemoteRef lock::ReentrantLock @atomic v::Union{Some{Any}, Nothing} - Future(w::Int, rrid::RRID, v::Union{Some, Nothing}=nothing) = - (r = new(w,rrid.whence,rrid.id,ReentrantLock(),v); return test_existing_ref(r)) + Future(w::Int, rrid::RRID, v::Union{Some, Nothing}=nothing; role= :default) = + (r = new(w,rrid.whence,rrid.id,ReentrantLock(),v); return test_existing_ref(r; role = role)) Future(t::NTuple{4, Any}) = new(t[1],t[2],t[3],ReentrantLock(),t[4]) # Useful for creating dummy, zeroed-out instances end @@ -56,9 +56,9 @@ mutable struct RemoteChannel{T<:AbstractChannel} <: AbstractRemoteRef whence::Int id::Int - function RemoteChannel{T}(w::Int, rrid::RRID) where T<:AbstractChannel + function RemoteChannel{T}(w::Int, rrid::RRID; role= :default) where T<:AbstractChannel r = new(w, rrid.whence, rrid.id) - return test_existing_ref(r) + return test_existing_ref(r; role = role) end function RemoteChannel{T}(t::Tuple) where T<:AbstractChannel @@ -66,7 +66,7 @@ mutable struct RemoteChannel{T<:AbstractChannel} <: AbstractRemoteRef end end -function test_existing_ref(r::AbstractRemoteRef) +function test_existing_ref(r::AbstractRemoteRef; role= :default) 
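    # Deduplicate against client_refs: if an equivalent ref is already tracked, merge
    # any newly received cached value into it and return the existing object;
    # otherwise register this ref and attach a finalizer. The role is captured by the
    # finalizer closure so that any del_client message it later sends goes to the
    # process group this ref was created under.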
found = getkey(client_refs, r, nothing) if found !== nothing @assert r.where > 0 @@ -76,7 +76,7 @@ function test_existing_ref(r::AbstractRemoteRef) rv_cache = @atomic :monotonic r.v if fv_cache === nothing && rv_cache !== nothing # we have recd the value from another source, probably a deserialized ref, send a del_client message - send_del_client(r) + send_del_client(r; role = role) @lock found.lock begin @atomicreplace found.v nothing => rv_cache end @@ -86,21 +86,21 @@ function test_existing_ref(r::AbstractRemoteRef) end client_refs[r] = nothing - finalizer(finalize_ref, r) + finalizer(r -> finalize_ref(r, role), r) return r end -function finalize_ref(r::AbstractRemoteRef) +function finalize_ref(r::AbstractRemoteRef, role) if r.where > 0 # Handle the case of the finalizer having been called manually if trylock(client_refs.lock) # trylock doesn't call wait which causes yields try delete!(client_refs.ht, r) # direct removal avoiding locks if isa(r, RemoteChannel) - send_del_client_no_lock(r) + send_del_client_no_lock(r; role = role) else # send_del_client only if the reference has not been set v_cache = @atomic :monotonic r.v - v_cache === nothing && send_del_client_no_lock(r) + v_cache === nothing && send_del_client_no_lock(r; role = role) @atomic :monotonic r.v = nothing end r.where = 0 @@ -108,10 +108,10 @@ function finalize_ref(r::AbstractRemoteRef) unlock(client_refs.lock) end else - finalizer(finalize_ref, r) + finalizer(r -> finalize_ref(r, role), r) return nothing end - end + end nothing end @@ -121,16 +121,17 @@ end Create a `Future` on process `pid`. The default `pid` is the current process. """ -Future(pid::Integer=myid()) = Future(pid, RRID()) -Future(w::LocalProcess) = Future(w.id) -Future(w::Worker) = Future(w.id) +Future(pid::Integer=-1; role =:default) = Future(pid < 0 ? myid(role = role) : pid, RRID(role = role); role = role) +Future(w::LocalProcess; role =:default) = Future(wid(w,role=role); role = role) +Future(w::Worker; role =:default) = Future(wid(w,role=role); role = role) -RemoteChannel(pid::Integer=myid()) = RemoteChannel{Channel{Any}}(pid, RRID()) +RemoteChannel(pid::Integer=-1; role= :default) = RemoteChannel{Channel{Any}}(pid < 0 ? myid(role = role) : pid, RRID(role = role); role = role) -function RemoteChannel(f::Function, pid::Integer=myid()) - remotecall_fetch(pid, f, RRID()) do f, rrid - rv=lookup_ref(rrid, f) - RemoteChannel{typeof(rv.c)}(myid(), rrid) +function RemoteChannel(f::Function, pid_::Integer=0; role= :default) + pid = pid_ == 0 ? myid(role = role) : pid_ + remotecall_fetch(pid, f, RRID(role = role); role = role) do f, rrid + rv=lookup_ref(rrid, f; role = role) + RemoteChannel{typeof(rv.c)}(myid(role = role), rrid; role = role) end end @@ -169,9 +170,9 @@ A low-level API which returns the backing `AbstractChannel` for an `id` returned [`remoteref_id`](@ref). The call is valid only on the node where the backing channel exists. 
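
# Examples

A minimal sketch, assuming the `RemoteChannel` is created on (and therefore owned by)
the calling process:

```julia
rc = RemoteChannel(() -> Channel{Int}(1))
chan = channel_from_id(remoteref_id(rc))   # the backing Channel{Int}
put!(chan, 1)
take!(rc)                                  # returns 1
```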
""" -function channel_from_id(id) +function channel_from_id(id; role= :default) rv = lock(client_refs) do - return get(PGRP.refs, id, false) + return get(PGRP(role = role).refs, id, false) end if rv === false throw(ErrorException("Local instance of remote reference not found")) @@ -179,7 +180,7 @@ function channel_from_id(id) return rv.c end -lookup_ref(rrid::RRID, f=def_rv_channel) = lookup_ref(PGRP, rrid, f) +lookup_ref(rrid::RRID, f=def_rv_channel; role= :default) = lookup_ref(PGRP(role = role), rrid, f) function lookup_ref(pg, rrid, f) return lock(client_refs) do rv = get(pg.refs, rrid, false) @@ -209,15 +210,15 @@ errormonitor(@async put!(f, remotecall_fetch(long_computation, p))) isready(f) # will not block ``` """ -function isready(rr::Future) +function isready(rr::Future; role= :default) v_cache = @atomic rr.v v_cache === nothing || return true rid = remoteref_id(rr) - return if rr.where == myid() - isready(lookup_ref(rid).c) + return if rr.where == myid(role = role) + isready(lookup_ref(rid; role = role).c) else - remotecall_fetch(rid->isready(lookup_ref(rid).c), rr.where, rid) + remotecall_fetch((rid, role)->isready(lookup_ref(rid; role = role).c), rr.where, rid, rr.where == 1 ? :master : :worker; role = role) end end @@ -229,18 +230,18 @@ Note that this function can cause race conditions, since by the time you receive its result it may no longer be true. However, it can be safely used on a [`Future`](@ref) since they are assigned only once. """ -function isready(rr::RemoteChannel, args...) +function isready(rr::RemoteChannel, args...; role= :default) rid = remoteref_id(rr) - return if rr.where == myid() - isready(lookup_ref(rid).c, args...) + return if rr.where == myid(role = role) + isready(lookup_ref(rid; role = role).c, args...) else - remotecall_fetch(rid->isready(lookup_ref(rid).c, args...), rr.where, rid) + remotecall_fetch(rid->isready(lookup_ref(rid; role = rr.where == 1 ? :master : :worker).c, args...), rr.where, rid; role = role) end end -del_client(rr::AbstractRemoteRef) = del_client(remoteref_id(rr), myid()) +del_client(rr::AbstractRemoteRef; role= :default) = del_client(remoteref_id(rr), myid(role = role); role = role) -del_client(id, client) = del_client(PGRP, id, client) +del_client(id, client; role= :default) = del_client(PGRP(role = role), id, client) function del_client(pg, id, client) lock(client_refs) do _del_client(pg, id, client) @@ -260,9 +261,9 @@ function _del_client(pg, id, client) nothing end -function del_clients(pairs::Vector) +function del_clients(pairs::Vector; role= :default) for p in pairs - del_client(p[1], p[2]) + del_client(p[1], p[2]; role = role) end end @@ -272,9 +273,9 @@ end # XXX: Is this worth the additional complexity? # `flush_gc_msgs` has to iterate over all connected workers. const any_gc_flag = Threads.Condition() -function start_gc_msgs_task() +function start_gc_msgs_task(; role= :default) errormonitor( - Threads.@spawn begin + @async begin while true lock(any_gc_flag) do # this might miss events @@ -283,27 +284,27 @@ function start_gc_msgs_task() # Use invokelatest() so that custom message transport streams # for workers can be defined in a newer world age than the Task # which runs the loop here. 
- invokelatest(flush_gc_msgs) # handles throws internally + invokelatest(flush_gc_msgs#=; role = role=#) # handles throws internally end end ) end # Function can be called within a finalizer -function send_del_client(rr) - if rr.where == myid() - del_client(rr) +function send_del_client(rr; role= :default) + if rr.where == myid(role = role) + del_client(rr; role = role) elseif id_in_procs(rr.where) # process only if a valid worker - process_worker(rr) + process_worker(rr; role = role) end end -function send_del_client_no_lock(rr) +function send_del_client_no_lock(rr; role= :default) # for gc context to avoid yields - if rr.where == myid() - _del_client(PGRP, remoteref_id(rr), myid()) + if rr.where == myid(role = role) + _del_client(PGRP(role = role), remoteref_id(rr), myid(role = role)) elseif id_in_procs(rr.where) # process only if a valid worker - process_worker(rr) + process_worker(rr; role = role) end end @@ -317,12 +318,12 @@ function publish_del_msg!(w::Worker, msg) end end -function process_worker(rr) - w = worker_from_id(rr.where)::Worker - msg = (remoteref_id(rr), myid()) +function process_worker(rr; role= :default) + w = worker_from_id(rr.where; role = role)::Worker + msg = (remoteref_id(rr), myid(role = role)) # Needs to acquire a lock on the del_msg queue - T = Threads.@spawn begin + T = @async begin publish_del_msg!($w, $msg) end Base.errormonitor(T) @@ -330,28 +331,28 @@ function process_worker(rr) return end -function add_client(id, client) +function add_client(id, client; role= :default) lock(client_refs) do - rv = lookup_ref(id) + rv = lookup_ref(id; role = role) push!(rv.clientset, client) end nothing end -function add_clients(pairs::Vector) +function add_clients(pairs::Vector; role= :default) for p in pairs - add_client(p[1], p[2]...) + add_client(p[1], p[2]...; role = role) end end -function send_add_client(rr::AbstractRemoteRef, i) - if rr.where == myid() +function send_add_client(rr::AbstractRemoteRef, i; role= :default) + if rr.where == myid(role = role) add_client(remoteref_id(rr), i) elseif (i != rr.where) && id_in_procs(rr.where) # don't need to send add_client if the message is already going # to the processor that owns the remote ref. it will add_client # itself inside deserialize(). 
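        # Queue the (ref id, new client) pair on the owning worker's add_msgs list and
        # raise its gcflag; the task started by start_gc_msgs_task above batches it
        # into a single add_clients message.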
- w = worker_from_id(rr.where) + w = worker_from_id(rr.where; role = role) lock(w.msg_lock) do push!(w.add_msgs, (remoteref_id(rr), i)) @atomic w.gcflag = true @@ -364,24 +365,24 @@ end channel_type(rr::RemoteChannel{T}) where {T} = T -function serialize(s::ClusterSerializer, f::Future) +function serialize(s::ClusterSerializer, f::Future; role = :default) v_cache = @atomic f.v if v_cache === nothing - p = worker_id_from_socket(s.io) - (p !== f.where) && send_add_client(f, p) + p = worker_id_from_socket(s.io; role = role) + (p !== f.where) && send_add_client(f, p; role = role) end invoke(serialize, Tuple{ClusterSerializer, Any}, s, f) end -function serialize(s::ClusterSerializer, rr::RemoteChannel) - p = worker_id_from_socket(s.io) - (p !== rr.where) && send_add_client(rr, p) +function serialize(s::ClusterSerializer, rr::RemoteChannel; role = :default) + p = worker_id_from_socket(s.io; role = role) + (p !== rr.where) && send_add_client(rr, p; role = role) invoke(serialize, Tuple{ClusterSerializer, Any}, s, rr) end -function deserialize(s::ClusterSerializer, t::Type{<:Future}) +function deserialize(s::ClusterSerializer, t::Type{<:Future}; role = :default) fc = invoke(deserialize, Tuple{ClusterSerializer, DataType}, s, t) # deserialized copy - f2 = Future(fc.where, RRID(fc.whence, fc.id), fc.v) # ctor adds to client_refs table + f2 = Future(fc.where, RRID(fc.whence, fc.id), fc.v; role = role) # ctor adds to client_refs table # 1) send_add_client() is not executed when the ref is being serialized # to where it exists, hence do it here. @@ -389,21 +390,21 @@ function deserialize(s::ClusterSerializer, t::Type{<:Future}) # already 'fetch'ed instance in client_refs (Issue #25847), we should not # track it in the backing RemoteValue store. f2v_cache = @atomic f2.v - if f2.where == myid() && f2v_cache === nothing - add_client(remoteref_id(f2), myid()) + if f2.where == myid(role = role) && f2v_cache === nothing + add_client(remoteref_id(f2), myid(role = role); role = role) end f2 end -function deserialize(s::ClusterSerializer, t::Type{<:RemoteChannel}) +function deserialize(s::ClusterSerializer, t::Type{<:RemoteChannel}; role = :default) rr = invoke(deserialize, Tuple{ClusterSerializer, DataType}, s, t) - if rr.where == myid() + if rr.where == myid(role = role) # send_add_client() is not executed when the ref is being # serialized to where it exists - add_client(remoteref_id(rr), myid()) + add_client(remoteref_id(rr), myid(role = role); role = role) end # call ctor to make sure this rr gets added to the client_refs table - RemoteChannel{channel_type(rr)}(rr.where, RRID(rr.whence, rr.id)) + RemoteChannel{channel_type(rr)}(rr.where, RRID(rr.whence, rr.id); role = role) end # Future and RemoteChannel are serializable only in a running cluster. @@ -422,18 +423,19 @@ end # make a thunk to call f on args in a way that simulates what would happen if # the function were sent elsewhere function local_remotecall_thunk(f, args, kwargs) + #println("local_remotecall_thunk($f, $args, $kwargs)") return ()->invokelatest(f, args...; kwargs...) end -function remotecall(f, w::LocalProcess, args...; kwargs...) - rr = Future(w) - schedule_call(remoteref_id(rr), local_remotecall_thunk(f, args, kwargs)) +function remotecall(f, w::LocalProcess, args...; role= :default, kwargs...) + rr = Future(w; role = role) + schedule_call(remoteref_id(rr), local_remotecall_thunk(f, args, kwargs); role = role) return rr end -function remotecall(f, w::Worker, args...; kwargs...) 
- rr = Future(w) - send_msg(w, MsgHeader(remoteref_id(rr)), CallMsg{:call}(f, args, kwargs)) +function remotecall(f, w::Worker, args...; role= :default, kwargs...) + rr = Future(w; role = role) + send_msg(w, MsgHeader(remoteref_id(rr)), CallMsg{:call}(f, args, kwargs); role = role) return rr end @@ -444,26 +446,48 @@ Call a function `f` asynchronously on the given arguments on the specified proce Return a [`Future`](@ref). Keyword arguments, if any, are passed through to `f`. """ -remotecall(f, id::Integer, args...; kwargs...) = remotecall(f, worker_from_id(id), args...; kwargs...) +remotecall(f, id::Integer, args...; role= :default, kwargs...) = +# remotecall(f, worker_from_id(id; role = id == 1 ? :master : :worker), args...; role = role, kwargs...) + remotecall(f, worker_from_id(id; role = role), args...; role = role, kwargs...) + +function remotecall_fetch(f, w::LocalProcess, args...; role= :default, kwargs...) + v=run_work_thunk(local_remotecall_thunk(f, args, kwargs), false; role = role) + return isa(v, RemoteException) ? throw(v) : v +end + -function remotecall_fetch(f, w::LocalProcess, args...; kwargs...) - v=run_work_thunk(local_remotecall_thunk(f,args, kwargs), false) +function remotecall_fetch(f, w::Worker, args...; role= :default, kwargs...) + # can be weak, because the program will have no way to refer to the Ref + # itself, it only gets the result. + oid = RRID(role = role) + rv = lookup_ref(oid; role = role) + rv.waitingfor = wid(w, role = role) + send_msg(w, MsgHeader(RRID(0,0), oid), CallMsg{:call_fetch}(f, args, kwargs); role = role) + v = take!(rv) + lock(client_refs) do + delete!(PGRP(role = role).refs, oid) + end return isa(v, RemoteException) ? throw(v) : v end + +#= function remotecall_fetch(f, w::Worker, args...; kwargs...) # can be weak, because the program will have no way to refer to the Ref # itself, it only gets the result. - oid = RRID() - rv = lookup_ref(oid) - rv.waitingfor = w.id - send_msg(w, MsgHeader(RRID(0,0), oid), CallMsg{:call_fetch}(f, args, kwargs)) + role = haskey(kwargs, :role) ? kwargs[:role] : :default + oid = RRID(role = role) + rv = lookup_ref(oid; role = role) + rv.waitingfor = wid(w, role=role) + @info "send_msg ...$(Base.nameof(f)) === $(Base.kwarg_decl.(methods(f)))" + send_msg(w, MsgHeader(RRID(0,0), oid), CallMsg{:call_fetch}(f, args, kwargs); role = role) v = take!(rv) lock(client_refs) do - delete!(PGRP.refs, oid) + delete!(PGRP(role = role).refs, oid) end return isa(v, RemoteException) ? throw(v) : v end +=# """ remotecall_fetch(f, id::Integer, args...; kwargs...) @@ -489,20 +513,20 @@ sqrt was called with a negative real argument but will only return a complex res ... ``` """ -remotecall_fetch(f, id::Integer, args...; kwargs...) = - remotecall_fetch(f, worker_from_id(id), args...; kwargs...) +remotecall_fetch(f, id::Integer, args...; role= :default, kwargs...) = + remotecall_fetch(f, worker_from_id(id; role = role), args...; role = role, kwargs...) -remotecall_wait(f, w::LocalProcess, args...; kwargs...) = wait(remotecall(f, w, args...; kwargs...)) +remotecall_wait(f, w::LocalProcess, args...; role= :default, kwargs...) = wait(remotecall(f, w, args...; role = role, kwargs...); role = role) -function remotecall_wait(f, w::Worker, args...; kwargs...) - prid = RRID() - rv = lookup_ref(prid) - rv.waitingfor = w.id - rr = Future(w) - send_msg(w, MsgHeader(remoteref_id(rr), prid), CallWaitMsg(f, args, kwargs)) +function remotecall_wait(f, w::Worker, args...; role= :default, kwargs...) 
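    # Same protocol as the stock implementation, with the role threaded through:
    # register a local RemoteValue under a fresh RRID for the completion notification,
    # send a CallWaitMsg to the worker, block on the notification channel, and finally
    # drop the temporary ref from the process group selected by the role.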
+ prid = RRID(role = role) + rv = lookup_ref(prid; role = role) + rv.waitingfor = wid(w,role=role) + rr = Future(w; role = role) + send_msg(w, MsgHeader(remoteref_id(rr), prid), CallWaitMsg(f, args, kwargs); role = role) v = fetch(rv.c) lock(client_refs) do - delete!(PGRP.refs, prid) + delete!(PGRP(role = role).refs, prid) end isa(v, RemoteException) && throw(v) return rr @@ -516,10 +540,10 @@ Keyword arguments, if any, are passed through to `f`. See also [`wait`](@ref) and [`remotecall`](@ref). """ -remotecall_wait(f, id::Integer, args...; kwargs...) = - remotecall_wait(f, worker_from_id(id), args...; kwargs...) +remotecall_wait(f, id::Integer, args...; role= :default, kwargs...) = + remotecall_wait(f, worker_from_id(id; role = role), args...; kwargs...) -function remote_do(f, w::LocalProcess, args...; kwargs...) +function remote_do(f, w::LocalProcess, args...; role = :default, kwargs...) # the LocalProcess version just performs in local memory what a worker # does when it gets a :do message. # same for other messages on LocalProcess. @@ -528,8 +552,8 @@ function remote_do(f, w::LocalProcess, args...; kwargs...) nothing end -function remote_do(f, w::Worker, args...; kwargs...) - send_msg(w, MsgHeader(), RemoteDoMsg(f, args, kwargs)) +function remote_do(f, w::Worker, args...; role= :default, kwargs...) + send_msg(w, MsgHeader(), RemoteDoMsg(f, args, kwargs), role = role) nothing end @@ -554,22 +578,29 @@ Any exceptions thrown by `f` are printed to [`stderr`](@ref) on the remote worke Keyword arguments, if any, are passed through to `f`. """ -remote_do(f, id::Integer, args...; kwargs...) = remote_do(f, worker_from_id(id), args...; kwargs...) +remote_do(f, id::Integer, args...; role=:default, kwargs...) = remote_do(f, worker_from_id(id, role = role), role = role, args...; kwargs...) # TO CHECK (e se f não tiver role parameter ?) # have the owner of rr call f on it -function call_on_owner(f, rr::AbstractRemoteRef, args...) +function call_on_owner(f, rr::AbstractRemoteRef, args...; role= :default) rid = remoteref_id(rr) - if rr.where == myid() + if rr.where == myid(role = role) f(rid, args...) else - remotecall_fetch(f, rr.where, rid, args...) + #remotecall_fetch((rid,role) -> f(rid, role = role, args...), rr.where, rid, rr.where==1 ? :master : :worker; role = role) + remotecall_fetch((rid,role) -> f(rid, args...; role=role), rr.where, rid, rr.where==1 ? :master : :worker; role = role) + + + #remotecall_fetch(rid -> f(rid, role = rr.where==1 ? :master : :worker, args...), rr.where; role = role) + #remotecall_fetch(iiiii, rr.where, f, rid, rr.where==1 ? :master : :worker, args...; role = role) +# remotecall_fetch(f, rr.where, rid, args...) + end end -function wait_ref(rid, caller, args...) - v = fetch_ref(rid, args...) +function wait_ref(rid, caller, args...; role= :default) + v = fetch_ref(rid, args...; role = role) if isa(v, RemoteException) - if myid() == caller + if myid(role = role) == caller throw(v) else return v @@ -583,14 +614,20 @@ end Wait for a value to become available for the specified [`Future`](@ref). """ -wait(r::Future) = (v_cache = @atomic r.v; v_cache !== nothing && return r; call_on_owner(wait_ref, r, myid()); r) +wait(r::Future; role= :default) = (v_cache = @atomic r.v; v_cache !== nothing && return r; + call_on_owner(wait_ref, r, myid(role = role); role = role); + #call_on_owner((rid, caller, args...; role=role) -> wait_ref(rid, caller, args...; role=role), r, myid(role = role); role = role); + r) """ wait(r::RemoteChannel, args...) 
Wait for a value to become available on the specified [`RemoteChannel`](@ref). """ -wait(r::RemoteChannel, args...) = (call_on_owner(wait_ref, r, myid(), args...); r) +wait(r::RemoteChannel, args...; role= :default) = (call_on_owner(wait_ref, r, myid(role = role), args...; role = role); r) +#wait(r::RemoteChannel, args...; role= :default) = (call_on_owner((rid, caller, args...; role=role) -> wait_ref(rid, caller, args...; role=role), r, myid(role = role), args...; role = role); r) + + """ fetch(x::Future) @@ -599,14 +636,14 @@ Wait for and get the value of a [`Future`](@ref). The fetched value is cached lo Further calls to `fetch` on the same reference return the cached value. If the remote value is an exception, throws a [`RemoteException`](@ref) which captures the remote exception and backtrace. """ -function fetch(r::Future) +function fetch(r::Future; role= :default) v_cache = @atomic r.v v_cache !== nothing && return something(v_cache) - if r.where == myid() + if r.where == myid(role = role) rv, v_cache = @lock r.lock begin v_cache = @atomic :monotonic r.v - rv = v_cache === nothing ? lookup_ref(remoteref_id(r)) : nothing + rv = v_cache === nothing ? lookup_ref(remoteref_id(r); role = role) : nothing rv, v_cache end @@ -616,7 +653,8 @@ function fetch(r::Future) v_local = fetch(rv.c) end else - v_local = call_on_owner(fetch_ref, r) + #v_local = call_on_owner((rid, args...; role=role) -> fetch_ref(rid, args...;role=role), r; role = role) + v_local = call_on_owner(fetch_ref, r; role = role) end v_cache = @atomic r.v @@ -634,18 +672,22 @@ function fetch(r::Future) # remote calls getting the value from `call_on_owner` used to return the value directly without wrapping it in `Some(x)` # so we're doing the same thing here if status - send_del_client(r) + send_del_client(r; role = role) return v_local else # this `v_cache` is returned at the end of the function v_cache = v_old end end - send_del_client(r) + send_del_client(r; role = role) + something(v_cache) + end -fetch_ref(rid, args...) = fetch(lookup_ref(rid).c, args...) +fetch_ref(rid, args...; role=:default) = fetch(lookup_ref(rid; role = role).c, #=role=role,=# args...) + + """ fetch(c::RemoteChannel) @@ -653,7 +695,10 @@ fetch_ref(rid, args...) = fetch(lookup_ref(rid).c, args...) Wait for and get a value from a [`RemoteChannel`](@ref). Exceptions raised are the same as for a [`Future`](@ref). Does not remove the item fetched. """ -fetch(r::RemoteChannel, args...) = call_on_owner(fetch_ref, r, args...)::eltype(r) +fetch(r::RemoteChannel, args...; role= :default) = call_on_owner(fetch_ref, r, args...; role = role)::eltype(r) +#fetch(r::RemoteChannel, args...; role= :default) = call_on_owner((rid, args...; role=role) -> fetch_ref(rid, args...;role=role), r, args...; role = role)::eltype(r) + + isready(rv::RemoteValue, args...) = isready(rv.c, args...) @@ -666,19 +711,19 @@ A `put!` on an already set `Future` throws an `Exception`. All asynchronous remote calls return `Future`s and set the value to the return value of the call upon completion. 
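
# Examples

A minimal local example (the `Future` is owned by the calling process, so no worker
is needed):

```julia-repl
julia> f = Future();

julia> put!(f, 10);

julia> fetch(f)
10
```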
""" -function put!(r::Future, v) - if r.where == myid() +function put!(r::Future, v; role= :default) + if r.where == myid(role = role) rid = remoteref_id(r) - rv = lookup_ref(rid) + rv = lookup_ref(rid; role = role) isready(rv) && error("Future can be set only once") @lock r.lock begin put!(rv, v) # this notifies the tasks waiting on the channel in fetch set_future_cache(r, v) # set the cache before leaving the lock, so that the notified tasks already see it cached end - del_client(rid, myid()) + del_client(rid, myid(role = role); role = role) else @lock r.lock begin # same idea as above if there were any local tasks fetching on this Future - call_on_owner(put_future, r, v, myid()) + call_on_owner(put_future, r, v, myid(role = role); role = role) set_future_cache(r, v) end end @@ -690,21 +735,21 @@ function set_future_cache(r::Future, v) ok || error("internal consistency error detected for Future") end -function put_future(rid, v, caller) - rv = lookup_ref(rid) +function put_future(rid, v, caller; role= :default) + rv = lookup_ref(rid; role = role) isready(rv) && error("Future can be set only once") put!(rv, v) # The caller has the value and hence can be removed from the remote store. - del_client(rid, caller) + del_client(rid, caller; role = role) nothing end put!(rv::RemoteValue, args...) = put!(rv.c, args...) -function put_ref(rid, caller, args...) - rv = lookup_ref(rid) +function put_ref(rid, caller, args...; role= :default) + rv = lookup_ref(rid; role = role) put!(rv, args...) - if myid() == caller && rv.synctake !== nothing + if myid(role = role) == caller && rv.synctake !== nothing # Wait till a "taken" value is serialized out - github issue #29932 lock(rv.synctake) unlock(rv.synctake) @@ -719,15 +764,17 @@ Store a set of values to the [`RemoteChannel`](@ref). If the channel is full, blocks until space is available. Return the first argument. """ -put!(rr::RemoteChannel, args...) = (call_on_owner(put_ref, rr, myid(), args...); rr) +put!(rr::RemoteChannel, args...; role= :default) = (call_on_owner(put_ref, rr, myid(role = role), args...; role = role); rr) +#put!(rr::RemoteChannel, args...; role= :default) = (call_on_owner((rid, caller, args...; role=role) -> put_ref(rid, caller, args...; role=role), rr, myid(role = role), args...; role = role); rr) + # take! is not supported on Future take!(rv::RemoteValue, args...) = take!(rv.c, args...) -function take_ref(rid, caller, args...) - rv = lookup_ref(rid) +function take_ref(rid, caller, args...; role=:default) + rv = lookup_ref(rid; role = role) synctake = false - if myid() != caller && rv.synctake !== nothing + if myid(role = role) != caller && rv.synctake !== nothing # special handling for local put! / remote take! on unbuffered channel # github issue #29932 synctake = true @@ -743,7 +790,7 @@ function take_ref(rid, caller, args...) rethrow(e) end - isa(v, RemoteException) && (myid() == caller) && throw(v) + isa(v, RemoteException) && (myid(role = role) == caller) && throw(v) if synctake return SyncTake(v, rv) @@ -758,31 +805,35 @@ end Fetch value(s) from a [`RemoteChannel`](@ref) `rr`, removing the value(s) in the process. """ -take!(rr::RemoteChannel, args...) 
= call_on_owner(take_ref, rr, myid(), args...)::eltype(rr) +#take!(rr::RemoteChannel, args...; role= :default) = call_on_owner((rid, caller, args...; role=role) -> take_ref(rid, caller, args...; role=role), rr, myid(role = role), args...; role = role)::eltype(rr) +take!(rr::RemoteChannel, args...; role= :default) = call_on_owner(take_ref, rr, myid(role = role), args...; role = role)::eltype(rr) # close and isopen are not supported on Future -close_ref(rid) = (close(lookup_ref(rid).c); nothing) -close(rr::RemoteChannel) = call_on_owner(close_ref, rr) +close_ref(rid; role= :default) = (close(lookup_ref(rid; role = role).c); nothing) +close(rr::RemoteChannel; role= :default) = call_on_owner(close_ref, rr; role = role) + +isopen_ref(rid; role= :default) = isopen(lookup_ref(rid; role = role).c) +isopen(rr::RemoteChannel; role= :default) = call_on_owner(isopen_ref, rr; role = role) -isopen_ref(rid) = isopen(lookup_ref(rid).c) -isopen(rr::RemoteChannel) = call_on_owner(isopen_ref, rr) +isempty_ref(rid; role= :default) = isempty(lookup_ref(rid; role = role).c) +Base.isempty(rr::RemoteChannel; role= :default) = call_on_owner(isempty_ref, rr; role=role) -getindex(r::RemoteChannel) = fetch(r) -getindex(r::Future) = fetch(r) +getindex(r::RemoteChannel; role= :default) = fetch(r; role = role) +getindex(r::Future; role= :default) = fetch(r; role = role) -getindex(r::Future, args...) = getindex(fetch(r), args...) -function getindex(r::RemoteChannel, args...) - if r.where == myid() - return getindex(fetch(r), args...) +getindex(r::Future, args...; role= :default) = getindex(fetch(r; role = role), args...#=; role = role=#) +function getindex(r::RemoteChannel, args...; role= :default) + if r.where == myid(role = role) + return getindex(fetch(r; role = role), args...#=; role = role=#) end - return remotecall_fetch(getindex, r.where, r, args...) + return remotecall_fetch((r,role) -> getindex(r, role = role, args...), r.where, r, r.where == 1 ? 
:master : :worker; role = role) end -function iterate(c::RemoteChannel, state=nothing) - if isopen(c) || isready(c) +function iterate(c::RemoteChannel, state=nothing; role= :default) + if isopen(c; role = role) || isready(c; role = role) try - return (take!(c), nothing) + return (take!(c; role=role), nothing) catch e if isa(e, InvalidStateException) || (isa(e, RemoteException) && diff --git a/src/workerpool.jl b/src/workerpool.jl index 5dd1c07..bb66245 100644 --- a/src/workerpool.jl +++ b/src/workerpool.jl @@ -8,6 +8,7 @@ An `AbstractWorkerPool` should implement: - [`push!`](@ref) - add a new worker to the overall pool (available + busy) - [`put!`](@ref) - put back a worker to the available pool - [`take!`](@ref) - take a worker from the available pool (to be used for remote function execution) + - [`wait`](@ref) - block until a worker is available - [`length`](@ref) - number of workers available in the overall pool - [`isready`](@ref) - return false if a `take!` on the pool would block, else true @@ -26,9 +27,9 @@ mutable struct WorkerPool <: AbstractWorkerPool WorkerPool(c::Channel, ref::RemoteChannel) = new(c, Set{Int}(), ref) end -function WorkerPool() - wp = WorkerPool(Channel{Int}(typemax(Int)), RemoteChannel()) - put!(wp.ref, WeakRef(wp)) +function WorkerPool(; role= :default) + wp = WorkerPool(Channel{Int}(typemax(Int)), RemoteChannel(role = role)) + put!(wp.ref, WeakRef(wp), role=role) wp end @@ -48,8 +49,8 @@ julia> WorkerPool(2:4) WorkerPool(Channel{Int64}(sz_max:9223372036854775807,sz_curr:2), Set([4, 2, 3]), RemoteChannel{Channel{Any}}(1, 1, 7)) ``` """ -function WorkerPool(workers::Union{Vector{Int},AbstractRange{Int}}) - pool = WorkerPool() +function WorkerPool(workers::Union{Vector{Int},AbstractRange{Int}}; role= :default) + pool = WorkerPool(role = role) foreach(w->push!(pool, w), workers) return pool end @@ -57,22 +58,22 @@ end # On workers where this pool has been serialized to, instantiate with a dummy local channel. WorkerPool(ref::RemoteChannel) = WorkerPool(Channel{Int}(1), ref) -function serialize(S::AbstractSerializer, pool::WorkerPool) +function serialize(S::AbstractSerializer, pool::WorkerPool; role = :default) # Allow accessing a worker pool from other processors. When serialized, # initialize the `ref` to point to self and only send the ref. # Other workers will forward all put!, take!, calls to the process owning # the ref (and hence the pool). Serialization.serialize_type(S, typeof(pool)) - serialize(S, pool.ref) + serialize(S, pool.ref; role = role) end deserialize(S::AbstractSerializer, t::Type{T}) where {T<:WorkerPool} = T(deserialize(S)) -wp_local_push!(pool::AbstractWorkerPool, w::Int) = (push!(pool.workers, w); put!(pool.channel, w); pool) -wp_local_length(pool::AbstractWorkerPool) = length(pool.workers) -wp_local_isready(pool::AbstractWorkerPool) = isready(pool.channel) +wp_local_push!(pool::AbstractWorkerPool, w::Int; role= :default) = (push!(pool.workers, w); put!(pool.channel, w); pool) +wp_local_length(pool::AbstractWorkerPool; role= :default) = length(pool.workers) +wp_local_isready(pool::AbstractWorkerPool; role= :default) = isready(pool.channel) # pool.channel::Channel{Int} -function wp_local_put!(pool::AbstractWorkerPool, w::Int) +function wp_local_put!(pool::AbstractWorkerPool, w::Int; role= :default) # In case of default_worker_pool, the master is implicitly considered a worker, i.e., # it is not present in pool.workers. # Confirm the that the worker is part of a pool before making it available. 
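
As a concrete illustration of the interface listed in the `AbstractWorkerPool` docstring above, here is a deliberately minimal pool type. It is only a sketch, not part of the patch; the field names mirror `WorkerPool` for familiarity, and the ignored `role` keyword is accepted only because this fork threads it through every pool operation:

```julia
using Distributed

# A toy AbstractWorkerPool: a channel of available worker ids plus the set of
# all ids ever added. The six methods below are the ones the docstring asks for.
struct ToyPool <: AbstractWorkerPool
    channel::Channel{Int}
    workers::Set{Int}
    ToyPool() = new(Channel{Int}(typemax(Int)), Set{Int}())
end

Base.push!(p::ToyPool, w::Int; role = :default) = (push!(p.workers, w); put!(p.channel, w); p)
Base.put!(p::ToyPool, w::Int; role = :default)  = (put!(p.channel, w); p)
Base.take!(p::ToyPool; role = :default)         = take!(p.channel)
Base.wait(p::ToyPool; role = :default)          = wait(p.channel)
Base.length(p::ToyPool; role = :default)        = length(p.workers)
Base.isready(p::ToyPool; role = :default)       = isready(p.channel)

# Usage: pool = ToyPool(); foreach(w -> push!(pool, w), workers());
#        remotecall_fetch(myid, pool)
```
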
@@ -80,28 +81,28 @@ function wp_local_put!(pool::AbstractWorkerPool, w::Int) w end -function wp_local_workers(pool::AbstractWorkerPool) - if length(pool) == 0 && pool === default_worker_pool() +function wp_local_workers(pool::AbstractWorkerPool; role= :default) + if length(pool) == 0 && pool === default_worker_pool(role=role) return [1] else return collect(pool.workers) end end -function wp_local_nworkers(pool::AbstractWorkerPool) - if length(pool) == 0 && pool === default_worker_pool() +function wp_local_nworkers(pool::AbstractWorkerPool; role= :default) + if length(pool) == 0 && pool === default_worker_pool(role=role) return 1 else return length(pool.workers) end end -function wp_local_take!(pool::AbstractWorkerPool) +function wp_local_take!(pool::AbstractWorkerPool; role= :default) # Find an active worker worker = 0 while true if length(pool) == 0 - if pool === default_worker_pool() + if pool === default_worker_pool(role=role) # No workers, the master process is used as a worker worker = 1 break @@ -120,48 +121,74 @@ function wp_local_take!(pool::AbstractWorkerPool) return worker end -function remotecall_pool(rc_f, f, pool::AbstractWorkerPool, args...; kwargs...) +function wp_local_wait(pool::AbstractWorkerPool) + wait(pool.channel) + return nothing +end + +function remotecall_pool(rc_f, f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) + worker = take!(pool; role=role) + try + rc_f(f, worker, role=role, args...; kwargs...) + finally + put!(pool, worker; role = role) + end +end + +# Specialization for remotecall. We have to wait for the Future it returns +# before putting the worker back in the pool. +function remotecall_pool(rc_f::typeof(remotecall), f, pool::AbstractWorkerPool, args...; kwargs...) worker = take!(pool) + local x try - rc_f(f, worker, args...; kwargs...) + x = rc_f(f, worker, args...; kwargs...) + catch + put!(pool, worker) + rethrow() + end + t = Threads.@spawn Threads.threadpool() try + wait(x) + catch # just wait, ignore errors here finally put!(pool, worker) end + errormonitor(t) + return x end # Check if pool is local or remote and forward calls if required. # NOTE: remotecall_fetch does it automatically, but this will be more efficient as # it avoids the overhead associated with a local remotecall. -for (func, rt) = ((:length, Int), (:isready, Bool), (:workers, Vector{Int}), (:nworkers, Int), (:take!, Int)) +for (func, rt) = ((:length, Int), (:isready, Bool), (:workers, Vector{Int}), (:nworkers, Int), (:take!, Int), (:wait, Nothing)) func_local = Symbol(string("wp_local_", func)) @eval begin - function ($func)(pool::WorkerPool) - if pool.ref.where != myid() - return remotecall_fetch(ref->($func_local)(fetch(ref).value), pool.ref.where, pool.ref)::$rt + function ($func)(pool::WorkerPool; role= :default) + if pool.ref.where != myid(role = role) + return remotecall_fetch((ref, role)->(($func_local)(fetch(ref; role=role).value; role = role)), pool.ref.where, pool.ref, pool.ref.where == 1 ? :master : :worker; role = role)::$rt else - return ($func_local)(pool) + return ($func_local)(pool; role = role) end end # default impl - ($func)(pool::AbstractWorkerPool) = ($func_local)(pool) + ($func)(pool::AbstractWorkerPool; role= :default) = ($func_local)(pool; role = role) end end for func = (:push!, :put!) 
    func_local = Symbol(string("wp_local_", func))
    @eval begin
-    function ($func)(pool::WorkerPool, w::Int)
-        if pool.ref.where != myid()
-            return remotecall_fetch((ref, w)->($func_local)(fetch(ref).value, w), pool.ref.where, pool.ref, w)
+    function ($func)(pool::WorkerPool, w::Int; role= :default)
+        if pool.ref.where != myid(role = role)
+            return remotecall_fetch((ref, w, role)->(($func_local)(fetch(ref; role = role).value, w; role = role)), pool.ref.where, pool.ref, w, pool.ref.where == 1 ? :master : :worker; role = role)
         else
-            return ($func_local)(pool, w)
+            return ($func_local)(pool, w; role = role)
         end
     end

     # default impl
-    ($func)(pool::AbstractWorkerPool, w::Int) = ($func_local)(pool, w)
+    ($func)(pool::AbstractWorkerPool, w::Int; role= :default) = ($func_local)(pool, w; role = role)
 end
 end

@@ -184,6 +211,7 @@ Future(2, 1, 6, nothing)
```

In this example, the task ran on pid 2, called from pid 1.
"""
+#remotecall(f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) = remotecall_pool((f, pool) -> remotecall(f, pool, role=role, args...; kwargs...); role=role)
 remotecall(f, pool::AbstractWorkerPool, args...; kwargs...) = remotecall_pool(remotecall, f, pool, args...; kwargs...)

@@ -208,6 +236,7 @@ julia> fetch(f)
 0.9995177101692958
```
"""
+#remotecall_wait(f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) = remotecall_pool((f,pool) -> remotecall_wait(f, pool, role = role, args...; kwargs...); role=role) # TO CHECK (unsure about "role = role")
 remotecall_wait(f, pool::AbstractWorkerPool, args...; kwargs...) = remotecall_pool(remotecall_wait, f, pool, args...; kwargs...)

@@ -229,14 +258,21 @@ julia> remotecall_fetch(maximum, wp, A)
 0.9995177101692958
```
"""
+#remotecall_fetch(f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) = remotecall_pool((f,pool)->remotecall_fetch(f, pool, role = role, args...; kwargs...), f, pool; role = role) # TO CHECK (unsure about the first "role = role")
 remotecall_fetch(f, pool::AbstractWorkerPool, args...; kwargs...) = remotecall_pool(remotecall_fetch, f, pool, args...; kwargs...)
+#remotecall_fetch(f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) = remotecall_pool((f,pool)->remotecall_fetch((p, args...) -> f(p, args...), pool, args...; role = role, kwargs...), f, pool; role = role) # TO CHECK (unsure about the first "role = role")

"""
    remote_do(f, pool::AbstractWorkerPool, args...; kwargs...) -> nothing

[`WorkerPool`](@ref) variant of `remote_do(f, pid, ....)`. Wait for and take a free worker from `pool` and
perform a `remote_do` on it.
+
+Note that it's not possible to wait for the result of a `remote_do()` to finish
+so the worker will immediately be put back in the pool (i.e. potentially causing
+oversubscription).
"""
+#remote_do(f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) = remotecall_pool((f,pool) -> remote_do(f, pool, role = role, args...; kwargs...); role = role)
 remote_do(f, pool::AbstractWorkerPool, args...; kwargs...) = remotecall_pool(remote_do, f, pool, args...; kwargs...)
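
To make the trade-off described in the `remote_do` note concrete, a short sketch using the standard pool API (the worker count is illustrative):

```julia
using Distributed
nprocs() == 1 && addprocs(2)
wp = WorkerPool(workers())

# remotecall_fetch keeps the worker checked out until the result is back,
# so at most length(wp) of these run concurrently:
squares = [remotecall_fetch(x -> x^2, wp, i) for i in 1:4]

# remote_do has no result to wait on, so the worker goes straight back into
# the pool and a burst of calls may oversubscribe it:
for i in 1:4
    remote_do(println, wp, "fire-and-forget $i")
end
```
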
const _default_worker_pool = Ref{Union{AbstractWorkerPool, Nothing}}(nothing) @@ -256,14 +292,14 @@ julia> default_worker_pool() WorkerPool(Channel{Int64}(sz_max:9223372036854775807,sz_curr:3), Set([4, 2, 3]), RemoteChannel{Channel{Any}}(1, 1, 4)) ``` """ -function default_worker_pool() +function default_worker_pool(;role=:default) # On workers retrieve the default worker pool from the master when accessed # for the first time if _default_worker_pool[] === nothing - if myid() == 1 - _default_worker_pool[] = WorkerPool() + if myid(role=role) == 1 + _default_worker_pool[] = WorkerPool(role = role) else - _default_worker_pool[] = remotecall_fetch(()->default_worker_pool(), 1) + _default_worker_pool[] = remotecall_fetch(role->default_worker_pool(role = role), 1, :master; role=role) end end return _default_worker_pool[] @@ -284,8 +320,8 @@ end Return an anonymous function that executes function `f` on an available worker (drawn from [`WorkerPool`](@ref) `p` if provided) using [`remotecall_fetch`](@ref). """ -remote(f) = (args...; kwargs...)->remotecall_fetch(f, default_worker_pool(), args...; kwargs...) -remote(p::AbstractWorkerPool, f) = (args...; kwargs...)->remotecall_fetch(f, p, args...; kwargs...) +remote(f; role= :default) = (args...; kwargs...)->remotecall_fetch(f, default_worker_pool(role=role), args...; role=role, kwargs...) +remote(p::AbstractWorkerPool, f; role= :default) = (args...; kwargs...)->remotecall_fetch(f, p, args...; role=role, kwargs...) mutable struct CachingPool <: AbstractWorkerPool channel::Channel{Int} @@ -351,20 +387,44 @@ function clear!(pool::CachingPool) pool end -exec_from_cache(rr::RemoteChannel, args...; kwargs...) = fetch(rr)(args...; kwargs...) -function exec_from_cache(f_ref::Tuple{Function, RemoteChannel}, args...; kwargs...) +exec_from_cache(rr::RemoteChannel, args...; role= :default, kwargs...) = fetch(rr; role = role)(args...; kwargs...) +function exec_from_cache(f_ref::Tuple{Function, RemoteChannel}, args...; role= :default, kwargs...) put!(f_ref[2], f_ref[1]) # Cache locally f_ref[1](args...; kwargs...) end -function remotecall_pool(rc_f, f, pool::CachingPool, args...; kwargs...) - worker = take!(pool) - f_ref = get(pool.map_obj2ref, (worker, f), (f, RemoteChannel(worker))) +function remotecall_pool(rc_f, f, pool::CachingPool, args...; role= :default, kwargs...) + worker = take!(pool; role=role) + f_ref = get(pool.map_obj2ref, (worker, f), (f, RemoteChannel(worker; role=role))) isa(f_ref, Tuple) && (pool.map_obj2ref[(worker, f)] = f_ref[2]) # Add to tracker try - rc_f(exec_from_cache, worker, f_ref, args...; kwargs...) + rc_f(exec_from_cache, worker, f_ref, args...; role=role, kwargs...) finally - put!(pool, worker) + put!(pool, worker; role=role) end end + + +# Specialization for remotecall. We have to wait for the Future it returns +# before putting the worker back in the pool. +function remotecall_pool(rc_f::typeof(remotecall), f, pool::CachingPool, args...; role= :default, kwargs...) + worker = take!(pool; role=role) + f_ref = get(pool.map_obj2ref, (worker, f), (f, RemoteChannel(worker; role=role))) + isa(f_ref, Tuple) && (pool.map_obj2ref[(worker, f)] = f_ref[2]) # Add to tracker + local x + try + x = rc_f(exec_from_cache, worker, f_ref, args...; role=role, kwargs...) 
+ catch + put!(pool, worker; role=role) + rethrow() + end + t = Threads.@spawn Threads.threadpool() try + wait(x) + catch # just wait, ignore errors here + finally + put!(pool, worker; role=role) + end + errormonitor(t) + return x +end \ No newline at end of file diff --git a/test/aqua.jl b/test/aqua.jl new file mode 100644 index 0000000..56c01c5 --- /dev/null +++ b/test/aqua.jl @@ -0,0 +1,8 @@ +using Aqua +using Distributed +Aqua.test_all( + Distributed, + # This should be excluded, but it's not clear how to do that on Aqua's API + # given it's not-defined. (The Julia Base ambiguity test does it something like this) + # ambiguities=(exclude=[GlobalRef(Distributed, :cluster_manager)]) +) \ No newline at end of file diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl index 7b5c983..63a00cc 100644 --- a/test/distributed_exec.jl +++ b/test/distributed_exec.jl @@ -3,12 +3,7 @@ using Test, Distributed, Random, Serialization, Sockets import Distributed: launch, manage -sharedir = normpath(joinpath(Sys.BINDIR, "..", "share")) -if parse(Bool, get(ENV, "JULIA_DISTRIBUTED_TESTING_STANDALONE", "false")) - @test !startswith(pathof(Distributed), sharedir) -else - @test startswith(pathof(Distributed), sharedir) -end +pathsep = Sys.iswindows() ? ";" : ":" @test cluster_cookie() isa String @@ -27,7 +22,7 @@ include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "testenv.jl")) addprocs_with_testenv(4) @test nprocs() == 5 -# distributed loading of packages +# Distributed loading of packages # setup @everywhere begin @@ -52,6 +47,7 @@ end id_me = myid() id_other = filter(x -> x != id_me, procs())[rand(1:(nprocs()-1))] + # Test role @everywhere using Distributed @test Distributed.myrole() === :master @@ -62,6 +58,9 @@ for wid = workers() @test wrole === :worker end +#sleep(3) + + # Test remote() let pool = default_worker_pool() @@ -79,17 +78,27 @@ let yield() end +# @info nworkers() +# sleep(30) + testchannels = [RemoteChannel() for i in 1:nworkers()] + # @info testchannels + # sleep(30) testcount = 0 @test isready(pool) == true for c in testchannels @test count == testcount +# @info c remote_wait(c) testcount += 1 end @test count == testcount @test isready(pool) == false + #sleep(3) + + try + for c in testchannels @test count == testcount put!(c, "foo") @@ -99,8 +108,14 @@ let @test isready(pool) == true end + catch e + @info e + end + @test count == 0 + #sleep(3) + for c in testchannels @test count == testcount remote_wait(c) @@ -109,6 +124,8 @@ let @test count == testcount @test isready(pool) == false + #sleep(3) + for c in reverse(testchannels) @test count == testcount put!(c, "foo") @@ -118,9 +135,14 @@ let @test isready(pool) == true end + #sleep(3) + @test count == 0 end +#sleep(3) + + # Test Futures function testf(id) f=Future(id) @@ -151,48 +173,27 @@ function poll_while(f::Function; timeout_seconds::Integer = 120) return true end -function _getenv_include_thread_unsafe() - environment_variable_name = "JULIA_TEST_INCLUDE_THREAD_UNSAFE" - default_value = "false" - environment_variable_value = strip(get(ENV, environment_variable_name, default_value)) - b = parse(Bool, environment_variable_value)::Bool - return b -end -const _env_include_thread_unsafe = _getenv_include_thread_unsafe() -function include_thread_unsafe_tests() - if Threads.maxthreadid() > 1 - if _env_include_thread_unsafe - return true - end - msg = "Skipping a thread-unsafe test because `Threads.maxthreadid() > 1`" - @warn msg Threads.maxthreadid() - Test.@test_broken false - return false - end - return true -end - # 
Distributed GC tests for Futures function test_futures_dgc(id) f = remotecall(myid, id) fid = remoteref_id(f) # remote value should be deleted after a fetch - @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, fid) == true + @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, fid) == true @test f.v === nothing @test fetch(f) == id @test f.v !== nothing yield(); # flush gc msgs - @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, fid)) + @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, fid)) # if unfetched, it should be deleted after a finalize f = remotecall(myid, id) fid = remoteref_id(f) - @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, fid) == true + @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, fid) == true @test f.v === nothing finalize(f) yield(); # flush gc msgs - @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, fid)) + @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, fid)) end test_futures_dgc(id_me) @@ -208,48 +209,63 @@ fstore = RemoteChannel(wid2) put!(fstore, f) @test fetch(f) == wid1 -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == true +@test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == true remotecall_fetch(r->(fetch(fetch(r)); yield()), wid2, fstore) sleep(0.5) # to ensure that wid2 gc messages have been executed on wid1 -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == false +@test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == false # put! should release remote reference since it would have been cached locally f = Future(wid1) fid = remoteref_id(f) # should not be created remotely till accessed -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == false +@test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == false # create it remotely isready(f) -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == true +@test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == true put!(f, :OK) -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == false +@test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == false @test fetch(f) === :OK # RemoteException should be thrown on a put! when another process has set the value -f = Future(wid1) -fid = remoteref_id(f) - -fstore = RemoteChannel(wid2) -put!(fstore, f) # send f to wid2 -put!(f, :OK) # set value from master - -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == true - -testval = remotecall_fetch(wid2, fstore) do x - try - put!(fetch(x), :OK) - return 0 - catch e - if isa(e, RemoteException) - return 1 - else - return 2 +# Test this multiple times as races have been seen where `@spawn` was used over +# `@async`. Issue #124 +max_attempts = 100 +for i in 1:max_attempts + let f = Future(wid1), fid = remoteref_id(f), fstore = RemoteChannel(wid2) + # RemoteException should be thrown on a put! when another process has set the value + + put!(fstore, f) # send f to wid2 + put!(f, :OK) # set value from master + + @test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == true + + # fstore should be ready immediately, but races due to use of `@spawn` have caused + # this to fail in the past. 
So we poll for readiness before the main test after this + # which internally checks for `isready` to decide whether to error or not + w = remotecall_fetch(wid2, fstore) do x + timedwait(() -> isready(fetch(x)), 10) end + w == :ok || @info "isready timed out on attempt $i (max $max_attempts)" + @test w == :ok + # This is the actual test. It should fail because the value is already set remotely + testval = remotecall_fetch(wid2, fstore) do x + try + put!(fetch(x), :OK) + return 0 + catch e + if isa(e, RemoteException) + return 1 + else + rethrow() + end + end + end + testval == 1 || @info "test failed on attempt $i (max $max_attempts)" + @test testval == 1 end end -@test testval == 1 # Issue number #25847 @everywhere function f25847(ref) @@ -260,14 +276,15 @@ end f = remotecall_wait(identity, id_other, ones(10)) rrid = Distributed.RRID(f.whence, f.id) remotecall_fetch(f25847, id_other, f) -@test BitSet([id_me]) == remotecall_fetch(()->Distributed.PGRP.refs[rrid].clientset, id_other) +@test BitSet([id_me]) == remotecall_fetch(()->Distributed.PGRP().refs[rrid].clientset, id_other) remotecall_fetch(f25847, id_other, f) -@test BitSet([id_me]) == remotecall_fetch(()->Distributed.PGRP.refs[rrid].clientset, id_other) +@test BitSet([id_me]) == remotecall_fetch(()->Distributed.PGRP().refs[rrid].clientset, id_other) finalize(f) yield() # flush gc msgs -@test poll_while(() -> remotecall_fetch(chk_rrid->(yield(); haskey(Distributed.PGRP.refs, chk_rrid)), id_other, rrid)) +@test poll_while(() -> remotecall_fetch(chk_rrid->(yield(); haskey(Distributed.PGRP().refs, chk_rrid)), id_other, rrid)) + # Distributed GC tests for RemoteChannels function test_remoteref_dgc(id) @@ -276,12 +293,12 @@ function test_remoteref_dgc(id) rrid = remoteref_id(rr) # remote value should be deleted after finalizing the ref - @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, rrid) == true + @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, rrid) == true @test fetch(rr) === :OK - @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, rrid) == true + @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, rrid) == true finalize(rr) yield(); # flush gc msgs - @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, rrid)) + @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, rrid)) end test_remoteref_dgc(id_me) test_remoteref_dgc(id_other) @@ -294,17 +311,19 @@ let wid1 = workers()[1], fstore = RemoteChannel(wid2) put!(fstore, rr) - if include_thread_unsafe_tests() - @test remotecall_fetch(k -> haskey(Distributed.PGRP.refs, k), wid1, rrid) == true - end + + # timedwait() is necessary because wid1 is asynchronously informed of + # the existence of rr/rrid through the call to `put!(fstore, rr)`. 
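
The same polling idiom, reduced to its essentials: state delivered by an asynchronous message is checked with `timedwait` rather than assumed to be visible immediately. In this sketch a `RemoteChannel` filled from a background task stands in for the reference-tracking messages the comment refers to:

```julia
using Distributed
nprocs() == 1 && addprocs(1)
w = first(workers())

# The flag only becomes ready after an asynchronous step completes.
flag = RemoteChannel(() -> Channel{Bool}(1), w)
@async (sleep(0.5); put!(flag, true))

# Poll until the asynchronously produced state is observable, up to 10 seconds.
@assert timedwait(() -> isready(flag), 10) === :ok
```
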
+ @test timedwait(() -> remotecall_fetch(k -> haskey(Distributed.PGRP().refs, k), wid1, rrid), 10) === :ok + finalize(rr) # finalize locally yield() # flush gc msgs - if include_thread_unsafe_tests() - @test remotecall_fetch(k -> haskey(Distributed.PGRP.refs, k), wid1, rrid) == true - end + + @test timedwait(() -> remotecall_fetch(k -> haskey(Distributed.PGRP().refs, k), wid1, rrid), 10) === :ok + remotecall_fetch(r -> (finalize(take!(r)); yield(); nothing), wid2, fstore) # finalize remotely sleep(0.5) # to ensure that wid2 messages have been executed on wid1 - @test poll_while(() -> remotecall_fetch(k -> haskey(Distributed.PGRP.refs, k), wid1, rrid)) + @test poll_while(() -> remotecall_fetch(k -> haskey(Distributed.PGRP().refs, k), wid1, rrid)) end # Tests for issue #23109 - should not hang. @@ -332,7 +351,6 @@ for i in 1:nworkers() end @test sort(pids) == sort(workers()) - # test getindex on Futures and RemoteChannels function test_indexing(rr) a = rand(5,5) @@ -473,6 +491,8 @@ test_iteration(RemoteChannel(() -> Channel(10)), RemoteChannel(() -> Channel(10) return count end +@everywhere test_iteration_collect(ch) = length(collect(ch)) + @everywhere function test_iteration_put(ch, total) for i in 1:total put!(ch, i) @@ -483,10 +503,27 @@ end let ch = RemoteChannel(() -> Channel(1)) @async test_iteration_put(ch, 10) @test 10 == @fetchfrom id_other test_iteration_take(ch) + ch = RemoteChannel(() -> Channel(1)) + @async test_iteration_put(ch, 10) + @test 10 == @fetchfrom id_other test_iteration_collect(ch) # now reverse ch = RemoteChannel(() -> Channel(1)) @spawnat id_other test_iteration_put(ch, 10) @test 10 == test_iteration_take(ch) + ch = RemoteChannel(() -> Channel(1)) + @spawnat id_other test_iteration_put(ch, 10) + @test 10 == test_iteration_collect(ch) +end + +# Test isempty(::RemoteChannel). This should not modify the underlying +# AbstractChannel, which Base's default implementation will do. +let + chan = Channel(1) + push!(chan, 1) + remotechan = RemoteChannel(() -> chan) + @test !isempty(remotechan) + # Calling `isempty(remotechan)` shouldn't have modified `chan` + @test !isempty(chan) end # make sure exceptions propagate when waiting on Tasks @@ -567,7 +604,7 @@ let ex end # pmap tests. Needs at least 4 processors dedicated to the below tests. Which we currently have -# since the distributed tests are now spawned as a separate set. +# since the Distributed tests are now spawned as a separate set. # Test all combinations of pmap keyword args. pmap_args = [ @@ -660,7 +697,6 @@ generic_map_tests(pmap_fallback) run_map_equivalence_tests(pmap) @test pmap(uppercase, "Hello World!") == map(uppercase, "Hello World!") - # Simple test for pmap throws error let error_thrown = false try @@ -700,10 +736,36 @@ wp = WorkerPool(workers()) @test nworkers() == length(unique(remotecall_fetch(wp->pmap(_->myid(), wp, 1:100), id_other, wp))) wp = WorkerPool(2:3) @test sort(unique(pmap(_->myid(), wp, 1:100))) == [2,3] +@test fetch(remotecall(myid, wp)) in wp.workers +@test_throws RemoteException fetch(remotecall(error, wp)) + +# wait on worker pool +wp = WorkerPool(2:2) +w = take!(wp) + +# local call to _wait +@test !isready(wp) +t = @async wait(wp) +@test !istaskdone(t) +put!(wp, w) +status = timedwait(() -> istaskdone(t), 10) +@test status == :ok + +# remote call to _wait +take!(wp) +@test !isready(wp) +f = @spawnat w wait(wp) +@test !isready(f) +put!(wp, w) +status = timedwait(() -> isready(f), 10) +@test status == :ok + # CachingPool tests wp = CachingPool(workers()) @test [1:100...] 
== pmap(x->x, wp, 1:100) +@test fetch(remotecall(myid, wp)) in wp.workers +@test_throws RemoteException fetch(remotecall(error, wp)) clear!(wp) @test length(wp.map_obj2ref) == 0 @@ -742,7 +804,7 @@ if DoFullTest all_w = workers() # Test sending fake data to workers. The worker processes will print an # error message but should not terminate. - for w in Distributed.PGRP.workers + for w in Distributed.PGRP().workers if isa(w, Distributed.Worker) local s = connect(w.config.host, w.config.port) write(s, randstring(32)) @@ -769,6 +831,7 @@ if Sys.isunix() # aka have ssh remotecall_fetch(rmprocs, 1, new_pids) end + print("\n\nTesting SSHManager. A minimum of 4GB of RAM is recommended.\n") print("Please ensure: \n") print("1) sshd is running locally with passwordless login enabled.\n") @@ -887,7 +950,6 @@ v15406 = remotecall_wait(() -> 1, id_other) fetch(v15406) remotecall_wait(fetch, id_other, v15406) - # issue #43396 # Covers the remote fetch where the value returned is `nothing` # May be caused by attempting to unwrap a non-`Some` type with `something` @@ -896,7 +958,6 @@ remotecall_wait(fetch, id_other, v15406) @test nothing === fetch(remotecall(() -> nothing, workers()[1])) @test 10 === fetch(remotecall(() -> 10, workers()[1])) - # Test various forms of remotecall* invocations @everywhere f_args(v1, v2=0; kw1=0, kw2=0) = v1+v2+kw1+kw2 @@ -918,15 +979,16 @@ for tid in [id_other, id_me, default_worker_pool()] test_f_args(15, f_args, tid, 1, 2; kw1=4, kw2=8) end -# Test remote_do -f=Future(id_me) -remote_do(fut->put!(fut, myid()), id_me, f) -@test fetch(f) == id_me f=Future(id_other) remote_do(fut->put!(fut, myid()), id_other, f) @test fetch(f) == id_other +# Test remote_do +f=Future(id_me) +remote_do(fut->put!(fut, myid()), id_me, f) +@test fetch(f) == id_me + # Github issue #29932 rc_unbuffered = RemoteChannel(()->Channel{Vector{Float64}}(0)) @test eltype(rc_unbuffered) == Vector{Float64} @@ -966,33 +1028,32 @@ end # issue #16091 mutable struct T16091 end -wid = workers()[1] -try - remotecall_fetch(()->T16091, wid) - @test "unreachable" === true +wid0 = workers()[1] +@test try + remotecall_fetch(()->T16091, wid0) + false catch ex - ex = ((ex::RemoteException).captured::CapturedException).ex - @test (ex::UndefVarError).var === :T16091 + @info "----------------- $(((ex::RemoteException).captured::CapturedException).ex)" + ((ex::RemoteException).captured::CapturedException).ex === UndefVarError(:T16091, Main) end -try - remotecall_fetch(identity, wid, T16091) - @test "unreachable" === true +@test try + remotecall_fetch(identity, wid0, T16091) + false catch ex - ex = ((ex::RemoteException).captured::CapturedException).ex - @test (ex::UndefVarError).var === :T16091 + ((ex::RemoteException).captured::CapturedException).ex === UndefVarError(:T16091, Main) end f16091a() = 1 -remotecall_fetch(()->eval(:(f16091a() = 2)), wid) -@test remotecall_fetch(f16091a, wid) === 2 -@test remotecall_fetch((myid)->remotecall_fetch(f16091a, myid), wid, myid()) === 1 +remotecall_fetch(()->eval(:(f16091a() = 2)), wid0) +@test remotecall_fetch(f16091a, wid0) === 2 +@test remotecall_fetch((myid)->remotecall_fetch(f16091a, myid), wid0, myid()) === 1 # these will only heisen-fail, since it depends on the gensym counter collisions: f16091b = () -> 1 -remotecall_fetch(()->eval(:(f16091b = () -> 2)), wid) +remotecall_fetch(()->eval(:(f16091b = () -> 2)), wid0) @test remotecall_fetch(f16091b, 2) === 1 # Global anonymous functions are over-written... 
-@test remotecall_fetch((myid)->remotecall_fetch(f16091b, myid), wid, myid()) === 1 +@test remotecall_fetch((myid)->remotecall_fetch(f16091b, myid), wid0, myid()) === 1 # ...while local anonymous functions are by definition, local. let @@ -1004,7 +1065,7 @@ let f16091c = () -> 2 remotecall_fetch(f16091c, myid) end - end, wid, myid()) === 2 + end, wid0, myid()) === 2 end # issue #16451 @@ -1050,6 +1111,23 @@ let @test_throws RemoteException fetch(ref) end +# Test the behaviour of remotecall(f, ::AbstractWorkerPool), this should +# keep the worker out of the pool until the underlying remotecall has +# finished. +for PoolType in (WorkerPool, CachingPool) + let + remotechan = RemoteChannel(wrkr1) + pool = PoolType([wrkr1]) + put_future = remotecall(() -> wait(remotechan), pool) + @test !isready(pool) + put!(remotechan, 1) + wait(put_future) + # The task that waits on the future to put it back into the pool runs + # asynchronously so we use timedwait() to check when the worker is back in. + @test timedwait(() -> isready(pool), 10) === :ok + end +end + # Test calling @everywhere from a module not defined on the workers module LocalBar using Distributed @@ -1113,9 +1191,9 @@ function get_remote_num_threads(processes_added) return [remotecall_fetch(BLAS.get_num_threads, proc_id) for proc_id in processes_added] end -function test_blas_config(pid, expected) - for worker in Distributed.PGRP.workers - if worker.id == pid +function test_blas_config(pid, expected; role=:default) + for worker in Distributed.PGRP(role=role).workers + if Distributed.wid(worker,role=role) == pid @test worker.config.enable_threaded_blas == expected return end @@ -1197,16 +1275,16 @@ end end # Test addprocs/rmprocs from master node only -for f in [ ()->addprocs(1; exeflags=test_exeflags), ()->rmprocs(workers()) ] - local f - try - remotecall_fetch(f, id_other) - error("Unexpected") - catch ex - @test isa(ex, RemoteException) - @test ex.captured.ex.msg == "Only process 1 can add and remove workers" - end -end +#for f in [ ()->addprocs(1; exeflags=test_exeflags), ()->rmprocs(workers()) ] +# local f +# try +# remotecall_fetch(f, id_other) +# error("Unexpected") +# catch ex +# @test isa(ex, RemoteException) +# @test ex.captured.ex.msg == "Only process 1 can add and remove workers" +# end +#end # Test the following addprocs error conditions # - invalid host name - github issue #20372 @@ -1273,7 +1351,6 @@ for (addp_testf, expected_errstr, env) in testruns end end - # Auto serialization of globals from Main. # bitstypes global v1 = 1 @@ -1341,7 +1418,6 @@ v31252 = :b v31252 = :a @test :a == @fetchfrom id_other v31252 - # Test that a global is not being repeatedly serialized when # a) referenced multiple times in the closure # b) hash value has not changed. 
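
The behaviour exercised by these global-serialization tests can be summarised in a small sketch (standard `Distributed` semantics; `w` is any worker id):

```julia
using Distributed
nprocs() == 1 && addprocs(1)
w = first(workers())

global x = 10
# A Main global captured in a closure is shipped to the worker automatically...
@assert remotecall_fetch(() -> x + 1, w) == 11

# ...and is re-sent only when its value changes, not on every call.
global x = 20
@assert remotecall_fetch(() -> x + 1, w) == 21
```
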
@@ -1440,7 +1516,7 @@ clust_ser = (Distributed.worker_from_id(id_other)).w_serializer # TODO Add test for cleanup from `clust_ser.glbs_in_tnobj` -# reported github issues - Mostly tests with globals and various distributed macros +# reported github issues - Mostly tests with globals and various Distributed macros #2669, #5390 v2669=10 @test fetch(@spawnat :any (1+v2669)) == 11 @@ -1557,8 +1633,7 @@ try catch ex @test isa(ex.captured.ex.exceptions[1].ex, ErrorException) @test occursin("BoundsError", ex.captured.ex.exceptions[1].ex.msg) - ex = ex.captured.ex.exceptions[2].ex - @test (ex::UndefVarError).var === :DontExistOn1 + @test ex.captured.ex.exceptions[2].ex == UndefVarError(:DontExistOn1) end let @@ -1682,21 +1757,21 @@ p1,p2 = addprocs_with_testenv(2) @test fill(2.,2) == remotecall_fetch(f22865, p1, p2) rmprocs(p1, p2) -function reuseport_tests() +function reuseport_tests(;role = :default) # Run the test on all processes. results = asyncmap(procs()) do p remotecall_fetch(p) do ports_lower = [] # ports of pids lower than myid() ports_higher = [] # ports of pids higher than myid() - for w in Distributed.PGRP.workers - w.id == myid() && continue + for w in Distributed.PGRP(role=role).workers + Distributed.wid(w,role=role) == myid() && continue port = Sockets._sockname(w.r_stream, true)[2] - if (w.id == 1) + if (Distributed.wid(w,role=role) == 1) # master connects to workers push!(ports_higher, port) - elseif w.id < myid() + elseif Distributed.wid(w,role=role) < myid(role=role) push!(ports_lower, port) - elseif w.id > myid() + elseif Distributed.wid(w,role=role) > myid(role=role) push!(ports_higher, port) end end @@ -1707,23 +1782,22 @@ function reuseport_tests() return 0 end end - return myid() + return myid(role=role) end end # Ensure that the code has indeed been successfully executed everywhere - @test all(in(results), procs()) + return all(in(results), procs()) end # Test that the client port is reused. SO_REUSEPORT may not be supported on # all UNIX platforms, Linux kernels prior to 3.9 and older versions of OSX @assert nprocs() == 1 addprocs_with_testenv(4; lazy=false) -if ccall(:jl_has_so_reuseport, Int32, ()) == 1 - reuseport_tests() -else - @info "SO_REUSEPORT is unsupported, skipping reuseport tests" -end + +skip_reuseexport = ccall(:jl_has_so_reuseport, Int32, ()) != 1 +skip_reuseexport && @debug "SO_REUSEPORT support missing, reuseport_tests skipped" +@test reuseport_tests() skip = skip_reuseexport # issue #27933 a27933 = :_not_defined_27933 @@ -1797,9 +1871,10 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp project = mkdir(joinpath(tmp, "project")) depots = [mkdir(joinpath(tmp, "depot1")), mkdir(joinpath(tmp, "depot2"))] load_path = [mkdir(joinpath(tmp, "load_path")), "@stdlib", "@"] - pathsep = Sys.iswindows() ? ";" : ":" + shipped_depots = DEPOT_PATH[2:end] # stdlib caches env = Dict( - "JULIA_DEPOT_PATH" => join(depots, pathsep), + # needs a trailing pathsep to access the stdlib depot + "JULIA_DEPOT_PATH" => join(depots, pathsep) * pathsep, "JULIA_LOAD_PATH" => join(load_path, pathsep), # Explicitly propagate `TMPDIR`, in the event that we're running on a # CI system where `TMPDIR` is special. 
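
The trailing-separator detail mentioned above matters because an empty entry in `JULIA_DEPOT_PATH` is expanded to the default (bundled) depots, which the workers need for the stdlib caches. A hedged sketch of the difference, with illustrative paths:

```julia
pathsep = Sys.iswindows() ? ";" : ":"
depots  = ["/tmp/depot1", "/tmp/depot2"]               # illustrative custom depots

without_default = join(depots, pathsep)                # workers would see only the two custom depots
with_default    = join(depots, pathsep) * pathsep      # trailing separator pulls in the default depots too

env = Dict("JULIA_DEPOT_PATH" => with_default)         # what gets passed down to worker processes
```
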
@@ -1829,7 +1904,7 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp end """ cmd = setenv(`$(julia) -p1 -e $(testcode * extracode)`, env) - @test success(cmd) + @test success(pipeline(cmd; stdout, stderr)) # --project extracode = """ for w in workers() @@ -1838,11 +1913,11 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp end """ cmd = setenv(`$(julia) --project=$(project) -p1 -e $(testcode * extracode)`, env) - @test success(cmd) + @test success(pipeline(cmd; stdout, stderr)) # JULIA_PROJECT cmd = setenv(`$(julia) -p1 -e $(testcode * extracode)`, (env["JULIA_PROJECT"] = project; env)) - @test success(cmd) + @test success(pipeline(cmd; stdout, stderr)) # Pkg.activate(...) activateish = """ Base.ACTIVE_PROJECT[] = $(repr(project)) @@ -1850,11 +1925,17 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp addprocs(1) """ cmd = setenv(`$(julia) -e $(activateish * testcode * extracode)`, env) - @test success(cmd) + @test success(pipeline(cmd; stdout, stderr)) # JULIA_(LOAD|DEPOT)_PATH shufflecode = """ - d = reverse(DEPOT_PATH) - append!(empty!(DEPOT_PATH), d) + function reverse_first_two(depots) + custom_depots = depots[1:2] + standard_depots = depots[3:end] + custom_depots = reverse(custom_depots) + return append!(custom_depots, standard_depots) + end + new_depots = reverse_first_two(DEPOT_PATH) + append!(empty!(DEPOT_PATH), new_depots) l = reverse(LOAD_PATH) append!(empty!(LOAD_PATH), l) """ @@ -1869,23 +1950,23 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp end """ cmd = setenv(`$(julia) -e $(shufflecode * addcode * testcode * extracode)`, env) - @test success(cmd) + @test success(pipeline(cmd; stdout, stderr)) # Mismatch when shuffling after proc addition failcode = shufflecode * setupcode * """ for w in workers() @test remotecall_fetch(load_path, w) == reverse(LOAD_PATH) == $(repr(load_path)) - @test remotecall_fetch(depot_path, w) == reverse(DEPOT_PATH) == $(repr(depots)) + @test remotecall_fetch(depot_path, w) == $(repr(vcat(reverse(depots), shipped_depots))) end """ cmd = setenv(`$(julia) -p1 -e $(failcode)`, env) - @test success(cmd) + @test success(pipeline(cmd; stdout, stderr)) # Passing env or exeflags to addprocs(...) 
to override defaults envcode = """ using Distributed project = mktempdir() env = Dict( "JULIA_LOAD_PATH" => string(LOAD_PATH[1], $(repr(pathsep)), "@stdlib"), - "JULIA_DEPOT_PATH" => DEPOT_PATH[1], + "JULIA_DEPOT_PATH" => DEPOT_PATH[1] * $(repr(pathsep)), "TMPDIR" => ENV["TMPDIR"], ) addprocs(1; env = env, exeflags = `--project=\$(project)`) @@ -1893,14 +1974,14 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp addprocs(1; env = env) """ * setupcode * """ for w in workers() - @test remotecall_fetch(depot_path, w) == [DEPOT_PATH[1]] + @test remotecall_fetch(depot_path, w) == vcat(DEPOT_PATH[1], $(repr(shipped_depots))) @test remotecall_fetch(load_path, w) == [LOAD_PATH[1], "@stdlib"] @test remotecall_fetch(active_project, w) == project @test remotecall_fetch(Base.active_project, w) == joinpath(project, "Project.toml") end """ cmd = setenv(`$(julia) -e $(envcode)`, env) - @test success(cmd) + @test success(pipeline(cmd; stdout, stderr)) end end include("splitrange.jl") @@ -1916,7 +1997,7 @@ begin # Next, ensure we get a log message when a worker does not cleanly exit w = only(addprocs(1)) - @test_logs (:warn, r"sending SIGTERM") begin + @test_logs (:warn, r"Sending SIGQUIT") match_mode=:any begin remote_do(w) do # Cause the 'exit()' message that `rmprocs()` sends to do nothing Core.eval(Base, :(exit() = nothing)) @@ -1929,5 +2010,10 @@ end # Run topology tests last after removing all workers, since a given # cluster at any time only supports a single topology. -nprocs() > 1 && rmprocs(workers()) +if nprocs() > 1 + rmprocs(workers()) +end +include("threads.jl") include("topology.jl") + + diff --git a/test/runtests.jl b/test/runtests.jl index d34d07c..3651f70 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,14 +1,21 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license +using Test +using Distributed +# only run these if Aqua is installed. i.e. Pkg.test has installed it, or it is provided as a shared package +if Base.locate_package(Base.PkgId(Base.UUID("4c88cf16-eb10-579e-8560-4a9242c79595"), "Aqua")) isa String + @testset "Aqua.jl tests" begin + include("aqua.jl") + end +end + # Run the distributed test outside of the main driver since it needs its own # set of dedicated workers. include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "testenv.jl")) disttestfile = joinpath(@__DIR__, "distributed_exec.jl") -cmd = `$test_exename $test_exeflags $disttestfile` - -if !success(pipeline(cmd; stdout=stdout, stderr=stderr)) && ccall(:jl_running_on_valgrind,Cint,()) == 0 - error("Distributed test failed, cmd : $cmd") +@testset let cmd = `$test_exename $test_exeflags $disttestfile` + @test success(pipeline(cmd; stdout=stdout, stderr=stderr)) && ccall(:jl_running_on_valgrind,Cint,()) == 0 end include("managers.jl") diff --git a/test/threads.jl b/test/threads.jl new file mode 100644 index 0000000..c978dd4 --- /dev/null +++ b/test/threads.jl @@ -0,0 +1,55 @@ +using Test +using Distributed +using Base.Iterators: product +exeflags = ("--startup-file=no", + "--check-bounds=yes", + "--depwarn=error", + "--threads=2") +function call_on(f, wid, tid) + remotecall(wid) do + t = Task(f) + ccall(:jl_set_task_tid, Cvoid, (Any, Cint), t, tid - 1) + schedule(t) + @assert Threads.threadid(t) == tid + t + end +end +# Run function on process holding the data to only serialize the result of f. +# This becomes useful for things that cannot be serialized (e.g. running tasks) +# or that would be unnecessarily big if serialized. 
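
The idea spelled out in the comment above is a general one: run the computation on the process that owns the reference and ship back only the small result. A generic sketch with the stdlib API, independent of the helper defined next:

```julia
using Distributed
nprocs() == 1 && addprocs(1)

# A large array lives on the worker; only the Float64 sum crosses the wire.
rr = remotecall(() -> rand(10_000_000), first(workers()))
total = remotecall_fetch(sum ∘ fetch, rr.where, rr)
```
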
+fetch_from_owner(f, rr) = remotecall_fetch(f ∘ fetch, rr.where, rr) +isdone(rr) = fetch_from_owner(istaskdone, rr) +isfailed(rr) = fetch_from_owner(istaskfailed, rr) +@testset "RemoteChannel allows put!/take! from thread other than 1" begin + ws = ts = product(1:2, 1:2) + + # We want (the default) laziness, so that we wait for `Worker.c_state`! + procs_added = addprocs(2; exeflags, lazy=true) + + @testset "from worker $w1 to $w2 via 1" for (w1, w2) in ws + @testset "from thread $w1.$t1 to $w2.$t2" for (t1, t2) in ts + p1 = procs_added[w1] + p2 = procs_added[w2] + chan_id = first(procs_added) + chan = RemoteChannel(chan_id) + send = call_on(p1, t1) do + put!(chan, nothing) + end + recv = call_on(p2, t2) do + take!(chan) + end + # Wait on the spawned tasks on the owner. Note that we use + # timedwait() instead of @sync to avoid deadlocks. + t1 = Threads.@spawn fetch_from_owner(wait, recv) + t2 = Threads.@spawn fetch_from_owner(wait, send) + @test timedwait(() -> istaskdone(t1), 60) == :ok + @test timedwait(() -> istaskdone(t2), 60) == :ok + # Check the tasks + @test isdone(send) + @test isdone(recv) + @test !isfailed(send) + @test !isfailed(recv) + end + end + rmprocs(procs_added) +end \ No newline at end of file diff --git a/test/topology.jl b/test/topology.jl index a24efb2..5aeab68 100644 --- a/test/topology.jl +++ b/test/topology.jl @@ -99,7 +99,7 @@ remove_workers_and_test() function def_count_conn() @everywhere function count_connected_workers() count(x -> isa(x, Distributed.Worker) && isdefined(x, :r_stream) && isopen(x.r_stream), - Distributed.PGRP.workers) + Distributed.PGRP().workers) end end
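
Finally, a condensed sketch of what the threads test above verifies: `RemoteChannel` operations work from tasks that may run on threads other than thread 1 (assumes workers started with `--threads=2`, as in the test's `exeflags`):

```julia
using Distributed

nprocs() == 1 && addprocs(2; exeflags = "--threads=2")
p1, p2 = workers()[1:2]
chan = RemoteChannel(() -> Channel{Int}(1), 1)   # channel owned by the master

# On p1, put! from a freshly spawned (possibly non-primary) thread;
# on p2, take! the same way. Both should complete without deadlocking.
send = remotecall(() -> fetch(Threads.@spawn put!(chan, 1)), p1)
recv = remotecall(() -> fetch(Threads.@spawn take!(chan)), p2)

@assert fetch(recv) == 1
wait(send)
```
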