From 3989bd8b28afd65df70876ae03e6d1ed1e9bd574 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Thu, 9 Nov 2023 09:17:17 -0300 Subject: [PATCH 01/54] reverting to previous commit --- .github/workflows/ci.yml | 3 ++- docs/src/index.md | 6 +++--- test/distributed_exec.jl | 24 +++++++++++++++++------- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 78d0786..01e6847 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -49,9 +49,10 @@ jobs: ${{ runner.os }}-test-${{ env.cache-name }}- ${{ runner.os }}-test-${{ matrix.os }} ${{ runner.os }}- - - run: julia --color=yes .ci/test_and_change_uuid.jl - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 + env: + JULIA_DISTRIBUTED_TESTING_STANDALONE: 1 - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v1 with: diff --git a/docs/src/index.md b/docs/src/index.md index 00b40de..22d63ce 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,7 +1,7 @@ # [Distributed Computing](@id man-distributed) -Tools for distributed parallel processing. ```@docs +Distributed Distributed.addprocs Distributed.nprocs Distributed.nworkers @@ -31,18 +31,18 @@ Distributed.AbstractWorkerPool Distributed.WorkerPool Distributed.CachingPool Distributed.default_worker_pool -Distributed.clear!(::CachingPool) +Distributed.clear! Distributed.remote Distributed.remotecall(::Any, ::AbstractWorkerPool, ::Any...) Distributed.remotecall_wait(::Any, ::AbstractWorkerPool, ::Any...) Distributed.remotecall_fetch(::Any, ::AbstractWorkerPool, ::Any...) Distributed.remote_do(::Any, ::AbstractWorkerPool, ::Any...) +Distributed.@spawn Distributed.@spawnat Distributed.@fetch Distributed.@fetchfrom Distributed.@distributed Distributed.@everywhere -Distributed.clear!(::Any, ::Any; ::Any) Distributed.remoteref_id Distributed.channel_from_id Distributed.worker_id_from_socket diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl index 43e02c9..166ea6d 100644 --- a/test/distributed_exec.jl +++ b/test/distributed_exec.jl @@ -3,6 +3,13 @@ using Test, Distributed, Random, Serialization, Sockets import Distributed: launch, manage +sharedir = normpath(joinpath(Sys.BINDIR, "..", "share")) +if parse(Bool, get(ENV, "JULIA_DISTRIBUTED_TESTING_STANDALONE", "false")) + @test !startswith(pathof(Distributed), sharedir) +else + @test startswith(pathof(Distributed), sharedir) +end + @test cluster_cookie() isa String include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "testenv.jl")) @@ -960,17 +967,19 @@ end # issue #16091 mutable struct T16091 end wid = workers()[1] -@test try +try remotecall_fetch(()->T16091, wid) - false + @test "unreachable" === true catch ex - ((ex::RemoteException).captured::CapturedException).ex === UndefVarError(:T16091) + ex = ((ex::RemoteException).captured::CapturedException).ex + @test (ex::UndefVarError).var === :T16091 end -@test try +try remotecall_fetch(identity, wid, T16091) - false + @test "unreachable" === true catch ex - ((ex::RemoteException).captured::CapturedException).ex === UndefVarError(:T16091) + ex = ((ex::RemoteException).captured::CapturedException).ex + @test (ex::UndefVarError).var === :T16091 end f16091a() = 1 @@ -1548,7 +1557,8 @@ try catch ex @test isa(ex.captured.ex.exceptions[1].ex, ErrorException) @test occursin("BoundsError", ex.captured.ex.exceptions[1].ex.msg) - @test ex.captured.ex.exceptions[2].ex == UndefVarError(:DontExistOn1) + ex = ex.captured.ex.exceptions[2].ex + 
@test (ex::UndefVarError).var === :DontExistOn1
end
let
From 63539190b7a11051746372fd9f503d3f0559d2d1 Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior
Date: Thu, 9 Nov 2023 09:20:02 -0300
Subject: [PATCH 02/54] Multilevel parallelism support (after first successful tests)
---
src/cluster.jl | 447 +++++++++++++++++++++++++--------------
src/clusterserialize.jl | 10 +-
src/macros.jl | 32 +--
src/managers.jl | 9 +-
src/messages.jl | 40 ++--
src/pmap.jl | 40 ++--
src/precompile.jl | 12 +-
src/process_messages.jl | 157 +++++++-------
src/remotecall.jl | 319 +++++++++++++++-------------
src/workerpool.jl | 93 ++++----
test/distributed_exec.jl | 348 ++++++++++++++++++++----------
test/managers.jl | 4 +-
test/runtests.jl | 2 +-
test/splitrange.jl | 4 +-
test/topology.jl | 8 +-
15 files changed, 904 insertions(+), 621 deletions(-)
diff --git a/src/cluster.jl b/src/cluster.jl
index d8cc052..071863a 100644
--- a/src/cluster.jl
+++ b/src/cluster.jl
@@ -115,8 +115,8 @@ mutable struct Worker
function Worker(id::Int, r_stream::IO, w_stream::IO, manager::ClusterManager;
version::Union{VersionNumber, Nothing}=nothing,
- config::WorkerConfig=WorkerConfig())
- w = Worker(id)
+ config::WorkerConfig=WorkerConfig(), role= :default)
+ w = Worker(id; role = role)
w.r_stream = r_stream
w.w_stream = buffer_writes(w_stream)
w.w_serializer = ClusterSerializer(w.w_stream)
@@ -128,56 +128,60 @@ mutable struct Worker
w
end
- Worker(id::Int) = Worker(id, nothing)
- function Worker(id::Int, conn_func)
+ Worker(id::Int; role= :default) = Worker(id, nothing; role = role)
+ function Worker(id::Int, conn_func; role= :default)
@assert id > 0
+ map_pid_wrkr = Map_pid_wrkr(role = role)
if haskey(map_pid_wrkr, id)
return map_pid_wrkr[id]
end
w=new(id, Threads.ReentrantLock(), [], [], false, W_CREATED, Condition(), time(), conn_func)
w.initialized = Event()
- register_worker(w)
+ register_worker(w; role = role)
w
end
- Worker() = Worker(get_next_pid())
+ Worker(;role= :default) = Worker(get_next_pid(); role = role)
end
+wid(w::Worker; role= :default) = w.id
+
function set_worker_state(w, state)
w.state = state
notify(w.c_state; all=true)
end
-function check_worker_state(w::Worker)
+function check_worker_state(w::Worker; role= :default)
if w.state === W_CREATED
- if !isclusterlazy()
- if PGRP.topology === :all_to_all
+ if !isclusterlazy(role = role)
+ pg = PGRP(role = role)
+ if pg.topology === :all_to_all
# Since higher pids connect with lower pids, the remote worker
# may not have connected to us yet. Wait for some time.
- wait_for_conn(w)
+ wait_for_conn(w; role=role)
else
- error("peer $(w.id) is not connected to $(myid()). Topology : " * string(PGRP.topology))
+ error("peer $(wid(w, role=role)) is not connected to $(myid(role=role)). Topology : " * string(pg.topology))
end
else
w.ct_time = time()
- if myid() > w.id
- t = @async exec_conn_func(w)
+ if myid(role=role) > wid(w, role=role)
+ t = @async exec_conn_func(w; role=role)
else
# route request via node 1
- t = @async remotecall_fetch((p,to_id) -> remotecall_fetch(exec_conn_func, p, to_id), 1, w.id, myid())
+ t = @async remotecall_fetch((p,to_id) -> remotecall_fetch((to_id, role) -> exec_conn_func(to_id, role = role), p, to_id, p == 1 ?
:manager : :worker; role = role), 1, wid(w, role=role), myid(role=role))
end
errormonitor(t)
wait_for_conn(w; role=role)
end
end
end
-exec_conn_func(id::Int) = exec_conn_func(worker_from_id(id)::Worker)
-function exec_conn_func(w::Worker)
+exec_conn_func(id::Int; role= :default) = exec_conn_func(worker_from_id(id; role = role)::Worker; role = role)
+function exec_conn_func(w::Worker; role= :default)
try
f = notnothing(w.conn_func)
# Will be called if some other task tries to connect at the same time.
- w.conn_func = () -> wait_for_conn(w)
+ w.conn_func = () -> wait_for_conn(w; role=role)
f()
catch e
w.conn_func = () -> throw(e)
@@ -186,14 +190,14 @@ function exec_conn_func(w::Worker)
nothing
end
-function wait_for_conn(w)
+function wait_for_conn(w; role=:default)
if w.state === W_CREATED
timeout = worker_timeout() - (time() - w.ct_time)
- timeout <= 0 && error("peer $(w.id) has not connected to $(myid())")
+ timeout <= 0 && error("peer $(wid(w, role=role)) has not connected to $(myid(role=role))")
@async (sleep(timeout); notify(w.c_state; all=true))
wait(w.c_state)
- w.state === W_CREATED && error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds")
+ w.state === W_CREATED && error("peer $(wid(w, role=role)) didn't connect to $(myid(role=role)) within $timeout seconds")
end
nothing
end
@@ -201,11 +205,28 @@ end
## process group creation ##
mutable struct LocalProcess
- id::Int
+ id0::Int
+ id1::Int
bind_addr::String
bind_port::UInt16
cookie::String
- LocalProcess() = new(1)
+ LocalProcess() = new(1,1)
+end
+
+function wid(lp::LocalProcess; role= :default)
+ if role == :manager
+ return lp.id1
+ elseif role == :worker
+ return lp.id0
+ elseif role == :default && myrole() == :master
+ return lp.id1 # as :manager
+ elseif role == :default && myrole() == :worker
+ return lp.id0 # as :worker
+ else
+ return lp.id1 # as :manager
+ #throw("unexpected use of role=:default (wid)")
+ end
+
end
worker_timeout() = parse(Float64, get(ENV, "JULIA_WORKER_TIMEOUT", "60.0"))
@@ -230,6 +251,7 @@ It does not return.
"""
start_worker(cookie::AbstractString=readline(stdin); kwargs...) = start_worker(stdout, cookie; kwargs...)
function start_worker(out::IO, cookie::AbstractString=readline(stdin); close_stdin::Bool=true, stderr_to_stdout::Bool=true)
+ init_multi()
if close_stdin # workers will not use it
@@ -249,12 +271,9 @@ function start_worker(out::IO, cookie::AbstractString=readline(stdin); close_std
end
errormonitor(@async while isopen(sock)
client = accept(sock)
- process_messages(client, client, true)
+ process_messages(client, client, true; role = :worker)
end)
- print(out, "julia_worker:") # print header
- print(out, "$(string(LPROC.bind_port))#") # print port
- print(out, LPROC.bind_addr)
- print(out, '\n')
+ println(out, "julia_worker:$(string(LPROC.bind_port))#$(LPROC.bind_addr)") # print header
flush(out)
Sockets.nagle(sock, false)
@@ -270,7 +289,7 @@ function start_worker(out::IO, cookie::AbstractString=readline(stdin); close_std
check_master_connect()
while true; wait(); end
catch err
- print(stderr, "unhandled exception on $(myid()): $(err)\nexiting.\n")
+ print(stderr, "unhandled exception on $(myid(role = :worker)): $(err)\nexiting.\n")
end
close(sock)
@@ -379,12 +398,12 @@ function init_worker(cookie::AbstractString, manager::ClusterManager=DefaultClus
# Since our pid has yet to be set, ensure no RemoteChannel / Future have been created or addprocs() called.
@assert nprocs() <= 1 - @assert isempty(PGRP.refs) + @assert isempty(PGRP(role = :worker).refs) @assert isempty(client_refs) # System is started in head node mode, cleanup related entries - empty!(PGRP.workers) - empty!(map_pid_wrkr) + empty!(PGRP(role = :worker).workers) + empty!(Map_pid_wrkr(role = :worker)) cluster_cookie(cookie) nothing @@ -443,10 +462,16 @@ end function addprocs(manager::ClusterManager; kwargs...) init_multi() - cluster_mgmt_from_master_check() +# cluster_mgmt_from_master_check() lock(worker_lock) try + + if myrole() == :worker + myrole!(:manager_worker) + end + PGRP(role=:manager).level = PGRP(role=:worker).level + 1 + addprocs_locked(manager::ClusterManager; kwargs...) finally unlock(worker_lock) @@ -455,16 +480,18 @@ end function addprocs_locked(manager::ClusterManager; kwargs...) params = merge(default_addprocs_params(manager), Dict{Symbol,Any}(kwargs)) - topology(Symbol(params[:topology])) + topology(Symbol(params[:topology]); role = :manager) - if PGRP.topology !== :all_to_all + pgm = PGRP(role = :manager) + + if pgm.topology !== :all_to_all params[:lazy] = false end - if PGRP.lazy === nothing || nprocs() == 1 - PGRP.lazy = params[:lazy] - elseif isclusterlazy() != params[:lazy] - throw(ArgumentError(string("Active workers with lazy=", isclusterlazy(), + if pgm.lazy === nothing || nprocs() == 1 + pgm.lazy = params[:lazy] + elseif isclusterlazy(role = :manager) != params[:lazy] + throw(ArgumentError(string("Active workers with lazy=", isclusterlazy(role = :manager), ". Cannot set lazy=", params[:lazy]))) end @@ -509,17 +536,17 @@ function addprocs_locked(manager::ClusterManager; kwargs...) # Since all worker-to-worker setups may not have completed by the time this # function returns to the caller, send the complete list to all workers. # Useful for nprocs(), nworkers(), etc to return valid values on the workers. - all_w = workers() + all_w = workers(role = :manager) for pid in all_w - remote_do(set_valid_processes, pid, all_w) + remote_do((all_w, role) -> set_valid_processes(all_w, role = role), pid, all_w, pid == 1 ? :manager : :worker; role = :manager) end sort!(launched_q) end -function set_valid_processes(plist::Array{Int}) +function set_valid_processes(plist::Array{Int}; role= :default) for pid in setdiff(plist, workers()) - myid() != pid && Worker(pid) + myid(role=role) != pid && Worker(pid; role = role) end end @@ -566,7 +593,7 @@ function launch_n_additional_processes(manager, frompid, fromconfig, cnt, launch exeflags = something(fromconfig.exeflags, ``) cmd = `$exename $exeflags` - new_addresses = remotecall_fetch(launch_additional, frompid, cnt, cmd) + new_addresses = remotecall_fetch(launch_additional, frompid, cnt, cmd; role = :manager) for address in new_addresses (bind_addr, port) = address @@ -580,7 +607,7 @@ function launch_n_additional_processes(manager, frompid, fromconfig, cnt, launch let wconfig=wconfig @async begin pid = create_worker(manager, wconfig) - remote_do(redirect_output_from_additional_worker, frompid, pid, port) + remote_do(redirect_output_from_additional_worker, frompid, pid, port; role = :manager) push!(launched_q, pid) end end @@ -589,40 +616,42 @@ function launch_n_additional_processes(manager, frompid, fromconfig, cnt, launch end function create_worker(manager, wconfig) + role = :manager + # only node 1 can add new nodes, since nobody else has the full list of address:port - @assert LPROC.id == 1 + @assert myid(role=role) == 1 timeout = worker_timeout() # initiate a connect. 
Does not wait for connection completion in case of TCP. - w = Worker() + w = Worker(role = role) local r_s, w_s try - (r_s, w_s) = connect(manager, w.id, wconfig) + (r_s, w_s) = connect(manager, wid(w, role=role), wconfig) catch ex try - deregister_worker(w.id) - kill(manager, w.id, wconfig) + deregister_worker(wid(w, role=role), role = role) + kill(manager, wid(w, role=role), wconfig) finally rethrow(ex) end end - w = Worker(w.id, r_s, w_s, manager; config=wconfig) + w = Worker(wid(w, role=role), r_s, w_s, manager; config=wconfig, role = role) # install a finalizer to perform cleanup if necessary finalizer(w) do w - if myid() == 1 - manage(w.manager, w.id, w.config, :finalize) + if myid(role=role) == 1 + manage(w.manager, wid(w, role=role), w.config, :finalize) end end # set when the new worker has finished connections with all other workers - ntfy_oid = RRID() - rr_ntfy_join = lookup_ref(ntfy_oid) - rr_ntfy_join.waitingfor = myid() + ntfy_oid = RRID(role = role) + rr_ntfy_join = lookup_ref(ntfy_oid; role = role) + rr_ntfy_join.waitingfor = myid(role=role) # Start a new task to handle inbound messages from connected worker in master. # Also calls `wait_connected` on TCP streams. - process_messages(w.r_stream, w.w_stream, false) + process_messages(w.r_stream, w.w_stream, false; role = :manager) # send address information of all workers to the new worker. # Cluster managers set the address of each worker in `WorkerConfig.connect_at`. @@ -639,23 +668,24 @@ function create_worker(manager, wconfig) # - On master, receiving a JoinCompleteMsg triggers rr_ntfy_join (signifies that worker setup is complete) join_list = [] - if PGRP.topology === :all_to_all + pgm = PGRP(role = role) + if pgm.topology === :all_to_all # need to wait for lower worker pids to have completed connecting, since the numerical value # of pids is relevant to the connection process, i.e., higher pids connect to lower pids and they # require the value of config.connect_at which is set only upon connection completion - for jw in PGRP.workers - if (jw.id != 1) && (jw.id < w.id) + for jw in pgm.workers + if (wid(jw, role=role) != 1) && (wid(jw, role=role) < wid(w, role=role)) (jw.state === W_CREATED) && wait(jw.c_state) push!(join_list, jw) end end - elseif PGRP.topology === :custom + elseif pgm.topology === :custom # wait for requested workers to be up before connecting to them. - filterfunc(x) = (x.id != 1) && isdefined(x, :config) && + filterfunc(x) = (wid(x, role=role) != 1) && isdefined(x, :config) && (notnothing(x.config.ident) in something(wconfig.connect_idents, [])) - wlist = filter(filterfunc, PGRP.workers) + wlist = filter(filterfunc, pgm.workers) waittime = 0 while wconfig.connect_idents !== nothing && length(wlist) < length(wconfig.connect_idents) @@ -664,7 +694,7 @@ function create_worker(manager, wconfig) end sleep(1.0) waittime += 1 - wlist = filter(filterfunc, PGRP.workers) + wlist = filter(filterfunc, pgm.workers) end for wl in wlist @@ -674,15 +704,15 @@ function create_worker(manager, wconfig) end all_locs = mapany(x -> isa(x, Worker) ? 
- (something(x.config.connect_at, ()), x.id) : - ((), x.id, true), + (something(x.config.connect_at, ()), wid(x, role=role)) : + ((), wid(x, role=role), true), join_list) send_connection_hdr(w, true) enable_threaded_blas = something(wconfig.enable_threaded_blas, false) - join_message = JoinPGRPMsg(w.id, all_locs, PGRP.topology, enable_threaded_blas, isclusterlazy()) - send_msg_now(w, MsgHeader(RRID(0,0), ntfy_oid), join_message) + join_message = JoinPGRPMsg(wid(w, role=role), all_locs, pgm.topology, enable_threaded_blas, isclusterlazy(role = role)) + send_msg_now(w, MsgHeader(RRID(0,0), ntfy_oid), join_message; role = role) - @async manage(w.manager, w.id, w.config, :register) + @async manage(w.manager, wid(w, role=role), w.config, :register) # wait for rr_ntfy_join with timeout timedout = false @async (sleep($timeout); timedout = true; put!(rr_ntfy_join, 1)) @@ -691,10 +721,10 @@ function create_worker(manager, wconfig) error("worker did not connect within $timeout seconds") end lock(client_refs) do - delete!(PGRP.refs, ntfy_oid) + delete!(pgm.refs, ntfy_oid) end - return w.id + return wid(w, role=role) end @@ -736,6 +766,7 @@ function check_master_connect() return end @async begin + map_pid_wrkr = Map_pid_wrkr(role = :worker) start = time_ns() while !haskey(map_pid_wrkr, 1) && (time_ns() - start) < timeout sleep(1.0) @@ -784,34 +815,60 @@ let next_pid = 2 # 1 is reserved for the client (always) end mutable struct ProcessGroup + level::Integer name::String workers::Array{Any,1} refs::Dict{RRID,Any} # global references topology::Symbol lazy::Union{Bool, Nothing} - ProcessGroup(w::Array{Any,1}) = new("pg-default", w, Dict(), :all_to_all, nothing) + ProcessGroup(w::Array{Any,1}) = new(0, "pg-default", w, Dict(), :all_to_all, nothing) +end + +const _PGRP0 = ProcessGroup([]) +const _PGRP1 = ProcessGroup([]) + +function PGRP(;role= :default) + if role == :manager +# @info "$(role) / PGRP1 !" + return _PGRP1 + elseif role == :worker +# @info "$(role) / PGRP0 ! -- worker" + return _PGRP0 +# elseif role == :default && _PGRP0.level == 0 + elseif role == :default && myrole() == :master +# @info "$(role) / PGRP1 !" + return _PGRP1 # as :manager +# elseif role == :default && _PGRP0.level > 0 + elseif role == :default && myrole() == :worker +# @info "$(role) / PGRP0 !" + return _PGRP0 # as :worker + else + return _PGRP1 # as :manager + # throw("unexpected use of role = $role (PGRP) - $(myrole())") + end end -const PGRP = ProcessGroup([]) -function topology(t) +function topology(t; role= :default) @assert t in [:all_to_all, :master_worker, :custom] - if (PGRP.topology==t) || ((myid()==1) && (nprocs()==1)) || (myid() > 1) - PGRP.topology = t + pg = PGRP(role = role) + if (pg.topology==t) || ((myid(role=role)==1) && (nprocs()==1)) || (myid(role=role) > 1) + pg.topology = t else - error("Workers with Topology $(PGRP.topology) already exist. Requested Topology $(t) cannot be set.") + error("Workers with Topology $(pg.topology) already exist. 
Requested Topology $(t) cannot be set.") end t end -isclusterlazy() = something(PGRP.lazy, false) +isclusterlazy(; role= :default) = something(PGRP(role = role).lazy, false) -get_bind_addr(pid::Integer) = get_bind_addr(worker_from_id(pid)) -get_bind_addr(w::LocalProcess) = LPROC.bind_addr -function get_bind_addr(w::Worker) +get_bind_addr(pid::Integer) = get_bind_addr(worker_from_id(pid; role = :manager)) # always called as manager +get_bind_addr(w::LocalProcess) = LPROC.bind_addr # always called as manager +function get_bind_addr(w::Worker) + role = :worker # always called as worker if w.config.bind_addr === nothing - if w.id != myid() - w.config.bind_addr = remotecall_fetch(get_bind_addr, w.id, w.id) + if wid(w, role=role) != myid(role=role) + w.config.bind_addr = remotecall_fetch(get_bind_addr, wid(w, role=role), wid(w, role=role), role = role) end end w.config.bind_addr @@ -822,10 +879,33 @@ const LPROC = LocalProcess() const LPROCROLE = Ref{Symbol}(:master) const HDR_VERSION_LEN=16 const HDR_COOKIE_LEN=16 -const map_pid_wrkr = Dict{Int, Union{Worker, LocalProcess}}() +const _map_pid_wrkr_0 = Dict{Int, Union{Worker, LocalProcess}}() +const _map_pid_wrkr_1 = Dict{Int, Union{Worker, LocalProcess}}() const map_sock_wrkr = IdDict() const map_del_wrkr = Set{Int}() +function Map_pid_wrkr(;role= :default) + # @info ("_map_pid_wrkr_0", _map_pid_wrkr_0, "end") + # @info ("_map_pid_wrkr_1", _map_pid_wrkr_1, "end") + pg = PGRP(role = role) + if role == :manager + # @info "Map_pid_wrkr_1 ", role + return _map_pid_wrkr_1 + elseif role == :worker + # @info "Map_pid_wrkr_0 ", role + return _map_pid_wrkr_0 + elseif role == :default && myrole() == :master + # @info "Map_pid_wrkr_1 ", role, pg.level + return _map_pid_wrkr_1 # as :manager + elseif role == :default && myrole() == :worker + # @info "Map_pid_wrkr_0 ", role, pg.level + return _map_pid_wrkr_0 # as :worker + else + return _map_pid_wrkr_1 # as :manager + # throw("unexpected use of role = :default (Map_pid_wrkr)") + end +end + # whether process is a master or worker in a distributed setup myrole() = LPROCROLE[] function myrole!(proctype::Symbol) @@ -847,7 +927,38 @@ julia> remotecall_fetch(() -> myid(), 4) 4 ``` """ -myid() = LPROC.id +function myid(;role= :default) + if role == :manager + return LPROC.id1 + elseif role == :worker + return LPROC.id0 + elseif role == :default && myrole() == :master + return LPROC.id1 # as :manager + elseif role == :default && myrole() == :worker + return LPROC.id0 # as :worker + else + return LPROC.id1 # as :manager + #throw("unexpected use of role := default (myid) - $(myrole())") + end + +end + +function myid!(id;role= :default) + if role == :manager + LPROC.id1 = id + elseif role == :worker + LPROC.id0 = id + elseif role == :default && myrole() == :master + LPROC.id1 = id # as :manager + elseif role == :default && myrole() == :worker + LPROC.id0 = id # as :worker + else + LPROC.id1 = id # as :manager + #throw("unexpected use of role := default (myid!)") + end + +end + """ nprocs() @@ -865,18 +976,19 @@ julia> workers() 3 ``` """ -function nprocs() - if myid() == 1 || (PGRP.topology === :all_to_all && !isclusterlazy()) - n = length(PGRP.workers) +function nprocs(; role= :default) + pg = PGRP(role = role) + if myid(role=role) == 1 || (pg.topology === :all_to_all && !isclusterlazy(role = role)) + n = length(pg.workers) # filter out workers in the process of being setup/shutdown. 
- for jw in PGRP.workers + for jw in pg.workers if !isa(jw, LocalProcess) && (jw.state !== W_CONNECTED) n = n - 1 end end return n else - return length(PGRP.workers) + return length(pg.workers) end end @@ -897,8 +1009,8 @@ julia> nworkers() 2 ``` """ -function nworkers() - n = nprocs() +function nworkers(;role= :default) + n = nprocs(role = role) n == 1 ? 1 : n-1 end @@ -918,25 +1030,27 @@ julia> procs() 3 ``` """ -function procs() - if myid() == 1 || (PGRP.topology === :all_to_all && !isclusterlazy()) +function procs(; role= :default) + pg = PGRP(role = role) + if myid(role=role) == 1 || (pg.topology === :all_to_all && !isclusterlazy(role = role)) # filter out workers in the process of being setup/shutdown. - return Int[x.id for x in PGRP.workers if isa(x, LocalProcess) || (x.state === W_CONNECTED)] + return Int[wid(x, role=role) for x in pg.workers if isa(x, LocalProcess) || (x.state === W_CONNECTED)] else - return Int[x.id for x in PGRP.workers] + return Int[wid(x, role=role) for x in pg.workers] end end -function id_in_procs(id) # faster version of `id in procs()` - if myid() == 1 || (PGRP.topology === :all_to_all && !isclusterlazy()) - for x in PGRP.workers - if (x.id::Int) == id && (isa(x, LocalProcess) || (x::Worker).state === W_CONNECTED) +function id_in_procs(id0; role= :default) # faster version of `id in procs()` + pg = PGRP(role = role) + if myid(role=role) == 1 || (pg.topology === :all_to_all && !isclusterlazy(role = role)) + for x in pg.workers + if (wid(x, role=role)::Int) == id0 && (isa(x, LocalProcess) || (x::Worker).state === W_CONNECTED) return true end end else - for x in PGRP.workers - if (x.id::Int) == id + for x in pg.workers + if (wid(x, role=role)::Int) == id0 return true end end @@ -950,17 +1064,18 @@ end Return a list of all process identifiers on the same physical node. Specifically all workers bound to the same ip-address as `pid` are returned. """ -function procs(pid::Integer) - if myid() == 1 - all_workers = [x for x in PGRP.workers if isa(x, LocalProcess) || (x.state === W_CONNECTED)] +function procs(pid::Integer; role= :default) + if myid(role = role) == 1 + map_pid_wrkr = Map_pid_wrkr(role = role) + all_workers = [x for x in PGRP(role = role).workers if isa(x, LocalProcess) || (x.state === W_CONNECTED)] if (pid == 1) || (isa(map_pid_wrkr[pid].manager, LocalManager)) - Int[x.id for x in filter(w -> (w.id==1) || (isa(w.manager, LocalManager)), all_workers)] + Int[wid(x, role=role) for x in filter(w -> (wid(w, role=role)==1) || (isa(w.manager, LocalManager)), all_workers)] else ipatpid = get_bind_addr(pid) - Int[x.id for x in filter(w -> get_bind_addr(w) == ipatpid, all_workers)] + Int[wid(x, role=role) for x in filter(w -> get_bind_addr(w) == ipatpid, all_workers)] end else - remotecall_fetch(procs, 1, pid) + remotecall_fetch(pid -> procs(pid, role = :manager), 1; role = role) end end @@ -972,15 +1087,15 @@ Return a list of all worker process identifiers. 
# Examples
```julia-repl
\$ julia -p 2
julia> workers()
2-element Array{Int64,1}:
 2
 3
```
"""
-function workers()
- allp = procs()
+function workers(; role= :default)
+ allp = procs(role = role)
if length(allp) == 1
allp
else
end
end
-function cluster_mgmt_from_master_check()
- if myid() != 1
- throw(ErrorException("Only process 1 can add and remove workers"))
- end
-end
+#function cluster_mgmt_from_master_check()
+# if myid() != 1
+# throw(ErrorException("Only process 1 can add and remove workers"))
+# end
+#end
"""
rmprocs(pids...; waitfor=typemax(Int))
Remove the specified workers. Note that only process 1 can add or remove
workers.
Argument `waitfor` specifies how long to wait for the workers to shut down:
- If unspecified, `rmprocs` will wait until all requested `pids` are removed.
- An [`ErrorException`](@ref) is raised if all workers cannot be terminated before
the requested `waitfor` seconds.
- With a `waitfor` value of 0, the call returns immediately with the workers
scheduled for removal in a different task. The scheduled [`Task`](@ref) object is
returned. The user should call [`wait`](@ref) on the task before invoking any other
parallel calls.
# Examples
```julia-repl
\$ julia -p 5
julia> t = rmprocs(2, 3, waitfor=0)
Task (runnable) @0x0000000107c718d0
julia> wait(t)
julia> workers()
3-element Array{Int64,1}:
 4
 5
 6
```
"""
-function rmprocs(pids...; waitfor=typemax(Int))
- cluster_mgmt_from_master_check()
+function rmprocs(pids...; role = :default, waitfor=typemax(Int)) # supposed to be called always as :manager
+# cluster_mgmt_from_master_check()
pids = vcat(pids...)
if waitfor == 0
- t = @async _rmprocs(pids, typemax(Int))
+ t = @async _rmprocs(pids, role, typemax(Int))
yield()
return t
else
- _rmprocs(pids, waitfor)
+ _rmprocs(pids, role, waitfor)
# return a dummy task object that user code can wait on.
return @async nothing
end
end
-function _rmprocs(pids, waitfor)
+function _rmprocs(pids, role, waitfor)
lock(worker_lock)
try
rmprocset = Union{LocalProcess, Worker}[]
for p in vcat(pids...)
if p == 1
@warn "rmprocs: process 1 not removed"
else
+ map_pid_wrkr = Map_pid_wrkr(role = role)
if haskey(map_pid_wrkr, p)
w = map_pid_wrkr[p]
set_worker_state(w, W_TERMINATING)
kill(w.manager, p, w.config)
push!(rmprocset, w)
end
end
end
start = time_ns()
while (time_ns() - start) < waitfor*1e9
all(w -> w.state === W_TERMINATED, rmprocset) && break
sleep(min(0.1, waitfor - (time_ns() - start)/1e9))
end
- unremoved = [wrkr.id for wrkr in filter(w -> w.state !== W_TERMINATED, rmprocset)]
+ unremoved = [wid(wrkr, role=role) for wrkr in filter(w -> w.state !== W_TERMINATED, rmprocset)]
if length(unremoved) > 0
estr = string("rmprocs: pids ", unremoved, " not terminated after ", waitfor, " seconds.")
throw(ErrorException(estr))
end
finally
unlock(worker_lock)
end
end
@@ -1087,17 +1203,18 @@ end
# No-arg constructor added for compatibility with Julia 1.0 & 1.1, should be deprecated in the future
ProcessExitedException() = ProcessExitedException(-1)
-worker_from_id(i) = worker_from_id(PGRP, i)
-function worker_from_id(pg::ProcessGroup, i)
+worker_from_id(i; role= :default) = worker_from_id(PGRP(role = role), i; role = role)
+function worker_from_id(pg::ProcessGroup, i; role= :default)
if !isempty(map_del_wrkr) && in(i, map_del_wrkr)
throw(ProcessExitedException(i))
end
+ map_pid_wrkr = Map_pid_wrkr(role = role)
w = get(map_pid_wrkr, i, nothing)
if w === nothing
- if myid() == 1
- error("no process with id $i exists")
+ if myid(role=role) == 1
+ error("no process with id $i exists ($role)")
end
- w = Worker(i)
+ w = Worker(i; role = role)
map_pid_wrkr[i] = w
else
w = w::Union{Worker, LocalProcess}
end
w
end
"""
worker_id_from_socket(s) -> pid
Given a `IO` connection or a `Worker`,
returns the `pid` of the worker it is connected to.
This is useful when writing custom [`serialize`](@ref) methods for a type,
which optimizes the data written out depending on the receiving process id.
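For illustration, a sketch of such a method (the `Payload` wrapper type and its
prefix-only policy are hypothetical; a matching custom `deserialize` is defined
alongside it):

```julia
using Distributed, Serialization

struct Payload
    data::Vector{Float64}
end

function Serialization.serialize(s::AbstractSerializer, p::Payload)
    Serialization.serialize_type(s, Payload)
    pid = Distributed.worker_id_from_socket(s.io)
    # Send only a prefix to remote workers; keep the full vector otherwise.
    serialize(s, pid > 1 ? p.data[1:min(end, 16)] : p.data)
end

Serialization.deserialize(s::AbstractSerializer, ::Type{Payload}) = Payload(deserialize(s))
```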
""" -function worker_id_from_socket(s) +function worker_id_from_socket(s; role= :default) w = get(map_sock_wrkr, s, nothing) if isa(w,Worker) if s === w.r_stream || s === w.w_stream - return w.id + return wid(w, role=role) end end if isa(s,IOStream) && fd(s)==-1 # serializing to a local buffer - return myid() + return myid(role=role) end return -1 end -register_worker(w) = register_worker(PGRP, w) -function register_worker(pg, w) +register_worker(w; role= :default) = register_worker(PGRP(role = role), w; role = role) +function register_worker(pg, w; role= :default) push!(pg.workers, w) - map_pid_wrkr[w.id] = w + map_pid_wrkr = Map_pid_wrkr(role = role) + map_pid_wrkr[wid(w, role=role)] = w end function register_worker_streams(w) @@ -1139,9 +1257,10 @@ function register_worker_streams(w) map_sock_wrkr[w.w_stream] = w end -deregister_worker(pid) = deregister_worker(PGRP, pid) -function deregister_worker(pg, pid) - pg.workers = filter(x -> !(x.id == pid), pg.workers) +deregister_worker(pid; role= :default) = deregister_worker(PGRP(role = role), pid, role=role) +function deregister_worker(pg, pid; role= :default) + pg.workers = filter(x -> !(wid(x, role=role) == pid), pg.workers) + map_pid_wrkr = Map_pid_wrkr(role = role) w = pop!(map_pid_wrkr, pid, nothing) if isa(w, Worker) if isdefined(w, :r_stream) @@ -1151,13 +1270,13 @@ function deregister_worker(pg, pid) end end - if myid() == 1 && (myrole() === :master) && isdefined(w, :config) + if myid(role=role) == 1 && #=role === :manager &&=# isdefined(w, :config) # Notify the cluster manager of this workers death - manage(w.manager, w.id, w.config, :deregister) - if PGRP.topology !== :all_to_all || isclusterlazy() - for rpid in workers() + manage(w.manager, wid(w, role=role), w.config, :deregister) + if pg.topology !== :all_to_all || isclusterlazy(role = role) + for rpid in workers(role=role) try - remote_do(deregister_worker, rpid, pid) + remote_do((pid,role) -> deregister_worker(pid, role=role), rpid, pid, rpid == 1 ? :manager : :worker; role = role) catch end end @@ -1192,11 +1311,12 @@ function deregister_worker(pg, pid) end -function interrupt(pid::Integer) - @assert myid() == 1 +function interrupt(pid::Integer) + @assert myid(role = :manager) == 1 + map_pid_wrkr = Map_pid_wrkr(role = :manager) w = map_pid_wrkr[pid] if isa(w, Worker) - manage(w.manager, w.id, w.config, :interrupt) + manage(w.manager, wid(w, role=:manager), w.config, :interrupt) end return end @@ -1215,8 +1335,8 @@ interrupt(pids::Integer...) = interrupt([pids...]) Interrupt the current executing task on the specified workers. This is equivalent to pressing Ctrl-C on the local machine. If no arguments are given, all workers are interrupted. """ -function interrupt(pids::AbstractVector=workers()) - @assert myid() == 1 +function interrupt(pids::AbstractVector=workers(role = :manager)) + @assert myid(role = :manager) == 1 @sync begin for pid in pids @async interrupt(pid) @@ -1227,13 +1347,14 @@ end wp_bind_addr(p::LocalProcess) = p.bind_addr wp_bind_addr(p) = p.config.bind_addr -function check_same_host(pids) - if myid() != 1 - return remotecall_fetch(check_same_host, 1, pids) +function check_same_host(pids; role= :default) + if myid(role = role) != 1 + return remotecall_fetch(pids -> check_same_host(pids, role = :manager), 1, pids; role = role) else # We checkfirst if all test pids have been started using the local manager, # else we check for the same bind_to addr. 
This handles the special case # where the local ip address may change - as during a system sleep/awake + map_pid_wrkr = Map_pid_wrkr(role = role) if all(p -> (p==1) || (isa(map_pid_wrkr[p].manager, LocalManager)), pids) return true else @@ -1243,18 +1364,18 @@ function check_same_host(pids) end end -function terminate_all_workers() - myid() != 1 && return +function terminate_all_workers(;role= :default) + myid(role = role) != 1 && return - if nprocs() > 1 + if nprocs(role = role) > 1 try - rmprocs(workers(); waitfor=5.0) + rmprocs(workers(role = role); role = role, waitfor=5.0) catch _ex @warn "Forcibly interrupting busy workers" exception=_ex # Might be computation bound, interrupt them and try again - interrupt(workers()) + interrupt(workers(role = role)) try - rmprocs(workers(); waitfor=5.0) + rmprocs(workers(role = role); role = role, waitfor=5.0) catch _ex2 @error "Unable to terminate all workers" exception=_ex2,catch_backtrace() end @@ -1296,7 +1417,7 @@ let inited = false if !inited inited = true push!(Base.package_callbacks, _require_callback) - atexit(terminate_all_workers) + atexit(() -> terminate_all_workers(role = :manager)) # TO CHECK (role argument ???) init_bind_addr() cluster_cookie(randstring(HDR_COOKIE_LEN)) end @@ -1305,14 +1426,18 @@ let inited = false end function init_parallel() - start_gc_msgs_task() + start_gc_msgs_task(role = :manager) # TO CHECK + start_gc_msgs_task(role = :worker) # TO CHECK # start in "head node" mode, if worker, will override later. - global PGRP + #global PGRP global LPROC - LPROC.id = 1 - @assert isempty(PGRP.workers) - register_worker(LPROC) + LPROC.id0 = 0 + LPROC.id1 = 1 + @assert isempty(PGRP(role = :manager).workers) # TO CHECK + @assert isempty(PGRP(role = :worker).workers) # TO CHECK + register_worker(LPROC; role = :manager) # TO CHECK + register_worker(LPROC; role = :worker) # TO CHECK end write_cookie(io::IO) = print(io.in, string(cluster_cookie(), "\n")) diff --git a/src/clusterserialize.jl b/src/clusterserialize.jl index 0acd4ce..3520a30 100644 --- a/src/clusterserialize.jl +++ b/src/clusterserialize.jl @@ -241,14 +241,14 @@ reinitialized. Only those names found to be defined under `mod` are cleared. An exception is raised if a global constant is requested to be cleared. 
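For example, a minimal usage sketch (assumes workers have been added and a
global `x` was defined on them with `@everywhere`):

```julia-repl
julia> @everywhere x = rand(100);

julia> clear!(:x, workers());

julia> remotecall_fetch(() -> x, first(workers()))  # now `nothing`
```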
""" -function clear!(syms, pids=workers(); mod=Main) +function clear!(syms, pids=workers(); mod=Main, role= :default) @sync for p in pids - @async_unwrap remotecall_wait(clear_impl!, p, syms, mod) + @async_unwrap remotecall_wait(clear_impl!, p, syms, mod; role = role) end end -clear!(sym::Symbol, pid::Int; mod=Main) = clear!([sym], [pid]; mod=mod) -clear!(sym::Symbol, pids=workers(); mod=Main) = clear!([sym], pids; mod=mod) -clear!(syms, pid::Int; mod=Main) = clear!(syms, [pid]; mod=mod) +clear!(sym::Symbol, pid::Int; mod=Main, role= :default) = clear!([sym], [pid]; mod=mod, role = role) +clear!(sym::Symbol, pids=workers(); mod=Main, role= :default) = clear!([sym], pids; mod=mod, role = role) +clear!(syms, pid::Int; mod=Main, role= :default) = clear!(syms, [pid]; mod=mod, role = role) clear_impl!(syms, mod::Module) = foreach(x->clear_impl!(x,mod), syms) clear_impl!(sym::Symbol, mod::Module) = isdefined(mod, sym) && @eval(mod, global $sym = nothing) diff --git a/src/macros.jl b/src/macros.jl index a767c7a..3ba168b 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -2,15 +2,15 @@ let nextidx = Threads.Atomic{Int}(0) global nextproc - function nextproc() + function nextproc(;role= :default) idx = Threads.atomic_add!(nextidx, 1) - return workers()[(idx % nworkers()) + 1] + return workers(role = role)[(idx % nworkers(role = role)) + 1] end end -spawnat(p, thunk) = remotecall(thunk, p) +spawnat(p, thunk; role= :default) = remotecall(thunk, p; role = role) -spawn_somewhere(thunk) = spawnat(nextproc(),thunk) +spawn_somewhere(thunk; role= :default) = spawnat(nextproc(role = role),thunk; role = role) """ @spawn expr @@ -191,7 +191,7 @@ Similar to calling `remotecall_eval(Main, procs, expr)`, but with two extra feat """ macro everywhere(ex) procs = GlobalRef(@__MODULE__, :procs) - return esc(:($(Distributed).@everywhere $procs() $ex)) + return esc(:($(MultiscaleCluster).@everywhere $procs(role = :manager) $ex)) end macro everywhere(procs, ex) @@ -200,7 +200,7 @@ macro everywhere(procs, ex) $(isempty(imps) ? nothing : Expr(:toplevel, imps...)) # run imports locally first let ex = Expr(:toplevel, :(task_local_storage()[:SOURCE_PATH] = $(get(task_local_storage(), :SOURCE_PATH, nothing))), $(esc(Expr(:quote, ex)))), procs = $(esc(procs)) - remotecall_eval(Main, procs, ex) + remotecall_eval(Main, procs, ex; role = :manager) end end end @@ -215,14 +215,14 @@ Errors on any of the processes are collected into a See also [`@everywhere`](@ref). 
""" -function remotecall_eval(m::Module, procs, ex) +function remotecall_eval(m::Module, procs, ex; role=:default) @sync begin run_locally = 0 for pid in procs - if pid == myid() + if pid == myid(role=role) run_locally += 1 else - @async_unwrap remotecall_wait(Core.eval, pid, m, ex) + @async_unwrap remotecall_wait(Core.eval, pid, m, ex; role=role) end end yield() # ensure that the remotecalls have had a chance to start @@ -238,8 +238,8 @@ end # optimized version of remotecall_eval for a single pid # and which also fetches the return value -function remotecall_eval(m::Module, pid::Int, ex) - return remotecall_fetch(Core.eval, pid, m, ex) +function remotecall_eval(m::Module, pid::Int, ex; role=:default) + return remotecall_fetch(Core.eval, pid, m, ex; role=role) end @@ -261,13 +261,13 @@ function splitrange(firstIndex::Int, lastIndex::Int, np::Int) return chunks end -function preduce(reducer, f, R) - chunks = splitrange(Int(firstindex(R)), Int(lastindex(R)), nworkers()) - all_w = workers()[1:length(chunks)] +function preduce(reducer, f, R; role= :default) + chunks = splitrange(Int(firstindex(R)), Int(lastindex(R)), nworkers(role = role)) + all_w = workers(role = role)[1:length(chunks)] w_exec = Task[] for (idx,pid) in enumerate(all_w) - t = Task(()->remotecall_fetch(f, pid, reducer, R, first(chunks[idx]), last(chunks[idx]))) + t = Task(()->remotecall_fetch(f, pid, reducer, R, first(chunks[idx]), last(chunks[idx]); role = role)) schedule(t) push!(w_exec, t) end @@ -356,6 +356,6 @@ macro distributed(args...) ref end else - return :(preduce($(esc(reducer)), $(make_preduce_body(var, body)), $(esc(r)))) + return :(preduce($(esc(reducer)), $(make_preduce_body(var, body)), $(esc(r)))) # TO CHECK (role ?) end end diff --git a/src/managers.jl b/src/managers.jl index 57f5859..9a095f8 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -228,6 +228,7 @@ function parse_machine(machine::AbstractString) end function launch_on_machine(manager::SSHManager, machine::AbstractString, cnt, params::Dict, launched::Array, launch_ntfy::Condition) + #@info "launch_on_machine" shell = params[:shell] ssh = params[:ssh] dir = params[:dir] @@ -476,7 +477,7 @@ function launch(manager::LocalManager, params::Dict, launched::Array, c::Conditi # TODO: Maybe this belongs in base/initdefs.jl as a package_environment() function # together with load_path() etc. Might be useful to have when spawning julia - # processes outside of Distributed.jl too. + # processes outside of MultiscaleCluster.jl too. # JULIA_(LOAD|DEPOT)_PATH are used to populate (LOAD|DEPOT)_PATH on startup, # but since (LOAD|DEPOT)_PATH might have changed they are re-serialized here. # Users can opt-out of this by passing `env = ...` to addprocs(...). @@ -723,19 +724,19 @@ It should cause the remote worker specified by `pid` to exit. on `pid`. 
""" function kill(manager::ClusterManager, pid::Int, config::WorkerConfig) - remote_do(exit, pid) + remote_do(exit, pid; role = :manager) nothing end function kill(manager::SSHManager, pid::Int, config::WorkerConfig) - remote_do(exit, pid) + remote_do(exit, pid; role = :manager) cancel_ssh_tunnel(config) nothing end function kill(manager::LocalManager, pid::Int, config::WorkerConfig; exit_timeout = 15, term_timeout = 15) # First, try sending `exit()` to the remote over the usual control channels - remote_do(exit, pid) + remote_do(exit, pid; role = :manager) timer_task = @async begin sleep(exit_timeout) diff --git a/src/messages.jl b/src/messages.jl index fe3e5ab..be491db 100644 --- a/src/messages.jl +++ b/src/messages.jl @@ -99,30 +99,30 @@ function send_msg_unknown(s::IO, header, msg) error("attempt to send to unknown socket") end -function send_msg(s::IO, header, msg) - id = worker_id_from_socket(s) +function send_msg(s::IO, header, msg; role= :default) + id = worker_id_from_socket(s; role = role) if id > -1 - return send_msg(worker_from_id(id), header, msg) + return send_msg(worker_from_id(id, role=role), header, msg; role = role) end send_msg_unknown(s, header, msg) end -function send_msg_now(s::IO, header, msg::AbstractMsg) - id = worker_id_from_socket(s) +function send_msg_now(s::IO, header, msg::AbstractMsg; role= :default) + id = worker_id_from_socket(s; role = role) if id > -1 - return send_msg_now(worker_from_id(id), header, msg) + return send_msg_now(worker_from_id(id; role=role), header, msg; role = role) end send_msg_unknown(s, header, msg) end -function send_msg_now(w::Worker, header, msg) - send_msg_(w, header, msg, true) +function send_msg_now(w::Worker, header, msg; role= :default) + send_msg_(w, header, msg, true; role = role) end -function send_msg(w::Worker, header, msg) - send_msg_(w, header, msg, false) +function send_msg(w::Worker, header, msg; role= :default) + send_msg_(w, header, msg, false; role = role) end -function flush_gc_msgs(w::Worker) +function flush_gc_msgs(w::Worker; role= :default) if !isdefined(w, :w_stream) return end @@ -144,10 +144,10 @@ function flush_gc_msgs(w::Worker) end end if add_msgs !== nothing - remote_do(add_clients, w, add_msgs) + remote_do((add_msgs, role) -> add_clients(add_msgs, role = role), w, add_msgs, wid(w,role=role) == 1 ? :manager : :worker; role = role) end if del_msgs !== nothing - remote_do(del_clients, w, del_msgs) + remote_do((del_msgs, role) -> del_clients(del_msgs, role = role), w, del_msgs, wid(w,role=role) == 1 ? 
:manager : :worker; role = role) end return end @@ -168,9 +168,9 @@ function deserialize_hdr_raw(io) return MsgHeader(RRID(data[1], data[2]), RRID(data[3], data[4])) end -function send_msg_(w::Worker, header, msg, now::Bool) - check_worker_state(w) - if myid() != 1 && !isa(msg, IdentifySocketMsg) && !isa(msg, IdentifySocketAckMsg) +function send_msg_(w::Worker, header, msg, now::Bool; role= :default) + check_worker_state(w; role = role) + if myid(role=role) != 1 && !isa(msg, IdentifySocketMsg) && !isa(msg, IdentifySocketAckMsg) wait(w.initialized) end io = w.w_stream @@ -182,7 +182,7 @@ function send_msg_(w::Worker, header, msg, now::Bool) write(io, MSG_BOUNDARY) if !now && w.gcflag - flush_gc_msgs(w) + flush_gc_msgs(w; role = role) else flush(io) end @@ -191,11 +191,11 @@ function send_msg_(w::Worker, header, msg, now::Bool) end end -function flush_gc_msgs() +function flush_gc_msgs(; role= :default) try - for w in (PGRP::ProcessGroup).workers + for w in (PGRP(role = role)::ProcessGroup).workers if isa(w,Worker) && (w.state == W_CONNECTED) && w.gcflag - flush_gc_msgs(w) + flush_gc_msgs(w; role = role) end end catch e diff --git a/src/pmap.jl b/src/pmap.jl index 39acc4d..b266664 100644 --- a/src/pmap.jl +++ b/src/pmap.jl @@ -18,16 +18,16 @@ Note that `f` must be made available to all worker processes; see [Code Availability and Loading Packages](@ref code-availability) for details. """ -function pgenerate(p::AbstractWorkerPool, f, c) +function pgenerate(p::AbstractWorkerPool, f, c; role= :default) if length(p) == 0 - return AsyncGenerator(f, c; ntasks=()->nworkers(p)) + return AsyncGenerator(f, c; ntasks=()->nworkers(p; role = role)) end batches = batchsplit(c, min_batch_count = length(p) * 3) - return Iterators.flatten(AsyncGenerator(remote(p, b -> asyncmap(f, b)), batches)) + return Iterators.flatten(AsyncGenerator(remote(p, b -> asyncmap(f, b); role = role), batches)) end -pgenerate(p::AbstractWorkerPool, f, c1, c...) = pgenerate(p, a->f(a...), zip(c1, c...)) -pgenerate(f, c) = pgenerate(default_worker_pool(), f, c) -pgenerate(f, c1, c...) = pgenerate(a->f(a...), zip(c1, c...)) +pgenerate(p::AbstractWorkerPool, f, c1, c...; role= :default) = pgenerate(p, a->f(a...), zip(c1, c...); role = role) +pgenerate(f, c; role= :default) = pgenerate(default_worker_pool(role=role), f, c; role = role) +pgenerate(f, c1, c...; role= :default) = pgenerate(a->f(a...), zip(c1, c...); role = role) """ pmap(f, [::AbstractWorkerPool], c...; distributed=true, batch_size=1, on_error=nothing, retry_delays=[], retry_check=nothing) -> collection @@ -97,10 +97,10 @@ pmap(f, c; on_error = e->(isa(e, InexactError) ? NaN : rethrow()), retry_delays ``` """ function pmap(f, p::AbstractWorkerPool, c; distributed=true, batch_size=1, on_error=nothing, - retry_delays=[], retry_check=nothing) + retry_delays=[], retry_check=nothing, role= :default) f_orig = f # Don't do remote calls if there are no workers. 
- if (length(p) == 0) || (length(p) == 1 && fetch(p.channel) == myid()) + if (length(p) == 0) || (length(p) == 1 && fetch(p.channel) == myid(role = role)) distributed = false end @@ -116,14 +116,14 @@ function pmap(f, p::AbstractWorkerPool, c; distributed=true, batch_size=1, on_er end if distributed - f = remote(p, f) + f = remote(p, f; role=role) end if length(retry_delays) > 0 f = wrap_retry(f, retry_delays, retry_check) end - return asyncmap(f, c; ntasks=()->nworkers(p)) + return asyncmap(f, c; ntasks=()->nworkers(p; role = role)) else # During batch processing, We need to ensure that if on_error is set, it is called # for each element in error, and that we return as many elements as the original list. @@ -140,12 +140,12 @@ function pmap(f, p::AbstractWorkerPool, c; distributed=true, batch_size=1, on_er f = wrap_on_error(f, (x,e)->BatchProcessingError(x,e); capture_data=true) end - f = wrap_batch(f, p, handle_errors) - results = asyncmap(f, c; ntasks=()->nworkers(p), batch_size=batch_size) + f = wrap_batch(f, p, handle_errors; role=role) + results = asyncmap(f, c; ntasks=()->nworkers(p; role = role), batch_size=batch_size) # process errors if any. if handle_errors - process_batch_errors!(p, f_orig, results, on_error, retry_delays, retry_check) + process_batch_errors!(p, f_orig, results, on_error, retry_delays, retry_check; role = role) end return results @@ -153,7 +153,7 @@ function pmap(f, p::AbstractWorkerPool, c; distributed=true, batch_size=1, on_er end pmap(f, p::AbstractWorkerPool, c1, c...; kwargs...) = pmap(a->f(a...), p, zip(c1, c...); kwargs...) -pmap(f, c; kwargs...) = pmap(f, CachingPool(workers()), c; kwargs...) +pmap(f, c; role = :default, kwargs...) = pmap(f, CachingPool(workers(role = role)), c; role = role, kwargs...) pmap(f, c1, c...; kwargs...) = pmap(a->f(a...), zip(c1, c...); kwargs...) function wrap_on_error(f, on_error; capture_data=false) @@ -180,11 +180,11 @@ function wrap_retry(f, retry_delays, retry_check) end end -function wrap_batch(f, p, handle_errors) +function wrap_batch(f, p, handle_errors; role= :default) f = asyncmap_batch(f) return batch -> begin try - remotecall_fetch(f, p, batch) + remotecall_fetch(f, p, batch; role=role) catch e if handle_errors return Any[BatchProcessingError(b, e) for b in batch] @@ -199,7 +199,7 @@ asyncmap_batch(f) = batch -> asyncmap(x->f(x...), batch) extract_exception(e) = isa(e, RemoteException) ? 
e.captured.ex : e -function process_batch_errors!(p, f, results, on_error, retry_delays, retry_check) +function process_batch_errors!(p, f, results, on_error, retry_delays, retry_check; role= :default) # Handle all the ones in error in another pmap, with batch size set to 1 reprocess = Tuple{Int,BatchProcessingError}[] for (idx, v) in enumerate(results) @@ -211,14 +211,14 @@ function process_batch_errors!(p, f, results, on_error, retry_delays, retry_chec if length(reprocess) > 0 errors = [x[2] for x in reprocess] exceptions = Any[x.ex for x in errors] - state = iterate(retry_delays) + state = iterate(retry_delays#=; role = role=#) state !== nothing && (state = state[2]) error_processed = let state=state if (length(retry_delays)::Int > 0) && (retry_check === nothing || all([retry_check(state,ex)[2] for ex in exceptions])) # BatchProcessingError.data is a tuple of original args pmap(x->f(x...), p, Any[x.data for x in errors]; - on_error = on_error, retry_delays = collect(retry_delays)[2:end::Int], retry_check = retry_check) + on_error = on_error, retry_delays = collect(retry_delays)[2:end::Int], retry_check = retry_check, role = role) elseif on_error !== nothing map(on_error, exceptions) else @@ -240,7 +240,7 @@ Return `head`: the first `n` elements of `c`; and `tail`: an iterator over the remaining elements. ```jldoctest -julia> b, c = Distributed.head_and_tail(1:10, 3) +julia> b, c = MultiscaleCluster.head_and_tail(1:10, 3) ([1, 2, 3], Base.Iterators.Rest{UnitRange{Int64}, Int64}(1:10, 3)) julia> collect(c) diff --git a/src/precompile.jl b/src/precompile.jl index 87380f6..b6a4ac3 100644 --- a/src/precompile.jl +++ b/src/precompile.jl @@ -1,12 +1,12 @@ -precompile(Tuple{typeof(Distributed.remotecall),Function,Int,Module,Vararg{Any, 100}}) -precompile(Tuple{typeof(Distributed.procs)}) -precompile(Tuple{typeof(Distributed.finalize_ref), Distributed.Future}) +precompile(Tuple{typeof(MultiscaleCluster.remotecall),Function,Int,Module,Vararg{Any, 100}}) +precompile(Tuple{typeof(MultiscaleCluster.procs)}) +precompile(Tuple{typeof(MultiscaleCluster.finalize_ref), MultiscaleCluster.Future}) # This is disabled because it doesn't give much benefit -# and the code in Distributed is poorly typed causing many invalidations -# TODO: Maybe reenable now that Distributed is not in sysimage. +# and the code in MultiscaleCluster is poorly typed causing many invalidations +# TODO: Maybe reenable now that MultiscaleCluster is not in sysimage. #= precompile_script *= """ - using Distributed + using MultiscaleCluster addprocs(2) pmap(x->iseven(x) ? 1 : 0, 1:4) @distributed (+) for i = 1:100 Int(rand(Bool)) end diff --git a/src/process_messages.jl b/src/process_messages.jl index e68e05b..25c62bc 100644 --- a/src/process_messages.jl +++ b/src/process_messages.jl @@ -58,70 +58,70 @@ Exceptions on remote computations are captured and rethrown locally. A `RemoteE wraps the `pid` of the worker and a captured exception. A `CapturedException` captures the remote exception and a serializable form of the call stack when the exception was raised. 
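For example, unpacking one at the call site (assumes a live worker 2 and a
failing remote call):

```julia-repl
julia> try
           remotecall_fetch(error, 2, "boom")
       catch e
           (e::RemoteException).pid, e.captured.ex
       end
(2, ErrorException("boom"))
```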
""" -RemoteException(captured) = RemoteException(myid(), captured) -function showerror(io::IO, re::RemoteException) - (re.pid != myid()) && print(io, "On worker ", re.pid, ":\n") - showerror(io, re.captured) +RemoteException(captured; role= :default) = RemoteException(myid(role=role), captured) +function showerror(io::IO, re::RemoteException#=; role= :default=#) + (re.pid != myid(#=role = role=#)) && print(io, "On worker ", re.pid, ":\n") + showerror(io, re.captured#=; role = role=#) end -function run_work_thunk(thunk::Function, print_error::Bool) +function run_work_thunk(thunk::Function, print_error::Bool; role=:default) local result try result = thunk() catch err ce = CapturedException(err, catch_backtrace()) - result = RemoteException(ce) - print_error && showerror(stderr, ce) + result = RemoteException(ce; role=role) + print_error && showerror(stderr, ce#=; role = role=#) end return result end -function run_work_thunk(rv::RemoteValue, thunk) - put!(rv, run_work_thunk(thunk, false)) +function run_work_thunk(rv::RemoteValue, thunk; role= :default) + put!(rv, run_work_thunk(thunk, false; role=role)) nothing end -function schedule_call(rid, thunk) +function schedule_call(rid, thunk; role= :default) return lock(client_refs) do rv = RemoteValue(def_rv_channel()) - (PGRP::ProcessGroup).refs[rid] = rv + (PGRP(role = role)::ProcessGroup).refs[rid] = rv push!(rv.clientset, rid.whence) - errormonitor(@async run_work_thunk(rv, thunk)) + errormonitor(@async run_work_thunk(rv, thunk; role=role)) return rv end end -function deliver_result(sock::IO, msg, oid, value) - #print("$(myid()) sending result $oid\n") +function deliver_result(sock::IO, msg, oid, value; role= :default) + #print("$(myid(role=role)) sending result $oid\n") if msg === :call_fetch || isa(value, RemoteException) val = value else val = :OK end try - send_msg_now(sock, MsgHeader(oid), ResultMsg(val)) + send_msg_now(sock, MsgHeader(oid), ResultMsg(val); role = role) catch e # terminate connection in case of serialization error # otherwise the reading end would hang - @error "Fatal error on process $(myid())" exception=e,catch_backtrace() - wid = worker_id_from_socket(sock) + @error "Fatal error on process $(myid(role=role))" exception=e,catch_backtrace() + wid = worker_id_from_socket(sock; role = role) close(sock) - if myid()==1 + if myid(role=role)==1 rmprocs(wid) elseif wid == 1 exit(1) else - remote_do(rmprocs, 1, wid) + remote_do(rmprocs, 1, wid; role = role) end end end ## message event handlers ## -function process_messages(r_stream::TCPSocket, w_stream::TCPSocket, incoming::Bool=true) - errormonitor(@async process_tcp_streams(r_stream, w_stream, incoming)) +function process_messages(r_stream::TCPSocket, w_stream::TCPSocket, incoming::Bool=true; role= :default) + errormonitor(@async process_tcp_streams(r_stream, w_stream, incoming; role = role)) end -function process_tcp_streams(r_stream::TCPSocket, w_stream::TCPSocket, incoming::Bool) +function process_tcp_streams(r_stream::TCPSocket, w_stream::TCPSocket, incoming::Bool; role= :default) Sockets.nagle(r_stream, false) Sockets.quickack(r_stream, true) wait_connected(r_stream) @@ -130,7 +130,7 @@ function process_tcp_streams(r_stream::TCPSocket, w_stream::TCPSocket, incoming: Sockets.quickack(w_stream, true) wait_connected(w_stream) end - message_handler_loop(r_stream, w_stream, incoming) + message_handler_loop(r_stream, w_stream, incoming; role = role) end """ @@ -147,22 +147,22 @@ Julia version number to perform the authentication handshake. See also [`cluster_cookie`](@ref). 
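For reference, the built-in TCP transport drives it from an accept loop (see
`start_worker` in cluster.jl above); a custom transport does the same with
whatever stream pair its `connect` implementation produces:

```julia
errormonitor(@async while isopen(sock)
    client = accept(sock)
    process_messages(client, client, true)  # incoming: validates the cookie
end)
```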
""" -function process_messages(r_stream::IO, w_stream::IO, incoming::Bool=true) - errormonitor(@async message_handler_loop(r_stream, w_stream, incoming)) +function process_messages(r_stream::IO, w_stream::IO, incoming::Bool=true; role= :default) + errormonitor(@async message_handler_loop(r_stream, w_stream, incoming; role = role)) end -function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) +function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool; role= :default) wpid=0 # the worker r_stream is connected to. boundary = similar(MSG_BOUNDARY) try - version = process_hdr(r_stream, incoming) + version = process_hdr(r_stream, incoming; role = role) serializer = ClusterSerializer(r_stream) # The first message will associate wpid with r_stream header = deserialize_hdr_raw(r_stream) msg = deserialize_msg(serializer) - handle_msg(msg, header, r_stream, w_stream, version) - wpid = worker_id_from_socket(r_stream) + handle_msg(msg, header, r_stream, w_stream, version; role = role) + wpid = worker_id_from_socket(r_stream; role = role) @assert wpid > 0 readbytes!(r_stream, boundary, length(MSG_BOUNDARY)) @@ -170,11 +170,12 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) while true reset_state(serializer) header = deserialize_hdr_raw(r_stream) - # println("header: ", header) + #println("header: ", header) try msg = invokelatest(deserialize_msg, serializer) catch e + #println("*************************************************") # Deserialization error; discard bytes in stream until boundary found boundary_idx = 1 while true @@ -193,41 +194,41 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) # remotecalls only rethrow RemoteExceptions. Any other exception is treated as # data to be returned. Wrap this exception in a RemoteException. - remote_err = RemoteException(myid(), CapturedException(e, catch_backtrace())) + remote_err = RemoteException(myid(role=role), CapturedException(e, catch_backtrace())) # println("Deserialization error. 
", remote_err) if !null_id(header.response_oid) - ref = lookup_ref(header.response_oid) + ref = lookup_ref(header.response_oid; role = role) put!(ref, remote_err) end if !null_id(header.notify_oid) - deliver_result(w_stream, :call_fetch, header.notify_oid, remote_err) + deliver_result(w_stream, :call_fetch, header.notify_oid, remote_err; role = role) end continue end readbytes!(r_stream, boundary, length(MSG_BOUNDARY)) - # println("got msg: ", typeof(msg)) - handle_msg(msg, header, r_stream, w_stream, version) + #println("got msg: ", typeof(msg)) + handle_msg(msg, header, r_stream, w_stream, version; role = role) end catch e - werr = worker_from_id(wpid) + werr = worker_from_id(wpid; role = role) oldstate = werr.state # Check again as it may have been set in a message handler but not propagated to the calling block above if wpid < 1 - wpid = worker_id_from_socket(r_stream) + wpid = worker_id_from_socket(r_stream; role = role) end if wpid < 1 println(stderr, e, CapturedException(e, catch_backtrace())) - println(stderr, "Process($(myid())) - Unknown remote, closing connection.") + println(stderr, "Process($(myid(role=role))) - Unknown remote, closing connection.") elseif !(wpid in map_del_wrkr) set_worker_state(werr, W_TERMINATED) # If unhandleable error occurred talking to pid 1, exit if wpid == 1 if isopen(w_stream) - @error "Fatal error on process $(myid())" exception=e,catch_backtrace() + @error "Fatal error on process $(myid(role=roleee))" exception=e,catch_backtrace() end exit(1) end @@ -235,13 +236,13 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) # Will treat any exception as death of node and cleanup # since currently we do not have a mechanism for workers to reconnect # to each other on unhandled errors - deregister_worker(wpid) + deregister_worker(wpid; role = role) end close(r_stream) close(w_stream) - if (myid() == 1) && (wpid > 1) + if (myid(role=role) == 1) && (wpid > 1) if oldstate != W_TERMINATING println(stderr, "Worker $wpid terminated.") rethrow() @@ -252,7 +253,7 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) end end -function process_hdr(s, validate_cookie) +function process_hdr(s, validate_cookie; role= :default) if validate_cookie cookie = read(s, HDR_COOKIE_LEN) if length(cookie) < HDR_COOKIE_LEN @@ -262,7 +263,7 @@ function process_hdr(s, validate_cookie) self_cookie = cluster_cookie() for i in 1:HDR_COOKIE_LEN if UInt8(self_cookie[i]) != cookie[i] - error("Process($(myid())) - Invalid connection credentials sent by remote.") + error("Process($(myid(role = role))) - Invalid connection credentials sent by remote.") end end end @@ -278,67 +279,69 @@ function process_hdr(s, validate_cookie) return VersionNumber(strip(String(version))) end -function handle_msg(msg::CallMsg{:call}, header, r_stream, w_stream, version) - schedule_call(header.response_oid, ()->invokelatest(msg.f, msg.args...; msg.kwargs...)) +function handle_msg(msg::CallMsg{:call}, header, r_stream, w_stream, version; role= :default) + schedule_call(header.response_oid, ()->invokelatest(msg.f, msg.args...; msg.kwargs...); role = role) end -function handle_msg(msg::CallMsg{:call_fetch}, header, r_stream, w_stream, version) +function handle_msg(msg::CallMsg{:call_fetch}, header, r_stream, w_stream, version; role= :default) + #@info "handle ", msg errormonitor(@async begin - v = run_work_thunk(()->invokelatest(msg.f, msg.args...; msg.kwargs...), false) + v = run_work_thunk(()->invokelatest(msg.f, msg.args...; msg.kwargs...), false; role=role) if 
isa(v, SyncTake) try - deliver_result(w_stream, :call_fetch, header.notify_oid, v.v) + deliver_result(w_stream, :call_fetch, header.notify_oid, v.v; role = role) finally unlock(v.rv.synctake) end else - deliver_result(w_stream, :call_fetch, header.notify_oid, v) + deliver_result(w_stream, :call_fetch, header.notify_oid, v; role = role) end nothing end) end -function handle_msg(msg::CallWaitMsg, header, r_stream, w_stream, version) +function handle_msg(msg::CallWaitMsg, header, r_stream, w_stream, version; role= :default) errormonitor(@async begin - rv = schedule_call(header.response_oid, ()->invokelatest(msg.f, msg.args...; msg.kwargs...)) - deliver_result(w_stream, :call_wait, header.notify_oid, fetch(rv.c)) + rv = schedule_call(header.response_oid, ()->invokelatest(msg.f, msg.args...; msg.kwargs...); role = role) + deliver_result(w_stream, :call_wait, header.notify_oid, fetch(rv.c); role = role) nothing end) end -function handle_msg(msg::RemoteDoMsg, header, r_stream, w_stream, version) - errormonitor(@async run_work_thunk(()->invokelatest(msg.f, msg.args...; msg.kwargs...), true)) +function handle_msg(msg::RemoteDoMsg, header, r_stream, w_stream, version; role= :default) + errormonitor(@async run_work_thunk(()->invokelatest(msg.f, msg.args...; msg.kwargs...), true; role=role)) end -function handle_msg(msg::ResultMsg, header, r_stream, w_stream, version) - put!(lookup_ref(header.response_oid), msg.value) +function handle_msg(msg::ResultMsg, header, r_stream, w_stream, version; role= :default) + put!(lookup_ref(header.response_oid; role = role), msg.value) end -function handle_msg(msg::IdentifySocketMsg, header, r_stream, w_stream, version) +function handle_msg(msg::IdentifySocketMsg, header, r_stream, w_stream, version; role= :default) # register a new peer worker connection - w = Worker(msg.from_pid, r_stream, w_stream, cluster_manager; version=version) + w = Worker(msg.from_pid, r_stream, w_stream, cluster_manager; version=version, role = role) send_connection_hdr(w, false) - send_msg_now(w, MsgHeader(), IdentifySocketAckMsg()) + send_msg_now(w, MsgHeader(), IdentifySocketAckMsg(); role = role) notify(w.initialized) end -function handle_msg(msg::IdentifySocketAckMsg, header, r_stream, w_stream, version) +function handle_msg(msg::IdentifySocketAckMsg, header, r_stream, w_stream, version; role= :default) w = map_sock_wrkr[r_stream] w.version = version end -function handle_msg(msg::JoinPGRPMsg, header, r_stream, w_stream, version) - LPROC.id = msg.self_pid - controller = Worker(1, r_stream, w_stream, cluster_manager; version=version) +function handle_msg(msg::JoinPGRPMsg, header, r_stream, w_stream, version; role= :default) + #LPROC.id = msg.self_pid + myid!(msg.self_pid, role=role) + controller = Worker(1, r_stream, w_stream, cluster_manager; version=version, role = role) notify(controller.initialized) register_worker(LPROC) - topology(msg.topology) + topology(msg.topology; role=role) if !msg.enable_threaded_blas Base.disable_library_threading() end lazy = msg.lazy - PGRP.lazy = lazy + PGRP(role = role).lazy = lazy @sync for (connect_at, rpid) in msg.other_workers wconfig = WorkerConfig() @@ -347,32 +350,32 @@ function handle_msg(msg::JoinPGRPMsg, header, r_stream, w_stream, version) let rpid=rpid, wconfig=wconfig if lazy # The constructor registers the object with a global registry. 
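# A minimal standalone sketch of the lazy-connect idea noted in the comment
# above: the expensive connect step is captured in a thunk and only runs on
# first use, which is what the Worker(rpid, thunk) form just below relies on.
# `LazyPeer` and `ensure_connected!` are illustrative names, not part of this
# package.
mutable struct LazyPeer
    id::Int
    conn::Union{Nothing,IOBuffer}   # stands in for the real stream pair
    connect::Function               # thunk that performs the actual connect
end
LazyPeer(id::Int, connect::Function) = LazyPeer(id, nothing, connect)
function ensure_connected!(p::LazyPeer)
    p.conn === nothing && (p.conn = p.connect())   # connect only on first use
    return p.conn
end
peer = LazyPeer(7, () -> IOBuffer())   # nothing has connected yet
ensure_connected!(peer)                # the thunk runs here, exactly once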
- Worker(rpid, ()->connect_to_peer(cluster_manager, rpid, wconfig)) + Worker(rpid, ()->connect_to_peer(cluster_manager, rpid, wconfig; role = role); role = role) else - @async connect_to_peer(cluster_manager, rpid, wconfig) + @async connect_to_peer(cluster_manager, rpid, wconfig; role = role) end end end send_connection_hdr(controller, false) - send_msg_now(controller, MsgHeader(RRID(0,0), header.notify_oid), JoinCompleteMsg(Sys.CPU_THREADS, getpid())) + send_msg_now(controller, MsgHeader(RRID(0,0), header.notify_oid), JoinCompleteMsg(Sys.CPU_THREADS, getpid()); role = role) end -function connect_to_peer(manager::ClusterManager, rpid::Int, wconfig::WorkerConfig) +function connect_to_peer(manager::ClusterManager, rpid::Int, wconfig::WorkerConfig; role= :default) try (r_s, w_s) = connect(manager, rpid, wconfig) - w = Worker(rpid, r_s, w_s, manager; config=wconfig) - process_messages(w.r_stream, w.w_stream, false) + w = Worker(rpid, r_s, w_s, manager; config=wconfig, role = role) + process_messages(w.r_stream, w.w_stream, false; role = role) send_connection_hdr(w, true) - send_msg_now(w, MsgHeader(), IdentifySocketMsg(myid())) + send_msg_now(w, MsgHeader(), IdentifySocketMsg(myid(role=role)), role = role) notify(w.initialized) catch e - @error "Error on $(myid()) while connecting to peer $rpid, exiting" exception=e,catch_backtrace() + @error "Error on $(myid(role=role)) while connecting to peer $rpid, exiting" exception=e,catch_backtrace() exit(1) end end -function handle_msg(msg::JoinCompleteMsg, header, r_stream, w_stream, version) +function handle_msg(msg::JoinCompleteMsg, header, r_stream, w_stream, version; role= :default) w = map_sock_wrkr[r_stream] environ = something(w.config.environ, Dict()) environ[:cpu_threads] = msg.cpu_threads @@ -380,8 +383,8 @@ function handle_msg(msg::JoinCompleteMsg, header, r_stream, w_stream, version) w.config.ospid = msg.ospid w.version = version - ntfy_channel = lookup_ref(header.notify_oid) - put!(ntfy_channel, w.id) + ntfy_channel = lookup_ref(header.notify_oid; role = role) + put!(ntfy_channel, wid(w,role=role)) - push!(default_worker_pool(), w.id) + push!(default_worker_pool(role=role), wid(w,role=role), role = role) end diff --git a/src/remotecall.jl b/src/remotecall.jl index 0b1143d..9f1312a 100644 --- a/src/remotecall.jl +++ b/src/remotecall.jl @@ -29,8 +29,8 @@ mutable struct Future <: AbstractRemoteRef lock::ReentrantLock @atomic v::Union{Some{Any}, Nothing} - Future(w::Int, rrid::RRID, v::Union{Some, Nothing}=nothing) = - (r = new(w,rrid.whence,rrid.id,ReentrantLock(),v); return test_existing_ref(r)) + Future(w::Int, rrid::RRID, v::Union{Some, Nothing}=nothing; role= :default) = + (r = new(w,rrid.whence,rrid.id,ReentrantLock(),v); return test_existing_ref(r; role = role)) Future(t::NTuple{4, Any}) = new(t[1],t[2],t[3],ReentrantLock(),t[4]) # Useful for creating dummy, zeroed-out instances end @@ -56,9 +56,9 @@ mutable struct RemoteChannel{T<:AbstractChannel} <: AbstractRemoteRef whence::Int id::Int - function RemoteChannel{T}(w::Int, rrid::RRID) where T<:AbstractChannel + function RemoteChannel{T}(w::Int, rrid::RRID; role= :default) where T<:AbstractChannel r = new(w, rrid.whence, rrid.id) - return test_existing_ref(r) + return test_existing_ref(r; role = role) end function RemoteChannel{T}(t::Tuple) where T<:AbstractChannel @@ -66,7 +66,7 @@ mutable struct RemoteChannel{T<:AbstractChannel} <: AbstractRemoteRef end end -function test_existing_ref(r::AbstractRemoteRef) +function test_existing_ref(r::AbstractRemoteRef; role= :default) found = 
getkey(client_refs, r, nothing) if found !== nothing @assert r.where > 0 @@ -76,7 +76,7 @@ function test_existing_ref(r::AbstractRemoteRef) rv_cache = @atomic :monotonic r.v if fv_cache === nothing && rv_cache !== nothing # we have recd the value from another source, probably a deserialized ref, send a del_client message - send_del_client(r) + send_del_client(r; role = role) @lock found.lock begin @atomicreplace found.v nothing => rv_cache end @@ -86,21 +86,21 @@ function test_existing_ref(r::AbstractRemoteRef) end client_refs[r] = nothing - finalizer(finalize_ref, r) + finalizer(r -> finalize_ref(r, role), r) return r end -function finalize_ref(r::AbstractRemoteRef) +function finalize_ref(r::AbstractRemoteRef, role) if r.where > 0 # Handle the case of the finalizer having been called manually if trylock(client_refs.lock) # trylock doesn't call wait which causes yields try delete!(client_refs.ht, r) # direct removal avoiding locks if isa(r, RemoteChannel) - send_del_client_no_lock(r) + send_del_client_no_lock(r; role = role) else # send_del_client only if the reference has not been set v_cache = @atomic :monotonic r.v - v_cache === nothing && send_del_client_no_lock(r) + v_cache === nothing && send_del_client_no_lock(r; role = role) @atomic :monotonic r.v = nothing end r.where = 0 @@ -108,10 +108,10 @@ function finalize_ref(r::AbstractRemoteRef) unlock(client_refs.lock) end else - finalizer(finalize_ref, r) + finalizer(r -> finalize_ref(r, role), r) return nothing end - end + end nothing end @@ -121,16 +121,17 @@ end Create a `Future` on process `pid`. The default `pid` is the current process. """ -Future(pid::Integer=myid()) = Future(pid, RRID()) -Future(w::LocalProcess) = Future(w.id) -Future(w::Worker) = Future(w.id) +Future(pid::Integer=-1; role =:default) = Future(pid < 0 ? myid(role = role) : pid, RRID(role = role); role = role) +Future(w::LocalProcess; role =:default) = Future(wid(w,role=role); role = role) +Future(w::Worker; role =:default) = Future(wid(w,role=role); role = role) -RemoteChannel(pid::Integer=myid()) = RemoteChannel{Channel{Any}}(pid, RRID()) +RemoteChannel(pid::Integer=-1; role= :default) = RemoteChannel{Channel{Any}}(pid < 0 ? myid(role = role) : pid, RRID(role = role); role = role) -function RemoteChannel(f::Function, pid::Integer=myid()) - remotecall_fetch(pid, f, RRID()) do f, rrid - rv=lookup_ref(rrid, f) - RemoteChannel{typeof(rv.c)}(myid(), rrid) +function RemoteChannel(f::Function, pid_::Integer=0; role= :default) + pid = pid_ == 0 ? myid(role = role) : pid_ + remotecall_fetch(pid, f, RRID(role = role); role = role) do f, rrid + rv=lookup_ref(rrid, f; role = role) + RemoteChannel{typeof(rv.c)}(myid(role = role), rrid; role = role) end end @@ -169,9 +170,9 @@ A low-level API which returns the backing `AbstractChannel` for an `id` returned [`remoteref_id`](@ref). The call is valid only on the node where the backing channel exists. 
""" -function channel_from_id(id) +function channel_from_id(id; role= :default) rv = lock(client_refs) do - return get(PGRP.refs, id, false) + return get(PGRP(role = role).refs, id, false) end if rv === false throw(ErrorException("Local instance of remote reference not found")) @@ -179,7 +180,7 @@ function channel_from_id(id) return rv.c end -lookup_ref(rrid::RRID, f=def_rv_channel) = lookup_ref(PGRP, rrid, f) +lookup_ref(rrid::RRID, f=def_rv_channel; role= :default) = lookup_ref(PGRP(role = role), rrid, f) function lookup_ref(pg, rrid, f) return lock(client_refs) do rv = get(pg.refs, rrid, false) @@ -209,15 +210,15 @@ errormonitor(@async put!(f, remotecall_fetch(long_computation, p))) isready(f) # will not block ``` """ -function isready(rr::Future) +function isready(rr::Future; role= :default) v_cache = @atomic rr.v v_cache === nothing || return true rid = remoteref_id(rr) - return if rr.where == myid() - isready(lookup_ref(rid).c) + return if rr.where == myid(role = role) + isready(lookup_ref(rid; role = role).c) else - remotecall_fetch(rid->isready(lookup_ref(rid).c), rr.where, rid) + remotecall_fetch((rid, role)->isready(lookup_ref(rid; role = role).c), rr.where, rid, rr.where == 1 ? :manager : :worker; role = role) end end @@ -229,18 +230,18 @@ Note that this function can cause race conditions, since by the time you receive its result it may no longer be true. However, it can be safely used on a [`Future`](@ref) since they are assigned only once. """ -function isready(rr::RemoteChannel, args...) +function isready(rr::RemoteChannel, args...; role= :default) rid = remoteref_id(rr) - return if rr.where == myid() - isready(lookup_ref(rid).c, args...) + return if rr.where == myid(role = role) + isready(lookup_ref(rid; role = role).c, args...) else - remotecall_fetch(rid->isready(lookup_ref(rid).c, args...), rr.where, rid) + remotecall_fetch(rid->isready(lookup_ref(rid; role = rr.where == 1 ? :manager : :worker).c, args...), rr.where, rid; role = role) end end -del_client(rr::AbstractRemoteRef) = del_client(remoteref_id(rr), myid()) +del_client(rr::AbstractRemoteRef; role= :default) = del_client(remoteref_id(rr), myid(role = role); role = role) -del_client(id, client) = del_client(PGRP, id, client) +del_client(id, client; role= :default) = del_client(PGRP(role = role), id, client) function del_client(pg, id, client) lock(client_refs) do _del_client(pg, id, client) @@ -260,9 +261,9 @@ function _del_client(pg, id, client) nothing end -function del_clients(pairs::Vector) +function del_clients(pairs::Vector; role= :default) for p in pairs - del_client(p[1], p[2]) + del_client(p[1], p[2]; role = role) end end @@ -272,7 +273,7 @@ end # XXX: Is this worth the additional complexity? # `flush_gc_msgs` has to iterate over all connected workers. const any_gc_flag = Threads.Condition() -function start_gc_msgs_task() +function start_gc_msgs_task(; role= :default) errormonitor( Threads.@spawn begin while true @@ -283,27 +284,27 @@ function start_gc_msgs_task() # Use invokelatest() so that custom message transport streams # for workers can be defined in a newer world age than the Task # which runs the loop here. 
- invokelatest(flush_gc_msgs) # handles throws internally + invokelatest(flush_gc_msgs#=; role = role=#) # handles throws internally end end ) end # Function can be called within a finalizer -function send_del_client(rr) - if rr.where == myid() - del_client(rr) +function send_del_client(rr; role= :default) + if rr.where == myid(role = role) + del_client(rr; role = role) elseif id_in_procs(rr.where) # process only if a valid worker - process_worker(rr) + process_worker(rr; role = role) end end -function send_del_client_no_lock(rr) +function send_del_client_no_lock(rr; role= :default) # for gc context to avoid yields - if rr.where == myid() - _del_client(PGRP, remoteref_id(rr), myid()) + if rr.where == myid(role = role) + _del_client(PGRP(role = role), remoteref_id(rr), myid(role = role)) elseif id_in_procs(rr.where) # process only if a valid worker - process_worker(rr) + process_worker(rr; role = role) end end @@ -317,9 +318,9 @@ function publish_del_msg!(w::Worker, msg) end end -function process_worker(rr) - w = worker_from_id(rr.where)::Worker - msg = (remoteref_id(rr), myid()) +function process_worker(rr; role= :default) + w = worker_from_id(rr.where; role = role)::Worker + msg = (remoteref_id(rr), myid(role = role)) # Needs to acquire a lock on the del_msg queue T = Threads.@spawn begin @@ -330,28 +331,28 @@ function process_worker(rr) return end -function add_client(id, client) +function add_client(id, client; role= :default) lock(client_refs) do - rv = lookup_ref(id) + rv = lookup_ref(id; role = role) push!(rv.clientset, client) end nothing end -function add_clients(pairs::Vector) +function add_clients(pairs::Vector; role= :default) for p in pairs - add_client(p[1], p[2]...) + add_client(p[1], p[2]...; role = role) end end -function send_add_client(rr::AbstractRemoteRef, i) - if rr.where == myid() +function send_add_client(rr::AbstractRemoteRef, i; role= :default) + if rr.where == myid(role = role) add_client(remoteref_id(rr), i) elseif (i != rr.where) && id_in_procs(rr.where) # don't need to send add_client if the message is already going # to the processor that owns the remote ref. it will add_client # itself inside deserialize(). 
- w = worker_from_id(rr.where) + w = worker_from_id(rr.where; role = role) lock(w.msg_lock) do push!(w.add_msgs, (remoteref_id(rr), i)) @atomic w.gcflag = true @@ -364,24 +365,24 @@ end channel_type(rr::RemoteChannel{T}) where {T} = T -function serialize(s::ClusterSerializer, f::Future) +function serialize(s::ClusterSerializer, f::Future; role = :default) v_cache = @atomic f.v if v_cache === nothing - p = worker_id_from_socket(s.io) - (p !== f.where) && send_add_client(f, p) + p = worker_id_from_socket(s.io; role = role) + (p !== f.where) && send_add_client(f, p; role = role) end invoke(serialize, Tuple{ClusterSerializer, Any}, s, f) end -function serialize(s::ClusterSerializer, rr::RemoteChannel) - p = worker_id_from_socket(s.io) - (p !== rr.where) && send_add_client(rr, p) +function serialize(s::ClusterSerializer, rr::RemoteChannel; role = :default) + p = worker_id_from_socket(s.io; role = role) + (p !== rr.where) && send_add_client(rr, p; role = role) invoke(serialize, Tuple{ClusterSerializer, Any}, s, rr) end -function deserialize(s::ClusterSerializer, t::Type{<:Future}) +function deserialize(s::ClusterSerializer, t::Type{<:Future}; role = :default) fc = invoke(deserialize, Tuple{ClusterSerializer, DataType}, s, t) # deserialized copy - f2 = Future(fc.where, RRID(fc.whence, fc.id), fc.v) # ctor adds to client_refs table + f2 = Future(fc.where, RRID(fc.whence, fc.id), fc.v; role = role) # ctor adds to client_refs table # 1) send_add_client() is not executed when the ref is being serialized # to where it exists, hence do it here. @@ -389,21 +390,21 @@ function deserialize(s::ClusterSerializer, t::Type{<:Future}) # already 'fetch'ed instance in client_refs (Issue #25847), we should not # track it in the backing RemoteValue store. f2v_cache = @atomic f2.v - if f2.where == myid() && f2v_cache === nothing - add_client(remoteref_id(f2), myid()) + if f2.where == myid(role = role) && f2v_cache === nothing + add_client(remoteref_id(f2), myid(role = role); role = role) end f2 end -function deserialize(s::ClusterSerializer, t::Type{<:RemoteChannel}) +function deserialize(s::ClusterSerializer, t::Type{<:RemoteChannel}; role = :default) rr = invoke(deserialize, Tuple{ClusterSerializer, DataType}, s, t) - if rr.where == myid() + if rr.where == myid(role = role) # send_add_client() is not executed when the ref is being # serialized to where it exists - add_client(remoteref_id(rr), myid()) + add_client(remoteref_id(rr), myid(role = role); role = role) end # call ctor to make sure this rr gets added to the client_refs table - RemoteChannel{channel_type(rr)}(rr.where, RRID(rr.whence, rr.id)) + RemoteChannel{channel_type(rr)}(rr.where, RRID(rr.whence, rr.id); role = role) end # Future and RemoteChannel are serializable only in a running cluster. @@ -425,15 +426,15 @@ function local_remotecall_thunk(f, args, kwargs) return ()->invokelatest(f, args...; kwargs...) end -function remotecall(f, w::LocalProcess, args...; kwargs...) - rr = Future(w) - schedule_call(remoteref_id(rr), local_remotecall_thunk(f, args, kwargs)) +function remotecall(f, w::LocalProcess, args...; role= :default, kwargs...) + rr = Future(w; role = role) + schedule_call(remoteref_id(rr), local_remotecall_thunk(f, args, kwargs); role = role) return rr end -function remotecall(f, w::Worker, args...; kwargs...) - rr = Future(w) - send_msg(w, MsgHeader(remoteref_id(rr)), CallMsg{:call}(f, args, kwargs)) +function remotecall(f, w::Worker, args...; role= :default, kwargs...) 
+ rr = Future(w; role = role) + send_msg(w, MsgHeader(remoteref_id(rr)), CallMsg{:call}(f, args, kwargs); role = role) return rr end @@ -444,23 +445,25 @@ Call a function `f` asynchronously on the given arguments on the specified proce Return a [`Future`](@ref). Keyword arguments, if any, are passed through to `f`. """ -remotecall(f, id::Integer, args...; kwargs...) = remotecall(f, worker_from_id(id), args...; kwargs...) +remotecall(f, id::Integer, args...; role= :default, kwargs...) = +# remotecall(f, worker_from_id(id; role = id == 1 ? :manager : :worker), args...; role = role, kwargs...) + remotecall(f, worker_from_id(id; role = role), args...; role = role, kwargs...) -function remotecall_fetch(f, w::LocalProcess, args...; kwargs...) - v=run_work_thunk(local_remotecall_thunk(f,args, kwargs), false) +function remotecall_fetch(f, w::LocalProcess, args...; role= :default, kwargs...) + v=run_work_thunk(local_remotecall_thunk(f,args, kwargs), false; role = role) return isa(v, RemoteException) ? throw(v) : v end -function remotecall_fetch(f, w::Worker, args...; kwargs...) +function remotecall_fetch(f, w::Worker, args...; role= :default, kwargs...) # can be weak, because the program will have no way to refer to the Ref # itself, it only gets the result. - oid = RRID() - rv = lookup_ref(oid) - rv.waitingfor = w.id - send_msg(w, MsgHeader(RRID(0,0), oid), CallMsg{:call_fetch}(f, args, kwargs)) + oid = RRID(role = role) + rv = lookup_ref(oid; role = role) + rv.waitingfor = wid(w, role=role) + send_msg(w, MsgHeader(RRID(0,0), oid), CallMsg{:call_fetch}(f, args, kwargs); role = role) v = take!(rv) lock(client_refs) do - delete!(PGRP.refs, oid) + delete!(PGRP(role = role).refs, oid) end return isa(v, RemoteException) ? throw(v) : v end @@ -489,20 +492,20 @@ sqrt was called with a negative real argument but will only return a complex res ... ``` """ -remotecall_fetch(f, id::Integer, args...; kwargs...) = - remotecall_fetch(f, worker_from_id(id), args...; kwargs...) +remotecall_fetch(f, id::Integer, args...; role= :default, kwargs...) = + remotecall_fetch(f, worker_from_id(id; role = role), args...; role = role, kwargs...) -remotecall_wait(f, w::LocalProcess, args...; kwargs...) = wait(remotecall(f, w, args...; kwargs...)) +remotecall_wait(f, w::LocalProcess, args...; role= :default, kwargs...) = wait(remotecall(f, w, args...; role = role, kwargs...); role = role) -function remotecall_wait(f, w::Worker, args...; kwargs...) - prid = RRID() - rv = lookup_ref(prid) - rv.waitingfor = w.id - rr = Future(w) - send_msg(w, MsgHeader(remoteref_id(rr), prid), CallWaitMsg(f, args, kwargs)) +function remotecall_wait(f, w::Worker, args...; role= :default, kwargs...) + prid = RRID(role = role) + rv = lookup_ref(prid; role = role) + rv.waitingfor = wid(w,role=role) + rr = Future(w; role = role) + send_msg(w, MsgHeader(remoteref_id(rr), prid), CallWaitMsg(f, args, kwargs); role = role) v = fetch(rv.c) lock(client_refs) do - delete!(PGRP.refs, prid) + delete!(PGRP(role = role).refs, prid) end isa(v, RemoteException) && throw(v) return rr @@ -516,10 +519,10 @@ Keyword arguments, if any, are passed through to `f`. See also [`wait`](@ref) and [`remotecall`](@ref). """ -remotecall_wait(f, id::Integer, args...; kwargs...) = - remotecall_wait(f, worker_from_id(id), args...; kwargs...) +remotecall_wait(f, id::Integer, args...; role= :default, kwargs...) = + remotecall_wait(f, worker_from_id(id; role = role), args...; role = role, kwargs...) -function remote_do(f, w::LocalProcess, args...; kwargs...)
+function remote_do(f, w::LocalProcess, args...; role = :default, kwargs...) # the LocalProcess version just performs in local memory what a worker # does when it gets a :do message. # same for other messages on LocalProcess. @@ -528,8 +531,8 @@ function remote_do(f, w::LocalProcess, args...; kwargs...) nothing end -function remote_do(f, w::Worker, args...; kwargs...) - send_msg(w, MsgHeader(), RemoteDoMsg(f, args, kwargs)) +function remote_do(f, w::Worker, args...; role= :default, kwargs...) + send_msg(w, MsgHeader(), RemoteDoMsg(f, args, kwargs), role = role) nothing end @@ -554,22 +557,29 @@ Any exceptions thrown by `f` are printed to [`stderr`](@ref) on the remote worke Keyword arguments, if any, are passed through to `f`. """ -remote_do(f, id::Integer, args...; kwargs...) = remote_do(f, worker_from_id(id), args...; kwargs...) +remote_do(f, id::Integer, args...; role=:default, kwargs...) = remote_do(f, worker_from_id(id, role = role), role = role, args...; kwargs...) # TO CHECK (what if f has no role parameter?) # have the owner of rr call f on it -function call_on_owner(f, rr::AbstractRemoteRef, args...) +function call_on_owner(f, rr::AbstractRemoteRef, args...; role= :default) rid = remoteref_id(rr) - if rr.where == myid() + if rr.where == myid(role = role) f(rid, args...) else - remotecall_fetch(f, rr.where, rid, args...) + #remotecall_fetch((rid,role) -> f(rid, role = role, args...), rr.where, rid, rr.where==1 ? :manager : :worker; role = role) + remotecall_fetch((rid,role) -> f(rid, args...; role=role), rr.where, rid, rr.where==1 ? :manager : :worker; role = role) + + + #remotecall_fetch(rid -> f(rid, role = rr.where==1 ? :manager : :worker, args...), rr.where; role = role) + #remotecall_fetch(iiiii, rr.where, f, rid, rr.where==1 ? :manager : :worker, args...; role = role) +# remotecall_fetch(f, rr.where, rid, args...) + end end -function wait_ref(rid, caller, args...) - v = fetch_ref(rid, args...) +function wait_ref(rid, caller, args...; role= :default) + v = fetch_ref(rid, args...; role = role) if isa(v, RemoteException) - if myid() == caller + if myid(role = role) == caller throw(v) else return v @@ -583,14 +593,20 @@ end Wait for a value to become available for the specified [`Future`](@ref). """ -wait(r::Future) = (v_cache = @atomic r.v; v_cache !== nothing && return r; call_on_owner(wait_ref, r, myid()); r) +wait(r::Future; role= :default) = (v_cache = @atomic r.v; v_cache !== nothing && return r; + call_on_owner(wait_ref, r, myid(role = role); role = role); + #call_on_owner((rid, caller, args...; role=role) -> wait_ref(rid, caller, args...; role=role), r, myid(role = role); role = role); + r) """ wait(r::RemoteChannel, args...) Wait for a value to become available on the specified [`RemoteChannel`](@ref). """ -wait(r::RemoteChannel, args...) = (call_on_owner(wait_ref, r, myid(), args...); r) +wait(r::RemoteChannel, args...; role= :default) = (call_on_owner(wait_ref, r, myid(role = role), args...; role = role); r) +#wait(r::RemoteChannel, args...; role= :default) = (call_on_owner((rid, caller, args...; role=role) -> wait_ref(rid, caller, args...; role=role), r, myid(role = role), args...; role = role); r) + + """ fetch(x::Future) Wait for and get the value of a [`Future`](@ref). The fetched value is cached locally. Further calls to `fetch` on the same reference return the cached value. If the remote value is an exception, throws a [`RemoteException`](@ref) which captures the remote exception and backtrace.
""" -function fetch(r::Future) +function fetch(r::Future; role= :default) v_cache = @atomic r.v v_cache !== nothing && return something(v_cache) - if r.where == myid() + if r.where == myid(role = role) rv, v_cache = @lock r.lock begin v_cache = @atomic :monotonic r.v - rv = v_cache === nothing ? lookup_ref(remoteref_id(r)) : nothing + rv = v_cache === nothing ? lookup_ref(remoteref_id(r); role = role) : nothing rv, v_cache end @@ -616,7 +632,8 @@ function fetch(r::Future) v_local = fetch(rv.c) end else - v_local = call_on_owner(fetch_ref, r) + #v_local = call_on_owner((rid, args...; role=role) -> fetch_ref(rid, args...;role=role), r; role = role) + v_local = call_on_owner(fetch_ref, r; role = role) end v_cache = @atomic r.v @@ -634,18 +651,20 @@ function fetch(r::Future) # remote calls getting the value from `call_on_owner` used to return the value directly without wrapping it in `Some(x)` # so we're doing the same thing here if status - send_del_client(r) + send_del_client(r; role = role) return v_local else # this `v_cache` is returned at the end of the function v_cache = v_old end end - send_del_client(r) + send_del_client(r; role = role) something(v_cache) end -fetch_ref(rid, args...) = fetch(lookup_ref(rid).c, args...) +fetch_ref(rid, args...; role=:default) = (@info "fetch_ref $role"; fetch(lookup_ref(rid; role = role).c, #=role=role,=# args...)) + + """ fetch(c::RemoteChannel) @@ -653,7 +672,10 @@ fetch_ref(rid, args...) = fetch(lookup_ref(rid).c, args...) Wait for and get a value from a [`RemoteChannel`](@ref). Exceptions raised are the same as for a [`Future`](@ref). Does not remove the item fetched. """ -fetch(r::RemoteChannel, args...) = call_on_owner(fetch_ref, r, args...)::eltype(r) +fetch(r::RemoteChannel, args...; role= :default) = call_on_owner(fetch_ref, r, args...; role = role)::eltype(r) +#fetch(r::RemoteChannel, args...; role= :default) = call_on_owner((rid, args...; role=role) -> fetch_ref(rid, args...;role=role), r, args...; role = role)::eltype(r) + + isready(rv::RemoteValue, args...) = isready(rv.c, args...) @@ -666,19 +688,19 @@ A `put!` on an already set `Future` throws an `Exception`. All asynchronous remote calls return `Future`s and set the value to the return value of the call upon completion. """ -function put!(r::Future, v) - if r.where == myid() +function put!(r::Future, v; role= :default) + if r.where == myid(role = role) rid = remoteref_id(r) - rv = lookup_ref(rid) + rv = lookup_ref(rid; role = role) isready(rv) && error("Future can be set only once") @lock r.lock begin put!(rv, v) # this notifies the tasks waiting on the channel in fetch set_future_cache(r, v) # set the cache before leaving the lock, so that the notified tasks already see it cached end - del_client(rid, myid()) + del_client(rid, myid(role = role); role = role) else @lock r.lock begin # same idea as above if there were any local tasks fetching on this Future - call_on_owner(put_future, r, v, myid()) + call_on_owner(put_future, r, v, myid(role = role); role = role) set_future_cache(r, v) end end @@ -690,21 +712,21 @@ function set_future_cache(r::Future, v) ok || error("internal consistency error detected for Future") end -function put_future(rid, v, caller) - rv = lookup_ref(rid) +function put_future(rid, v, caller; role= :default) + rv = lookup_ref(rid; role = role) isready(rv) && error("Future can be set only once") put!(rv, v) # The caller has the value and hence can be removed from the remote store. 
- del_client(rid, caller) + del_client(rid, caller; role = role) nothing end put!(rv::RemoteValue, args...) = put!(rv.c, args...) -function put_ref(rid, caller, args...) - rv = lookup_ref(rid) +function put_ref(rid, caller, args...; role= :default) + rv = lookup_ref(rid; role = role) put!(rv, args...) - if myid() == caller && rv.synctake !== nothing + if myid(role = role) == caller && rv.synctake !== nothing # Wait till a "taken" value is serialized out - github issue #29932 lock(rv.synctake) unlock(rv.synctake) @@ -719,15 +741,17 @@ Store a set of values to the [`RemoteChannel`](@ref). If the channel is full, blocks until space is available. Return the first argument. """ -put!(rr::RemoteChannel, args...) = (call_on_owner(put_ref, rr, myid(), args...); rr) +put!(rr::RemoteChannel, args...; role= :default) = (call_on_owner(put_ref, rr, myid(role = role), args...; role = role); rr) +#put!(rr::RemoteChannel, args...; role= :default) = (call_on_owner((rid, caller, args...; role=role) -> put_ref(rid, caller, args...; role=role), rr, myid(role = role), args...; role = role); rr) + # take! is not supported on Future take!(rv::RemoteValue, args...) = take!(rv.c, args...) -function take_ref(rid, caller, args...) - rv = lookup_ref(rid) +function take_ref(rid, caller, args...; role=:default) + rv = lookup_ref(rid; role = role) synctake = false - if myid() != caller && rv.synctake !== nothing + if myid(role = role) != caller && rv.synctake !== nothing # special handling for local put! / remote take! on unbuffered channel # github issue #29932 synctake = true @@ -743,7 +767,7 @@ function take_ref(rid, caller, args...) rethrow(e) end - isa(v, RemoteException) && (myid() == caller) && throw(v) + isa(v, RemoteException) && (myid(role = role) == caller) && throw(v) if synctake return SyncTake(v, rv) @@ -758,31 +782,32 @@ end Fetch value(s) from a [`RemoteChannel`](@ref) `rr`, removing the value(s) in the process. """ -take!(rr::RemoteChannel, args...) = call_on_owner(take_ref, rr, myid(), args...)::eltype(rr) +#take!(rr::RemoteChannel, args...; role= :default) = call_on_owner((rid, caller, args...; role=role) -> take_ref(rid, caller, args...; role=role), rr, myid(role = role), args...; role = role)::eltype(rr) +take!(rr::RemoteChannel, args...; role= :default) = call_on_owner(take_ref, rr, myid(role = role), args...; role = role)::eltype(rr) # close and isopen are not supported on Future -close_ref(rid) = (close(lookup_ref(rid).c); nothing) -close(rr::RemoteChannel) = call_on_owner(close_ref, rr) +close_ref(rid; role= :default) = (close(lookup_ref(rid; role = role).c); nothing) +close(rr::RemoteChannel; role= :default) = call_on_owner(close_ref, rr; role = role) -isopen_ref(rid) = isopen(lookup_ref(rid).c) -isopen(rr::RemoteChannel) = call_on_owner(isopen_ref, rr) +isopen_ref(rid; role= :default) = isopen(lookup_ref(rid; role = role).c) +isopen(rr::RemoteChannel; role= :default) = call_on_owner(isopen_ref, rr; role = role) -getindex(r::RemoteChannel) = fetch(r) -getindex(r::Future) = fetch(r) +getindex(r::RemoteChannel; role= :default) = fetch(r; role = role) +getindex(r::Future; role= :default) = fetch(r; role = role) -getindex(r::Future, args...) = getindex(fetch(r), args...) -function getindex(r::RemoteChannel, args...) - if r.where == myid() - return getindex(fetch(r), args...) 
+getindex(r::Future, args...; role= :default) = getindex(fetch(r; role = role), args...#=; role = role=#) +function getindex(r::RemoteChannel, args...; role= :default) + if r.where == myid(role = role) + return getindex(fetch(r; role = role), args...#=; role = role=#) end - return remotecall_fetch(getindex, r.where, r, args...) + return remotecall_fetch((r,role) -> getindex(r, role = role, args...), r.where, r, r.where == 1 ? :manager : :worker; role = role) end -function iterate(c::RemoteChannel, state=nothing) - if isopen(c) || isready(c) +function iterate(c::RemoteChannel, state=nothing; role= :default) + if isopen(c; role = role) || isready(c; role = role) try - return (take!(c), nothing) + return (take!(c; role=role), nothing) catch e if isa(e, InvalidStateException) || (isa(e, RemoteException) && diff --git a/src/workerpool.jl b/src/workerpool.jl index 5dd1c07..133049f 100644 --- a/src/workerpool.jl +++ b/src/workerpool.jl @@ -26,9 +26,9 @@ mutable struct WorkerPool <: AbstractWorkerPool WorkerPool(c::Channel, ref::RemoteChannel) = new(c, Set{Int}(), ref) end -function WorkerPool() - wp = WorkerPool(Channel{Int}(typemax(Int)), RemoteChannel()) - put!(wp.ref, WeakRef(wp)) +function WorkerPool(; role= :default) + wp = WorkerPool(Channel{Int}(typemax(Int)), RemoteChannel(role = role)) + put!(wp.ref, WeakRef(wp), role=role) wp end @@ -48,8 +48,8 @@ julia> WorkerPool(2:4) WorkerPool(Channel{Int64}(sz_max:9223372036854775807,sz_curr:2), Set([4, 2, 3]), RemoteChannel{Channel{Any}}(1, 1, 7)) ``` """ -function WorkerPool(workers::Union{Vector{Int},AbstractRange{Int}}) - pool = WorkerPool() +function WorkerPool(workers::Union{Vector{Int},AbstractRange{Int}}; role= :default) + pool = WorkerPool(role = role) foreach(w->push!(pool, w), workers) return pool end @@ -57,22 +57,22 @@ end # On workers where this pool has been serialized to, instantiate with a dummy local channel. WorkerPool(ref::RemoteChannel) = WorkerPool(Channel{Int}(1), ref) -function serialize(S::AbstractSerializer, pool::WorkerPool) +function serialize(S::AbstractSerializer, pool::WorkerPool; role = :default) # Allow accessing a worker pool from other processors. When serialized, # initialize the `ref` to point to self and only send the ref. # Other workers will forward all put!, take! calls to the process owning # the ref (and hence the pool). Serialization.serialize_type(S, typeof(pool)) - serialize(S, pool.ref) + serialize(S, pool.ref; role = role) end deserialize(S::AbstractSerializer, t::Type{T}) where {T<:WorkerPool} = T(deserialize(S)) -wp_local_push!(pool::AbstractWorkerPool, w::Int) = (push!(pool.workers, w); put!(pool.channel, w); pool) -wp_local_length(pool::AbstractWorkerPool) = length(pool.workers) -wp_local_isready(pool::AbstractWorkerPool) = isready(pool.channel) +wp_local_push!(pool::AbstractWorkerPool, w::Int; role= :default) = (push!(pool.workers, w); put!(pool.channel, w); pool) +wp_local_length(pool::AbstractWorkerPool; role= :default) = length(pool.workers) +wp_local_isready(pool::AbstractWorkerPool; role= :default) = isready(pool.channel) # pool.channel::Channel{Int} -function wp_local_put!(pool::AbstractWorkerPool, w::Int) +function wp_local_put!(pool::AbstractWorkerPool, w::Int; role= :default) # In case of default_worker_pool, the master is implicitly considered a worker, i.e., # it is not present in pool.workers. # Confirm that the worker is part of a pool before making it available.
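# A standalone sketch of the checkout pattern that `remotecall_pool` (below)
# is built on: worker ids sit in a Channel, a caller takes one, runs the call,
# and always returns the id in `finally` so the pool survives errors. The
# names here are illustrative, not part of this package.
function with_worker(f, pool::Channel{Int})
    w = take!(pool)          # block until some worker id is free
    try
        return f(w)          # in the real code: a remotecall_* against w
    finally
        put!(pool, w)        # hand the worker back even if f throws
    end
end
demo_pool = Channel{Int}(8)
foreach(w -> put!(demo_pool, w), 2:4)
with_worker(w -> 10w, demo_pool)   # runs f with one of the ids 2, 3, 4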
@@ -80,28 +80,28 @@ function wp_local_put!(pool::AbstractWorkerPool, w::Int) w end -function wp_local_workers(pool::AbstractWorkerPool) - if length(pool) == 0 && pool === default_worker_pool() +function wp_local_workers(pool::AbstractWorkerPool; role= :default) + if length(pool) == 0 && pool === default_worker_pool(role=role) return [1] else return collect(pool.workers) end end -function wp_local_nworkers(pool::AbstractWorkerPool) - if length(pool) == 0 && pool === default_worker_pool() +function wp_local_nworkers(pool::AbstractWorkerPool; role= :default) + if length(pool) == 0 && pool === default_worker_pool(role=role) return 1 else return length(pool.workers) end end -function wp_local_take!(pool::AbstractWorkerPool) +function wp_local_take!(pool::AbstractWorkerPool; role= :default) # Find an active worker worker = 0 while true if length(pool) == 0 - if pool === default_worker_pool() + if pool === default_worker_pool(role=role) # No workers, the master process is used as a worker worker = 1 break @@ -120,12 +120,12 @@ function wp_local_take!(pool::AbstractWorkerPool) return worker end -function remotecall_pool(rc_f, f, pool::AbstractWorkerPool, args...; kwargs...) - worker = take!(pool) +function remotecall_pool(rc_f, f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) + worker = take!(pool; role=role) try - rc_f(f, worker, args...; kwargs...) + rc_f(f, worker, role=role, args...; kwargs...) finally - put!(pool, worker) + put!(pool, worker; role = role) end end @@ -136,32 +136,32 @@ end for (func, rt) = ((:length, Int), (:isready, Bool), (:workers, Vector{Int}), (:nworkers, Int), (:take!, Int)) func_local = Symbol(string("wp_local_", func)) @eval begin - function ($func)(pool::WorkerPool) - if pool.ref.where != myid() - return remotecall_fetch(ref->($func_local)(fetch(ref).value), pool.ref.where, pool.ref)::$rt + function ($func)(pool::WorkerPool; role= :default) + if pool.ref.where != myid(role = role) + return remotecall_fetch((ref, role)->(($func_local)(fetch(ref; role=role).value; role = role)), pool.ref.where, pool.ref, pool.ref.where == 1 ? :manager : :worker; role = role)::$rt else - return ($func_local)(pool) + return ($func_local)(pool; role = role) end end # default impl - ($func)(pool::AbstractWorkerPool) = ($func_local)(pool) + ($func)(pool::AbstractWorkerPool; role= :default) = ($func_local)(pool; role = role) end end for func = (:push!, :put!) func_local = Symbol(string("wp_local_", func)) @eval begin - function ($func)(pool::WorkerPool, w::Int) - if pool.ref.where != myid() - return remotecall_fetch((ref, w)->($func_local)(fetch(ref).value, w), pool.ref.where, pool.ref, w) + function ($func)(pool::WorkerPool, w::Int; role= :default) + if pool.ref.where != myid(role = role) + return remotecall_fetch((ref, w, role)->(($func_local)(fetch(ref; role = role).value, w; role = role)), pool.ref.where, pool.ref, w, pool.ref.where == 1 ? :manager : :worker; role = role) else - return ($func_local)(pool, w) + return ($func_local)(pool, w; role = role) end end # default impl - ($func)(pool::AbstractWorkerPool, w::Int) = ($func_local)(pool, w) + ($func)(pool::AbstractWorkerPool, w::Int; role= :default) = ($func_local)(pool, w; role = role) end end @@ -184,6 +184,7 @@ Future(2, 1, 6, nothing) ``` In this example, the task ran on pid 2, called from pid 1. """ +#remotecall(f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) 
= remotecall_pool((f, pool) -> remotecall(f, pool, role=role, args...; kwargs...); role=role) remotecall(f, pool::AbstractWorkerPool, args...; kwargs...) = remotecall_pool(remotecall, f, pool, args...; kwargs...) @@ -208,6 +209,7 @@ julia> fetch(f) 0.9995177101692958 ``` """ +#remotecall_wait(f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) = remotecall_pool((f,pool) -> remotecall_wait(f, pool, role = role, args...; kwargs...); role=role) # TO CHECK (unsure about "role = role") remotecall_wait(f, pool::AbstractWorkerPool, args...; kwargs...) = remotecall_pool(remotecall_wait, f, pool, args...; kwargs...) @@ -229,7 +231,9 @@ julia> remotecall_fetch(maximum, wp, A) 0.9995177101692958 ``` """ +#remotecall_fetch(f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) = remotecall_pool((f,pool)->remotecall_fetch(f, pool, role = role, args...; kwargs...), f, pool; role = role) # TO CHECK (unsure about the first "role = role") remotecall_fetch(f, pool::AbstractWorkerPool, args...; kwargs...) = remotecall_pool(remotecall_fetch, f, pool, args...; kwargs...) +#remotecall_fetch(f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) = remotecall_pool((f,pool)->remotecall_fetch((p, args...) -> f(p, args...), pool, args...; role = role, kwargs...), f, pool; role = role) # TO CHECK (unsure about the first "role = role") """ remote_do(f, pool::AbstractWorkerPool, args...; kwargs...) -> nothing [`WorkerPool`](@ref) variant of `remote_do(f, pid, ....)`. Wait for and take a free worker from `pool` and perform a `remote_do` on it. """ +#remote_do(f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) = remotecall_pool((f,pool) -> remote_do(f, pool, role = role, args...; kwargs...); role = role) remote_do(f, pool::AbstractWorkerPool, args...; kwargs...) = remotecall_pool(remote_do, f, pool, args...; kwargs...) const _default_worker_pool = Ref{Union{AbstractWorkerPool, Nothing}}(nothing) @@ -256,14 +261,14 @@ julia> default_worker_pool() WorkerPool(Channel{Int64}(sz_max:9223372036854775807,sz_curr:3), Set([4, 2, 3]), RemoteChannel{Channel{Any}}(1, 1, 4)) ``` """ -function default_worker_pool() +function default_worker_pool(;role=:default) # On workers retrieve the default worker pool from the master when accessed # for the first time if _default_worker_pool[] === nothing - if myid() == 1 - _default_worker_pool[] = WorkerPool() + if myid(role=role) == 1 + _default_worker_pool[] = WorkerPool(role = role) else - _default_worker_pool[] = remotecall_fetch(()->default_worker_pool(), 1) + _default_worker_pool[] = remotecall_fetch(role->default_worker_pool(role = role), 1, :manager; role=role) end end return _default_worker_pool[] end @@ -284,8 +289,8 @@ end Return an anonymous function that executes function `f` on an available worker (drawn from [`WorkerPool`](@ref) `p` if provided) using [`remotecall_fetch`](@ref). """ -remote(f) = (args...; kwargs...)->remotecall_fetch(f, default_worker_pool(), args...; kwargs...) -remote(p::AbstractWorkerPool, f) = (args...; kwargs...)->remotecall_fetch(f, p, args...; kwargs...) +remote(f; role= :default) = (args...; kwargs...)->remotecall_fetch(f, default_worker_pool(role=role), args...; role=role, kwargs...) +remote(p::AbstractWorkerPool, f; role= :default) = (args...; kwargs...)->remotecall_fetch(f, p, args...; role=role, kwargs...)
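# A standalone sketch of the idea behind the CachingPool defined below: ship a
# closure to a worker once, keep a handle to it, and route later calls through
# the handle. The Dict stands in for `map_obj2ref` and the Ref for the
# worker-side cache that a RemoteChannel provides in the real code; the names
# are illustrative, not package API.
const demo_fn_cache = Dict{Tuple{Int,Function},Ref{Function}}()
function cached_call(worker::Int, f::Function, args...)
    h = get!(demo_fn_cache, (worker, f)) do
        Ref{Function}(f)     # first use: "ship" f and remember the handle
    end
    return h[](args...)      # later uses reuse the cached handle
end
cached_call(2, sin, 0.5)     # ships sin "to worker 2" once
cached_call(2, sin, 1.0)     # served from the cache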
mutable struct CachingPool <: AbstractWorkerPool channel::Channel{Int} @@ -351,20 +356,20 @@ function clear!(pool::CachingPool) pool end -exec_from_cache(rr::RemoteChannel, args...; kwargs...) = fetch(rr)(args...; kwargs...) -function exec_from_cache(f_ref::Tuple{Function, RemoteChannel}, args...; kwargs...) +exec_from_cache(rr::RemoteChannel, args...; role= :default, kwargs...) = fetch(rr; role = role)(args...; kwargs...) +function exec_from_cache(f_ref::Tuple{Function, RemoteChannel}, args...; role= :default, kwargs...) put!(f_ref[2], f_ref[1]) # Cache locally f_ref[1](args...; kwargs...) end -function remotecall_pool(rc_f, f, pool::CachingPool, args...; kwargs...) - worker = take!(pool) - f_ref = get(pool.map_obj2ref, (worker, f), (f, RemoteChannel(worker))) +function remotecall_pool(rc_f, f, pool::CachingPool, args...; role= :default, kwargs...) + worker = take!(pool; role=role) + f_ref = get(pool.map_obj2ref, (worker, f), (f, RemoteChannel(worker; role=role))) isa(f_ref, Tuple) && (pool.map_obj2ref[(worker, f)] = f_ref[2]) # Add to tracker try - rc_f(exec_from_cache, worker, f_ref, args...; kwargs...) + rc_f(exec_from_cache, worker, f_ref, args...; role=role, kwargs...) finally - put!(pool, worker) + put!(pool, worker; role=role) end end diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl index 166ea6d..34b1c27 100644 --- a/test/distributed_exec.jl +++ b/test/distributed_exec.jl @@ -1,20 +1,13 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -using Test, Distributed, Random, Serialization, Sockets -import Distributed: launch, manage - -sharedir = normpath(joinpath(Sys.BINDIR, "..", "share")) -if parse(Bool, get(ENV, "JULIA_DISTRIBUTED_TESTING_STANDALONE", "false")) - @test !startswith(pathof(Distributed), sharedir) -else - @test startswith(pathof(Distributed), sharedir) -end +using Test, MultiscaleCluster, Random, Serialization, Sockets +import MultiscaleCluster: launch, manage @test cluster_cookie() isa String include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "testenv.jl")) -@test Distributed.extract_imports(:(begin; import Foo, Bar; let; using Baz; end; end)) == +@test MultiscaleCluster.extract_imports(:(begin; import Foo, Bar; let; using Baz; end; end)) == Any[:(import Foo, Bar), :(using Baz)] # Test a few "remote" invocations when no workers are present @@ -27,7 +20,7 @@ include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "testenv.jl")) addprocs_with_testenv(4) @test nprocs() == 5 -# distributed loading of packages +# MultiscaleCluster loading of packages # setup @everywhere begin @@ -52,16 +45,21 @@ end id_me = myid() id_other = filter(x -> x != id_me, procs())[rand(1:(nprocs()-1))] + # Test role -@everywhere using Distributed -@test Distributed.myrole() === :master +@everywhere using MultiscaleCluster +@test MultiscaleCluster.myrole() === :master for wid = workers() wrole = remotecall_fetch(wid) do - Distributed.myrole() + MultiscaleCluster.myrole() end @test wrole === :worker end +@info "passed 1" +#sleep(3) + + # Test remote() let pool = default_worker_pool() @@ -69,6 +67,8 @@ let count = 0 count_condition = Condition() + @info "passed 2" + function remote_wait(c) @async_logerr begin count += 1 @@ -79,17 +79,30 @@ let yield() end + @info "passed 3" + +# @info nworkers() +# sleep(30) + testchannels = [RemoteChannel() for i in 1:nworkers()] + # @info testchannels + # sleep(30) testcount = 0 @test isready(pool) == true for c in testchannels @test count == testcount +# @info c remote_wait(c) testcount += 1 end @test 
count == testcount @test isready(pool) == false + @info "passed 4" + #sleep(3) + + try + for c in testchannels @test count == testcount put!(c, "foo") @@ -99,8 +112,15 @@ let @test isready(pool) == true end + catch e + @info e + end + @test count == 0 + @info "passed 5" + #sleep(3) + for c in testchannels @test count == testcount remote_wait(c) @@ -109,6 +129,9 @@ let @test count == testcount @test isready(pool) == false + @info "passed 6" + #sleep(3) + for c in reverse(testchannels) @test count == testcount put!(c, "foo") @@ -118,9 +141,16 @@ let @test isready(pool) == true end + @info "passed 7" + #sleep(3) + @test count == 0 end +@info "passed 8" +#sleep(3) + + # Test Futures function testf(id) f=Future(id) @@ -172,27 +202,29 @@ function include_thread_unsafe_tests() return true end -# Distributed GC tests for Futures +@info "passed 9" + +# MultiscaleCluster GC tests for Futures function test_futures_dgc(id) f = remotecall(myid, id) fid = remoteref_id(f) # remote value should be deleted after a fetch - @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, fid) == true + @test remotecall_fetch(k->(yield();haskey(MultiscaleCluster.PGRP().refs, k)), id, fid) == true @test f.v === nothing @test fetch(f) == id @test f.v !== nothing yield(); # flush gc msgs - @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, fid)) + @test poll_while(() -> remotecall_fetch(k->(yield();haskey(MultiscaleCluster.PGRP().refs, k)), id, fid)) # if unfetched, it should be deleted after a finalize f = remotecall(myid, id) fid = remoteref_id(f) - @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, fid) == true + @test remotecall_fetch(k->(yield();haskey(MultiscaleCluster.PGRP().refs, k)), id, fid) == true @test f.v === nothing finalize(f) yield(); # flush gc msgs - @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, fid)) + @test poll_while(() -> remotecall_fetch(k->(yield();haskey(MultiscaleCluster.PGRP().refs, k)), id, fid)) end test_futures_dgc(id_me) @@ -208,23 +240,23 @@ fstore = RemoteChannel(wid2) put!(fstore, f) @test fetch(f) == wid1 -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == true +@test remotecall_fetch(k->haskey(MultiscaleCluster.PGRP().refs, k), wid1, fid) == true remotecall_fetch(r->(fetch(fetch(r)); yield()), wid2, fstore) sleep(0.5) # to ensure that wid2 gc messages have been executed on wid1 -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == false +@test remotecall_fetch(k->haskey(MultiscaleCluster.PGRP().refs, k), wid1, fid) == false # put! should release remote reference since it would have been cached locally f = Future(wid1) fid = remoteref_id(f) # should not be created remotely till accessed -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == false +@test remotecall_fetch(k->haskey(MultiscaleCluster.PGRP().refs, k), wid1, fid) == false # create it remotely isready(f) -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == true +@test remotecall_fetch(k->haskey(MultiscaleCluster.PGRP().refs, k), wid1, fid) == true put!(f, :OK) -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == false +@test remotecall_fetch(k->haskey(MultiscaleCluster.PGRP().refs, k), wid1, fid) == false @test fetch(f) === :OK # RemoteException should be thrown on a put! 
when another process has set the value @@ -235,7 +267,7 @@ fstore = RemoteChannel(wid2) put!(fstore, f) # send f to wid2 put!(f, :OK) # set value from master -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == true +@test remotecall_fetch(k->haskey(MultiscaleCluster.PGRP().refs, k), wid1, fid) == true testval = remotecall_fetch(wid2, fstore) do x try @@ -258,30 +290,33 @@ end end f = remotecall_wait(identity, id_other, ones(10)) -rrid = Distributed.RRID(f.whence, f.id) +rrid = MultiscaleCluster.RRID(f.whence, f.id) remotecall_fetch(f25847, id_other, f) -@test BitSet([id_me]) == remotecall_fetch(()->Distributed.PGRP.refs[rrid].clientset, id_other) +@test BitSet([id_me]) == remotecall_fetch(()->MultiscaleCluster.PGRP().refs[rrid].clientset, id_other) remotecall_fetch(f25847, id_other, f) -@test BitSet([id_me]) == remotecall_fetch(()->Distributed.PGRP.refs[rrid].clientset, id_other) +@test BitSet([id_me]) == remotecall_fetch(()->MultiscaleCluster.PGRP().refs[rrid].clientset, id_other) finalize(f) yield() # flush gc msgs -@test poll_while(() -> remotecall_fetch(chk_rrid->(yield(); haskey(Distributed.PGRP.refs, chk_rrid)), id_other, rrid)) +@test poll_while(() -> remotecall_fetch(chk_rrid->(yield(); haskey(MultiscaleCluster.PGRP().refs, chk_rrid)), id_other, rrid)) -# Distributed GC tests for RemoteChannels + +@info "passed 10" + +# MultiscaleCluster GC tests for RemoteChannels function test_remoteref_dgc(id) rr = RemoteChannel(id) put!(rr, :OK) rrid = remoteref_id(rr) # remote value should be deleted after finalizing the ref - @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, rrid) == true + @test remotecall_fetch(k->(yield();haskey(MultiscaleCluster.PGRP().refs, k)), id, rrid) == true @test fetch(rr) === :OK - @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, rrid) == true + @test remotecall_fetch(k->(yield();haskey(MultiscaleCluster.PGRP().refs, k)), id, rrid) == true finalize(rr) yield(); # flush gc msgs - @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, rrid)) + @test poll_while(() -> remotecall_fetch(k->(yield();haskey(MultiscaleCluster.PGRP().refs, k)), id, rrid)) end test_remoteref_dgc(id_me) test_remoteref_dgc(id_other) @@ -295,18 +330,20 @@ let wid1 = workers()[1], put!(fstore, rr) if include_thread_unsafe_tests() - @test remotecall_fetch(k -> haskey(Distributed.PGRP.refs, k), wid1, rrid) == true + @test remotecall_fetch(k -> haskey(MultiscaleCluster.PGRP().refs, k), wid1, rrid) == true end finalize(rr) # finalize locally yield() # flush gc msgs if include_thread_unsafe_tests() - @test remotecall_fetch(k -> haskey(Distributed.PGRP.refs, k), wid1, rrid) == true + @test remotecall_fetch(k -> haskey(MultiscaleCluster.PGRP().refs, k), wid1, rrid) == true end remotecall_fetch(r -> (finalize(take!(r)); yield(); nothing), wid2, fstore) # finalize remotely sleep(0.5) # to ensure that wid2 messages have been executed on wid1 - @test poll_while(() -> remotecall_fetch(k -> haskey(Distributed.PGRP.refs, k), wid1, rrid)) + @test poll_while(() -> remotecall_fetch(k -> haskey(MultiscaleCluster.PGRP().refs, k), wid1, rrid)) end +@info "passed 11" + # Tests for issue #23109 - should not hang. 
f = @spawnat :any rand(1, 1) Base.Experimental.@sync begin @@ -332,6 +369,7 @@ for i in 1:nworkers() end @test sort(pids) == sort(workers()) +@info "passed 12" # test getindex on Futures and RemoteChannels function test_indexing(rr) @@ -346,8 +384,10 @@ test_indexing(Future(id_other)) test_indexing(RemoteChannel()) test_indexing(RemoteChannel(id_other)) +@info "passed 13" + # Test ser/deser to non-ClusterSerializer objects. -function test_regular_io_ser(ref::Distributed.AbstractRemoteRef) +function test_regular_io_ser(ref::MultiscaleCluster.AbstractRemoteRef) io = IOBuffer() serialize(io, ref) seekstart(io) @@ -396,6 +436,8 @@ s = [randstring() for x in 1:10^5] num_small_requests = 10000 @test fill(id_other, num_small_requests) == [remotecall_fetch(myid, id_other) for i in 1:num_small_requests] +@info "passed 14" + # test parallel sends of large arrays from multiple tasks to the same remote worker ntasks = 10 rr_list = [Channel(1) for x in 1:ntasks] @@ -444,6 +486,9 @@ test_channel(RemoteChannel(()->Channel(10))) c=Channel{Int}(1) @test_throws MethodError put!(c, "Hello") +@info "passed 15" + + # test channel iterations function test_iteration(in_c, out_c) t=@async for v in in_c @@ -545,6 +590,8 @@ for id in [id_other, id_me] end end +@info "passed 16" + # make sure the stackframe from the remote error can be serialized let ex try @@ -567,7 +614,9 @@ let ex end # pmap tests. Needs at least 4 processors dedicated to the below tests. Which we currently have -# since the distributed tests are now spawned as a separate set. +# since the MultiscaleCluster tests are now spawned as a separate set. + +@info "passed 17" # Test all combinations of pmap keyword args. pmap_args = [ @@ -660,6 +709,7 @@ generic_map_tests(pmap_fallback) run_map_equivalence_tests(pmap) @test pmap(uppercase, "Hello World!") == map(uppercase, "Hello World!") +@info "passed 18" # Simple test for pmap throws error let error_thrown = false @@ -679,7 +729,7 @@ end n = 10 as = [rand(4,4) for i in 1:n] bs = deepcopy(as) -cs = collect(Distributed.pgenerate(x->(sleep(rand()*0.1); svd(x)), bs)) +cs = collect(MultiscaleCluster.pgenerate(x->(sleep(rand()*0.1); svd(x)), bs)) svdas = map(svd, as) for i in 1:n @test cs[i].U ≈ svdas[i].U @@ -709,16 +759,16 @@ clear!(wp) @test length(wp.map_obj2ref) == 0 # default_worker_pool! tests -wp_default = Distributed.default_worker_pool() +wp_default = MultiscaleCluster.default_worker_pool() try local wp = CachingPool(workers()) - Distributed.default_worker_pool!(wp) + MultiscaleCluster.default_worker_pool!(wp) @test [1:100...] == pmap(x->x, wp, 1:100) @test !isempty(wp.map_obj2ref) clear!(wp) @test isempty(wp.map_obj2ref) finally - Distributed.default_worker_pool!(wp_default) + MultiscaleCluster.default_worker_pool!(wp_default) end # The below block of tests are usually run only on local development systems, since: @@ -742,8 +792,8 @@ if DoFullTest all_w = workers() # Test sending fake data to workers. The worker processes will print an # error message but should not terminate. - for w in Distributed.PGRP.workers - if isa(w, Distributed.Worker) + for w in MultiscaleCluster.PGRP().workers + if isa(w, MultiscaleCluster.Worker) local s = connect(w.config.host, w.config.port) write(s, randstring(32)) end @@ -769,6 +819,9 @@ if Sys.isunix() # aka have ssh remotecall_fetch(rmprocs, 1, new_pids) end + @info "passed 19" + + print("\n\nTesting SSHManager. 
A minimum of 4GB of RAM is recommended.\n") print("Please ensure: \n") print("1) sshd is running locally with passwordless login enabled.\n") @@ -844,6 +897,8 @@ let t = @task 42 @test_throws TaskFailedException(t) Base.wait(t) end +@info "passed 20" + # issue #8207 let A = Any[] @distributed (+) for i in (push!(A,1); 1:2) @@ -852,6 +907,8 @@ let A = Any[] @test length(A) == 1 end +@info "passed 21" + # issue #13168 function f13168(n) val = 0 @@ -871,9 +928,13 @@ let t = schedule(@task f13168(100)) @test isa(fetch(t), Float64) end +@info "passed 21.1" + # issue #13122 @test remotecall_fetch(identity, workers()[1], C_NULL) === C_NULL +@info "passed 21.2" + # issue #11062 function t11062() @async v11062 = 1 @@ -882,11 +943,14 @@ end @test t11062() == 2 +@info "passed 21.3" + # issue #15406 v15406 = remotecall_wait(() -> 1, id_other) fetch(v15406) remotecall_wait(fetch, id_other, v15406) +@info "passed 21.4" # issue #43396 # Covers the remote fetch where the value returned is `nothing` @@ -896,6 +960,7 @@ remotecall_wait(fetch, id_other, v15406) @test nothing === fetch(remotecall(() -> nothing, workers()[1])) @test 10 === fetch(remotecall(() -> 10, workers()[1])) +@info "passed 21.5" # Test various forms of remotecall* invocations @@ -918,19 +983,30 @@ for tid in [id_other, id_me, default_worker_pool()] test_f_args(15, f_args, tid, 1, 2; kw1=4, kw2=8) end + +@info "passed 21.6.2" + +f=Future(id_other) +remote_do(fut->put!(fut, myid()), id_other, f) +@test fetch(f) == id_other + +@info "passed 21.6.1" + # Test remote_do f=Future(id_me) +@info "passed 21.6.1.1" remote_do(fut->put!(fut, myid()), id_me, f) +@info "passed 21.6.1.2" @test fetch(f) == id_me -f=Future(id_other) -remote_do(fut->put!(fut, myid()), id_other, f) -@test fetch(f) == id_other +@info "passed 21.7" # Github issue #29932 rc_unbuffered = RemoteChannel(()->Channel{Vector{Float64}}(0)) @test eltype(rc_unbuffered) == Vector{Float64} +@info "passed 21.8" + @async begin # Trigger direct write (no buffering) of largish array array_sz = Int(Base.SZ_UNBUFFERED_IO/8) + 1 @@ -948,51 +1024,61 @@ end return :OK end, id_other, rc_unbuffered) === :OK +@info "passed 21.9" + # github issue 33972 rc_unbuffered_other = RemoteChannel(()->Channel{Int}(0), id_other) close(rc_unbuffered_other) try; take!(rc_unbuffered_other); catch; end -@test !remotecall_fetch(rc -> islocked(Distributed.lookup_ref(remoteref_id(rc)).synctake), +@test !remotecall_fetch(rc -> islocked(MultiscaleCluster.lookup_ref(remoteref_id(rc)).synctake), id_other, rc_unbuffered_other) +@info "passed 21.10" + # github PR #14456 n = DoFullTest ? 
6 : 5 for i = 1:10^n fetch(@spawnat myid() myid()) end +@info "passed 21.11" + # issue #15451 @test remotecall_fetch(x->(y->2y)(x)+1, workers()[1], 3) == 7 +@info "passed 21.12" + # issue #16091 mutable struct T16091 end -wid = workers()[1] -try - remotecall_fetch(()->T16091, wid) - @test "unreachable" === true +wid0 = workers()[1] +@test try + remotecall_fetch(()->T16091, wid0) + false catch ex - ex = ((ex::RemoteException).captured::CapturedException).ex - @test (ex::UndefVarError).var === :T16091 + ((ex::RemoteException).captured::CapturedException).ex === UndefVarError(:T16091) end -try - remotecall_fetch(identity, wid, T16091) - @test "unreachable" === true +@test try + remotecall_fetch(identity, wid0, T16091) + false catch ex - ex = ((ex::RemoteException).captured::CapturedException).ex - @test (ex::UndefVarError).var === :T16091 + ((ex::RemoteException).captured::CapturedException).ex === UndefVarError(:T16091) end f16091a() = 1 -remotecall_fetch(()->eval(:(f16091a() = 2)), wid) -@test remotecall_fetch(f16091a, wid) === 2 -@test remotecall_fetch((myid)->remotecall_fetch(f16091a, myid), wid, myid()) === 1 +remotecall_fetch(()->eval(:(f16091a() = 2)), wid0) +@test remotecall_fetch(f16091a, wid0) === 2 +@test remotecall_fetch((myid)->remotecall_fetch(f16091a, myid), wid0, myid()) === 1 + +@info "passed 21.13" # these will only heisen-fail, since it depends on the gensym counter collisions: f16091b = () -> 1 -remotecall_fetch(()->eval(:(f16091b = () -> 2)), wid) +remotecall_fetch(()->eval(:(f16091b = () -> 2)), wid0) @test remotecall_fetch(f16091b, 2) === 1 # Global anonymous functions are over-written... -@test remotecall_fetch((myid)->remotecall_fetch(f16091b, myid), wid, myid()) === 1 +@test remotecall_fetch((myid)->remotecall_fetch(f16091b, myid), wid0, myid()) === 1 + +@info "passed 21.14" # ...while local anonymous functions are by definition, local. let @@ -1004,9 +1090,11 @@ let f16091c = () -> 2 remotecall_fetch(f16091c, myid) end - end, wid, myid()) === 2 + end, wid0, myid()) === 2 end +@info "passed 21.15" + # issue #16451 rng=RandomDevice() retval = @distributed (+) for _ in 1:10 @@ -1020,18 +1108,25 @@ retval = @distributed (+) for _ in 1:10 end @test retval > 0.0 && retval < 10.0 +@info "passed 21.16" + # serialization tests wrkr1 = workers()[1] wrkr2 = workers()[end] @test remotecall_fetch(p->remotecall_fetch(myid, p), wrkr1, wrkr2) == wrkr2 +@info "passed 21.17" + # Send f to wrkr1 and wrkr2. 
Then try calling f on wrkr2 from wrkr1 f_myid = ()->myid() @test wrkr1 == remotecall_fetch(f_myid, wrkr1) @test wrkr2 == remotecall_fetch(f_myid, wrkr2) @test wrkr2 == remotecall_fetch((f, p)->remotecall_fetch(f, p), wrkr1, f_myid, wrkr2) + +@info "passed 22" + # Deserialization error recovery test # locally defined module, but unavailable on workers module LocalFoo @@ -1052,7 +1147,7 @@ end # Test calling @everywhere from a module not defined on the workers module LocalBar - using Distributed + using MultiscaleCluster bar() = @everywhere new_bar()=myid() end LocalBar.bar() @@ -1107,15 +1202,17 @@ let (p, p2) = filter!(p -> p != myid(), procs()) test_throw_on([p2, p], "everywhere on p and p2") end +@info "passed 23" + # Test addprocs enable_threaded_blas parameter function get_remote_num_threads(processes_added) return [remotecall_fetch(BLAS.get_num_threads, proc_id) for proc_id in processes_added] end -function test_blas_config(pid, expected) - for worker in Distributed.PGRP.workers - if worker.id == pid +function test_blas_config(pid, expected; role=:default) + for worker in MultiscaleCluster.PGRP(role=role).workers + if MultiscaleCluster.wid(worker,role=role) == pid @test worker.config.enable_threaded_blas == expected return end @@ -1163,6 +1260,8 @@ function test_add_procs_threaded_blas() end test_add_procs_threaded_blas() +@info "passed 24" + #19687 if false ### TODO: The logic that is supposed to implement this is racy - Disabled for now # ensure no race conditions between rmprocs and addprocs @@ -1197,16 +1296,18 @@ end end # Test addprocs/rmprocs from master node only -for f in [ ()->addprocs(1; exeflags=test_exeflags), ()->rmprocs(workers()) ] - local f - try - remotecall_fetch(f, id_other) - error("Unexpected") - catch ex - @test isa(ex, RemoteException) - @test ex.captured.ex.msg == "Only process 1 can add and remove workers" - end -end +#for f in [ ()->addprocs(1; exeflags=test_exeflags), ()->rmprocs(workers()) ] +# local f +# try +# remotecall_fetch(f, id_other) +# error("Unexpected") +# catch ex +# @test isa(ex, RemoteException) +# @test ex.captured.ex.msg == "Only process 1 can add and remove workers" +# end +#end + +@info "passed 25" # Test the following addprocs error conditions # - invalid host name - github issue #20372 @@ -1273,6 +1374,8 @@ for (addp_testf, expected_errstr, env) in testruns end end +@info "passed 26" + # Auto serialization of globals from Main. 
# bitstypes @@ -1300,9 +1403,9 @@ global v4 = v3 # Global references to Types and Modules should work if they are locally defined global v5 = Int -global v6 = Distributed +global v6 = MultiscaleCluster @test remotecall_fetch(()->v5, id_other) === Int -@test remotecall_fetch(()->v6, id_other) === Distributed +@test remotecall_fetch(()->v6, id_other) === MultiscaleCluster struct FooStructLocal end module FooModLocal end @@ -1341,6 +1444,8 @@ v31252 = :b v31252 = :a @test :a == @fetchfrom id_other v31252 +@info "passed 27" + # Test that a global is not being repeatedly serialized when # a) referenced multiple times in the closure @@ -1435,12 +1540,14 @@ wrapped_var_ser_tests() global ids_cleanup = fill(1., 6) global ids_func = ()->ids_cleanup -clust_ser = (Distributed.worker_from_id(id_other)).w_serializer +clust_ser = (MultiscaleCluster.worker_from_id(id_other)).w_serializer @test remotecall_fetch(ids_func, id_other) == ids_cleanup +@info "passed 29" + # TODO Add test for cleanup from `clust_ser.glbs_in_tnobj` -# reported github issues - Mostly tests with globals and various distributed macros +# reported github issues - Mostly tests with globals and various MultiscaleCluster macros #2669, #5390 v2669=10 @test fetch(@spawnat :any (1+v2669)) == 11 @@ -1487,6 +1594,8 @@ let @test remotecall_fetch(Float64, id_other, 1) == Float64(1) end +@info "passed 30" + #19463 function foo19463() w1 = workers()[1] @@ -1543,6 +1652,8 @@ syms = setup_syms(3, workers()) clear!(syms, workers()) test_clear(syms, workers()) +@info "passed 31" + # Test partial recovery from a deserialization error in CapturedException try expr = quote @@ -1557,10 +1668,11 @@ try catch ex @test isa(ex.captured.ex.exceptions[1].ex, ErrorException) @test occursin("BoundsError", ex.captured.ex.exceptions[1].ex.msg) - ex = ex.captured.ex.exceptions[2].ex - @test (ex::UndefVarError).var === :DontExistOn1 + @test ex.captured.ex.exceptions[2].ex == UndefVarError(:DontExistOn1) end +@info "passed 32" + let # creates a new worker in a different folder and tries to include file tmp_dir = mktempdir() @@ -1605,10 +1717,10 @@ function launch(manager::WorkerArgTester, params::Dict, launched::Array, c::Cond exename = params[:exename] exeflags = params[:exeflags] - cmd = `$exename $exeflags --bind-to $(Distributed.LPROC.bind_addr) $(manager.worker_opt)` + cmd = `$exename $exeflags --bind-to $(MultiscaleCluster.LPROC.bind_addr) $(manager.worker_opt)` cmd = pipeline(detach(setenv(cmd, dir=dir))) io = open(cmd, "r+") - manager.write_cookie && Distributed.write_cookie(io) + manager.write_cookie && MultiscaleCluster.write_cookie(io) wconfig = WorkerConfig() wconfig.process = io @@ -1634,6 +1746,8 @@ cluster_cookie("foobar") # custom cookie npids = addprocs_with_testenv(WorkerArgTester(`--worker=foobar`, false)) @test remotecall_fetch(myid, npids[1]) == npids[1] +@info "passed 33" + # tests for start_worker options to retain stdio (issue #31035) struct RetainStdioTester <: ClusterManager close_stdin::Bool @@ -1645,8 +1759,8 @@ function launch(manager::RetainStdioTester, params::Dict, launched::Array, c::Co exename = params[:exename] exeflags = params[:exeflags] - jlcmd = "using Distributed; start_worker(\"\"; close_stdin=$(manager.close_stdin), stderr_to_stdout=$(manager.stderr_to_stdout));" - cmd = detach(setenv(`$exename $exeflags --bind-to $(Distributed.LPROC.bind_addr) -e $jlcmd`, dir=dir)) + jlcmd = "using MultiscaleCluster; start_worker(\"\"; close_stdin=$(manager.close_stdin), stderr_to_stdout=$(manager.stderr_to_stdout));" + cmd = 
detach(setenv(`$exename $exeflags --bind-to $(MultiscaleCluster.LPROC.bind_addr) -e $jlcmd`, dir=dir)) proc = open(cmd, "r+") wconfig = WorkerConfig() @@ -1682,21 +1796,21 @@ p1,p2 = addprocs_with_testenv(2) @test fill(2.,2) == remotecall_fetch(f22865, p1, p2) rmprocs(p1, p2) -function reuseport_tests() +function reuseport_tests(;role = :default) # Run the test on all processes. results = asyncmap(procs()) do p remotecall_fetch(p) do ports_lower = [] # ports of pids lower than myid() ports_higher = [] # ports of pids higher than myid() - for w in Distributed.PGRP.workers - w.id == myid() && continue + for w in MultiscaleCluster.PGRP(role=role).workers + MultiscaleCluster.wid(w,role=role) == myid() && continue port = Sockets._sockname(w.r_stream, true)[2] - if (w.id == 1) + if (MultiscaleCluster.wid(w,role=role) == 1) # master connects to workers push!(ports_higher, port) - elseif w.id < myid() + elseif MultiscaleCluster.wid(w,role=role) < myid(role=role) push!(ports_lower, port) - elseif w.id > myid() + elseif MultiscaleCluster.wid(w,role=role) > myid(role=role) push!(ports_higher, port) end end @@ -1707,7 +1821,7 @@ function reuseport_tests() return 0 end end - return myid() + return myid(role=role) end end @@ -1737,40 +1851,43 @@ for T in (UInt8, Int8, UInt16, Int16, UInt32, Int32, UInt64) @test n == 55 end +@info "passed 34" + + # issue #28966 let code = """ - import Distributed - Distributed.addprocs(1) - Distributed.@everywhere f() = myid() - for w in Distributed.workers() - @assert Distributed.remotecall_fetch(f, w) == w + import MultiscaleCluster + MultiscaleCluster.addprocs(1) + MultiscaleCluster.@everywhere f() = myid() + for w in MultiscaleCluster.workers() + @assert MultiscaleCluster.remotecall_fetch(f, w) == w end """ @test success(`$(Base.julia_cmd()) --startup-file=no -e $code`) end -# PR 32431: tests for internal Distributed.head_and_tail -let (h, t) = Distributed.head_and_tail(1:10, 3) +# PR 32431: tests for internal MultiscaleCluster.head_and_tail +let (h, t) = MultiscaleCluster.head_and_tail(1:10, 3) @test h == 1:3 @test collect(t) == 4:10 end -let (h, t) = Distributed.head_and_tail(1:10, 0) +let (h, t) = MultiscaleCluster.head_and_tail(1:10, 0) @test h == [] @test collect(t) == 1:10 end -let (h, t) = Distributed.head_and_tail(1:3, 5) +let (h, t) = MultiscaleCluster.head_and_tail(1:3, 5) @test h == 1:3 @test collect(t) == [] end -let (h, t) = Distributed.head_and_tail(1:3, 3) +let (h, t) = MultiscaleCluster.head_and_tail(1:3, 3) @test h == 1:3 @test collect(t) == [] end -let (h, t) = Distributed.head_and_tail(Int[], 3) +let (h, t) = MultiscaleCluster.head_and_tail(Int[], 3) @test h == [] @test collect(t) == [] end -let (h, t) = Distributed.head_and_tail(Int[], 0) +let (h, t) = MultiscaleCluster.head_and_tail(Int[], 0) @test h == [] @test collect(t) == [] end @@ -1806,7 +1923,7 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp "TMPDIR" => dirname(tmp), ) setupcode = """ - using Distributed, Test + using MultiscaleCluster, Test @everywhere begin depot_path() = DEPOT_PATH load_path() = LOAD_PATH @@ -1846,7 +1963,7 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp # Pkg.activate(...) 
activateish = """ Base.ACTIVE_PROJECT[] = $(repr(project)) - using Distributed + using MultiscaleCluster addprocs(1) """ cmd = setenv(`$(julia) -e $(activateish * testcode * extracode)`, env) @@ -1859,7 +1976,7 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp append!(empty!(LOAD_PATH), l) """ addcode = """ - using Distributed + using MultiscaleCluster addprocs(1) # after shuffling """ extracode = """ @@ -1881,7 +1998,7 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp @test success(cmd) # Passing env or exeflags to addprocs(...) to override defaults envcode = """ - using Distributed + using MultiscaleCluster project = mktempdir() env = Dict( "JULIA_LOAD_PATH" => string(LOAD_PATH[1], $(repr(pathsep)), "@stdlib"), @@ -1903,6 +2020,8 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp @test success(cmd) end end +@info "passed 35" + include("splitrange.jl") # Clear all workers for timeout tests (issue #45785) @@ -1925,7 +2044,12 @@ begin end end +@info "passed 36" + # Run topology tests last after removing all workers, since a given # cluster at any time only supports a single topology. rmprocs(workers()) include("topology.jl") + +@info "end test" + diff --git a/test/managers.jl b/test/managers.jl index 7971222..c6f2cdb 100644 --- a/test/managers.jl +++ b/test/managers.jl @@ -1,9 +1,9 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license using Test -using Distributed +using MultiscaleCluster using Sockets -using Distributed: parse_machine, SSHManager, LocalManager +using MultiscaleCluster: parse_machine, SSHManager, LocalManager @test parse_machine("127.0.0.1") == ("127.0.0.1", nothing) @test parse_machine("127.0.0.1:80") == ("127.0.0.1", 80) diff --git a/test/runtests.jl b/test/runtests.jl index d34d07c..6bd23ab 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,7 +8,7 @@ disttestfile = joinpath(@__DIR__, "distributed_exec.jl") cmd = `$test_exename $test_exeflags $disttestfile` if !success(pipeline(cmd; stdout=stdout, stderr=stderr)) && ccall(:jl_running_on_valgrind,Cint,()) == 0 - error("Distributed test failed, cmd : $cmd") + error("MultiscaleCluster test failed, cmd : $cmd") end include("managers.jl") diff --git a/test/splitrange.jl b/test/splitrange.jl index 1cb12e1..d9f50e6 100644 --- a/test/splitrange.jl +++ b/test/splitrange.jl @@ -1,8 +1,8 @@ # This file is a part of Julia. 
License is MIT: https://julialang.org/license

 using Test
-using Distributed
-using Distributed: splitrange
+using MultiscaleCluster
+using MultiscaleCluster: splitrange

 @test splitrange(1, 11, 1) == Array{UnitRange{Int64},1}([1:11])
 @test splitrange(0, 10, 1) == Array{UnitRange{Int64},1}([0:10])
diff --git a/test/topology.jl b/test/topology.jl
index fc96932..378e7cd 100644
--- a/test/topology.jl
+++ b/test/topology.jl
@@ -43,11 +43,11 @@ function launch(manager::TopoTestManager, params::Dict, launched::Array, c::Cond
     exename = params[:exename]
     exeflags = params[:exeflags]

-    cmd = `$exename $exeflags --bind-to $(Distributed.LPROC.bind_addr) --worker`
+    cmd = `$exename $exeflags --bind-to $(MultiscaleCluster.LPROC.bind_addr) --worker`
     cmd = pipeline(detach(setenv(cmd, dir=dir)))
     for i in 1:manager.np
         io = open(cmd, "r+")
-        Distributed.write_cookie(io)
+        MultiscaleCluster.write_cookie(io)

         wconfig = WorkerConfig()
         wconfig.process = io
@@ -98,8 +98,8 @@ remove_workers_and_test()
 # test `lazy` connection setup
 function def_count_conn()
     @everywhere function count_connected_workers()
-        count(x -> isa(x, Distributed.Worker) && isdefined(x, :r_stream) && isopen(x.r_stream),
-              Distributed.PGRP.workers)
+        count(x -> isa(x, MultiscaleCluster.Worker) && isdefined(x, :r_stream) && isopen(x.r_stream),
+              MultiscaleCluster.PGRP().workers)
     end
 end

From 7bc392014b089f4aedbe3c5d37da7cd548d5881a Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <102302676+decarvalhojunior-fh@users.noreply.github.com>
Date: Thu, 9 Nov 2023 10:31:35 -0300
Subject: [PATCH 03/54] Update README.md

Including a note about this extension for multiscale parallelism.
---
 README.md | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 76f6355..4866802 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,34 @@
-# Distributed
+# Distributed (with a multiscale parallelism extension)

 The `Distributed` package provides functionality for creating and controlling multiple Julia processes remotely, and for performing distributed and parallel computing. It uses network sockets or other supported interfaces to communicate between Julia processes, and relies on Julia's `Serialization` stdlib package to transform Julia objects into a format that can be transferred between processes efficiently. It provides a full set of utilities to create and destroy new Julia processes and add them to a "cluster" (a collection of Julia processes connected together), as well as functions to perform Remote Procedure Calls (RPC) between the processes within a cluster. See [`API`](@ref) for details.

 This package ships as part of the Julia stdlib.

+> [!NOTE]
+> This repository is a fork of the original [`Distributed`](https://github.com/JuliaLang/Distributed.jl) package for developing ideas behind the support of _multiscale parallelism_ in Julia. In broad terms, the extension allows worker processes to execute the `addprocs` operation, so that a worker process may also play the role of a master process with respect to the set of worker processes it creates by invoking `addprocs`. For that, all `Distributed` operations listed below are extended with a keyword parameter `role`, with three possible values: `:default` (the default argument), `:master`, and `:worker`. So, a worker that has created processes by means of `addprocs` may execute operations as:
+> * a ***worker process*** by using `role = :worker`, for interacting with the master process that created it, as well as with other workers; or
+> * a ***master process*** by using `role = :master`, for interacting with the workers it created.
+>
+> It is important to note that these modifications to the API do not affect usual `Distributed` programs.
+>
+> Multiscale parallelism may help programmers in at least two scenarios:
+> * to deploy _multicluster computations_, i.e. parallel computations employing multiple clusters, where the parallel programming patterns and tools at the multicluster and cluster levels are assumed to be distinct;
+> * to better support _multilevel parallel programming_ patterns.
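+>
+> For illustration, here is a minimal sketch of the intended usage (the process ids and worker counts are hypothetical, and error handling is omitted):
+> ```julia
+> using Distributed                 # this fork
+> addprocs(2)                       # the root master creates workers 2 and 3
+>
+> # Worker 2 becomes, in turn, the master of a nested set of workers:
+> fetch(@spawnat 2 begin
+>     addprocs(2)                   # executed on worker 2
+>     ws = workers(role = :master)  # the workers created by worker 2
+>     remotecall_fetch(myid, ws[1]; role = :master)
+> end)
+> ```
+> (The macros accept the `role` keyword as a leading argument, e.g. `@everywhere role = :master f() = myid()`.)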
+
 ## Using development versions of this package

 To use a newer version of this package, you need to build Julia from scratch. The build process is the same as any other build except that you need to change the commit used in `stdlib/Distributed.version`.

From 0bdc7068da0bc69c4bb7fcda292f91e0ad88d0d8 Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior
Date: Thu, 9 Nov 2023 10:38:53 -0300
Subject: [PATCH 04/54] all occurrences: :manager -> :master

---
 src/cluster.jl    | 86 +++++++++++++++++++++++------------------------
 src/macros.jl     |  4 +--
 src/managers.jl   |  6 ++--
 src/messages.jl   |  4 +--
 src/remotecall.jl | 17 +++------
 src/workerpool.jl |  6 ++--
 6 files changed, 58 insertions(+), 65 deletions(-)

diff --git a/src/cluster.jl b/src/cluster.jl
index 071863a..71369a1 100644
--- a/src/cluster.jl
+++ b/src/cluster.jl
@@ -168,7 +168,7 @@ function check_worker_state(w::Worker; role= :default)
         t = @async exec_conn_func(w; role=role)
     else
         # route request via node 1
-        t = @async remotecall_fetch((p,to_id) -> remotecall_fetch((to_id, role) -> exec_conn_func(to_id, role = role), p, to_id, p == 1 ? :manager : :worker; role = role), 1, wid(w, role=role), myid(role=role))
+        t = @async remotecall_fetch((p,to_id) -> remotecall_fetch((to_id, role) -> exec_conn_func(to_id, role = role), p, to_id, p == 1 ? :master : :worker; role = role), 1, wid(w, role=role), myid(role=role))
     end
     errormonitor(t)
     wait_for_conn(w; role=role)
@@ -214,16 +214,16 @@ mutable struct LocalProcess
 end

 function wid(lp::LocalProcess; role= :default)
-    if role == :manager
+    if role == :master
         return lp.id1
     elseif role == :worker
         return lp.id0
     elseif role == :default && myrole() == :master
-        return lp.id1   # as :manager
+        return lp.id1   # as :master
     elseif role == :default && myrole() == :worker
         return lp.id0   # as :worker
     else
-        return lp.id1   # as :manager
+        return lp.id1   # as :master
         #throw("unexpected use of role=:default (wid)")
     end

@@ -468,9 +468,9 @@ function addprocs(manager::ClusterManager; kwargs...)

     try
         if myrole() == :worker
-            myrole!(:manager_worker)
+            myrole!(:master)
         end
-        PGRP(role=:manager).level = PGRP(role=:worker).level + 1
+        PGRP(role=:master).level = PGRP(role=:worker).level + 1

         addprocs_locked(manager::ClusterManager; kwargs...)
     finally
@@ -480,9 +480,9 @@ end
 function addprocs_locked(manager::ClusterManager; kwargs...)

     params = merge(default_addprocs_params(manager), Dict{Symbol,Any}(kwargs))
-    topology(Symbol(params[:topology]); role = :manager)
+    topology(Symbol(params[:topology]); role = :master)

-    pgm = PGRP(role = :manager)
+    pgm = PGRP(role = :master)

     if pgm.topology !== :all_to_all
         params[:lazy] = false
     end

@@ -490,8 +490,8 @@
if pgm.lazy === nothing || nprocs() == 1 pgm.lazy = params[:lazy] - elseif isclusterlazy(role = :manager) != params[:lazy] - throw(ArgumentError(string("Active workers with lazy=", isclusterlazy(role = :manager), + elseif isclusterlazy(role = :master) != params[:lazy] + throw(ArgumentError(string("Active workers with lazy=", isclusterlazy(role = :master), ". Cannot set lazy=", params[:lazy]))) end @@ -536,9 +536,9 @@ function addprocs_locked(manager::ClusterManager; kwargs...) # Since all worker-to-worker setups may not have completed by the time this # function returns to the caller, send the complete list to all workers. # Useful for nprocs(), nworkers(), etc to return valid values on the workers. - all_w = workers(role = :manager) + all_w = workers(role = :master) for pid in all_w - remote_do((all_w, role) -> set_valid_processes(all_w, role = role), pid, all_w, pid == 1 ? :manager : :worker; role = :manager) + remote_do((all_w, role) -> set_valid_processes(all_w, role = role), pid, all_w, pid == 1 ? :master : :worker; role = :master) end sort!(launched_q) @@ -593,7 +593,7 @@ function launch_n_additional_processes(manager, frompid, fromconfig, cnt, launch exeflags = something(fromconfig.exeflags, ``) cmd = `$exename $exeflags` - new_addresses = remotecall_fetch(launch_additional, frompid, cnt, cmd; role = :manager) + new_addresses = remotecall_fetch(launch_additional, frompid, cnt, cmd; role = :master) for address in new_addresses (bind_addr, port) = address @@ -607,7 +607,7 @@ function launch_n_additional_processes(manager, frompid, fromconfig, cnt, launch let wconfig=wconfig @async begin pid = create_worker(manager, wconfig) - remote_do(redirect_output_from_additional_worker, frompid, pid, port; role = :manager) + remote_do(redirect_output_from_additional_worker, frompid, pid, port; role = :master) push!(launched_q, pid) end end @@ -616,7 +616,7 @@ function launch_n_additional_processes(manager, frompid, fromconfig, cnt, launch end function create_worker(manager, wconfig) - role = :manager + role = :master # only node 1 can add new nodes, since nobody else has the full list of address:port @assert myid(role=role) == 1 @@ -651,7 +651,7 @@ function create_worker(manager, wconfig) # Start a new task to handle inbound messages from connected worker in master. # Also calls `wait_connected` on TCP streams. - process_messages(w.r_stream, w.w_stream, false; role = :manager) + process_messages(w.r_stream, w.w_stream, false; role = :master) # send address information of all workers to the new worker. # Cluster managers set the address of each worker in `WorkerConfig.connect_at`. @@ -829,7 +829,7 @@ const _PGRP0 = ProcessGroup([]) const _PGRP1 = ProcessGroup([]) function PGRP(;role= :default) - if role == :manager + if role == :master # @info "$(role) / PGRP1 !" return _PGRP1 elseif role == :worker @@ -838,13 +838,13 @@ function PGRP(;role= :default) # elseif role == :default && _PGRP0.level == 0 elseif role == :default && myrole() == :master # @info "$(role) / PGRP1 !" - return _PGRP1 # as :manager + return _PGRP1 # as :master # elseif role == :default && _PGRP0.level > 0 elseif role == :default && myrole() == :worker # @info "$(role) / PGRP0 !" 
return _PGRP0 # as :worker else - return _PGRP1 # as :manager + return _PGRP1 # as :master # throw("unexpected use of role = $role (PGRP) - $(myrole())") end end @@ -862,7 +862,7 @@ end isclusterlazy(; role= :default) = something(PGRP(role = role).lazy, false) -get_bind_addr(pid::Integer) = get_bind_addr(worker_from_id(pid; role = :manager)) # always called as manager +get_bind_addr(pid::Integer) = get_bind_addr(worker_from_id(pid; role = :master)) # always called as manager get_bind_addr(w::LocalProcess) = LPROC.bind_addr # always called as manager function get_bind_addr(w::Worker) role = :worker # always called as worker @@ -888,7 +888,7 @@ function Map_pid_wrkr(;role= :default) # @info ("_map_pid_wrkr_0", _map_pid_wrkr_0, "end") # @info ("_map_pid_wrkr_1", _map_pid_wrkr_1, "end") pg = PGRP(role = role) - if role == :manager + if role == :master # @info "Map_pid_wrkr_1 ", role return _map_pid_wrkr_1 elseif role == :worker @@ -896,12 +896,12 @@ function Map_pid_wrkr(;role= :default) return _map_pid_wrkr_0 elseif role == :default && myrole() == :master # @info "Map_pid_wrkr_1 ", role, pg.level - return _map_pid_wrkr_1 # as :manager + return _map_pid_wrkr_1 # as :master elseif role == :default && myrole() == :worker # @info "Map_pid_wrkr_0 ", role, pg.level return _map_pid_wrkr_0 # as :worker else - return _map_pid_wrkr_1 # as :manager + return _map_pid_wrkr_1 # as :master # throw("unexpected use of role = :default (Map_pid_wrkr)") end end @@ -928,32 +928,32 @@ julia> remotecall_fetch(() -> myid(), 4) ``` """ function myid(;role= :default) - if role == :manager + if role == :master return LPROC.id1 elseif role == :worker return LPROC.id0 elseif role == :default && myrole() == :master - return LPROC.id1 # as :manager + return LPROC.id1 # as :master elseif role == :default && myrole() == :worker return LPROC.id0 # as :worker else - return LPROC.id1 # as :manager + return LPROC.id1 # as :master #throw("unexpected use of role := default (myid) - $(myrole())") end end function myid!(id;role= :default) - if role == :manager + if role == :master LPROC.id1 = id elseif role == :worker LPROC.id0 = id elseif role == :default && myrole() == :master - LPROC.id1 = id # as :manager + LPROC.id1 = id # as :master elseif role == :default && myrole() == :worker LPROC.id0 = id # as :worker else - LPROC.id1 = id # as :manager + LPROC.id1 = id # as :master #throw("unexpected use of role := default (myid!)") end @@ -1075,7 +1075,7 @@ function procs(pid::Integer; role= :default) Int[wid(x, role=role) for x in filter(w -> get_bind_addr(w) == ipatpid, all_workers)] end else - remotecall_fetch(pid -> procs(pid, role = :manager), 1; role = role) + remotecall_fetch(pid -> procs(pid, role = :master), 1; role = role) end end @@ -1140,7 +1140,7 @@ julia> workers() 6 ``` """ -function rmprocs(pids...; role = :default, waitfor=typemax(Int)) # supposed to be called always as :manager +function rmprocs(pids...; role = :default, waitfor=typemax(Int)) # supposed to be called always as :master # cluster_mgmt_from_master_check() pids = vcat(pids...) 
@@ -1270,13 +1270,13 @@ function deregister_worker(pg, pid; role= :default) end end - if myid(role=role) == 1 && #=role === :manager &&=# isdefined(w, :config) + if myid(role=role) == 1 && #=role === :master &&=# isdefined(w, :config) # Notify the cluster manager of this workers death manage(w.manager, wid(w, role=role), w.config, :deregister) if pg.topology !== :all_to_all || isclusterlazy(role = role) for rpid in workers(role=role) try - remote_do((pid,role) -> deregister_worker(pid, role=role), rpid, pid, rpid == 1 ? :manager : :worker; role = role) + remote_do((pid,role) -> deregister_worker(pid, role=role), rpid, pid, rpid == 1 ? :master : :worker; role = role) catch end end @@ -1312,11 +1312,11 @@ end function interrupt(pid::Integer) - @assert myid(role = :manager) == 1 - map_pid_wrkr = Map_pid_wrkr(role = :manager) + @assert myid(role = :master) == 1 + map_pid_wrkr = Map_pid_wrkr(role = :master) w = map_pid_wrkr[pid] if isa(w, Worker) - manage(w.manager, wid(w, role=:manager), w.config, :interrupt) + manage(w.manager, wid(w, role=:master), w.config, :interrupt) end return end @@ -1335,8 +1335,8 @@ interrupt(pids::Integer...) = interrupt([pids...]) Interrupt the current executing task on the specified workers. This is equivalent to pressing Ctrl-C on the local machine. If no arguments are given, all workers are interrupted. """ -function interrupt(pids::AbstractVector=workers(role = :manager)) - @assert myid(role = :manager) == 1 +function interrupt(pids::AbstractVector=workers(role = :master)) + @assert myid(role = :master) == 1 @sync begin for pid in pids @async interrupt(pid) @@ -1349,7 +1349,7 @@ wp_bind_addr(p) = p.config.bind_addr function check_same_host(pids; role= :default) if myid(role = role) != 1 - return remotecall_fetch(pids -> check_same_host(pids, role = :manager), 1, pids; role = role) + return remotecall_fetch(pids -> check_same_host(pids, role = :master), 1, pids; role = role) else # We checkfirst if all test pids have been started using the local manager, # else we check for the same bind_to addr. This handles the special case @@ -1417,7 +1417,7 @@ let inited = false if !inited inited = true push!(Base.package_callbacks, _require_callback) - atexit(() -> terminate_all_workers(role = :manager)) # TO CHECK (role argument ???) + atexit(() -> terminate_all_workers(role = :master)) # TO CHECK (role argument ???) init_bind_addr() cluster_cookie(randstring(HDR_COOKIE_LEN)) end @@ -1426,7 +1426,7 @@ let inited = false end function init_parallel() - start_gc_msgs_task(role = :manager) # TO CHECK + start_gc_msgs_task(role = :master) # TO CHECK start_gc_msgs_task(role = :worker) # TO CHECK # start in "head node" mode, if worker, will override later. 
@@ -1434,9 +1434,9 @@ function init_parallel() global LPROC LPROC.id0 = 0 LPROC.id1 = 1 - @assert isempty(PGRP(role = :manager).workers) # TO CHECK + @assert isempty(PGRP(role = :master).workers) # TO CHECK @assert isempty(PGRP(role = :worker).workers) # TO CHECK - register_worker(LPROC; role = :manager) # TO CHECK + register_worker(LPROC; role = :master) # TO CHECK register_worker(LPROC; role = :worker) # TO CHECK end diff --git a/src/macros.jl b/src/macros.jl index 3ba168b..2503fdc 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -191,7 +191,7 @@ Similar to calling `remotecall_eval(Main, procs, expr)`, but with two extra feat """ macro everywhere(ex) procs = GlobalRef(@__MODULE__, :procs) - return esc(:($(MultiscaleCluster).@everywhere $procs(role = :manager) $ex)) + return esc(:($(MultiscaleCluster).@everywhere $procs(role = :master) $ex)) end macro everywhere(procs, ex) @@ -200,7 +200,7 @@ macro everywhere(procs, ex) $(isempty(imps) ? nothing : Expr(:toplevel, imps...)) # run imports locally first let ex = Expr(:toplevel, :(task_local_storage()[:SOURCE_PATH] = $(get(task_local_storage(), :SOURCE_PATH, nothing))), $(esc(Expr(:quote, ex)))), procs = $(esc(procs)) - remotecall_eval(Main, procs, ex; role = :manager) + remotecall_eval(Main, procs, ex; role = :master) end end end diff --git a/src/managers.jl b/src/managers.jl index 9a095f8..0096907 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -724,19 +724,19 @@ It should cause the remote worker specified by `pid` to exit. on `pid`. """ function kill(manager::ClusterManager, pid::Int, config::WorkerConfig) - remote_do(exit, pid; role = :manager) + remote_do(exit, pid; role = :master) nothing end function kill(manager::SSHManager, pid::Int, config::WorkerConfig) - remote_do(exit, pid; role = :manager) + remote_do(exit, pid; role = :master) cancel_ssh_tunnel(config) nothing end function kill(manager::LocalManager, pid::Int, config::WorkerConfig; exit_timeout = 15, term_timeout = 15) # First, try sending `exit()` to the remote over the usual control channels - remote_do(exit, pid; role = :manager) + remote_do(exit, pid; role = :master) timer_task = @async begin sleep(exit_timeout) diff --git a/src/messages.jl b/src/messages.jl index be491db..4f3f472 100644 --- a/src/messages.jl +++ b/src/messages.jl @@ -144,10 +144,10 @@ function flush_gc_msgs(w::Worker; role= :default) end end if add_msgs !== nothing - remote_do((add_msgs, role) -> add_clients(add_msgs, role = role), w, add_msgs, wid(w,role=role) == 1 ? :manager : :worker; role = role) + remote_do((add_msgs, role) -> add_clients(add_msgs, role = role), w, add_msgs, wid(w,role=role) == 1 ? :master : :worker; role = role) end if del_msgs !== nothing - remote_do((del_msgs, role) -> del_clients(del_msgs, role = role), w, del_msgs, wid(w,role=role) == 1 ? :manager : :worker; role = role) + remote_do((del_msgs, role) -> del_clients(del_msgs, role = role), w, del_msgs, wid(w,role=role) == 1 ? :master : :worker; role = role) end return end diff --git a/src/remotecall.jl b/src/remotecall.jl index 9f1312a..b071b7d 100644 --- a/src/remotecall.jl +++ b/src/remotecall.jl @@ -218,7 +218,7 @@ function isready(rr::Future; role= :default) return if rr.where == myid(role = role) isready(lookup_ref(rid; role = role).c) else - remotecall_fetch((rid, role)->isready(lookup_ref(rid; role = role).c), rr.where, rid, rr.where == 1 ? :manager : :worker; role = role) + remotecall_fetch((rid, role)->isready(lookup_ref(rid; role = role).c), rr.where, rid, rr.where == 1 ? 
:master : :worker; role = role) end end @@ -235,7 +235,7 @@ function isready(rr::RemoteChannel, args...; role= :default) return if rr.where == myid(role = role) isready(lookup_ref(rid; role = role).c, args...) else - remotecall_fetch(rid->isready(lookup_ref(rid; role = rr.where == 1 ? :manager : :worker).c, args...), rr.where, rid; role = role) + remotecall_fetch(rid->isready(lookup_ref(rid; role = rr.where == 1 ? :master : :worker).c, args...), rr.where, rid; role = role) end end @@ -446,7 +446,7 @@ Return a [`Future`](@ref). Keyword arguments, if any, are passed through to `f`. """ remotecall(f, id::Integer, args...; role= :default, kwargs...) = -# remotecall(f, worker_from_id(id; role = id == 1 ? :manager : :worker), args...; role = role, kwargs...) +# remotecall(f, worker_from_id(id; role = id == 1 ? :master : :worker), args...; role = role, kwargs...) remotecall(f, worker_from_id(id; role = role), args...; role = role, kwargs...) function remotecall_fetch(f, w::LocalProcess, args...; role= :default, kwargs...) @@ -565,14 +565,7 @@ function call_on_owner(f, rr::AbstractRemoteRef, args...; role= :default) if rr.where == myid(role = role) f(rid, args...) else - #remotecall_fetch((rid,role) -> f(rid, role = role, args...), rr.where, rid, rr.where==1 ? :manager : :worker; role = role) - remotecall_fetch((rid,role) -> f(rid, args...; role=role), rr.where, rid, rr.where==1 ? :manager : :worker; role = role) - - - #remotecall_fetch(rid -> f(rid, role = rr.where==1 ? :manager : :worker, args...), rr.where; role = role) - #remotecall_fetch(iiiii, rr.where, f, rid, rr.where==1 ? :manager : :worker, args...; role = role) -# remotecall_fetch(f, rr.where, rid, args...) - + remotecall_fetch((rid,role) -> f(rid, args...; role=role), rr.where, rid, rr.where==1 ? :master : :worker; role = role) end end @@ -801,7 +794,7 @@ function getindex(r::RemoteChannel, args...; role= :default) if r.where == myid(role = role) return getindex(fetch(r; role = role), args...#=; role = role=#) end - return remotecall_fetch((r,role) -> getindex(r, role = role, args...), r.where, r, r.where == 1 ? :manager : :worker; role = role) + return remotecall_fetch((r,role) -> getindex(r, role = role, args...), r.where, r, r.where == 1 ? :master : :worker; role = role) end function iterate(c::RemoteChannel, state=nothing; role= :default) diff --git a/src/workerpool.jl b/src/workerpool.jl index 133049f..261cde4 100644 --- a/src/workerpool.jl +++ b/src/workerpool.jl @@ -138,7 +138,7 @@ for (func, rt) = ((:length, Int), (:isready, Bool), (:workers, Vector{Int}), (:n @eval begin function ($func)(pool::WorkerPool; role= :default) if pool.ref.where != myid(role = role) - return remotecall_fetch((ref, role)->(($func_local)(fetch(ref; role=role).value; role = role)), pool.ref.where, pool.ref, pool.ref.where == 1 ? :manager : :worker; role = role)::$rt + return remotecall_fetch((ref, role)->(($func_local)(fetch(ref; role=role).value; role = role)), pool.ref.where, pool.ref, pool.ref.where == 1 ? :master : :worker; role = role)::$rt else return ($func_local)(pool; role = role) end @@ -154,7 +154,7 @@ for func = (:push!, :put!) @eval begin function ($func)(pool::WorkerPool, w::Int; role= :default) if pool.ref.where != myid(role = role) - return remotecall_fetch((ref, w, role)->(($func_local)(fetch(ref; role = role).value, w; role = role)), pool.ref.where, pool.ref, w, pool.ref.where == 1 ? 
:manager : :worker; role = role) + return remotecall_fetch((ref, w, role)->(($func_local)(fetch(ref; role = role).value, w; role = role)), pool.ref.where, pool.ref, w, pool.ref.where == 1 ? :master : :worker; role = role) else return ($func_local)(pool, w; role = role) end @@ -268,7 +268,7 @@ function default_worker_pool(;role=:default) if myid(role=role) == 1 _default_worker_pool[] = WorkerPool(role = role) else - _default_worker_pool[] = remotecall_fetch(role->default_worker_pool(role = role), 1, :manager; role=role) + _default_worker_pool[] = remotecall_fetch(role->default_worker_pool(role = role), 1, :master; role=role) end end return _default_worker_pool[] From ebbd9999d11ba06b04b9b01f67e3e0aead3116e7 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 10 Nov 2023 10:36:28 -0300 Subject: [PATCH 05/54] added role parameter to macros --- src/macros.jl | 160 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 119 insertions(+), 41 deletions(-) diff --git a/src/macros.jl b/src/macros.jl index 2503fdc..746bbbc 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -39,11 +39,31 @@ julia> fetch(f) !!! compat "Julia 1.3" As of Julia 1.3 this macro is deprecated. Use `@spawnat :any` instead. """ -macro spawn(expr) + + +#macro spawn(expr, role = :(:default)) + +function check_args_2(args...) + na = length(args) + if na==1 + role = :(role = :default) + expr = args[1] + elseif na==2 + role = args[1] + expr = args[2] + else + throw(ArgumentError("wrong number of arguments to spawn")) + end + return role, expr +end + +macro spawn(args...) + rolearg, expr = check_args_2(args...) + thunk = esc(:(()->($expr))) var = esc(Base.sync_varname) quote - local ref = spawn_somewhere($thunk) + local ref = spawn_somewhere($thunk; $(esc(rolearg))) if $(Expr(:islocal, var)) put!($var, ref) end @@ -51,6 +71,7 @@ macro spawn(expr) end end + """ @spawnat p expr @@ -79,14 +100,35 @@ julia> fetch(f) !!! compat "Julia 1.3" The `:any` argument is available as of Julia 1.3. """ -macro spawnat(p, expr) - thunk = esc(:(()->($expr))) - var = esc(Base.sync_varname) - if p === QuoteNode(:any) - spawncall = :(spawn_somewhere($thunk)) + +function check_args_3a(args...) + na = length(args) + if na==2 + role = :(role = :default) + p = args[1] + expr = args[2] + elseif na==3 + role = args[1] + p = args[2] + expr = args[3] else - spawncall = :(spawnat($(esc(p)), $thunk)) + throw(ArgumentError("wrong number of arguments to spawnat")) end + return role, p, expr +end + +macro spawnat(args...) + rolearg, p, expr = check_args_3a(args...) + + @info rolearg, typeof(rolearg) + + thunk = esc(:(()->($expr))) + var = esc(Base.sync_varname) + if p === QuoteNode(:any) + spawncall = :(spawn_somewhere($thunk; $(esc(rolearg)))) + else + spawncall = :(spawnat($(esc(p)), $thunk; $(esc(rolearg)))) + end quote local ref = $spawncall if $(Expr(:islocal, var)) @@ -96,6 +138,7 @@ macro spawnat(p, expr) end end + """ @fetch expr @@ -119,9 +162,13 @@ julia> @fetch myid() 2 ``` """ -macro fetch(expr) + +macro fetch(args...) + + rolearg, expr = check_args_2(args...) + thunk = esc(:(()->($expr))) - :(remotecall_fetch($thunk, nextproc())) + :(remotecall_fetch($thunk, nextproc(); $(esc(rolearg)))) end """ @@ -141,9 +188,12 @@ julia> @fetchfrom 4 myid() 4 ``` """ -macro fetchfrom(p, expr) + + +macro fetchfrom(args...) + rolearg, p, expr = check_args_3a(args...) 
thunk = esc(:(()->($expr))) - :(remotecall_fetch($thunk, $(esc(p)))) + :(remotecall_fetch($thunk, $(esc(p)); $(esc(rolearg)))) end # extract a list of modules to import from an expression @@ -189,20 +239,54 @@ Similar to calling `remotecall_eval(Main, procs, expr)`, but with two extra feat packages are precompiled. - The current source file path used by `include` is propagated to other processes. """ -macro everywhere(ex) - procs = GlobalRef(@__MODULE__, :procs) - return esc(:($(MultiscaleCluster).@everywhere $procs(role = :master) $ex)) + +function check_args_3b(args...) + + na = length(args) + if na==1 + rolearg = :(role = :default) + reducer = nothing + loop = args[1] + elseif na==2 + if isa(args[1], Expr) && args[1].head == :(=) && args[1].args[1] === :role + rolearg = args[1] + reducer = nothing + loop = args[2] + else + rolearg = :(role = :default) + reducer = args[1] + loop = args[2] + end + elseif na==3 + rolearg = args[1] + reducer = args[2] + loop = args[3] + else + throw(ArgumentError("wrong number of arguments to @distributed")) + end + + return rolearg, reducer, loop end -macro everywhere(procs, ex) - imps = extract_imports(ex) - return quote - $(isempty(imps) ? nothing : Expr(:toplevel, imps...)) # run imports locally first - let ex = Expr(:toplevel, :(task_local_storage()[:SOURCE_PATH] = $(get(task_local_storage(), :SOURCE_PATH, nothing))), $(esc(Expr(:quote, ex)))), - procs = $(esc(procs)) - remotecall_eval(Main, procs, ex; role = :master) +macro everywhere(args...) + + rolearg, procs, ex = check_args_3b(args...) + + if isnothing(procs) + procs = GlobalRef(@__MODULE__, :procs) + return esc(:($(MultiscaleCluster).@everywhere $rolearg $procs(;$rolearg) $ex)) + else + imps = extract_imports(ex) + return quote + $(isempty(imps) ? nothing : Expr(:toplevel, imps...)) # run imports locally first + let ex = Expr(:toplevel, :(task_local_storage()[:SOURCE_PATH] = $(get(task_local_storage(), :SOURCE_PATH, nothing))), $(esc(Expr(:quote, ex)))), + procs = $(esc(procs)) + remotecall_eval(Main, procs, ex; $(esc(rolearg))) + end end + end + end """ @@ -261,22 +345,22 @@ function splitrange(firstIndex::Int, lastIndex::Int, np::Int) return chunks end -function preduce(reducer, f, R; role= :default) - chunks = splitrange(Int(firstindex(R)), Int(lastindex(R)), nworkers(role = role)) - all_w = workers(role = role)[1:length(chunks)] +function preduce(reducer, f, R; role = :default) + chunks = splitrange(Int(firstindex(R)), Int(lastindex(R)), nworkers(role=role)) + all_w = workers(role=role)[1:length(chunks)] w_exec = Task[] for (idx,pid) in enumerate(all_w) - t = Task(()->remotecall_fetch(f, pid, reducer, R, first(chunks[idx]), last(chunks[idx]); role = role)) + t = Task(()->remotecall_fetch(f, pid, reducer, R, first(chunks[idx]), last(chunks[idx]), role=role)) schedule(t) push!(w_exec, t) end reduce(reducer, Any[fetch(t) for t in w_exec]) end -function pfor(f, R) - t = @async @sync for c in splitrange(Int(firstindex(R)), Int(lastindex(R)), nworkers()) - @spawnat :any f(R, first(c), last(c)) +function pfor(f, R; role = :default) + t = @async @sync for c in splitrange(Int(firstindex(R)), Int(lastindex(R)), nworkers(role=role)) + @spawnat role=role :any f(R, first(c), last(c)) end errormonitor(t) end @@ -328,15 +412,9 @@ completion. To wait for completion, prefix the call with [`@sync`](@ref), like : end """ macro distributed(args...) 
- na = length(args) - if na==1 - loop = args[1] - elseif na==2 - reducer = args[1] - loop = args[2] - else - throw(ArgumentError("wrong number of arguments to @distributed")) - end + + rolearg, reducer, loop = check_args_3b(args...) + if !isa(loop,Expr) || loop.head !== :for error("malformed @distributed loop") end @@ -346,16 +424,16 @@ macro distributed(args...) if Meta.isexpr(body, :block) && body.args[end] isa LineNumberNode resize!(body.args, length(body.args) - 1) end - if na==1 + if isnothing(reducer) syncvar = esc(Base.sync_varname) return quote - local ref = pfor($(make_pfor_body(var, body)), $(esc(r))) + local ref = pfor($(make_pfor_body(var, body)), $(esc(r)); $(esc(rolearg))) if $(Expr(:islocal, syncvar)) put!($syncvar, ref) end ref end else - return :(preduce($(esc(reducer)), $(make_preduce_body(var, body)), $(esc(r)))) # TO CHECK (role ?) + return :(preduce($(esc(reducer)), $(make_preduce_body(var, body)), $(esc(r)); $(esc(rolearg)))) # TO CHECK (role ?) end end From e651aacc4f6b9f38e2f54667b09125820a13b23c Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Tue, 14 Nov 2023 12:21:34 -0300 Subject: [PATCH 06/54] ... --- src/macros.jl | 2 +- src/managers.jl | 2 +- src/pmap.jl | 2 +- src/precompile.jl | 12 ++++++------ test/managers.jl | 4 ++-- test/runtests.jl | 2 +- test/splitrange.jl | 4 ++-- test/topology.jl | 8 ++++---- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/macros.jl b/src/macros.jl index 746bbbc..e32ef22 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -274,7 +274,7 @@ macro everywhere(args...) if isnothing(procs) procs = GlobalRef(@__MODULE__, :procs) - return esc(:($(MultiscaleCluster).@everywhere $rolearg $procs(;$rolearg) $ex)) + return esc(:($(Distributed).@everywhere $rolearg $procs(;$rolearg) $ex)) else imps = extract_imports(ex) return quote diff --git a/src/managers.jl b/src/managers.jl index 0096907..03225e9 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -477,7 +477,7 @@ function launch(manager::LocalManager, params::Dict, launched::Array, c::Conditi # TODO: Maybe this belongs in base/initdefs.jl as a package_environment() function # together with load_path() etc. Might be useful to have when spawning julia - # processes outside of MultiscaleCluster.jl too. + # processes outside of Distributed.jl too. # JULIA_(LOAD|DEPOT)_PATH are used to populate (LOAD|DEPOT)_PATH on startup, # but since (LOAD|DEPOT)_PATH might have changed they are re-serialized here. # Users can opt-out of this by passing `env = ...` to addprocs(...). diff --git a/src/pmap.jl b/src/pmap.jl index b266664..225c9ad 100644 --- a/src/pmap.jl +++ b/src/pmap.jl @@ -240,7 +240,7 @@ Return `head`: the first `n` elements of `c`; and `tail`: an iterator over the remaining elements. 
```jldoctest -julia> b, c = MultiscaleCluster.head_and_tail(1:10, 3) +julia> b, c = Distributed.head_and_tail(1:10, 3) ([1, 2, 3], Base.Iterators.Rest{UnitRange{Int64}, Int64}(1:10, 3)) julia> collect(c) diff --git a/src/precompile.jl b/src/precompile.jl index b6a4ac3..87380f6 100644 --- a/src/precompile.jl +++ b/src/precompile.jl @@ -1,12 +1,12 @@ -precompile(Tuple{typeof(MultiscaleCluster.remotecall),Function,Int,Module,Vararg{Any, 100}}) -precompile(Tuple{typeof(MultiscaleCluster.procs)}) -precompile(Tuple{typeof(MultiscaleCluster.finalize_ref), MultiscaleCluster.Future}) +precompile(Tuple{typeof(Distributed.remotecall),Function,Int,Module,Vararg{Any, 100}}) +precompile(Tuple{typeof(Distributed.procs)}) +precompile(Tuple{typeof(Distributed.finalize_ref), Distributed.Future}) # This is disabled because it doesn't give much benefit -# and the code in MultiscaleCluster is poorly typed causing many invalidations -# TODO: Maybe reenable now that MultiscaleCluster is not in sysimage. +# and the code in Distributed is poorly typed causing many invalidations +# TODO: Maybe reenable now that Distributed is not in sysimage. #= precompile_script *= """ - using MultiscaleCluster + using Distributed addprocs(2) pmap(x->iseven(x) ? 1 : 0, 1:4) @distributed (+) for i = 1:100 Int(rand(Bool)) end diff --git a/test/managers.jl b/test/managers.jl index c6f2cdb..7971222 100644 --- a/test/managers.jl +++ b/test/managers.jl @@ -1,9 +1,9 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license using Test -using MultiscaleCluster +using Distributed using Sockets -using MultiscaleCluster: parse_machine, SSHManager, LocalManager +using Distributed: parse_machine, SSHManager, LocalManager @test parse_machine("127.0.0.1") == ("127.0.0.1", nothing) @test parse_machine("127.0.0.1:80") == ("127.0.0.1", 80) diff --git a/test/runtests.jl b/test/runtests.jl index 6bd23ab..d34d07c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,7 +8,7 @@ disttestfile = joinpath(@__DIR__, "distributed_exec.jl") cmd = `$test_exename $test_exeflags $disttestfile` if !success(pipeline(cmd; stdout=stdout, stderr=stderr)) && ccall(:jl_running_on_valgrind,Cint,()) == 0 - error("MultiscaleCluster test failed, cmd : $cmd") + error("Distributed test failed, cmd : $cmd") end include("managers.jl") diff --git a/test/splitrange.jl b/test/splitrange.jl index d9f50e6..1cb12e1 100644 --- a/test/splitrange.jl +++ b/test/splitrange.jl @@ -1,8 +1,8 @@ # This file is a part of Julia. 
License is MIT: https://julialang.org/license using Test -using MultiscaleCluster -using MultiscaleCluster: splitrange +using Distributed +using Distributed: splitrange @test splitrange(1, 11, 1) == Array{UnitRange{Int64},1}([1:11]) @test splitrange(0, 10, 1) == Array{UnitRange{Int64},1}([0:10]) diff --git a/test/topology.jl b/test/topology.jl index 378e7cd..13f560f 100644 --- a/test/topology.jl +++ b/test/topology.jl @@ -43,11 +43,11 @@ function launch(manager::TopoTestManager, params::Dict, launched::Array, c::Cond exename = params[:exename] exeflags = params[:exeflags] - cmd = `$exename $exeflags --bind-to $(MultiscaleCluster.LPROC.bind_addr) --worker` + cmd = `$exename $exeflags --bind-to $(Distributed.LPROC.bind_addr) --worker` cmd = pipeline(detach(setenv(cmd, dir=dir))) for i in 1:manager.np io = open(cmd, "r+") - MultiscaleCluster.write_cookie(io) + Distributed.write_cookie(io) wconfig = WorkerConfig() wconfig.process = io @@ -98,8 +98,8 @@ remove_workers_and_test() # test `lazy` connection setup function def_count_conn() @everywhere function count_connected_workers() - count(x -> isa(x, MultiscaleCluster.Worker) && isdefined(x, :r_stream) && isopen(x.r_stream), - MultiscaleCluster.PGRP().workers) + count(x -> isa(x, Distributed.Worker) && isdefined(x, :r_stream) && isopen(x.r_stream), + Distributed.PGRP().workers) end end From 846b32921971e037d0c55a0732601468e4159ef7 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Thu, 16 Nov 2023 16:20:53 -0300 Subject: [PATCH 07/54] some corrections --- src/Distributed.jl | 9 +- src/cluster.jl | 2 +- src/remotecall.jl | 9 +- test/distributed_exec.jl | 270 ++++++++++++++++++++++++++++----------- 4 files changed, 212 insertions(+), 78 deletions(-) diff --git a/src/Distributed.jl b/src/Distributed.jl index a7c5b17..6539f1a 100644 --- a/src/Distributed.jl +++ b/src/Distributed.jl @@ -72,14 +72,15 @@ export check_same_host function _require_callback(mod::Base.PkgId) - if Base.toplevel_load[] && myid() == 1 && nprocs() > 1 + if Base.toplevel_load[] && nprocs(role=:master) > 1 # broadcast top-level (e.g. from Main) import/using from node 1 (only) - @sync for p in procs() + @sync for p in procs(role = :master) + #@info "require callback", p p == 1 && continue # Extensions are already loaded on workers by their triggers being loaded # so no need to fire the callback upon extension being loaded on master. Base.loading_extension && continue - @async_unwrap remotecall_wait(p) do + @async_unwrap remotecall_wait(p; role = :master) do Base.require(mod) nothing end @@ -94,7 +95,7 @@ struct RRID whence::Int id::Int - RRID() = RRID(myid(), next_ref_id()) + RRID(;role= :default) = RRID(myid(role=role), next_ref_id()) RRID(whence, id) = new(whence, id) end diff --git a/src/cluster.jl b/src/cluster.jl index 71369a1..b80becf 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -468,7 +468,7 @@ function addprocs(manager::ClusterManager; kwargs...) try if myrole() == :worker - myrole!(:master) + myrole!(:master_worker) end PGRP(role=:master).level = PGRP(role=:worker).level + 1 diff --git a/src/remotecall.jl b/src/remotecall.jl index b071b7d..0f35161 100644 --- a/src/remotecall.jl +++ b/src/remotecall.jl @@ -565,7 +565,14 @@ function call_on_owner(f, rr::AbstractRemoteRef, args...; role= :default) if rr.where == myid(role = role) f(rid, args...) else + #remotecall_fetch((rid,role) -> f(rid, role = role, args...), rr.where, rid, rr.where==1 ? 
:master : :worker; role = role) remotecall_fetch((rid,role) -> f(rid, args...; role=role), rr.where, rid, rr.where==1 ? :master : :worker; role = role) + + + #remotecall_fetch(rid -> f(rid, role = rr.where==1 ? :master : :worker, args...), rr.where; role = role) + #remotecall_fetch(iiiii, rr.where, f, rid, rr.where==1 ? :master : :worker, args...; role = role) +# remotecall_fetch(f, rr.where, rid, args...) + end end @@ -655,7 +662,7 @@ function fetch(r::Future; role= :default) something(v_cache) end -fetch_ref(rid, args...; role=:default) = (@info "fetch_ref $role"; fetch(lookup_ref(rid; role = role).c, #=role=role,=# args...)) +fetch_ref(rid, args...; role=:default) = fetch(lookup_ref(rid; role = role).c, #=role=role,=# args...) diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl index 166ea6d..572e8bb 100644 --- a/test/distributed_exec.jl +++ b/test/distributed_exec.jl @@ -3,13 +3,6 @@ using Test, Distributed, Random, Serialization, Sockets import Distributed: launch, manage -sharedir = normpath(joinpath(Sys.BINDIR, "..", "share")) -if parse(Bool, get(ENV, "JULIA_DISTRIBUTED_TESTING_STANDALONE", "false")) - @test !startswith(pathof(Distributed), sharedir) -else - @test startswith(pathof(Distributed), sharedir) -end - @test cluster_cookie() isa String include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "testenv.jl")) @@ -27,7 +20,7 @@ include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "testenv.jl")) addprocs_with_testenv(4) @test nprocs() == 5 -# distributed loading of packages +# Distributed loading of packages # setup @everywhere begin @@ -52,6 +45,7 @@ end id_me = myid() id_other = filter(x -> x != id_me, procs())[rand(1:(nprocs()-1))] + # Test role @everywhere using Distributed @test Distributed.myrole() === :master @@ -62,6 +56,10 @@ for wid = workers() @test wrole === :worker end +@info "passed 1" +#sleep(3) + + # Test remote() let pool = default_worker_pool() @@ -69,6 +67,8 @@ let count = 0 count_condition = Condition() + @info "passed 2" + function remote_wait(c) @async_logerr begin count += 1 @@ -79,17 +79,30 @@ let yield() end + @info "passed 3" + +# @info nworkers() +# sleep(30) + testchannels = [RemoteChannel() for i in 1:nworkers()] + # @info testchannels + # sleep(30) testcount = 0 @test isready(pool) == true for c in testchannels @test count == testcount +# @info c remote_wait(c) testcount += 1 end @test count == testcount @test isready(pool) == false + @info "passed 4" + #sleep(3) + + try + for c in testchannels @test count == testcount put!(c, "foo") @@ -99,8 +112,15 @@ let @test isready(pool) == true end + catch e + @info e + end + @test count == 0 + @info "passed 5" + #sleep(3) + for c in testchannels @test count == testcount remote_wait(c) @@ -109,6 +129,9 @@ let @test count == testcount @test isready(pool) == false + @info "passed 6" + #sleep(3) + for c in reverse(testchannels) @test count == testcount put!(c, "foo") @@ -118,9 +141,16 @@ let @test isready(pool) == true end + @info "passed 7" + #sleep(3) + @test count == 0 end +@info "passed 8" +#sleep(3) + + # Test Futures function testf(id) f=Future(id) @@ -172,27 +202,29 @@ function include_thread_unsafe_tests() return true end +@info "passed 9" + # Distributed GC tests for Futures function test_futures_dgc(id) f = remotecall(myid, id) fid = remoteref_id(f) # remote value should be deleted after a fetch - @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, fid) == true + @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, fid) == true 
@test f.v === nothing @test fetch(f) == id @test f.v !== nothing yield(); # flush gc msgs - @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, fid)) + @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, fid)) # if unfetched, it should be deleted after a finalize f = remotecall(myid, id) fid = remoteref_id(f) - @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, fid) == true + @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, fid) == true @test f.v === nothing finalize(f) yield(); # flush gc msgs - @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, fid)) + @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, fid)) end test_futures_dgc(id_me) @@ -208,23 +240,23 @@ fstore = RemoteChannel(wid2) put!(fstore, f) @test fetch(f) == wid1 -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == true +@test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == true remotecall_fetch(r->(fetch(fetch(r)); yield()), wid2, fstore) sleep(0.5) # to ensure that wid2 gc messages have been executed on wid1 -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == false +@test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == false # put! should release remote reference since it would have been cached locally f = Future(wid1) fid = remoteref_id(f) # should not be created remotely till accessed -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == false +@test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == false # create it remotely isready(f) -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == true +@test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == true put!(f, :OK) -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == false +@test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == false @test fetch(f) === :OK # RemoteException should be thrown on a put! 
when another process has set the value @@ -235,7 +267,7 @@ fstore = RemoteChannel(wid2) put!(fstore, f) # send f to wid2 put!(f, :OK) # set value from master -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == true +@test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == true testval = remotecall_fetch(wid2, fstore) do x try @@ -260,14 +292,17 @@ end f = remotecall_wait(identity, id_other, ones(10)) rrid = Distributed.RRID(f.whence, f.id) remotecall_fetch(f25847, id_other, f) -@test BitSet([id_me]) == remotecall_fetch(()->Distributed.PGRP.refs[rrid].clientset, id_other) +@test BitSet([id_me]) == remotecall_fetch(()->Distributed.PGRP().refs[rrid].clientset, id_other) remotecall_fetch(f25847, id_other, f) -@test BitSet([id_me]) == remotecall_fetch(()->Distributed.PGRP.refs[rrid].clientset, id_other) +@test BitSet([id_me]) == remotecall_fetch(()->Distributed.PGRP().refs[rrid].clientset, id_other) finalize(f) yield() # flush gc msgs -@test poll_while(() -> remotecall_fetch(chk_rrid->(yield(); haskey(Distributed.PGRP.refs, chk_rrid)), id_other, rrid)) +@test poll_while(() -> remotecall_fetch(chk_rrid->(yield(); haskey(Distributed.PGRP().refs, chk_rrid)), id_other, rrid)) + + +@info "passed 10" # Distributed GC tests for RemoteChannels function test_remoteref_dgc(id) @@ -276,12 +311,12 @@ function test_remoteref_dgc(id) rrid = remoteref_id(rr) # remote value should be deleted after finalizing the ref - @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, rrid) == true + @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, rrid) == true @test fetch(rr) === :OK - @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, rrid) == true + @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, rrid) == true finalize(rr) yield(); # flush gc msgs - @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, rrid)) + @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, rrid)) end test_remoteref_dgc(id_me) test_remoteref_dgc(id_other) @@ -295,18 +330,20 @@ let wid1 = workers()[1], put!(fstore, rr) if include_thread_unsafe_tests() - @test remotecall_fetch(k -> haskey(Distributed.PGRP.refs, k), wid1, rrid) == true + @test remotecall_fetch(k -> haskey(Distributed.PGRP().refs, k), wid1, rrid) == true end finalize(rr) # finalize locally yield() # flush gc msgs if include_thread_unsafe_tests() - @test remotecall_fetch(k -> haskey(Distributed.PGRP.refs, k), wid1, rrid) == true + @test remotecall_fetch(k -> haskey(Distributed.PGRP().refs, k), wid1, rrid) == true end remotecall_fetch(r -> (finalize(take!(r)); yield(); nothing), wid2, fstore) # finalize remotely sleep(0.5) # to ensure that wid2 messages have been executed on wid1 - @test poll_while(() -> remotecall_fetch(k -> haskey(Distributed.PGRP.refs, k), wid1, rrid)) + @test poll_while(() -> remotecall_fetch(k -> haskey(Distributed.PGRP().refs, k), wid1, rrid)) end +@info "passed 11" + # Tests for issue #23109 - should not hang. f = @spawnat :any rand(1, 1) Base.Experimental.@sync begin @@ -332,6 +369,7 @@ for i in 1:nworkers() end @test sort(pids) == sort(workers()) +@info "passed 12" # test getindex on Futures and RemoteChannels function test_indexing(rr) @@ -346,6 +384,8 @@ test_indexing(Future(id_other)) test_indexing(RemoteChannel()) test_indexing(RemoteChannel(id_other)) +@info "passed 13" + # Test ser/deser to non-ClusterSerializer objects. 
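# (Serializing through a plain IOBuffer bypasses the ClusterSerializer, so no
# client-tracking state travels with the ref; the deserialized copy comes back
# inert, which is why the function below expects its numeric fields to be zero.)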
function test_regular_io_ser(ref::Distributed.AbstractRemoteRef) io = IOBuffer() @@ -396,6 +436,8 @@ s = [randstring() for x in 1:10^5] num_small_requests = 10000 @test fill(id_other, num_small_requests) == [remotecall_fetch(myid, id_other) for i in 1:num_small_requests] +@info "passed 14" + # test parallel sends of large arrays from multiple tasks to the same remote worker ntasks = 10 rr_list = [Channel(1) for x in 1:ntasks] @@ -444,6 +486,9 @@ test_channel(RemoteChannel(()->Channel(10))) c=Channel{Int}(1) @test_throws MethodError put!(c, "Hello") +@info "passed 15" + + # test channel iterations function test_iteration(in_c, out_c) t=@async for v in in_c @@ -545,6 +590,8 @@ for id in [id_other, id_me] end end +@info "passed 16" + # make sure the stackframe from the remote error can be serialized let ex try @@ -567,7 +614,9 @@ let ex end # pmap tests. Needs at least 4 processors dedicated to the below tests. Which we currently have -# since the distributed tests are now spawned as a separate set. +# since the Distributed tests are now spawned as a separate set. + +@info "passed 17" # Test all combinations of pmap keyword args. pmap_args = [ @@ -660,6 +709,7 @@ generic_map_tests(pmap_fallback) run_map_equivalence_tests(pmap) @test pmap(uppercase, "Hello World!") == map(uppercase, "Hello World!") +@info "passed 18" # Simple test for pmap throws error let error_thrown = false @@ -742,7 +792,7 @@ if DoFullTest all_w = workers() # Test sending fake data to workers. The worker processes will print an # error message but should not terminate. - for w in Distributed.PGRP.workers + for w in Distributed.PGRP().workers if isa(w, Distributed.Worker) local s = connect(w.config.host, w.config.port) write(s, randstring(32)) @@ -769,6 +819,9 @@ if Sys.isunix() # aka have ssh remotecall_fetch(rmprocs, 1, new_pids) end + @info "passed 19" + + print("\n\nTesting SSHManager. 
A minimum of 4GB of RAM is recommended.\n") print("Please ensure: \n") print("1) sshd is running locally with passwordless login enabled.\n") @@ -844,6 +897,8 @@ let t = @task 42 @test_throws TaskFailedException(t) Base.wait(t) end +@info "passed 20" + # issue #8207 let A = Any[] @distributed (+) for i in (push!(A,1); 1:2) @@ -852,6 +907,8 @@ let A = Any[] @test length(A) == 1 end +@info "passed 21" + # issue #13168 function f13168(n) val = 0 @@ -871,9 +928,13 @@ let t = schedule(@task f13168(100)) @test isa(fetch(t), Float64) end +@info "passed 21.1" + # issue #13122 @test remotecall_fetch(identity, workers()[1], C_NULL) === C_NULL +@info "passed 21.2" + # issue #11062 function t11062() @async v11062 = 1 @@ -882,11 +943,14 @@ end @test t11062() == 2 +@info "passed 21.3" + # issue #15406 v15406 = remotecall_wait(() -> 1, id_other) fetch(v15406) remotecall_wait(fetch, id_other, v15406) +@info "passed 21.4" # issue #43396 # Covers the remote fetch where the value returned is `nothing` @@ -896,6 +960,7 @@ remotecall_wait(fetch, id_other, v15406) @test nothing === fetch(remotecall(() -> nothing, workers()[1])) @test 10 === fetch(remotecall(() -> 10, workers()[1])) +@info "passed 21.5" # Test various forms of remotecall* invocations @@ -918,19 +983,30 @@ for tid in [id_other, id_me, default_worker_pool()] test_f_args(15, f_args, tid, 1, 2; kw1=4, kw2=8) end + +@info "passed 21.6.2" + +f=Future(id_other) +remote_do(fut->put!(fut, myid()), id_other, f) +@test fetch(f) == id_other + +@info "passed 21.6.1" + # Test remote_do f=Future(id_me) +@info "passed 21.6.1.1" remote_do(fut->put!(fut, myid()), id_me, f) +@info "passed 21.6.1.2" @test fetch(f) == id_me -f=Future(id_other) -remote_do(fut->put!(fut, myid()), id_other, f) -@test fetch(f) == id_other +@info "passed 21.7" # Github issue #29932 rc_unbuffered = RemoteChannel(()->Channel{Vector{Float64}}(0)) @test eltype(rc_unbuffered) == Vector{Float64} +@info "passed 21.8" + @async begin # Trigger direct write (no buffering) of largish array array_sz = Int(Base.SZ_UNBUFFERED_IO/8) + 1 @@ -948,6 +1024,8 @@ end return :OK end, id_other, rc_unbuffered) === :OK +@info "passed 21.9" + # github issue 33972 rc_unbuffered_other = RemoteChannel(()->Channel{Int}(0), id_other) close(rc_unbuffered_other) @@ -955,44 +1033,54 @@ try; take!(rc_unbuffered_other); catch; end @test !remotecall_fetch(rc -> islocked(Distributed.lookup_ref(remoteref_id(rc)).synctake), id_other, rc_unbuffered_other) +@info "passed 21.10" + # github PR #14456 n = DoFullTest ? 6 : 5 for i = 1:10^n fetch(@spawnat myid() myid()) end +@info "passed 21.11" + # issue #15451 @test remotecall_fetch(x->(y->2y)(x)+1, workers()[1], 3) == 7 +@info "passed 21.12" + # issue #16091 mutable struct T16091 end -wid = workers()[1] -try - remotecall_fetch(()->T16091, wid) - @test "unreachable" === true +wid0 = workers()[1] +@test try + remotecall_fetch(()->T16091, wid0) + @info "try ..." 
+ false catch ex - ex = ((ex::RemoteException).captured::CapturedException).ex - @test (ex::UndefVarError).var === :T16091 + @info "catch $(((ex::RemoteException).captured::CapturedException).ex) --- $(UndefVarError(:T16091)) --- $(((ex::RemoteException).captured::CapturedException).ex === UndefVarError(:T16091))" + ((ex::RemoteException).captured::CapturedException).ex === UndefVarError(:T16091) end -try - remotecall_fetch(identity, wid, T16091) - @test "unreachable" === true +@test try + remotecall_fetch(identity, wid0, T16091) + false catch ex - ex = ((ex::RemoteException).captured::CapturedException).ex - @test (ex::UndefVarError).var === :T16091 + ((ex::RemoteException).captured::CapturedException).ex === UndefVarError(:T16091) end f16091a() = 1 -remotecall_fetch(()->eval(:(f16091a() = 2)), wid) -@test remotecall_fetch(f16091a, wid) === 2 -@test remotecall_fetch((myid)->remotecall_fetch(f16091a, myid), wid, myid()) === 1 +remotecall_fetch(()->eval(:(f16091a() = 2)), wid0) +@test remotecall_fetch(f16091a, wid0) === 2 +@test remotecall_fetch((myid)->remotecall_fetch(f16091a, myid), wid0, myid()) === 1 + +@info "passed 21.13" # these will only heisen-fail, since it depends on the gensym counter collisions: f16091b = () -> 1 -remotecall_fetch(()->eval(:(f16091b = () -> 2)), wid) +remotecall_fetch(()->eval(:(f16091b = () -> 2)), wid0) @test remotecall_fetch(f16091b, 2) === 1 # Global anonymous functions are over-written... -@test remotecall_fetch((myid)->remotecall_fetch(f16091b, myid), wid, myid()) === 1 +@test remotecall_fetch((myid)->remotecall_fetch(f16091b, myid), wid0, myid()) === 1 + +@info "passed 21.14" # ...while local anonymous functions are by definition, local. let @@ -1004,9 +1092,11 @@ let f16091c = () -> 2 remotecall_fetch(f16091c, myid) end - end, wid, myid()) === 2 + end, wid0, myid()) === 2 end +@info "passed 21.15" + # issue #16451 rng=RandomDevice() retval = @distributed (+) for _ in 1:10 @@ -1020,18 +1110,25 @@ retval = @distributed (+) for _ in 1:10 end @test retval > 0.0 && retval < 10.0 +@info "passed 21.16" + # serialization tests wrkr1 = workers()[1] wrkr2 = workers()[end] @test remotecall_fetch(p->remotecall_fetch(myid, p), wrkr1, wrkr2) == wrkr2 +@info "passed 21.17" + # Send f to wrkr1 and wrkr2. 
Then try calling f on wrkr2 from wrkr1 f_myid = ()->myid() @test wrkr1 == remotecall_fetch(f_myid, wrkr1) @test wrkr2 == remotecall_fetch(f_myid, wrkr2) @test wrkr2 == remotecall_fetch((f, p)->remotecall_fetch(f, p), wrkr1, f_myid, wrkr2) + +@info "passed 22" + # Deserialization error recovery test # locally defined module, but unavailable on workers module LocalFoo @@ -1107,15 +1204,17 @@ let (p, p2) = filter!(p -> p != myid(), procs()) test_throw_on([p2, p], "everywhere on p and p2") end +@info "passed 23" + # Test addprocs enable_threaded_blas parameter function get_remote_num_threads(processes_added) return [remotecall_fetch(BLAS.get_num_threads, proc_id) for proc_id in processes_added] end -function test_blas_config(pid, expected) - for worker in Distributed.PGRP.workers - if worker.id == pid +function test_blas_config(pid, expected; role=:default) + for worker in Distributed.PGRP(role=role).workers + if Distributed.wid(worker,role=role) == pid @test worker.config.enable_threaded_blas == expected return end @@ -1163,6 +1262,8 @@ function test_add_procs_threaded_blas() end test_add_procs_threaded_blas() +@info "passed 24" + #19687 if false ### TODO: The logic that is supposed to implement this is racy - Disabled for now # ensure no race conditions between rmprocs and addprocs @@ -1197,16 +1298,18 @@ end end # Test addprocs/rmprocs from master node only -for f in [ ()->addprocs(1; exeflags=test_exeflags), ()->rmprocs(workers()) ] - local f - try - remotecall_fetch(f, id_other) - error("Unexpected") - catch ex - @test isa(ex, RemoteException) - @test ex.captured.ex.msg == "Only process 1 can add and remove workers" - end -end +#for f in [ ()->addprocs(1; exeflags=test_exeflags), ()->rmprocs(workers()) ] +# local f +# try +# remotecall_fetch(f, id_other) +# error("Unexpected") +# catch ex +# @test isa(ex, RemoteException) +# @test ex.captured.ex.msg == "Only process 1 can add and remove workers" +# end +#end + +@info "passed 25" # Test the following addprocs error conditions # - invalid host name - github issue #20372 @@ -1273,6 +1376,8 @@ for (addp_testf, expected_errstr, env) in testruns end end +@info "passed 26" + # Auto serialization of globals from Main. 
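# A closure shipped to a worker carries whatever `Main` globals it references,
# and the worker materializes them under its own `Main`. Illustrative shape
# (variable name hypothetical):
#
#     global gv17 = 7                                   # defined on the caller
#     remotecall_fetch(() -> gv17 + 1, id_other) == 8   # gv17 auto-shipped
#
# A changed value is re-sent on the next capturing call, which the
# "not being repeatedly serialized" tests further below depend on.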
# bitstypes @@ -1341,6 +1446,8 @@ v31252 = :b v31252 = :a @test :a == @fetchfrom id_other v31252 +@info "passed 27" + # Test that a global is not being repeatedly serialized when # a) referenced multiple times in the closure @@ -1438,9 +1545,11 @@ global ids_func = ()->ids_cleanup clust_ser = (Distributed.worker_from_id(id_other)).w_serializer @test remotecall_fetch(ids_func, id_other) == ids_cleanup +@info "passed 29" + # TODO Add test for cleanup from `clust_ser.glbs_in_tnobj` -# reported github issues - Mostly tests with globals and various distributed macros +# reported github issues - Mostly tests with globals and various Distributed macros #2669, #5390 v2669=10 @test fetch(@spawnat :any (1+v2669)) == 11 @@ -1487,6 +1596,8 @@ let @test remotecall_fetch(Float64, id_other, 1) == Float64(1) end +@info "passed 30" + #19463 function foo19463() w1 = workers()[1] @@ -1543,6 +1654,8 @@ syms = setup_syms(3, workers()) clear!(syms, workers()) test_clear(syms, workers()) +@info "passed 31" + # Test partial recovery from a deserialization error in CapturedException try expr = quote @@ -1557,10 +1670,11 @@ try catch ex @test isa(ex.captured.ex.exceptions[1].ex, ErrorException) @test occursin("BoundsError", ex.captured.ex.exceptions[1].ex.msg) - ex = ex.captured.ex.exceptions[2].ex - @test (ex::UndefVarError).var === :DontExistOn1 + @test ex.captured.ex.exceptions[2].ex == UndefVarError(:DontExistOn1) end +@info "passed 32" + let # creates a new worker in a different folder and tries to include file tmp_dir = mktempdir() @@ -1634,6 +1748,8 @@ cluster_cookie("foobar") # custom cookie npids = addprocs_with_testenv(WorkerArgTester(`--worker=foobar`, false)) @test remotecall_fetch(myid, npids[1]) == npids[1] +@info "passed 33" + # tests for start_worker options to retain stdio (issue #31035) struct RetainStdioTester <: ClusterManager close_stdin::Bool @@ -1682,21 +1798,21 @@ p1,p2 = addprocs_with_testenv(2) @test fill(2.,2) == remotecall_fetch(f22865, p1, p2) rmprocs(p1, p2) -function reuseport_tests() +function reuseport_tests(;role = :default) # Run the test on all processes. results = asyncmap(procs()) do p remotecall_fetch(p) do ports_lower = [] # ports of pids lower than myid() ports_higher = [] # ports of pids higher than myid() - for w in Distributed.PGRP.workers - w.id == myid() && continue + for w in Distributed.PGRP(role=role).workers + Distributed.wid(w,role=role) == myid() && continue port = Sockets._sockname(w.r_stream, true)[2] - if (w.id == 1) + if (Distributed.wid(w,role=role) == 1) # master connects to workers push!(ports_higher, port) - elseif w.id < myid() + elseif Distributed.wid(w,role=role) < myid(role=role) push!(ports_lower, port) - elseif w.id > myid() + elseif Distributed.wid(w,role=role) > myid(role=role) push!(ports_higher, port) end end @@ -1707,7 +1823,7 @@ function reuseport_tests() return 0 end end - return myid() + return myid(role=role) end end @@ -1737,6 +1853,9 @@ for T in (UInt8, Int8, UInt16, Int16, UInt32, Int32, UInt64) @test n == 55 end +@info "passed 34" + + # issue #28966 let code = """ import Distributed @@ -1903,6 +2022,8 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp @test success(cmd) end end +@info "passed 35" + include("splitrange.jl") # Clear all workers for timeout tests (issue #45785) @@ -1925,7 +2046,12 @@ begin end end +@info "passed 36" + # Run topology tests last after removing all workers, since a given # cluster at any time only supports a single topology. 
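# (Stock addprocs accepts topology = :all_to_all (the default), :master_worker,
# or :custom; a cluster cannot mix topologies, hence the full teardown first.)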
 rmprocs(workers())
 include("topology.jl")
+
+@info "end test"
+

From 295c342df6135e4f85f6665b51dc535da59fecc3 Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <102302676+decarvalhojunior-fh@users.noreply.github.com>
Date: Thu, 16 Nov 2023 16:25:23 -0300
Subject: [PATCH 08/54] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 4866802..257aa50 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ This package ships as part of the Julia stdlib.
 > Multiscale parallelism may help programmers in at least two scenarios:
 > * to deploy _multicluster computations_, i.e. parallel computations employing multiple clusters by assuming the parallel programming patterns and tools at the multicluster and cluster levels are distinct;
 > * better support for _multilevel parallel programming_ patterns.
+> We are working on the implementation of case studies.
 
 ## Using development versions of this package

From 5d87aa8bbd64c942486b3f47b33e96684a313d32 Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior <102302676+decarvalhojunior-fh@users.noreply.github.com>
Date: Thu, 16 Nov 2023 16:27:28 -0300
Subject: [PATCH 09/54] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 257aa50..5a5740b 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ This package ships as part of the Julia stdlib.
 > Multiscale parallelism may help programmers in at least two scenarios:
 > * to deploy _multicluster computations_, i.e. parallel computations employing multiple clusters by assuming the parallel programming patterns and tools at the multicluster and cluster levels are distinct;
 > * better support for _multilevel parallel programming_ patterns.
+>
 > We are working on the implementation of case studies.
 
 ## Using development versions of this package

From 0dd3443efc346ff545c92c4cceb6702afe14c346 Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior
Date: Tue, 30 Jan 2024 07:41:04 -0300
Subject: [PATCH 10/54] testing fetch

---
 src/remotecall.jl | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/remotecall.jl b/src/remotecall.jl
index 0f35161..46b21d0 100644
--- a/src/remotecall.jl
+++ b/src/remotecall.jl
@@ -616,29 +616,37 @@ Further calls to `fetch` on the same reference return the cached value. If the r
 is an exception, throws a [`RemoteException`](@ref) which captures the remote exception and backtrace.
 """
 function fetch(r::Future; role= :default)
+    @info "fetch 1"
     v_cache = @atomic r.v
     v_cache !== nothing && return something(v_cache)
 
+    @info "fetch 2"
     if r.where == myid(role = role)
+        @info "fetch 2.1"
         rv, v_cache = @lock r.lock begin
             v_cache = @atomic :monotonic r.v
             rv = v_cache === nothing ?
lookup_ref(remoteref_id(r); role = role) : nothing rv, v_cache end + @info "fetch 2.2" if v_cache !== nothing return something(v_cache) else v_local = fetch(rv.c) end + @info "fetch 2.3" else #v_local = call_on_owner((rid, args...; role=role) -> fetch_ref(rid, args...;role=role), r; role = role) v_local = call_on_owner(fetch_ref, r; role = role) end + @info "fetch 3" v_cache = @atomic r.v + @info "fetch 4" if v_cache === nothing # call_on_owner case + @info "fetch 4.1" v_old, status = @lock r.lock begin @atomicreplace r.v nothing => Some(v_local) end @@ -650,16 +658,23 @@ function fetch(r::Future; role= :default) # remote calls getting the value from `call_on_owner` used to return the value directly without wrapping it in `Some(x)` # so we're doing the same thing here + @info "fetch 4.2" if status send_del_client(r; role = role) return v_local else # this `v_cache` is returned at the end of the function v_cache = v_old end + @info "fetch 4.3" end + @info "fetch 5" send_del_client(r; role = role) + @info "fetch 6" + something(v_cache) + @info "fetch 7" + end fetch_ref(rid, args...; role=:default) = fetch(lookup_ref(rid; role = role).c, #=role=role,=# args...) From 9d2c233b530bbd103c1ef8c6bec015f90c838b78 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Thu, 15 Feb 2024 15:51:08 -0300 Subject: [PATCH 11/54] ... --- src/macros.jl | 2 +- src/remotecall.jl | 13 ------------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/src/macros.jl b/src/macros.jl index e32ef22..ade5911 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -120,7 +120,7 @@ end macro spawnat(args...) rolearg, p, expr = check_args_3a(args...) - @info rolearg, typeof(rolearg) + #@info rolearg, typeof(rolearg) thunk = esc(:(()->($expr))) var = esc(Base.sync_varname) diff --git a/src/remotecall.jl b/src/remotecall.jl index 46b21d0..0a24ed5 100644 --- a/src/remotecall.jl +++ b/src/remotecall.jl @@ -616,37 +616,29 @@ Further calls to `fetch` on the same reference return the cached value. If the r is an exception, throws a [`RemoteException`](@ref) which captures the remote exception and backtrace. """ function fetch(r::Future; role= :default) - @info "fetch 1" v_cache = @atomic r.v v_cache !== nothing && return something(v_cache) - @info "fetch 2" if r.where == myid(role = role) - @info "fetch 2.1" rv, v_cache = @lock r.lock begin v_cache = @atomic :monotonic r.v rv = v_cache === nothing ? 
lookup_ref(remoteref_id(r); role = role) : nothing rv, v_cache end - @info "fetch 2.2" if v_cache !== nothing return something(v_cache) else v_local = fetch(rv.c) end - @info "fetch 2.3" else #v_local = call_on_owner((rid, args...; role=role) -> fetch_ref(rid, args...;role=role), r; role = role) v_local = call_on_owner(fetch_ref, r; role = role) end - @info "fetch 3" v_cache = @atomic r.v - @info "fetch 4" if v_cache === nothing # call_on_owner case - @info "fetch 4.1" v_old, status = @lock r.lock begin @atomicreplace r.v nothing => Some(v_local) end @@ -658,22 +650,17 @@ function fetch(r::Future; role= :default) # remote calls getting the value from `call_on_owner` used to return the value directly without wrapping it in `Some(x)` # so we're doing the same thing here - @info "fetch 4.2" if status send_del_client(r; role = role) return v_local else # this `v_cache` is returned at the end of the function v_cache = v_old end - @info "fetch 4.3" end - @info "fetch 5" send_del_client(r; role = role) - @info "fetch 6" something(v_cache) - @info "fetch 7" end From 96d4c0b1bd2bf087095dffefe95d0eab35978001 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 16 Feb 2024 00:21:56 -0300 Subject: [PATCH 12/54] correction --- src/process_messages.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/process_messages.jl b/src/process_messages.jl index 25c62bc..fa2d8c5 100644 --- a/src/process_messages.jl +++ b/src/process_messages.jl @@ -228,7 +228,7 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool; role= # If unhandleable error occurred talking to pid 1, exit if wpid == 1 if isopen(w_stream) - @error "Fatal error on process $(myid(role=roleee))" exception=e,catch_backtrace() + @error "Fatal error on process $(myid(role=role))" exception=e,catch_backtrace() end exit(1) end From 4fe45880a09d00b4264131d38d6323bfab3bc18e Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 16 Feb 2024 10:20:23 -0300 Subject: [PATCH 13/54] ... --- src/cluster.jl | 2 +- src/remotecall.jl | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/cluster.jl b/src/cluster.jl index b80becf..cdfe306 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -168,7 +168,7 @@ function check_worker_state(w::Worker; role= :default) t = @async exec_conn_func(w; role=role) else # route request via node 1 - t = @async remotecall_fetch((p,to_id) -> remotecall_fetch((to_id, role) -> exec_conn_func(to_id, role = role), p, to_id, p == 1 ? :master : :worker; role = role), 1, wid(w, role=role), myid(role=role)) + t = @async remotecall_fetch((p,to_id) -> remotecall_fetch((to_id, role) -> exec_conn_func(to_id; role = role), p, to_id, p == 1 ? :master : :worker; role = role), 1, wid(w, role=role), myid(role=role)) end errormonitor(t) wait_for_conn(w; role=role) diff --git a/src/remotecall.jl b/src/remotecall.jl index 0a24ed5..18b1c5d 100644 --- a/src/remotecall.jl +++ b/src/remotecall.jl @@ -423,6 +423,7 @@ end # make a thunk to call f on args in a way that simulates what would happen if # the function were sent elsewhere function local_remotecall_thunk(f, args, kwargs) + println("local_remotecall_thunk($f, $args, $kwargs)") return ()->invokelatest(f, args...; kwargs...) end @@ -460,7 +461,8 @@ function remotecall_fetch(f, w::Worker, args...; role= :default, kwargs...) 
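        # Flow of a worker-targeted fetch: allocate a fresh RRID for the reply,
        # register a local RemoteValue under it, ship a CallMsg{:call_fetch} to
        # the worker, block on take! for the answer, then drop the temporary
        # entry from the refs table; a RemoteException is rethrown locally.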
oid = RRID(role = role) rv = lookup_ref(oid; role = role) rv.waitingfor = wid(w, role=role) - send_msg(w, MsgHeader(RRID(0,0), oid), CallMsg{:call_fetch}(f, args, kwargs); role = role) + @info "send_msg ..." + send_msg(w, MsgHeader(RRID(0,0), oid), CallMsg{:call_fetch}(f, args, role = role, kwargs); role = role) v = take!(rv) lock(client_refs) do delete!(PGRP(role = role).refs, oid) From 6df4f64b820daf6eb66a6d65d0ca8cd8563c29d9 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 16 Feb 2024 11:32:21 -0300 Subject: [PATCH 14/54] ... --- src/cluster.jl | 2 +- src/remotecall.jl | 25 ++++++++++++++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/cluster.jl b/src/cluster.jl index cdfe306..9451a4c 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -168,7 +168,7 @@ function check_worker_state(w::Worker; role= :default) t = @async exec_conn_func(w; role=role) else # route request via node 1 - t = @async remotecall_fetch((p,to_id) -> remotecall_fetch((to_id, role) -> exec_conn_func(to_id; role = role), p, to_id, p == 1 ? :master : :worker; role = role), 1, wid(w, role=role), myid(role=role)) + t = @async remotecall_fetch((p,to_id) -> remotecall_fetch((to_id, role) -> exec_conn_func(to_id; role = role), p, to_id, p == 1 ? :master : :worker; role = role), 1, wid(w, role=role), myid(role=role); role=role) end errormonitor(t) wait_for_conn(w; role=role) diff --git a/src/remotecall.jl b/src/remotecall.jl index 18b1c5d..4d1fedc 100644 --- a/src/remotecall.jl +++ b/src/remotecall.jl @@ -423,7 +423,7 @@ end # make a thunk to call f on args in a way that simulates what would happen if # the function were sent elsewhere function local_remotecall_thunk(f, args, kwargs) - println("local_remotecall_thunk($f, $args, $kwargs)") + #println("local_remotecall_thunk($f, $args, $kwargs)") return ()->invokelatest(f, args...; kwargs...) end @@ -455,20 +455,39 @@ function remotecall_fetch(f, w::LocalProcess, args...; role= :default, kwargs... return isa(v, RemoteException) ? throw(v) : v end + function remotecall_fetch(f, w::Worker, args...; role= :default, kwargs...) # can be weak, because the program will have no way to refer to the Ref # itself, it only gets the result. oid = RRID(role = role) rv = lookup_ref(oid; role = role) + rv.waitingfor = wid(w, role = role) + send_msg(w, MsgHeader(RRID(0,0), oid), CallMsg{:call_fetch}(f, args, kwargs); role = role) + v = take!(rv) + lock(client_refs) do + delete!(PGRP(role = role).refs, oid) + end + return isa(v, RemoteException) ? throw(v) : v +end + + +#= +function remotecall_fetch(f, w::Worker, args...; kwargs...) + # can be weak, because the program will have no way to refer to the Ref + # itself, it only gets the result. + role = haskey(kwargs, :role) ? kwargs[:role] : :default + oid = RRID(role = role) + rv = lookup_ref(oid; role = role) rv.waitingfor = wid(w, role=role) - @info "send_msg ..." - send_msg(w, MsgHeader(RRID(0,0), oid), CallMsg{:call_fetch}(f, args, role = role, kwargs); role = role) + @info "send_msg ...$(Base.nameof(f)) === $(Base.kwarg_decl.(methods(f)))" + send_msg(w, MsgHeader(RRID(0,0), oid), CallMsg{:call_fetch}(f, args, kwargs); role = role) v = take!(rv) lock(client_refs) do delete!(PGRP(role = role).refs, oid) end return isa(v, RemoteException) ? throw(v) : v end +=# """ remotecall_fetch(f, id::Integer, args...; kwargs...) 
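# Why PATCH 13's `CallMsg{:call_fetch}(f, args, role = role, kwargs)` failed and
# PATCH 14 restores the three-positional-argument form: in the stock message
# layout, caller keyword arguments travel as an ordinary positional field. A
# minimal, self-contained sketch (names and field types assumed, mirroring
# src/messages.jl rather than quoting it):

abstract type AbstractMsgSketch end

struct CallMsgSketch{Mode} <: AbstractMsgSketch
    f::Any
    args::Tuple
    kwargs::Any     # captured caller keyword arguments ride along here
end

CallMsgSketch{:call_fetch}(sum, (1:10,), pairs((;)))   # ok: kwargs positional
# CallMsgSketch{:call_fetch}(sum, (1:10,), role = :master, pairs((;)))
# would throw a MethodError: the struct constructor accepts no keywords,
# which is exactly what PATCH 13 tripped over.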
From 61669cdeb9c04f906f1798bbf07f4ec672ffb1d0 Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior
Date: Fri, 16 Feb 2024 19:55:42 -0300
Subject: [PATCH 15/54] bug correction

---
 src/cluster.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cluster.jl b/src/cluster.jl
index 9451a4c..74ae5be 100644
--- a/src/cluster.jl
+++ b/src/cluster.jl
@@ -168,7 +168,7 @@ function check_worker_state(w::Worker; role= :default)
             t = @async exec_conn_func(w; role=role)
         else
             # route request via node 1
-            t = @async remotecall_fetch((p,to_id) -> remotecall_fetch((to_id, role) -> exec_conn_func(to_id; role = role), p, to_id, p == 1 ? :master : :worker; role = role), 1, wid(w, role=role), myid(role=role); role=role)
+            t = @async remotecall_fetch((p,to_id) -> remotecall_fetch((to_id, role2) -> exec_conn_func(to_id; role = role2), p, to_id, p == 1 ? :master : :worker; role = :master), 1, wid(w, role=role), myid(role=role); role=role)
         end
         errormonitor(t)
         wait_for_conn(w; role=role)

From 0d68586c94d3a1967c46957b627b5777893911e2 Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior
Date: Thu, 14 Mar 2024 15:39:14 -0300
Subject: [PATCH 16/54] Incorporating changes

---
 src/cluster.jl          | 2 +-
 src/process_messages.jl | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/cluster.jl b/src/cluster.jl
index 74ae5be..82b41b6 100644
--- a/src/cluster.jl
+++ b/src/cluster.jl
@@ -92,7 +92,7 @@ mutable struct WorkerConfig
     end
 end
 
-@enum WorkerState W_CREATED W_CONNECTED W_TERMINATING W_TERMINATED
+@enum WorkerState W_CREATED W_CONNECTED W_TERMINATING W_TERMINATED W_UNKNOWN_STATE
 mutable struct Worker
     id::Int
     msg_lock::Threads.ReentrantLock # Lock for del_msgs, add_msgs, and gcflag

diff --git a/src/process_messages.jl b/src/process_messages.jl
index fa2d8c5..2f190e2 100644
--- a/src/process_messages.jl
+++ b/src/process_messages.jl
@@ -211,8 +211,7 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool; role=
             handle_msg(msg, header, r_stream, w_stream, version; role = role)
         end
     catch e
-        werr = worker_from_id(wpid; role = role)
-        oldstate = werr.state
+        oldstate = W_UNKNOWN_STATE
 
         # Check again as it may have been set in a message handler but not propagated to the calling block above
         if wpid < 1
@@ -223,6 +222,8 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool; role=
             println(stderr, e, CapturedException(e, catch_backtrace()))
             println(stderr, "Process($(myid(role=role))) - Unknown remote, closing connection.")
         elseif !(wpid in map_del_wrkr)
+            werr = worker_from_id(wpid)
+            oldstate = werr.state
             set_worker_state(werr, W_TERMINATED)
 
             # If unhandleable error occurred talking to pid 1, exit

From 71cdd41ea4f37758606657d7248839af32d6d1f6 Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior
Date: Fri, 5 Apr 2024 09:25:55 -0300
Subject: [PATCH 17/54] test

---
 src/managers.jl | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/managers.jl b/src/managers.jl
index 4093414..71e0d54 100644
--- a/src/managers.jl
+++ b/src/managers.jl
@@ -573,7 +573,12 @@ workers.
 function connect(manager::ClusterManager, pid::Int, config::WorkerConfig)
     if config.connect_at !== nothing
         # this is a worker-to-worker setup call.
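        # (In stock Distributed this branch defers to connect_w2w, which dials
        # the peer directly at the (host, port) pair advertised in
        # config.connect_at. The change below folds that into the ordinary
        # master-connect path as an experiment; PATCH 22 restores connect_w2w.)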
- return connect_w2w(pid, config) + (rhost, rport) = notnothing(config.connect_at)::Tuple{String, Int} + config.host = rhost + config.port = rport + config.connect_at = nothing + #return connect_w2w(pid, config) + return connect(manager, pid, config) end # master connecting to workers From 5d48a4793af151c0f015fcb0c71e0f7179ef92e3 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 5 Apr 2024 09:44:52 -0300 Subject: [PATCH 18/54] teste --- src/managers.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/managers.jl b/src/managers.jl index 71e0d54..a6d9334 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -577,6 +577,7 @@ function connect(manager::ClusterManager, pid::Int, config::WorkerConfig) config.host = rhost config.port = rport config.connect_at = nothing + config.io = nothing #return connect_w2w(pid, config) return connect(manager, pid, config) end From 7b7c0402a2415a1a44d3f711ca1a18a80bf34ac7 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 5 Apr 2024 09:49:36 -0300 Subject: [PATCH 19/54] teste --- src/managers.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/managers.jl b/src/managers.jl index a6d9334..df4d7a6 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -585,6 +585,7 @@ function connect(manager::ClusterManager, pid::Int, config::WorkerConfig) # master connecting to workers if config.io !== nothing (bind_addr, port::Int) = read_worker_host_port(config.io) + @info "CONNECT W $bind_addr $port" pubhost = something(config.host, bind_addr) config.host = pubhost config.port = port From d9d823470305d5539e9b83a0f4979cfec906623e Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 5 Apr 2024 09:57:08 -0300 Subject: [PATCH 20/54] teste --- src/managers.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/managers.jl b/src/managers.jl index df4d7a6..7c1ac37 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -582,19 +582,23 @@ function connect(manager::ClusterManager, pid::Int, config::WorkerConfig) return connect(manager, pid, config) end + @info "CONNECT W1" + # master connecting to workers if config.io !== nothing (bind_addr, port::Int) = read_worker_host_port(config.io) - @info "CONNECT W $bind_addr $port" + @info "CONNECT W2 $bind_addr $port" pubhost = something(config.host, bind_addr) config.host = pubhost config.port = port else + @info "CONNECT W3" pubhost = notnothing(config.host) port = notnothing(config.port) bind_addr = something(config.bind_addr, pubhost) end + @info "CONNECT W4" tunnel = something(config.tunnel, false) s = split(pubhost,'@') From 21a1c0a9650e4683c39ae87295034d5ad3110dfc Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 5 Apr 2024 09:59:11 -0300 Subject: [PATCH 21/54] teste --- src/managers.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/managers.jl b/src/managers.jl index 7c1ac37..ba52c77 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -577,7 +577,6 @@ function connect(manager::ClusterManager, pid::Int, config::WorkerConfig) config.host = rhost config.port = rport config.connect_at = nothing - config.io = nothing #return connect_w2w(pid, config) return connect(manager, pid, config) end From eb045e7c2a3f5bc082ae411ebced82f17c58cd7f Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 5 Apr 2024 10:02:05 -0300 Subject: [PATCH 22/54] teste finished --- src/managers.jl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/managers.jl 
b/src/managers.jl index ba52c77..d2f9868 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -573,31 +573,31 @@ workers. function connect(manager::ClusterManager, pid::Int, config::WorkerConfig) if config.connect_at !== nothing # this is a worker-to-worker setup call. - (rhost, rport) = notnothing(config.connect_at)::Tuple{String, Int} - config.host = rhost - config.port = rport - config.connect_at = nothing - #return connect_w2w(pid, config) - return connect(manager, pid, config) + #(rhost, rport) = notnothing(config.connect_at)::Tuple{String, Int} + #config.host = rhost + #config.port = rport + #config.connect_at = nothing + return connect_w2w(pid, config) + #return connect(manager, pid, config) end - @info "CONNECT W1" + #@info "CONNECT W1" # master connecting to workers if config.io !== nothing (bind_addr, port::Int) = read_worker_host_port(config.io) - @info "CONNECT W2 $bind_addr $port" + #@info "CONNECT W2 $bind_addr $port" pubhost = something(config.host, bind_addr) config.host = pubhost config.port = port else - @info "CONNECT W3" + #@info "CONNECT W3" pubhost = notnothing(config.host) port = notnothing(config.port) bind_addr = something(config.bind_addr, pubhost) end - @info "CONNECT W4" + #@info "CONNECT W4" tunnel = something(config.tunnel, false) s = split(pubhost,'@') From f0a6e3c9f676deb16c65765ed59885b7a8c23af3 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 5 Apr 2024 14:49:20 -0300 Subject: [PATCH 23/54] teste finished --- src/managers.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/managers.jl b/src/managers.jl index d2f9868..b9148d0 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -701,6 +701,8 @@ function connect_to_worker(host::AbstractString, port::Integer) bind_addr = getaddrinfo(host) end + @info "connect_to_worker: $host $port" + iptype = typeof(bind_addr) sock = socket_reuse_port(iptype) connect(sock, bind_addr, UInt16(port)) From a3b2ce744f0e8c92c5e06cc53445b2c3b83ea541 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 5 Apr 2024 15:14:05 -0300 Subject: [PATCH 24/54] teste --- src/process_messages.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/process_messages.jl b/src/process_messages.jl index 2f190e2..090ad74 100644 --- a/src/process_messages.jl +++ b/src/process_messages.jl @@ -170,7 +170,7 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool; role= while true reset_state(serializer) header = deserialize_hdr_raw(r_stream) - #println("header: ", header) + println("header: ", header) try msg = invokelatest(deserialize_msg, serializer) @@ -207,7 +207,7 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool; role= end readbytes!(r_stream, boundary, length(MSG_BOUNDARY)) - #println("got msg: ", typeof(msg)) + println("got msg: ", typeof(msg)) handle_msg(msg, header, r_stream, w_stream, version; role = role) end catch e From 8b5e4602d4e496704c6dc71e11260658feb3c001 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 5 Apr 2024 15:44:54 -0300 Subject: [PATCH 25/54] teste --- src/cluster.jl | 2 +- src/managers.jl | 2 +- src/process_messages.jl | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cluster.jl b/src/cluster.jl index 82b41b6..143e37d 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -1427,7 +1427,7 @@ end function init_parallel() start_gc_msgs_task(role = :master) # TO CHECK - start_gc_msgs_task(role = :worker) # TO CHECK + start_gc_msgs_task(role = :worker) # 
TO CHECK # start in "head node" mode, if worker, will override later. #global PGRP diff --git a/src/managers.jl b/src/managers.jl index b9148d0..7f689c6 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -701,7 +701,7 @@ function connect_to_worker(host::AbstractString, port::Integer) bind_addr = getaddrinfo(host) end - @info "connect_to_worker: $host $port" + @info "connect_to_worker: $host $port $bind_addr" iptype = typeof(bind_addr) sock = socket_reuse_port(iptype) diff --git a/src/process_messages.jl b/src/process_messages.jl index 090ad74..2f190e2 100644 --- a/src/process_messages.jl +++ b/src/process_messages.jl @@ -170,7 +170,7 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool; role= while true reset_state(serializer) header = deserialize_hdr_raw(r_stream) - println("header: ", header) + #println("header: ", header) try msg = invokelatest(deserialize_msg, serializer) @@ -207,7 +207,7 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool; role= end readbytes!(r_stream, boundary, length(MSG_BOUNDARY)) - println("got msg: ", typeof(msg)) + #println("got msg: ", typeof(msg)) handle_msg(msg, header, r_stream, w_stream, version; role = role) end catch e From e6551039cafb9a1923bd12a0d74934ec7212bb12 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 5 Apr 2024 15:49:00 -0300 Subject: [PATCH 26/54] teste --- src/managers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/managers.jl b/src/managers.jl index 7f689c6..c041fcd 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -701,7 +701,7 @@ function connect_to_worker(host::AbstractString, port::Integer) bind_addr = getaddrinfo(host) end - @info "connect_to_worker: $host $port $bind_addr" + @info "connect_to_worker: $host $port $bind_addr $iptype" iptype = typeof(bind_addr) sock = socket_reuse_port(iptype) From 5d5ff72c9d1ca1ab018d6ab32813b72e5da3c496 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 5 Apr 2024 15:50:42 -0300 Subject: [PATCH 27/54] teste --- src/managers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/managers.jl b/src/managers.jl index c041fcd..67032b5 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -701,9 +701,9 @@ function connect_to_worker(host::AbstractString, port::Integer) bind_addr = getaddrinfo(host) end - @info "connect_to_worker: $host $port $bind_addr $iptype" iptype = typeof(bind_addr) + @info "connect_to_worker: $host $port $bind_addr $iptype" sock = socket_reuse_port(iptype) connect(sock, bind_addr, UInt16(port)) From bb387679870146dc7ee1de735730eb1f293a7bb2 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 5 Apr 2024 16:13:17 -0300 Subject: [PATCH 28/54] teste --- src/cluster.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster.jl b/src/cluster.jl index 143e37d..4227e65 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -1397,7 +1397,7 @@ function init_bind_addr() else bind_port = 0 try - bind_addr = string(getipaddr()) + bind_addr = string(getipaddr(IPv6)) catch # All networking is unavailable, initialize bind_addr to the loopback address # Will cause an exception to be raised only when used. 
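The bind-address experiments below converge (PATCH 32) on advertising the last
non-loopback IPv4 interface and falling back to loopback when interface
enumeration fails. A standalone sketch of that selection rule, with a
hypothetical helper name (the real logic sits inline in init_bind_addr):

using Sockets

function pick_bind_addr()
    try
        ips = getipaddrs(IPv4; loopback = false)   # external IPv4 addresses
        string(ips[end])                           # the patches take the last one
    catch
        "127.0.0.1"                                # no usable network: loopback
    end
end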
From 0be8014faea732faa6145941476b561cb14522cd Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 5 Apr 2024 16:15:15 -0300 Subject: [PATCH 29/54] teste --- src/cluster.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster.jl b/src/cluster.jl index 4227e65..1a4be09 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -261,7 +261,7 @@ function start_worker(out::IO, cookie::AbstractString=readline(stdin); close_std stderr_to_stdout && redirect_stderr(stdout) init_worker(cookie) - interface = IPv4(LPROC.bind_addr) + interface = IPv6(LPROC.bind_addr) if LPROC.bind_port == 0 port_hint = 9000 + (getpid() % 1000) (port, sock) = listenany(interface, UInt16(port_hint)) From 309b2d4b08fa399b05e8d612b958ca81903201e6 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 5 Apr 2024 16:17:16 -0300 Subject: [PATCH 30/54] teste --- src/cluster.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cluster.jl b/src/cluster.jl index 1a4be09..143e37d 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -261,7 +261,7 @@ function start_worker(out::IO, cookie::AbstractString=readline(stdin); close_std stderr_to_stdout && redirect_stderr(stdout) init_worker(cookie) - interface = IPv6(LPROC.bind_addr) + interface = IPv4(LPROC.bind_addr) if LPROC.bind_port == 0 port_hint = 9000 + (getpid() % 1000) (port, sock) = listenany(interface, UInt16(port_hint)) @@ -1397,7 +1397,7 @@ function init_bind_addr() else bind_port = 0 try - bind_addr = string(getipaddr(IPv6)) + bind_addr = string(getipaddr()) catch # All networking is unavailable, initialize bind_addr to the loopback address # Will cause an exception to be raised only when used. From eac5f115d19dcf36211db0d879f56e854caad841 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 5 Apr 2024 17:00:22 -0300 Subject: [PATCH 31/54] teste --- src/cluster.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cluster.jl b/src/cluster.jl index 143e37d..1078925 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -1398,6 +1398,7 @@ function init_bind_addr() bind_port = 0 try bind_addr = string(getipaddr()) + @info "ADDR: $(getipaddrs())" catch # All networking is unavailable, initialize bind_addr to the loopback address # Will cause an exception to be raised only when used. From 8f10b720b29119d82d6dbcd4d8f57158aff42425 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Fri, 5 Apr 2024 17:05:33 -0300 Subject: [PATCH 32/54] teste --- src/cluster.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/cluster.jl b/src/cluster.jl index 1078925..1ead8c0 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -1397,8 +1397,10 @@ function init_bind_addr() else bind_port = 0 try - bind_addr = string(getipaddr()) - @info "ADDR: $(getipaddrs())" + ips = getipaddrs(IPv4; loopback = false) + n = length(ips) + bind_addr = string(ips[n]) + #@info "ADDR: $(getipaddrs())" catch # All networking is unavailable, initialize bind_addr to the loopback address # Will cause an exception to be raised only when used. From 8156410bb8e8838d5fa2966b6baca9c3ad1144d1 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Sat, 27 Apr 2024 22:50:14 -0300 Subject: [PATCH 33/54] ... 
--- src/managers.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/managers.jl b/src/managers.jl index 67032b5..4866a3e 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -362,7 +362,11 @@ function launch_on_machine(manager::SSHManager, machine::AbstractString, cnt, pa wconfig.count = cnt wconfig.max_parallel = params[:max_parallel] wconfig.enable_threaded_blas = params[:enable_threaded_blas] - + @info "will test connect_idents -- $(wconfig.ident)" + if haskey(params,:connect_idents) + wconfig.connect_idents = Vector(params[:connect_idents]) + @info "connect_idents = $(wconfig.connect_idents)" + end push!(launched, wconfig) notify(launch_ntfy) From c403d15071574a5468e60a4e8156514bb96b2180 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Sat, 27 Apr 2024 23:05:07 -0300 Subject: [PATCH 34/54] ... --- src/managers.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/managers.jl b/src/managers.jl index 4866a3e..43a3a80 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -169,7 +169,8 @@ default_addprocs_params(::SSHManager) = :env => [], :tunnel => false, :multiplex => false, - :max_parallel => 10)) + :max_parallel => 10, + :connect_idents => nothing)) function launch(manager::SSHManager, params::Dict, launched::Array, launch_ntfy::Condition) # Launch one worker on each unique host in parallel. Additional workers are launched later. From b1342b49924ce5862cb0c059ce806600d1b6add7 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Sat, 27 Apr 2024 23:18:50 -0300 Subject: [PATCH 35/54] ... --- src/managers.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/managers.jl b/src/managers.jl index 43a3a80..4fb1da5 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -170,6 +170,7 @@ default_addprocs_params(::SSHManager) = :tunnel => false, :multiplex => false, :max_parallel => 10, + :ident => nothing, :connect_idents => nothing)) function launch(manager::SSHManager, params::Dict, launched::Array, launch_ntfy::Condition) @@ -368,6 +369,9 @@ function launch_on_machine(manager::SSHManager, machine::AbstractString, cnt, pa wconfig.connect_idents = Vector(params[:connect_idents]) @info "connect_idents = $(wconfig.connect_idents)" end + if haskey(params, :ident) + wconfig.ident = params[:ident] + end push!(launched, wconfig) notify(launch_ntfy) From 726336ef661bfea3b5ebf999670ed3490365d977 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Sat, 27 Apr 2024 23:27:10 -0300 Subject: [PATCH 36/54] ... --- src/managers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/managers.jl b/src/managers.jl index 4fb1da5..594aa3c 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -365,7 +365,7 @@ function launch_on_machine(manager::SSHManager, machine::AbstractString, cnt, pa wconfig.max_parallel = params[:max_parallel] wconfig.enable_threaded_blas = params[:enable_threaded_blas] @info "will test connect_idents -- $(wconfig.ident)" - if haskey(params,:connect_idents) + if haskey(params,:connect_idents) && !isnothing(params[:connect_idents]) wconfig.connect_idents = Vector(params[:connect_idents]) @info "connect_idents = $(wconfig.connect_idents)" end From bbc1164dc8ae8e13507d16c5113316467c0854ee Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Sat, 27 Apr 2024 23:32:48 -0300 Subject: [PATCH 37/54] ... 
--- src/managers.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/managers.jl b/src/managers.jl index 594aa3c..3dfbc4e 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -369,8 +369,9 @@ function launch_on_machine(manager::SSHManager, machine::AbstractString, cnt, pa wconfig.connect_idents = Vector(params[:connect_idents]) @info "connect_idents = $(wconfig.connect_idents)" end - if haskey(params, :ident) + if haskey(params, :ident) && !isnothing(params[:ident]) wconfig.ident = params[:ident] + @info "-------------- $(wconfig.ident)" end push!(launched, wconfig) From 4ba94b5223c58764a10a91f1083624cb59740347 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Tue, 18 Jun 2024 15:55:03 -0300 Subject: [PATCH 38/54] ... --- src/managers.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/managers.jl b/src/managers.jl index 3dfbc4e..f2d3ac8 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -706,8 +706,10 @@ function connect_to_worker(host::AbstractString, port::Integer) # host may be a stringified ipv4 / ipv6 address or a dns name bind_addr = nothing try + @info "1: PARSE $host" bind_addr = parse(IPAddr,host) catch + @info "2: $(getalladdrinfo(host))" bind_addr = getaddrinfo(host) end From 58a96df430c86c691f4d7ad45715bf7b2ee5305d Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Tue, 18 Jun 2024 16:06:13 -0300 Subject: [PATCH 39/54] ... --- src/managers.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/managers.jl b/src/managers.jl index f2d3ac8..01038e6 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -706,10 +706,9 @@ function connect_to_worker(host::AbstractString, port::Integer) # host may be a stringified ipv4 / ipv6 address or a dns name bind_addr = nothing try - @info "1: PARSE $host" + @info "1: PARSE $host - $(getalladdrinfo(host))" bind_addr = parse(IPAddr,host) catch - @info "2: $(getalladdrinfo(host))" bind_addr = getaddrinfo(host) end From bf7df7163091e77658fba8b90c11e71ea152b5b1 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Tue, 18 Jun 2024 16:22:27 -0300 Subject: [PATCH 40/54] ... --- src/cluster.jl | 2 ++ src/managers.jl | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cluster.jl b/src/cluster.jl index 1ead8c0..a552726 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -1387,6 +1387,7 @@ end function init_bind_addr() opts = JLOptions() if opts.bindto != C_NULL + @info "A1: $(opt.bindto)" bind_to = split(unsafe_string(opts.bindto), ":") bind_addr = string(parse(IPAddr, bind_to[1])) if length(bind_to) > 1 @@ -1395,6 +1396,7 @@ function init_bind_addr() bind_port = 0 end else + @info "A2: $(getipaddrs(IPv4; loopback = false))" bind_port = 0 try ips = getipaddrs(IPv4; loopback = false) diff --git a/src/managers.jl b/src/managers.jl index 01038e6..3dfbc4e 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -706,7 +706,6 @@ function connect_to_worker(host::AbstractString, port::Integer) # host may be a stringified ipv4 / ipv6 address or a dns name bind_addr = nothing try - @info "1: PARSE $host - $(getalladdrinfo(host))" bind_addr = parse(IPAddr,host) catch bind_addr = getaddrinfo(host) From 3f9346790f6bb7f8500a10fe78df5bf587bf15ca Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Tue, 18 Jun 2024 18:19:50 -0300 Subject: [PATCH 41/54] ... 
--- src/cluster.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster.jl b/src/cluster.jl index a552726..90fa52a 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -1387,7 +1387,7 @@ end function init_bind_addr() opts = JLOptions() if opts.bindto != C_NULL - @info "A1: $(opt.bindto)" + @info "A1: $(opts.bindto)" bind_to = split(unsafe_string(opts.bindto), ":") bind_addr = string(parse(IPAddr, bind_to[1])) if length(bind_to) > 1 From 6517f6be2fac190efd3933edae2bd153651dfd99 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Tue, 18 Jun 2024 18:22:34 -0300 Subject: [PATCH 42/54] ... --- src/cluster.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster.jl b/src/cluster.jl index 90fa52a..52da968 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -1387,8 +1387,8 @@ end function init_bind_addr() opts = JLOptions() if opts.bindto != C_NULL - @info "A1: $(opts.bindto)" bind_to = split(unsafe_string(opts.bindto), ":") + @info "A1: $bind_to" bind_addr = string(parse(IPAddr, bind_to[1])) if length(bind_to) > 1 bind_port = parse(Int,bind_to[2]) From c93d767bf6df8c1cb91b0286d36ebbdd17718331 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Tue, 18 Jun 2024 20:32:49 -0300 Subject: [PATCH 43/54] ... --- src/managers.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/managers.jl b/src/managers.jl index 3dfbc4e..235b359 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -591,7 +591,7 @@ function connect(manager::ClusterManager, pid::Int, config::WorkerConfig) #return connect(manager, pid, config) end - #@info "CONNECT W1" + #@info "CONNECT W1 " # master connecting to workers if config.io !== nothing @@ -607,8 +607,8 @@ function connect(manager::ClusterManager, pid::Int, config::WorkerConfig) bind_addr = something(config.bind_addr, pubhost) end - #@info "CONNECT W4" tunnel = something(config.tunnel, false) + @info "CONNECT TUNNEL=$tunnel" s = split(pubhost,'@') user = "" From 9e06cf06899c95bfb38d83796a09018409d2eb4c Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Tue, 18 Jun 2024 20:35:42 -0300 Subject: [PATCH 44/54] ... --- src/managers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/managers.jl b/src/managers.jl index 235b359..57cb855 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -607,7 +607,7 @@ function connect(manager::ClusterManager, pid::Int, config::WorkerConfig) bind_addr = something(config.bind_addr, pubhost) end - tunnel = something(config.tunnel, false) + tunnel = true # something(config.tunnel, false) @info "CONNECT TUNNEL=$tunnel" s = split(pubhost,'@') From 7c8c59843edfddcd4126048cce99b108ce1abd36 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Tue, 18 Jun 2024 20:37:24 -0300 Subject: [PATCH 45/54] ... --- src/managers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/managers.jl b/src/managers.jl index 57cb855..235b359 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -607,7 +607,7 @@ function connect(manager::ClusterManager, pid::Int, config::WorkerConfig) bind_addr = something(config.bind_addr, pubhost) end - tunnel = true # something(config.tunnel, false) + tunnel = something(config.tunnel, false) @info "CONNECT TUNNEL=$tunnel" s = split(pubhost,'@') From f44d38007f990f7604c65eefa538f2c2fb480119 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Wed, 19 Jun 2024 07:51:56 -0300 Subject: [PATCH 46/54] test ... 
--- src/cluster.jl | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/src/cluster.jl b/src/cluster.jl index 52da968..2078bcc 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -208,6 +208,7 @@ mutable struct LocalProcess id0::Int id1::Int bind_addr::String + bind_addr_2::String bind_port::UInt16 cookie::String LocalProcess() = new(1,1) @@ -273,7 +274,7 @@ function start_worker(out::IO, cookie::AbstractString=readline(stdin); close_std client = accept(sock) process_messages(client, client, true; role = :worker) end) - println(out, "julia_worker:$(string(LPROC.bind_port))#$(LPROC.bind_addr)\n") # print header + println(out, "julia_worker:$(string(LPROC.bind_port))#$(LPROC.bind_addr_2)\n") # print header flush(out) Sockets.nagle(sock, false) @@ -1384,7 +1385,7 @@ function terminate_all_workers(;role= :default) end # initialize the local proc network address / port -function init_bind_addr() +#=function init_bind_addr() opts = JLOptions() if opts.bindto != C_NULL bind_to = split(unsafe_string(opts.bindto), ":") @@ -1413,6 +1414,43 @@ function init_bind_addr() LPROC.bind_addr = bind_addr LPROC.bind_port = UInt16(bind_port) end +=# + +function init_bind_addr() + opts = JLOptions() + + @info "A2: $(getipaddrs(IPv4; loopback = false))" + bind_port = 0 + try + ips = getipaddrs(IPv4; loopback = false) + n = length(ips) + bind_addr = string(ips[n]) + #@info "ADDR: $(getipaddrs())" + catch + # All networking is unavailable, initialize bind_addr to the loopback address + # Will cause an exception to be raised only when used. + bind_addr = "127.0.0.1" + end + + if opts.bindto != C_NULL + bind_to = split(unsafe_string(opts.bindto), ":") + @info "A1: $bind_to" + bind_addr_2 = string(parse(IPAddr, bind_to[1])) + if length(bind_to) > 1 + bind_port = parse(Int,bind_to[2]) + else + bind_port = 0 + end + else + bind_addr_2 = bind_addr + end + + global LPROC + LPROC.bind_addr = bind_addr + LPROC.bind_addr_2 = bind_addr_2 + LPROC.bind_port = UInt16(bind_port) +end + using Random: randstring From 61bc078e63be913ff6c73c84ce09a378932c37e7 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Wed, 19 Jun 2024 08:01:03 -0300 Subject: [PATCH 47/54] test ... 
--- src/cluster.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cluster.jl b/src/cluster.jl index 2078bcc..31b1fe2 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -1421,6 +1421,7 @@ function init_bind_addr() @info "A2: $(getipaddrs(IPv4; loopback = false))" bind_port = 0 + bind_addr = "" try ips = getipaddrs(IPv4; loopback = false) n = length(ips) From d329a0074e32d6a0b94112237991c55b4d3b02ab Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Tue, 3 Sep 2024 16:48:32 -0300 Subject: [PATCH 48/54] incorporating last commits of the original version: af89e6cd76b2a613d061d55864b8fc7ec8215911 (Aug 14, 2024) -> 3b889eecfdccb7febb9ae8abf9e3c604ffbc40ea (Apr 9, 2024) --- .github/workflows/ci.yml | 11 ++++++----- README.md | 5 ++--- src/Distributed.jl | 2 +- src/clusterserialize.jl | 3 ++- src/managers.jl | 10 ++++++---- src/workerpool.jl | 8 +++++++- test/distributed_exec.jl | 32 +++++++++++++++++++++++++++++++- 7 files changed, 55 insertions(+), 16 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 01e6847..a6a1e35 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,12 +34,12 @@ jobs: - os: macOS-latest arch: x86 steps: - - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - - uses: actions/cache@v1 + - uses: actions/cache@v4 env: cache-name: cache-artifacts with: @@ -54,13 +54,14 @@ jobs: env: JULIA_DISTRIBUTED_TESTING_STANDALONE: 1 - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v1 + - uses: codecov/codecov-action@v4 with: file: lcov.info + token: ${{ secrets.CODECOV_TOKEN }} docs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@latest with: # version: '1.6' diff --git a/README.md b/README.md index 5a5740b..f66be89 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Distributed (with a multiscale parallelism extension) -The `Distributed` package provides functionality for creating and controlling multiple Julia processes remotely, and for performing distributed and parallel computing. It uses network sockets or other supported interfaces to communicate between Julia processes, and relies on Julia's `Serialization` stdlib package to transform Julia objects into a format that can be transferred between processes efficiently. It provides a full set of utilities to create and destroy new Julia processes and add them to a "cluster" (a collection of Julia processes connected together), as well as functions to perform Remote Procedure Calls (RPC) between the processes within a cluster. See [`API`](@ref) for details. +The `Distributed` package provides functionality for creating and controlling multiple Julia processes remotely, and for performing distributed and parallel computing. It uses network sockets or other supported interfaces to communicate between Julia processes, and relies on Julia's `Serialization` stdlib package to transform Julia objects into a format that can be transferred between processes efficiently. It provides a full set of utilities to create and destroy new Julia processes and add them to a "cluster" (a collection of Julia processes connected together), as well as functions to perform Remote Procedure Calls (RPC) between the processes within a cluster. See the `API` section for details. This package ships as part of the Julia stdlib. 
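Patch 47's one-line fix above (`bind_addr = ""` before the `try`) addresses a Julia scoping rule: a variable first assigned inside a `try` block is local to that block, so reading it afterwards fails unless the name is already bound in the enclosing scope. A minimal repro of the bug and the fix, with hypothetical function names:

```julia
function addr_broken()
    try
        bind_addr = "10.0.0.1"
    catch
        bind_addr = "127.0.0.1"
    end
    return bind_addr        # UndefVarError: `bind_addr` not defined
end

function addr_fixed()
    bind_addr = ""          # bind the name in the enclosing scope first
    try
        bind_addr = "10.0.0.1"
    catch
        bind_addr = "127.0.0.1"
    end
    return bind_addr        # "10.0.0.1"
end
```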
@@ -64,5 +64,4 @@ For controlling multiple processes at once: Julia processes connected with `Distributed` are all assigned a cluster-unique `Int` identifier, starting from `1`. The first Julia process within a cluster is given ID `1`, while other processes added via `addprocs` get incrementing IDs (`2`, `3`, etc.). Functions and macros which communicate from one process to another usually take one or more identifiers to determine which process they target - for example, `remotecall_fetch(myid, 2)` calls `myid()` on process 2. -!!! note - Only process 1 (often called the "head", "primary", or "master") may add or remove processes, and manages the rest of the cluster. Other processes (called "workers" or "worker processes") may still call functions on each other and send and receive data, but `addprocs`/`rmprocs` on worker processes will fail with an error. +**Note:** Only process 1 (often called the "head", "primary", or "master") may add or remove processes, and manages the rest of the cluster. Other processes (called "workers" or "worker processes") may still call functions on each other and send and receive data, but `addprocs`/`rmprocs` on worker processes will fail with an error. diff --git a/src/Distributed.jl b/src/Distributed.jl index 6539f1a..9983d9b 100644 --- a/src/Distributed.jl +++ b/src/Distributed.jl @@ -15,7 +15,7 @@ using Base: Process, Semaphore, JLOptions, buffer_writes, @async_unwrap, julia_cmd, AsyncGenerator, acquire, release, invokelatest, shell_escape_posixly, shell_escape_csh, shell_escape_wincmd, escape_microsoft_c_args, - uv_error, something, notnothing, isbuffered, mapany + uv_error, something, notnothing, isbuffered, mapany, SizeUnknown using Base.Threads: Event using Serialization, Sockets diff --git a/src/clusterserialize.jl b/src/clusterserialize.jl index 3520a30..589ebe1 100644 --- a/src/clusterserialize.jl +++ b/src/clusterserialize.jl @@ -167,10 +167,11 @@ function deserialize_global_from_main(s::ClusterSerializer, sym) return nothing end end + Core.eval(Main, Expr(:global, sym)) if sym_isconst ccall(:jl_set_const, Cvoid, (Any, Any, Any), Main, sym, v) else - setglobal!(Main, sym, v) + invokelatest(setglobal!, Main, sym, v) end return nothing end diff --git a/src/managers.jl b/src/managers.jl index 235b359..b76856b 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -111,7 +111,9 @@ addprocs([ version is used on all remote machines because serialization and code distribution might fail otherwise. -* `exeflags`: additional flags passed to the worker processes. +* `exeflags`: additional flags passed to the worker processes. It can either be a `Cmd`, a `String` + holding one flag, or a collection of strings, with one element per flag. + E.g. `\`--threads=auto project=.\``, `"--compile-trace=stderr"` or `["--threads=auto", "--compile=all"]`. * `topology`: Specifies how the workers connect to each other. Sending a message between unconnected workers results in an error. 
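The `exeflags` docstring above now spells out three accepted forms. A usage sketch with illustrative flag values; any valid `julia` command-line flags work the same way:

```julia
using Distributed

addprocs(2; exeflags = `--threads=2 --project=.`)        # a Cmd
addprocs(1; exeflags = "--check-bounds=yes")             # a single flag as a String
addprocs(1; exeflags = ["--threads=2", "--optimize=0"])  # a collection, one flag per element
```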
@@ -770,12 +772,12 @@ function kill(manager::LocalManager, pid::Int, config::WorkerConfig; exit_timeou # Check to see if our child exited, and if not, send an actual kill signal if !process_exited(config.process) - @warn("Failed to gracefully kill worker $(pid), sending SIGTERM") - kill(config.process, Base.SIGTERM) + @warn("Failed to gracefully kill worker $(pid), sending SIGQUIT") + kill(config.process, Base.SIGQUIT) sleep(term_timeout) if !process_exited(config.process) - @warn("Worker $(pid) ignored SIGTERM, sending SIGKILL") + @warn("Worker $(pid) ignored SIGQUIT, sending SIGKILL") kill(config.process, Base.SIGKILL) end end diff --git a/src/workerpool.jl b/src/workerpool.jl index 261cde4..e166f5d 100644 --- a/src/workerpool.jl +++ b/src/workerpool.jl @@ -8,6 +8,7 @@ An `AbstractWorkerPool` should implement: - [`push!`](@ref) - add a new worker to the overall pool (available + busy) - [`put!`](@ref) - put back a worker to the available pool - [`take!`](@ref) - take a worker from the available pool (to be used for remote function execution) + - [`wait`](@ref) - block until a worker is available - [`length`](@ref) - number of workers available in the overall pool - [`isready`](@ref) - return false if a `take!` on the pool would block, else true @@ -120,6 +121,11 @@ function wp_local_take!(pool::AbstractWorkerPool; role= :default) return worker end +function wp_local_wait(pool::AbstractWorkerPool) + wait(pool.channel) + return nothing +end + function remotecall_pool(rc_f, f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) worker = take!(pool; role=role) try @@ -133,7 +139,7 @@ end # NOTE: remotecall_fetch does it automatically, but this will be more efficient as # it avoids the overhead associated with a local remotecall. -for (func, rt) = ((:length, Int), (:isready, Bool), (:workers, Vector{Int}), (:nworkers, Int), (:take!, Int)) +for (func, rt) = ((:length, Int), (:isready, Bool), (:workers, Vector{Int}), (:nworkers, Int), (:take!, Int), (:wait, Nothing)) func_local = Symbol(string("wp_local_", func)) @eval begin function ($func)(pool::WorkerPool; role= :default) diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl index 457ee13..fe62559 100644 --- a/test/distributed_exec.jl +++ b/test/distributed_exec.jl @@ -518,6 +518,8 @@ test_iteration(RemoteChannel(() -> Channel(10)), RemoteChannel(() -> Channel(10) return count end +@everywhere test_iteration_collect(ch) = length(collect(ch)) + @everywhere function test_iteration_put(ch, total) for i in 1:total put!(ch, i) @@ -528,10 +530,16 @@ end let ch = RemoteChannel(() -> Channel(1)) @async test_iteration_put(ch, 10) @test 10 == @fetchfrom id_other test_iteration_take(ch) + ch = RemoteChannel(() -> Channel(1)) + @async test_iteration_put(ch, 10) + @test 10 == @fetchfrom id_other test_iteration_collect(ch) # now reverse ch = RemoteChannel(() -> Channel(1)) @spawnat id_other test_iteration_put(ch, 10) @test 10 == test_iteration_take(ch) + ch = RemoteChannel(() -> Channel(1)) + @spawnat id_other test_iteration_put(ch, 10) + @test 10 == test_iteration_collect(ch) end # make sure exceptions propagate when waiting on Tasks @@ -751,6 +759,28 @@ wp = WorkerPool(workers()) wp = WorkerPool(2:3) @test sort(unique(pmap(_->myid(), wp, 1:100))) == [2,3] +# wait on worker pool +wp = WorkerPool(2:2) +w = take!(wp) + +# local call to _wait +@test !isready(wp) +t = @async wait(wp) +@test !istaskdone(t) +put!(wp, w) +status = timedwait(() -> istaskdone(t), 10) +@test status == :ok + +# remote call to _wait +take!(wp) +@test 
!isready(wp) +f = @spawnat w wait(wp) +@test !isready(f) +put!(wp, w) +status = timedwait(() -> isready(f), 10) +@test status == :ok + + # CachingPool tests wp = CachingPool(workers()) @test [1:100...] == pmap(x->x, wp, 1:100) @@ -2037,7 +2067,7 @@ begin # Next, ensure we get a log message when a worker does not cleanly exit w = only(addprocs(1)) - @test_logs (:warn, r"sending SIGTERM") begin + @test_logs (:warn, r"sending SIGQUIT") begin remote_do(w) do # Cause the 'exit()' message that `rmprocs()` sends to do nothing Core.eval(Base, :(exit() = nothing)) From 98d389a2ee8c21674b43b0e1994f1eccf0d663aa Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Wed, 4 Sep 2024 09:35:26 -0300 Subject: [PATCH 49/54] reverting modification in init_bind_addr --- src/cluster.jl | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/src/cluster.jl b/src/cluster.jl index 31b1fe2..07ce016 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -1397,13 +1397,11 @@ end bind_port = 0 end else - @info "A2: $(getipaddrs(IPv4; loopback = false))" bind_port = 0 try ips = getipaddrs(IPv4; loopback = false) n = length(ips) bind_addr = string(ips[n]) - #@info "ADDR: $(getipaddrs())" catch # All networking is unavailable, initialize bind_addr to the loopback address # Will cause an exception to be raised only when used. @@ -1416,7 +1414,7 @@ end end =# -function init_bind_addr() +#=function init_bind_addr() opts = JLOptions() @info "A2: $(getipaddrs(IPv4; loopback = false))" @@ -1426,7 +1424,7 @@ function init_bind_addr() ips = getipaddrs(IPv4; loopback = false) n = length(ips) bind_addr = string(ips[n]) - #@info "ADDR: $(getipaddrs())" + @info "ADDR: $ips --- $ips" catch # All networking is unavailable, initialize bind_addr to the loopback address # Will cause an exception to be raised only when used. @@ -1447,11 +1445,38 @@ function init_bind_addr() end global LPROC + @info "bind_addr=$bind_addr / bind_addr_2=$bind_addr_2" LPROC.bind_addr = bind_addr - LPROC.bind_addr_2 = bind_addr_2 + LPROC.bind_addr_2 = bind_addr_2 LPROC.bind_port = UInt16(bind_port) end +=# +function init_bind_addr() + opts = JLOptions() + if opts.bindto != C_NULL + bind_to = split(unsafe_string(opts.bindto), ":") + bind_addr = string(parse(IPAddr, bind_to[1])) + if length(bind_to) > 1 + bind_port = parse(Int,bind_to[2]) + else + bind_port = 0 + end + else + bind_port = 0 + try + bind_addr = string(getipaddr()) + catch + # All networking is unavailable, initialize bind_addr to the loopback address + # Will cause an exception to be raised only when used. + bind_addr = "127.0.0.1" + end + end + global LPROC + LPROC.bind_addr = bind_addr + LPROC.bind_addr_2 = bind_addr + LPROC.bind_port = UInt16(bind_port) +end using Random: randstring From 275d08cdfe8db09314b85bd0947d0eca733a8ac3 Mon Sep 17 00:00:00 2001 From: decarvalhojunior-fh Date: Mon, 25 Nov 2024 08:33:25 -0300 Subject: [PATCH 50/54] ... --- src/cluster.jl | 2 + src/managers.jl | 6 +-- test/distributed_exec.jl | 110 --------------------------------------- 3 files changed, 5 insertions(+), 113 deletions(-) diff --git a/src/cluster.jl b/src/cluster.jl index 07ce016..b5b67f7 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -350,6 +350,8 @@ function read_worker_host_port(io::IO) throw(LaunchWorkerError("Unable to read host:port string from worker. 
Launch command exited with error?")) end + @error "conninfo: $conninfo" + ntries -= 1 bind_addr, port = parse_connection_info(conninfo) if !isempty(bind_addr) diff --git a/src/managers.jl b/src/managers.jl index b76856b..452b2fb 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -366,14 +366,14 @@ function launch_on_machine(manager::SSHManager, machine::AbstractString, cnt, pa wconfig.count = cnt wconfig.max_parallel = params[:max_parallel] wconfig.enable_threaded_blas = params[:enable_threaded_blas] - @info "will test connect_idents -- $(wconfig.ident)" + #@info "will test connect_idents -- $(wconfig.ident)" if haskey(params,:connect_idents) && !isnothing(params[:connect_idents]) wconfig.connect_idents = Vector(params[:connect_idents]) - @info "connect_idents = $(wconfig.connect_idents)" + # @info "connect_idents = $(wconfig.connect_idents)" end if haskey(params, :ident) && !isnothing(params[:ident]) wconfig.ident = params[:ident] - @info "-------------- $(wconfig.ident)" + # @info "-------------- $(wconfig.ident)" end push!(launched, wconfig) diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl index fe62559..523b4bd 100644 --- a/test/distributed_exec.jl +++ b/test/distributed_exec.jl @@ -56,7 +56,6 @@ for wid = workers() @test wrole === :worker end -@info "passed 1" #sleep(3) @@ -67,8 +66,6 @@ let count = 0 count_condition = Condition() - @info "passed 2" - function remote_wait(c) @async_logerr begin count += 1 @@ -79,8 +76,6 @@ let yield() end - @info "passed 3" - # @info nworkers() # sleep(30) @@ -98,7 +93,6 @@ let @test count == testcount @test isready(pool) == false - @info "passed 4" #sleep(3) try @@ -118,7 +112,6 @@ let @test count == 0 - @info "passed 5" #sleep(3) for c in testchannels @@ -129,7 +122,6 @@ let @test count == testcount @test isready(pool) == false - @info "passed 6" #sleep(3) for c in reverse(testchannels) @@ -141,13 +133,11 @@ let @test isready(pool) == true end - @info "passed 7" #sleep(3) @test count == 0 end -@info "passed 8" #sleep(3) @@ -202,8 +192,6 @@ function include_thread_unsafe_tests() return true end -@info "passed 9" - # Distributed GC tests for Futures function test_futures_dgc(id) f = remotecall(myid, id) @@ -302,8 +290,6 @@ yield() # flush gc msgs @test poll_while(() -> remotecall_fetch(chk_rrid->(yield(); haskey(Distributed.PGRP().refs, chk_rrid)), id_other, rrid)) -@info "passed 10" - # Distributed GC tests for RemoteChannels function test_remoteref_dgc(id) rr = RemoteChannel(id) @@ -342,8 +328,6 @@ let wid1 = workers()[1], @test poll_while(() -> remotecall_fetch(k -> haskey(Distributed.PGRP().refs, k), wid1, rrid)) end -@info "passed 11" - # Tests for issue #23109 - should not hang. f = @spawnat :any rand(1, 1) Base.Experimental.@sync begin @@ -369,8 +353,6 @@ for i in 1:nworkers() end @test sort(pids) == sort(workers()) -@info "passed 12" - # test getindex on Futures and RemoteChannels function test_indexing(rr) a = rand(5,5) @@ -384,8 +366,6 @@ test_indexing(Future(id_other)) test_indexing(RemoteChannel()) test_indexing(RemoteChannel(id_other)) -@info "passed 13" - # Test ser/deser to non-ClusterSerializer objects. 
function test_regular_io_ser(ref::Distributed.AbstractRemoteRef) io = IOBuffer() @@ -436,8 +416,6 @@ s = [randstring() for x in 1:10^5] num_small_requests = 10000 @test fill(id_other, num_small_requests) == [remotecall_fetch(myid, id_other) for i in 1:num_small_requests] -@info "passed 14" - # test parallel sends of large arrays from multiple tasks to the same remote worker ntasks = 10 rr_list = [Channel(1) for x in 1:ntasks] @@ -486,9 +464,6 @@ test_channel(RemoteChannel(()->Channel(10))) c=Channel{Int}(1) @test_throws MethodError put!(c, "Hello") -@info "passed 15" - - # test channel iterations function test_iteration(in_c, out_c) t=@async for v in in_c @@ -598,8 +573,6 @@ for id in [id_other, id_me] end end -@info "passed 16" - # make sure the stackframe from the remote error can be serialized let ex try @@ -624,8 +597,6 @@ end # pmap tests. Needs at least 4 processors dedicated to the below tests. Which we currently have # since the Distributed tests are now spawned as a separate set. -@info "passed 17" - # Test all combinations of pmap keyword args. pmap_args = [ (:distributed, [:default, false]), @@ -717,8 +688,6 @@ generic_map_tests(pmap_fallback) run_map_equivalence_tests(pmap) @test pmap(uppercase, "Hello World!") == map(uppercase, "Hello World!") -@info "passed 18" - # Simple test for pmap throws error let error_thrown = false try @@ -849,8 +818,6 @@ if Sys.isunix() # aka have ssh remotecall_fetch(rmprocs, 1, new_pids) end - @info "passed 19" - print("\n\nTesting SSHManager. A minimum of 4GB of RAM is recommended.\n") print("Please ensure: \n") @@ -927,8 +894,6 @@ let t = @task 42 @test_throws TaskFailedException(t) Base.wait(t) end -@info "passed 20" - # issue #8207 let A = Any[] @distributed (+) for i in (push!(A,1); 1:2) @@ -937,8 +902,6 @@ let A = Any[] @test length(A) == 1 end -@info "passed 21" - # issue #13168 function f13168(n) val = 0 @@ -958,13 +921,9 @@ let t = schedule(@task f13168(100)) @test isa(fetch(t), Float64) end -@info "passed 21.1" - # issue #13122 @test remotecall_fetch(identity, workers()[1], C_NULL) === C_NULL -@info "passed 21.2" - # issue #11062 function t11062() @async v11062 = 1 @@ -973,15 +932,11 @@ end @test t11062() == 2 -@info "passed 21.3" - # issue #15406 v15406 = remotecall_wait(() -> 1, id_other) fetch(v15406) remotecall_wait(fetch, id_other, v15406) -@info "passed 21.4" - # issue #43396 # Covers the remote fetch where the value returned is `nothing` # May be caused by attempting to unwrap a non-`Some` type with `something` @@ -990,8 +945,6 @@ remotecall_wait(fetch, id_other, v15406) @test nothing === fetch(remotecall(() -> nothing, workers()[1])) @test 10 === fetch(remotecall(() -> 10, workers()[1])) -@info "passed 21.5" - # Test various forms of remotecall* invocations @everywhere f_args(v1, v2=0; kw1=0, kw2=0) = v1+v2+kw1+kw2 @@ -1014,29 +967,19 @@ for tid in [id_other, id_me, default_worker_pool()] end -@info "passed 21.6.2" - f=Future(id_other) remote_do(fut->put!(fut, myid()), id_other, f) @test fetch(f) == id_other -@info "passed 21.6.1" - # Test remote_do f=Future(id_me) -@info "passed 21.6.1.1" remote_do(fut->put!(fut, myid()), id_me, f) -@info "passed 21.6.1.2" @test fetch(f) == id_me -@info "passed 21.7" - # Github issue #29932 rc_unbuffered = RemoteChannel(()->Channel{Vector{Float64}}(0)) @test eltype(rc_unbuffered) == Vector{Float64} -@info "passed 21.8" - @async begin # Trigger direct write (no buffering) of largish array array_sz = Int(Base.SZ_UNBUFFERED_IO/8) + 1 @@ -1054,8 +997,6 @@ end return :OK end, id_other, rc_unbuffered) 
=== :OK -@info "passed 21.9" - # github issue 33972 rc_unbuffered_other = RemoteChannel(()->Channel{Int}(0), id_other) close(rc_unbuffered_other) @@ -1063,30 +1004,22 @@ try; take!(rc_unbuffered_other); catch; end @test !remotecall_fetch(rc -> islocked(Distributed.lookup_ref(remoteref_id(rc)).synctake), id_other, rc_unbuffered_other) -@info "passed 21.10" - # github PR #14456 n = DoFullTest ? 6 : 5 for i = 1:10^n fetch(@spawnat myid() myid()) end -@info "passed 21.11" - # issue #15451 @test remotecall_fetch(x->(y->2y)(x)+1, workers()[1], 3) == 7 -@info "passed 21.12" - # issue #16091 mutable struct T16091 end wid0 = workers()[1] @test try remotecall_fetch(()->T16091, wid0) - @info "try ..." false catch ex - @info "catch $(((ex::RemoteException).captured::CapturedException).ex) --- $(UndefVarError(:T16091)) --- $(((ex::RemoteException).captured::CapturedException).ex === UndefVarError(:T16091))" ((ex::RemoteException).captured::CapturedException).ex === UndefVarError(:T16091) end @test try @@ -1101,8 +1034,6 @@ remotecall_fetch(()->eval(:(f16091a() = 2)), wid0) @test remotecall_fetch(f16091a, wid0) === 2 @test remotecall_fetch((myid)->remotecall_fetch(f16091a, myid), wid0, myid()) === 1 -@info "passed 21.13" - # these will only heisen-fail, since it depends on the gensym counter collisions: f16091b = () -> 1 remotecall_fetch(()->eval(:(f16091b = () -> 2)), wid0) @@ -1110,8 +1041,6 @@ remotecall_fetch(()->eval(:(f16091b = () -> 2)), wid0) # Global anonymous functions are over-written... @test remotecall_fetch((myid)->remotecall_fetch(f16091b, myid), wid0, myid()) === 1 -@info "passed 21.14" - # ...while local anonymous functions are by definition, local. let f16091c = () -> 1 @@ -1125,8 +1054,6 @@ let end, wid0, myid()) === 2 end -@info "passed 21.15" - # issue #16451 rng=RandomDevice() retval = @distributed (+) for _ in 1:10 @@ -1140,25 +1067,18 @@ retval = @distributed (+) for _ in 1:10 end @test retval > 0.0 && retval < 10.0 -@info "passed 21.16" - # serialization tests wrkr1 = workers()[1] wrkr2 = workers()[end] @test remotecall_fetch(p->remotecall_fetch(myid, p), wrkr1, wrkr2) == wrkr2 -@info "passed 21.17" - # Send f to wrkr1 and wrkr2. Then try calling f on wrkr2 from wrkr1 f_myid = ()->myid() @test wrkr1 == remotecall_fetch(f_myid, wrkr1) @test wrkr2 == remotecall_fetch(f_myid, wrkr2) @test wrkr2 == remotecall_fetch((f, p)->remotecall_fetch(f, p), wrkr1, f_myid, wrkr2) - -@info "passed 22" - # Deserialization error recovery test # locally defined module, but unavailable on workers module LocalFoo @@ -1234,8 +1154,6 @@ let (p, p2) = filter!(p -> p != myid(), procs()) test_throw_on([p2, p], "everywhere on p and p2") end -@info "passed 23" - # Test addprocs enable_threaded_blas parameter function get_remote_num_threads(processes_added) @@ -1292,8 +1210,6 @@ function test_add_procs_threaded_blas() end test_add_procs_threaded_blas() -@info "passed 24" - #19687 if false ### TODO: The logic that is supposed to implement this is racy - Disabled for now # ensure no race conditions between rmprocs and addprocs @@ -1339,8 +1255,6 @@ end # end #end -@info "passed 25" - # Test the following addprocs error conditions # - invalid host name - github issue #20372 # - julia exe exiting with an error @@ -1406,9 +1320,6 @@ for (addp_testf, expected_errstr, env) in testruns end end -@info "passed 26" - - # Auto serialization of globals from Main. 
# bitstypes global v1 = 1 @@ -1476,9 +1387,6 @@ v31252 = :b v31252 = :a @test :a == @fetchfrom id_other v31252 -@info "passed 27" - - # Test that a global is not being repeatedly serialized when # a) referenced multiple times in the closure # b) hash value has not changed. @@ -1575,8 +1483,6 @@ global ids_func = ()->ids_cleanup clust_ser = (Distributed.worker_from_id(id_other)).w_serializer @test remotecall_fetch(ids_func, id_other) == ids_cleanup -@info "passed 29" - # TODO Add test for cleanup from `clust_ser.glbs_in_tnobj` # reported github issues - Mostly tests with globals and various Distributed macros @@ -1626,8 +1532,6 @@ let @test remotecall_fetch(Float64, id_other, 1) == Float64(1) end -@info "passed 30" - #19463 function foo19463() w1 = workers()[1] @@ -1684,8 +1588,6 @@ syms = setup_syms(3, workers()) clear!(syms, workers()) test_clear(syms, workers()) -@info "passed 31" - # Test partial recovery from a deserialization error in CapturedException try expr = quote @@ -1703,8 +1605,6 @@ catch ex @test ex.captured.ex.exceptions[2].ex == UndefVarError(:DontExistOn1) end -@info "passed 32" - let # creates a new worker in a different folder and tries to include file tmp_dir = mktempdir() @@ -1778,8 +1678,6 @@ cluster_cookie("foobar") # custom cookie npids = addprocs_with_testenv(WorkerArgTester(`--worker=foobar`, false)) @test remotecall_fetch(myid, npids[1]) == npids[1] -@info "passed 33" - # tests for start_worker options to retain stdio (issue #31035) struct RetainStdioTester <: ClusterManager close_stdin::Bool @@ -1883,9 +1781,6 @@ for T in (UInt8, Int8, UInt16, Int16, UInt32, Int32, UInt64) @test n == 55 end -@info "passed 34" - - # issue #28966 let code = """ import Distributed @@ -2052,8 +1947,6 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp @test success(cmd) end end -@info "passed 35" - include("splitrange.jl") # Clear all workers for timeout tests (issue #45785) @@ -2078,12 +1971,9 @@ begin end end -@info "passed 36" - # Run topology tests last after removing all workers, since a given # cluster at any time only supports a single topology. nprocs() > 1 && rmprocs(workers()) include("topology.jl") -@info "end test" From f38f11960b53d05b59ff12139fa3d8c7290af479 Mon Sep 17 00:00:00 2001 From: decarvalhojunior-fh Date: Mon, 25 Nov 2024 16:19:45 -0300 Subject: [PATCH 51/54] ... --- src/cluster.jl | 2 +- src/managers.jl | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/cluster.jl b/src/cluster.jl index b5b67f7..54f0062 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -350,7 +350,7 @@ function read_worker_host_port(io::IO) throw(LaunchWorkerError("Unable to read host:port string from worker. 
Launch command exited with error?")) end - @error "conninfo: $conninfo" + #@error "conninfo: $conninfo" ntries -= 1 bind_addr, port = parse_connection_info(conninfo) diff --git a/src/managers.jl b/src/managers.jl index 452b2fb..9458ff4 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -610,7 +610,6 @@ function connect(manager::ClusterManager, pid::Int, config::WorkerConfig) end tunnel = something(config.tunnel, false) - @info "CONNECT TUNNEL=$tunnel" s = split(pubhost,'@') user = "" @@ -715,7 +714,7 @@ function connect_to_worker(host::AbstractString, port::Integer) iptype = typeof(bind_addr) - @info "connect_to_worker: $host $port $bind_addr $iptype" + #@info "connect_to_worker: $host $port $bind_addr $iptype" sock = socket_reuse_port(iptype) connect(sock, bind_addr, UInt16(port)) From 0c450de5508f9113aeecd1476943b7ed2e76187e Mon Sep 17 00:00:00 2001 From: decarvalhojunior-fh Date: Fri, 31 Jan 2025 17:35:44 -0300 Subject: [PATCH 52/54] master connection through pubhost to allow tunnel=false --- src/cluster.jl | 2 +- src/managers.jl | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/cluster.jl b/src/cluster.jl index 54f0062..919d335 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -350,7 +350,7 @@ function read_worker_host_port(io::IO) throw(LaunchWorkerError("Unable to read host:port string from worker. Launch command exited with error?")) end - #@error "conninfo: $conninfo" + @info "conninfo: $conninfo" ntries -= 1 bind_addr, port = parse_connection_info(conninfo) diff --git a/src/managers.jl b/src/managers.jl index 9458ff4..86431a2 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -232,7 +232,7 @@ function parse_machine(machine::AbstractString) end function launch_on_machine(manager::SSHManager, machine::AbstractString, cnt, params::Dict, launched::Array, launch_ntfy::Condition) - #@info "launch_on_machine" + shell = params[:shell] ssh = params[:ssh] dir = params[:dir] @@ -598,8 +598,9 @@ function connect(manager::ClusterManager, pid::Int, config::WorkerConfig) # master connecting to workers if config.io !== nothing (bind_addr, port::Int) = read_worker_host_port(config.io) - #@info "CONNECT W2 $bind_addr $port" + # @info "CONNECT W2 $bind_addr $port $(config.host) $(config.bind_addr)" pubhost = something(config.host, bind_addr) + # @info "CONNECT W21 $pubhost" config.host = pubhost config.port = port else @@ -641,7 +642,7 @@ function connect(manager::ClusterManager, pid::Int, config::WorkerConfig) release(sem) end else - (s, bind_addr) = connect_to_worker(bind_addr, port) + (s, bind_addr) = connect_to_worker(#=bind_addr=# pubhost, port) end config.bind_addr = bind_addr @@ -703,6 +704,9 @@ function bind_client_port(sock::TCPSocket, iptype) end function connect_to_worker(host::AbstractString, port::Integer) + +# @info "--------- CONNECT TO WORKER $host $port" + # Avoid calling getaddrinfo if possible - involves a DNS lookup # host may be a stringified ipv4 / ipv6 address or a dns name bind_addr = nothing @@ -714,7 +718,6 @@ function connect_to_worker(host::AbstractString, port::Integer) iptype = typeof(bind_addr) - #@info "connect_to_worker: $host $port $bind_addr $iptype" sock = socket_reuse_port(iptype) connect(sock, bind_addr, UInt16(port)) @@ -723,6 +726,9 @@ end function connect_to_worker_with_tunnel(host::AbstractString, bind_addr::AbstractString, port::Integer, tunnel_user::AbstractString, sshflags, multiplex) + + # @info "++++++++ CONNECT TO WORKER WITH TUNNEL host=$host port=$port bind_addr=$bind_addr tunnel_user=$tunnel_user 
sshflags=$sshflags multiplex=$multiplex" + localport = ssh_tunnel(tunnel_user, host, bind_addr, UInt16(port), sshflags, multiplex) s = connect("localhost", localport) forward = "$localport:$bind_addr:$port" From 2a8df42046a6e2009dab12b5bf87e915068fa909 Mon Sep 17 00:00:00 2001 From: decarvalhojunior-fh Date: Sun, 2 Feb 2025 01:00:56 -0300 Subject: [PATCH 53/54] cummulative commits until 2025-Jan-30 --- .github/workflows/ci.yml | 43 +++++----- .gitignore | 2 + Project.toml | 12 ++- README.md | 51 +++++++++-- src/Distributed.jl | 1 + src/cluster.jl | 109 +++++++++++++----------- src/clusterserialize.jl | 8 +- src/macros.jl | 11 ++- src/managers.jl | 16 +++- src/messages.jl | 2 +- src/process_messages.jl | 2 +- src/remotecall.jl | 9 +- src/workerpool.jl | 49 +++++++++++ test/aqua.jl | 8 ++ test/distributed_exec.jl | 180 ++++++++++++++++++++++++--------------- test/runtests.jl | 15 +++- test/threads.jl | 55 ++++++++++++ 17 files changed, 406 insertions(+), 167 deletions(-) create mode 100644 .gitignore create mode 100644 test/aqua.jl create mode 100644 test/threads.jl diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a6a1e35..caadcc0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,23 +14,28 @@ concurrency: # Cancel intermediate builds: only pull request builds group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref != 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release-') || github.run_number }} cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} + jobs: test: - name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} + name: julia -t${{ matrix.threads}} - ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} runs-on: ${{ matrix.os }} + timeout-minutes: 30 strategy: fail-fast: false matrix: - version: - - 'nightly' - os: - - ubuntu-latest - - macOS-latest - - windows-latest - arch: - - x64 - - x86 + threads: + # - '1' + - '4,4' + version: [nightly] + os: [ubuntu-latest, windows-latest, macOS-latest] + arch: [x64, x86, aarch64] exclude: + - os: ubuntu-latest + arch: aarch64 + - os: windows-latest + arch: aarch64 + - os: macOS-latest + arch: x64 - os: macOS-latest arch: x86 steps: @@ -39,25 +44,17 @@ jobs: with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - - uses: actions/cache@v4 - env: - cache-name: cache-artifacts - with: - path: ~/.julia/artifacts - key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }} - restore-keys: | - ${{ runner.os }}-test-${{ env.cache-name }}- - ${{ runner.os }}-test-${{ matrix.os }} - ${{ runner.os }}- - - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/cache@v2 - uses: julia-actions/julia-runtest@v1 env: JULIA_DISTRIBUTED_TESTING_STANDALONE: 1 + JULIA_NUM_THREADS: '${{ matrix.threads}}' - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v4 + - uses: codecov/codecov-action@v5 with: - file: lcov.info + files: lcov.info token: ${{ secrets.CODECOV_TOKEN }} + docs: runs-on: ubuntu-latest steps: diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..df02284 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +Manifest.toml +*.swp diff --git a/Project.toml b/Project.toml index bb30760..382b1ab 100644 --- a/Project.toml +++ b/Project.toml @@ -8,8 +8,18 @@ Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" Sockets = "6462fe0b-24de-5631-8697-dd941f90decc" [extras] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" 
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["LinearAlgebra", "Test"] +test = ["Aqua", "LinearAlgebra", "Test"] + +[compat] +Aqua = "0.8.10" +LinearAlgebra = "1" +Random = "1" +Serialization = "1" +Sockets = "1" +Test = "1" +julia = "1" diff --git a/README.md b/README.md index f66be89..845347a 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,14 @@ # Distributed (with a multiscale parallelism extension) -The `Distributed` package provides functionality for creating and controlling multiple Julia processes remotely, and for performing distributed and parallel computing. It uses network sockets or other supported interfaces to communicate between Julia processes, and relies on Julia's `Serialization` stdlib package to transform Julia objects into a format that can be transferred between processes efficiently. It provides a full set of utilities to create and destroy new Julia processes and add them to a "cluster" (a collection of Julia processes connected together), as well as functions to perform Remote Procedure Calls (RPC) between the processes within a cluster. See the `API` section for details. - +The `Distributed` package provides functionality for creating and controlling +multiple Julia processes remotely, and for performing distributed and parallel +computing. It uses network sockets or other supported interfaces to communicate +between Julia processes, and relies on Julia's `Serialization` stdlib package to +transform Julia objects into a format that can be transferred between processes +efficiently. It provides a full set of utilities to create and destroy new Julia +processes and add them to a "cluster" (a collection of Julia processes connected +together), as well as functions to perform Remote Procedure Calls (RPC) between +the processes within a cluster. See the `API` section for details. This package ships as part of the Julia stdlib. > [!NOTE] @@ -23,9 +30,28 @@ To use a newer version of this package, you need to build Julia from scratch. Th It's also possible to load a development version of the package using [the trick used in the Section named "Using the development version of Pkg.jl" in the `Pkg.jl` repo](https://github.com/JuliaLang/Pkg.jl#using-the-development-version-of-pkgjl), but the capabilities are limited as all other packages will depend on the stdlib version of the package and will not work with the modified package. +### On Julia 1.11+ +In Julia 1.11 Distributed was excised from the default system image and became +more of an independent package. As such, to use a different version it's enough +to just `dev` it explicitly: +```julia-repl +pkg> dev https://github.com/JuliaLang/Distributed.jl.git +``` +### On older Julia versions +To use a newer version of this package on older Julia versions, you need to build +Julia from scratch. The build process is the same as any other build except that +you need to change the commit used in `stdlib/Distributed.version`. +It's also possible to load a development version of the package using [the trick +used in the Section named "Using the development version of Pkg.jl" in the +`Pkg.jl` +repo](https://github.com/JuliaLang/Pkg.jl#using-the-development-version-of-pkgjl), +but the capabilities are limited as all other packages will depend on the stdlib +version of the package and will not work with the modified package. 
+ ## API -The public API of `Distributed` consists of a variety of functions for various tasks; for creating and destroying processes within a cluster: +The public API of `Distributed` consists of a variety of functions for various +tasks; for creating and destroying processes within a cluster: - `addprocs` - create one or more Julia processes and connect them to the cluster - `rmprocs` - shutdown and remove one or more Julia processes from the cluster @@ -33,7 +59,9 @@ The public API of `Distributed` consists of a variety of functions for various t For controlling other processes via RPC: - `remotecall` - call a function on another process and return a `Future` referencing the result of that call -- `Future` - an object that references the result of a `remotecall` that hasn't yet completed - use `fetch` to return the call's result, or `wait` to just wait for the remote call to finish +- `Future` - an object that references the result of a `remotecall` that hasn't + yet completed - use `fetch` to return the call's result, or `wait` to just + wait for the remote call to finish. - `remotecall_fetch` - the same as `fetch(remotecall(...))` - `remotecall_wait` - the same as `wait(remotecall(...))` - `remote_do` - like `remotecall`, but does not provide a way to access the result of the call @@ -62,6 +90,15 @@ For controlling multiple processes at once: ### Process Identifiers -Julia processes connected with `Distributed` are all assigned a cluster-unique `Int` identifier, starting from `1`. The first Julia process within a cluster is given ID `1`, while other processes added via `addprocs` get incrementing IDs (`2`, `3`, etc.). Functions and macros which communicate from one process to another usually take one or more identifiers to determine which process they target - for example, `remotecall_fetch(myid, 2)` calls `myid()` on process 2. - -**Note:** Only process 1 (often called the "head", "primary", or "master") may add or remove processes, and manages the rest of the cluster. Other processes (called "workers" or "worker processes") may still call functions on each other and send and receive data, but `addprocs`/`rmprocs` on worker processes will fail with an error. +Julia processes connected with `Distributed` are all assigned a cluster-unique +`Int` identifier, starting from `1`. The first Julia process within a cluster is +given ID `1`, while other processes added via `addprocs` get incrementing IDs +(`2`, `3`, etc.). Functions and macros which communicate from one process to +another usually take one or more identifiers to determine which process they +target - for example, `remotecall_fetch(myid, 2)` calls `myid()` on process 2. + +**Note:** Only process 1 (often called the "head", "primary", or "master") may +add or remove processes, and manages the rest of the cluster. Other processes +(called "workers" or "worker processes") may still call functions on each other +and send and receive data, but `addprocs`/`rmprocs` on worker processes will +fail with an error. \ No newline at end of file diff --git a/src/Distributed.jl b/src/Distributed.jl index 9983d9b..4a44266 100644 --- a/src/Distributed.jl +++ b/src/Distributed.jl @@ -49,6 +49,7 @@ export procs, remote, remotecall, + remotecall_eval, remotecall_fetch, remotecall_wait, remote_do, diff --git a/src/cluster.jl b/src/cluster.jl index 919d335..53dca5d 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -99,10 +99,10 @@ mutable struct Worker del_msgs::Array{Any,1} # XXX: Could del_msgs and add_msgs be Channels? 
add_msgs::Array{Any,1} @atomic gcflag::Bool - state::WorkerState - c_state::Condition # wait for state changes - ct_time::Float64 # creation time - conn_func::Any # used to setup connections lazily + @atomic state::WorkerState + c_state::Threads.Condition # wait for state changes, lock for state + ct_time::Float64 # creation time + conn_func::Any # used to setup connections lazily r_stream::IO w_stream::IO @@ -135,7 +135,7 @@ mutable struct Worker if haskey(map_pid_wrkr, id) return map_pid_wrkr[id] end - w=new(id, Threads.ReentrantLock(), [], [], false, W_CREATED, Condition(), time(), conn_func) + w=new(id, Threads.ReentrantLock(), [], [], false, W_CREATED, Threads.Condition(), time(), conn_func) w.initialized = Event() register_worker(w; role = role) w @@ -147,12 +147,14 @@ end wid(w::Worker; role= :default) = w.id function set_worker_state(w, state) - w.state = state - notify(w.c_state; all=true) + lock(w.c_state) do + @atomic w.state = state + notify(w.c_state; all=true) + end end function check_worker_state(w::Worker; role= :default) - if w.state === W_CREATED + if (@atomic w.state) === W_CREATED if !isclusterlazy(role = role) pg = PGRP(role = role) if pg.topology === :all_to_all @@ -174,6 +176,7 @@ function check_worker_state(w::Worker; role= :default) wait_for_conn(w; role=role) end end + return nothing end exec_conn_func(id::Int; role= :default) = exec_conn_func(worker_from_id(id; role = role)::Worker; role = role) @@ -191,13 +194,15 @@ function exec_conn_func(w::Worker; role= :default) end function wait_for_conn(w; role=:defaut) - if w.state === W_CREATED + if (@atomic w.state) === W_CREATED timeout = worker_timeout() - (time() - w.ct_time) timeout <= 0 && error("peer $(wid(w, role=role)) has not connected to $(myid(role=role))") - @async (sleep(timeout); notify(w.c_state; all=true)) - wait(w.c_state) - w.state === W_CREATED && error("peer $(wid(w, role=role)) didn't connect to $(myid(role=role)) within $timeout seconds") + if timedwait(() -> (@atomic w.state) === W_CONNECTED, timeout) === :timed_out + # Notify any waiters on the state and throw + @lock w.c_state notify(w.c_state) + error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds") + end end nothing end @@ -515,19 +520,24 @@ function addprocs_locked(manager::ClusterManager; kwargs...) # call manager's `launch` is a separate task. This allows the master # process initiate the connection setup process as and when workers come # online + # NOTE: Must be `@async`. See FIXME above t_launch = @async launch(manager, params, launched, launch_ntfy) @sync begin while true if isempty(launched) istaskdone(t_launch) && break - @async (sleep(1); notify(launch_ntfy)) + @async begin # NOTE: Must be `@async`. See FIXME above + sleep(1) + notify(launch_ntfy) + end wait(launch_ntfy) end if !isempty(launched) wconfig = popfirst!(launched) let wconfig=wconfig + # NOTE: Must be `@async`. 
See FIXME above @async setup_launched_worker(manager, wconfig, launched_q) end end @@ -678,7 +688,12 @@ function create_worker(manager, wconfig) # require the value of config.connect_at which is set only upon connection completion for jw in pgm.workers if (wid(jw, role=role) != 1) && (wid(jw, role=role) < wid(w, role=role)) - (jw.state === W_CREATED) && wait(jw.c_state) + # wait for wl to join + if (@atomic jw.state) === W_CREATED + lock(jw.c_state) do + wait(jw.c_state) + end + end push!(join_list, jw) end end @@ -701,7 +716,12 @@ function create_worker(manager, wconfig) end for wl in wlist - (wl.state === W_CREATED) && wait(wl.c_state) + lock(wl.c_state) do + if (@atomic wl.state) === W_CREATED + # wait for wl to join + wait(wl.c_state) + end + end push!(join_list, wl) end end @@ -716,11 +736,9 @@ function create_worker(manager, wconfig) send_msg_now(w, MsgHeader(RRID(0,0), ntfy_oid), join_message; role = role) @async manage(w.manager, wid(w, role=role), w.config, :register) + # wait for rr_ntfy_join with timeout - timedout = false - @async (sleep($timeout); timedout = true; put!(rr_ntfy_join, 1)) - wait(rr_ntfy_join) - if timedout + if timedwait(() -> isready(rr_ntfy_join), timeout) === :timed_out error("worker did not connect within $timeout seconds") end lock(client_refs) do @@ -762,24 +780,21 @@ function redirect_output_from_additional_worker(pid, port) end function check_master_connect() - timeout = worker_timeout() * 1e9 # If we do not have at least process 1 connect to us within timeout # we log an error and exit, unless we're running on valgrind if ccall(:jl_running_on_valgrind,Cint,()) != 0 return end - @async begin - map_pid_wrkr = Map_pid_wrkr(role = :worker) - start = time_ns() - while !haskey(map_pid_wrkr, 1) && (time_ns() - start) < timeout - sleep(1.0) - end - - if !haskey(map_pid_wrkr, 1) - print(stderr, "Master process (id 1) could not connect within $(timeout/1e9) seconds.\nexiting.\n") - exit(1) - end + errormonitor( + @async begin + map_pid_wrkr = Map_pid_wrkr(role = :worker) + timeout = worker_timeout() + if timedwait(() -> haskey(map_pid_wrkr, 1), timeout) === :timed_out + print(stderr, "Master process (id 1) could not connect within $(timeout) seconds.\nexiting.\n") + exit(1) + end end + ) end @@ -985,7 +1000,7 @@ function nprocs(; role= :default) n = length(pg.workers) # filter out workers in the process of being setup/shutdown. for jw in pg.workers - if !isa(jw, LocalProcess) && (jw.state !== W_CONNECTED) + if !isa(jw, LocalProcess) && ((@atomic jw.state) !== W_CONNECTED) n = n - 1 end end @@ -1037,7 +1052,7 @@ function procs(; role= :default) pg = PGRP(role = role) if myid(role=role) == 1 || (pg.topology === :all_to_all && !isclusterlazy(role = role)) # filter out workers in the process of being setup/shutdown. 
- return Int[wid(x, role=role) for x in pg.workers if isa(x, LocalProcess) || (x.state === W_CONNECTED)] + return Int[wid(x, role=role) for x in pg.workers if isa(x, LocalProcess) || ((@atomic x.state) === W_CONNECTED)] else return Int[wid(x, role=role) for x in pg.workers] end @@ -1047,7 +1062,7 @@ function id_in_procs(id0; role= :default) # faster version of `id in procs()` pg = PGRP(role = role) if myid(role=role) == 1 || (pg.topology === :all_to_all && !isclusterlazy(role = role)) for x in pg.workers - if (wid(x, role=role)::Int) == id0 && (isa(x, LocalProcess) || (x::Worker).state === W_CONNECTED) + if (wid(x, role=role)::Int) == id0 && (isa(x, LocalProcess) || (@atomic (x::Worker).state) === W_CONNECTED) return true end end @@ -1070,7 +1085,7 @@ Specifically all workers bound to the same ip-address as `pid` are returned. function procs(pid::Integer; role= :default) if myid(role = role) == 1 map_pid_wrkr = Map_pid_wrkr(role = role) - all_workers = [x for x in PGRP(role = role).workers if isa(x, LocalProcess) || (x.state === W_CONNECTED)] + all_workers = [x for x in PGRP(role = role).workers if isa(x, LocalProcess) || ((@atomic x.state) === W_CONNECTED)] if (pid == 1) || (isa(map_pid_wrkr[pid].manager, LocalManager)) Int[wid(x, role=role) for x in filter(w -> (wid(w, role=role)==1) || (isa(w.manager, LocalManager)), all_workers)] else @@ -1178,11 +1193,11 @@ function _rmprocs(pids, role, waitfor) start = time_ns() while (time_ns() - start) < waitfor*1e9 - all(w -> w.state === W_TERMINATED, rmprocset) && break + all(w -> (@atomic w.state) === W_TERMINATED, rmprocset) && break sleep(min(0.1, waitfor - (time_ns() - start)/1e9)) end - unremoved = [wid(wrkr, role=role) for wrkr in filter(w -> w.state !== W_TERMINATED, rmprocset)] + unremoved = [wid(wrkr, role=role) for wrkr in filter(w -> (@atomic w.state) !== W_TERMINATED, rmprocset)] if length(unremoved) > 0 estr = string("rmprocs: pids ", unremoved, " not terminated after ", waitfor, " seconds.") throw(ErrorException(estr)) @@ -1482,18 +1497,16 @@ end using Random: randstring -let inited = false - # do initialization that's only needed when there is more than 1 processor - global function init_multi() - if !inited - inited = true - push!(Base.package_callbacks, _require_callback) - atexit(() -> terminate_all_workers(role = :master)) # TO CHECK (role argument ???) - init_bind_addr() - cluster_cookie(randstring(HDR_COOKIE_LEN)) - end - return nothing +# do initialization that's only needed when there is more than 1 processor +const inited = Threads.Atomic{Bool}(false) +function init_multi() + if !Threads.atomic_cas!(inited, false, true) + push!(Base.package_callbacks, _require_callback) + atexit(terminate_all_workers) + init_bind_addr() + cluster_cookie(randstring(HDR_COOKIE_LEN)) end + return nothing end function init_parallel() diff --git a/src/clusterserialize.jl b/src/clusterserialize.jl index 589ebe1..bdd82b8 100644 --- a/src/clusterserialize.jl +++ b/src/clusterserialize.jl @@ -169,8 +169,14 @@ function deserialize_global_from_main(s::ClusterSerializer, sym) end Core.eval(Main, Expr(:global, sym)) if sym_isconst - ccall(:jl_set_const, Cvoid, (Any, Any, Any), Main, sym, v) + # Note that the post-lowering const form is not allowed in value + # position, so there needs to be a dummy `nothing` argument to drop the + # return value. 
+ Core.eval(Main, Expr(:block, + Expr(:const, GlobalRef(Main, sym), v), + nothing)) else + Core.eval(Main, Expr(:global, sym)) invokelatest(setglobal!, Main, sym, v) end return nothing diff --git a/src/macros.jl b/src/macros.jl index ade5911..7a85cd2 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -77,8 +77,11 @@ end Create a closure around an expression and run the closure asynchronously on process `p`. Return a [`Future`](@ref) to the result. + If `p` is the quoted literal symbol `:any`, then the system will pick a -processor to use automatically. +processor to use automatically. Using `:any` will not apply any form of +load-balancing, consider using a [`WorkerPool`](@ref) and [`remotecall(f, +::WorkerPool)`](@ref) if you need load-balancing. # Examples ```julia-repl @@ -235,9 +238,9 @@ processes to have execute the expression. Similar to calling `remotecall_eval(Main, procs, expr)`, but with two extra features: - - `using` and `import` statements run on the calling process first, to ensure - packages are precompiled. - - The current source file path used by `include` is propagated to other processes. +- `using` and `import` statements run on the calling process first, to ensure + packages are precompiled. +- The current source file path used by `include` is propagated to other processes. """ function check_args_3b(args...) diff --git a/src/managers.jl b/src/managers.jl index 86431a2..739f13d 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -113,7 +113,7 @@ addprocs([ * `exeflags`: additional flags passed to the worker processes. It can either be a `Cmd`, a `String` holding one flag, or a collection of strings, with one element per flag. - E.g. `\`--threads=auto project=.\``, `"--compile-trace=stderr"` or `["--threads=auto", "--compile=all"]`. + E.g. `\`--threads=auto project=.\``, `"--compile-trace=stderr"` or `["--threads=auto", "--compile=all"]`. * `topology`: Specifies how the workers connect to each other. Sending a message between unconnected workers results in an error. @@ -180,7 +180,7 @@ function launch(manager::SSHManager, params::Dict, launched::Array, launch_ntfy: # Wait for all launches to complete. @sync for (i, (machine, cnt)) in enumerate(manager.machines) let machine=machine, cnt=cnt - @async try + @async try launch_on_machine(manager, $machine, $cnt, params, launched, launch_ntfy) catch e print(stderr, "exception launching on machine $(machine) : $(e)\n") @@ -768,7 +768,8 @@ function kill(manager::SSHManager, pid::Int, config::WorkerConfig) nothing end -function kill(manager::LocalManager, pid::Int, config::WorkerConfig; exit_timeout = 15, term_timeout = 15) +function kill(manager::LocalManager, pid::Int, config::WorkerConfig; profile_wait = 6, exit_timeout = 15, term_timeout = 15) + # profile_wait = 6 is 1s for profile, 5s for the report to show # First, try sending `exit()` to the remote over the usual control channels remote_do(exit, pid; role = :master) @@ -777,7 +778,14 @@ function kill(manager::LocalManager, pid::Int, config::WorkerConfig; exit_timeou # Check to see if our child exited, and if not, send an actual kill signal if !process_exited(config.process) - @warn("Failed to gracefully kill worker $(pid), sending SIGQUIT") + @warn "Failed to gracefully kill worker $(pid)" + profile_sig = Sys.iswindows() ? nothing : Sys.isbsd() ? 
("SIGINFO", 29) : ("SIGUSR1" , 10) + if profile_sig !== nothing + @warn("Sending profile $(profile_sig[1]) to worker $(pid)") + kill(config.process, profile_sig[2]) + sleep(profile_wait) + end + @warn("Sending SIGQUIT to worker $(pid)") kill(config.process, Base.SIGQUIT) sleep(term_timeout) diff --git a/src/messages.jl b/src/messages.jl index 4f3f472..92afe8b 100644 --- a/src/messages.jl +++ b/src/messages.jl @@ -194,7 +194,7 @@ end function flush_gc_msgs(; role= :default) try for w in (PGRP(role = role)::ProcessGroup).workers - if isa(w,Worker) && (w.state == W_CONNECTED) && w.gcflag + if isa(w,Worker) && ((@atomic w.state) == W_CONNECTED) && w.gcflag flush_gc_msgs(w; role = role) end end diff --git a/src/process_messages.jl b/src/process_messages.jl index 2f190e2..b21d3ea 100644 --- a/src/process_messages.jl +++ b/src/process_messages.jl @@ -223,7 +223,7 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool; role= println(stderr, "Process($(myid(role=role))) - Unknown remote, closing connection.") elseif !(wpid in map_del_wrkr) werr = worker_from_id(wpid) - oldstate = werr.state + oldstate = @atomic werr.state set_worker_state(werr, W_TERMINATED) # If unhandleable error occurred talking to pid 1, exit diff --git a/src/remotecall.jl b/src/remotecall.jl index 4d1fedc..38ca131 100644 --- a/src/remotecall.jl +++ b/src/remotecall.jl @@ -275,7 +275,7 @@ end const any_gc_flag = Threads.Condition() function start_gc_msgs_task(; role= :default) errormonitor( - Threads.@spawn begin + @async begin while true lock(any_gc_flag) do # this might miss events @@ -323,7 +323,7 @@ function process_worker(rr; role= :default) msg = (remoteref_id(rr), myid(role = role)) # Needs to acquire a lock on the del_msg queue - T = Threads.@spawn begin + T = @async begin publish_del_msg!($w, $msg) end Base.errormonitor(T) @@ -451,7 +451,7 @@ remotecall(f, id::Integer, args...; role= :default, kwargs...) = remotecall(f, worker_from_id(id; role = role), args...; role = role, kwargs...) function remotecall_fetch(f, w::LocalProcess, args...; role= :default, kwargs...) - v=run_work_thunk(local_remotecall_thunk(f,args, kwargs), false; role = role) + v=run_work_thunk(local_remotecall_thunk(f, args, kwargs), false; role = role) return isa(v, RemoteException) ? throw(v) : v end @@ -816,6 +816,9 @@ close(rr::RemoteChannel; role= :default) = call_on_owner(close_ref, rr; role = r isopen_ref(rid; role= :default) = isopen(lookup_ref(rid; role = role).c) isopen(rr::RemoteChannel; role= :default) = call_on_owner(isopen_ref, rr; role = role) +isempty_ref(rid; role= :default) = isempty(lookup_ref(rid; role = role).c) +Base.isempty(rr::RemoteChannel; role= :default) = call_on_owner(isempty_ref, rr; role=role) + getindex(r::RemoteChannel; role= :default) = fetch(r; role = role) getindex(r::Future; role= :default) = fetch(r; role = role) diff --git a/src/workerpool.jl b/src/workerpool.jl index e166f5d..bb66245 100644 --- a/src/workerpool.jl +++ b/src/workerpool.jl @@ -135,6 +135,27 @@ function remotecall_pool(rc_f, f, pool::AbstractWorkerPool, args...; role= :defa end end +# Specialization for remotecall. We have to wait for the Future it returns +# before putting the worker back in the pool. +function remotecall_pool(rc_f::typeof(remotecall), f, pool::AbstractWorkerPool, args...; kwargs...) + worker = take!(pool) + local x + try + x = rc_f(f, worker, args...; kwargs...) 
+ catch + put!(pool, worker) + rethrow() + end + t = Threads.@spawn Threads.threadpool() try + wait(x) + catch # just wait, ignore errors here + finally + put!(pool, worker) + end + errormonitor(t) + return x +end + # Check if pool is local or remote and forward calls if required. # NOTE: remotecall_fetch does it automatically, but this will be more efficient as # it avoids the overhead associated with a local remotecall. @@ -246,6 +267,10 @@ remotecall_fetch(f, pool::AbstractWorkerPool, args...; kwargs...) = remotecall_p [`WorkerPool`](@ref) variant of `remote_do(f, pid, ....)`. Wait for and take a free worker from `pool` and perform a `remote_do` on it. + +Note that it's not possible to wait for the result of a `remote_do()` to finish +so the worker will immediately be put back in the pool (i.e. potentially causing +oversubscription). """ #remote_do(f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) = remotecall_pool((f,pool) -> remote_do(f, pool, role = role, args...; kwargs...); role = role) remote_do(f, pool::AbstractWorkerPool, args...; kwargs...) = remotecall_pool(remote_do, f, pool, args...; kwargs...) @@ -379,3 +404,27 @@ function remotecall_pool(rc_f, f, pool::CachingPool, args...; role= :default, kw put!(pool, worker; role=role) end end + + +# Specialization for remotecall. We have to wait for the Future it returns +# before putting the worker back in the pool. +function remotecall_pool(rc_f::typeof(remotecall), f, pool::CachingPool, args...; role= :default, kwargs...) + worker = take!(pool; role=role) + f_ref = get(pool.map_obj2ref, (worker, f), (f, RemoteChannel(worker; role=role))) + isa(f_ref, Tuple) && (pool.map_obj2ref[(worker, f)] = f_ref[2]) # Add to tracker + local x + try + x = rc_f(exec_from_cache, worker, f_ref, args...; role=role, kwargs...) + catch + put!(pool, worker; role=role) + rethrow() + end + t = Threads.@spawn Threads.threadpool() try + wait(x) + catch # just wait, ignore errors here + finally + put!(pool, worker; role=role) + end + errormonitor(t) + return x +end \ No newline at end of file diff --git a/test/aqua.jl b/test/aqua.jl new file mode 100644 index 0000000..56c01c5 --- /dev/null +++ b/test/aqua.jl @@ -0,0 +1,8 @@ +using Aqua +using Distributed +Aqua.test_all( + Distributed, + # This should be excluded, but it's not clear how to do that on Aqua's API + # given it's not-defined. (The Julia Base ambiguity test does it something like this) + # ambiguities=(exclude=[GlobalRef(Distributed, :cluster_manager)]) +) \ No newline at end of file diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl index 523b4bd..63a00cc 100644 --- a/test/distributed_exec.jl +++ b/test/distributed_exec.jl @@ -3,6 +3,8 @@ using Test, Distributed, Random, Serialization, Sockets import Distributed: launch, manage +pathsep = Sys.iswindows() ? 
";" : ":" + @test cluster_cookie() isa String include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "testenv.jl")) @@ -171,27 +173,6 @@ function poll_while(f::Function; timeout_seconds::Integer = 120) return true end -function _getenv_include_thread_unsafe() - environment_variable_name = "JULIA_TEST_INCLUDE_THREAD_UNSAFE" - default_value = "false" - environment_variable_value = strip(get(ENV, environment_variable_name, default_value)) - b = parse(Bool, environment_variable_value)::Bool - return b -end -const _env_include_thread_unsafe = _getenv_include_thread_unsafe() -function include_thread_unsafe_tests() - if Threads.maxthreadid() > 1 - if _env_include_thread_unsafe - return true - end - msg = "Skipping a thread-unsafe test because `Threads.maxthreadid() > 1`" - @warn msg Threads.maxthreadid() - Test.@test_broken false - return false - end - return true -end - # Distributed GC tests for Futures function test_futures_dgc(id) f = remotecall(myid, id) @@ -248,28 +229,43 @@ put!(f, :OK) @test fetch(f) === :OK # RemoteException should be thrown on a put! when another process has set the value -f = Future(wid1) -fid = remoteref_id(f) - -fstore = RemoteChannel(wid2) -put!(fstore, f) # send f to wid2 -put!(f, :OK) # set value from master - -@test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == true - -testval = remotecall_fetch(wid2, fstore) do x - try - put!(fetch(x), :OK) - return 0 - catch e - if isa(e, RemoteException) - return 1 - else - return 2 +# Test this multiple times as races have been seen where `@spawn` was used over +# `@async`. Issue #124 +max_attempts = 100 +for i in 1:max_attempts + let f = Future(wid1), fid = remoteref_id(f), fstore = RemoteChannel(wid2) + # RemoteException should be thrown on a put! when another process has set the value + + put!(fstore, f) # send f to wid2 + put!(f, :OK) # set value from master + + @test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == true + + # fstore should be ready immediately, but races due to use of `@spawn` have caused + # this to fail in the past. So we poll for readiness before the main test after this + # which internally checks for `isready` to decide whether to error or not + w = remotecall_fetch(wid2, fstore) do x + timedwait(() -> isready(fetch(x)), 10) + end + w == :ok || @info "isready timed out on attempt $i (max $max_attempts)" + @test w == :ok + # This is the actual test. It should fail because the value is already set remotely + testval = remotecall_fetch(wid2, fstore) do x + try + put!(fetch(x), :OK) + return 0 + catch e + if isa(e, RemoteException) + return 1 + else + rethrow() + end + end end + testval == 1 || @info "test failed on attempt $i (max $max_attempts)" + @test testval == 1 end end -@test testval == 1 # Issue number #25847 @everywhere function f25847(ref) @@ -315,14 +311,16 @@ let wid1 = workers()[1], fstore = RemoteChannel(wid2) put!(fstore, rr) - if include_thread_unsafe_tests() - @test remotecall_fetch(k -> haskey(Distributed.PGRP().refs, k), wid1, rrid) == true - end + + # timedwait() is necessary because wid1 is asynchronously informed of + # the existence of rr/rrid through the call to `put!(fstore, rr)`. 
+    @test timedwait(() -> remotecall_fetch(k -> haskey(Distributed.PGRP().refs, k), wid1, rrid), 10) === :ok
+
     finalize(rr) # finalize locally
     yield() # flush gc msgs
-    if include_thread_unsafe_tests()
-        @test remotecall_fetch(k -> haskey(Distributed.PGRP().refs, k), wid1, rrid) == true
-    end
+
+    @test timedwait(() -> remotecall_fetch(k -> haskey(Distributed.PGRP().refs, k), wid1, rrid), 10) === :ok
+
     remotecall_fetch(r -> (finalize(take!(r)); yield(); nothing), wid2, fstore) # finalize remotely
     sleep(0.5) # to ensure that wid2 messages have been executed on wid1
     @test poll_while(() -> remotecall_fetch(k -> haskey(Distributed.PGRP().refs, k), wid1, rrid))
@@ -517,6 +515,17 @@ let ch = RemoteChannel(() -> Channel(1))
     @test 10 == test_iteration_collect(ch)
 end

+# Test isempty(::RemoteChannel). This should not modify the underlying
+# AbstractChannel, as Base's default implementation would.
+let
+    chan = Channel(1)
+    push!(chan, 1)
+    remotechan = RemoteChannel(() -> chan)
+    @test !isempty(remotechan)
+    # Calling `isempty(remotechan)` shouldn't have modified `chan`
+    @test !isempty(chan)
+end
+
 # make sure exceptions propagate when waiting on Tasks
 @test_throws CompositeException (@sync (@async error("oops")))
 try
@@ -727,6 +736,8 @@ wp = WorkerPool(workers())
 @test nworkers() == length(unique(remotecall_fetch(wp->pmap(_->myid(), wp, 1:100), id_other, wp)))
 wp = WorkerPool(2:3)
 @test sort(unique(pmap(_->myid(), wp, 1:100))) == [2,3]
+@test fetch(remotecall(myid, wp)) in wp.workers
+@test_throws RemoteException fetch(remotecall(error, wp))

 # wait on worker pool
 wp = WorkerPool(2:2)
@@ -753,6 +764,8 @@ status = timedwait(() -> isready(f), 10)
 # CachingPool tests
 wp = CachingPool(workers())
 @test [1:100...] == pmap(x->x, wp, 1:100)
+@test fetch(remotecall(myid, wp)) in wp.workers
+@test_throws RemoteException fetch(remotecall(error, wp))
 clear!(wp)
 @test length(wp.map_obj2ref) == 0
@@ -1020,13 +1033,14 @@ wid0 = workers()[1]
     remotecall_fetch(()->T16091, wid0)
     false
 catch ex
-    ((ex::RemoteException).captured::CapturedException).ex === UndefVarError(:T16091)
+    ((ex::RemoteException).captured::CapturedException).ex === UndefVarError(:T16091, Main)
 end
 @test try
     remotecall_fetch(identity, wid0, T16091)
     false
 catch ex
-    ((ex::RemoteException).captured::CapturedException).ex === UndefVarError(:T16091)
+    ((ex::RemoteException).captured::CapturedException).ex === UndefVarError(:T16091, Main)
 end

 f16091a() = 1
@@ -1097,6 +1111,23 @@ let
     @test_throws RemoteException fetch(ref)
 end

+# Test the behaviour of remotecall(f, ::AbstractWorkerPool); this should
+# keep the worker out of the pool until the underlying remotecall has
+# finished.
+for PoolType in (WorkerPool, CachingPool)
+    let
+        remotechan = RemoteChannel(wrkr1)
+        pool = PoolType([wrkr1])
+        put_future = remotecall(() -> wait(remotechan), pool)
+        @test !isready(pool)
+        put!(remotechan, 1)
+        wait(put_future)
+        # The task that waits on the future to put it back into the pool runs
+        # asynchronously, so we use timedwait() to check when the worker is back in.
+        @test timedwait(() -> isready(pool), 10) === :ok
+    end
+end
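The loop above pins down the new pool semantics of `remotecall`: the worker stays checked out until the returned Future completes, unlike `remote_do`, whose result cannot be waited on. A small usage sketch of that contract (illustrative only; assumes one spare worker is available):

    using Distributed
    addprocs(1)
    pool = WorkerPool(workers())

    # The single worker is checked out for as long as the call is running ...
    fut = remotecall(() -> (sleep(1); myid()), pool)
    @assert !isready(pool)

    # ... and fetch() sees the result once it completes. The worker is returned
    # to the pool asynchronously shortly afterwards.
    @assert fetch(fut) in pool.workers

    # remote_do() has no waitable result, so its worker is returned immediately.
    remote_do(println, pool, "fire and forget")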
 # Test calling @everywhere from a module not defined on the workers
 module LocalBar
     using Distributed
@@ -1756,18 +1787,17 @@ function reuseport_tests(;role = :default)
     end

     # Ensure that the code has indeed been successfully executed everywhere
-    @test all(in(results), procs())
+    return all(in(results), procs())
 end

 # Test that the client port is reused. SO_REUSEPORT may not be supported on
 # all UNIX platforms, Linux kernels prior to 3.9 and older versions of OSX
 @assert nprocs() == 1
 addprocs_with_testenv(4; lazy=false)
-if ccall(:jl_has_so_reuseport, Int32, ()) == 1
-    reuseport_tests()
-else
-    @info "SO_REUSEPORT is unsupported, skipping reuseport tests"
-end
+
+skip_reuseport = ccall(:jl_has_so_reuseport, Int32, ()) != 1
+skip_reuseport && @debug "SO_REUSEPORT support missing, reuseport_tests skipped"
+@test reuseport_tests() skip = skip_reuseport

 # issue #27933
 a27933 = :_not_defined_27933
@@ -1841,9 +1871,10 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp
     project = mkdir(joinpath(tmp, "project"))
     depots = [mkdir(joinpath(tmp, "depot1")), mkdir(joinpath(tmp, "depot2"))]
     load_path = [mkdir(joinpath(tmp, "load_path")), "@stdlib", "@"]
-    pathsep = Sys.iswindows() ? ";" : ":"
+    shipped_depots = DEPOT_PATH[2:end] # stdlib caches
     env = Dict(
-        "JULIA_DEPOT_PATH" => join(depots, pathsep),
+        # needs a trailing pathsep to access the stdlib depot
+        "JULIA_DEPOT_PATH" => join(depots, pathsep) * pathsep,
         "JULIA_LOAD_PATH" => join(load_path, pathsep),
         # Explicitly propagate `TMPDIR`, in the event that we're running on a
         # CI system where `TMPDIR` is special.
@@ -1873,7 +1904,7 @@
     end
     """
     cmd = setenv(`$(julia) -p1 -e $(testcode * extracode)`, env)
-    @test success(cmd)
+    @test success(pipeline(cmd; stdout, stderr))
     # --project
     extracode = """
     for w in workers()
@@ -1882,11 +1913,11 @@
     end
     """
     cmd = setenv(`$(julia) --project=$(project) -p1 -e $(testcode * extracode)`, env)
-    @test success(cmd)
+    @test success(pipeline(cmd; stdout, stderr))
     # JULIA_PROJECT
     cmd = setenv(`$(julia) -p1 -e $(testcode * extracode)`,
                  (env["JULIA_PROJECT"] = project; env))
-    @test success(cmd)
+    @test success(pipeline(cmd; stdout, stderr))
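The trailing `pathsep` appended to `JULIA_DEPOT_PATH` above is load-bearing: an empty entry in that variable expands to the default depot locations, which is where the shipped stdlib caches live (see the Julia manual on `JULIA_DEPOT_PATH` for the exact expansion rules). A quick illustration of the convention, with hypothetical paths:

    depots = ["/tmp/depot1", "/tmp/depot2"]   # hypothetical depot directories
    pathsep = Sys.iswindows() ? ";" : ":"

    join(depots, pathsep)            # "/tmp/depot1:/tmp/depot2"
                                     # workers would see only these two depots
    join(depots, pathsep) * pathsep  # "/tmp/depot1:/tmp/depot2:"
                                     # the trailing empty entry expands to the
                                     # default depots, stdlib caches included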
     # Pkg.activate(...)
     activateish = """
     Base.ACTIVE_PROJECT[] = $(repr(project))
     using Distributed
     addprocs(1)
     """
     cmd = setenv(`$(julia) -e $(activateish * testcode * extracode)`, env)
-    @test success(cmd)
+    @test success(pipeline(cmd; stdout, stderr))
     # JULIA_(LOAD|DEPOT)_PATH
     shufflecode = """
-    d = reverse(DEPOT_PATH)
-    append!(empty!(DEPOT_PATH), d)
+    function reverse_first_two(depots)
+        custom_depots = depots[1:2]
+        standard_depots = depots[3:end]
+        custom_depots = reverse(custom_depots)
+        return append!(custom_depots, standard_depots)
+    end
+    new_depots = reverse_first_two(DEPOT_PATH)
+    append!(empty!(DEPOT_PATH), new_depots)
     l = reverse(LOAD_PATH)
     append!(empty!(LOAD_PATH), l)
     """
@@ -1913,23 +1950,23 @@
     end
     """
     cmd = setenv(`$(julia) -e $(shufflecode * addcode * testcode * extracode)`, env)
-    @test success(cmd)
+    @test success(pipeline(cmd; stdout, stderr))
     # Mismatch when shuffling after proc addition
     failcode = shufflecode * setupcode * """
     for w in workers()
         @test remotecall_fetch(load_path, w) == reverse(LOAD_PATH) == $(repr(load_path))
-        @test remotecall_fetch(depot_path, w) == reverse(DEPOT_PATH) == $(repr(depots))
+        @test remotecall_fetch(depot_path, w) == $(repr(vcat(reverse(depots), shipped_depots)))
     end
     """
     cmd = setenv(`$(julia) -p1 -e $(failcode)`, env)
-    @test success(cmd)
+    @test success(pipeline(cmd; stdout, stderr))
     # Passing env or exeflags to addprocs(...) to override defaults
     envcode = """
     using Distributed
     project = mktempdir()
     env = Dict(
         "JULIA_LOAD_PATH" => string(LOAD_PATH[1], $(repr(pathsep)), "@stdlib"),
-        "JULIA_DEPOT_PATH" => DEPOT_PATH[1],
+        "JULIA_DEPOT_PATH" => DEPOT_PATH[1] * $(repr(pathsep)),
         "TMPDIR" => ENV["TMPDIR"],
     )
     addprocs(1; env = env, exeflags = `--project=\$(project)`)
@@ -1937,14 +1974,14 @@
     addprocs(1; env = env)
     """ * setupcode * """
     for w in workers()
-        @test remotecall_fetch(depot_path, w) == [DEPOT_PATH[1]]
+        @test remotecall_fetch(depot_path, w) == vcat(DEPOT_PATH[1], $(repr(shipped_depots)))
         @test remotecall_fetch(load_path, w) == [LOAD_PATH[1], "@stdlib"]
         @test remotecall_fetch(active_project, w) == project
         @test remotecall_fetch(Base.active_project, w) == joinpath(project, "Project.toml")
     end
     """
     cmd = setenv(`$(julia) -e $(envcode)`, env)
-    @test success(cmd)
+    @test success(pipeline(cmd; stdout, stderr))
 end
 end
 include("splitrange.jl")
@@ -1960,7 +1997,7 @@ begin
     # Next, ensure we get a log message when a worker does not cleanly exit
     w = only(addprocs(1))
-    @test_logs (:warn, r"sending SIGQUIT") begin
+    @test_logs (:warn, r"Sending SIGQUIT") match_mode=:any begin
         remote_do(w) do
             # Cause the 'exit()' message that `rmprocs()` sends to do nothing
             Core.eval(Base, :(exit() = nothing))
@@ -1973,7 +2010,10 @@ end
 # Run topology tests last after removing all workers, since a given
 # cluster at any time only supports a single topology.
-nprocs() > 1 && rmprocs(workers())
+if nprocs() > 1
+    rmprocs(workers())
+end

+include("threads.jl")
 include("topology.jl")
diff --git a/test/runtests.jl b/test/runtests.jl
index d34d07c..3651f70 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,14 +1,21 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license

+using Test
+using Distributed
+# Only run these if Aqua is installed, i.e.
Pkg.test has installed it or it is provided as a shared package.
+if Base.locate_package(Base.PkgId(Base.UUID("4c88cf16-eb10-579e-8560-4a9242c79595"), "Aqua")) isa String
+    @testset "Aqua.jl tests" begin
+        include("aqua.jl")
+    end
+end
+
 # Run the distributed test outside of the main driver since it needs its own
 # set of dedicated workers.
 include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "testenv.jl"))
 disttestfile = joinpath(@__DIR__, "distributed_exec.jl")
-cmd = `$test_exename $test_exeflags $disttestfile`
-
-if !success(pipeline(cmd; stdout=stdout, stderr=stderr)) && ccall(:jl_running_on_valgrind,Cint,()) == 0
-    error("Distributed test failed, cmd : $cmd")
+@testset let cmd = `$test_exename $test_exeflags $disttestfile`
+    # Tolerate failures under valgrind, as the original driver did.
+    @test success(pipeline(cmd; stdout=stdout, stderr=stderr)) skip = (ccall(:jl_running_on_valgrind, Cint, ()) != 0)
 end

 include("managers.jl")
diff --git a/test/threads.jl b/test/threads.jl
new file mode 100644
index 0000000..c978dd4
--- /dev/null
+++ b/test/threads.jl
@@ -0,0 +1,55 @@
+using Test
+using Distributed
+using Base.Iterators: product

+exeflags = ("--startup-file=no",
+            "--check-bounds=yes",
+            "--depwarn=error",
+            "--threads=2")

+function call_on(f, wid, tid)
+    remotecall(wid) do
+        t = Task(f)
+        ccall(:jl_set_task_tid, Cvoid, (Any, Cint), t, tid - 1)
+        schedule(t)
+        @assert Threads.threadid(t) == tid
+        t
+    end
+end

+# Run a function on the process holding the data so that only the result of `f`
+# is serialized. This is useful for things that cannot be serialized (e.g.
+# running tasks) or that would be unnecessarily big if serialized.
+fetch_from_owner(f, rr) = remotecall_fetch(f ∘ fetch, rr.where, rr)

+isdone(rr) = fetch_from_owner(istaskdone, rr)
+isfailed(rr) = fetch_from_owner(istaskfailed, rr)

+@testset "RemoteChannel allows put!/take! from thread other than 1" begin
+    ws = ts = product(1:2, 1:2)

+    # We want (the default) laziness, so that we wait for `Worker.c_state`!
+    procs_added = addprocs(2; exeflags, lazy=true)

+    @testset "from worker $w1 to $w2 via 1" for (w1, w2) in ws
+        @testset "from thread $w1.$t1 to $w2.$t2" for (t1, t2) in ts
+            p1 = procs_added[w1]
+            p2 = procs_added[w2]
+            chan_id = first(procs_added)
+            chan = RemoteChannel(chan_id)
+            send = call_on(p1, t1) do
+                put!(chan, nothing)
+            end
+            recv = call_on(p2, t2) do
+                take!(chan)
+            end
+            # Wait on the spawned tasks on the owner. Note that we use
+            # timedwait() instead of @sync to avoid deadlocks.
+            wait_recv = Threads.@spawn fetch_from_owner(wait, recv)
+            wait_send = Threads.@spawn fetch_from_owner(wait, send)
+            @test timedwait(() -> istaskdone(wait_recv), 60) == :ok
+            @test timedwait(() -> istaskdone(wait_send), 60) == :ok
+            # Check the tasks
+            @test isdone(send)
+            @test isdone(recv)
+            @test !isfailed(send)
+            @test !isfailed(recv)
+        end
+    end
+    rmprocs(procs_added)
+end
\ No newline at end of file
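Background on the `call_on` helper in the new threads test: it pins a freshly created task to a specific thread via the internal `jl_set_task_tid` runtime entry point before scheduling it, which is what lets the test drive put!/take! from threads other than 1. A single-process sketch of the same idiom (the helper name here is made up for illustration; `jl_set_task_tid` is an internal, unexported runtime function):

    using Base.Threads

    # Pin a task to a given (1-based) thread id before scheduling it,
    # mirroring call_on() above minus the Distributed layer.
    function run_on_thread(f, tid::Int)
        t = Task(f)
        ccall(:jl_set_task_tid, Cvoid, (Any, Cint), t, tid - 1) # 0-based tid
        schedule(t)
        return t
    end

    # Requires julia started with at least two threads, e.g. --threads=2.
    t = run_on_thread(threadid, 2)
    @assert fetch(t) == 2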
From 7d93f671c68b2e50419e4c319f27131d14148536 Mon Sep 17 00:00:00 2001
From: decarvalhojunior-fh
Date: Fri, 20 Jun 2025 12:29:56 -0300
Subject: [PATCH 54/54] ...

---
 src/cluster.jl  |  2 +-
 src/macros.jl   | 12 ++++++------
 src/managers.jl |  3 ++-
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/cluster.jl b/src/cluster.jl
index 53dca5d..f291b66 100644
--- a/src/cluster.jl
+++ b/src/cluster.jl
@@ -355,7 +355,7 @@ function read_worker_host_port(io::IO)
             throw(LaunchWorkerError("Unable to read host:port string from worker. Launch command exited with error?"))
         end
-        @info "conninfo: $conninfo"
+        #@info "conninfo: $conninfo"
         ntries -= 1
         bind_addr, port = parse_connection_info(conninfo)
diff --git a/src/macros.jl b/src/macros.jl
index 7a85cd2..aeb9084 100644
--- a/src/macros.jl
+++ b/src/macros.jl
@@ -46,7 +46,7 @@ julia> fetch(f)
 function check_args_2(args...)
     na = length(args)
     if na==1
-        role = :(role = :default)
+        role = Expr(:kw, :role, :(:default)) #:(role = :default)
         expr = args[1]
     elseif na==2
         role = args[1]
@@ -107,7 +107,7 @@ julia> fetch(f)
 function check_args_3a(args...)
     na = length(args)
     if na==2
-        role = :(role = :default)
+        role = Expr(:kw, :role, :(:default)) #:(role = :default)
         p = args[1]
         expr = args[2]
     elseif na==3
@@ -123,7 +123,7 @@ end
 macro spawnat(args...)
     rolearg, p, expr = check_args_3a(args...)
-    #@info rolearg, typeof(rolearg)
+    #@info rolearg, typeof(rolearg)
     thunk = esc(:(()->($expr)))
     var = esc(Base.sync_varname)
@@ -132,7 +132,7 @@ macro spawnat(args...)
     else
         spawncall = :(spawnat($(esc(p)), $thunk; $(esc(rolearg))))
     end
-    quote
+    quote
         local ref = $spawncall
         if $(Expr(:islocal, var))
             put!($var, ref)
@@ -247,7 +247,7 @@ function check_args_3b(args...)
     na = length(args)
     if na==1
-        rolearg = :(role = :default)
+        rolearg = Expr(:kw, :role, :(:default)) #:(role = :default)
         reducer = nothing
         loop = args[1]
     elseif na==2
@@ -256,7 +256,7 @@ function check_args_3b(args...)
         reducer = nothing
         loop = args[2]
     else
-        rolearg = :(role = :default)
+        rolearg = Expr(:kw, :role, :(:default)) #:(role = :default)
         reducer = args[1]
         loop = args[2]
     end
diff --git a/src/managers.jl b/src/managers.jl
index 739f13d..658c98a 100644
--- a/src/managers.jl
+++ b/src/managers.jl
@@ -642,7 +642,8 @@ function connect(manager::ClusterManager, pid::Int, config::WorkerConfig)
                 release(sem)
             end
         else
-            (s, bind_addr) = connect_to_worker(#=bind_addr=# pubhost, port)
+#            (s, bind_addr) = connect_to_worker(#=bind_addr=# pubhost, port)
+            (s, bind_addr) = connect_to_worker(bind_addr, port)
         end
         config.bind_addr = bind_addr
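A closing note on the macros.jl change: when an expression is spliced into a hand-built call, keyword arguments must be represented as `Expr(:kw, name, value)` nodes; an interpolated `role = :default` is an assignment node (head `:(=)`) and is not re-parsed as a keyword. A minimal, self-contained sketch of the distinction (illustrative, not part of the patch):

    g(x; role=:worker) = (x, role)

    # Keyword arguments in a constructed call live in an Expr(:parameters, ...)
    # block whose entries are Expr(:kw, ...) nodes.
    kw   = Expr(:kw, :role, QuoteNode(:default))
    call = Expr(:call, :g, Expr(:parameters, kw), 1)
    @assert eval(call) == (1, :default)

    # The old form produced an assignment node instead, which is not
    # interpreted as a keyword when spliced into a call.
    @assert Meta.parse("role = :default").head === :(=)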