diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 01e6847..caadcc0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,53 +14,51 @@ concurrency: # Cancel intermediate builds: only pull request builds group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref != 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release-') || github.run_number }} cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} + jobs: test: - name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} + name: julia -t${{ matrix.threads}} - ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} runs-on: ${{ matrix.os }} + timeout-minutes: 30 strategy: fail-fast: false matrix: - version: - - 'nightly' - os: - - ubuntu-latest - - macOS-latest - - windows-latest - arch: - - x64 - - x86 + threads: + # - '1' + - '4,4' + version: [nightly] + os: [ubuntu-latest, windows-latest, macOS-latest] + arch: [x64, x86, aarch64] exclude: + - os: ubuntu-latest + arch: aarch64 + - os: windows-latest + arch: aarch64 + - os: macOS-latest + arch: x64 - os: macOS-latest arch: x86 steps: - - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - - uses: actions/cache@v1 - env: - cache-name: cache-artifacts - with: - path: ~/.julia/artifacts - key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }} - restore-keys: | - ${{ runner.os }}-test-${{ env.cache-name }}- - ${{ runner.os }}-test-${{ matrix.os }} - ${{ runner.os }}- - - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/cache@v2 - uses: julia-actions/julia-runtest@v1 env: JULIA_DISTRIBUTED_TESTING_STANDALONE: 1 + JULIA_NUM_THREADS: '${{ matrix.threads}}' - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v1 + - uses: codecov/codecov-action@v5 with: - file: lcov.info + files: lcov.info + token: ${{ secrets.CODECOV_TOKEN }} + docs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@latest with: # version: '1.6' diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..df02284 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +Manifest.toml +*.swp diff --git a/Project.toml b/Project.toml index bb30760..382b1ab 100644 --- a/Project.toml +++ b/Project.toml @@ -8,8 +8,18 @@ Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" Sockets = "6462fe0b-24de-5631-8697-dd941f90decc" [extras] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["LinearAlgebra", "Test"] +test = ["Aqua", "LinearAlgebra", "Test"] + +[compat] +Aqua = "0.8.10" +LinearAlgebra = "1" +Random = "1" +Serialization = "1" +Sockets = "1" +Test = "1" +julia = "1" diff --git a/README.md b/README.md index 76f6355..845347a 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,57 @@ -# Distributed - -The `Distributed` package provides functionality for creating and controlling multiple Julia processes remotely, and for performing distributed and parallel computing. 
It uses network sockets or other supported interfaces to communicate between Julia processes, and relies on Julia's `Serialization` stdlib package to transform Julia objects into a format that can be transferred between processes efficiently. It provides a full set of utilities to create and destroy new Julia processes and add them to a "cluster" (a collection of Julia processes connected together), as well as functions to perform Remote Procedure Calls (RPC) between the processes within a cluster. See [`API`](@ref) for details.
-
+# Distributed (with a multiscale parallelism extension)
+
+The `Distributed` package provides functionality for creating and controlling
+multiple Julia processes remotely, and for performing distributed and parallel
+computing. It uses network sockets or other supported interfaces to communicate
+between Julia processes, and relies on Julia's `Serialization` stdlib package to
+transform Julia objects into a format that can be transferred between processes
+efficiently. It provides a full set of utilities to create and destroy new Julia
+processes and add them to a "cluster" (a collection of Julia processes connected
+together), as well as functions to perform Remote Procedure Calls (RPC) between
+the processes within a cluster. See the `API` section for details.

This package ships as part of the Julia stdlib.

+> [!NOTE]
+> This repository is a fork of the original [`Distributed`](https://github.com/JuliaLang/Distributed.jl) package for exploring support for _multiscale parallelism_ in Julia. In broad terms, this extension allows worker processes to execute the `addprocs` operation, so that a worker process may also play the role of a master process with respect to a set of worker processes it creates by invoking `addprocs`. For that, all `Distributed` operations listed below are extended with a keyword parameter `role`, with three possible values: `:default` (default argument), `:master`, and `:worker`. So, a worker that created processes by means of `addprocs` may execute operations as:
+> * a ***worker process*** by using `role = :worker`, for interacting with the master process that created it, as well as with other workers; or
+> * a ***master process*** by using `role = :master`, for interacting with the workers it created.
+>
+> It is important to note that these modifications to the API do not affect existing `Distributed` programs.
+>
+> Multiscale parallelism may help programmers in at least two scenarios:
+> * to deploy _multicluster computations_, i.e. parallel computations employing multiple clusters, where the parallel programming patterns and tools at the multicluster and cluster levels are distinct;
+> * to better support _multilevel parallel programming_ patterns.
+>
+> We are working on the implementation of case studies. A short usage sketch of the `role` keyword is included below.
+
## Using development versions of this package

To use a newer version of this package, you need to build Julia from scratch. The build process is the same as any other build except that you need to change the commit used in `stdlib/Distributed.version`. It's also possible to load a development version of the package using [the trick used in the Section named "Using the development version of Pkg.jl" in the `Pkg.jl` repo](https://github.com/JuliaLang/Pkg.jl#using-the-development-version-of-pkgjl), but the capabilities are limited as all other packages will depend on the stdlib version of the package and will not work with the modified package.
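+
+As a brief illustration of the `role` keyword described in the note above, here
+is a minimal sketch. It assumes a plain local setup and is illustrative only;
+the exact behavior may evolve as this fork develops.
+
+```julia
+using Distributed
+
+addprocs(2)    # process 1 (the top-level master) creates workers 2 and 3
+
+# Ask worker 2 to create workers of its own, making it a master for them.
+remotecall_fetch(2) do
+    addprocs(2)                    # worker 2 now also acts as a master
+    (nworkers(role = :master),     # number of workers created by worker 2 itself
+     myid(role = :worker))         # worker 2's id in the cluster that created it
+end
+```
+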
+### On Julia 1.11+ +In Julia 1.11 Distributed was excised from the default system image and became +more of an independent package. As such, to use a different version it's enough +to just `dev` it explicitly: +```julia-repl +pkg> dev https://github.com/JuliaLang/Distributed.jl.git +``` +### On older Julia versions +To use a newer version of this package on older Julia versions, you need to build +Julia from scratch. The build process is the same as any other build except that +you need to change the commit used in `stdlib/Distributed.version`. +It's also possible to load a development version of the package using [the trick +used in the Section named "Using the development version of Pkg.jl" in the +`Pkg.jl` +repo](https://github.com/JuliaLang/Pkg.jl#using-the-development-version-of-pkgjl), +but the capabilities are limited as all other packages will depend on the stdlib +version of the package and will not work with the modified package. + ## API -The public API of `Distributed` consists of a variety of functions for various tasks; for creating and destroying processes within a cluster: +The public API of `Distributed` consists of a variety of functions for various +tasks; for creating and destroying processes within a cluster: - `addprocs` - create one or more Julia processes and connect them to the cluster - `rmprocs` - shutdown and remove one or more Julia processes from the cluster @@ -20,7 +59,9 @@ The public API of `Distributed` consists of a variety of functions for various t For controlling other processes via RPC: - `remotecall` - call a function on another process and return a `Future` referencing the result of that call -- `Future` - an object that references the result of a `remotecall` that hasn't yet completed - use `fetch` to return the call's result, or `wait` to just wait for the remote call to finish +- `Future` - an object that references the result of a `remotecall` that hasn't + yet completed - use `fetch` to return the call's result, or `wait` to just + wait for the remote call to finish. - `remotecall_fetch` - the same as `fetch(remotecall(...))` - `remotecall_wait` - the same as `wait(remotecall(...))` - `remote_do` - like `remotecall`, but does not provide a way to access the result of the call @@ -49,7 +90,15 @@ For controlling multiple processes at once: ### Process Identifiers -Julia processes connected with `Distributed` are all assigned a cluster-unique `Int` identifier, starting from `1`. The first Julia process within a cluster is given ID `1`, while other processes added via `addprocs` get incrementing IDs (`2`, `3`, etc.). Functions and macros which communicate from one process to another usually take one or more identifiers to determine which process they target - for example, `remotecall_fetch(myid, 2)` calls `myid()` on process 2. - -!!! note - Only process 1 (often called the "head", "primary", or "master") may add or remove processes, and manages the rest of the cluster. Other processes (called "workers" or "worker processes") may still call functions on each other and send and receive data, but `addprocs`/`rmprocs` on worker processes will fail with an error. +Julia processes connected with `Distributed` are all assigned a cluster-unique +`Int` identifier, starting from `1`. The first Julia process within a cluster is +given ID `1`, while other processes added via `addprocs` get incrementing IDs +(`2`, `3`, etc.). 
Functions and macros which communicate from one process to +another usually take one or more identifiers to determine which process they +target - for example, `remotecall_fetch(myid, 2)` calls `myid()` on process 2. + +**Note:** Only process 1 (often called the "head", "primary", or "master") may +add or remove processes, and manages the rest of the cluster. Other processes +(called "workers" or "worker processes") may still call functions on each other +and send and receive data, but `addprocs`/`rmprocs` on worker processes will +fail with an error. \ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md deleted file mode 100644 index 22d63ce..0000000 --- a/docs/src/index.md +++ /dev/null @@ -1,71 +0,0 @@ -# [Distributed Computing](@id man-distributed) - -```@docs -Distributed -Distributed.addprocs -Distributed.nprocs -Distributed.nworkers -Distributed.procs() -Distributed.procs(::Integer) -Distributed.workers -Distributed.rmprocs -Distributed.interrupt -Distributed.myid -Distributed.pmap -Distributed.RemoteException -Distributed.ProcessExitedException -Distributed.Future -Distributed.RemoteChannel -Distributed.fetch(::Distributed.Future) -Distributed.fetch(::RemoteChannel) -Distributed.remotecall(::Any, ::Integer, ::Any...) -Distributed.remotecall_wait(::Any, ::Integer, ::Any...) -Distributed.remotecall_fetch(::Any, ::Integer, ::Any...) -Distributed.remote_do(::Any, ::Integer, ::Any...) -Distributed.put!(::RemoteChannel, ::Any...) -Distributed.put!(::Distributed.Future, ::Any) -Distributed.take!(::RemoteChannel, ::Any...) -Distributed.isready(::RemoteChannel, ::Any...) -Distributed.isready(::Distributed.Future) -Distributed.AbstractWorkerPool -Distributed.WorkerPool -Distributed.CachingPool -Distributed.default_worker_pool -Distributed.clear! -Distributed.remote -Distributed.remotecall(::Any, ::AbstractWorkerPool, ::Any...) -Distributed.remotecall_wait(::Any, ::AbstractWorkerPool, ::Any...) -Distributed.remotecall_fetch(::Any, ::AbstractWorkerPool, ::Any...) -Distributed.remote_do(::Any, ::AbstractWorkerPool, ::Any...) -Distributed.@spawn -Distributed.@spawnat -Distributed.@fetch -Distributed.@fetchfrom -Distributed.@distributed -Distributed.@everywhere -Distributed.remoteref_id -Distributed.channel_from_id -Distributed.worker_id_from_socket -Distributed.cluster_cookie() -Distributed.cluster_cookie(::Any) -``` - -## Cluster Manager Interface - -This interface provides a mechanism to launch and manage Julia workers on different cluster environments. -There are two types of managers present in Base: `LocalManager`, for launching additional workers on the -same host, and `SSHManager`, for launching on remote hosts via `ssh`. TCP/IP sockets are used to connect -and transport messages between processes. It is possible for Cluster Managers to provide a different transport. 
- -```@docs -Distributed.ClusterManager -Distributed.WorkerConfig -Distributed.launch -Distributed.manage -Distributed.kill(::ClusterManager, ::Int, ::WorkerConfig) -Distributed.connect(::ClusterManager, ::Int, ::WorkerConfig) -Distributed.init_worker -Distributed.start_worker -Distributed.process_messages -Distributed.default_addprocs_params -``` diff --git a/src/Distributed.jl b/src/Distributed.jl index a7c5b17..4a44266 100644 --- a/src/Distributed.jl +++ b/src/Distributed.jl @@ -15,7 +15,7 @@ using Base: Process, Semaphore, JLOptions, buffer_writes, @async_unwrap, julia_cmd, AsyncGenerator, acquire, release, invokelatest, shell_escape_posixly, shell_escape_csh, shell_escape_wincmd, escape_microsoft_c_args, - uv_error, something, notnothing, isbuffered, mapany + uv_error, something, notnothing, isbuffered, mapany, SizeUnknown using Base.Threads: Event using Serialization, Sockets @@ -49,6 +49,7 @@ export procs, remote, remotecall, + remotecall_eval, remotecall_fetch, remotecall_wait, remote_do, @@ -72,14 +73,15 @@ export check_same_host function _require_callback(mod::Base.PkgId) - if Base.toplevel_load[] && myid() == 1 && nprocs() > 1 + if Base.toplevel_load[] && nprocs(role=:master) > 1 # broadcast top-level (e.g. from Main) import/using from node 1 (only) - @sync for p in procs() + @sync for p in procs(role = :master) + #@info "require callback", p p == 1 && continue # Extensions are already loaded on workers by their triggers being loaded # so no need to fire the callback upon extension being loaded on master. Base.loading_extension && continue - @async_unwrap remotecall_wait(p) do + @async_unwrap remotecall_wait(p; role = :master) do Base.require(mod) nothing end @@ -94,7 +96,7 @@ struct RRID whence::Int id::Int - RRID() = RRID(myid(), next_ref_id()) + RRID(;role= :default) = RRID(myid(role=role), next_ref_id()) RRID(whence, id) = new(whence, id) end diff --git a/src/cluster.jl b/src/cluster.jl index 2444695..f291b66 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -99,10 +99,10 @@ mutable struct Worker del_msgs::Array{Any,1} # XXX: Could del_msgs and add_msgs be Channels? 
add_msgs::Array{Any,1} @atomic gcflag::Bool - state::WorkerState - c_state::Condition # wait for state changes - ct_time::Float64 # creation time - conn_func::Any # used to setup connections lazily + @atomic state::WorkerState + c_state::Threads.Condition # wait for state changes, lock for state + ct_time::Float64 # creation time + conn_func::Any # used to setup connections lazily r_stream::IO w_stream::IO @@ -115,8 +115,8 @@ mutable struct Worker function Worker(id::Int, r_stream::IO, w_stream::IO, manager::ClusterManager; version::Union{VersionNumber, Nothing}=nothing, - config::WorkerConfig=WorkerConfig()) - w = Worker(id) + config::WorkerConfig=WorkerConfig(), role= :default) + w = Worker(id; role = role) w.r_stream = r_stream w.w_stream = buffer_writes(w_stream) w.w_serializer = ClusterSerializer(w.w_stream) @@ -128,56 +128,63 @@ mutable struct Worker w end - Worker(id::Int) = Worker(id, nothing) - function Worker(id::Int, conn_func) + Worker(id::Int; role= :default) = Worker(id, nothing; role = role) + function Worker(id::Int, conn_func; role= :default) @assert id > 0 + map_pid_wrkr = Map_pid_wrkr(role = role) if haskey(map_pid_wrkr, id) return map_pid_wrkr[id] end - w=new(id, Threads.ReentrantLock(), [], [], false, W_CREATED, Condition(), time(), conn_func) + w=new(id, Threads.ReentrantLock(), [], [], false, W_CREATED, Threads.Condition(), time(), conn_func) w.initialized = Event() - register_worker(w) + register_worker(w; role = role) w end - Worker() = Worker(get_next_pid()) + Worker(;role= :default) = Worker(get_next_pid(); role = role) end +wid(w::Worker; role= :default) = w.id + function set_worker_state(w, state) - w.state = state - notify(w.c_state; all=true) + lock(w.c_state) do + @atomic w.state = state + notify(w.c_state; all=true) + end end -function check_worker_state(w::Worker) - if w.state === W_CREATED - if !isclusterlazy() - if PGRP.topology === :all_to_all +function check_worker_state(w::Worker; role= :default) + if (@atomic w.state) === W_CREATED + if !isclusterlazy(role = role) + pg = PGRP(role = role) + if pg.topology === :all_to_all # Since higher pids connect with lower pids, the remote worker # may not have connected to us yet. Wait for some time. - wait_for_conn(w) + wait_for_conn(w; role=role) else - error("peer $(w.id) is not connected to $(myid()). Topology : " * string(PGRP.topology)) + error("peer $(wid(w, role=role)) is not connected to $(myid(role=role)). Topology : " * string(pg.topology)) end else w.ct_time = time() - if myid() > w.id - t = @async exec_conn_func(w) + if myid(role=role) > wid(w, role=role) + t = @async exec_conn_func(w; role=role) else # route request via node 1 - t = @async remotecall_fetch((p,to_id) -> remotecall_fetch(exec_conn_func, p, to_id), 1, w.id, myid()) + t = @async remotecall_fetch((p,to_id) -> remotecall_fetch((to_id, role2) -> exec_conn_func(to_id; role = role2), p, to_id, p == 1 ? :master : :worker; role = :master), 1, wid(w, role=role), myid(role=role); role=role) end errormonitor(t) - wait_for_conn(w) + wait_for_conn(w; role=role) end end + return nothing end -exec_conn_func(id::Int) = exec_conn_func(worker_from_id(id)::Worker) -function exec_conn_func(w::Worker) +exec_conn_func(id::Int; role= :default) = exec_conn_func(worker_from_id(id; role = role)::Worker; role = role) +function exec_conn_func(w::Worker; role= :default) try f = notnothing(w.conn_func) # Will be called if some other task tries to connect at the same time. 
- w.conn_func = () -> wait_for_conn(w) + w.conn_func = () -> wait_for_conn(w; role=role) f() catch e w.conn_func = () -> throw(e) @@ -186,14 +193,16 @@ function exec_conn_func(w::Worker) nothing end -function wait_for_conn(w) - if w.state === W_CREATED +function wait_for_conn(w; role=:defaut) + if (@atomic w.state) === W_CREATED timeout = worker_timeout() - (time() - w.ct_time) - timeout <= 0 && error("peer $(w.id) has not connected to $(myid())") + timeout <= 0 && error("peer $(wid(w, role=role)) has not connected to $(myid(role=role))") - @async (sleep(timeout); notify(w.c_state; all=true)) - wait(w.c_state) - w.state === W_CREATED && error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds") + if timedwait(() -> (@atomic w.state) === W_CONNECTED, timeout) === :timed_out + # Notify any waiters on the state and throw + @lock w.c_state notify(w.c_state) + error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds") + end end nothing end @@ -201,11 +210,29 @@ end ## process group creation ## mutable struct LocalProcess - id::Int + id0::Int + id1::Int bind_addr::String + bind_addr_2::String bind_port::UInt16 cookie::String - LocalProcess() = new(1) + LocalProcess() = new(1,1) +end + +function wid(lp::LocalProcess; role= :default) + if role == :master + return lp.id1 + elseif role == :worker + return lp.id0 + elseif role == :default && myrole() == :master + return lp.id1 # as :master + elseif role == :default && myrole() == :worker + return lp.id0 # as :worker + else + return lp.id1 # as :master + #throw("unexpected use of role=:default (wid)") + end + end worker_timeout() = parse(Float64, get(ENV, "JULIA_WORKER_TIMEOUT", "60.0")) @@ -230,6 +257,7 @@ It does not return. """ start_worker(cookie::AbstractString=readline(stdin); kwargs...) = start_worker(stdout, cookie; kwargs...) function start_worker(out::IO, cookie::AbstractString=readline(stdin); close_stdin::Bool=true, stderr_to_stdout::Bool=true) + init_multi() if close_stdin # workers will not use it @@ -249,12 +277,9 @@ function start_worker(out::IO, cookie::AbstractString=readline(stdin); close_std end errormonitor(@async while isopen(sock) client = accept(sock) - process_messages(client, client, true) + process_messages(client, client, true; role = :worker) end) - print(out, "julia_worker:") # print header - print(out, "$(string(LPROC.bind_port))#") # print port - print(out, LPROC.bind_addr) - print(out, '\n') + println(out, "julia_worker:$(string(LPROC.bind_port))#$(LPROC.bind_addr_2)\n") # print header flush(out) Sockets.nagle(sock, false) @@ -270,7 +295,7 @@ function start_worker(out::IO, cookie::AbstractString=readline(stdin); close_std check_master_connect() while true; wait(); end catch err - print(stderr, "unhandled exception on $(myid()): $(err)\nexiting.\n") + print(stderr, "unhandled exception on $(myid(role = :worker)): $(err)\nexiting.\n") end close(sock) @@ -330,6 +355,8 @@ function read_worker_host_port(io::IO) throw(LaunchWorkerError("Unable to read host:port string from worker. Launch command exited with error?")) end + #@info "conninfo: $conninfo" + ntries -= 1 bind_addr, port = parse_connection_info(conninfo) if !isempty(bind_addr) @@ -379,12 +406,12 @@ function init_worker(cookie::AbstractString, manager::ClusterManager=DefaultClus # Since our pid has yet to be set, ensure no RemoteChannel / Future have been created or addprocs() called. 
@assert nprocs() <= 1 - @assert isempty(PGRP.refs) + @assert isempty(PGRP(role = :worker).refs) @assert isempty(client_refs) # System is started in head node mode, cleanup related entries - empty!(PGRP.workers) - empty!(map_pid_wrkr) + empty!(PGRP(role = :worker).workers) + empty!(Map_pid_wrkr(role = :worker)) cluster_cookie(cookie) nothing @@ -443,10 +470,16 @@ end function addprocs(manager::ClusterManager; kwargs...) init_multi() - cluster_mgmt_from_master_check() +# cluster_mgmt_from_master_check() lock(worker_lock) try + + if myrole() == :worker + myrole!(:master_worker) + end + PGRP(role=:master).level = PGRP(role=:worker).level + 1 + addprocs_locked(manager::ClusterManager; kwargs...) finally unlock(worker_lock) @@ -455,16 +488,18 @@ end function addprocs_locked(manager::ClusterManager; kwargs...) params = merge(default_addprocs_params(manager), Dict{Symbol,Any}(kwargs)) - topology(Symbol(params[:topology])) + topology(Symbol(params[:topology]); role = :master) - if PGRP.topology !== :all_to_all + pgm = PGRP(role = :master) + + if pgm.topology !== :all_to_all params[:lazy] = false end - if PGRP.lazy === nothing || nprocs() == 1 - PGRP.lazy = params[:lazy] - elseif isclusterlazy() != params[:lazy] - throw(ArgumentError(string("Active workers with lazy=", isclusterlazy(), + if pgm.lazy === nothing || nprocs() == 1 + pgm.lazy = params[:lazy] + elseif isclusterlazy(role = :master) != params[:lazy] + throw(ArgumentError(string("Active workers with lazy=", isclusterlazy(role = :master), ". Cannot set lazy=", params[:lazy]))) end @@ -485,19 +520,24 @@ function addprocs_locked(manager::ClusterManager; kwargs...) # call manager's `launch` is a separate task. This allows the master # process initiate the connection setup process as and when workers come # online + # NOTE: Must be `@async`. See FIXME above t_launch = @async launch(manager, params, launched, launch_ntfy) @sync begin while true if isempty(launched) istaskdone(t_launch) && break - @async (sleep(1); notify(launch_ntfy)) + @async begin # NOTE: Must be `@async`. See FIXME above + sleep(1) + notify(launch_ntfy) + end wait(launch_ntfy) end if !isempty(launched) wconfig = popfirst!(launched) let wconfig=wconfig + # NOTE: Must be `@async`. See FIXME above @async setup_launched_worker(manager, wconfig, launched_q) end end @@ -509,17 +549,17 @@ function addprocs_locked(manager::ClusterManager; kwargs...) # Since all worker-to-worker setups may not have completed by the time this # function returns to the caller, send the complete list to all workers. # Useful for nprocs(), nworkers(), etc to return valid values on the workers. - all_w = workers() + all_w = workers(role = :master) for pid in all_w - remote_do(set_valid_processes, pid, all_w) + remote_do((all_w, role) -> set_valid_processes(all_w, role = role), pid, all_w, pid == 1 ? 
:master : :worker; role = :master) end sort!(launched_q) end -function set_valid_processes(plist::Array{Int}) +function set_valid_processes(plist::Array{Int}; role= :default) for pid in setdiff(plist, workers()) - myid() != pid && Worker(pid) + myid(role=role) != pid && Worker(pid; role = role) end end @@ -566,7 +606,7 @@ function launch_n_additional_processes(manager, frompid, fromconfig, cnt, launch exeflags = something(fromconfig.exeflags, ``) cmd = `$exename $exeflags` - new_addresses = remotecall_fetch(launch_additional, frompid, cnt, cmd) + new_addresses = remotecall_fetch(launch_additional, frompid, cnt, cmd; role = :master) for address in new_addresses (bind_addr, port) = address @@ -580,7 +620,7 @@ function launch_n_additional_processes(manager, frompid, fromconfig, cnt, launch let wconfig=wconfig @async begin pid = create_worker(manager, wconfig) - remote_do(redirect_output_from_additional_worker, frompid, pid, port) + remote_do(redirect_output_from_additional_worker, frompid, pid, port; role = :master) push!(launched_q, pid) end end @@ -589,40 +629,42 @@ function launch_n_additional_processes(manager, frompid, fromconfig, cnt, launch end function create_worker(manager, wconfig) + role = :master + # only node 1 can add new nodes, since nobody else has the full list of address:port - @assert LPROC.id == 1 + @assert myid(role=role) == 1 timeout = worker_timeout() # initiate a connect. Does not wait for connection completion in case of TCP. - w = Worker() + w = Worker(role = role) local r_s, w_s try - (r_s, w_s) = connect(manager, w.id, wconfig) + (r_s, w_s) = connect(manager, wid(w, role=role), wconfig) catch ex try - deregister_worker(w.id) - kill(manager, w.id, wconfig) + deregister_worker(wid(w, role=role), role = role) + kill(manager, wid(w, role=role), wconfig) finally rethrow(ex) end end - w = Worker(w.id, r_s, w_s, manager; config=wconfig) + w = Worker(wid(w, role=role), r_s, w_s, manager; config=wconfig, role = role) # install a finalizer to perform cleanup if necessary finalizer(w) do w - if myid() == 1 - manage(w.manager, w.id, w.config, :finalize) + if myid(role=role) == 1 + manage(w.manager, wid(w, role=role), w.config, :finalize) end end # set when the new worker has finished connections with all other workers - ntfy_oid = RRID() - rr_ntfy_join = lookup_ref(ntfy_oid) - rr_ntfy_join.waitingfor = myid() + ntfy_oid = RRID(role = role) + rr_ntfy_join = lookup_ref(ntfy_oid; role = role) + rr_ntfy_join.waitingfor = myid(role=role) # Start a new task to handle inbound messages from connected worker in master. # Also calls `wait_connected` on TCP streams. - process_messages(w.r_stream, w.w_stream, false) + process_messages(w.r_stream, w.w_stream, false; role = :master) # send address information of all workers to the new worker. # Cluster managers set the address of each worker in `WorkerConfig.connect_at`. 
@@ -639,23 +681,29 @@ function create_worker(manager, wconfig) # - On master, receiving a JoinCompleteMsg triggers rr_ntfy_join (signifies that worker setup is complete) join_list = [] - if PGRP.topology === :all_to_all + pgm = PGRP(role = role) + if pgm.topology === :all_to_all # need to wait for lower worker pids to have completed connecting, since the numerical value # of pids is relevant to the connection process, i.e., higher pids connect to lower pids and they # require the value of config.connect_at which is set only upon connection completion - for jw in PGRP.workers - if (jw.id != 1) && (jw.id < w.id) - (jw.state === W_CREATED) && wait(jw.c_state) + for jw in pgm.workers + if (wid(jw, role=role) != 1) && (wid(jw, role=role) < wid(w, role=role)) + # wait for wl to join + if (@atomic jw.state) === W_CREATED + lock(jw.c_state) do + wait(jw.c_state) + end + end push!(join_list, jw) end end - elseif PGRP.topology === :custom + elseif pgm.topology === :custom # wait for requested workers to be up before connecting to them. - filterfunc(x) = (x.id != 1) && isdefined(x, :config) && + filterfunc(x) = (wid(x, role=role) != 1) && isdefined(x, :config) && (notnothing(x.config.ident) in something(wconfig.connect_idents, [])) - wlist = filter(filterfunc, PGRP.workers) + wlist = filter(filterfunc, pgm.workers) waittime = 0 while wconfig.connect_idents !== nothing && length(wlist) < length(wconfig.connect_idents) @@ -664,37 +712,40 @@ function create_worker(manager, wconfig) end sleep(1.0) waittime += 1 - wlist = filter(filterfunc, PGRP.workers) + wlist = filter(filterfunc, pgm.workers) end for wl in wlist - (wl.state === W_CREATED) && wait(wl.c_state) + lock(wl.c_state) do + if (@atomic wl.state) === W_CREATED + # wait for wl to join + wait(wl.c_state) + end + end push!(join_list, wl) end end all_locs = mapany(x -> isa(x, Worker) ? 
- (something(x.config.connect_at, ()), x.id) : - ((), x.id, true), + (something(x.config.connect_at, ()), wid(x, role=role)) : + ((), wid(x, role=role), true), join_list) send_connection_hdr(w, true) enable_threaded_blas = something(wconfig.enable_threaded_blas, false) - join_message = JoinPGRPMsg(w.id, all_locs, PGRP.topology, enable_threaded_blas, isclusterlazy()) - send_msg_now(w, MsgHeader(RRID(0,0), ntfy_oid), join_message) + join_message = JoinPGRPMsg(wid(w, role=role), all_locs, pgm.topology, enable_threaded_blas, isclusterlazy(role = role)) + send_msg_now(w, MsgHeader(RRID(0,0), ntfy_oid), join_message; role = role) - @async manage(w.manager, w.id, w.config, :register) + @async manage(w.manager, wid(w, role=role), w.config, :register) + # wait for rr_ntfy_join with timeout - timedout = false - @async (sleep($timeout); timedout = true; put!(rr_ntfy_join, 1)) - wait(rr_ntfy_join) - if timedout + if timedwait(() -> isready(rr_ntfy_join), timeout) === :timed_out error("worker did not connect within $timeout seconds") end lock(client_refs) do - delete!(PGRP.refs, ntfy_oid) + delete!(pgm.refs, ntfy_oid) end - return w.id + return wid(w, role=role) end @@ -729,23 +780,21 @@ function redirect_output_from_additional_worker(pid, port) end function check_master_connect() - timeout = worker_timeout() * 1e9 # If we do not have at least process 1 connect to us within timeout # we log an error and exit, unless we're running on valgrind if ccall(:jl_running_on_valgrind,Cint,()) != 0 return end - @async begin - start = time_ns() - while !haskey(map_pid_wrkr, 1) && (time_ns() - start) < timeout - sleep(1.0) - end - - if !haskey(map_pid_wrkr, 1) - print(stderr, "Master process (id 1) could not connect within $(timeout/1e9) seconds.\nexiting.\n") - exit(1) - end + errormonitor( + @async begin + map_pid_wrkr = Map_pid_wrkr(role = :worker) + timeout = worker_timeout() + if timedwait(() -> haskey(map_pid_wrkr, 1), timeout) === :timed_out + print(stderr, "Master process (id 1) could not connect within $(timeout) seconds.\nexiting.\n") + exit(1) + end end + ) end @@ -784,34 +833,60 @@ let next_pid = 2 # 1 is reserved for the client (always) end mutable struct ProcessGroup + level::Integer name::String workers::Array{Any,1} refs::Dict{RRID,Any} # global references topology::Symbol lazy::Union{Bool, Nothing} - ProcessGroup(w::Array{Any,1}) = new("pg-default", w, Dict(), :all_to_all, nothing) + ProcessGroup(w::Array{Any,1}) = new(0, "pg-default", w, Dict(), :all_to_all, nothing) +end + +const _PGRP0 = ProcessGroup([]) +const _PGRP1 = ProcessGroup([]) + +function PGRP(;role= :default) + if role == :master +# @info "$(role) / PGRP1 !" + return _PGRP1 + elseif role == :worker +# @info "$(role) / PGRP0 ! -- worker" + return _PGRP0 +# elseif role == :default && _PGRP0.level == 0 + elseif role == :default && myrole() == :master +# @info "$(role) / PGRP1 !" + return _PGRP1 # as :master +# elseif role == :default && _PGRP0.level > 0 + elseif role == :default && myrole() == :worker +# @info "$(role) / PGRP0 !" 
+ return _PGRP0 # as :worker + else + return _PGRP1 # as :master + # throw("unexpected use of role = $role (PGRP) - $(myrole())") + end end -const PGRP = ProcessGroup([]) -function topology(t) +function topology(t; role= :default) @assert t in [:all_to_all, :master_worker, :custom] - if (PGRP.topology==t) || ((myid()==1) && (nprocs()==1)) || (myid() > 1) - PGRP.topology = t + pg = PGRP(role = role) + if (pg.topology==t) || ((myid(role=role)==1) && (nprocs()==1)) || (myid(role=role) > 1) + pg.topology = t else - error("Workers with Topology $(PGRP.topology) already exist. Requested Topology $(t) cannot be set.") + error("Workers with Topology $(pg.topology) already exist. Requested Topology $(t) cannot be set.") end t end -isclusterlazy() = something(PGRP.lazy, false) +isclusterlazy(; role= :default) = something(PGRP(role = role).lazy, false) -get_bind_addr(pid::Integer) = get_bind_addr(worker_from_id(pid)) -get_bind_addr(w::LocalProcess) = LPROC.bind_addr -function get_bind_addr(w::Worker) +get_bind_addr(pid::Integer) = get_bind_addr(worker_from_id(pid; role = :master)) # always called as manager +get_bind_addr(w::LocalProcess) = LPROC.bind_addr # always called as manager +function get_bind_addr(w::Worker) + role = :worker # always called as worker if w.config.bind_addr === nothing - if w.id != myid() - w.config.bind_addr = remotecall_fetch(get_bind_addr, w.id, w.id) + if wid(w, role=role) != myid(role=role) + w.config.bind_addr = remotecall_fetch(get_bind_addr, wid(w, role=role), wid(w, role=role), role = role) end end w.config.bind_addr @@ -822,10 +897,33 @@ const LPROC = LocalProcess() const LPROCROLE = Ref{Symbol}(:master) const HDR_VERSION_LEN=16 const HDR_COOKIE_LEN=16 -const map_pid_wrkr = Dict{Int, Union{Worker, LocalProcess}}() +const _map_pid_wrkr_0 = Dict{Int, Union{Worker, LocalProcess}}() +const _map_pid_wrkr_1 = Dict{Int, Union{Worker, LocalProcess}}() const map_sock_wrkr = IdDict() const map_del_wrkr = Set{Int}() +function Map_pid_wrkr(;role= :default) + # @info ("_map_pid_wrkr_0", _map_pid_wrkr_0, "end") + # @info ("_map_pid_wrkr_1", _map_pid_wrkr_1, "end") + pg = PGRP(role = role) + if role == :master + # @info "Map_pid_wrkr_1 ", role + return _map_pid_wrkr_1 + elseif role == :worker + # @info "Map_pid_wrkr_0 ", role + return _map_pid_wrkr_0 + elseif role == :default && myrole() == :master + # @info "Map_pid_wrkr_1 ", role, pg.level + return _map_pid_wrkr_1 # as :master + elseif role == :default && myrole() == :worker + # @info "Map_pid_wrkr_0 ", role, pg.level + return _map_pid_wrkr_0 # as :worker + else + return _map_pid_wrkr_1 # as :master + # throw("unexpected use of role = :default (Map_pid_wrkr)") + end +end + # whether process is a master or worker in a distributed setup myrole() = LPROCROLE[] function myrole!(proctype::Symbol) @@ -847,7 +945,38 @@ julia> remotecall_fetch(() -> myid(), 4) 4 ``` """ -myid() = LPROC.id +function myid(;role= :default) + if role == :master + return LPROC.id1 + elseif role == :worker + return LPROC.id0 + elseif role == :default && myrole() == :master + return LPROC.id1 # as :master + elseif role == :default && myrole() == :worker + return LPROC.id0 # as :worker + else + return LPROC.id1 # as :master + #throw("unexpected use of role := default (myid) - $(myrole())") + end + +end + +function myid!(id;role= :default) + if role == :master + LPROC.id1 = id + elseif role == :worker + LPROC.id0 = id + elseif role == :default && myrole() == :master + LPROC.id1 = id # as :master + elseif role == :default && myrole() == :worker + LPROC.id0 = id # 
as :worker + else + LPROC.id1 = id # as :master + #throw("unexpected use of role := default (myid!)") + end + +end + """ nprocs() @@ -865,18 +994,19 @@ julia> workers() 3 ``` """ -function nprocs() - if myid() == 1 || (PGRP.topology === :all_to_all && !isclusterlazy()) - n = length(PGRP.workers) +function nprocs(; role= :default) + pg = PGRP(role = role) + if myid(role=role) == 1 || (pg.topology === :all_to_all && !isclusterlazy(role = role)) + n = length(pg.workers) # filter out workers in the process of being setup/shutdown. - for jw in PGRP.workers - if !isa(jw, LocalProcess) && (jw.state !== W_CONNECTED) + for jw in pg.workers + if !isa(jw, LocalProcess) && ((@atomic jw.state) !== W_CONNECTED) n = n - 1 end end return n else - return length(PGRP.workers) + return length(pg.workers) end end @@ -897,8 +1027,8 @@ julia> nworkers() 2 ``` """ -function nworkers() - n = nprocs() +function nworkers(;role= :default) + n = nprocs(role = role) n == 1 ? 1 : n-1 end @@ -918,25 +1048,27 @@ julia> procs() 3 ``` """ -function procs() - if myid() == 1 || (PGRP.topology === :all_to_all && !isclusterlazy()) +function procs(; role= :default) + pg = PGRP(role = role) + if myid(role=role) == 1 || (pg.topology === :all_to_all && !isclusterlazy(role = role)) # filter out workers in the process of being setup/shutdown. - return Int[x.id for x in PGRP.workers if isa(x, LocalProcess) || (x.state === W_CONNECTED)] + return Int[wid(x, role=role) for x in pg.workers if isa(x, LocalProcess) || ((@atomic x.state) === W_CONNECTED)] else - return Int[x.id for x in PGRP.workers] + return Int[wid(x, role=role) for x in pg.workers] end end -function id_in_procs(id) # faster version of `id in procs()` - if myid() == 1 || (PGRP.topology === :all_to_all && !isclusterlazy()) - for x in PGRP.workers - if (x.id::Int) == id && (isa(x, LocalProcess) || (x::Worker).state === W_CONNECTED) +function id_in_procs(id0; role= :default) # faster version of `id in procs()` + pg = PGRP(role = role) + if myid(role=role) == 1 || (pg.topology === :all_to_all && !isclusterlazy(role = role)) + for x in pg.workers + if (wid(x, role=role)::Int) == id0 && (isa(x, LocalProcess) || (@atomic (x::Worker).state) === W_CONNECTED) return true end end else - for x in PGRP.workers - if (x.id::Int) == id + for x in pg.workers + if (wid(x, role=role)::Int) == id0 return true end end @@ -950,17 +1082,18 @@ end Return a list of all process identifiers on the same physical node. Specifically all workers bound to the same ip-address as `pid` are returned. 
""" -function procs(pid::Integer) - if myid() == 1 - all_workers = [x for x in PGRP.workers if isa(x, LocalProcess) || (x.state === W_CONNECTED)] +function procs(pid::Integer; role= :default) + if myid(role = role) == 1 + map_pid_wrkr = Map_pid_wrkr(role = role) + all_workers = [x for x in PGRP(role = role).workers if isa(x, LocalProcess) || ((@atomic x.state) === W_CONNECTED)] if (pid == 1) || (isa(map_pid_wrkr[pid].manager, LocalManager)) - Int[x.id for x in filter(w -> (w.id==1) || (isa(w.manager, LocalManager)), all_workers)] + Int[wid(x, role=role) for x in filter(w -> (wid(w, role=role)==1) || (isa(w.manager, LocalManager)), all_workers)] else ipatpid = get_bind_addr(pid) - Int[x.id for x in filter(w -> get_bind_addr(w) == ipatpid, all_workers)] + Int[wid(x, role=role) for x in filter(w -> get_bind_addr(w) == ipatpid, all_workers)] end else - remotecall_fetch(procs, 1, pid) + remotecall_fetch(pid -> procs(pid, role = :master), 1; role = role) end end @@ -972,15 +1105,15 @@ Return a list of all worker process identifiers. # Examples ```julia-repl \$ julia -p 2 - +, pid julia> workers() 2-element Array{Int64,1}: 2 3 ``` """ -function workers() - allp = procs() +function workers(; role= :default) + allp = procs(role = role) if length(allp) == 1 allp else @@ -988,11 +1121,11 @@ function workers() end end -function cluster_mgmt_from_master_check() - if myid() != 1 - throw(ErrorException("Only process 1 can add and remove workers")) - end -end +#function cluster_mgmt_from_master_check() +# if myid() != 1 +# throw(ErrorException("Only process 1 can add and remove workers")) +# end +#end """ rmprocs(pids...; waitfor=typemax(Int)) @@ -1025,22 +1158,22 @@ julia> workers() 6 ``` """ -function rmprocs(pids...; waitfor=typemax(Int)) - cluster_mgmt_from_master_check() +function rmprocs(pids...; role = :default, waitfor=typemax(Int)) # supposed to be called always as :master +# cluster_mgmt_from_master_check() pids = vcat(pids...) if waitfor == 0 - t = @async _rmprocs(pids, typemax(Int)) + t = @async _rmprocs(pids, role, typemax(Int)) yield() return t else - _rmprocs(pids, waitfor) + _rmprocs(pids, role, waitfor) # return a dummy task object that user code can wait on. 
return @async nothing end end -function _rmprocs(pids, waitfor) +function _rmprocs(pids, role, waitfor) lock(worker_lock) try rmprocset = Union{LocalProcess, Worker}[] @@ -1048,6 +1181,7 @@ function _rmprocs(pids, waitfor) if p == 1 @warn "rmprocs: process 1 not removed" else + map_pid_wrkr = Map_pid_wrkr(role = role) if haskey(map_pid_wrkr, p) w = map_pid_wrkr[p] set_worker_state(w, W_TERMINATING) @@ -1059,11 +1193,11 @@ function _rmprocs(pids, waitfor) start = time_ns() while (time_ns() - start) < waitfor*1e9 - all(w -> w.state === W_TERMINATED, rmprocset) && break + all(w -> (@atomic w.state) === W_TERMINATED, rmprocset) && break sleep(min(0.1, waitfor - (time_ns() - start)/1e9)) end - unremoved = [wrkr.id for wrkr in filter(w -> w.state !== W_TERMINATED, rmprocset)] + unremoved = [wid(wrkr, role=role) for wrkr in filter(w -> (@atomic w.state) !== W_TERMINATED, rmprocset)] if length(unremoved) > 0 estr = string("rmprocs: pids ", unremoved, " not terminated after ", waitfor, " seconds.") throw(ErrorException(estr)) @@ -1087,17 +1221,18 @@ end # No-arg constructor added for compatibility with Julia 1.0 & 1.1, should be deprecated in the future ProcessExitedException() = ProcessExitedException(-1) -worker_from_id(i) = worker_from_id(PGRP, i) -function worker_from_id(pg::ProcessGroup, i) +worker_from_id(i; role= :default) = worker_from_id(PGRP(role = role), i; role = role) +function worker_from_id(pg::ProcessGroup, i; role= :default) if !isempty(map_del_wrkr) && in(i, map_del_wrkr) throw(ProcessExitedException(i)) end + map_pid_wrkr = Map_pid_wrkr(role = role) w = get(map_pid_wrkr, i, nothing) if w === nothing - if myid() == 1 - error("no process with id $i exists") + if myid(role=role) == 1 + error("no process with id $i exists ($role)") end - w = Worker(i) + w = Worker(i; role = role) map_pid_wrkr[i] = w else w = w::Union{Worker, LocalProcess} @@ -1113,25 +1248,26 @@ returns the `pid` of the worker it is connected to. This is useful when writing custom [`serialize`](@ref) methods for a type, which optimizes the data written out depending on the receiving process id. 
""" -function worker_id_from_socket(s) +function worker_id_from_socket(s; role= :default) w = get(map_sock_wrkr, s, nothing) if isa(w,Worker) if s === w.r_stream || s === w.w_stream - return w.id + return wid(w, role=role) end end if isa(s,IOStream) && fd(s)==-1 # serializing to a local buffer - return myid() + return myid(role=role) end return -1 end -register_worker(w) = register_worker(PGRP, w) -function register_worker(pg, w) +register_worker(w; role= :default) = register_worker(PGRP(role = role), w; role = role) +function register_worker(pg, w; role= :default) push!(pg.workers, w) - map_pid_wrkr[w.id] = w + map_pid_wrkr = Map_pid_wrkr(role = role) + map_pid_wrkr[wid(w, role=role)] = w end function register_worker_streams(w) @@ -1139,9 +1275,10 @@ function register_worker_streams(w) map_sock_wrkr[w.w_stream] = w end -deregister_worker(pid) = deregister_worker(PGRP, pid) -function deregister_worker(pg, pid) - pg.workers = filter(x -> !(x.id == pid), pg.workers) +deregister_worker(pid; role= :default) = deregister_worker(PGRP(role = role), pid, role=role) +function deregister_worker(pg, pid; role= :default) + pg.workers = filter(x -> !(wid(x, role=role) == pid), pg.workers) + map_pid_wrkr = Map_pid_wrkr(role = role) w = pop!(map_pid_wrkr, pid, nothing) if isa(w, Worker) if isdefined(w, :r_stream) @@ -1151,13 +1288,13 @@ function deregister_worker(pg, pid) end end - if myid() == 1 && (myrole() === :master) && isdefined(w, :config) + if myid(role=role) == 1 && #=role === :master &&=# isdefined(w, :config) # Notify the cluster manager of this workers death - manage(w.manager, w.id, w.config, :deregister) - if PGRP.topology !== :all_to_all || isclusterlazy() - for rpid in workers() + manage(w.manager, wid(w, role=role), w.config, :deregister) + if pg.topology !== :all_to_all || isclusterlazy(role = role) + for rpid in workers(role=role) try - remote_do(deregister_worker, rpid, pid) + remote_do((pid,role) -> deregister_worker(pid, role=role), rpid, pid, rpid == 1 ? :master : :worker; role = role) catch end end @@ -1192,11 +1329,12 @@ function deregister_worker(pg, pid) end -function interrupt(pid::Integer) - @assert myid() == 1 +function interrupt(pid::Integer) + @assert myid(role = :master) == 1 + map_pid_wrkr = Map_pid_wrkr(role = :master) w = map_pid_wrkr[pid] if isa(w, Worker) - manage(w.manager, w.id, w.config, :interrupt) + manage(w.manager, wid(w, role=:master), w.config, :interrupt) end return end @@ -1215,8 +1353,8 @@ interrupt(pids::Integer...) = interrupt([pids...]) Interrupt the current executing task on the specified workers. This is equivalent to pressing Ctrl-C on the local machine. If no arguments are given, all workers are interrupted. """ -function interrupt(pids::AbstractVector=workers()) - @assert myid() == 1 +function interrupt(pids::AbstractVector=workers(role = :master)) + @assert myid(role = :master) == 1 @sync begin for pid in pids @async interrupt(pid) @@ -1227,13 +1365,14 @@ end wp_bind_addr(p::LocalProcess) = p.bind_addr wp_bind_addr(p) = p.config.bind_addr -function check_same_host(pids) - if myid() != 1 - return remotecall_fetch(check_same_host, 1, pids) +function check_same_host(pids; role= :default) + if myid(role = role) != 1 + return remotecall_fetch(pids -> check_same_host(pids, role = :master), 1, pids; role = role) else # We checkfirst if all test pids have been started using the local manager, # else we check for the same bind_to addr. 
This handles the special case # where the local ip address may change - as during a system sleep/awake + map_pid_wrkr = Map_pid_wrkr(role = role) if all(p -> (p==1) || (isa(map_pid_wrkr[p].manager, LocalManager)), pids) return true else @@ -1243,18 +1382,18 @@ function check_same_host(pids) end end -function terminate_all_workers() - myid() != 1 && return +function terminate_all_workers(;role= :default) + myid(role = role) != 1 && return - if nprocs() > 1 + if nprocs(role = role) > 1 try - rmprocs(workers(); waitfor=5.0) + rmprocs(workers(role = role); role = role, waitfor=5.0) catch _ex @warn "Forcibly interrupting busy workers" exception=_ex # Might be computation bound, interrupt them and try again - interrupt(workers()) + interrupt(workers(role = role)) try - rmprocs(workers(); waitfor=5.0) + rmprocs(workers(role = role); role = role, waitfor=5.0) catch _ex2 @error "Unable to terminate all workers" exception=_ex2,catch_backtrace() end @@ -1263,6 +1402,73 @@ function terminate_all_workers() end # initialize the local proc network address / port +#=function init_bind_addr() + opts = JLOptions() + if opts.bindto != C_NULL + bind_to = split(unsafe_string(opts.bindto), ":") + @info "A1: $bind_to" + bind_addr = string(parse(IPAddr, bind_to[1])) + if length(bind_to) > 1 + bind_port = parse(Int,bind_to[2]) + else + bind_port = 0 + end + else + bind_port = 0 + try + ips = getipaddrs(IPv4; loopback = false) + n = length(ips) + bind_addr = string(ips[n]) + catch + # All networking is unavailable, initialize bind_addr to the loopback address + # Will cause an exception to be raised only when used. + bind_addr = "127.0.0.1" + end + end + global LPROC + LPROC.bind_addr = bind_addr + LPROC.bind_port = UInt16(bind_port) +end +=# + +#=function init_bind_addr() + opts = JLOptions() + + @info "A2: $(getipaddrs(IPv4; loopback = false))" + bind_port = 0 + bind_addr = "" + try + ips = getipaddrs(IPv4; loopback = false) + n = length(ips) + bind_addr = string(ips[n]) + @info "ADDR: $ips --- $ips" + catch + # All networking is unavailable, initialize bind_addr to the loopback address + # Will cause an exception to be raised only when used. 
+ bind_addr = "127.0.0.1" + end + + if opts.bindto != C_NULL + bind_to = split(unsafe_string(opts.bindto), ":") + @info "A1: $bind_to" + bind_addr_2 = string(parse(IPAddr, bind_to[1])) + if length(bind_to) > 1 + bind_port = parse(Int,bind_to[2]) + else + bind_port = 0 + end + else + bind_addr_2 = bind_addr + end + + global LPROC + @info "bind_addr=$bind_addr / bind_addr_2=$bind_addr_2" + LPROC.bind_addr = bind_addr + LPROC.bind_addr_2 = bind_addr_2 + LPROC.bind_port = UInt16(bind_port) +end +=# + function init_bind_addr() opts = JLOptions() if opts.bindto != C_NULL @@ -1285,34 +1491,37 @@ function init_bind_addr() end global LPROC LPROC.bind_addr = bind_addr + LPROC.bind_addr_2 = bind_addr LPROC.bind_port = UInt16(bind_port) end using Random: randstring -let inited = false - # do initialization that's only needed when there is more than 1 processor - global function init_multi() - if !inited - inited = true - push!(Base.package_callbacks, _require_callback) - atexit(terminate_all_workers) - init_bind_addr() - cluster_cookie(randstring(HDR_COOKIE_LEN)) - end - return nothing +# do initialization that's only needed when there is more than 1 processor +const inited = Threads.Atomic{Bool}(false) +function init_multi() + if !Threads.atomic_cas!(inited, false, true) + push!(Base.package_callbacks, _require_callback) + atexit(terminate_all_workers) + init_bind_addr() + cluster_cookie(randstring(HDR_COOKIE_LEN)) end + return nothing end function init_parallel() - start_gc_msgs_task() + start_gc_msgs_task(role = :master) # TO CHECK + start_gc_msgs_task(role = :worker) # TO CHECK # start in "head node" mode, if worker, will override later. - global PGRP + #global PGRP global LPROC - LPROC.id = 1 - @assert isempty(PGRP.workers) - register_worker(LPROC) + LPROC.id0 = 0 + LPROC.id1 = 1 + @assert isempty(PGRP(role = :master).workers) # TO CHECK + @assert isempty(PGRP(role = :worker).workers) # TO CHECK + register_worker(LPROC; role = :master) # TO CHECK + register_worker(LPROC; role = :worker) # TO CHECK end write_cookie(io::IO) = print(io.in, string(cluster_cookie(), "\n")) diff --git a/src/clusterserialize.jl b/src/clusterserialize.jl index 0acd4ce..bdd82b8 100644 --- a/src/clusterserialize.jl +++ b/src/clusterserialize.jl @@ -167,10 +167,17 @@ function deserialize_global_from_main(s::ClusterSerializer, sym) return nothing end end + Core.eval(Main, Expr(:global, sym)) if sym_isconst - ccall(:jl_set_const, Cvoid, (Any, Any, Any), Main, sym, v) + # Note that the post-lowering const form is not allowed in value + # position, so there needs to be a dummy `nothing` argument to drop the + # return value. + Core.eval(Main, Expr(:block, + Expr(:const, GlobalRef(Main, sym), v), + nothing)) else - setglobal!(Main, sym, v) + Core.eval(Main, Expr(:global, sym)) + invokelatest(setglobal!, Main, sym, v) end return nothing end @@ -241,14 +248,14 @@ reinitialized. Only those names found to be defined under `mod` are cleared. An exception is raised if a global constant is requested to be cleared. 
""" -function clear!(syms, pids=workers(); mod=Main) +function clear!(syms, pids=workers(); mod=Main, role= :default) @sync for p in pids - @async_unwrap remotecall_wait(clear_impl!, p, syms, mod) + @async_unwrap remotecall_wait(clear_impl!, p, syms, mod; role = role) end end -clear!(sym::Symbol, pid::Int; mod=Main) = clear!([sym], [pid]; mod=mod) -clear!(sym::Symbol, pids=workers(); mod=Main) = clear!([sym], pids; mod=mod) -clear!(syms, pid::Int; mod=Main) = clear!(syms, [pid]; mod=mod) +clear!(sym::Symbol, pid::Int; mod=Main, role= :default) = clear!([sym], [pid]; mod=mod, role = role) +clear!(sym::Symbol, pids=workers(); mod=Main, role= :default) = clear!([sym], pids; mod=mod, role = role) +clear!(syms, pid::Int; mod=Main, role= :default) = clear!(syms, [pid]; mod=mod, role = role) clear_impl!(syms, mod::Module) = foreach(x->clear_impl!(x,mod), syms) clear_impl!(sym::Symbol, mod::Module) = isdefined(mod, sym) && @eval(mod, global $sym = nothing) diff --git a/src/macros.jl b/src/macros.jl index a767c7a..aeb9084 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -2,15 +2,15 @@ let nextidx = Threads.Atomic{Int}(0) global nextproc - function nextproc() + function nextproc(;role= :default) idx = Threads.atomic_add!(nextidx, 1) - return workers()[(idx % nworkers()) + 1] + return workers(role = role)[(idx % nworkers(role = role)) + 1] end end -spawnat(p, thunk) = remotecall(thunk, p) +spawnat(p, thunk; role= :default) = remotecall(thunk, p; role = role) -spawn_somewhere(thunk) = spawnat(nextproc(),thunk) +spawn_somewhere(thunk; role= :default) = spawnat(nextproc(role = role),thunk; role = role) """ @spawn expr @@ -39,11 +39,31 @@ julia> fetch(f) !!! compat "Julia 1.3" As of Julia 1.3 this macro is deprecated. Use `@spawnat :any` instead. """ -macro spawn(expr) + + +#macro spawn(expr, role = :(:default)) + +function check_args_2(args...) + na = length(args) + if na==1 + role = Expr(:kw, :role, :(:defaut)) #:(role = :default) + expr = args[1] + elseif na==2 + role = args[1] + expr = args[2] + else + throw(ArgumentError("wrong number of arguments to spawn")) + end + return role, expr +end + +macro spawn(args...) + rolearg, expr = check_args_2(args...) + thunk = esc(:(()->($expr))) var = esc(Base.sync_varname) quote - local ref = spawn_somewhere($thunk) + local ref = spawn_somewhere($thunk; $(esc(rolearg))) if $(Expr(:islocal, var)) put!($var, ref) end @@ -51,13 +71,17 @@ macro spawn(expr) end end + """ @spawnat p expr Create a closure around an expression and run the closure asynchronously on process `p`. Return a [`Future`](@ref) to the result. + If `p` is the quoted literal symbol `:any`, then the system will pick a -processor to use automatically. +processor to use automatically. Using `:any` will not apply any form of +load-balancing, consider using a [`WorkerPool`](@ref) and [`remotecall(f, +::WorkerPool)`](@ref) if you need load-balancing. # Examples ```julia-repl @@ -79,15 +103,36 @@ julia> fetch(f) !!! compat "Julia 1.3" The `:any` argument is available as of Julia 1.3. """ -macro spawnat(p, expr) - thunk = esc(:(()->($expr))) - var = esc(Base.sync_varname) - if p === QuoteNode(:any) - spawncall = :(spawn_somewhere($thunk)) + +function check_args_3a(args...) 
+ na = length(args) + if na==2 + role = Expr(:kw, :role, :(:defaut)) #:(role = :default) + p = args[1] + expr = args[2] + elseif na==3 + role = args[1] + p = args[2] + expr = args[3] else - spawncall = :(spawnat($(esc(p)), $thunk)) + throw(ArgumentError("wrong number of arguments to spawnat")) end - quote + return role, p, expr +end + +macro spawnat(args...) + rolearg, p, expr = check_args_3a(args...) + + #@info rolearg, typeof(rolearg) + + thunk = esc(:(()->($expr))) + var = esc(Base.sync_varname) + if p === QuoteNode(:any) + spawncall = :(spawn_somewhere($thunk; $(esc(rolearg)))) + else + spawncall = :(spawnat($(esc(p)), $thunk; $(esc(rolearg)))) + end + quote local ref = $spawncall if $(Expr(:islocal, var)) put!($var, ref) @@ -96,6 +141,7 @@ macro spawnat(p, expr) end end + """ @fetch expr @@ -119,9 +165,13 @@ julia> @fetch myid() 2 ``` """ -macro fetch(expr) + +macro fetch(args...) + + rolearg, expr = check_args_2(args...) + thunk = esc(:(()->($expr))) - :(remotecall_fetch($thunk, nextproc())) + :(remotecall_fetch($thunk, nextproc(); $(esc(rolearg)))) end """ @@ -141,9 +191,12 @@ julia> @fetchfrom 4 myid() 4 ``` """ -macro fetchfrom(p, expr) + + +macro fetchfrom(args...) + rolearg, p, expr = check_args_3a(args...) thunk = esc(:(()->($expr))) - :(remotecall_fetch($thunk, $(esc(p)))) + :(remotecall_fetch($thunk, $(esc(p)); $(esc(rolearg)))) end # extract a list of modules to import from an expression @@ -185,24 +238,58 @@ processes to have execute the expression. Similar to calling `remotecall_eval(Main, procs, expr)`, but with two extra features: - - `using` and `import` statements run on the calling process first, to ensure - packages are precompiled. - - The current source file path used by `include` is propagated to other processes. +- `using` and `import` statements run on the calling process first, to ensure + packages are precompiled. +- The current source file path used by `include` is propagated to other processes. """ -macro everywhere(ex) - procs = GlobalRef(@__MODULE__, :procs) - return esc(:($(Distributed).@everywhere $procs() $ex)) + +function check_args_3b(args...) + + na = length(args) + if na==1 + rolearg = Expr(:kw, :role, :(:defaut)) #:(role = :default) + reducer = nothing + loop = args[1] + elseif na==2 + if isa(args[1], Expr) && args[1].head == :(=) && args[1].args[1] === :role + rolearg = args[1] + reducer = nothing + loop = args[2] + else + rolearg = Expr(:kw, :role, :(:defaut)) #:(role = :default) + reducer = args[1] + loop = args[2] + end + elseif na==3 + rolearg = args[1] + reducer = args[2] + loop = args[3] + else + throw(ArgumentError("wrong number of arguments to @distributed")) + end + + return rolearg, reducer, loop end -macro everywhere(procs, ex) - imps = extract_imports(ex) - return quote - $(isempty(imps) ? nothing : Expr(:toplevel, imps...)) # run imports locally first - let ex = Expr(:toplevel, :(task_local_storage()[:SOURCE_PATH] = $(get(task_local_storage(), :SOURCE_PATH, nothing))), $(esc(Expr(:quote, ex)))), - procs = $(esc(procs)) - remotecall_eval(Main, procs, ex) +macro everywhere(args...) + + rolearg, procs, ex = check_args_3b(args...) + + if isnothing(procs) + procs = GlobalRef(@__MODULE__, :procs) + return esc(:($(Distributed).@everywhere $rolearg $procs(;$rolearg) $ex)) + else + imps = extract_imports(ex) + return quote + $(isempty(imps) ? 
nothing : Expr(:toplevel, imps...)) # run imports locally first + let ex = Expr(:toplevel, :(task_local_storage()[:SOURCE_PATH] = $(get(task_local_storage(), :SOURCE_PATH, nothing))), $(esc(Expr(:quote, ex)))), + procs = $(esc(procs)) + remotecall_eval(Main, procs, ex; $(esc(rolearg))) + end end + end + end """ @@ -215,14 +302,14 @@ Errors on any of the processes are collected into a See also [`@everywhere`](@ref). """ -function remotecall_eval(m::Module, procs, ex) +function remotecall_eval(m::Module, procs, ex; role=:default) @sync begin run_locally = 0 for pid in procs - if pid == myid() + if pid == myid(role=role) run_locally += 1 else - @async_unwrap remotecall_wait(Core.eval, pid, m, ex) + @async_unwrap remotecall_wait(Core.eval, pid, m, ex; role=role) end end yield() # ensure that the remotecalls have had a chance to start @@ -238,8 +325,8 @@ end # optimized version of remotecall_eval for a single pid # and which also fetches the return value -function remotecall_eval(m::Module, pid::Int, ex) - return remotecall_fetch(Core.eval, pid, m, ex) +function remotecall_eval(m::Module, pid::Int, ex; role=:default) + return remotecall_fetch(Core.eval, pid, m, ex; role=role) end @@ -261,22 +348,22 @@ function splitrange(firstIndex::Int, lastIndex::Int, np::Int) return chunks end -function preduce(reducer, f, R) - chunks = splitrange(Int(firstindex(R)), Int(lastindex(R)), nworkers()) - all_w = workers()[1:length(chunks)] +function preduce(reducer, f, R; role = :default) + chunks = splitrange(Int(firstindex(R)), Int(lastindex(R)), nworkers(role=role)) + all_w = workers(role=role)[1:length(chunks)] w_exec = Task[] for (idx,pid) in enumerate(all_w) - t = Task(()->remotecall_fetch(f, pid, reducer, R, first(chunks[idx]), last(chunks[idx]))) + t = Task(()->remotecall_fetch(f, pid, reducer, R, first(chunks[idx]), last(chunks[idx]), role=role)) schedule(t) push!(w_exec, t) end reduce(reducer, Any[fetch(t) for t in w_exec]) end -function pfor(f, R) - t = @async @sync for c in splitrange(Int(firstindex(R)), Int(lastindex(R)), nworkers()) - @spawnat :any f(R, first(c), last(c)) +function pfor(f, R; role = :default) + t = @async @sync for c in splitrange(Int(firstindex(R)), Int(lastindex(R)), nworkers(role=role)) + @spawnat role=role :any f(R, first(c), last(c)) end errormonitor(t) end @@ -328,15 +415,9 @@ completion. To wait for completion, prefix the call with [`@sync`](@ref), like : end """ macro distributed(args...) - na = length(args) - if na==1 - loop = args[1] - elseif na==2 - reducer = args[1] - loop = args[2] - else - throw(ArgumentError("wrong number of arguments to @distributed")) - end + + rolearg, reducer, loop = check_args_3b(args...) + if !isa(loop,Expr) || loop.head !== :for error("malformed @distributed loop") end @@ -346,16 +427,16 @@ macro distributed(args...) if Meta.isexpr(body, :block) && body.args[end] isa LineNumberNode resize!(body.args, length(body.args) - 1) end - if na==1 + if isnothing(reducer) syncvar = esc(Base.sync_varname) return quote - local ref = pfor($(make_pfor_body(var, body)), $(esc(r))) + local ref = pfor($(make_pfor_body(var, body)), $(esc(r)); $(esc(rolearg))) if $(Expr(:islocal, syncvar)) put!($syncvar, ref) end ref end else - return :(preduce($(esc(reducer)), $(make_preduce_body(var, body)), $(esc(r)))) + return :(preduce($(esc(reducer)), $(make_preduce_body(var, body)), $(esc(r)); $(esc(rolearg)))) # TO CHECK (role ?) 
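        # A rough usage sketch for the role-aware forms, assuming this process has
        # itself called addprocs and therefore owns a nested set of workers
        # (everything below is illustrative, not part of the patch):
        #
        #   @everywhere role=:master println("hello from a nested worker")
        #   s = @distributed role=:master (+) for i in 1:100
        #           i^2
        #       end            # with a reducer the reduced value is returned;
        #                      # without one the loop runs asynchronously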
end end diff --git a/src/managers.jl b/src/managers.jl index b2b655a..658c98a 100644 --- a/src/managers.jl +++ b/src/managers.jl @@ -111,7 +111,9 @@ addprocs([ version is used on all remote machines because serialization and code distribution might fail otherwise. -* `exeflags`: additional flags passed to the worker processes. +* `exeflags`: additional flags passed to the worker processes. It can either be a `Cmd`, a `String` + holding one flag, or a collection of strings, with one element per flag. + E.g. `\`--threads=auto project=.\``, `"--compile-trace=stderr"` or `["--threads=auto", "--compile=all"]`. * `topology`: Specifies how the workers connect to each other. Sending a message between unconnected workers results in an error. @@ -169,14 +171,16 @@ default_addprocs_params(::SSHManager) = :env => [], :tunnel => false, :multiplex => false, - :max_parallel => 10)) + :max_parallel => 10, + :ident => nothing, + :connect_idents => nothing)) function launch(manager::SSHManager, params::Dict, launched::Array, launch_ntfy::Condition) # Launch one worker on each unique host in parallel. Additional workers are launched later. # Wait for all launches to complete. @sync for (i, (machine, cnt)) in enumerate(manager.machines) let machine=machine, cnt=cnt - @async try + @async try launch_on_machine(manager, $machine, $cnt, params, launched, launch_ntfy) catch e print(stderr, "exception launching on machine $(machine) : $(e)\n") @@ -228,6 +232,7 @@ function parse_machine(machine::AbstractString) end function launch_on_machine(manager::SSHManager, machine::AbstractString, cnt, params::Dict, launched::Array, launch_ntfy::Condition) + shell = params[:shell] ssh = params[:ssh] dir = params[:dir] @@ -361,7 +366,15 @@ function launch_on_machine(manager::SSHManager, machine::AbstractString, cnt, pa wconfig.count = cnt wconfig.max_parallel = params[:max_parallel] wconfig.enable_threaded_blas = params[:enable_threaded_blas] - + #@info "will test connect_idents -- $(wconfig.ident)" + if haskey(params,:connect_idents) && !isnothing(params[:connect_idents]) + wconfig.connect_idents = Vector(params[:connect_idents]) + # @info "connect_idents = $(wconfig.connect_idents)" + end + if haskey(params, :ident) && !isnothing(params[:ident]) + wconfig.ident = params[:ident] + # @info "-------------- $(wconfig.ident)" + end push!(launched, wconfig) notify(launch_ntfy) @@ -572,16 +585,26 @@ workers. function connect(manager::ClusterManager, pid::Int, config::WorkerConfig) if config.connect_at !== nothing # this is a worker-to-worker setup call. 
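        # connect_at is only set when this call is made on behalf of a
        # worker-to-worker connection, so the manager-specific handshake below is
        # skipped and connect_w2w dials the advertised address directly.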
+ #(rhost, rport) = notnothing(config.connect_at)::Tuple{String, Int} + #config.host = rhost + #config.port = rport + #config.connect_at = nothing return connect_w2w(pid, config) + #return connect(manager, pid, config) end + #@info "CONNECT W1 " + # master connecting to workers if config.io !== nothing (bind_addr, port::Int) = read_worker_host_port(config.io) + # @info "CONNECT W2 $bind_addr $port $(config.host) $(config.bind_addr)" pubhost = something(config.host, bind_addr) + # @info "CONNECT W21 $pubhost" config.host = pubhost config.port = port else + #@info "CONNECT W3" pubhost = notnothing(config.host) port = notnothing(config.port) bind_addr = something(config.bind_addr, pubhost) @@ -619,6 +642,7 @@ function connect(manager::ClusterManager, pid::Int, config::WorkerConfig) release(sem) end else +# (s, bind_addr) = connect_to_worker(#=bind_addr=# pubhost, port) (s, bind_addr) = connect_to_worker(bind_addr, port) end @@ -681,6 +705,9 @@ function bind_client_port(sock::TCPSocket, iptype) end function connect_to_worker(host::AbstractString, port::Integer) + +# @info "--------- CONNECT TO WORKER $host $port" + # Avoid calling getaddrinfo if possible - involves a DNS lookup # host may be a stringified ipv4 / ipv6 address or a dns name bind_addr = nothing @@ -690,6 +717,7 @@ function connect_to_worker(host::AbstractString, port::Integer) bind_addr = getaddrinfo(host) end + iptype = typeof(bind_addr) sock = socket_reuse_port(iptype) connect(sock, bind_addr, UInt16(port)) @@ -699,6 +727,9 @@ end function connect_to_worker_with_tunnel(host::AbstractString, bind_addr::AbstractString, port::Integer, tunnel_user::AbstractString, sshflags, multiplex) + + # @info "++++++++ CONNECT TO WORKER WITH TUNNEL host=$host port=$port bind_addr=$bind_addr tunnel_user=$tunnel_user sshflags=$sshflags multiplex=$multiplex" + localport = ssh_tunnel(tunnel_user, host, bind_addr, UInt16(port), sshflags, multiplex) s = connect("localhost", localport) forward = "$localport:$bind_addr:$port" @@ -728,31 +759,39 @@ It should cause the remote worker specified by `pid` to exit. on `pid`. """ function kill(manager::ClusterManager, pid::Int, config::WorkerConfig) - remote_do(exit, pid) + remote_do(exit, pid; role = :master) nothing end function kill(manager::SSHManager, pid::Int, config::WorkerConfig) - remote_do(exit, pid) + remote_do(exit, pid; role = :master) cancel_ssh_tunnel(config) nothing end -function kill(manager::LocalManager, pid::Int, config::WorkerConfig; exit_timeout = 15, term_timeout = 15) +function kill(manager::LocalManager, pid::Int, config::WorkerConfig; profile_wait = 6, exit_timeout = 15, term_timeout = 15) + # profile_wait = 6 is 1s for profile, 5s for the report to show # First, try sending `exit()` to the remote over the usual control channels - remote_do(exit, pid) + remote_do(exit, pid; role = :master) timer_task = @async begin sleep(exit_timeout) # Check to see if our child exited, and if not, send an actual kill signal if !process_exited(config.process) - @warn("Failed to gracefully kill worker $(pid), sending SIGTERM") - kill(config.process, Base.SIGTERM) + @warn "Failed to gracefully kill worker $(pid)" + profile_sig = Sys.iswindows() ? nothing : Sys.isbsd() ? 
("SIGINFO", 29) : ("SIGUSR1" , 10) + if profile_sig !== nothing + @warn("Sending profile $(profile_sig[1]) to worker $(pid)") + kill(config.process, profile_sig[2]) + sleep(profile_wait) + end + @warn("Sending SIGQUIT to worker $(pid)") + kill(config.process, Base.SIGQUIT) sleep(term_timeout) if !process_exited(config.process) - @warn("Worker $(pid) ignored SIGTERM, sending SIGKILL") + @warn("Worker $(pid) ignored SIGQUIT, sending SIGKILL") kill(config.process, Base.SIGKILL) end end diff --git a/src/messages.jl b/src/messages.jl index fe3e5ab..92afe8b 100644 --- a/src/messages.jl +++ b/src/messages.jl @@ -99,30 +99,30 @@ function send_msg_unknown(s::IO, header, msg) error("attempt to send to unknown socket") end -function send_msg(s::IO, header, msg) - id = worker_id_from_socket(s) +function send_msg(s::IO, header, msg; role= :default) + id = worker_id_from_socket(s; role = role) if id > -1 - return send_msg(worker_from_id(id), header, msg) + return send_msg(worker_from_id(id, role=role), header, msg; role = role) end send_msg_unknown(s, header, msg) end -function send_msg_now(s::IO, header, msg::AbstractMsg) - id = worker_id_from_socket(s) +function send_msg_now(s::IO, header, msg::AbstractMsg; role= :default) + id = worker_id_from_socket(s; role = role) if id > -1 - return send_msg_now(worker_from_id(id), header, msg) + return send_msg_now(worker_from_id(id; role=role), header, msg; role = role) end send_msg_unknown(s, header, msg) end -function send_msg_now(w::Worker, header, msg) - send_msg_(w, header, msg, true) +function send_msg_now(w::Worker, header, msg; role= :default) + send_msg_(w, header, msg, true; role = role) end -function send_msg(w::Worker, header, msg) - send_msg_(w, header, msg, false) +function send_msg(w::Worker, header, msg; role= :default) + send_msg_(w, header, msg, false; role = role) end -function flush_gc_msgs(w::Worker) +function flush_gc_msgs(w::Worker; role= :default) if !isdefined(w, :w_stream) return end @@ -144,10 +144,10 @@ function flush_gc_msgs(w::Worker) end end if add_msgs !== nothing - remote_do(add_clients, w, add_msgs) + remote_do((add_msgs, role) -> add_clients(add_msgs, role = role), w, add_msgs, wid(w,role=role) == 1 ? :master : :worker; role = role) end if del_msgs !== nothing - remote_do(del_clients, w, del_msgs) + remote_do((del_msgs, role) -> del_clients(del_msgs, role = role), w, del_msgs, wid(w,role=role) == 1 ? 
:master : :worker; role = role) end return end @@ -168,9 +168,9 @@ function deserialize_hdr_raw(io) return MsgHeader(RRID(data[1], data[2]), RRID(data[3], data[4])) end -function send_msg_(w::Worker, header, msg, now::Bool) - check_worker_state(w) - if myid() != 1 && !isa(msg, IdentifySocketMsg) && !isa(msg, IdentifySocketAckMsg) +function send_msg_(w::Worker, header, msg, now::Bool; role= :default) + check_worker_state(w; role = role) + if myid(role=role) != 1 && !isa(msg, IdentifySocketMsg) && !isa(msg, IdentifySocketAckMsg) wait(w.initialized) end io = w.w_stream @@ -182,7 +182,7 @@ function send_msg_(w::Worker, header, msg, now::Bool) write(io, MSG_BOUNDARY) if !now && w.gcflag - flush_gc_msgs(w) + flush_gc_msgs(w; role = role) else flush(io) end @@ -191,11 +191,11 @@ function send_msg_(w::Worker, header, msg, now::Bool) end end -function flush_gc_msgs() +function flush_gc_msgs(; role= :default) try - for w in (PGRP::ProcessGroup).workers - if isa(w,Worker) && (w.state == W_CONNECTED) && w.gcflag - flush_gc_msgs(w) + for w in (PGRP(role = role)::ProcessGroup).workers + if isa(w,Worker) && ((@atomic w.state) == W_CONNECTED) && w.gcflag + flush_gc_msgs(w; role = role) end end catch e diff --git a/src/pmap.jl b/src/pmap.jl index 39acc4d..225c9ad 100644 --- a/src/pmap.jl +++ b/src/pmap.jl @@ -18,16 +18,16 @@ Note that `f` must be made available to all worker processes; see [Code Availability and Loading Packages](@ref code-availability) for details. """ -function pgenerate(p::AbstractWorkerPool, f, c) +function pgenerate(p::AbstractWorkerPool, f, c; role= :default) if length(p) == 0 - return AsyncGenerator(f, c; ntasks=()->nworkers(p)) + return AsyncGenerator(f, c; ntasks=()->nworkers(p; role = role)) end batches = batchsplit(c, min_batch_count = length(p) * 3) - return Iterators.flatten(AsyncGenerator(remote(p, b -> asyncmap(f, b)), batches)) + return Iterators.flatten(AsyncGenerator(remote(p, b -> asyncmap(f, b); role = role), batches)) end -pgenerate(p::AbstractWorkerPool, f, c1, c...) = pgenerate(p, a->f(a...), zip(c1, c...)) -pgenerate(f, c) = pgenerate(default_worker_pool(), f, c) -pgenerate(f, c1, c...) = pgenerate(a->f(a...), zip(c1, c...)) +pgenerate(p::AbstractWorkerPool, f, c1, c...; role= :default) = pgenerate(p, a->f(a...), zip(c1, c...); role = role) +pgenerate(f, c; role= :default) = pgenerate(default_worker_pool(role=role), f, c; role = role) +pgenerate(f, c1, c...; role= :default) = pgenerate(a->f(a...), zip(c1, c...); role = role) """ pmap(f, [::AbstractWorkerPool], c...; distributed=true, batch_size=1, on_error=nothing, retry_delays=[], retry_check=nothing) -> collection @@ -97,10 +97,10 @@ pmap(f, c; on_error = e->(isa(e, InexactError) ? NaN : rethrow()), retry_delays ``` """ function pmap(f, p::AbstractWorkerPool, c; distributed=true, batch_size=1, on_error=nothing, - retry_delays=[], retry_check=nothing) + retry_delays=[], retry_check=nothing, role= :default) f_orig = f # Don't do remote calls if there are no workers. 
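    # The role keyword is threaded through to myid/nworkers/remote below, so a worker
    # acting as a nested master maps the work over the workers it created rather than
    # over the cluster it belongs to. Illustrative usage (process ids are made up):
    #
    #   pmap(x -> x^2, 1:100; role = :master)
    #   pmap(x -> x^2, WorkerPool([2, 3]; role = :master), 1:100; role = :master)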
- if (length(p) == 0) || (length(p) == 1 && fetch(p.channel) == myid()) + if (length(p) == 0) || (length(p) == 1 && fetch(p.channel) == myid(role = role)) distributed = false end @@ -116,14 +116,14 @@ function pmap(f, p::AbstractWorkerPool, c; distributed=true, batch_size=1, on_er end if distributed - f = remote(p, f) + f = remote(p, f; role=role) end if length(retry_delays) > 0 f = wrap_retry(f, retry_delays, retry_check) end - return asyncmap(f, c; ntasks=()->nworkers(p)) + return asyncmap(f, c; ntasks=()->nworkers(p; role = role)) else # During batch processing, We need to ensure that if on_error is set, it is called # for each element in error, and that we return as many elements as the original list. @@ -140,12 +140,12 @@ function pmap(f, p::AbstractWorkerPool, c; distributed=true, batch_size=1, on_er f = wrap_on_error(f, (x,e)->BatchProcessingError(x,e); capture_data=true) end - f = wrap_batch(f, p, handle_errors) - results = asyncmap(f, c; ntasks=()->nworkers(p), batch_size=batch_size) + f = wrap_batch(f, p, handle_errors; role=role) + results = asyncmap(f, c; ntasks=()->nworkers(p; role = role), batch_size=batch_size) # process errors if any. if handle_errors - process_batch_errors!(p, f_orig, results, on_error, retry_delays, retry_check) + process_batch_errors!(p, f_orig, results, on_error, retry_delays, retry_check; role = role) end return results @@ -153,7 +153,7 @@ function pmap(f, p::AbstractWorkerPool, c; distributed=true, batch_size=1, on_er end pmap(f, p::AbstractWorkerPool, c1, c...; kwargs...) = pmap(a->f(a...), p, zip(c1, c...); kwargs...) -pmap(f, c; kwargs...) = pmap(f, CachingPool(workers()), c; kwargs...) +pmap(f, c; role = :default, kwargs...) = pmap(f, CachingPool(workers(role = role)), c; role = role, kwargs...) pmap(f, c1, c...; kwargs...) = pmap(a->f(a...), zip(c1, c...); kwargs...) function wrap_on_error(f, on_error; capture_data=false) @@ -180,11 +180,11 @@ function wrap_retry(f, retry_delays, retry_check) end end -function wrap_batch(f, p, handle_errors) +function wrap_batch(f, p, handle_errors; role= :default) f = asyncmap_batch(f) return batch -> begin try - remotecall_fetch(f, p, batch) + remotecall_fetch(f, p, batch; role=role) catch e if handle_errors return Any[BatchProcessingError(b, e) for b in batch] @@ -199,7 +199,7 @@ asyncmap_batch(f) = batch -> asyncmap(x->f(x...), batch) extract_exception(e) = isa(e, RemoteException) ? 
e.captured.ex : e -function process_batch_errors!(p, f, results, on_error, retry_delays, retry_check) +function process_batch_errors!(p, f, results, on_error, retry_delays, retry_check; role= :default) # Handle all the ones in error in another pmap, with batch size set to 1 reprocess = Tuple{Int,BatchProcessingError}[] for (idx, v) in enumerate(results) @@ -211,14 +211,14 @@ function process_batch_errors!(p, f, results, on_error, retry_delays, retry_chec if length(reprocess) > 0 errors = [x[2] for x in reprocess] exceptions = Any[x.ex for x in errors] - state = iterate(retry_delays) + state = iterate(retry_delays#=; role = role=#) state !== nothing && (state = state[2]) error_processed = let state=state if (length(retry_delays)::Int > 0) && (retry_check === nothing || all([retry_check(state,ex)[2] for ex in exceptions])) # BatchProcessingError.data is a tuple of original args pmap(x->f(x...), p, Any[x.data for x in errors]; - on_error = on_error, retry_delays = collect(retry_delays)[2:end::Int], retry_check = retry_check) + on_error = on_error, retry_delays = collect(retry_delays)[2:end::Int], retry_check = retry_check, role = role) elseif on_error !== nothing map(on_error, exceptions) else diff --git a/src/process_messages.jl b/src/process_messages.jl index 3032917..b21d3ea 100644 --- a/src/process_messages.jl +++ b/src/process_messages.jl @@ -58,70 +58,70 @@ Exceptions on remote computations are captured and rethrown locally. A `RemoteE wraps the `pid` of the worker and a captured exception. A `CapturedException` captures the remote exception and a serializable form of the call stack when the exception was raised. """ -RemoteException(captured) = RemoteException(myid(), captured) -function showerror(io::IO, re::RemoteException) - (re.pid != myid()) && print(io, "On worker ", re.pid, ":\n") - showerror(io, re.captured) +RemoteException(captured; role= :default) = RemoteException(myid(role=role), captured) +function showerror(io::IO, re::RemoteException#=; role= :default=#) + (re.pid != myid(#=role = role=#)) && print(io, "On worker ", re.pid, ":\n") + showerror(io, re.captured#=; role = role=#) end -function run_work_thunk(thunk::Function, print_error::Bool) +function run_work_thunk(thunk::Function, print_error::Bool; role=:default) local result try result = thunk() catch err ce = CapturedException(err, catch_backtrace()) - result = RemoteException(ce) - print_error && showerror(stderr, ce) + result = RemoteException(ce; role=role) + print_error && showerror(stderr, ce#=; role = role=#) end return result end -function run_work_thunk(rv::RemoteValue, thunk) - put!(rv, run_work_thunk(thunk, false)) +function run_work_thunk(rv::RemoteValue, thunk; role= :default) + put!(rv, run_work_thunk(thunk, false; role=role)) nothing end -function schedule_call(rid, thunk) +function schedule_call(rid, thunk; role= :default) return lock(client_refs) do rv = RemoteValue(def_rv_channel()) - (PGRP::ProcessGroup).refs[rid] = rv + (PGRP(role = role)::ProcessGroup).refs[rid] = rv push!(rv.clientset, rid.whence) - errormonitor(@async run_work_thunk(rv, thunk)) + errormonitor(@async run_work_thunk(rv, thunk; role=role)) return rv end end -function deliver_result(sock::IO, msg, oid, value) - #print("$(myid()) sending result $oid\n") +function deliver_result(sock::IO, msg, oid, value; role= :default) + #print("$(myid(role=role)) sending result $oid\n") if msg === :call_fetch || isa(value, RemoteException) val = value else val = :OK end try - send_msg_now(sock, MsgHeader(oid), ResultMsg(val)) + send_msg_now(sock, 
MsgHeader(oid), ResultMsg(val); role = role) catch e # terminate connection in case of serialization error # otherwise the reading end would hang - @error "Fatal error on process $(myid())" exception=e,catch_backtrace() - wid = worker_id_from_socket(sock) + @error "Fatal error on process $(myid(role=role))" exception=e,catch_backtrace() + wid = worker_id_from_socket(sock; role = role) close(sock) - if myid()==1 + if myid(role=role)==1 rmprocs(wid) elseif wid == 1 exit(1) else - remote_do(rmprocs, 1, wid) + remote_do(rmprocs, 1, wid; role = role) end end end ## message event handlers ## -function process_messages(r_stream::TCPSocket, w_stream::TCPSocket, incoming::Bool=true) - errormonitor(@async process_tcp_streams(r_stream, w_stream, incoming)) +function process_messages(r_stream::TCPSocket, w_stream::TCPSocket, incoming::Bool=true; role= :default) + errormonitor(@async process_tcp_streams(r_stream, w_stream, incoming; role = role)) end -function process_tcp_streams(r_stream::TCPSocket, w_stream::TCPSocket, incoming::Bool) +function process_tcp_streams(r_stream::TCPSocket, w_stream::TCPSocket, incoming::Bool; role= :default) Sockets.nagle(r_stream, false) Sockets.quickack(r_stream, true) wait_connected(r_stream) @@ -130,7 +130,7 @@ function process_tcp_streams(r_stream::TCPSocket, w_stream::TCPSocket, incoming: Sockets.quickack(w_stream, true) wait_connected(w_stream) end - message_handler_loop(r_stream, w_stream, incoming) + message_handler_loop(r_stream, w_stream, incoming; role = role) end """ @@ -147,22 +147,22 @@ Julia version number to perform the authentication handshake. See also [`cluster_cookie`](@ref). """ -function process_messages(r_stream::IO, w_stream::IO, incoming::Bool=true) - errormonitor(@async message_handler_loop(r_stream, w_stream, incoming)) +function process_messages(r_stream::IO, w_stream::IO, incoming::Bool=true; role= :default) + errormonitor(@async message_handler_loop(r_stream, w_stream, incoming; role = role)) end -function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) +function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool; role= :default) wpid=0 # the worker r_stream is connected to. boundary = similar(MSG_BOUNDARY) try - version = process_hdr(r_stream, incoming) + version = process_hdr(r_stream, incoming; role = role) serializer = ClusterSerializer(r_stream) # The first message will associate wpid with r_stream header = deserialize_hdr_raw(r_stream) msg = deserialize_msg(serializer) - handle_msg(msg, header, r_stream, w_stream, version) - wpid = worker_id_from_socket(r_stream) + handle_msg(msg, header, r_stream, w_stream, version; role = role) + wpid = worker_id_from_socket(r_stream; role = role) @assert wpid > 0 readbytes!(r_stream, boundary, length(MSG_BOUNDARY)) @@ -170,11 +170,12 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) while true reset_state(serializer) header = deserialize_hdr_raw(r_stream) - # println("header: ", header) + #println("header: ", header) try msg = invokelatest(deserialize_msg, serializer) catch e + #println("*************************************************") # Deserialization error; discard bytes in stream until boundary found boundary_idx = 1 while true @@ -193,42 +194,42 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) # remotecalls only rethrow RemoteExceptions. Any other exception is treated as # data to be returned. Wrap this exception in a RemoteException. 
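                # The header determines how the failure is routed: a non-null
                # response_oid means a local ref is waiting for this value, while a
                # non-null notify_oid means the remote caller is blocked in a fetch
                # and must be answered over w_stream.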
- remote_err = RemoteException(myid(), CapturedException(e, catch_backtrace())) + remote_err = RemoteException(myid(role=role), CapturedException(e, catch_backtrace())) # println("Deserialization error. ", remote_err) if !null_id(header.response_oid) - ref = lookup_ref(header.response_oid) + ref = lookup_ref(header.response_oid; role = role) put!(ref, remote_err) end if !null_id(header.notify_oid) - deliver_result(w_stream, :call_fetch, header.notify_oid, remote_err) + deliver_result(w_stream, :call_fetch, header.notify_oid, remote_err; role = role) end continue end readbytes!(r_stream, boundary, length(MSG_BOUNDARY)) - # println("got msg: ", typeof(msg)) - handle_msg(msg, header, r_stream, w_stream, version) + #println("got msg: ", typeof(msg)) + handle_msg(msg, header, r_stream, w_stream, version; role = role) end catch e oldstate = W_UNKNOWN_STATE # Check again as it may have been set in a message handler but not propagated to the calling block above if wpid < 1 - wpid = worker_id_from_socket(r_stream) + wpid = worker_id_from_socket(r_stream; role = role) end if wpid < 1 println(stderr, e, CapturedException(e, catch_backtrace())) - println(stderr, "Process($(myid())) - Unknown remote, closing connection.") + println(stderr, "Process($(myid(role=role))) - Unknown remote, closing connection.") elseif !(wpid in map_del_wrkr) werr = worker_from_id(wpid) - oldstate = werr.state + oldstate = @atomic werr.state set_worker_state(werr, W_TERMINATED) # If unhandleable error occurred talking to pid 1, exit if wpid == 1 if isopen(w_stream) - @error "Fatal error on process $(myid())" exception=e,catch_backtrace() + @error "Fatal error on process $(myid(role=role))" exception=e,catch_backtrace() end exit(1) end @@ -236,13 +237,13 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) # Will treat any exception as death of node and cleanup # since currently we do not have a mechanism for workers to reconnect # to each other on unhandled errors - deregister_worker(wpid) + deregister_worker(wpid; role = role) end close(r_stream) close(w_stream) - if (myid() == 1) && (wpid > 1) + if (myid(role=role) == 1) && (wpid > 1) if oldstate != W_TERMINATING println(stderr, "Worker $wpid terminated.") rethrow() @@ -253,7 +254,7 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) end end -function process_hdr(s, validate_cookie) +function process_hdr(s, validate_cookie; role= :default) if validate_cookie cookie = read(s, HDR_COOKIE_LEN) if length(cookie) < HDR_COOKIE_LEN @@ -263,7 +264,7 @@ function process_hdr(s, validate_cookie) self_cookie = cluster_cookie() for i in 1:HDR_COOKIE_LEN if UInt8(self_cookie[i]) != cookie[i] - error("Process($(myid())) - Invalid connection credentials sent by remote.") + error("Process($(myid(role = role))) - Invalid connection credentials sent by remote.") end end end @@ -279,67 +280,69 @@ function process_hdr(s, validate_cookie) return VersionNumber(strip(String(version))) end -function handle_msg(msg::CallMsg{:call}, header, r_stream, w_stream, version) - schedule_call(header.response_oid, ()->invokelatest(msg.f, msg.args...; msg.kwargs...)) +function handle_msg(msg::CallMsg{:call}, header, r_stream, w_stream, version; role= :default) + schedule_call(header.response_oid, ()->invokelatest(msg.f, msg.args...; msg.kwargs...); role = role) end -function handle_msg(msg::CallMsg{:call_fetch}, header, r_stream, w_stream, version) +function handle_msg(msg::CallMsg{:call_fetch}, header, r_stream, w_stream, version; role= :default) + #@info 
"handle ", msg errormonitor(@async begin - v = run_work_thunk(()->invokelatest(msg.f, msg.args...; msg.kwargs...), false) + v = run_work_thunk(()->invokelatest(msg.f, msg.args...; msg.kwargs...), false; role=role) if isa(v, SyncTake) try - deliver_result(w_stream, :call_fetch, header.notify_oid, v.v) + deliver_result(w_stream, :call_fetch, header.notify_oid, v.v; role = role) finally unlock(v.rv.synctake) end else - deliver_result(w_stream, :call_fetch, header.notify_oid, v) + deliver_result(w_stream, :call_fetch, header.notify_oid, v; role = role) end nothing end) end -function handle_msg(msg::CallWaitMsg, header, r_stream, w_stream, version) +function handle_msg(msg::CallWaitMsg, header, r_stream, w_stream, version; role= :default) errormonitor(@async begin - rv = schedule_call(header.response_oid, ()->invokelatest(msg.f, msg.args...; msg.kwargs...)) - deliver_result(w_stream, :call_wait, header.notify_oid, fetch(rv.c)) + rv = schedule_call(header.response_oid, ()->invokelatest(msg.f, msg.args...; msg.kwargs...); role = role) + deliver_result(w_stream, :call_wait, header.notify_oid, fetch(rv.c); role = role) nothing end) end -function handle_msg(msg::RemoteDoMsg, header, r_stream, w_stream, version) - errormonitor(@async run_work_thunk(()->invokelatest(msg.f, msg.args...; msg.kwargs...), true)) +function handle_msg(msg::RemoteDoMsg, header, r_stream, w_stream, version; role= :default) + errormonitor(@async run_work_thunk(()->invokelatest(msg.f, msg.args...; msg.kwargs...), true; role=role)) end -function handle_msg(msg::ResultMsg, header, r_stream, w_stream, version) - put!(lookup_ref(header.response_oid), msg.value) +function handle_msg(msg::ResultMsg, header, r_stream, w_stream, version; role= :default) + put!(lookup_ref(header.response_oid; role = role), msg.value) end -function handle_msg(msg::IdentifySocketMsg, header, r_stream, w_stream, version) +function handle_msg(msg::IdentifySocketMsg, header, r_stream, w_stream, version; role= :default) # register a new peer worker connection - w = Worker(msg.from_pid, r_stream, w_stream, cluster_manager; version=version) + w = Worker(msg.from_pid, r_stream, w_stream, cluster_manager; version=version, role = role) send_connection_hdr(w, false) - send_msg_now(w, MsgHeader(), IdentifySocketAckMsg()) + send_msg_now(w, MsgHeader(), IdentifySocketAckMsg(); role = role) notify(w.initialized) end -function handle_msg(msg::IdentifySocketAckMsg, header, r_stream, w_stream, version) +function handle_msg(msg::IdentifySocketAckMsg, header, r_stream, w_stream, version; role= :default) w = map_sock_wrkr[r_stream] w.version = version end -function handle_msg(msg::JoinPGRPMsg, header, r_stream, w_stream, version) - LPROC.id = msg.self_pid - controller = Worker(1, r_stream, w_stream, cluster_manager; version=version) +function handle_msg(msg::JoinPGRPMsg, header, r_stream, w_stream, version; role= :default) + #LPROC.id = msg.self_pid + myid!(msg.self_pid, role=role) + controller = Worker(1, r_stream, w_stream, cluster_manager; version=version, role = role) notify(controller.initialized) register_worker(LPROC) - topology(msg.topology) + topology(msg.topology; role=role) if !msg.enable_threaded_blas Base.disable_library_threading() end lazy = msg.lazy - PGRP.lazy = lazy + PGRP(role = role).lazy = lazy @sync for (connect_at, rpid) in msg.other_workers wconfig = WorkerConfig() @@ -348,32 +351,32 @@ function handle_msg(msg::JoinPGRPMsg, header, r_stream, w_stream, version) let rpid=rpid, wconfig=wconfig if lazy # The constructor registers the object with a global 
registry. - Worker(rpid, ()->connect_to_peer(cluster_manager, rpid, wconfig)) + Worker(rpid, ()->connect_to_peer(cluster_manager, rpid, wconfig; role = role); role = role) else - @async connect_to_peer(cluster_manager, rpid, wconfig) + @async connect_to_peer(cluster_manager, rpid, wconfig; role = role) end end end send_connection_hdr(controller, false) - send_msg_now(controller, MsgHeader(RRID(0,0), header.notify_oid), JoinCompleteMsg(Sys.CPU_THREADS, getpid())) + send_msg_now(controller, MsgHeader(RRID(0,0), header.notify_oid), JoinCompleteMsg(Sys.CPU_THREADS, getpid()); role = role) end -function connect_to_peer(manager::ClusterManager, rpid::Int, wconfig::WorkerConfig) +function connect_to_peer(manager::ClusterManager, rpid::Int, wconfig::WorkerConfig; role= :default) try (r_s, w_s) = connect(manager, rpid, wconfig) - w = Worker(rpid, r_s, w_s, manager; config=wconfig) - process_messages(w.r_stream, w.w_stream, false) + w = Worker(rpid, r_s, w_s, manager; config=wconfig, role = role) + process_messages(w.r_stream, w.w_stream, false; role = role) send_connection_hdr(w, true) - send_msg_now(w, MsgHeader(), IdentifySocketMsg(myid())) + send_msg_now(w, MsgHeader(), IdentifySocketMsg(myid(role=role)), role = role) notify(w.initialized) catch e - @error "Error on $(myid()) while connecting to peer $rpid, exiting" exception=e,catch_backtrace() + @error "Error on $(myid(role=role)) while connecting to peer $rpid, exiting" exception=e,catch_backtrace() exit(1) end end -function handle_msg(msg::JoinCompleteMsg, header, r_stream, w_stream, version) +function handle_msg(msg::JoinCompleteMsg, header, r_stream, w_stream, version; role= :default) w = map_sock_wrkr[r_stream] environ = something(w.config.environ, Dict()) environ[:cpu_threads] = msg.cpu_threads @@ -381,8 +384,8 @@ function handle_msg(msg::JoinCompleteMsg, header, r_stream, w_stream, version) w.config.ospid = msg.ospid w.version = version - ntfy_channel = lookup_ref(header.notify_oid) - put!(ntfy_channel, w.id) + ntfy_channel = lookup_ref(header.notify_oid; role = role) + put!(ntfy_channel, wid(w,role=role)) - push!(default_worker_pool(), w.id) + push!(default_worker_pool(role=role), wid(w,role=role), role = role) end diff --git a/src/remotecall.jl b/src/remotecall.jl index 0b1143d..38ca131 100644 --- a/src/remotecall.jl +++ b/src/remotecall.jl @@ -29,8 +29,8 @@ mutable struct Future <: AbstractRemoteRef lock::ReentrantLock @atomic v::Union{Some{Any}, Nothing} - Future(w::Int, rrid::RRID, v::Union{Some, Nothing}=nothing) = - (r = new(w,rrid.whence,rrid.id,ReentrantLock(),v); return test_existing_ref(r)) + Future(w::Int, rrid::RRID, v::Union{Some, Nothing}=nothing; role= :default) = + (r = new(w,rrid.whence,rrid.id,ReentrantLock(),v); return test_existing_ref(r; role = role)) Future(t::NTuple{4, Any}) = new(t[1],t[2],t[3],ReentrantLock(),t[4]) # Useful for creating dummy, zeroed-out instances end @@ -56,9 +56,9 @@ mutable struct RemoteChannel{T<:AbstractChannel} <: AbstractRemoteRef whence::Int id::Int - function RemoteChannel{T}(w::Int, rrid::RRID) where T<:AbstractChannel + function RemoteChannel{T}(w::Int, rrid::RRID; role= :default) where T<:AbstractChannel r = new(w, rrid.whence, rrid.id) - return test_existing_ref(r) + return test_existing_ref(r; role = role) end function RemoteChannel{T}(t::Tuple) where T<:AbstractChannel @@ -66,7 +66,7 @@ mutable struct RemoteChannel{T<:AbstractChannel} <: AbstractRemoteRef end end -function test_existing_ref(r::AbstractRemoteRef) +function test_existing_ref(r::AbstractRemoteRef; role= :default) 
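    # Deduplicate against client_refs: if an equivalent ref is already tracked, merge
    # any newly received cached value into it and return the existing object;
    # otherwise register this ref and attach a finalizer. The role is captured by the
    # finalizer closure so that any del_client message it later sends goes to the
    # process group this ref was created under.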
found = getkey(client_refs, r, nothing) if found !== nothing @assert r.where > 0 @@ -76,7 +76,7 @@ function test_existing_ref(r::AbstractRemoteRef) rv_cache = @atomic :monotonic r.v if fv_cache === nothing && rv_cache !== nothing # we have recd the value from another source, probably a deserialized ref, send a del_client message - send_del_client(r) + send_del_client(r; role = role) @lock found.lock begin @atomicreplace found.v nothing => rv_cache end @@ -86,21 +86,21 @@ function test_existing_ref(r::AbstractRemoteRef) end client_refs[r] = nothing - finalizer(finalize_ref, r) + finalizer(r -> finalize_ref(r, role), r) return r end -function finalize_ref(r::AbstractRemoteRef) +function finalize_ref(r::AbstractRemoteRef, role) if r.where > 0 # Handle the case of the finalizer having been called manually if trylock(client_refs.lock) # trylock doesn't call wait which causes yields try delete!(client_refs.ht, r) # direct removal avoiding locks if isa(r, RemoteChannel) - send_del_client_no_lock(r) + send_del_client_no_lock(r; role = role) else # send_del_client only if the reference has not been set v_cache = @atomic :monotonic r.v - v_cache === nothing && send_del_client_no_lock(r) + v_cache === nothing && send_del_client_no_lock(r; role = role) @atomic :monotonic r.v = nothing end r.where = 0 @@ -108,10 +108,10 @@ function finalize_ref(r::AbstractRemoteRef) unlock(client_refs.lock) end else - finalizer(finalize_ref, r) + finalizer(r -> finalize_ref(r, role), r) return nothing end - end + end nothing end @@ -121,16 +121,17 @@ end Create a `Future` on process `pid`. The default `pid` is the current process. """ -Future(pid::Integer=myid()) = Future(pid, RRID()) -Future(w::LocalProcess) = Future(w.id) -Future(w::Worker) = Future(w.id) +Future(pid::Integer=-1; role =:default) = Future(pid < 0 ? myid(role = role) : pid, RRID(role = role); role = role) +Future(w::LocalProcess; role =:default) = Future(wid(w,role=role); role = role) +Future(w::Worker; role =:default) = Future(wid(w,role=role); role = role) -RemoteChannel(pid::Integer=myid()) = RemoteChannel{Channel{Any}}(pid, RRID()) +RemoteChannel(pid::Integer=-1; role= :default) = RemoteChannel{Channel{Any}}(pid < 0 ? myid(role = role) : pid, RRID(role = role); role = role) -function RemoteChannel(f::Function, pid::Integer=myid()) - remotecall_fetch(pid, f, RRID()) do f, rrid - rv=lookup_ref(rrid, f) - RemoteChannel{typeof(rv.c)}(myid(), rrid) +function RemoteChannel(f::Function, pid_::Integer=0; role= :default) + pid = pid_ == 0 ? myid(role = role) : pid_ + remotecall_fetch(pid, f, RRID(role = role); role = role) do f, rrid + rv=lookup_ref(rrid, f; role = role) + RemoteChannel{typeof(rv.c)}(myid(role = role), rrid; role = role) end end @@ -169,9 +170,9 @@ A low-level API which returns the backing `AbstractChannel` for an `id` returned [`remoteref_id`](@ref). The call is valid only on the node where the backing channel exists. 
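
# Examples

A minimal sketch, assuming the `RemoteChannel` is created on (and therefore owned by)
the calling process:

```julia
rc = RemoteChannel(() -> Channel{Int}(1))
chan = channel_from_id(remoteref_id(rc))   # the backing Channel{Int}
put!(chan, 1)
take!(rc)                                  # returns 1
```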
""" -function channel_from_id(id) +function channel_from_id(id; role= :default) rv = lock(client_refs) do - return get(PGRP.refs, id, false) + return get(PGRP(role = role).refs, id, false) end if rv === false throw(ErrorException("Local instance of remote reference not found")) @@ -179,7 +180,7 @@ function channel_from_id(id) return rv.c end -lookup_ref(rrid::RRID, f=def_rv_channel) = lookup_ref(PGRP, rrid, f) +lookup_ref(rrid::RRID, f=def_rv_channel; role= :default) = lookup_ref(PGRP(role = role), rrid, f) function lookup_ref(pg, rrid, f) return lock(client_refs) do rv = get(pg.refs, rrid, false) @@ -209,15 +210,15 @@ errormonitor(@async put!(f, remotecall_fetch(long_computation, p))) isready(f) # will not block ``` """ -function isready(rr::Future) +function isready(rr::Future; role= :default) v_cache = @atomic rr.v v_cache === nothing || return true rid = remoteref_id(rr) - return if rr.where == myid() - isready(lookup_ref(rid).c) + return if rr.where == myid(role = role) + isready(lookup_ref(rid; role = role).c) else - remotecall_fetch(rid->isready(lookup_ref(rid).c), rr.where, rid) + remotecall_fetch((rid, role)->isready(lookup_ref(rid; role = role).c), rr.where, rid, rr.where == 1 ? :master : :worker; role = role) end end @@ -229,18 +230,18 @@ Note that this function can cause race conditions, since by the time you receive its result it may no longer be true. However, it can be safely used on a [`Future`](@ref) since they are assigned only once. """ -function isready(rr::RemoteChannel, args...) +function isready(rr::RemoteChannel, args...; role= :default) rid = remoteref_id(rr) - return if rr.where == myid() - isready(lookup_ref(rid).c, args...) + return if rr.where == myid(role = role) + isready(lookup_ref(rid; role = role).c, args...) else - remotecall_fetch(rid->isready(lookup_ref(rid).c, args...), rr.where, rid) + remotecall_fetch(rid->isready(lookup_ref(rid; role = rr.where == 1 ? :master : :worker).c, args...), rr.where, rid; role = role) end end -del_client(rr::AbstractRemoteRef) = del_client(remoteref_id(rr), myid()) +del_client(rr::AbstractRemoteRef; role= :default) = del_client(remoteref_id(rr), myid(role = role); role = role) -del_client(id, client) = del_client(PGRP, id, client) +del_client(id, client; role= :default) = del_client(PGRP(role = role), id, client) function del_client(pg, id, client) lock(client_refs) do _del_client(pg, id, client) @@ -260,9 +261,9 @@ function _del_client(pg, id, client) nothing end -function del_clients(pairs::Vector) +function del_clients(pairs::Vector; role= :default) for p in pairs - del_client(p[1], p[2]) + del_client(p[1], p[2]; role = role) end end @@ -272,9 +273,9 @@ end # XXX: Is this worth the additional complexity? # `flush_gc_msgs` has to iterate over all connected workers. const any_gc_flag = Threads.Condition() -function start_gc_msgs_task() +function start_gc_msgs_task(; role= :default) errormonitor( - Threads.@spawn begin + @async begin while true lock(any_gc_flag) do # this might miss events @@ -283,27 +284,27 @@ function start_gc_msgs_task() # Use invokelatest() so that custom message transport streams # for workers can be defined in a newer world age than the Task # which runs the loop here. 
- invokelatest(flush_gc_msgs) # handles throws internally + invokelatest(flush_gc_msgs#=; role = role=#) # handles throws internally end end ) end # Function can be called within a finalizer -function send_del_client(rr) - if rr.where == myid() - del_client(rr) +function send_del_client(rr; role= :default) + if rr.where == myid(role = role) + del_client(rr; role = role) elseif id_in_procs(rr.where) # process only if a valid worker - process_worker(rr) + process_worker(rr; role = role) end end -function send_del_client_no_lock(rr) +function send_del_client_no_lock(rr; role= :default) # for gc context to avoid yields - if rr.where == myid() - _del_client(PGRP, remoteref_id(rr), myid()) + if rr.where == myid(role = role) + _del_client(PGRP(role = role), remoteref_id(rr), myid(role = role)) elseif id_in_procs(rr.where) # process only if a valid worker - process_worker(rr) + process_worker(rr; role = role) end end @@ -317,12 +318,12 @@ function publish_del_msg!(w::Worker, msg) end end -function process_worker(rr) - w = worker_from_id(rr.where)::Worker - msg = (remoteref_id(rr), myid()) +function process_worker(rr; role= :default) + w = worker_from_id(rr.where; role = role)::Worker + msg = (remoteref_id(rr), myid(role = role)) # Needs to acquire a lock on the del_msg queue - T = Threads.@spawn begin + T = @async begin publish_del_msg!($w, $msg) end Base.errormonitor(T) @@ -330,28 +331,28 @@ function process_worker(rr) return end -function add_client(id, client) +function add_client(id, client; role= :default) lock(client_refs) do - rv = lookup_ref(id) + rv = lookup_ref(id; role = role) push!(rv.clientset, client) end nothing end -function add_clients(pairs::Vector) +function add_clients(pairs::Vector; role= :default) for p in pairs - add_client(p[1], p[2]...) + add_client(p[1], p[2]...; role = role) end end -function send_add_client(rr::AbstractRemoteRef, i) - if rr.where == myid() +function send_add_client(rr::AbstractRemoteRef, i; role= :default) + if rr.where == myid(role = role) add_client(remoteref_id(rr), i) elseif (i != rr.where) && id_in_procs(rr.where) # don't need to send add_client if the message is already going # to the processor that owns the remote ref. it will add_client # itself inside deserialize(). 
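        # Queue the (ref id, new client) pair on the owning worker's add_msgs list and
        # raise its gcflag; the task started by start_gc_msgs_task above batches it
        # into a single add_clients message.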
- w = worker_from_id(rr.where) + w = worker_from_id(rr.where; role = role) lock(w.msg_lock) do push!(w.add_msgs, (remoteref_id(rr), i)) @atomic w.gcflag = true @@ -364,24 +365,24 @@ end channel_type(rr::RemoteChannel{T}) where {T} = T -function serialize(s::ClusterSerializer, f::Future) +function serialize(s::ClusterSerializer, f::Future; role = :default) v_cache = @atomic f.v if v_cache === nothing - p = worker_id_from_socket(s.io) - (p !== f.where) && send_add_client(f, p) + p = worker_id_from_socket(s.io; role = role) + (p !== f.where) && send_add_client(f, p; role = role) end invoke(serialize, Tuple{ClusterSerializer, Any}, s, f) end -function serialize(s::ClusterSerializer, rr::RemoteChannel) - p = worker_id_from_socket(s.io) - (p !== rr.where) && send_add_client(rr, p) +function serialize(s::ClusterSerializer, rr::RemoteChannel; role = :default) + p = worker_id_from_socket(s.io; role = role) + (p !== rr.where) && send_add_client(rr, p; role = role) invoke(serialize, Tuple{ClusterSerializer, Any}, s, rr) end -function deserialize(s::ClusterSerializer, t::Type{<:Future}) +function deserialize(s::ClusterSerializer, t::Type{<:Future}; role = :default) fc = invoke(deserialize, Tuple{ClusterSerializer, DataType}, s, t) # deserialized copy - f2 = Future(fc.where, RRID(fc.whence, fc.id), fc.v) # ctor adds to client_refs table + f2 = Future(fc.where, RRID(fc.whence, fc.id), fc.v; role = role) # ctor adds to client_refs table # 1) send_add_client() is not executed when the ref is being serialized # to where it exists, hence do it here. @@ -389,21 +390,21 @@ function deserialize(s::ClusterSerializer, t::Type{<:Future}) # already 'fetch'ed instance in client_refs (Issue #25847), we should not # track it in the backing RemoteValue store. f2v_cache = @atomic f2.v - if f2.where == myid() && f2v_cache === nothing - add_client(remoteref_id(f2), myid()) + if f2.where == myid(role = role) && f2v_cache === nothing + add_client(remoteref_id(f2), myid(role = role); role = role) end f2 end -function deserialize(s::ClusterSerializer, t::Type{<:RemoteChannel}) +function deserialize(s::ClusterSerializer, t::Type{<:RemoteChannel}; role = :default) rr = invoke(deserialize, Tuple{ClusterSerializer, DataType}, s, t) - if rr.where == myid() + if rr.where == myid(role = role) # send_add_client() is not executed when the ref is being # serialized to where it exists - add_client(remoteref_id(rr), myid()) + add_client(remoteref_id(rr), myid(role = role); role = role) end # call ctor to make sure this rr gets added to the client_refs table - RemoteChannel{channel_type(rr)}(rr.where, RRID(rr.whence, rr.id)) + RemoteChannel{channel_type(rr)}(rr.where, RRID(rr.whence, rr.id); role = role) end # Future and RemoteChannel are serializable only in a running cluster. @@ -422,18 +423,19 @@ end # make a thunk to call f on args in a way that simulates what would happen if # the function were sent elsewhere function local_remotecall_thunk(f, args, kwargs) + #println("local_remotecall_thunk($f, $args, $kwargs)") return ()->invokelatest(f, args...; kwargs...) end -function remotecall(f, w::LocalProcess, args...; kwargs...) - rr = Future(w) - schedule_call(remoteref_id(rr), local_remotecall_thunk(f, args, kwargs)) +function remotecall(f, w::LocalProcess, args...; role= :default, kwargs...) + rr = Future(w; role = role) + schedule_call(remoteref_id(rr), local_remotecall_thunk(f, args, kwargs); role = role) return rr end -function remotecall(f, w::Worker, args...; kwargs...) 
- rr = Future(w) - send_msg(w, MsgHeader(remoteref_id(rr)), CallMsg{:call}(f, args, kwargs)) +function remotecall(f, w::Worker, args...; role= :default, kwargs...) + rr = Future(w; role = role) + send_msg(w, MsgHeader(remoteref_id(rr)), CallMsg{:call}(f, args, kwargs); role = role) return rr end @@ -444,26 +446,48 @@ Call a function `f` asynchronously on the given arguments on the specified proce Return a [`Future`](@ref). Keyword arguments, if any, are passed through to `f`. """ -remotecall(f, id::Integer, args...; kwargs...) = remotecall(f, worker_from_id(id), args...; kwargs...) +remotecall(f, id::Integer, args...; role= :default, kwargs...) = +# remotecall(f, worker_from_id(id; role = id == 1 ? :master : :worker), args...; role = role, kwargs...) + remotecall(f, worker_from_id(id; role = role), args...; role = role, kwargs...) + +function remotecall_fetch(f, w::LocalProcess, args...; role= :default, kwargs...) + v=run_work_thunk(local_remotecall_thunk(f, args, kwargs), false; role = role) + return isa(v, RemoteException) ? throw(v) : v +end + -function remotecall_fetch(f, w::LocalProcess, args...; kwargs...) - v=run_work_thunk(local_remotecall_thunk(f,args, kwargs), false) +function remotecall_fetch(f, w::Worker, args...; role= :default, kwargs...) + # can be weak, because the program will have no way to refer to the Ref + # itself, it only gets the result. + oid = RRID(role = role) + rv = lookup_ref(oid; role = role) + rv.waitingfor = wid(w, role = role) + send_msg(w, MsgHeader(RRID(0,0), oid), CallMsg{:call_fetch}(f, args, kwargs); role = role) + v = take!(rv) + lock(client_refs) do + delete!(PGRP(role = role).refs, oid) + end return isa(v, RemoteException) ? throw(v) : v end + +#= function remotecall_fetch(f, w::Worker, args...; kwargs...) # can be weak, because the program will have no way to refer to the Ref # itself, it only gets the result. - oid = RRID() - rv = lookup_ref(oid) - rv.waitingfor = w.id - send_msg(w, MsgHeader(RRID(0,0), oid), CallMsg{:call_fetch}(f, args, kwargs)) + role = haskey(kwargs, :role) ? kwargs[:role] : :default + oid = RRID(role = role) + rv = lookup_ref(oid; role = role) + rv.waitingfor = wid(w, role=role) + @info "send_msg ...$(Base.nameof(f)) === $(Base.kwarg_decl.(methods(f)))" + send_msg(w, MsgHeader(RRID(0,0), oid), CallMsg{:call_fetch}(f, args, kwargs); role = role) v = take!(rv) lock(client_refs) do - delete!(PGRP.refs, oid) + delete!(PGRP(role = role).refs, oid) end return isa(v, RemoteException) ? throw(v) : v end +=# """ remotecall_fetch(f, id::Integer, args...; kwargs...) @@ -489,20 +513,20 @@ sqrt was called with a negative real argument but will only return a complex res ... ``` """ -remotecall_fetch(f, id::Integer, args...; kwargs...) = - remotecall_fetch(f, worker_from_id(id), args...; kwargs...) +remotecall_fetch(f, id::Integer, args...; role= :default, kwargs...) = + remotecall_fetch(f, worker_from_id(id; role = role), args...; role = role, kwargs...) -remotecall_wait(f, w::LocalProcess, args...; kwargs...) = wait(remotecall(f, w, args...; kwargs...)) +remotecall_wait(f, w::LocalProcess, args...; role= :default, kwargs...) = wait(remotecall(f, w, args...; role = role, kwargs...); role = role) -function remotecall_wait(f, w::Worker, args...; kwargs...) - prid = RRID() - rv = lookup_ref(prid) - rv.waitingfor = w.id - rr = Future(w) - send_msg(w, MsgHeader(remoteref_id(rr), prid), CallWaitMsg(f, args, kwargs)) +function remotecall_wait(f, w::Worker, args...; role= :default, kwargs...) 
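    # Same protocol as the stock implementation, with the role threaded through:
    # register a local RemoteValue under a fresh RRID for the completion notification,
    # send a CallWaitMsg to the worker, block on the notification channel, and finally
    # drop the temporary ref from the process group selected by the role.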
+ prid = RRID(role = role) + rv = lookup_ref(prid; role = role) + rv.waitingfor = wid(w,role=role) + rr = Future(w; role = role) + send_msg(w, MsgHeader(remoteref_id(rr), prid), CallWaitMsg(f, args, kwargs); role = role) v = fetch(rv.c) lock(client_refs) do - delete!(PGRP.refs, prid) + delete!(PGRP(role = role).refs, prid) end isa(v, RemoteException) && throw(v) return rr @@ -516,10 +540,10 @@ Keyword arguments, if any, are passed through to `f`. See also [`wait`](@ref) and [`remotecall`](@ref). """ -remotecall_wait(f, id::Integer, args...; kwargs...) = - remotecall_wait(f, worker_from_id(id), args...; kwargs...) +remotecall_wait(f, id::Integer, args...; role= :default, kwargs...) = + remotecall_wait(f, worker_from_id(id; role = role), args...; kwargs...) -function remote_do(f, w::LocalProcess, args...; kwargs...) +function remote_do(f, w::LocalProcess, args...; role = :default, kwargs...) # the LocalProcess version just performs in local memory what a worker # does when it gets a :do message. # same for other messages on LocalProcess. @@ -528,8 +552,8 @@ function remote_do(f, w::LocalProcess, args...; kwargs...) nothing end -function remote_do(f, w::Worker, args...; kwargs...) - send_msg(w, MsgHeader(), RemoteDoMsg(f, args, kwargs)) +function remote_do(f, w::Worker, args...; role= :default, kwargs...) + send_msg(w, MsgHeader(), RemoteDoMsg(f, args, kwargs), role = role) nothing end @@ -554,22 +578,29 @@ Any exceptions thrown by `f` are printed to [`stderr`](@ref) on the remote worke Keyword arguments, if any, are passed through to `f`. """ -remote_do(f, id::Integer, args...; kwargs...) = remote_do(f, worker_from_id(id), args...; kwargs...) +remote_do(f, id::Integer, args...; role=:default, kwargs...) = remote_do(f, worker_from_id(id, role = role), role = role, args...; kwargs...) # TO CHECK (e se f não tiver role parameter ?) # have the owner of rr call f on it -function call_on_owner(f, rr::AbstractRemoteRef, args...) +function call_on_owner(f, rr::AbstractRemoteRef, args...; role= :default) rid = remoteref_id(rr) - if rr.where == myid() + if rr.where == myid(role = role) f(rid, args...) else - remotecall_fetch(f, rr.where, rid, args...) + #remotecall_fetch((rid,role) -> f(rid, role = role, args...), rr.where, rid, rr.where==1 ? :master : :worker; role = role) + remotecall_fetch((rid,role) -> f(rid, args...; role=role), rr.where, rid, rr.where==1 ? :master : :worker; role = role) + + + #remotecall_fetch(rid -> f(rid, role = rr.where==1 ? :master : :worker, args...), rr.where; role = role) + #remotecall_fetch(iiiii, rr.where, f, rid, rr.where==1 ? :master : :worker, args...; role = role) +# remotecall_fetch(f, rr.where, rid, args...) + end end -function wait_ref(rid, caller, args...) - v = fetch_ref(rid, args...) +function wait_ref(rid, caller, args...; role= :default) + v = fetch_ref(rid, args...; role = role) if isa(v, RemoteException) - if myid() == caller + if myid(role = role) == caller throw(v) else return v @@ -583,14 +614,20 @@ end Wait for a value to become available for the specified [`Future`](@ref). """ -wait(r::Future) = (v_cache = @atomic r.v; v_cache !== nothing && return r; call_on_owner(wait_ref, r, myid()); r) +wait(r::Future; role= :default) = (v_cache = @atomic r.v; v_cache !== nothing && return r; + call_on_owner(wait_ref, r, myid(role = role); role = role); + #call_on_owner((rid, caller, args...; role=role) -> wait_ref(rid, caller, args...; role=role), r, myid(role = role); role = role); + r) """ wait(r::RemoteChannel, args...) 
Wait for a value to become available on the specified [`RemoteChannel`](@ref). """ -wait(r::RemoteChannel, args...) = (call_on_owner(wait_ref, r, myid(), args...); r) +wait(r::RemoteChannel, args...; role= :default) = (call_on_owner(wait_ref, r, myid(role = role), args...; role = role); r) +#wait(r::RemoteChannel, args...; role= :default) = (call_on_owner((rid, caller, args...; role=role) -> wait_ref(rid, caller, args...; role=role), r, myid(role = role), args...; role = role); r) + + """ fetch(x::Future) @@ -599,14 +636,14 @@ Wait for and get the value of a [`Future`](@ref). The fetched value is cached lo Further calls to `fetch` on the same reference return the cached value. If the remote value is an exception, throws a [`RemoteException`](@ref) which captures the remote exception and backtrace. """ -function fetch(r::Future) +function fetch(r::Future; role= :default) v_cache = @atomic r.v v_cache !== nothing && return something(v_cache) - if r.where == myid() + if r.where == myid(role = role) rv, v_cache = @lock r.lock begin v_cache = @atomic :monotonic r.v - rv = v_cache === nothing ? lookup_ref(remoteref_id(r)) : nothing + rv = v_cache === nothing ? lookup_ref(remoteref_id(r); role = role) : nothing rv, v_cache end @@ -616,7 +653,8 @@ function fetch(r::Future) v_local = fetch(rv.c) end else - v_local = call_on_owner(fetch_ref, r) + #v_local = call_on_owner((rid, args...; role=role) -> fetch_ref(rid, args...;role=role), r; role = role) + v_local = call_on_owner(fetch_ref, r; role = role) end v_cache = @atomic r.v @@ -634,18 +672,22 @@ function fetch(r::Future) # remote calls getting the value from `call_on_owner` used to return the value directly without wrapping it in `Some(x)` # so we're doing the same thing here if status - send_del_client(r) + send_del_client(r; role = role) return v_local else # this `v_cache` is returned at the end of the function v_cache = v_old end end - send_del_client(r) + send_del_client(r; role = role) + something(v_cache) + end -fetch_ref(rid, args...) = fetch(lookup_ref(rid).c, args...) +fetch_ref(rid, args...; role=:default) = fetch(lookup_ref(rid; role = role).c, #=role=role,=# args...) + + """ fetch(c::RemoteChannel) @@ -653,7 +695,10 @@ fetch_ref(rid, args...) = fetch(lookup_ref(rid).c, args...) Wait for and get a value from a [`RemoteChannel`](@ref). Exceptions raised are the same as for a [`Future`](@ref). Does not remove the item fetched. """ -fetch(r::RemoteChannel, args...) = call_on_owner(fetch_ref, r, args...)::eltype(r) +fetch(r::RemoteChannel, args...; role= :default) = call_on_owner(fetch_ref, r, args...; role = role)::eltype(r) +#fetch(r::RemoteChannel, args...; role= :default) = call_on_owner((rid, args...; role=role) -> fetch_ref(rid, args...;role=role), r, args...; role = role)::eltype(r) + + isready(rv::RemoteValue, args...) = isready(rv.c, args...) @@ -666,19 +711,19 @@ A `put!` on an already set `Future` throws an `Exception`. All asynchronous remote calls return `Future`s and set the value to the return value of the call upon completion. 
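
# Examples

A minimal local example (the `Future` is owned by the calling process, so no worker
is needed):

```julia-repl
julia> f = Future();

julia> put!(f, 10);

julia> fetch(f)
10
```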
""" -function put!(r::Future, v) - if r.where == myid() +function put!(r::Future, v; role= :default) + if r.where == myid(role = role) rid = remoteref_id(r) - rv = lookup_ref(rid) + rv = lookup_ref(rid; role = role) isready(rv) && error("Future can be set only once") @lock r.lock begin put!(rv, v) # this notifies the tasks waiting on the channel in fetch set_future_cache(r, v) # set the cache before leaving the lock, so that the notified tasks already see it cached end - del_client(rid, myid()) + del_client(rid, myid(role = role); role = role) else @lock r.lock begin # same idea as above if there were any local tasks fetching on this Future - call_on_owner(put_future, r, v, myid()) + call_on_owner(put_future, r, v, myid(role = role); role = role) set_future_cache(r, v) end end @@ -690,21 +735,21 @@ function set_future_cache(r::Future, v) ok || error("internal consistency error detected for Future") end -function put_future(rid, v, caller) - rv = lookup_ref(rid) +function put_future(rid, v, caller; role= :default) + rv = lookup_ref(rid; role = role) isready(rv) && error("Future can be set only once") put!(rv, v) # The caller has the value and hence can be removed from the remote store. - del_client(rid, caller) + del_client(rid, caller; role = role) nothing end put!(rv::RemoteValue, args...) = put!(rv.c, args...) -function put_ref(rid, caller, args...) - rv = lookup_ref(rid) +function put_ref(rid, caller, args...; role= :default) + rv = lookup_ref(rid; role = role) put!(rv, args...) - if myid() == caller && rv.synctake !== nothing + if myid(role = role) == caller && rv.synctake !== nothing # Wait till a "taken" value is serialized out - github issue #29932 lock(rv.synctake) unlock(rv.synctake) @@ -719,15 +764,17 @@ Store a set of values to the [`RemoteChannel`](@ref). If the channel is full, blocks until space is available. Return the first argument. """ -put!(rr::RemoteChannel, args...) = (call_on_owner(put_ref, rr, myid(), args...); rr) +put!(rr::RemoteChannel, args...; role= :default) = (call_on_owner(put_ref, rr, myid(role = role), args...; role = role); rr) +#put!(rr::RemoteChannel, args...; role= :default) = (call_on_owner((rid, caller, args...; role=role) -> put_ref(rid, caller, args...; role=role), rr, myid(role = role), args...; role = role); rr) + # take! is not supported on Future take!(rv::RemoteValue, args...) = take!(rv.c, args...) -function take_ref(rid, caller, args...) - rv = lookup_ref(rid) +function take_ref(rid, caller, args...; role=:default) + rv = lookup_ref(rid; role = role) synctake = false - if myid() != caller && rv.synctake !== nothing + if myid(role = role) != caller && rv.synctake !== nothing # special handling for local put! / remote take! on unbuffered channel # github issue #29932 synctake = true @@ -743,7 +790,7 @@ function take_ref(rid, caller, args...) rethrow(e) end - isa(v, RemoteException) && (myid() == caller) && throw(v) + isa(v, RemoteException) && (myid(role = role) == caller) && throw(v) if synctake return SyncTake(v, rv) @@ -758,31 +805,35 @@ end Fetch value(s) from a [`RemoteChannel`](@ref) `rr`, removing the value(s) in the process. """ -take!(rr::RemoteChannel, args...) 
= call_on_owner(take_ref, rr, myid(), args...)::eltype(rr) +#take!(rr::RemoteChannel, args...; role= :default) = call_on_owner((rid, caller, args...; role=role) -> take_ref(rid, caller, args...; role=role), rr, myid(role = role), args...; role = role)::eltype(rr) +take!(rr::RemoteChannel, args...; role= :default) = call_on_owner(take_ref, rr, myid(role = role), args...; role = role)::eltype(rr) # close and isopen are not supported on Future -close_ref(rid) = (close(lookup_ref(rid).c); nothing) -close(rr::RemoteChannel) = call_on_owner(close_ref, rr) +close_ref(rid; role= :default) = (close(lookup_ref(rid; role = role).c); nothing) +close(rr::RemoteChannel; role= :default) = call_on_owner(close_ref, rr; role = role) + +isopen_ref(rid; role= :default) = isopen(lookup_ref(rid; role = role).c) +isopen(rr::RemoteChannel; role= :default) = call_on_owner(isopen_ref, rr; role = role) -isopen_ref(rid) = isopen(lookup_ref(rid).c) -isopen(rr::RemoteChannel) = call_on_owner(isopen_ref, rr) +isempty_ref(rid; role= :default) = isempty(lookup_ref(rid; role = role).c) +Base.isempty(rr::RemoteChannel; role= :default) = call_on_owner(isempty_ref, rr; role=role) -getindex(r::RemoteChannel) = fetch(r) -getindex(r::Future) = fetch(r) +getindex(r::RemoteChannel; role= :default) = fetch(r; role = role) +getindex(r::Future; role= :default) = fetch(r; role = role) -getindex(r::Future, args...) = getindex(fetch(r), args...) -function getindex(r::RemoteChannel, args...) - if r.where == myid() - return getindex(fetch(r), args...) +getindex(r::Future, args...; role= :default) = getindex(fetch(r; role = role), args...#=; role = role=#) +function getindex(r::RemoteChannel, args...; role= :default) + if r.where == myid(role = role) + return getindex(fetch(r; role = role), args...#=; role = role=#) end - return remotecall_fetch(getindex, r.where, r, args...) + return remotecall_fetch((r,role) -> getindex(r, role = role, args...), r.where, r, r.where == 1 ? 
:master : :worker; role = role) end -function iterate(c::RemoteChannel, state=nothing) - if isopen(c) || isready(c) +function iterate(c::RemoteChannel, state=nothing; role= :default) + if isopen(c; role = role) || isready(c; role = role) try - return (take!(c), nothing) + return (take!(c; role=role), nothing) catch e if isa(e, InvalidStateException) || (isa(e, RemoteException) && diff --git a/src/workerpool.jl b/src/workerpool.jl index 5dd1c07..bb66245 100644 --- a/src/workerpool.jl +++ b/src/workerpool.jl @@ -8,6 +8,7 @@ An `AbstractWorkerPool` should implement: - [`push!`](@ref) - add a new worker to the overall pool (available + busy) - [`put!`](@ref) - put back a worker to the available pool - [`take!`](@ref) - take a worker from the available pool (to be used for remote function execution) + - [`wait`](@ref) - block until a worker is available - [`length`](@ref) - number of workers available in the overall pool - [`isready`](@ref) - return false if a `take!` on the pool would block, else true @@ -26,9 +27,9 @@ mutable struct WorkerPool <: AbstractWorkerPool WorkerPool(c::Channel, ref::RemoteChannel) = new(c, Set{Int}(), ref) end -function WorkerPool() - wp = WorkerPool(Channel{Int}(typemax(Int)), RemoteChannel()) - put!(wp.ref, WeakRef(wp)) +function WorkerPool(; role= :default) + wp = WorkerPool(Channel{Int}(typemax(Int)), RemoteChannel(role = role)) + put!(wp.ref, WeakRef(wp), role=role) wp end @@ -48,8 +49,8 @@ julia> WorkerPool(2:4) WorkerPool(Channel{Int64}(sz_max:9223372036854775807,sz_curr:2), Set([4, 2, 3]), RemoteChannel{Channel{Any}}(1, 1, 7)) ``` """ -function WorkerPool(workers::Union{Vector{Int},AbstractRange{Int}}) - pool = WorkerPool() +function WorkerPool(workers::Union{Vector{Int},AbstractRange{Int}}; role= :default) + pool = WorkerPool(role = role) foreach(w->push!(pool, w), workers) return pool end @@ -57,22 +58,22 @@ end # On workers where this pool has been serialized to, instantiate with a dummy local channel. WorkerPool(ref::RemoteChannel) = WorkerPool(Channel{Int}(1), ref) -function serialize(S::AbstractSerializer, pool::WorkerPool) +function serialize(S::AbstractSerializer, pool::WorkerPool; role = :default) # Allow accessing a worker pool from other processors. When serialized, # initialize the `ref` to point to self and only send the ref. # Other workers will forward all put!, take!, calls to the process owning # the ref (and hence the pool). Serialization.serialize_type(S, typeof(pool)) - serialize(S, pool.ref) + serialize(S, pool.ref; role = role) end deserialize(S::AbstractSerializer, t::Type{T}) where {T<:WorkerPool} = T(deserialize(S)) -wp_local_push!(pool::AbstractWorkerPool, w::Int) = (push!(pool.workers, w); put!(pool.channel, w); pool) -wp_local_length(pool::AbstractWorkerPool) = length(pool.workers) -wp_local_isready(pool::AbstractWorkerPool) = isready(pool.channel) +wp_local_push!(pool::AbstractWorkerPool, w::Int; role= :default) = (push!(pool.workers, w); put!(pool.channel, w); pool) +wp_local_length(pool::AbstractWorkerPool; role= :default) = length(pool.workers) +wp_local_isready(pool::AbstractWorkerPool; role= :default) = isready(pool.channel) # pool.channel::Channel{Int} -function wp_local_put!(pool::AbstractWorkerPool, w::Int) +function wp_local_put!(pool::AbstractWorkerPool, w::Int; role= :default) # In case of default_worker_pool, the master is implicitly considered a worker, i.e., # it is not present in pool.workers. # Confirm the that the worker is part of a pool before making it available. 
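
As a concrete illustration of the interface listed in the `AbstractWorkerPool` docstring above, here is a deliberately minimal pool type. It is only a sketch, not part of the patch; the field names mirror `WorkerPool` for familiarity, and the ignored `role` keyword is accepted only because this fork threads it through every pool operation:

```julia
using Distributed

# A toy AbstractWorkerPool: a channel of available worker ids plus the set of
# all ids ever added. The six methods below are the ones the docstring asks for.
struct ToyPool <: AbstractWorkerPool
    channel::Channel{Int}
    workers::Set{Int}
    ToyPool() = new(Channel{Int}(typemax(Int)), Set{Int}())
end

Base.push!(p::ToyPool, w::Int; role = :default) = (push!(p.workers, w); put!(p.channel, w); p)
Base.put!(p::ToyPool, w::Int; role = :default)  = (put!(p.channel, w); p)
Base.take!(p::ToyPool; role = :default)         = take!(p.channel)
Base.wait(p::ToyPool; role = :default)          = wait(p.channel)
Base.length(p::ToyPool; role = :default)        = length(p.workers)
Base.isready(p::ToyPool; role = :default)       = isready(p.channel)

# Usage: pool = ToyPool(); foreach(w -> push!(pool, w), workers());
#        remotecall_fetch(myid, pool)
```
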
@@ -80,28 +81,28 @@ function wp_local_put!(pool::AbstractWorkerPool, w::Int) w end -function wp_local_workers(pool::AbstractWorkerPool) - if length(pool) == 0 && pool === default_worker_pool() +function wp_local_workers(pool::AbstractWorkerPool; role= :default) + if length(pool) == 0 && pool === default_worker_pool(role=role) return [1] else return collect(pool.workers) end end -function wp_local_nworkers(pool::AbstractWorkerPool) - if length(pool) == 0 && pool === default_worker_pool() +function wp_local_nworkers(pool::AbstractWorkerPool; role= :default) + if length(pool) == 0 && pool === default_worker_pool(role=role) return 1 else return length(pool.workers) end end -function wp_local_take!(pool::AbstractWorkerPool) +function wp_local_take!(pool::AbstractWorkerPool; role= :default) # Find an active worker worker = 0 while true if length(pool) == 0 - if pool === default_worker_pool() + if pool === default_worker_pool(role=role) # No workers, the master process is used as a worker worker = 1 break @@ -120,48 +121,74 @@ function wp_local_take!(pool::AbstractWorkerPool) return worker end -function remotecall_pool(rc_f, f, pool::AbstractWorkerPool, args...; kwargs...) +function wp_local_wait(pool::AbstractWorkerPool) + wait(pool.channel) + return nothing +end + +function remotecall_pool(rc_f, f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) + worker = take!(pool; role=role) + try + rc_f(f, worker, role=role, args...; kwargs...) + finally + put!(pool, worker; role = role) + end +end + +# Specialization for remotecall. We have to wait for the Future it returns +# before putting the worker back in the pool. +function remotecall_pool(rc_f::typeof(remotecall), f, pool::AbstractWorkerPool, args...; kwargs...) worker = take!(pool) + local x try - rc_f(f, worker, args...; kwargs...) + x = rc_f(f, worker, args...; kwargs...) + catch + put!(pool, worker) + rethrow() + end + t = Threads.@spawn Threads.threadpool() try + wait(x) + catch # just wait, ignore errors here finally put!(pool, worker) end + errormonitor(t) + return x end # Check if pool is local or remote and forward calls if required. # NOTE: remotecall_fetch does it automatically, but this will be more efficient as # it avoids the overhead associated with a local remotecall. -for (func, rt) = ((:length, Int), (:isready, Bool), (:workers, Vector{Int}), (:nworkers, Int), (:take!, Int)) +for (func, rt) = ((:length, Int), (:isready, Bool), (:workers, Vector{Int}), (:nworkers, Int), (:take!, Int), (:wait, Nothing)) func_local = Symbol(string("wp_local_", func)) @eval begin - function ($func)(pool::WorkerPool) - if pool.ref.where != myid() - return remotecall_fetch(ref->($func_local)(fetch(ref).value), pool.ref.where, pool.ref)::$rt + function ($func)(pool::WorkerPool; role= :default) + if pool.ref.where != myid(role = role) + return remotecall_fetch((ref, role)->(($func_local)(fetch(ref; role=role).value; role = role)), pool.ref.where, pool.ref, pool.ref.where == 1 ? :master : :worker; role = role)::$rt else - return ($func_local)(pool) + return ($func_local)(pool; role = role) end end # default impl - ($func)(pool::AbstractWorkerPool) = ($func_local)(pool) + ($func)(pool::AbstractWorkerPool; role= :default) = ($func_local)(pool; role = role) end end for func = (:push!, :put!) 
    func_local = Symbol(string("wp_local_", func))
    @eval begin
-    function ($func)(pool::WorkerPool, w::Int)
-        if pool.ref.where != myid()
-            return remotecall_fetch((ref, w)->($func_local)(fetch(ref).value, w), pool.ref.where, pool.ref, w)
+    function ($func)(pool::WorkerPool, w::Int; role= :default)
+        if pool.ref.where != myid(role = role)
+            return remotecall_fetch((ref, w, role)->(($func_local)(fetch(ref; role = role).value, w; role = role)), pool.ref.where, pool.ref, w, pool.ref.where == 1 ? :master : :worker; role = role)
         else
-            return ($func_local)(pool, w)
+            return ($func_local)(pool, w; role = role)
         end
     end

     # default impl
-    ($func)(pool::AbstractWorkerPool, w::Int) = ($func_local)(pool, w)
+    ($func)(pool::AbstractWorkerPool, w::Int; role= :default) = ($func_local)(pool, w; role = role)
 end
 end

@@ -184,6 +211,7 @@ Future(2, 1, 6, nothing)
```

In this example, the task ran on pid 2, called from pid 1.
"""
+#remotecall(f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) = remotecall_pool((f, pool) -> remotecall(f, pool, role=role, args...; kwargs...); role=role)
 remotecall(f, pool::AbstractWorkerPool, args...; kwargs...) = remotecall_pool(remotecall, f, pool, args...; kwargs...)

@@ -208,6 +236,7 @@ julia> fetch(f)
 0.9995177101692958
```
"""
+#remotecall_wait(f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) = remotecall_pool((f,pool) -> remotecall_wait(f, pool, role = role, args...; kwargs...); role=role) # TO CHECK (unsure about "role = role")
 remotecall_wait(f, pool::AbstractWorkerPool, args...; kwargs...) = remotecall_pool(remotecall_wait, f, pool, args...; kwargs...)

@@ -229,14 +258,21 @@ julia> remotecall_fetch(maximum, wp, A)
 0.9995177101692958
```
"""
+#remotecall_fetch(f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) = remotecall_pool((f,pool)->remotecall_fetch(f, pool, role = role, args...; kwargs...), f, pool; role = role) # TO CHECK (unsure about the first "role = role")
 remotecall_fetch(f, pool::AbstractWorkerPool, args...; kwargs...) = remotecall_pool(remotecall_fetch, f, pool, args...; kwargs...)
+#remotecall_fetch(f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) = remotecall_pool((f,pool)->remotecall_fetch((p, args...) -> f(p, args...), pool, args...; role = role, kwargs...), f, pool; role = role) # TO CHECK (unsure about the first "role = role")

"""
    remote_do(f, pool::AbstractWorkerPool, args...; kwargs...) -> nothing

[`WorkerPool`](@ref) variant of `remote_do(f, pid, ....)`. Wait for and take a free worker from `pool` and
perform a `remote_do` on it.
+
+Note that it's not possible to wait for the result of a `remote_do()` to finish
+so the worker will immediately be put back in the pool (i.e. potentially causing
+oversubscription).
"""
+#remote_do(f, pool::AbstractWorkerPool, args...; role= :default, kwargs...) = remotecall_pool((f,pool) -> remote_do(f, pool, role = role, args...; kwargs...); role = role)
 remote_do(f, pool::AbstractWorkerPool, args...; kwargs...) = remotecall_pool(remote_do, f, pool, args...; kwargs...)
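
To make the trade-off described in the `remote_do` note concrete, a short sketch using the standard pool API (the worker count is illustrative):

```julia
using Distributed
nprocs() == 1 && addprocs(2)
wp = WorkerPool(workers())

# remotecall_fetch keeps the worker checked out until the result is back,
# so at most length(wp) of these run concurrently:
squares = [remotecall_fetch(x -> x^2, wp, i) for i in 1:4]

# remote_do has no result to wait on, so the worker goes straight back into
# the pool and a burst of calls may oversubscribe it:
for i in 1:4
    remote_do(println, wp, "fire-and-forget $i")
end
```
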
const _default_worker_pool = Ref{Union{AbstractWorkerPool, Nothing}}(nothing) @@ -256,14 +292,14 @@ julia> default_worker_pool() WorkerPool(Channel{Int64}(sz_max:9223372036854775807,sz_curr:3), Set([4, 2, 3]), RemoteChannel{Channel{Any}}(1, 1, 4)) ``` """ -function default_worker_pool() +function default_worker_pool(;role=:default) # On workers retrieve the default worker pool from the master when accessed # for the first time if _default_worker_pool[] === nothing - if myid() == 1 - _default_worker_pool[] = WorkerPool() + if myid(role=role) == 1 + _default_worker_pool[] = WorkerPool(role = role) else - _default_worker_pool[] = remotecall_fetch(()->default_worker_pool(), 1) + _default_worker_pool[] = remotecall_fetch(role->default_worker_pool(role = role), 1, :master; role=role) end end return _default_worker_pool[] @@ -284,8 +320,8 @@ end Return an anonymous function that executes function `f` on an available worker (drawn from [`WorkerPool`](@ref) `p` if provided) using [`remotecall_fetch`](@ref). """ -remote(f) = (args...; kwargs...)->remotecall_fetch(f, default_worker_pool(), args...; kwargs...) -remote(p::AbstractWorkerPool, f) = (args...; kwargs...)->remotecall_fetch(f, p, args...; kwargs...) +remote(f; role= :default) = (args...; kwargs...)->remotecall_fetch(f, default_worker_pool(role=role), args...; role=role, kwargs...) +remote(p::AbstractWorkerPool, f; role= :default) = (args...; kwargs...)->remotecall_fetch(f, p, args...; role=role, kwargs...) mutable struct CachingPool <: AbstractWorkerPool channel::Channel{Int} @@ -351,20 +387,44 @@ function clear!(pool::CachingPool) pool end -exec_from_cache(rr::RemoteChannel, args...; kwargs...) = fetch(rr)(args...; kwargs...) -function exec_from_cache(f_ref::Tuple{Function, RemoteChannel}, args...; kwargs...) +exec_from_cache(rr::RemoteChannel, args...; role= :default, kwargs...) = fetch(rr; role = role)(args...; kwargs...) +function exec_from_cache(f_ref::Tuple{Function, RemoteChannel}, args...; role= :default, kwargs...) put!(f_ref[2], f_ref[1]) # Cache locally f_ref[1](args...; kwargs...) end -function remotecall_pool(rc_f, f, pool::CachingPool, args...; kwargs...) - worker = take!(pool) - f_ref = get(pool.map_obj2ref, (worker, f), (f, RemoteChannel(worker))) +function remotecall_pool(rc_f, f, pool::CachingPool, args...; role= :default, kwargs...) + worker = take!(pool; role=role) + f_ref = get(pool.map_obj2ref, (worker, f), (f, RemoteChannel(worker; role=role))) isa(f_ref, Tuple) && (pool.map_obj2ref[(worker, f)] = f_ref[2]) # Add to tracker try - rc_f(exec_from_cache, worker, f_ref, args...; kwargs...) + rc_f(exec_from_cache, worker, f_ref, args...; role=role, kwargs...) finally - put!(pool, worker) + put!(pool, worker; role=role) end end + + +# Specialization for remotecall. We have to wait for the Future it returns +# before putting the worker back in the pool. +function remotecall_pool(rc_f::typeof(remotecall), f, pool::CachingPool, args...; role= :default, kwargs...) + worker = take!(pool; role=role) + f_ref = get(pool.map_obj2ref, (worker, f), (f, RemoteChannel(worker; role=role))) + isa(f_ref, Tuple) && (pool.map_obj2ref[(worker, f)] = f_ref[2]) # Add to tracker + local x + try + x = rc_f(exec_from_cache, worker, f_ref, args...; role=role, kwargs...) 
+ catch + put!(pool, worker; role=role) + rethrow() + end + t = Threads.@spawn Threads.threadpool() try + wait(x) + catch # just wait, ignore errors here + finally + put!(pool, worker; role=role) + end + errormonitor(t) + return x +end \ No newline at end of file diff --git a/test/aqua.jl b/test/aqua.jl new file mode 100644 index 0000000..56c01c5 --- /dev/null +++ b/test/aqua.jl @@ -0,0 +1,8 @@ +using Aqua +using Distributed +Aqua.test_all( + Distributed, + # This should be excluded, but it's not clear how to do that on Aqua's API + # given it's not-defined. (The Julia Base ambiguity test does it something like this) + # ambiguities=(exclude=[GlobalRef(Distributed, :cluster_manager)]) +) \ No newline at end of file diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl index 7b5c983..63a00cc 100644 --- a/test/distributed_exec.jl +++ b/test/distributed_exec.jl @@ -3,12 +3,7 @@ using Test, Distributed, Random, Serialization, Sockets import Distributed: launch, manage -sharedir = normpath(joinpath(Sys.BINDIR, "..", "share")) -if parse(Bool, get(ENV, "JULIA_DISTRIBUTED_TESTING_STANDALONE", "false")) - @test !startswith(pathof(Distributed), sharedir) -else - @test startswith(pathof(Distributed), sharedir) -end +pathsep = Sys.iswindows() ? ";" : ":" @test cluster_cookie() isa String @@ -27,7 +22,7 @@ include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "testenv.jl")) addprocs_with_testenv(4) @test nprocs() == 5 -# distributed loading of packages +# Distributed loading of packages # setup @everywhere begin @@ -52,6 +47,7 @@ end id_me = myid() id_other = filter(x -> x != id_me, procs())[rand(1:(nprocs()-1))] + # Test role @everywhere using Distributed @test Distributed.myrole() === :master @@ -62,6 +58,9 @@ for wid = workers() @test wrole === :worker end +#sleep(3) + + # Test remote() let pool = default_worker_pool() @@ -79,17 +78,27 @@ let yield() end +# @info nworkers() +# sleep(30) + testchannels = [RemoteChannel() for i in 1:nworkers()] + # @info testchannels + # sleep(30) testcount = 0 @test isready(pool) == true for c in testchannels @test count == testcount +# @info c remote_wait(c) testcount += 1 end @test count == testcount @test isready(pool) == false + #sleep(3) + + try + for c in testchannels @test count == testcount put!(c, "foo") @@ -99,8 +108,14 @@ let @test isready(pool) == true end + catch e + @info e + end + @test count == 0 + #sleep(3) + for c in testchannels @test count == testcount remote_wait(c) @@ -109,6 +124,8 @@ let @test count == testcount @test isready(pool) == false + #sleep(3) + for c in reverse(testchannels) @test count == testcount put!(c, "foo") @@ -118,9 +135,14 @@ let @test isready(pool) == true end + #sleep(3) + @test count == 0 end +#sleep(3) + + # Test Futures function testf(id) f=Future(id) @@ -151,48 +173,27 @@ function poll_while(f::Function; timeout_seconds::Integer = 120) return true end -function _getenv_include_thread_unsafe() - environment_variable_name = "JULIA_TEST_INCLUDE_THREAD_UNSAFE" - default_value = "false" - environment_variable_value = strip(get(ENV, environment_variable_name, default_value)) - b = parse(Bool, environment_variable_value)::Bool - return b -end -const _env_include_thread_unsafe = _getenv_include_thread_unsafe() -function include_thread_unsafe_tests() - if Threads.maxthreadid() > 1 - if _env_include_thread_unsafe - return true - end - msg = "Skipping a thread-unsafe test because `Threads.maxthreadid() > 1`" - @warn msg Threads.maxthreadid() - Test.@test_broken false - return false - end - return true -end - # 
Distributed GC tests for Futures function test_futures_dgc(id) f = remotecall(myid, id) fid = remoteref_id(f) # remote value should be deleted after a fetch - @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, fid) == true + @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, fid) == true @test f.v === nothing @test fetch(f) == id @test f.v !== nothing yield(); # flush gc msgs - @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, fid)) + @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, fid)) # if unfetched, it should be deleted after a finalize f = remotecall(myid, id) fid = remoteref_id(f) - @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, fid) == true + @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, fid) == true @test f.v === nothing finalize(f) yield(); # flush gc msgs - @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, fid)) + @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, fid)) end test_futures_dgc(id_me) @@ -208,48 +209,63 @@ fstore = RemoteChannel(wid2) put!(fstore, f) @test fetch(f) == wid1 -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == true +@test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == true remotecall_fetch(r->(fetch(fetch(r)); yield()), wid2, fstore) sleep(0.5) # to ensure that wid2 gc messages have been executed on wid1 -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == false +@test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == false # put! should release remote reference since it would have been cached locally f = Future(wid1) fid = remoteref_id(f) # should not be created remotely till accessed -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == false +@test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == false # create it remotely isready(f) -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == true +@test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == true put!(f, :OK) -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == false +@test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == false @test fetch(f) === :OK # RemoteException should be thrown on a put! when another process has set the value -f = Future(wid1) -fid = remoteref_id(f) - -fstore = RemoteChannel(wid2) -put!(fstore, f) # send f to wid2 -put!(f, :OK) # set value from master - -@test remotecall_fetch(k->haskey(Distributed.PGRP.refs, k), wid1, fid) == true - -testval = remotecall_fetch(wid2, fstore) do x - try - put!(fetch(x), :OK) - return 0 - catch e - if isa(e, RemoteException) - return 1 - else - return 2 +# Test this multiple times as races have been seen where `@spawn` was used over +# `@async`. Issue #124 +max_attempts = 100 +for i in 1:max_attempts + let f = Future(wid1), fid = remoteref_id(f), fstore = RemoteChannel(wid2) + # RemoteException should be thrown on a put! when another process has set the value + + put!(fstore, f) # send f to wid2 + put!(f, :OK) # set value from master + + @test remotecall_fetch(k->haskey(Distributed.PGRP().refs, k), wid1, fid) == true + + # fstore should be ready immediately, but races due to use of `@spawn` have caused + # this to fail in the past. 
So we poll for readiness before the main test after this + # which internally checks for `isready` to decide whether to error or not + w = remotecall_fetch(wid2, fstore) do x + timedwait(() -> isready(fetch(x)), 10) end + w == :ok || @info "isready timed out on attempt $i (max $max_attempts)" + @test w == :ok + # This is the actual test. It should fail because the value is already set remotely + testval = remotecall_fetch(wid2, fstore) do x + try + put!(fetch(x), :OK) + return 0 + catch e + if isa(e, RemoteException) + return 1 + else + rethrow() + end + end + end + testval == 1 || @info "test failed on attempt $i (max $max_attempts)" + @test testval == 1 end end -@test testval == 1 # Issue number #25847 @everywhere function f25847(ref) @@ -260,14 +276,15 @@ end f = remotecall_wait(identity, id_other, ones(10)) rrid = Distributed.RRID(f.whence, f.id) remotecall_fetch(f25847, id_other, f) -@test BitSet([id_me]) == remotecall_fetch(()->Distributed.PGRP.refs[rrid].clientset, id_other) +@test BitSet([id_me]) == remotecall_fetch(()->Distributed.PGRP().refs[rrid].clientset, id_other) remotecall_fetch(f25847, id_other, f) -@test BitSet([id_me]) == remotecall_fetch(()->Distributed.PGRP.refs[rrid].clientset, id_other) +@test BitSet([id_me]) == remotecall_fetch(()->Distributed.PGRP().refs[rrid].clientset, id_other) finalize(f) yield() # flush gc msgs -@test poll_while(() -> remotecall_fetch(chk_rrid->(yield(); haskey(Distributed.PGRP.refs, chk_rrid)), id_other, rrid)) +@test poll_while(() -> remotecall_fetch(chk_rrid->(yield(); haskey(Distributed.PGRP().refs, chk_rrid)), id_other, rrid)) + # Distributed GC tests for RemoteChannels function test_remoteref_dgc(id) @@ -276,12 +293,12 @@ function test_remoteref_dgc(id) rrid = remoteref_id(rr) # remote value should be deleted after finalizing the ref - @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, rrid) == true + @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, rrid) == true @test fetch(rr) === :OK - @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, rrid) == true + @test remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, rrid) == true finalize(rr) yield(); # flush gc msgs - @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP.refs, k)), id, rrid)) + @test poll_while(() -> remotecall_fetch(k->(yield();haskey(Distributed.PGRP().refs, k)), id, rrid)) end test_remoteref_dgc(id_me) test_remoteref_dgc(id_other) @@ -294,17 +311,19 @@ let wid1 = workers()[1], fstore = RemoteChannel(wid2) put!(fstore, rr) - if include_thread_unsafe_tests() - @test remotecall_fetch(k -> haskey(Distributed.PGRP.refs, k), wid1, rrid) == true - end + + # timedwait() is necessary because wid1 is asynchronously informed of + # the existence of rr/rrid through the call to `put!(fstore, rr)`. 
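
The same polling idiom, reduced to its essentials: state delivered by an asynchronous message is checked with `timedwait` rather than assumed to be visible immediately. In this sketch a `RemoteChannel` filled from a background task stands in for the reference-tracking messages the comment refers to:

```julia
using Distributed
nprocs() == 1 && addprocs(1)
w = first(workers())

# The flag only becomes ready after an asynchronous step completes.
flag = RemoteChannel(() -> Channel{Bool}(1), w)
@async (sleep(0.5); put!(flag, true))

# Poll until the asynchronously produced state is observable, up to 10 seconds.
@assert timedwait(() -> isready(flag), 10) === :ok
```
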
+ @test timedwait(() -> remotecall_fetch(k -> haskey(Distributed.PGRP().refs, k), wid1, rrid), 10) === :ok + finalize(rr) # finalize locally yield() # flush gc msgs - if include_thread_unsafe_tests() - @test remotecall_fetch(k -> haskey(Distributed.PGRP.refs, k), wid1, rrid) == true - end + + @test timedwait(() -> remotecall_fetch(k -> haskey(Distributed.PGRP().refs, k), wid1, rrid), 10) === :ok + remotecall_fetch(r -> (finalize(take!(r)); yield(); nothing), wid2, fstore) # finalize remotely sleep(0.5) # to ensure that wid2 messages have been executed on wid1 - @test poll_while(() -> remotecall_fetch(k -> haskey(Distributed.PGRP.refs, k), wid1, rrid)) + @test poll_while(() -> remotecall_fetch(k -> haskey(Distributed.PGRP().refs, k), wid1, rrid)) end # Tests for issue #23109 - should not hang. @@ -332,7 +351,6 @@ for i in 1:nworkers() end @test sort(pids) == sort(workers()) - # test getindex on Futures and RemoteChannels function test_indexing(rr) a = rand(5,5) @@ -473,6 +491,8 @@ test_iteration(RemoteChannel(() -> Channel(10)), RemoteChannel(() -> Channel(10) return count end +@everywhere test_iteration_collect(ch) = length(collect(ch)) + @everywhere function test_iteration_put(ch, total) for i in 1:total put!(ch, i) @@ -483,10 +503,27 @@ end let ch = RemoteChannel(() -> Channel(1)) @async test_iteration_put(ch, 10) @test 10 == @fetchfrom id_other test_iteration_take(ch) + ch = RemoteChannel(() -> Channel(1)) + @async test_iteration_put(ch, 10) + @test 10 == @fetchfrom id_other test_iteration_collect(ch) # now reverse ch = RemoteChannel(() -> Channel(1)) @spawnat id_other test_iteration_put(ch, 10) @test 10 == test_iteration_take(ch) + ch = RemoteChannel(() -> Channel(1)) + @spawnat id_other test_iteration_put(ch, 10) + @test 10 == test_iteration_collect(ch) +end + +# Test isempty(::RemoteChannel). This should not modify the underlying +# AbstractChannel, which Base's default implementation will do. +let + chan = Channel(1) + push!(chan, 1) + remotechan = RemoteChannel(() -> chan) + @test !isempty(remotechan) + # Calling `isempty(remotechan)` shouldn't have modified `chan` + @test !isempty(chan) end # make sure exceptions propagate when waiting on Tasks @@ -567,7 +604,7 @@ let ex end # pmap tests. Needs at least 4 processors dedicated to the below tests. Which we currently have -# since the distributed tests are now spawned as a separate set. +# since the Distributed tests are now spawned as a separate set. # Test all combinations of pmap keyword args. pmap_args = [ @@ -660,7 +697,6 @@ generic_map_tests(pmap_fallback) run_map_equivalence_tests(pmap) @test pmap(uppercase, "Hello World!") == map(uppercase, "Hello World!") - # Simple test for pmap throws error let error_thrown = false try @@ -700,10 +736,36 @@ wp = WorkerPool(workers()) @test nworkers() == length(unique(remotecall_fetch(wp->pmap(_->myid(), wp, 1:100), id_other, wp))) wp = WorkerPool(2:3) @test sort(unique(pmap(_->myid(), wp, 1:100))) == [2,3] +@test fetch(remotecall(myid, wp)) in wp.workers +@test_throws RemoteException fetch(remotecall(error, wp)) + +# wait on worker pool +wp = WorkerPool(2:2) +w = take!(wp) + +# local call to _wait +@test !isready(wp) +t = @async wait(wp) +@test !istaskdone(t) +put!(wp, w) +status = timedwait(() -> istaskdone(t), 10) +@test status == :ok + +# remote call to _wait +take!(wp) +@test !isready(wp) +f = @spawnat w wait(wp) +@test !isready(f) +put!(wp, w) +status = timedwait(() -> isready(f), 10) +@test status == :ok + # CachingPool tests wp = CachingPool(workers()) @test [1:100...] 
== pmap(x->x, wp, 1:100) +@test fetch(remotecall(myid, wp)) in wp.workers +@test_throws RemoteException fetch(remotecall(error, wp)) clear!(wp) @test length(wp.map_obj2ref) == 0 @@ -742,7 +804,7 @@ if DoFullTest all_w = workers() # Test sending fake data to workers. The worker processes will print an # error message but should not terminate. - for w in Distributed.PGRP.workers + for w in Distributed.PGRP().workers if isa(w, Distributed.Worker) local s = connect(w.config.host, w.config.port) write(s, randstring(32)) @@ -769,6 +831,7 @@ if Sys.isunix() # aka have ssh remotecall_fetch(rmprocs, 1, new_pids) end + print("\n\nTesting SSHManager. A minimum of 4GB of RAM is recommended.\n") print("Please ensure: \n") print("1) sshd is running locally with passwordless login enabled.\n") @@ -887,7 +950,6 @@ v15406 = remotecall_wait(() -> 1, id_other) fetch(v15406) remotecall_wait(fetch, id_other, v15406) - # issue #43396 # Covers the remote fetch where the value returned is `nothing` # May be caused by attempting to unwrap a non-`Some` type with `something` @@ -896,7 +958,6 @@ remotecall_wait(fetch, id_other, v15406) @test nothing === fetch(remotecall(() -> nothing, workers()[1])) @test 10 === fetch(remotecall(() -> 10, workers()[1])) - # Test various forms of remotecall* invocations @everywhere f_args(v1, v2=0; kw1=0, kw2=0) = v1+v2+kw1+kw2 @@ -918,15 +979,16 @@ for tid in [id_other, id_me, default_worker_pool()] test_f_args(15, f_args, tid, 1, 2; kw1=4, kw2=8) end -# Test remote_do -f=Future(id_me) -remote_do(fut->put!(fut, myid()), id_me, f) -@test fetch(f) == id_me f=Future(id_other) remote_do(fut->put!(fut, myid()), id_other, f) @test fetch(f) == id_other +# Test remote_do +f=Future(id_me) +remote_do(fut->put!(fut, myid()), id_me, f) +@test fetch(f) == id_me + # Github issue #29932 rc_unbuffered = RemoteChannel(()->Channel{Vector{Float64}}(0)) @test eltype(rc_unbuffered) == Vector{Float64} @@ -966,33 +1028,32 @@ end # issue #16091 mutable struct T16091 end -wid = workers()[1] -try - remotecall_fetch(()->T16091, wid) - @test "unreachable" === true +wid0 = workers()[1] +@test try + remotecall_fetch(()->T16091, wid0) + false catch ex - ex = ((ex::RemoteException).captured::CapturedException).ex - @test (ex::UndefVarError).var === :T16091 + @info "----------------- $(((ex::RemoteException).captured::CapturedException).ex)" + ((ex::RemoteException).captured::CapturedException).ex === UndefVarError(:T16091, Main) end -try - remotecall_fetch(identity, wid, T16091) - @test "unreachable" === true +@test try + remotecall_fetch(identity, wid0, T16091) + false catch ex - ex = ((ex::RemoteException).captured::CapturedException).ex - @test (ex::UndefVarError).var === :T16091 + ((ex::RemoteException).captured::CapturedException).ex === UndefVarError(:T16091, Main) end f16091a() = 1 -remotecall_fetch(()->eval(:(f16091a() = 2)), wid) -@test remotecall_fetch(f16091a, wid) === 2 -@test remotecall_fetch((myid)->remotecall_fetch(f16091a, myid), wid, myid()) === 1 +remotecall_fetch(()->eval(:(f16091a() = 2)), wid0) +@test remotecall_fetch(f16091a, wid0) === 2 +@test remotecall_fetch((myid)->remotecall_fetch(f16091a, myid), wid0, myid()) === 1 # these will only heisen-fail, since it depends on the gensym counter collisions: f16091b = () -> 1 -remotecall_fetch(()->eval(:(f16091b = () -> 2)), wid) +remotecall_fetch(()->eval(:(f16091b = () -> 2)), wid0) @test remotecall_fetch(f16091b, 2) === 1 # Global anonymous functions are over-written... 
-@test remotecall_fetch((myid)->remotecall_fetch(f16091b, myid), wid, myid()) === 1 +@test remotecall_fetch((myid)->remotecall_fetch(f16091b, myid), wid0, myid()) === 1 # ...while local anonymous functions are by definition, local. let @@ -1004,7 +1065,7 @@ let f16091c = () -> 2 remotecall_fetch(f16091c, myid) end - end, wid, myid()) === 2 + end, wid0, myid()) === 2 end # issue #16451 @@ -1050,6 +1111,23 @@ let @test_throws RemoteException fetch(ref) end +# Test the behaviour of remotecall(f, ::AbstractWorkerPool), this should +# keep the worker out of the pool until the underlying remotecall has +# finished. +for PoolType in (WorkerPool, CachingPool) + let + remotechan = RemoteChannel(wrkr1) + pool = PoolType([wrkr1]) + put_future = remotecall(() -> wait(remotechan), pool) + @test !isready(pool) + put!(remotechan, 1) + wait(put_future) + # The task that waits on the future to put it back into the pool runs + # asynchronously so we use timedwait() to check when the worker is back in. + @test timedwait(() -> isready(pool), 10) === :ok + end +end + # Test calling @everywhere from a module not defined on the workers module LocalBar using Distributed @@ -1113,9 +1191,9 @@ function get_remote_num_threads(processes_added) return [remotecall_fetch(BLAS.get_num_threads, proc_id) for proc_id in processes_added] end -function test_blas_config(pid, expected) - for worker in Distributed.PGRP.workers - if worker.id == pid +function test_blas_config(pid, expected; role=:default) + for worker in Distributed.PGRP(role=role).workers + if Distributed.wid(worker,role=role) == pid @test worker.config.enable_threaded_blas == expected return end @@ -1197,16 +1275,16 @@ end end # Test addprocs/rmprocs from master node only -for f in [ ()->addprocs(1; exeflags=test_exeflags), ()->rmprocs(workers()) ] - local f - try - remotecall_fetch(f, id_other) - error("Unexpected") - catch ex - @test isa(ex, RemoteException) - @test ex.captured.ex.msg == "Only process 1 can add and remove workers" - end -end +#for f in [ ()->addprocs(1; exeflags=test_exeflags), ()->rmprocs(workers()) ] +# local f +# try +# remotecall_fetch(f, id_other) +# error("Unexpected") +# catch ex +# @test isa(ex, RemoteException) +# @test ex.captured.ex.msg == "Only process 1 can add and remove workers" +# end +#end # Test the following addprocs error conditions # - invalid host name - github issue #20372 @@ -1273,7 +1351,6 @@ for (addp_testf, expected_errstr, env) in testruns end end - # Auto serialization of globals from Main. # bitstypes global v1 = 1 @@ -1341,7 +1418,6 @@ v31252 = :b v31252 = :a @test :a == @fetchfrom id_other v31252 - # Test that a global is not being repeatedly serialized when # a) referenced multiple times in the closure # b) hash value has not changed. 
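
The behaviour exercised by these global-serialization tests can be summarised in a small sketch (standard `Distributed` semantics; `w` is any worker id):

```julia
using Distributed
nprocs() == 1 && addprocs(1)
w = first(workers())

global x = 10
# A Main global captured in a closure is shipped to the worker automatically...
@assert remotecall_fetch(() -> x + 1, w) == 11

# ...and is re-sent only when its value changes, not on every call.
global x = 20
@assert remotecall_fetch(() -> x + 1, w) == 21
```
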
@@ -1440,7 +1516,7 @@ clust_ser = (Distributed.worker_from_id(id_other)).w_serializer # TODO Add test for cleanup from `clust_ser.glbs_in_tnobj` -# reported github issues - Mostly tests with globals and various distributed macros +# reported github issues - Mostly tests with globals and various Distributed macros #2669, #5390 v2669=10 @test fetch(@spawnat :any (1+v2669)) == 11 @@ -1557,8 +1633,7 @@ try catch ex @test isa(ex.captured.ex.exceptions[1].ex, ErrorException) @test occursin("BoundsError", ex.captured.ex.exceptions[1].ex.msg) - ex = ex.captured.ex.exceptions[2].ex - @test (ex::UndefVarError).var === :DontExistOn1 + @test ex.captured.ex.exceptions[2].ex == UndefVarError(:DontExistOn1) end let @@ -1682,21 +1757,21 @@ p1,p2 = addprocs_with_testenv(2) @test fill(2.,2) == remotecall_fetch(f22865, p1, p2) rmprocs(p1, p2) -function reuseport_tests() +function reuseport_tests(;role = :default) # Run the test on all processes. results = asyncmap(procs()) do p remotecall_fetch(p) do ports_lower = [] # ports of pids lower than myid() ports_higher = [] # ports of pids higher than myid() - for w in Distributed.PGRP.workers - w.id == myid() && continue + for w in Distributed.PGRP(role=role).workers + Distributed.wid(w,role=role) == myid() && continue port = Sockets._sockname(w.r_stream, true)[2] - if (w.id == 1) + if (Distributed.wid(w,role=role) == 1) # master connects to workers push!(ports_higher, port) - elseif w.id < myid() + elseif Distributed.wid(w,role=role) < myid(role=role) push!(ports_lower, port) - elseif w.id > myid() + elseif Distributed.wid(w,role=role) > myid(role=role) push!(ports_higher, port) end end @@ -1707,23 +1782,22 @@ function reuseport_tests() return 0 end end - return myid() + return myid(role=role) end end # Ensure that the code has indeed been successfully executed everywhere - @test all(in(results), procs()) + return all(in(results), procs()) end # Test that the client port is reused. SO_REUSEPORT may not be supported on # all UNIX platforms, Linux kernels prior to 3.9 and older versions of OSX @assert nprocs() == 1 addprocs_with_testenv(4; lazy=false) -if ccall(:jl_has_so_reuseport, Int32, ()) == 1 - reuseport_tests() -else - @info "SO_REUSEPORT is unsupported, skipping reuseport tests" -end + +skip_reuseexport = ccall(:jl_has_so_reuseport, Int32, ()) != 1 +skip_reuseexport && @debug "SO_REUSEPORT support missing, reuseport_tests skipped" +@test reuseport_tests() skip = skip_reuseexport # issue #27933 a27933 = :_not_defined_27933 @@ -1797,9 +1871,10 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp project = mkdir(joinpath(tmp, "project")) depots = [mkdir(joinpath(tmp, "depot1")), mkdir(joinpath(tmp, "depot2"))] load_path = [mkdir(joinpath(tmp, "load_path")), "@stdlib", "@"] - pathsep = Sys.iswindows() ? ";" : ":" + shipped_depots = DEPOT_PATH[2:end] # stdlib caches env = Dict( - "JULIA_DEPOT_PATH" => join(depots, pathsep), + # needs a trailing pathsep to access the stdlib depot + "JULIA_DEPOT_PATH" => join(depots, pathsep) * pathsep, "JULIA_LOAD_PATH" => join(load_path, pathsep), # Explicitly propagate `TMPDIR`, in the event that we're running on a # CI system where `TMPDIR` is special. 
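
The trailing-separator detail mentioned above matters because an empty entry in `JULIA_DEPOT_PATH` is expanded to the default (bundled) depots, which the workers need for the stdlib caches. A hedged sketch of the difference, with illustrative paths:

```julia
pathsep = Sys.iswindows() ? ";" : ":"
depots  = ["/tmp/depot1", "/tmp/depot2"]               # illustrative custom depots

without_default = join(depots, pathsep)                # workers would see only the two custom depots
with_default    = join(depots, pathsep) * pathsep      # trailing separator pulls in the default depots too

env = Dict("JULIA_DEPOT_PATH" => with_default)         # what gets passed down to worker processes
```
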
@@ -1829,7 +1904,7 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp end """ cmd = setenv(`$(julia) -p1 -e $(testcode * extracode)`, env) - @test success(cmd) + @test success(pipeline(cmd; stdout, stderr)) # --project extracode = """ for w in workers() @@ -1838,11 +1913,11 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp end """ cmd = setenv(`$(julia) --project=$(project) -p1 -e $(testcode * extracode)`, env) - @test success(cmd) + @test success(pipeline(cmd; stdout, stderr)) # JULIA_PROJECT cmd = setenv(`$(julia) -p1 -e $(testcode * extracode)`, (env["JULIA_PROJECT"] = project; env)) - @test success(cmd) + @test success(pipeline(cmd; stdout, stderr)) # Pkg.activate(...) activateish = """ Base.ACTIVE_PROJECT[] = $(repr(project)) @@ -1850,11 +1925,17 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp addprocs(1) """ cmd = setenv(`$(julia) -e $(activateish * testcode * extracode)`, env) - @test success(cmd) + @test success(pipeline(cmd; stdout, stderr)) # JULIA_(LOAD|DEPOT)_PATH shufflecode = """ - d = reverse(DEPOT_PATH) - append!(empty!(DEPOT_PATH), d) + function reverse_first_two(depots) + custom_depots = depots[1:2] + standard_depots = depots[3:end] + custom_depots = reverse(custom_depots) + return append!(custom_depots, standard_depots) + end + new_depots = reverse_first_two(DEPOT_PATH) + append!(empty!(DEPOT_PATH), new_depots) l = reverse(LOAD_PATH) append!(empty!(LOAD_PATH), l) """ @@ -1869,23 +1950,23 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp end """ cmd = setenv(`$(julia) -e $(shufflecode * addcode * testcode * extracode)`, env) - @test success(cmd) + @test success(pipeline(cmd; stdout, stderr)) # Mismatch when shuffling after proc addition failcode = shufflecode * setupcode * """ for w in workers() @test remotecall_fetch(load_path, w) == reverse(LOAD_PATH) == $(repr(load_path)) - @test remotecall_fetch(depot_path, w) == reverse(DEPOT_PATH) == $(repr(depots)) + @test remotecall_fetch(depot_path, w) == $(repr(vcat(reverse(depots), shipped_depots))) end """ cmd = setenv(`$(julia) -p1 -e $(failcode)`, env) - @test success(cmd) + @test success(pipeline(cmd; stdout, stderr)) # Passing env or exeflags to addprocs(...) 
to override defaults envcode = """ using Distributed project = mktempdir() env = Dict( "JULIA_LOAD_PATH" => string(LOAD_PATH[1], $(repr(pathsep)), "@stdlib"), - "JULIA_DEPOT_PATH" => DEPOT_PATH[1], + "JULIA_DEPOT_PATH" => DEPOT_PATH[1] * $(repr(pathsep)), "TMPDIR" => ENV["TMPDIR"], ) addprocs(1; env = env, exeflags = `--project=\$(project)`) @@ -1893,14 +1974,14 @@ let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp addprocs(1; env = env) """ * setupcode * """ for w in workers() - @test remotecall_fetch(depot_path, w) == [DEPOT_PATH[1]] + @test remotecall_fetch(depot_path, w) == vcat(DEPOT_PATH[1], $(repr(shipped_depots))) @test remotecall_fetch(load_path, w) == [LOAD_PATH[1], "@stdlib"] @test remotecall_fetch(active_project, w) == project @test remotecall_fetch(Base.active_project, w) == joinpath(project, "Project.toml") end """ cmd = setenv(`$(julia) -e $(envcode)`, env) - @test success(cmd) + @test success(pipeline(cmd; stdout, stderr)) end end include("splitrange.jl") @@ -1916,7 +1997,7 @@ begin # Next, ensure we get a log message when a worker does not cleanly exit w = only(addprocs(1)) - @test_logs (:warn, r"sending SIGTERM") begin + @test_logs (:warn, r"Sending SIGQUIT") match_mode=:any begin remote_do(w) do # Cause the 'exit()' message that `rmprocs()` sends to do nothing Core.eval(Base, :(exit() = nothing)) @@ -1929,5 +2010,10 @@ end # Run topology tests last after removing all workers, since a given # cluster at any time only supports a single topology. -nprocs() > 1 && rmprocs(workers()) +if nprocs() > 1 + rmprocs(workers()) +end +include("threads.jl") include("topology.jl") + + diff --git a/test/runtests.jl b/test/runtests.jl index d34d07c..3651f70 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,14 +1,21 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license +using Test +using Distributed +# only run these if Aqua is installed. i.e. Pkg.test has installed it, or it is provided as a shared package +if Base.locate_package(Base.PkgId(Base.UUID("4c88cf16-eb10-579e-8560-4a9242c79595"), "Aqua")) isa String + @testset "Aqua.jl tests" begin + include("aqua.jl") + end +end + # Run the distributed test outside of the main driver since it needs its own # set of dedicated workers. include(joinpath(Sys.BINDIR, "..", "share", "julia", "test", "testenv.jl")) disttestfile = joinpath(@__DIR__, "distributed_exec.jl") -cmd = `$test_exename $test_exeflags $disttestfile` - -if !success(pipeline(cmd; stdout=stdout, stderr=stderr)) && ccall(:jl_running_on_valgrind,Cint,()) == 0 - error("Distributed test failed, cmd : $cmd") +@testset let cmd = `$test_exename $test_exeflags $disttestfile` + @test success(pipeline(cmd; stdout=stdout, stderr=stderr)) && ccall(:jl_running_on_valgrind,Cint,()) == 0 end include("managers.jl") diff --git a/test/threads.jl b/test/threads.jl new file mode 100644 index 0000000..c978dd4 --- /dev/null +++ b/test/threads.jl @@ -0,0 +1,55 @@ +using Test +using Distributed +using Base.Iterators: product +exeflags = ("--startup-file=no", + "--check-bounds=yes", + "--depwarn=error", + "--threads=2") +function call_on(f, wid, tid) + remotecall(wid) do + t = Task(f) + ccall(:jl_set_task_tid, Cvoid, (Any, Cint), t, tid - 1) + schedule(t) + @assert Threads.threadid(t) == tid + t + end +end +# Run function on process holding the data to only serialize the result of f. +# This becomes useful for things that cannot be serialized (e.g. running tasks) +# or that would be unnecessarily big if serialized. 
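
The idea spelled out in the comment above is a general one: run the computation on the process that owns the reference and ship back only the small result. A generic sketch with the stdlib API, independent of the helper defined next:

```julia
using Distributed
nprocs() == 1 && addprocs(1)

# A large array lives on the worker; only the Float64 sum crosses the wire.
rr = remotecall(() -> rand(10_000_000), first(workers()))
total = remotecall_fetch(sum ∘ fetch, rr.where, rr)
```
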
+fetch_from_owner(f, rr) = remotecall_fetch(f ∘ fetch, rr.where, rr) +isdone(rr) = fetch_from_owner(istaskdone, rr) +isfailed(rr) = fetch_from_owner(istaskfailed, rr) +@testset "RemoteChannel allows put!/take! from thread other than 1" begin + ws = ts = product(1:2, 1:2) + + # We want (the default) laziness, so that we wait for `Worker.c_state`! + procs_added = addprocs(2; exeflags, lazy=true) + + @testset "from worker $w1 to $w2 via 1" for (w1, w2) in ws + @testset "from thread $w1.$t1 to $w2.$t2" for (t1, t2) in ts + p1 = procs_added[w1] + p2 = procs_added[w2] + chan_id = first(procs_added) + chan = RemoteChannel(chan_id) + send = call_on(p1, t1) do + put!(chan, nothing) + end + recv = call_on(p2, t2) do + take!(chan) + end + # Wait on the spawned tasks on the owner. Note that we use + # timedwait() instead of @sync to avoid deadlocks. + t1 = Threads.@spawn fetch_from_owner(wait, recv) + t2 = Threads.@spawn fetch_from_owner(wait, send) + @test timedwait(() -> istaskdone(t1), 60) == :ok + @test timedwait(() -> istaskdone(t2), 60) == :ok + # Check the tasks + @test isdone(send) + @test isdone(recv) + @test !isfailed(send) + @test !isfailed(recv) + end + end + rmprocs(procs_added) +end \ No newline at end of file diff --git a/test/topology.jl b/test/topology.jl index a24efb2..5aeab68 100644 --- a/test/topology.jl +++ b/test/topology.jl @@ -99,7 +99,7 @@ remove_workers_and_test() function def_count_conn() @everywhere function count_connected_workers() count(x -> isa(x, Distributed.Worker) && isdefined(x, :r_stream) && isopen(x.r_stream), - Distributed.PGRP.workers) + Distributed.PGRP().workers) end end
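
Finally, a condensed sketch of what the threads test above verifies: `RemoteChannel` operations work from tasks that may run on threads other than thread 1 (assumes workers started with `--threads=2`, as in the test's `exeflags`):

```julia
using Distributed

nprocs() == 1 && addprocs(2; exeflags = "--threads=2")
p1, p2 = workers()[1:2]
chan = RemoteChannel(() -> Channel{Int}(1), 1)   # channel owned by the master

# On p1, put! from a freshly spawned (possibly non-primary) thread;
# on p2, take! the same way. Both should complete without deadlocking.
send = remotecall(() -> fetch(Threads.@spawn put!(chan, 1)), p1)
recv = remotecall(() -> fetch(Threads.@spawn take!(chan)), p2)

@assert fetch(recv) == 1
wait(send)
```
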