Skip to content

Commit 6e7def9

Browse files
ppwwyyxx and facebook-github-bot
authored and committed
allow torchscript tests to run on CPUs
Summary: close T88689498
Reviewed By: vaibhava0
Differential Revision: D30063379
fbshipit-source-id: 23cddad66c17857f96bd60f59cf9f67291aedfff
1 parent ff638c9 commit 6e7def9

File tree

4 files changed

+33
-11
lines changed

4 files changed

+33
-11
lines changed

detectron2/config/defaults.py

+4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
# Copyright (c) Facebook, Inc. and its affiliates.
22
from .config import CfgNode as CN
33

4+
# NOTE: given the new config system
5+
# (https://detectron2.readthedocs.io/en/latest/tutorials/lazyconfigs.html),
6+
# we will stop adding new functionalities to default CfgNode.
7+
48
# -----------------------------------------------------------------------------
59
# Convention about Training / Test specific parameters
610
# -----------------------------------------------------------------------------

detectron2/utils/collect_env.py

+28-1
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,12 @@ def collect_env_info():
144144
msg = " - invalid!" if not (ROCM_HOME and os.path.isdir(ROCM_HOME)) else ""
145145
data.append(("ROCM_HOME", str(ROCM_HOME) + msg))
146146
else:
147+
try:
148+
from torch.utils.collect_env import get_nvidia_driver_version, run as _run
149+
150+
data.append(("Driver version", get_nvidia_driver_version(_run)))
151+
except Exception:
152+
pass
147153
msg = " - invalid!" if not (CUDA_HOME and os.path.isdir(CUDA_HOME)) else ""
148154
data.append(("CUDA_HOME", str(CUDA_HOME) + msg))
149155

@@ -194,6 +200,24 @@ def collect_env_info():
194200
return env_str
195201

196202

203+
def test_nccl_ops():
    """
    Smoke-test NCCL connectivity across all visible GPUs.

    Spawns one worker process per GPU; each worker joins a process group
    through a file-based rendezvous and runs a barrier. Nothing is done if
    "/tmp" is not writable.
    """
    import tempfile

    num_gpu = torch.cuda.device_count()
    if not os.access("/tmp", os.W_OK):
        return

    import torch.multiprocessing as mp

    # The file:// init method needs a brand-new file: reusing one left behind
    # by an aborted run (or shared with a concurrent run) can deadlock.
    # Use a private directory per invocation and remove it afterwards.
    with tempfile.TemporaryDirectory(prefix="nccl_tmp_", dir="/tmp") as tmp_dir:
        dist_url = "file://" + os.path.join(tmp_dir, "nccl_tmp_file")
        print("Testing NCCL connectivity ... this should not hang.")
        mp.spawn(_test_nccl_worker, nprocs=num_gpu, args=(num_gpu, dist_url), daemon=False)
        print("NCCL succeeded.")
212+
213+
214+
def _test_nccl_worker(rank, num_gpu, dist_url):
    """
    Worker entry point for the NCCL connectivity check.

    Args:
        rank (int): rank of this process; also used as the device id for the barrier.
        num_gpu (int): world size of the process group.
        dist_url (str): value passed as ``init_method`` to ``init_process_group``.
    """
    import torch.distributed as dist

    dist.init_process_group(backend="NCCL", init_method=dist_url, rank=rank, world_size=num_gpu)
    try:
        dist.barrier(device_ids=[rank])
    finally:
        # Tear the group down explicitly instead of relying on interpreter
        # exit to release it.
        dist.destroy_process_group()
219+
220+
197221
if __name__ == "__main__":
198222
try:
199223
from detectron2.utils.collect_env import collect_env_info as f
@@ -203,7 +227,8 @@ def collect_env_info():
203227
print(collect_env_info())
204228

205229
if torch.cuda.is_available():
206-
for k in range(torch.cuda.device_count()):
230+
num_gpu = torch.cuda.device_count()
231+
for k in range(num_gpu):
207232
device = f"cuda:{k}"
208233
try:
209234
x = torch.tensor([1, 2.0], dtype=torch.float32)
@@ -213,3 +238,5 @@ def collect_env_info():
213238
f"Unable to copy tensor to device={device}: {e}. "
214239
"Your CUDA environment is broken."
215240
)
241+
if num_gpu > 1:
242+
test_nccl_ops()

docs/modules/config.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,4 @@ Yaml Config References
1515
.. literalinclude:: ../../detectron2/config/defaults.py
1616
:language: python
1717
:linenos:
18-
:lines: 4-
18+
:lines: 7-

tests/test_export_torchscript.py

-9
Original file line numberDiff line numberDiff line change
@@ -35,19 +35,13 @@
3535

3636
@unittest.skipIf(os.environ.get("CI") or TORCH_VERSION < (1, 8), "Insufficient Pytorch version")
3737
class TestScripting(unittest.TestCase):
38-
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
3938
def testMaskRCNNFPN(self):
40-
# TODO: this test requires manifold access, see: T88318502
4139
self._test_rcnn_model("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
4240

43-
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
4441
def testMaskRCNNC4(self):
45-
# TODO: this test requires manifold access, see: T88318502
4642
self._test_rcnn_model("COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml")
4743

48-
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
4944
def testRetinaNet(self):
50-
# TODO: this test requires manifold access, see: T88318502
5145
self._test_retinanet_model("COCO-Detection/retinanet_R_50_FPN_3x.yaml")
5246

5347
def _test_rcnn_model(self, config_path):
@@ -94,7 +88,6 @@ def _test_retinanet_model(self, config_path):
9488

9589
@unittest.skipIf(os.environ.get("CI") or TORCH_VERSION < (1, 8), "Insufficient Pytorch version")
9690
class TestTracing(unittest.TestCase):
97-
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
9891
def testMaskRCNNFPN(self):
9992
# TODO: this test requires manifold access, see: T88318502
10093
def inference_func(model, image):
@@ -103,15 +96,13 @@ def inference_func(model, image):
10396

10497
self._test_model("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml", inference_func)
10598

106-
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
10799
def testMaskRCNNC4(self):
108100
def inference_func(model, image):
109101
inputs = [{"image": image}]
110102
return model.inference(inputs, do_postprocess=False)[0]
111103

112104
self._test_model("COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml", inference_func)
113105

114-
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
115106
def testRetinaNet(self):
116107
# TODO: this test requires manifold access, see: T88318502
117108
def inference_func(model, image):

0 commit comments

Comments
 (0)