allow torchscript tests to run on CPUs

ppwwyyxx · facebook-github-bot · commit 6e7def97f723 · 2021-08-24T01:03:05.000-07:00
Summary: close T88689498

Reviewed By: vaibhava0

Differential Revision: D30063379

fbshipit-source-id: 23cddad66c17857f96bd60f59cf9f67291aedfff
diff --git a/detectron2/config/defaults.py b/detectron2/config/defaults.py
@@ -1,6 +1,10 @@
 # Copyright (c) Facebook, Inc. and its affiliates.
 from .config import CfgNode as CN
 
+# NOTE: given the new config system
+# (https://detectron2.readthedocs.io/en/latest/tutorials/lazyconfigs.html),
+# we will stop adding new functionalities to default CfgNode.
+
 # -----------------------------------------------------------------------------
 # Convention about Training / Test specific parameters
 # -----------------------------------------------------------------------------
diff --git a/detectron2/utils/collect_env.py b/detectron2/utils/collect_env.py
@@ -144,6 +144,12 @@ def collect_env_info():
             msg = " - invalid!" if not (ROCM_HOME and os.path.isdir(ROCM_HOME)) else ""
             data.append(("ROCM_HOME", str(ROCM_HOME) + msg))
         else:
+            try:
+                from torch.utils.collect_env import get_nvidia_driver_version, run as _run
+
+                data.append(("Driver version", get_nvidia_driver_version(_run)))
+            except Exception:
+                pass
             msg = " - invalid!" if not (CUDA_HOME and os.path.isdir(CUDA_HOME)) else ""
             data.append(("CUDA_HOME", str(CUDA_HOME) + msg))
 
@@ -194,6 +200,24 @@ def collect_env_info():
     return env_str
 
 
+def test_nccl_ops():  # run an NCCL barrier across one spawned process per CUDA device
+    num_gpu = torch.cuda.device_count()  # used as both process count and world size
+    if os.access("/tmp", os.W_OK):  # no else branch: nothing runs if /tmp is not writable
+        import torch.multiprocessing as mp
+
+        dist_url = "file:///tmp/nccl_tmp_file"  # fixed path under /tmp, used as init_method
+        print("Testing NCCL connectivity ... this should not hang.")
+        mp.spawn(_test_nccl_worker, nprocs=num_gpu, args=(num_gpu, dist_url), daemon=False)
+        print("NCCL succeeded.")  # reached only once mp.spawn has returned
+
+
+def _test_nccl_worker(rank, num_gpu, dist_url):  # target function handed to mp.spawn
+    import torch.distributed as dist
+
+    dist.init_process_group(backend="NCCL", init_method=dist_url, rank=rank, world_size=num_gpu)
+    dist.barrier(device_ids=[rank])  # the rank doubles as the CUDA device index here
+
+
 if __name__ == "__main__":
     try:
         from detectron2.utils.collect_env import collect_env_info as f
@@ -203,7 +227,8 @@ def collect_env_info():
         print(collect_env_info())
 
     if torch.cuda.is_available():
-        for k in range(torch.cuda.device_count()):
+        num_gpu = torch.cuda.device_count()
+        for k in range(num_gpu):
             device = f"cuda:{k}"
             try:
                 x = torch.tensor([1, 2.0], dtype=torch.float32)
@@ -213,3 +238,5 @@ def collect_env_info():
                     f"Unable to copy tensor to device={device}: {e}. "
                     "Your CUDA environment is broken."
                 )
+        if num_gpu > 1:
+            test_nccl_ops()
diff --git a/docs/modules/config.rst b/docs/modules/config.rst
@@ -15,4 +15,4 @@ Yaml Config References
 .. literalinclude:: ../../detectron2/config/defaults.py
   :language: python
   :linenos:
-  :lines: 4-
+  :lines: 7-
diff --git a/tests/test_export_torchscript.py b/tests/test_export_torchscript.py
@@ -35,19 +35,13 @@
 
 @unittest.skipIf(os.environ.get("CI") or TORCH_VERSION < (1, 8), "Insufficient Pytorch version")
 class TestScripting(unittest.TestCase):
-    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def testMaskRCNNFPN(self):
-        # TODO: this test requires manifold access, see: T88318502
         self._test_rcnn_model("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
 
-    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def testMaskRCNNC4(self):
-        # TODO: this test requires manifold access, see: T88318502
         self._test_rcnn_model("COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml")
 
-    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def testRetinaNet(self):
-        # TODO: this test requires manifold access, see: T88318502
         self._test_retinanet_model("COCO-Detection/retinanet_R_50_FPN_3x.yaml")
 
     def _test_rcnn_model(self, config_path):
@@ -94,7 +88,6 @@ def _test_retinanet_model(self, config_path):
 
 @unittest.skipIf(os.environ.get("CI") or TORCH_VERSION < (1, 8), "Insufficient Pytorch version")
 class TestTracing(unittest.TestCase):
-    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def testMaskRCNNFPN(self):
         # TODO: this test requires manifold access, see: T88318502
         def inference_func(model, image):
@@ -103,15 +96,13 @@ def inference_func(model, image):
 
         self._test_model("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml", inference_func)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def testMaskRCNNC4(self):
         def inference_func(model, image):
             inputs = [{"image": image}]
             return model.inference(inputs, do_postprocess=False)[0]
 
         self._test_model("COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml", inference_func)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def testRetinaNet(self):
         # TODO: this test requires manifold access, see: T88318502
         def inference_func(model, image):