Commit f410d5b

FCChen authored and facebook-github-bot committed
RandomSubsetTrainingSampler to randomly sample training data subset for accuracy-data curve
Summary: Add a sampler class `RandomSubsetTrainingSampler`, which is similar to `TrainingSampler` but samples only a random subset (e.g., 50%) of the indices. `RandomSubsetTrainingSampler` is useful when you want to estimate accuracy vs. data-volume curves by training the model with different values of `subset_ratio`.

Reviewed By: ppwwyyxx

Differential Revision: D29892290

fbshipit-source-id: a342a6f1aa7852feb6566c648bd673028a3e0668
1 parent 91f1d95 commit f410d5b
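
As a concrete illustration of the workflow the summary describes, one might sweep `subset_ratio` and record accuracy at each point. This is a hypothetical sketch: `train_and_evaluate` is a placeholder for a full train-plus-eval pipeline and is not part of this commit; only `RandomSubsetTrainingSampler` and its `subset_ratio` argument come from the diff below.

    # Hypothetical accuracy-vs-data-volume sweep; train_and_evaluate is a stub.
    def train_and_evaluate(subset_ratio):
        # Placeholder: build a train loader whose sampler is
        # RandomSubsetTrainingSampler(len(dataset), subset_ratio),
        # train to convergence, and return the evaluation metric (e.g., box AP).
        ...

    curve = {ratio: train_and_evaluate(ratio) for ratio in (0.1, 0.25, 0.5, 1.0)}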

File tree

3 files changed: +76 −2 lines changed

detectron2/data/build.py

+8 −1
@@ -19,7 +19,12 @@
 from .common import AspectRatioGroupedDataset, DatasetFromList, MapDataset
 from .dataset_mapper import DatasetMapper
 from .detection_utils import check_metadata_consistency
-from .samplers import InferenceSampler, RepeatFactorTrainingSampler, TrainingSampler
+from .samplers import (
+    InferenceSampler,
+    RandomSubsetTrainingSampler,
+    RepeatFactorTrainingSampler,
+    TrainingSampler,
+)

 """
 This file contains the default logic to build a dataloader for training or testing.
@@ -331,6 +336,8 @@ def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None):
             dataset, cfg.DATALOADER.REPEAT_THRESHOLD
         )
         sampler = RepeatFactorTrainingSampler(repeat_factors)
+    elif sampler_name == "RandomSubsetTrainingSampler":
+        sampler = RandomSubsetTrainingSampler(len(dataset), cfg.DATALOADER.RANDOM_SUBSET_RATIO)
     else:
         raise ValueError("Unknown training sampler: {}".format(sampler_name))
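
With the branch above in place, the sampler can be selected through the dataloader config. A minimal sketch, assuming the standard `cfg.DATALOADER.SAMPLER_TRAIN` key is what feeds `sampler_name`; note that this commit does not appear to add `RANDOM_SUBSET_RATIO` to the config defaults, so the sketch registers the key itself:

    from detectron2.config import get_cfg
    from detectron2.data import build_detection_train_loader

    cfg = get_cfg()
    cfg.DATALOADER.SAMPLER_TRAIN = "RandomSubsetTrainingSampler"
    # RANDOM_SUBSET_RATIO is read by the new branch above; if it is not yet in
    # the config defaults, allow new keys before setting it.
    cfg.DATALOADER.set_new_allowed(True)
    cfg.DATALOADER.RANDOM_SUBSET_RATIO = 0.5  # train on a random 50% subset
    # ... set DATASETS.TRAIN etc. as usual, then:
    train_loader = build_detection_train_loader(cfg)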

detectron2/data/samplers/__init__.py

+8 −1
@@ -1,10 +1,17 @@
 # Copyright (c) Facebook, Inc. and its affiliates.
-from .distributed_sampler import InferenceSampler, RepeatFactorTrainingSampler, TrainingSampler
+from .distributed_sampler import (
+    InferenceSampler,
+    RandomSubsetTrainingSampler,
+    RepeatFactorTrainingSampler,
+    TrainingSampler,
+)
+
 from .grouped_batch_sampler import GroupedBatchSampler

 __all__ = [
     "GroupedBatchSampler",
     "TrainingSampler",
+    "RandomSubsetTrainingSampler",
     "InferenceSampler",
     "RepeatFactorTrainingSampler",
 ]
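
With this re-export in place, downstream code can import the new sampler from the package namespace, e.g.:

    from detectron2.data.samplers import RandomSubsetTrainingSampler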

detectron2/data/samplers/distributed_sampler.py

+60
@@ -1,5 +1,6 @@
 # Copyright (c) Facebook, Inc. and its affiliates.
 import itertools
+import logging
 import math
 from collections import defaultdict
 from typing import Optional
@@ -8,6 +9,8 @@

 from detectron2.utils import comm

+logger = logging.getLogger(__name__)
+

 class TrainingSampler(Sampler):
     """
@@ -66,6 +69,63 @@ def _infinite_indices(self):
             yield from torch.arange(self._size).tolist()


+class RandomSubsetTrainingSampler(TrainingSampler):
+    """
+    Similar to TrainingSampler, but only sample a random subset of indices.
+    This is useful when you want to estimate the accuracy vs data-number curves by
+    training the model with different subset_ratio.
+    """
+
+    def __init__(
+        self,
+        size: int,
+        subset_ratio: float,
+        shuffle: bool = True,
+        seed_shuffle: Optional[int] = None,
+        seed_subset: Optional[int] = None,
+    ):
+        """
+        Args:
+            size (int): the total number of data of the underlying dataset to sample from
+            subset_ratio (float): the ratio of subset data to sample from the underlying dataset
+            shuffle (bool): whether to shuffle the indices or not
+            seed_shuffle (int): the initial seed of the shuffle. Must be the same
+                across all workers. If None, will use a random seed shared
+                among workers (require synchronization among all workers).
+            seed_subset (int): the seed to randomize the subset to be sampled.
+                Must be the same across all workers. If None, will use a random seed shared
+                among workers (require synchronization among all workers).
+        """
+        super().__init__(size=size, shuffle=shuffle, seed=seed_shuffle)
+
+        assert 0.0 < subset_ratio <= 1.0
+        self._size_subset = int(size * subset_ratio)
+        assert self._size_subset > 0
+        if seed_subset is None:
+            seed_subset = comm.shared_random_seed()
+        self._seed_subset = int(seed_subset)
+
+        # randomly generate the subset indexes to be sampled from
+        g = torch.Generator()
+        g.manual_seed(self._seed_subset)
+        indexes_randperm = torch.randperm(self._size, generator=g)
+        self._indexes_subset = indexes_randperm[: self._size_subset]
+
+        logger.info("Using RandomSubsetTrainingSampler......")
+        logger.info(f"Randomly sample {self._size_subset} data from the original {self._size} data")
+
+    def _infinite_indices(self):
+        g = torch.Generator()
+        g.manual_seed(self._seed)  # self._seed equals seed_shuffle from __init__()
+        while True:
+            if self._shuffle:
+                # generate a random permutation to shuffle self._indexes_subset
+                randperm = torch.randperm(self._size_subset, generator=g)
+                yield from self._indexes_subset[randperm].tolist()
+            else:
+                yield from self._indexes_subset.tolist()
+
+
 class RepeatFactorTrainingSampler(Sampler):
     """
     Similar to TrainingSampler, but a sample may appear more times than others based
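
For readers who want to see the two seeds in action, here is a small sketch that exercises the new class directly in a single process (illustrative values; `_indexes_subset` is a private attribute, inspected here only to show that every yielded index stays inside the fixed subset):

    import itertools

    from detectron2.data.samplers import RandomSubsetTrainingSampler

    # 100-item dataset, keep a random 30% subset; fix both seeds so the
    # subset choice (seed_subset) and the shuffling (seed_shuffle) repeat.
    sampler = RandomSubsetTrainingSampler(
        size=100, subset_ratio=0.3, shuffle=True, seed_shuffle=0, seed_subset=1
    )

    # The index stream is infinite, so take a finite slice of it.
    indices = list(itertools.islice(iter(sampler), 60))

    # Every yielded index belongs to the same fixed 30-element subset,
    # which is reshuffled on each pass over the subset.
    assert set(indices) <= set(sampler._indexes_subset.tolist())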
