@@ -1,5 +1,6 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import itertools
+import logging
import math
from collections import defaultdict
from typing import Optional
@@ -8,6 +9,8 @@

from detectron2.utils import comm

+logger = logging.getLogger(__name__)
+

class TrainingSampler(Sampler):
    """
@@ -66,6 +69,63 @@ def _infinite_indices(self):
            yield from torch.arange(self._size).tolist()


+class RandomSubsetTrainingSampler(TrainingSampler):
+    """
+    Similar to TrainingSampler, but only samples a random subset of indices.
+    This is useful when you want to estimate the accuracy vs. data-size curve by
+    training the model with different values of subset_ratio.
+    """
+
+    def __init__(
+        self,
+        size: int,
+        subset_ratio: float,
+        shuffle: bool = True,
+        seed_shuffle: Optional[int] = None,
+        seed_subset: Optional[int] = None,
+    ):
+        """
+        Args:
+            size (int): the total number of data points in the underlying dataset to sample from
+            subset_ratio (float): the fraction of the underlying dataset to sample
+            shuffle (bool): whether to shuffle the indices or not
+            seed_shuffle (int): the initial seed for shuffling. Must be the same
+                across all workers. If None, a random seed shared among all workers
+                is used (requires synchronization among all workers).
+            seed_subset (int): the seed used to randomize the subset to be sampled.
+                Must be the same across all workers. If None, a random seed shared
+                among all workers is used (requires synchronization among all workers).
+        """
+        super().__init__(size=size, shuffle=shuffle, seed=seed_shuffle)
+
+        assert 0.0 < subset_ratio <= 1.0
+        self._size_subset = int(size * subset_ratio)
+        assert self._size_subset > 0
+        if seed_subset is None:
+            seed_subset = comm.shared_random_seed()
+        self._seed_subset = int(seed_subset)
+
+        # randomly generate the subset indexes to be sampled from
+        g = torch.Generator()
+        g.manual_seed(self._seed_subset)
+        indexes_randperm = torch.randperm(self._size, generator=g)
+        self._indexes_subset = indexes_randperm[: self._size_subset]
+
+        logger.info("Using RandomSubsetTrainingSampler...")
+        logger.info(f"Randomly sampling {self._size_subset} out of the original {self._size} data points")
+
+    def _infinite_indices(self):
+        g = torch.Generator()
+        g.manual_seed(self._seed)  # self._seed equals seed_shuffle from __init__()
+        while True:
+            if self._shuffle:
+                # generate a random permutation to shuffle self._indexes_subset
+                randperm = torch.randperm(self._size_subset, generator=g)
+                yield from self._indexes_subset[randperm].tolist()
+            else:
+                yield from self._indexes_subset.tolist()
+
+
class RepeatFactorTrainingSampler(Sampler):
    """
    Similar to TrainingSampler, but a sample may appear more times than others based
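For context, here is a minimal usage sketch of the new sampler (not part of the commit). It assumes the class is importable from `detectron2.data.samplers`, matching the file this diff modifies, and it exercises the infinite index stream directly instead of going through a full detectron2 dataloader:

```python
import itertools

import torch
from torch.utils.data import DataLoader, TensorDataset

# Assumption: the import path matches where this file lives in detectron2.
from detectron2.data.samplers import RandomSubsetTrainingSampler

# A toy stand-in for a real detection dataset.
dataset = TensorDataset(torch.arange(100))

# Train on a fixed random 25% of the data. Passing explicit seeds skips the
# comm.shared_random_seed() synchronization path, so this runs single-process.
sampler = RandomSubsetTrainingSampler(
    size=len(dataset),
    subset_ratio=0.25,
    shuffle=True,
    seed_shuffle=0,
    seed_subset=0,
)

# The sampler is infinite; take a finite slice of indices to inspect it.
first_50 = list(itertools.islice(iter(sampler), 50))
assert len(set(first_50)) == 25  # only the 25-element subset is ever visited

# With a DataLoader, batches cycle forever over the same subset.
loader = DataLoader(dataset, batch_size=8, sampler=sampler)
batch = next(iter(loader))
```

Note that `seed_subset` fixes *which* examples are in the subset, while `seed_shuffle` only controls the order in which they are visited. Since the subset is the prefix of a single seeded permutation, runs that share `size` and `seed_subset` but vary `subset_ratio` sample nested subsets, which is convenient for the accuracy vs. data-size curves mentioned in the docstring.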