
Commit e556640

Authored May 3, 2022
Reduce variance of evaluation in reference (#5819)
* Change code to reduce variance in eval
* Remove unnecessary new line
* Fix missing warnings import
* Fix the warning on video_classification
* Fix bug to get len of UniformClipSampler
1 parent aef2b58 · commit e556640

File tree: 7 files changed, +138 −20 lines

 

references/detection/train.py (+14 −1)

@@ -132,6 +132,10 @@ def get_args_parser(add_help=True):
         action="store_true",
     )
 
+    parser.add_argument(
+        "--use-deterministic-algorithms", action="store_true", help="Forces the use of deterministic algorithms only."
+    )
+
     # distributed training parameters
     parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
     parser.add_argument("--dist-url", default="env://", type=str, help="url used to set up distributed training")
@@ -153,6 +157,12 @@ def main(args):
 
     device = torch.device(args.device)
 
+    if args.use_deterministic_algorithms:
+        torch.backends.cudnn.benchmark = False
+        torch.use_deterministic_algorithms(True)
+    else:
+        torch.backends.cudnn.benchmark = True
+
     # Data loading code
     print("Loading data")
 
@@ -162,7 +172,7 @@ def main(args):
     print("Creating data loaders")
     if args.distributed:
         train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
-        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
+        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test, shuffle=False)
     else:
         train_sampler = torch.utils.data.RandomSampler(dataset)
         test_sampler = torch.utils.data.SequentialSampler(dataset_test)
@@ -243,6 +253,9 @@ def main(args):
         scaler.load_state_dict(checkpoint["scaler"])
 
     if args.test_only:
+        # We disable the cudnn benchmarking because it can noticeably affect the accuracy
+        torch.backends.cudnn.benchmark = False
+        torch.backends.cudnn.deterministic = True
         evaluate(model, data_loader_test, device=device)
         return
 
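
Note: as context for the new flag, here is the toggle as a standalone sketch. setup_determinism is a hypothetical wrapper name, and the CUBLAS_WORKSPACE_CONFIG line is a caveat from the PyTorch determinism notes, not something this commit touches.

    import os
    import torch

    def setup_determinism(use_deterministic: bool) -> None:
        # cudnn autotuning benchmarks several kernels and may pick a
        # non-deterministic one, so it is switched off together with
        # opting in to deterministic algorithms.
        if use_deterministic:
            torch.backends.cudnn.benchmark = False
            torch.use_deterministic_algorithms(True)
        else:
            torch.backends.cudnn.benchmark = True

    # Per the PyTorch notes, some cuBLAS ops on CUDA >= 10.2 also need this
    # env var set before the first kernel launch to run deterministically.
    os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
    setup_determinism(True)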

references/optical_flow/train.py (+9)

@@ -209,6 +209,12 @@ def main(args):
         raise ValueError("The device must be cuda if we want to run in distributed mode using torchrun")
     device = torch.device(args.device)
 
+    if args.use_deterministic_algorithms:
+        torch.backends.cudnn.benchmark = False
+        torch.use_deterministic_algorithms(True)
+    else:
+        torch.backends.cudnn.benchmark = True
+
     model = torchvision.models.optical_flow.__dict__[args.model](weights=args.weights)
 
     if args.distributed:
@@ -370,6 +376,9 @@ def get_args_parser(add_help=True):
 
     parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load.")
     parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu, Default: cuda)")
+    parser.add_argument(
+        "--use-deterministic-algorithms", action="store_true", help="Forces the use of deterministic algorithms only."
+    )
 
     return parser
 
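
Note: the optical-flow script gets the same toggle as detection. A general PyTorch behavior worth knowing (not specific to this commit): with torch.use_deterministic_algorithms(True) active, an op that has no deterministic implementation raises RuntimeError instead of silently varying between runs. A rough probe, assuming a CUDA build:

    import torch

    torch.use_deterministic_algorithms(True)
    try:
        # kthvalue is documented as having no deterministic CUDA kernel,
        # so this raises instead of returning a run-dependent result.
        torch.randn(10, device="cuda").kthvalue(3)
    except RuntimeError as err:
        print(f"refused non-deterministic op: {err}")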

references/segmentation/train.py (+32 −1)

@@ -1,6 +1,7 @@
 import datetime
 import os
 import time
+import warnings
 
 import presets
 import torch
@@ -61,16 +62,34 @@ def evaluate(model, data_loader, device, num_classes):
     confmat = utils.ConfusionMatrix(num_classes)
     metric_logger = utils.MetricLogger(delimiter="  ")
     header = "Test:"
+    num_processed_samples = 0
     with torch.inference_mode():
         for image, target in metric_logger.log_every(data_loader, 100, header):
             image, target = image.to(device), target.to(device)
             output = model(image)
             output = output["out"]
 
             confmat.update(target.flatten(), output.argmax(1).flatten())
+            # FIXME need to take into account that the datasets
+            # could have been padded in distributed setup
+            num_processed_samples += image.shape[0]
 
     confmat.reduce_from_all_processes()
 
+    num_processed_samples = utils.reduce_across_processes(num_processed_samples)
+    if (
+        hasattr(data_loader.dataset, "__len__")
+        and len(data_loader.dataset) != num_processed_samples
+        and torch.distributed.get_rank() == 0
+    ):
+        # See FIXME above
+        warnings.warn(
+            f"It looks like the dataset has {len(data_loader.dataset)} samples, but {num_processed_samples} "
+            "samples were used for the validation, which might bias the results. "
+            "Try adjusting the batch size and / or the world size. "
+            "Setting the world size to 1 is always a safe bet."
+        )
+
     return confmat
 
 
@@ -108,12 +127,18 @@ def main(args):
 
     device = torch.device(args.device)
 
+    if args.use_deterministic_algorithms:
+        torch.backends.cudnn.benchmark = False
+        torch.use_deterministic_algorithms(True)
+    else:
+        torch.backends.cudnn.benchmark = True
+
     dataset, num_classes = get_dataset(args.data_path, args.dataset, "train", get_transform(True, args))
     dataset_test, _ = get_dataset(args.data_path, args.dataset, "val", get_transform(False, args))
 
     if args.distributed:
         train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
-        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
+        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test, shuffle=False)
     else:
         train_sampler = torch.utils.data.RandomSampler(dataset)
         test_sampler = torch.utils.data.SequentialSampler(dataset_test)
@@ -191,6 +216,9 @@ def main(args):
         scaler.load_state_dict(checkpoint["scaler"])
 
     if args.test_only:
+        # We disable the cudnn benchmarking because it can noticeably affect the accuracy
+        torch.backends.cudnn.benchmark = False
+        torch.backends.cudnn.deterministic = True
         confmat = evaluate(model, data_loader_test, device=device, num_classes=num_classes)
         print(confmat)
         return
@@ -261,6 +289,9 @@ def get_args_parser(add_help=True):
         help="Only test the model",
         action="store_true",
     )
+    parser.add_argument(
+        "--use-deterministic-algorithms", action="store_true", help="Forces the use of deterministic algorithms only."
+    )
     # distributed training parameters
     parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
     parser.add_argument("--dist-url", default="env://", type=str, help="url used to set up distributed training")

references/segmentation/utils.py (+13 −11)

@@ -30,11 +30,7 @@ def synchronize_between_processes(self):
         """
         Warning: does not synchronize the deque!
         """
-        if not is_dist_avail_and_initialized():
-            return
-        t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
-        dist.barrier()
-        dist.all_reduce(t)
+        t = reduce_across_processes([self.count, self.total])
         t = t.tolist()
         self.count = int(t[0])
         self.total = t[1]
@@ -92,12 +88,7 @@ def compute(self):
         return acc_global, acc, iu
 
     def reduce_from_all_processes(self):
-        if not torch.distributed.is_available():
-            return
-        if not torch.distributed.is_initialized():
-            return
-        torch.distributed.barrier()
-        torch.distributed.all_reduce(self.mat)
+        reduce_across_processes(self.mat)
 
     def __str__(self):
         acc_global, acc, iu = self.compute()
@@ -296,3 +287,14 @@ def init_distributed_mode(args):
     )
     torch.distributed.barrier()
     setup_for_distributed(args.rank == 0)
+
+
+def reduce_across_processes(val):
+    if not is_dist_avail_and_initialized():
+        # nothing to sync, but we still convert to tensor for consistency with the distributed case.
+        return torch.tensor(val)
+
+    t = torch.tensor(val, device="cuda")
+    dist.barrier()
+    dist.all_reduce(t)
+    return t
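
Note: a usage sketch for the new helper, assuming the reference's utils module shown above is importable from the same directory:

    import utils

    # Single process, no process group: the value is only wrapped in a
    # tensor, giving callers one code path for both cases.
    print(utils.reduce_across_processes(42))  # tensor(42)

    # Under an initialized process group, each rank passing 42 would get
    # back the all-reduced sum instead, as a CUDA tensor.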

references/similarity/train.py (+23)

@@ -88,6 +88,13 @@ def save(model, epoch, save_dir, file_name):
 
 def main(args):
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+    if args.use_deterministic_algorithms:
+        torch.backends.cudnn.benchmark = False
+        torch.use_deterministic_algorithms(True)
+    else:
+        torch.backends.cudnn.benchmark = True
+
     p = args.labels_per_batch
     k = args.samples_per_label
     batch_size = p * k
@@ -126,6 +133,13 @@ def main(args):
     )
     test_loader = DataLoader(test_dataset, batch_size=args.eval_batch_size, shuffle=False, num_workers=args.workers)
 
+    if args.test_only:
+        # We disable the cudnn benchmarking because it can noticeably affect the accuracy
+        torch.backends.cudnn.benchmark = False
+        torch.backends.cudnn.deterministic = True
+        evaluate(model, test_loader, device)
+        return
+
     for epoch in range(1, args.epochs + 1):
         print("Training...")
         train_epoch(model, optimizer, criterion, train_loader, device, epoch, args.print_freq)
@@ -155,6 +169,15 @@ def parse_args():
     parser.add_argument("--print-freq", default=20, type=int, help="print frequency")
     parser.add_argument("--save-dir", default=".", type=str, help="Model save directory")
     parser.add_argument("--resume", default="", type=str, help="path of checkpoint")
+    parser.add_argument(
+        "--test-only",
+        dest="test_only",
+        help="Only test the model",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--use-deterministic-algorithms", action="store_true", help="Forces the use of deterministic algorithms only."
+    )
 
     return parser.parse_args()
 
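
Note: both new flags use action="store_true", so they default to False and existing invocations behave exactly as before. A minimal argparse check mirroring the definitions above:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--test-only", dest="test_only", action="store_true")
    parser.add_argument("--use-deterministic-algorithms", action="store_true")

    print(parser.parse_args([]).test_only)                     # False: train as before
    print(parser.parse_args(["--test-only"]).test_only)        # True: evaluate and return
    print(parser.parse_args([]).use_deterministic_algorithms)  # False by default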

references/video_classification/train.py (+35 −2)

@@ -1,6 +1,7 @@
 import datetime
 import os
 import time
+import warnings
 
 import presets
 import torch
@@ -50,6 +51,7 @@ def evaluate(model, criterion, data_loader, device):
     model.eval()
     metric_logger = utils.MetricLogger(delimiter="  ")
     header = "Test:"
+    num_processed_samples = 0
     with torch.inference_mode():
         for video, target in metric_logger.log_every(data_loader, 100, header):
             video = video.to(device, non_blocking=True)
@@ -64,7 +66,28 @@ def evaluate(model, criterion, data_loader, device):
             metric_logger.update(loss=loss.item())
             metric_logger.meters["acc1"].update(acc1.item(), n=batch_size)
             metric_logger.meters["acc5"].update(acc5.item(), n=batch_size)
+            num_processed_samples += batch_size
     # gather the stats from all processes
+    num_processed_samples = utils.reduce_across_processes(num_processed_samples)
+    if isinstance(data_loader.sampler, DistributedSampler):
+        # Get the len of UniformClipSampler inside DistributedSampler
+        num_data_from_sampler = len(data_loader.sampler.dataset)
+    else:
+        num_data_from_sampler = len(data_loader.sampler)
+
+    if (
+        hasattr(data_loader.dataset, "__len__")
+        and num_data_from_sampler != num_processed_samples
+        and torch.distributed.get_rank() == 0
+    ):
+        # See FIXME above
+        warnings.warn(
+            f"It looks like the sampler has {num_data_from_sampler} samples, but {num_processed_samples} "
+            "samples were used for the validation, which might bias the results. "
+            "Try adjusting the batch size and / or the world size. "
+            "Setting the world size to 1 is always a safe bet."
+        )
+
     metric_logger.synchronize_between_processes()
 
     print(
@@ -99,7 +122,11 @@ def main(args):
 
     device = torch.device(args.device)
 
-    torch.backends.cudnn.benchmark = True
+    if args.use_deterministic_algorithms:
+        torch.backends.cudnn.benchmark = False
+        torch.use_deterministic_algorithms(True)
+    else:
+        torch.backends.cudnn.benchmark = True
 
     # Data loading code
     print("Loading data")
@@ -173,7 +200,7 @@ def main(args):
     test_sampler = UniformClipSampler(dataset_test.video_clips, args.clips_per_video)
     if args.distributed:
         train_sampler = DistributedSampler(train_sampler)
-        test_sampler = DistributedSampler(test_sampler)
+        test_sampler = DistributedSampler(test_sampler, shuffle=False)
 
     data_loader = torch.utils.data.DataLoader(
         dataset,
@@ -248,6 +275,9 @@ def main(args):
         scaler.load_state_dict(checkpoint["scaler"])
 
     if args.test_only:
+        # We disable the cudnn benchmarking because it can noticeably affect the accuracy
+        torch.backends.cudnn.benchmark = False
+        torch.backends.cudnn.deterministic = True
         evaluate(model, criterion, data_loader_test, device=device)
         return
 
@@ -335,6 +365,9 @@ def parse_args():
         help="Only test the model",
         action="store_true",
     )
+    parser.add_argument(
+        "--use-deterministic-algorithms", action="store_true", help="Forces the use of deterministic algorithms only."
+    )
 
     # distributed training parameters
     parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")

references/video_classification/utils.py (+12 −5)

@@ -30,11 +30,7 @@ def synchronize_between_processes(self):
         """
         Warning: does not synchronize the deque!
         """
-        if not is_dist_avail_and_initialized():
-            return
-        t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
-        dist.barrier()
-        dist.all_reduce(t)
+        t = reduce_across_processes([self.count, self.total])
         t = t.tolist()
         self.count = int(t[0])
         self.total = t[1]
@@ -255,3 +251,14 @@ def init_distributed_mode(args):
     )
     torch.distributed.barrier()
     setup_for_distributed(args.rank == 0)
+
+
+def reduce_across_processes(val):
+    if not is_dist_avail_and_initialized():
+        # nothing to sync, but we still convert to tensor for consistency with the distributed case.
+        return torch.tensor(val)
+
+    t = torch.tensor(val, device="cuda")
+    dist.barrier()
+    dist.all_reduce(t)
+    return t
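
Note: one subtle difference in this refactor: the removed code built the tensor as float64 explicitly, while the shared helper lets torch.tensor infer the dtype. For SmoothedValue's [count, total] list the inferred dtype is still floating point, so the round-trip it performs is unchanged:

    import torch

    count, total = 12, 34.5
    t = torch.tensor([count, total])  # dtype inferred from the float element
    print(t.dtype)                    # torch.float32 (explicitly float64 before)
    c, tot = t.tolist()
    print(int(c), tot)                # 12 34.5 -- what SmoothedValue unpacks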
