Skip to content

Commit e43e112

Browse files
update desc for map in all examples (#12226)
* update desc for map in all examples
* added plm
* suggestions
1 parent adb70ed commit e43e112

20 files changed

+84
-7
lines changed
examples/pytorch/language-modeling/requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
torch >= 1.3
2-
datasets >= 1.1.3
2+
datasets >= 1.8.0
33
sentencepiece != 0.1.92
44
protobuf

examples/pytorch/language-modeling/run_clm.py

+4
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,12 @@
4646
from transformers.testing_utils import CaptureLogger
4747
from transformers.trainer_utils import get_last_checkpoint
4848
from transformers.utils import check_min_version
49+
from transformers.utils.versions import require_version
4950

5051

5152
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
5253
check_min_version("4.8.0.dev0")
54+
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
5355

5456
logger = logging.getLogger(__name__)
5557

@@ -355,6 +357,7 @@ def tokenize_function(examples):
355357
num_proc=data_args.preprocessing_num_workers,
356358
remove_columns=column_names,
357359
load_from_cache_file=not data_args.overwrite_cache,
360+
desc="Running tokenizer on dataset",
358361
)
359362

360363
if data_args.block_size is None:
@@ -401,6 +404,7 @@ def group_texts(examples):
401404
batched=True,
402405
num_proc=data_args.preprocessing_num_workers,
403406
load_from_cache_file=not data_args.overwrite_cache,
407+
desc=f"Grouping texts in chunks of {block_size}",
404408
)
405409

406410
if training_args.do_train:

examples/pytorch/language-modeling/run_clm_no_trainer.py

+6
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,13 @@
4848
get_scheduler,
4949
set_seed,
5050
)
51+
from transformers.utils.versions import require_version
5152

5253

5354
logger = logging.getLogger(__name__)
55+
56+
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
57+
5458
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
5559
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
5660

@@ -300,6 +304,7 @@ def tokenize_function(examples):
300304
num_proc=args.preprocessing_num_workers,
301305
remove_columns=column_names,
302306
load_from_cache_file=not args.overwrite_cache,
307+
desc="Running tokenizer on dataset",
303308
)
304309

305310
if args.block_size is None:
@@ -346,6 +351,7 @@ def group_texts(examples):
346351
batched=True,
347352
num_proc=args.preprocessing_num_workers,
348353
load_from_cache_file=not args.overwrite_cache,
354+
desc=f"Grouping texts in chunks of {block_size}",
349355
)
350356

351357
train_dataset = lm_datasets["train"]

examples/pytorch/language-modeling/run_mlm.py

+5
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,12 @@
4545
)
4646
from transformers.trainer_utils import get_last_checkpoint
4747
from transformers.utils import check_min_version
48+
from transformers.utils.versions import require_version
4849

4950

5051
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
5152
check_min_version("4.8.0.dev0")
53+
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
5254

5355
logger = logging.getLogger(__name__)
5456
MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
@@ -380,6 +382,7 @@ def tokenize_function(examples):
380382
num_proc=data_args.preprocessing_num_workers,
381383
remove_columns=[text_column_name],
382384
load_from_cache_file=not data_args.overwrite_cache,
385+
desc="Running tokenizer on dataset line_by_line",
383386
)
384387
else:
385388
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
@@ -394,6 +397,7 @@ def tokenize_function(examples):
394397
num_proc=data_args.preprocessing_num_workers,
395398
remove_columns=column_names,
396399
load_from_cache_file=not data_args.overwrite_cache,
400+
desc="Running tokenizer on every text in dataset",
397401
)
398402

399403
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
@@ -424,6 +428,7 @@ def group_texts(examples):
424428
batched=True,
425429
num_proc=data_args.preprocessing_num_workers,
426430
load_from_cache_file=not data_args.overwrite_cache,
431+
desc=f"Grouping texts in chunks of {max_seq_length}",
427432
)
428433

429434
if training_args.do_train:

examples/pytorch/language-modeling/run_mlm_no_trainer.py

+5
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,11 @@
4848
get_scheduler,
4949
set_seed,
5050
)
51+
from transformers.utils.versions import require_version
5152

5253

5354
logger = logging.getLogger(__name__)
55+
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
5456
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
5557
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
5658

@@ -346,6 +348,7 @@ def tokenize_function(examples):
346348
num_proc=args.preprocessing_num_workers,
347349
remove_columns=[text_column_name],
348350
load_from_cache_file=not args.overwrite_cache,
351+
desc="Running tokenizer on dataset line_by_line",
349352
)
350353
else:
351354
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
@@ -360,6 +363,7 @@ def tokenize_function(examples):
360363
num_proc=args.preprocessing_num_workers,
361364
remove_columns=column_names,
362365
load_from_cache_file=not args.overwrite_cache,
366+
desc="Running tokenizer on every text in dataset",
363367
)
364368

365369
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
@@ -390,6 +394,7 @@ def group_texts(examples):
390394
batched=True,
391395
num_proc=args.preprocessing_num_workers,
392396
load_from_cache_file=not args.overwrite_cache,
397+
desc=f"Grouping texts in chunks of {max_seq_length}",
393398
)
394399

395400
train_dataset = tokenized_datasets["train"]

examples/pytorch/language-modeling/run_plm.py

+5
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,12 @@
4141
)
4242
from transformers.trainer_utils import get_last_checkpoint
4343
from transformers.utils import check_min_version
44+
from transformers.utils.versions import require_version
4445

4546

4647
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
4748
check_min_version("4.8.0.dev0")
49+
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
4850

4951
logger = logging.getLogger(__name__)
5052

@@ -358,6 +360,7 @@ def tokenize_function(examples):
358360
num_proc=data_args.preprocessing_num_workers,
359361
remove_columns=[text_column_name],
360362
load_from_cache_file=not data_args.overwrite_cache,
363+
desc="Running tokenizer on dataset line_by_line",
361364
)
362365
else:
363366
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
@@ -370,6 +373,7 @@ def tokenize_function(examples):
370373
num_proc=data_args.preprocessing_num_workers,
371374
remove_columns=column_names,
372375
load_from_cache_file=not data_args.overwrite_cache,
376+
desc="Running tokenizer on every text in dataset",
373377
)
374378

375379
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
@@ -400,6 +404,7 @@ def group_texts(examples):
400404
batched=True,
401405
num_proc=data_args.preprocessing_num_workers,
402406
load_from_cache_file=not data_args.overwrite_cache,
407+
desc=f"Grouping texts in chunks of {max_seq_length}",
403408
)
404409

405410
if training_args.do_train:

examples/pytorch/question-answering/requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
datasets >= 1.4.0
1+
datasets >= 1.8.0
22
torch >= 1.3.0

examples/pytorch/question-answering/run_qa.py

+5
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,13 @@
4242
)
4343
from transformers.trainer_utils import get_last_checkpoint
4444
from transformers.utils import check_min_version
45+
from transformers.utils.versions import require_version
4546
from utils_qa import postprocess_qa_predictions
4647

4748

4849
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
4950
check_min_version("4.8.0.dev0")
51+
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
5052

5153
logger = logging.getLogger(__name__)
5254

@@ -417,6 +419,7 @@ def prepare_train_features(examples):
417419
num_proc=data_args.preprocessing_num_workers,
418420
remove_columns=column_names,
419421
load_from_cache_file=not data_args.overwrite_cache,
422+
desc="Running tokenizer on train dataset",
420423
)
421424
if data_args.max_train_samples is not None:
422425
# Number of samples might increase during Feature Creation, We select only specified max samples
@@ -478,6 +481,7 @@ def prepare_validation_features(examples):
478481
num_proc=data_args.preprocessing_num_workers,
479482
remove_columns=column_names,
480483
load_from_cache_file=not data_args.overwrite_cache,
484+
desc="Running tokenizer on validation dataset",
481485
)
482486
if data_args.max_eval_samples is not None:
483487
# During Feature creation dataset samples might increase, we will select required samples again
@@ -497,6 +501,7 @@ def prepare_validation_features(examples):
497501
num_proc=data_args.preprocessing_num_workers,
498502
remove_columns=column_names,
499503
load_from_cache_file=not data_args.overwrite_cache,
504+
desc="Running tokenizer on prediction dataset",
500505
)
501506
if data_args.max_predict_samples is not None:
502507
# During Feature creation dataset samples might increase, we will select required samples again

examples/pytorch/question-answering/run_qa_beam_search.py

+5
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,13 @@
4141
)
4242
from transformers.trainer_utils import get_last_checkpoint
4343
from transformers.utils import check_min_version
44+
from transformers.utils.versions import require_version
4445
from utils_qa import postprocess_qa_predictions_with_beam_search
4546

4647

4748
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
4849
check_min_version("4.8.0.dev0")
50+
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
4951

5052
logger = logging.getLogger(__name__)
5153

@@ -429,6 +431,7 @@ def prepare_train_features(examples):
429431
num_proc=data_args.preprocessing_num_workers,
430432
remove_columns=column_names,
431433
load_from_cache_file=not data_args.overwrite_cache,
434+
desc="Running tokenizer on train dataset",
432435
)
433436
if data_args.max_train_samples is not None:
434437
# Select samples from dataset again since Feature Creation might increase number of features
@@ -514,6 +517,7 @@ def prepare_validation_features(examples):
514517
num_proc=data_args.preprocessing_num_workers,
515518
remove_columns=column_names,
516519
load_from_cache_file=not data_args.overwrite_cache,
520+
desc="Running tokenizer on validation dataset",
517521
)
518522
if data_args.max_eval_samples is not None:
519523
# Selecting Samples from Dataset again since Feature Creation might increase samples size
@@ -533,6 +537,7 @@ def prepare_validation_features(examples):
533537
num_proc=data_args.preprocessing_num_workers,
534538
remove_columns=column_names,
535539
load_from_cache_file=not data_args.overwrite_cache,
540+
desc="Running tokenizer on prediction dataset",
536541
)
537542
if data_args.max_predict_samples is not None:
538543
# During Feature creation dataset samples might increase, we will select required samples again

examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py

+5
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,13 @@
4646
set_seed,
4747
)
4848
from transformers.utils import check_min_version
49+
from transformers.utils.versions import require_version
4950
from utils_qa import postprocess_qa_predictions_with_beam_search
5051

5152

5253
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
5354
check_min_version("4.8.0.dev0")
55+
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
5456

5557
logger = logging.getLogger(__name__)
5658

@@ -419,6 +421,7 @@ def prepare_train_features(examples):
419421
num_proc=args.preprocessing_num_workers,
420422
remove_columns=column_names,
421423
load_from_cache_file=not args.overwrite_cache,
424+
desc="Running tokenizer on train dataset",
422425
)
423426
if args.max_train_samples is not None:
424427
# Number of samples might increase during Feature Creation, We select only specified max samples
@@ -503,6 +506,7 @@ def prepare_validation_features(examples):
503506
num_proc=args.preprocessing_num_workers,
504507
remove_columns=column_names,
505508
load_from_cache_file=not args.overwrite_cache,
509+
desc="Running tokenizer on validation dataset",
506510
)
507511

508512
if args.max_eval_samples is not None:
@@ -523,6 +527,7 @@ def prepare_validation_features(examples):
523527
num_proc=args.preprocessing_num_workers,
524528
remove_columns=column_names,
525529
load_from_cache_file=not args.overwrite_cache,
530+
desc="Running tokenizer on prediction dataset",
526531
)
527532
if args.max_predict_samples is not None:
528533
# During Feature creation dataset samples might increase, we will select required samples again

examples/pytorch/question-answering/run_qa_no_trainer.py

+5
Original file line numberDiff line numberDiff line change
@@ -48,11 +48,13 @@
4848
set_seed,
4949
)
5050
from transformers.utils import check_min_version
51+
from transformers.utils.versions import require_version
5152
from utils_qa import postprocess_qa_predictions
5253

5354

5455
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
5556
check_min_version("4.8.0.dev0")
57+
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
5658

5759
logger = logging.getLogger(__name__)
5860
# You should update this to your particular problem to have better documentation of `model_type`
@@ -448,6 +450,7 @@ def prepare_train_features(examples):
448450
num_proc=args.preprocessing_num_workers,
449451
remove_columns=column_names,
450452
load_from_cache_file=not args.overwrite_cache,
453+
desc="Running tokenizer on train dataset",
451454
)
452455
if args.max_train_samples is not None:
453456
# Number of samples might increase during Feature Creation, We select only specified max samples
@@ -508,6 +511,7 @@ def prepare_validation_features(examples):
508511
num_proc=args.preprocessing_num_workers,
509512
remove_columns=column_names,
510513
load_from_cache_file=not args.overwrite_cache,
514+
desc="Running tokenizer on validation dataset",
511515
)
512516

513517
if args.max_eval_samples is not None:
@@ -528,6 +532,7 @@ def prepare_validation_features(examples):
528532
num_proc=args.preprocessing_num_workers,
529533
remove_columns=column_names,
530534
load_from_cache_file=not args.overwrite_cache,
535+
desc="Running tokenizer on prediction dataset",
531536
)
532537
if args.max_predict_samples is not None:
533538
# During Feature creation dataset samples might increase, we will select required samples again

examples/pytorch/summarization/requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
datasets >= 1.1.3
1+
datasets >= 1.8.0
22
sentencepiece != 0.1.92
33
protobuf
44
rouge-score

examples/pytorch/summarization/run_summarization.py

+5
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,12 @@
4343
from transformers.file_utils import is_offline_mode
4444
from transformers.trainer_utils import get_last_checkpoint
4545
from transformers.utils import check_min_version
46+
from transformers.utils.versions import require_version
4647

4748

4849
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
4950
check_min_version("4.8.0.dev0")
51+
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
5052

5153
logger = logging.getLogger(__name__)
5254

@@ -433,6 +435,7 @@ def preprocess_function(examples):
433435
num_proc=data_args.preprocessing_num_workers,
434436
remove_columns=column_names,
435437
load_from_cache_file=not data_args.overwrite_cache,
438+
desc="Running tokenizer on train dataset",
436439
)
437440

438441
if training_args.do_eval:
@@ -448,6 +451,7 @@ def preprocess_function(examples):
448451
num_proc=data_args.preprocessing_num_workers,
449452
remove_columns=column_names,
450453
load_from_cache_file=not data_args.overwrite_cache,
454+
desc="Running tokenizer on validation dataset",
451455
)
452456

453457
if training_args.do_predict:
@@ -463,6 +467,7 @@ def preprocess_function(examples):
463467
num_proc=data_args.preprocessing_num_workers,
464468
remove_columns=column_names,
465469
load_from_cache_file=not data_args.overwrite_cache,
470+
desc="Running tokenizer on prediction dataset",
466471
)
467472

468473
# Data collator

examples/pytorch/summarization/run_summarization_no_trainer.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,12 @@
4848
set_seed,
4949
)
5050
from transformers.file_utils import is_offline_mode
51+
from transformers.utils.versions import require_version
5152

5253

5354
logger = logging.getLogger(__name__)
55+
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
56+
5457
# You should update this to your particular problem to have better documentation of `model_type`
5558
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
5659
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
@@ -419,7 +422,11 @@ def preprocess_function(examples):
419422
return model_inputs
420423

421424
processed_datasets = raw_datasets.map(
422-
preprocess_function, batched=True, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache
425+
preprocess_function,
426+
batched=True,
427+
remove_columns=column_names,
428+
load_from_cache_file=not args.overwrite_cache,
429+
desc="Running tokenizer on dataset",
423430
)
424431

425432
train_dataset = processed_datasets["train"]

0 commit comments

Comments
 (0)