Skip to content

Commit 367f497

Browse files
authored
Fix max length in run_plm script (#8738)
1 parent e84786a commit 367f497

File tree

1 file changed

+9
-12
lines changed

1 file changed

+9
-12
lines changed

examples/language-modeling/run_plm.py

+9 -12
Original file line number | Diff line number | Diff line change
@@ -93,11 +93,11 @@ class DataTrainingArguments:
93 93
overwrite_cache: bool = field(
94 94
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
95 95
)
96-
max_seq_length: Optional[int] = field(
97-
default=None,
96+
max_seq_length: int = field(
97+
default=512,
98 98
metadata={
99 99
"help": "The maximum total input sequence length after tokenization. Sequences longer "
100-
"than this will be truncated. Default to the max input length of the model."
100+
"than this will be truncated."
101 101
},
102 102
)
103 103
preprocessing_num_workers: Optional[int] = field(
@@ -286,15 +286,12 @@ def tokenize_function(examples):
286 286
load_from_cache_file=not data_args.overwrite_cache,
287 287
)
288 288

289-
if data_args.max_seq_length is None:
290-
max_seq_length = tokenizer.model_max_length
291-
else:
292-
if data_args.max_seq_length > tokenizer.model_max_length:
293-
logger.warn(
294-
f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
295-
f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
296-
)
297-
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
289+
if data_args.max_seq_length > tokenizer.model_max_length:
290+
logger.warn(
291+
f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
292+
f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
293+
)
294+
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
298 295

299 296
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
300 297
# max_seq_length.

0 commit comments

Comments
 (0)