
Commit 3bc9624

Update notebooks by re-syncing to transformers documentation (#118)
* Update notebooks
* Updates
* Final changes

1 parent f1ed99c

18 files changed: +2786 -2130 lines

transformers_doc/custom_datasets.ipynb (+646 -581)

Large diffs are not rendered by default.

transformers_doc/preprocessing.ipynb (+7 -6)
@@ -478,7 +478,7 @@
 "metadata": {},
 "source": [
 "We have seen the commands that will work for most cases (pad your batch to the length of the maximum sentence and\n",
-"truncate to the maximum length the mode can accept). However, the API supports more strategies if you need them. The\n",
+"truncate to the maximum length the model can accept). However, the API supports more strategies if you need them. The\n",
 "three arguments you need to know for this are `padding`, `truncation` and `max_length`.\n",
 "\n",
 "- `padding` controls the padding. It can be a boolean or a string which should be:\n",
@@ -493,15 +493,16 @@
 "\n",
 "- `truncation` controls the truncation. It can be a boolean or a string which should be:\n",
 "\n",
-"  - `True` or `'only_first'` truncate to a maximum length specified by the `max_length` argument or\n",
+"  - `True` or `'longest_first'` truncate to a maximum length specified by the `max_length` argument or\n",
 "    the maximum length accepted by the model if no `max_length` is provided (`max_length=None`). This will\n",
-"    only truncate the first sentence of a pair if a pair of sequence (or a batch of pairs of sequences) is provided.\n",
+"    truncate token by token, removing a token from the longest sequence in the pair until the proper length is\n",
+"    reached.\n",
 "  - `'only_second'` truncate to a maximum length specified by the `max_length` argument or the maximum\n",
 "    length accepted by the model if no `max_length` is provided (`max_length=None`). This will only truncate\n",
 "    the second sentence of a pair if a pair of sequence (or a batch of pairs of sequences) is provided.\n",
-"  - `'longest_first'` truncate to a maximum length specified by the `max_length` argument or the maximum\n",
-"    length accepted by the model if no `max_length` is provided (`max_length=None`). This will truncate token\n",
-"    by token, removing a token from the longest sequence in the pair until the proper length is reached.\n",
+"  - `'only_first'` truncate to a maximum length specified by the `max_length` argument or the maximum\n",
+"    length accepted by the model if no `max_length` is provided (`max_length=None`). This will only truncate\n",
+"    the first sentence of a pair if a pair of sequence (or a batch of pairs of sequences) is provided.\n",
 "  - `False` or `'do_not_truncate'` to not truncate the sequences. As we have seen before, this is the\n",
 "    default behavior.\n",
 "\n",

transformers_doc/pytorch/custom_datasets.ipynb (+707 -455)

Large diffs are not rendered by default.

transformers_doc/pytorch/preprocessing.ipynb (+7 -6)
@@ -436,7 +436,7 @@
 "metadata": {},
 "source": [
 "We have seen the commands that will work for most cases (pad your batch to the length of the maximum sentence and\n",
-"truncate to the maximum length the mode can accept). However, the API supports more strategies if you need them. The\n",
+"truncate to the maximum length the model can accept). However, the API supports more strategies if you need them. The\n",
 "three arguments you need to know for this are `padding`, `truncation` and `max_length`.\n",
 "\n",
 "- `padding` controls the padding. It can be a boolean or a string which should be:\n",
@@ -451,15 +451,16 @@
 "\n",
 "- `truncation` controls the truncation. It can be a boolean or a string which should be:\n",
 "\n",
-"  - `True` or `'only_first'` truncate to a maximum length specified by the `max_length` argument or\n",
+"  - `True` or `'longest_first'` truncate to a maximum length specified by the `max_length` argument or\n",
 "    the maximum length accepted by the model if no `max_length` is provided (`max_length=None`). This will\n",
-"    only truncate the first sentence of a pair if a pair of sequence (or a batch of pairs of sequences) is provided.\n",
+"    truncate token by token, removing a token from the longest sequence in the pair until the proper length is\n",
+"    reached.\n",
 "  - `'only_second'` truncate to a maximum length specified by the `max_length` argument or the maximum\n",
 "    length accepted by the model if no `max_length` is provided (`max_length=None`). This will only truncate\n",
 "    the second sentence of a pair if a pair of sequence (or a batch of pairs of sequences) is provided.\n",
-"  - `'longest_first'` truncate to a maximum length specified by the `max_length` argument or the maximum\n",
-"    length accepted by the model if no `max_length` is provided (`max_length=None`). This will truncate token\n",
-"    by token, removing a token from the longest sequence in the pair until the proper length is reached.\n",
+"  - `'only_first'` truncate to a maximum length specified by the `max_length` argument or the maximum\n",
+"    length accepted by the model if no `max_length` is provided (`max_length=None`). This will only truncate\n",
+"    the first sentence of a pair if a pair of sequence (or a batch of pairs of sequences) is provided.\n",
 "  - `False` or `'do_not_truncate'` to not truncate the sequences. As we have seen before, this is the\n",
 "    default behavior.\n",
 "\n",

transformers_doc/pytorch/quicktour.ipynb (+39 -13)
@@ -78,7 +78,18 @@
 "- Translation: translate a text in another language.\n",
 "- Feature extraction: return a tensor representation of the text.\n",
 "\n",
-"Let's see how this work for sentiment analysis (the other tasks are all covered in the [task summary](https://huggingface.co/transformers/task_summary.html)):"
+"Let's see how this work for sentiment analysis (the other tasks are all covered in the [task summary](https://huggingface.co/transformers/task_summary.html)):\n",
+"\n",
+"Install the following dependencies (if not already installed):"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"! pip install torch"
 ]
 },
 {
@@ -109,7 +120,7 @@
 {
 "data": {
 "text/plain": [
-"[{'label': 'POSITIVE', 'score': 0.9997795224189758}]"
+"[{'label': 'POSITIVE', 'score': 0.9998}]"
 ]
 },
 "execution_count": null,
@@ -125,8 +136,8 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"That's encouraging! You can use it on a list of sentences, which will be preprocessed then fed to the model as a\n",
-"*batch*, returning a list of dictionaries like this one:"
+"That's encouraging! You can use it on a list of sentences, which will be preprocessed then fed to the model, returning\n",
+"a list of dictionaries like this one:"
 ]
 },
 {
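The cells edited above boil down to this short, runnable sketch (assuming `transformers` and `torch` are installed; the example sentences are the quicktour's own):

from transformers import pipeline

# Downloads a default sentiment-analysis checkpoint on first use.
classifier = pipeline("sentiment-analysis")

# A single sentence returns a one-element list of dicts.
print(classifier("We are very happy to show you the 🤗 Transformers library."))

# A list of sentences returns one dict per sentence.
results = classifier([
    "We are very happy to show you the 🤗 Transformers library.",
    "We hope you don't hate it.",
])
for result in results:
    print(f"label: {result['label']}, score: {result['score']:.4f}")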
@@ -157,6 +168,8 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
+"To use with a large dataset, look at [iterating over a pipeline](https://huggingface.co/transformers/./main_classes/pipelines.html)\n",
+"\n",
 "You can see the second sentence has been classified as negative (it needs to be positive or negative) but its score is\n",
 "fairly neutral.\n",
 "\n",
@@ -338,7 +351,8 @@
 {
 "data": {
 "text/plain": [
-"{'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
+"{'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102],\n",
+" 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
 ]
 },
 "execution_count": null,
@@ -453,7 +467,7 @@
 "data": {
 "text/plain": [
 "SequenceClassifierOutput(loss=None, logits=tensor([[-4.0833, 4.3364],\n",
-" [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)"
+"        [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)"
 ]
 },
 "execution_count": null,
@@ -542,7 +556,7 @@
 "data": {
 "text/plain": [
 "SequenceClassifierOutput(loss=tensor(0.3167, grad_fn=<NllLossBackward>), logits=tensor([[-4.0833, 4.3364],\n",
-"[ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)"
+"        [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)"
 ]
 },
 "execution_count": null,
@@ -588,8 +602,20 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"tokenizer.save_pretrained(save_directory)\n",
-"model.save_pretrained(save_directory)"
+"pt_save_directory = './pt_save_pretrained'\n",
+"tokenizer.save_pretrained(pt_save_directory)\n",
+"pt_model.save_pretrained(pt_save_directory)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"tf_save_directory = './tf_save_pretrained'\n",
+"tokenizer.save_pretrained(tf_save_directory)\n",
+"tf_model.save_pretrained(tf_save_directory)"
 ]
 },
 {
@@ -609,8 +635,8 @@
 "outputs": [],
 "source": [
 "from transformers import TFAutoModel\n",
-"tokenizer = AutoTokenizer.from_pretrained(save_directory)\n",
-"model = TFAutoModel.from_pretrained(save_directory, from_pt=True)"
+"tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)\n",
+"tf_model = TFAutoModel.from_pretrained(pt_save_directory, from_pt=True)"
 ]
 },
 {
@@ -627,8 +653,8 @@
 "outputs": [],
 "source": [
 "from transformers import AutoModel\n",
-"tokenizer = AutoTokenizer.from_pretrained(save_directory)\n",
-"model = AutoModel.from_pretrained(save_directory, from_tf=True)"
+"tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)\n",
+"pt_model = AutoModel.from_pretrained(tf_save_directory, from_tf=True)"
 ]
 },
 {
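Put together, the save/reload flow these cells implement looks roughly like this (a sketch assuming both `torch` and `tensorflow` are available; the checkpoint name is the quicktour's default):

from transformers import AutoModel, AutoTokenizer, TFAutoModel

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
pt_model = AutoModel.from_pretrained(model_name)

# Save the PyTorch weights and tokenizer files to a local directory...
pt_save_directory = "./pt_save_pretrained"
tokenizer.save_pretrained(pt_save_directory)
pt_model.save_pretrained(pt_save_directory)

# ...then reload the same checkpoint in TensorFlow, converting the
# saved PyTorch weights with from_pt=True.
tf_model = TFAutoModel.from_pretrained(pt_save_directory, from_pt=True)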
