This repository was archived by the owner on Jan 21, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 256
/
Copy path: dataset_test.py
124 lines (109 loc) · 4.58 KB
/
dataset_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# coding=utf-8
# Copyright 2023 The Mesh TensorFlow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for mesh_tensorflow.transformer.dataset."""
from absl.testing import absltest
from absl.testing import parameterized
from mesh_tensorflow.transformer import dataset
import numpy as np
import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds
tf.disable_v2_behavior()
tf.enable_eager_execution()
class DatasetTest(parameterized.TestCase):
  """Tests for dataset.pack_dataset across typical packing scenarios."""

  # Parameter dicts applied to every packing test; only the pure-TF code
  # path (use_custom_ops=False) is exercised here.
  _PACK_PARAMETERS = ({"use_custom_ops": False},)

  def assert_dataset(self, ds, expected_ds, expected_dtypes):
    """Asserts that `ds` yields exactly `expected_ds`, element by element.

    Args:
      ds: a tf.data.Dataset to materialize and compare.
      expected_ds: list of dicts of expected feature arrays, in order.
      expected_dtypes: dict mapping feature names to the numpy dtype each
        materialized value must have; features absent here are not checked.
    """
    materialized = list(tfds.as_numpy(ds))
    self.assertLen(materialized, len(expected_ds))
    for got, want in zip(materialized, expected_ds):
      # Same feature keys must be present, in any order.
      self.assertCountEqual(list(got.keys()), list(want.keys()))
      for key, value in got.items():
        np.testing.assert_array_equal(value, want[key])
        if key in expected_dtypes:
          self.assertEqual(value.dtype.type, expected_dtypes[key])

  @parameterized.parameters(*_PACK_PARAMETERS)
  def test_pack_dataset(self, use_custom_ops):
    """Two short examples pack into one row; extra keys ('idx') are dropped."""
    examples = [{"inputs": [7, 8, 5, 1], "targets": [3, 9, 1], "idx": [0]},
                {"inputs": [8, 4, 9, 3, 1], "targets": [4, 1], "idx": [1]}]
    raw_ds = create_default_dataset(
        examples, feature_names=("inputs", "targets", "idx"))
    packed = dataset.pack_dataset(
        raw_ds,
        length={"inputs": 10, "targets": 7},
        keys=("inputs", "targets"),
        use_custom_ops=use_custom_ops)
    expected = [{
        "inputs": [7, 8, 5, 1, 8, 4, 9, 3, 1, 0],
        "inputs_segmentation": [1, 1, 1, 1, 2, 2, 2, 2, 2, 0],
        "inputs_position": [0, 1, 2, 3, 0, 1, 2, 3, 4, 0],
        "targets": [3, 9, 1, 4, 1, 0, 0],
        "targets_position": [0, 1, 2, 0, 1, 0, 0],
        "targets_segmentation": [1, 1, 1, 2, 2, 0, 0],
    }]
    self.assert_dataset(
        packed, expected, {"inputs": tf.int32, "targets": tf.int32})

  @parameterized.parameters(*_PACK_PARAMETERS)
  def test_pack_dataset_no_eos(self, use_custom_ops):
    """Sequences without a trailing EOS token still pack correctly."""
    examples = [{"inputs": [7, 8, 5], "targets": [3, 9]},
                {"inputs": [8, 4, 9, 3], "targets": [4]}]
    raw_ds = create_default_dataset(examples)
    packed = dataset.pack_dataset(
        raw_ds,
        length={"inputs": 8, "targets": 5},
        use_custom_ops=use_custom_ops)
    # Packing still works without the eos.
    expected = [{
        "inputs": [7, 8, 5, 8, 4, 9, 3, 0],
        "inputs_segmentation": [1, 1, 1, 2, 2, 2, 2, 0],
        "inputs_position": [0, 1, 2, 0, 1, 2, 3, 0],
        "targets": [3, 9, 4, 0, 0],
        "targets_position": [0, 1, 0, 0, 0],
        "targets_segmentation": [1, 1, 2, 0, 0],
    }]
    self.assert_dataset(
        packed, expected, {"inputs": tf.int32, "targets": tf.int32})

  @parameterized.parameters(*_PACK_PARAMETERS)
  def test_pack_dataset_long_seq(self, use_custom_ops):
    """Sequences at/over the packed length each occupy their own row."""
    examples = [{"inputs": [7, 8, 5, 6, 9, 4, 1], "targets": [3, 9, 1]},
                {"inputs": [8, 4, 9, 3, 5, 7, 9, 1], "targets": [4, 1]}]
    raw_ds = create_default_dataset(examples)
    packed = dataset.pack_dataset(
        raw_ds,
        length={"inputs": 7, "targets": 3},
        use_custom_ops=use_custom_ops)
    expected = [{
        "inputs": [7, 8, 5, 6, 9, 4, 1],
        "inputs_segmentation": [1, 1, 1, 1, 1, 1, 1],
        "inputs_position": [0, 1, 2, 3, 4, 5, 6],
        "targets": [3, 9, 1],
        "targets_position": [0, 1, 2],
        "targets_segmentation": [1, 1, 1],
    }, {
        # EOS is trimmed
        "inputs": [8, 4, 9, 3, 5, 7, 9],
        "inputs_segmentation": [1, 1, 1, 1, 1, 1, 1],
        "inputs_position": [0, 1, 2, 3, 4, 5, 6],
        "targets": [4, 1, 0],
        "targets_position": [0, 1, 0],
        "targets_segmentation": [1, 1, 0],
    }]
    self.assert_dataset(
        packed, expected, {"inputs": tf.int32, "targets": tf.int32})
def create_default_dataset(x, feature_names=("inputs", "targets")):
  """Builds a tf.data.Dataset from a list of feature dicts.

  Args:
    x: a list of dicts, each mapping feature names to 1-D int lists.
    feature_names: the feature keys to declare; each is typed int32 with an
      unknown (variable) length.

  Returns:
    A tf.data.Dataset yielding the elements of `x`.
  """
  types = {}
  shapes = {}
  for name in feature_names:
    types[name] = tf.int32
    shapes[name] = [None]
  return tf.data.Dataset.from_generator(
      lambda: x, output_types=types, output_shapes=shapes)
# Script entry point: discovers and runs all test cases in this module.
if __name__ == "__main__":
  absltest.main()