forked from tensorflow/models
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkeypoint_ops.py
376 lines (306 loc) · 15.2 KB
/
keypoint_ops.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keypoint operations.
Keypoints are represented as tensors of shape [num_instances, num_keypoints, 2],
where the last dimension holds rank 2 tensors of the form [y, x] representing
the coordinates of the keypoint.
"""
import numpy as np
import tensorflow.compat.v1 as tf
def scale(keypoints, y_scale, x_scale, scope=None):
"""Scales keypoint coordinates in x and y dimensions.
Args:
keypoints: a tensor of shape [num_instances, num_keypoints, 2]
y_scale: (float) scalar tensor
x_scale: (float) scalar tensor
scope: name scope.
Returns:
new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
"""
with tf.name_scope(scope, 'Scale'):
y_scale = tf.cast(y_scale, tf.float32)
x_scale = tf.cast(x_scale, tf.float32)
new_keypoints = keypoints * [[[y_scale, x_scale]]]
return new_keypoints
def clip_to_window(keypoints, window, scope=None):
"""Clips keypoints to a window.
This op clips any input keypoints to a window.
Args:
keypoints: a tensor of shape [num_instances, num_keypoints, 2]
window: a tensor of shape [4] representing the [y_min, x_min, y_max, x_max]
window to which the op should clip the keypoints.
scope: name scope.
Returns:
new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
"""
with tf.name_scope(scope, 'ClipToWindow'):
y, x = tf.split(value=keypoints, num_or_size_splits=2, axis=2)
win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window)
y = tf.maximum(tf.minimum(y, win_y_max), win_y_min)
x = tf.maximum(tf.minimum(x, win_x_max), win_x_min)
new_keypoints = tf.concat([y, x], 2)
return new_keypoints
def prune_outside_window(keypoints, window, scope=None):
"""Prunes keypoints that fall outside a given window.
This function replaces keypoints that fall outside the given window with nan.
See also clip_to_window which clips any keypoints that fall outside the given
window.
Args:
keypoints: a tensor of shape [num_instances, num_keypoints, 2]
window: a tensor of shape [4] representing the [y_min, x_min, y_max, x_max]
window outside of which the op should prune the keypoints.
scope: name scope.
Returns:
new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
"""
with tf.name_scope(scope, 'PruneOutsideWindow'):
y, x = tf.split(value=keypoints, num_or_size_splits=2, axis=2)
win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window)
valid_indices = tf.logical_and(
tf.logical_and(y >= win_y_min, y <= win_y_max),
tf.logical_and(x >= win_x_min, x <= win_x_max))
new_y = tf.where(valid_indices, y, np.nan * tf.ones_like(y))
new_x = tf.where(valid_indices, x, np.nan * tf.ones_like(x))
new_keypoints = tf.concat([new_y, new_x], 2)
return new_keypoints
def change_coordinate_frame(keypoints, window, scope=None):
"""Changes coordinate frame of the keypoints to be relative to window's frame.
Given a window of the form [y_min, x_min, y_max, x_max], changes keypoint
coordinates from keypoints of shape [num_instances, num_keypoints, 2]
to be relative to this window.
An example use case is data augmentation: where we are given groundtruth
keypoints and would like to randomly crop the image to some window. In this
case we need to change the coordinate frame of each groundtruth keypoint to be
relative to this new window.
Args:
keypoints: a tensor of shape [num_instances, num_keypoints, 2]
window: a tensor of shape [4] representing the [y_min, x_min, y_max, x_max]
window we should change the coordinate frame to.
scope: name scope.
Returns:
new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
"""
with tf.name_scope(scope, 'ChangeCoordinateFrame'):
win_height = window[2] - window[0]
win_width = window[3] - window[1]
new_keypoints = scale(keypoints - [window[0], window[1]], 1.0 / win_height,
1.0 / win_width)
return new_keypoints
def keypoints_to_enclosing_bounding_boxes(keypoints):
"""Creates enclosing bounding boxes from keypoints.
Args:
keypoints: a [num_instances, num_keypoints, 2] float32 tensor with keypoints
in [y, x] format.
Returns:
A [num_instances, 4] float32 tensor that tightly covers all the keypoints
for each instance.
"""
ymin = tf.math.reduce_min(keypoints[:, :, 0], axis=1)
xmin = tf.math.reduce_min(keypoints[:, :, 1], axis=1)
ymax = tf.math.reduce_max(keypoints[:, :, 0], axis=1)
xmax = tf.math.reduce_max(keypoints[:, :, 1], axis=1)
return tf.stack([ymin, xmin, ymax, xmax], axis=1)
def to_normalized_coordinates(keypoints, height, width,
check_range=True, scope=None):
"""Converts absolute keypoint coordinates to normalized coordinates in [0, 1].
Usually one uses the dynamic shape of the image or conv-layer tensor:
keypoints = keypoint_ops.to_normalized_coordinates(keypoints,
tf.shape(images)[1],
tf.shape(images)[2]),
This function raises an assertion failed error at graph execution time when
the maximum coordinate is smaller than 1.01 (which means that coordinates are
already normalized). The value 1.01 is to deal with small rounding errors.
Args:
keypoints: A tensor of shape [num_instances, num_keypoints, 2].
height: Maximum value for y coordinate of absolute keypoint coordinates.
width: Maximum value for x coordinate of absolute keypoint coordinates.
check_range: If True, checks if the coordinates are normalized.
scope: name scope.
Returns:
tensor of shape [num_instances, num_keypoints, 2] with normalized
coordinates in [0, 1].
"""
with tf.name_scope(scope, 'ToNormalizedCoordinates'):
height = tf.cast(height, tf.float32)
width = tf.cast(width, tf.float32)
if check_range:
max_val = tf.reduce_max(keypoints)
max_assert = tf.Assert(tf.greater(max_val, 1.01),
['max value is lower than 1.01: ', max_val])
with tf.control_dependencies([max_assert]):
width = tf.identity(width)
return scale(keypoints, 1.0 / height, 1.0 / width)
def to_absolute_coordinates(keypoints, height, width,
check_range=True, scope=None):
"""Converts normalized keypoint coordinates to absolute pixel coordinates.
This function raises an assertion failed error when the maximum keypoint
coordinate value is larger than 1.01 (in which case coordinates are already
absolute).
Args:
keypoints: A tensor of shape [num_instances, num_keypoints, 2]
height: Maximum value for y coordinate of absolute keypoint coordinates.
width: Maximum value for x coordinate of absolute keypoint coordinates.
check_range: If True, checks if the coordinates are normalized or not.
scope: name scope.
Returns:
tensor of shape [num_instances, num_keypoints, 2] with absolute coordinates
in terms of the image size.
"""
with tf.name_scope(scope, 'ToAbsoluteCoordinates'):
height = tf.cast(height, tf.float32)
width = tf.cast(width, tf.float32)
# Ensure range of input keypoints is correct.
if check_range:
max_val = tf.reduce_max(keypoints)
max_assert = tf.Assert(tf.greater_equal(1.01, max_val),
['maximum keypoint coordinate value is larger '
'than 1.01: ', max_val])
with tf.control_dependencies([max_assert]):
width = tf.identity(width)
return scale(keypoints, height, width)
def flip_horizontal(keypoints, flip_point, flip_permutation=None, scope=None):
"""Flips the keypoints horizontally around the flip_point.
This operation flips the x coordinate for each keypoint around the flip_point
and also permutes the keypoints in a manner specified by flip_permutation.
Args:
keypoints: a tensor of shape [num_instances, num_keypoints, 2]
flip_point: (float) scalar tensor representing the x coordinate to flip the
keypoints around.
flip_permutation: integer list or rank 1 int32 tensor containing the
keypoint flip permutation. This specifies the mapping from original
keypoint indices to the flipped keypoint indices. This is used primarily
for keypoints that are not reflection invariant. E.g. Suppose there are 3
keypoints representing ['head', 'right_eye', 'left_eye'], then a logical
choice for flip_permutation might be [0, 2, 1] since we want to swap the
'left_eye' and 'right_eye' after a horizontal flip.
Default to None or empty list to keep the original order after flip.
scope: name scope.
Returns:
new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
"""
with tf.name_scope(scope, 'FlipHorizontal'):
keypoints = tf.transpose(keypoints, [1, 0, 2])
if flip_permutation:
keypoints = tf.gather(keypoints, flip_permutation)
v, u = tf.split(value=keypoints, num_or_size_splits=2, axis=2)
u = flip_point * 2.0 - u
new_keypoints = tf.concat([v, u], 2)
new_keypoints = tf.transpose(new_keypoints, [1, 0, 2])
return new_keypoints
def flip_vertical(keypoints, flip_point, flip_permutation=None, scope=None):
"""Flips the keypoints vertically around the flip_point.
This operation flips the y coordinate for each keypoint around the flip_point
and also permutes the keypoints in a manner specified by flip_permutation.
Args:
keypoints: a tensor of shape [num_instances, num_keypoints, 2]
flip_point: (float) scalar tensor representing the y coordinate to flip the
keypoints around.
flip_permutation: integer list or rank 1 int32 tensor containing the
keypoint flip permutation. This specifies the mapping from original
keypoint indices to the flipped keypoint indices. This is used primarily
for keypoints that are not reflection invariant. E.g. Suppose there are 3
keypoints representing ['head', 'right_eye', 'left_eye'], then a logical
choice for flip_permutation might be [0, 2, 1] since we want to swap the
'left_eye' and 'right_eye' after a horizontal flip.
Default to None or empty list to keep the original order after flip.
scope: name scope.
Returns:
new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
"""
with tf.name_scope(scope, 'FlipVertical'):
keypoints = tf.transpose(keypoints, [1, 0, 2])
if flip_permutation:
keypoints = tf.gather(keypoints, flip_permutation)
v, u = tf.split(value=keypoints, num_or_size_splits=2, axis=2)
v = flip_point * 2.0 - v
new_keypoints = tf.concat([v, u], 2)
new_keypoints = tf.transpose(new_keypoints, [1, 0, 2])
return new_keypoints
def rot90(keypoints, rotation_permutation=None, scope=None):
"""Rotates the keypoints counter-clockwise by 90 degrees.
Args:
keypoints: a tensor of shape [num_instances, num_keypoints, 2]
rotation_permutation: integer list or rank 1 int32 tensor containing the
keypoint flip permutation. This specifies the mapping from original
keypoint indices to the rotated keypoint indices. This is used primarily
for keypoints that are not rotation invariant.
Default to None or empty list to keep the original order after rotation.
scope: name scope.
Returns:
new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
"""
with tf.name_scope(scope, 'Rot90'):
keypoints = tf.transpose(keypoints, [1, 0, 2])
if rotation_permutation:
keypoints = tf.gather(keypoints, rotation_permutation)
v, u = tf.split(value=keypoints[:, :, ::-1], num_or_size_splits=2, axis=2)
v = 1.0 - v
new_keypoints = tf.concat([v, u], 2)
new_keypoints = tf.transpose(new_keypoints, [1, 0, 2])
return new_keypoints
def keypoint_weights_from_visibilities(keypoint_visibilities,
per_keypoint_weights=None):
"""Returns a keypoint weights tensor.
During training, it is often beneficial to consider only those keypoints that
are labeled. This function returns a weights tensor that combines default
per-keypoint weights, as well as the visibilities of individual keypoints.
The returned tensor satisfies:
keypoint_weights[i, k] = per_keypoint_weights[k] * keypoint_visibilities[i, k]
where per_keypoint_weights[k] is set to 1 if not provided.
Args:
keypoint_visibilities: A [num_instances, num_keypoints] boolean tensor
indicating whether a keypoint is labeled (and perhaps even visible).
per_keypoint_weights: A list or 1-d tensor of length `num_keypoints` with
per-keypoint weights. If None, will use 1 for each visible keypoint
weight.
Returns:
A [num_instances, num_keypoints] float32 tensor with keypoint weights. Those
keypoints deemed visible will have the provided per-keypoint weight, and
all others will be set to zero.
"""
if per_keypoint_weights is None:
num_keypoints = keypoint_visibilities.shape.as_list()[1]
per_keypoint_weight_mult = tf.ones((1, num_keypoints,), dtype=tf.float32)
else:
per_keypoint_weight_mult = tf.expand_dims(per_keypoint_weights, axis=0)
return per_keypoint_weight_mult * tf.cast(keypoint_visibilities, tf.float32)
def set_keypoint_visibilities(keypoints, initial_keypoint_visibilities=None):
"""Sets keypoint visibilities based on valid/invalid keypoints.
Some keypoint operations set invisible keypoints (e.g. cropped keypoints) to
NaN, without affecting any keypoint "visibility" variables. This function is
used to update (or create) keypoint visibilities to agree with visible /
invisible keypoint coordinates.
Args:
keypoints: a float32 tensor of shape [num_instances, num_keypoints, 2].
initial_keypoint_visibilities: a boolean tensor of shape
[num_instances, num_keypoints]. If provided, will maintain the visibility
designation of a keypoint, so long as the corresponding coordinates are
not NaN. If not provided, will create keypoint visibilities directly from
the values in `keypoints` (i.e. NaN coordinates map to False, otherwise
they map to True).
Returns:
keypoint_visibilities: a bool tensor of shape [num_instances, num_keypoints]
indicating whether a keypoint is visible or not.
"""
if initial_keypoint_visibilities is not None:
keypoint_visibilities = tf.cast(initial_keypoint_visibilities, tf.bool)
else:
keypoint_visibilities = tf.ones_like(keypoints[:, :, 0], dtype=tf.bool)
keypoints_with_nan = tf.math.reduce_any(tf.math.is_nan(keypoints), axis=2)
keypoint_visibilities = tf.where(
keypoints_with_nan,
tf.zeros_like(keypoint_visibilities, dtype=tf.bool),
keypoint_visibilities)
return keypoint_visibilities