"""Generate videos of NuScene object instances
Example usage: python3 generate_videos.py --dataroot ~/repositories/nuscenes-devkit/data/sets/v1.0-mini --version v1.0-mini -o ~/repositories/nuscenes-devkit/output -f 4 -s 150 150 2
"""
import argparse
from collections import defaultdict
import json
import os
import numpy as np
from PIL import Image
import cv2
from tqdm import tqdm
import pathlib
from shutil import rmtree
def convert_annotation_list_to_dict(annotation_list, categories=None, visibilities=['', '1', '2', '3', '4']):
"""
When saving the list of annotations to a dictionary, special attention must be paid to the
correct keys to use.
For example, you will have bounding boxes with the same instance_token and sample_annotation_token
because there are multiple cameras on the car, so you can have the same object appearing across
multiple sensors. Each sensor's data is identified with a sample_data_token.
{'attribute_tokens': ['58aa28b1c2a54dc88e169808c07331e3'], 'bbox_corners': [1370.3079971217335, 446.66394956158524, 1600.0, 607.4567037983365], 'category_name': 'vehicle.car', 'filename': 'samples/CAM_FRONT/n008-2018-08-27-11-48-51-0400__CAM_FRONT__1535385095912404.jpg', 'instance_token': '0f8696c5e7284236b29a806d3d6f3513', 'next': '624a662244a241529e9f4d42fe75d2bd', 'num_lidar_pts': 4, 'num_radar_pts': 2, 'prev': '8291db1bc2704230867275bad5f42297', 'sample_annotation_token': 'ee04de72a30e4517a366ddad89d64fef', 'sample_data_token': '60ade2dececb46c69b114ce4c8a0bd3e', 'visibility_token': '1'}
{'attribute_tokens': ['58aa28b1c2a54dc88e169808c07331e3'], 'bbox_corners': [0.0, 446.3944232196225, 387.13952090477727, 618.0310593208171], 'category_name': 'vehicle.car', 'filename': 'samples/CAM_FRONT_RIGHT/n008-2018-08-27-11-48-51-0400__CAM_FRONT_RIGHT__1535385095920482.jpg', 'instance_token': '0f8696c5e7284236b29a806d3d6f3513', 'next': '624a662244a241529e9f4d42fe75d2bd', 'num_lidar_pts': 4, 'num_radar_pts': 2, 'prev': '8291db1bc2704230867275bad5f42297', 'sample_annotation_token': 'ee04de72a30e4517a366ddad89d64fef', 'sample_data_token': '92d49452e5804d0a9724ab4161a26147', 'visibility_token': '1'}
    The combination [instance_token][sample_annotation_token][camera_name] can be used to
    uniquely identify a bounding box. Enumerating [instance_token][sample_annotation_token]
    gives all the different camera views of a single annotated object.
    """
# Convert the list of instance to a dictionary that uses the
# instance_token -> sample_annotation_token -> camera
# to look up the instance
bbox_2d_annotations = defaultdict(lambda: defaultdict(dict))
num_dups = 0
for instance in annotation_list:
instance_token = instance['instance_token']
        # In the nuScenes schema:
        #   `sample` - an annotated snapshot of a scene at a particular timestamp.
        #   `sample_data` - data collected from a particular sensor, i.e. the picture
        #     captured by a single camera at a single timestamp.
        #   `sample_annotation_token` - identifies a single annotated bounding box, which may
        #     appear in multiple sample_data entries (across the different cameras).
sample_token = instance['sample_annotation_token']
category = instance['category_name']
visibility = instance['visibility_token']
camera_name = extract_camera_key_from_filename(instance['filename'])
# Append additional information
instance['camera_name'] = camera_name
instance['bbox_area'] = calculate_bb_area(instance['bbox_corners'])
if (categories is not None and category not in categories) or visibility not in visibilities:
continue
if instance_token in bbox_2d_annotations and sample_token in bbox_2d_annotations[instance_token] and camera_name in bbox_2d_annotations[instance_token][sample_token]:
num_dups += 1
print('Duplicate instance {}, sample {}, and camera {}'.format(
instance_token, sample_token, camera_name))
bbox_2d_annotations[instance_token][sample_token][camera_name] = instance
print("Number of duplicates (should be zero)", num_dups)
return bbox_2d_annotations
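# Illustrative lookup into the dictionary built above (a minimal sketch; the tokens and
# rounded bbox values are the example entries from the docstring, not data you are
# guaranteed to have locally):
#
#   anns = convert_annotation_list_to_dict(bbox_2d_annotations_list)
#   views = anns['0f8696c5e7284236b29a806d3d6f3513']['ee04de72a30e4517a366ddad89d64fef']
#   # `views` holds one entry per camera that saw this object at this timestamp, e.g.
#   #   views['CAM_FRONT']['bbox_corners']       -> [1370.3, 446.7, 1600.0, 607.5]
#   #   views['CAM_FRONT_RIGHT']['bbox_corners'] -> [0.0, 446.4, 387.1, 618.0]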
def extract_camera_key_from_filename(filename):
"""
Parameters:
    - filename: the path to the file where the sample image is stored.
Ex: 'samples/CAM_BACK/n015-2018-10-02-10-50-40+0800__CAM_BACK__1538448750037525.jpg',
"""
camera_name = filename.split('/')[1]
# Validate the camera name is valid
camera_names = ['CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT',
'CAM_FRONT', 'CAM_FRONT_LEFT', 'CAM_FRONT_RIGHT']
assert(camera_name in camera_names), "Invalid camera name: {} from path: {}".format(
camera_name, filename)
return camera_name
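# For example, using the path from the docstring above:
#   extract_camera_key_from_filename(
#       'samples/CAM_BACK/n015-2018-10-02-10-50-40+0800__CAM_BACK__1538448750037525.jpg')
#   returns 'CAM_BACK'.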
def calculate_bb_area(bounding_box):
"""
Calculates area of a 2D bounding box
Parameters:
- bounding_box: np.array of length 4 (x min, y min, x max, y max)
"""
x_min, y_min, x_max, y_max = bounding_box
return (x_max - x_min) * (y_max - y_min)
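# For example, calculate_bb_area([0.0, 0.0, 10.0, 5.0]) returns 50.0. The units are square
# pixels, since bbox_corners are given in image pixel coordinates.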
def get_most_visible_camera_annotation(camera_data_dict):
"""
Parameters:
- camera_data_dict: dictionary of form:
{
'CAM_BACK': {'attribute_tokens': ['cb5118da1ab342aa947717dc53544259'],
'bbox_corners': [600.8315617945755,
426.38901275036744,
643.6756536789582,
476.66593163100237],
'category_name': 'vehicle.bus.rigid',
'filename': 'samples/CAM_BACK/n015-2018-10-02-10-50-40+0800__CAM_BACK__1538448750037525.jpg',
'instance_token': '9cba9cd8af85487fb010652c90d845b5',
'next': 'ef90c2e525244b7d9eeb759837cf2277',
'num_lidar_pts': 0,
'num_radar_pts': 0,
'prev': '6628e81912584a72bd448a44931afb42',
'sample_annotation_token': '06b4886e79d2435c80bd23e7ac60c618',
'sample_data_token': '0008443755a14b3ca483f1489c767040',
'visibility_token': '4'},
'CAM_FRONT': ...
...
}
"""
# Loop through all the camera views to find the best view of this instance
# Each of the cameras will have a corresponding bounding box and visibility
# we want the largest bounding box and highest visibility
best_visibility = ''
largest_area = -1
best_camera_token = None
for camera_token in camera_data_dict:
visibility = camera_data_dict[camera_token]['visibility_token']
bbox_area = camera_data_dict[camera_token]['bbox_area']
if visibility > best_visibility or (visibility == best_visibility and bbox_area > largest_area):
best_camera_token = camera_token
largest_area = bbox_area
best_visibility = visibility
if not best_camera_token:
print('Unable to find any good views for camera data dict: {}'.format(
camera_data_dict))
best_instance_data = camera_data_dict[best_camera_token]
return best_instance_data
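# Sketch of the selection rule above: visibility ('1'-'4') is compared first, and bounding-box
# area only breaks ties. So, for example, a CAM_BACK view with visibility '4' and a small box
# wins over a CAM_FRONT view with visibility '2' and a much larger box. (Hypothetical values,
# for illustration only.)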
def get_cropped_image_for_annotation(sample_data_annotation, data_directory, output_size):
"""
Parameters:
- sample_data_annotation: of form:
```
{'attribute_tokens': ['cb5118da1ab342aa947717dc53544259'],
'bbox_corners': [600.8315617945755,
426.38901275036744,
643.6756536789582,
476.66593163100237],
'category_name': 'vehicle.bus.rigid',
'filename': 'samples/CAM_BACK/n015-2018-10-02-10-50-40+0800__CAM_BACK__1538448750037525.jpg',
'instance_token': '9cba9cd8af85487fb010652c90d845b5',
'next': 'ef90c2e525244b7d9eeb759837cf2277',
'num_lidar_pts': 0,
'num_radar_pts': 0,
'prev': '6628e81912584a72bd448a44931afb42',
'sample_annotation_token': '06b4886e79d2435c80bd23e7ac60c618',
'sample_data_token': '0008443755a14b3ca483f1489c767040',
'visibility_token': '4'},
```
"""
data_path = os.path.join(data_directory,
sample_data_annotation['filename'])
bbox = sample_data_annotation['bbox_corners']
im = Image.open(data_path)
im1 = im.crop(bbox)
im1 = im1.resize(output_size)
np_img = np.asarray(im1)
return np_img
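# Minimal sketch of what the helper above produces for a single annotation (assumes
# `dataroot` points at a nuScenes directory containing the referenced image file):
#
#   img = get_cropped_image_for_annotation(best_annotation, dataroot, (112, 112))
#   img.shape  # -> (112, 112, 3): the 2D box cropped out of the frame, resized, as RGB uint8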
def sort_sample_annotations_chronologically(instance_dict):
"""
Parameters:
- instance_dict: taken by indexing bbox_2d_annotations[instance_token]
    Uses instance_dict[sample_annotation_token]['best_annotation']['prev'] and ['next']
    to find the correct sequence
"""
# Find the first sample token
first_sample_token = None
for sample_token in instance_dict:
if instance_dict[sample_token]['best_annotation']['prev'] == '':
first_sample_token = sample_token
break
    if first_sample_token is None:
        # Can happen if the instance's first sample was filtered out earlier; return an
        # empty sequence rather than failing downstream.
        print("Unable to find a start token")
        return []
# Now iterate and find a list of the sample_tokens in order
sequential_sample_tokens = [first_sample_token]
    while True:
        try:
            next_sample_token = instance_dict[sequential_sample_tokens[-1]]['best_annotation']['next']
        except KeyError:
            print("Unrecognized sample annotation token in sequence: {}".format(
                sequential_sample_tokens))
            break
if next_sample_token == '':
break
sequential_sample_tokens.append(next_sample_token)
return sequential_sample_tokens
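# The ordering above simply walks the nuScenes prev/next linked list. For example, for three
# samples A <-> B <-> C where A['prev'] == '' and C['next'] == '', the returned sequence is
# [A, B, C]. (A, B, C stand in for real sample_annotation_token strings.)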
def remove_bad_samples(instance_annotation, minimum_bb_area, minimum_visibility, image_area=1600*900):
"""Removes bad samples from an instance annotation's sample sequence
Args:
instance_annotation (object): an instance annotation
        minimum_bb_area (float): The minimum fraction of the frame a bounding box must take up to be used (0, 1)
        minimum_visibility (string): The minimum visibility a frame is allowed to have ('', '1', '2', '3', '4')
image_area (int, optional): The area of an image frame. Defaults to 1600*900.
Returns: a cleaned list of sample annotation tokens that meet requirements
"""
sample_token_sequence = instance_annotation['sample_annotation_sequence']
cleaned = []
for sample_token in sample_token_sequence:
area = instance_annotation[sample_token]['best_annotation']['bbox_area']
visibility = instance_annotation[sample_token]['best_annotation']['visibility_token']
if area / image_area > minimum_bb_area and visibility >= minimum_visibility:
cleaned.append(sample_token)
return cleaned
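# Rough arithmetic for the area threshold above: with the default 1600x900 frames
# (image_area = 1,440,000 px) and minimum_bb_area = 0.01, a box must cover more than
# 14,400 px (roughly 120x120 or larger) to be kept.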
def main(version, dataroot, output, object_categories, fps, output_size, minimum_frames, minimum_bb_area, visibility):
"""Generates video sequences of NuScene object instances over time.
Expects the data to be organized as:
```
"$dataroot"/
samples - Sensor data for keyframes.
sweeps - Sensor data for intermediate frames.
maps - Folder for all map files: rasterized .png images and vectorized .json files.
v1.0-* - JSON tables that include all the meta data and annotations. Each split (trainval, test, mini) is provided in a separate folder.
Note that image_annotations.json should be inside this directory.
```
    Args:
        version (string): the NuScenes data version
        dataroot (string): the path to the data root directory
        output (string): the path to the output video directory
        object_categories (list): the object categories to extract videos for
        fps (int): frames per second to use for the video
        output_size (tuple): the output dimensions every cropped bounding box is resized to, e.g. (112, 112)
        minimum_frames (int): the minimum number of frames an instance must have
        minimum_bb_area (float): the minimum fraction of the frame a bounding box must take up to be used (0, 1)
        visibility (string): the minimum visibility a frame is allowed to have ('', '1', '2', '3', '4')
"""
print('='*20)
print('Generating video sequences:')
print('\t* Size: {}'.format(output_size))
print('\t* FPS: {}'.format(fps))
print('\t* Minimum frame count: {}'.format(minimum_frames))
print('\t* Minimum BB area: {}'.format(minimum_bb_area))
print('\t* Minimum visibility: {}'.format(visibility))
# ================================Load image annotations ========================================
with open(os.path.join(dataroot, version, 'image_annotations.json')) as json_file:
# A list of dictionaries
bbox_2d_annotations_list = json.load(json_file)
# These can be indexed with [instance_token][sample_annotation_token][camera_name] -> data about the annotation
# You can use the sample_annotation_token with the nuscenes helper in order to get
# the sample tokens
bbox_2d_annotations = convert_annotation_list_to_dict(
bbox_2d_annotations_list, categories=object_categories)
print('Number of unique vehicle instances: {}'.format(len(bbox_2d_annotations)))
# ==============================================================================================
# ===== For each instance and each sample annotation, find the best camera sensor to use ======
# Get sorted sample annotation tokens per instance per camera
for instance_token in bbox_2d_annotations:
for sample_annotation_token in bbox_2d_annotations[instance_token]:
bbox_2d_annotations[instance_token][sample_annotation_token]['best_annotation'] = get_most_visible_camera_annotation(
bbox_2d_annotations[instance_token][sample_annotation_token])
# ==============================================================================================
# ====== For each instance, find the correct sequence of sample annotations ====================
# Get sorted sample annotation tokens per instance per camera
for instance_token in bbox_2d_annotations:
bbox_2d_annotations[instance_token]['sample_annotation_sequence'] = sort_sample_annotations_chronologically(
bbox_2d_annotations[instance_token])
# ==============================================================================================
# ====== Remove samples from sequence that don't meet requirements ====================
for instance_token in bbox_2d_annotations:
bbox_2d_annotations[instance_token]['sample_annotation_sequence'] = remove_bad_samples(
bbox_2d_annotations[instance_token], minimum_bb_area, visibility)
# ==============================================================================================
# ====== Create videos for every instance ======================================================
# Remove the directory if it already exists and create new one
rmtree(output, ignore_errors=True)
pathlib.Path(output).mkdir(parents=True, exist_ok=True)
print("Creating videos and storing in '{}'...".format(output))
total_videos = 0
for instance_token in tqdm(bbox_2d_annotations):
sample_annotation_tokens = bbox_2d_annotations[instance_token]['sample_annotation_sequence']
if len(sample_annotation_tokens) < minimum_frames:
continue
video_path = os.path.join(
output, '{}.mp4'.format(instance_token))
# Need to use vp09 to be able to upload to certain data annotation platforms
out = cv2.VideoWriter(
video_path, cv2.VideoWriter_fourcc(*'vp09'), fps, output_size)
for sample_annotation_token in sample_annotation_tokens:
best_annotation = bbox_2d_annotations[instance_token][sample_annotation_token]['best_annotation']
cropped_img = get_cropped_image_for_annotation(
best_annotation, dataroot, output_size)
# Convert from PIL's RGB to cv2 BGR
out.write(cropped_img[:, :, ::-1])
out.release()
total_videos += 1
    print('Created {} videos ({} did not meet requirements).'.format(
        total_videos, len(bbox_2d_annotations) - total_videos))
# ==============================================================================================
print('='*20)
if __name__ == "__main__":
# Construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-d", "--dataroot", type=str,
help="The path to the root directory where the data is stored")
ap.add_argument("-v", "--version", type=str,
help="The NuScene's data version")
ap.add_argument("-o", "--output", type=str,
help="The output video directory")
ap.add_argument("-f", "--fps", type=int, default=2,
help="Frames per second for output video (use 2 to match speed of original data)")
ap.add_argument("-m", "--minimum_frames", type=int, default=9,
help="The minimum number of frames an instance must have")
ap.add_argument("-p", "--minimum_percentage", type=float, default=0.01,
help="The minimum fraction of a frame a bounding box take up to be used (0, 1)")
ap.add_argument("--visibility", type=str, default='2',
help="The minimum visibility a frame is allowed ('', '1', '2', '3', '4')")
ap.add_argument("-s", "--size", type=int, default=[112, 112], nargs=2,
help="Size of the output video")
# Excludes bicycle and motorcycle by default
vehicle_categories = ['vehicle.bus.bendy', 'vehicle.bus.rigid',
'vehicle.car', 'vehicle.construction', 'vehicle.trailer', 'vehicle.truck']
ap.add_argument("-c", "--categories", nargs='+',
help="The categories to extract videos for", required=False, default=vehicle_categories)
args = vars(ap.parse_args())
main(args['version'], args['dataroot'], args['output'], args['categories'],
args['fps'], tuple(args['size']), args['minimum_frames'], args['minimum_percentage'], args["visibility"])