From 4b1833a65eb213be1e361a9b9dafb8fe8143c62e Mon Sep 17 00:00:00 2001 From: Janusz Lisiecki <39967756+JanuszL@users.noreply.github.com> Date: Wed, 11 Sep 2024 07:29:28 +0200 Subject: [PATCH] Adds `enable_frame_num` to the experimental video reader (#5628) - adds an ability to output frame numbers in the experimental video reader - this allows making VideoReaderDecoderCpuTest.RandomShuffle_* test independent from the hardcoding of the expected frame order (in case random generator implementation changes) Signed-off-by: Janusz Lisiecki --- .../loader/video/video_loader_decoder_base.h | 5 +- .../loader/video/video_loader_decoder_cpu.cc | 3 + .../reader/video_reader_decoder_cpu_op.cc | 21 +++++- .../reader/video_reader_decoder_cpu_op.h | 1 + .../reader/video_reader_decoder_gpu_op.cc | 66 +++++++++++++------ .../reader/video_reader_decoder_gpu_op.h | 1 + .../reader/video_reader_decoder_op_test.cc | 34 ++++++++-- 7 files changed, 100 insertions(+), 31 deletions(-) diff --git a/dali/operators/reader/loader/video/video_loader_decoder_base.h b/dali/operators/reader/loader/video/video_loader_decoder_base.h index 1d64263fcfa..79087d28542 100644 --- a/dali/operators/reader/loader/video/video_loader_decoder_base.h +++ b/dali/operators/reader/loader/video/video_loader_decoder_base.h @@ -35,7 +35,8 @@ template class VideoSample { public: Tensor data_; - int label_; + int label_ = -1; + int first_frame_ = -1; }; class VideoLoaderDecoderBase { @@ -46,6 +47,7 @@ class VideoLoaderDecoderBase { stride_(spec.GetArgument("stride")), step_(spec.GetArgument("step")) { has_labels_ = spec.TryGetRepeatedArgument(labels_, "labels"); + has_frame_idx_ = spec.GetArgument("enable_frame_num"); DALI_ENFORCE( !has_labels_ || labels_.size() == filenames_.size(), make_string( @@ -61,6 +63,7 @@ class VideoLoaderDecoderBase { std::vector filenames_; std::vector labels_; bool has_labels_ = false; + bool has_frame_idx_ = false; Index current_index_ = 0; diff --git a/dali/operators/reader/loader/video/video_loader_decoder_cpu.cc b/dali/operators/reader/loader/video/video_loader_decoder_cpu.cc index 3fea566242c..3a3a3715d8a 100644 --- a/dali/operators/reader/loader/video/video_loader_decoder_cpu.cc +++ b/dali/operators/reader/loader/video/video_loader_decoder_cpu.cc @@ -45,6 +45,9 @@ void VideoLoaderDecoderCpu::ReadSample(VideoSample &sample) { if (has_labels_) { sample.label_ = labels_[sample_span.video_idx_]; } + if (has_frame_idx_) { + sample.first_frame_ = sample_span.start_; + } } Index VideoLoaderDecoderCpu::SizeImpl() { diff --git a/dali/operators/reader/video_reader_decoder_cpu_op.cc b/dali/operators/reader/video_reader_decoder_cpu_op.cc index bae530606dd..72826a16619 100644 --- a/dali/operators/reader/video_reader_decoder_cpu_op.cc +++ b/dali/operators/reader/video_reader_decoder_cpu_op.cc @@ -20,7 +20,8 @@ namespace dali { VideoReaderDecoderCpu::VideoReaderDecoderCpu(const OpSpec &spec) : DataReader(spec), - has_labels_(spec.HasArgument("labels")) { + has_labels_(spec.HasArgument("labels")), + has_frame_idx_(spec.GetArgument("enable_frame_num")) { loader_ = InitLoader(spec); this->SetInitialSnapshot(); } @@ -32,16 +33,26 @@ void VideoReaderDecoderCpu::RunImpl(SampleWorkspace &ws) { video_output.Copy(sample.data_); video_output.SetSourceInfo(sample.data_.GetSourceInfo()); + int out_index = 1; if (has_labels_) { - auto &label_output = ws.Output(1); + auto &label_output = ws.Output(out_index); label_output.Resize({}, DALIDataType::DALI_INT32); label_output.mutable_data()[0] = sample.label_; + out_index++; + } + if (has_frame_idx_) { + auto &frame_idx_output = ws.Output(out_index); + frame_idx_output.Resize({}, DALIDataType::DALI_INT32); + frame_idx_output.mutable_data()[0] = sample.first_frame_; + out_index++; } } namespace detail { inline int VideoReaderDecoderOutputFn(const OpSpec &spec) { - return spec.HasArgument("labels") ? 2 : 1; + bool has_labels = spec.HasArgument("labels"); + bool has_frame_num_output = spec.GetArgument("enable_frame_num"); + return 1 + has_labels + has_frame_num_output; } } // namespace detail @@ -68,6 +79,10 @@ even in the variable frame rate scenario.)code") .AddArg("sequence_length", R"code(Frames to load per sequence.)code", DALI_INT32) + .AddOptionalArg("enable_frame_num", + R"code(If set, returns the index of the first frame in the decoded sequence +as an additional output.)code", + false) .AddOptionalArg("step", R"code(Frame interval between each sequence. diff --git a/dali/operators/reader/video_reader_decoder_cpu_op.h b/dali/operators/reader/video_reader_decoder_cpu_op.h index 3292e8fe777..9912f18bcd3 100644 --- a/dali/operators/reader/video_reader_decoder_cpu_op.h +++ b/dali/operators/reader/video_reader_decoder_cpu_op.h @@ -29,6 +29,7 @@ class VideoReaderDecoderCpu private: bool has_labels_ = false; + bool has_frame_idx_ = false; }; } // namespace dali diff --git a/dali/operators/reader/video_reader_decoder_gpu_op.cc b/dali/operators/reader/video_reader_decoder_gpu_op.cc index 3e570490b8d..ec09f7741d9 100644 --- a/dali/operators/reader/video_reader_decoder_gpu_op.cc +++ b/dali/operators/reader/video_reader_decoder_gpu_op.cc @@ -20,7 +20,8 @@ namespace dali { VideoReaderDecoderGpu::VideoReaderDecoderGpu(const OpSpec &spec) : DataReader(spec), - has_labels_(spec.HasArgument("labels")) { + has_labels_(spec.HasArgument("labels")), + has_frame_idx_(spec.GetArgument("enable_frame_num")) { loader_ = InitLoader(spec); this->SetInitialSnapshot(); } @@ -50,14 +51,21 @@ bool VideoReaderDecoderGpu::SetupImpl( output_desc[0] = { video_shape, DALI_UINT8 }; - if (!has_labels_) { - return true; + int out_index = 1; + if (has_labels_) { + output_desc[out_index] = { + uniform_list_shape<1>(batch_size, {1}), + DALI_INT32 + }; + out_index++; + } + if (has_frame_idx_) { + output_desc[out_index] = { + uniform_list_shape<1>(batch_size, {1}), + DALI_INT32 + }; + out_index++; } - - output_desc[1] = { - uniform_list_shape<1>(batch_size, {1}), - DALI_INT32 - }; return true; } @@ -80,23 +88,39 @@ void VideoReaderDecoderGpu::RunImpl(Workspace &ws) { video_output.SetSourceInfo(sample_id, sample.data_.GetSourceInfo()); } - if (!has_labels_) { - return; - } + int out_index = 1; + if (has_labels_) { + auto &labels_output = ws.Output(out_index); + SmallVector labels_cpu; - auto &labels_output = ws.Output(1); - SmallVector labels_cpu; + for (int sample_id = 0; sample_id < batch_size; ++sample_id) { + auto &sample = GetSample(sample_id); + labels_cpu[sample_id] = sample.label_; + } - for (int sample_id = 0; sample_id < batch_size; ++sample_id) { - auto &sample = GetSample(sample_id); - labels_cpu[sample_id] = sample.label_; + MemCopy( + labels_output.AsTensor().raw_mutable_data(), + labels_cpu.data(), + batch_size * sizeof(DALI_INT32), + ws.stream()); + out_index++; } + if (has_frame_idx_) { + auto &frame_idx_output = ws.Output(out_index); + SmallVector frame_idx_output_cpu; + + for (int sample_id = 0; sample_id < batch_size; ++sample_id) { + auto &sample = GetSample(sample_id); + frame_idx_output_cpu[sample_id] = sample.span_ ? sample.span_->start_ : -1; + } - MemCopy( - labels_output.AsTensor().raw_mutable_data(), - labels_cpu.data(), - batch_size * sizeof(DALI_INT32), - ws.stream()); + MemCopy( + frame_idx_output.AsTensor().raw_mutable_data(), + frame_idx_output_cpu.data(), + batch_size * sizeof(DALI_INT32), + ws.stream()); + out_index++; + } } DALI_REGISTER_OPERATOR(experimental__readers__Video, VideoReaderDecoderGpu, GPU); diff --git a/dali/operators/reader/video_reader_decoder_gpu_op.h b/dali/operators/reader/video_reader_decoder_gpu_op.h index e452554b57a..64c24163e0c 100644 --- a/dali/operators/reader/video_reader_decoder_gpu_op.h +++ b/dali/operators/reader/video_reader_decoder_gpu_op.h @@ -35,6 +35,7 @@ class VideoReaderDecoderGpu : public DataReader + int GetFrameIdx(dali::TensorList &device_frame_idx); + private: template void RunTestImpl( @@ -129,15 +132,15 @@ class VideoReaderDecoderBaseTest : public VideoTestBase { .AddArg("device", backend) .AddArg("sequence_length", sequence_length) .AddArg("random_shuffle", true) + .AddArg("enable_frame_num", true) .AddArg("initial_fill", cfr_videos_[0].NumFrames()) .AddArg( "filenames", std::vector{cfr_videos_paths_[0]}) - .AddOutput("frames", backend)); - - pipe.Build({{"frames", backend}}); + .AddOutput("frames", backend) + .AddOutput("frame_idx", backend)); - std::vector expected_order = {29, 46, 33, 6, 37}; + pipe.Build({{"frames", backend}, {"frame_idx", backend}}); int num_sequences = 5; @@ -148,9 +151,10 @@ class VideoReaderDecoderBaseTest : public VideoTestBase { auto &frame_video_output = ws.Output(0); const auto sample = frame_video_output.template tensor(0); + int frame_idx = GetFrameIdx(ws.Output(1)); - // We want to access correct order, so we comapre only the first frame of the sequence - AssertFrame(expected_order[sequence_id], sample, ground_truth_video); + // We want to access correct order, so we compare only the first frame of the sequence + AssertFrame(frame_idx, sample, ground_truth_video); } } }; @@ -168,6 +172,15 @@ void VideoReaderDecoderBaseTest::RunShuffleTest() { RunShuffleTestImpl("cpu", dali::CPU_ONLY_DEVICE_ID); } +template<> +int VideoReaderDecoderBaseTest::GetFrameIdx( + dali::TensorList &device_frame_idx) { + const auto frame_idx = device_frame_idx.template tensor(0); + int frame_idx_buffer = -1; + std::copy_n(frame_idx, 1, &frame_idx_buffer); + return frame_idx_buffer; +} + template<> void VideoReaderDecoderBaseTest::RunTest( std::vector &videos_paths, @@ -181,6 +194,15 @@ void VideoReaderDecoderBaseTest::RunShuffleTest() { RunShuffleTestImpl("gpu", 0); } +template<> +int VideoReaderDecoderBaseTest::GetFrameIdx( + dali::TensorList &device_frame_idx) { + const auto frame_idx = device_frame_idx.template tensor(0); + int frame_idx_buffer = -1; + MemCopy(&frame_idx_buffer, frame_idx, sizeof(int)); + return frame_idx_buffer; +} + class VideoReaderDecoderCpuTest : public VideoReaderDecoderBaseTest { public: void AssertLabel(const int *label, int ground_truth_label) override {