Fix some bugs in CPU version of BooleanMask and add GPU version
Reviewed By: akyrola
Differential Revision: D5397208
fbshipit-source-id: 0314cc181e315f3b6cda846292b2e2ea73bb015b
1 parent: c340c20
Commit: 3a0ad3f
Showing 4 changed files with 243 additions and 65 deletions.
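
At a high level, BooleanMask keeps the rows of its first input whose entry in the 1-D boolean mask (second input) is true and, when a second output is requested, also returns the indices of those rows. A minimal NumPy sketch of these semantics, mirroring the ref() reference function in the test added below (variable names here are illustrative only):

import numpy as np

data = np.arange(12, dtype=np.float32).reshape(4, 3)
mask = np.array([True, False, True, False])

masked_data = data[mask]            # rows of data where mask is True
masked_indices = np.where(mask)[0]  # positions of the selected rows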
@@ -0,0 +1,119 @@
#include "caffe2/core/context_gpu.h" | ||
#include "caffe2/operators/boolean_mask_ops.h" | ||
|
||
#include <cub/cub.cuh> | ||
|
||
namespace caffe2 { | ||
|
||
namespace { | ||
template <typename T> | ||
__global__ void BooleanMaskCopyKernel( | ||
const TIndex numOfOutput, | ||
const TIndex numBytes, | ||
const TIndex* indices, | ||
const T* src, | ||
T* dest) { | ||
for (TIndex i = blockIdx.x; i < numOfOutput; i += gridDim.x) { | ||
const auto srcBase = indices[i] * numBytes; | ||
const auto destBase = i * numBytes; | ||
for (TIndex j = threadIdx.x; j < numBytes; j += blockDim.x) { | ||
dest[destBase + j] = src[srcBase + j]; | ||
} | ||
} | ||
} | ||
} | ||
|
||
template <> | ||
class BooleanMaskOp<CUDAContext> final : public Operator<CUDAContext> { | ||
public: | ||
BooleanMaskOp(const OperatorDef& operator_def, Workspace* ws) | ||
: Operator<CUDAContext>(operator_def, ws) {} | ||
|
||
bool RunOnDevice() override { | ||
const auto& src = Input(0); | ||
const auto& mask = Input(1); | ||
auto* dest = Output(0); | ||
|
||
CAFFE_ENFORCE(src.ndim() >= 1); | ||
CAFFE_ENFORCE_EQ(mask.ndim(), 1); | ||
CAFFE_ENFORCE(src.dims()[0] == mask.dims()[0]); | ||
|
||
const auto* maskData = mask.template data<bool>(); | ||
const auto outerSize = mask.dims()[0]; | ||
indices_.Resize(outerSize); | ||
auto* indicesData = indices_.template mutable_data<TIndex>(); | ||
|
||
size_t numBytes = 0; | ||
cub::CountingInputIterator<int> itr(0); | ||
cub::DeviceSelect::Flagged( | ||
nullptr, | ||
numBytes, | ||
itr, | ||
maskData, | ||
indicesData, | ||
static_cast<TIndex*>(nullptr), | ||
outerSize, | ||
context_.cuda_stream()); | ||
|
||
auto numTIndex = | ||
static_cast<TIndex>((numBytes + sizeof(TIndex) - 1) / sizeof(TIndex)); | ||
// allocate one more TIndex at the end of scratch for storing numOfOutput | ||
scratch_.Resize(numTIndex + 1); | ||
auto* scratchData = scratch_.template mutable_data<TIndex>(); | ||
auto* numOfOutputData = scratchData + numTIndex; | ||
|
||
cub::DeviceSelect::Flagged( | ||
static_cast<void*>(scratchData), | ||
numBytes, | ||
itr, | ||
maskData, | ||
indicesData, | ||
numOfOutputData, | ||
outerSize, | ||
context_.cuda_stream()); | ||
|
||
// Copy numOfOutput from gpu to cpu | ||
TIndex numOfOutput; | ||
context_.Copy<TIndex, CUDAContext, CPUContext>( | ||
1, numOfOutputData, &numOfOutput); | ||
|
||
indices_.Resize(numOfOutput); | ||
std::vector<TIndex> dims = src.dims(); | ||
dims[0] = numOfOutput; | ||
dest->Resize(dims); | ||
auto* destData = (char*)dest->raw_mutable_data(src.meta()); | ||
const auto* srcData = (char*)src.raw_data(); | ||
if (OutputSize() == 2) { | ||
auto* indicesOut = Output(1); | ||
indicesOut->Resize(numOfOutput); | ||
indicesOut->template mutable_data<TIndex>(); | ||
} | ||
|
||
if (numOfOutput > 0) { | ||
BooleanMaskCopyKernel<<< | ||
min(numOfOutput, static_cast<TIndex>(CAFFE_MAXIMUM_NUM_BLOCKS)), | ||
CAFFE_CUDA_NUM_THREADS, | ||
0, | ||
context_.cuda_stream()>>>( | ||
numOfOutput, | ||
src.size_from_dim(1) * src.meta().itemsize(), | ||
indicesData, | ||
srcData, | ||
destData); | ||
if (OutputSize() == 2) { | ||
Output(1)->CopyFrom(indices_, &context_); | ||
} | ||
} | ||
return true; | ||
} | ||
private: | ||
Tensor<CUDAContext> indices_; | ||
Tensor<CUDAContext> scratch_; | ||
}; | ||
REGISTER_CUDA_OPERATOR(BooleanMask, BooleanMaskOp<CUDAContext>); | ||
} // caffe2 |
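
The CUDA implementation above is type-agnostic: the kernel is launched on char pointers and copies src.size_from_dim(1) * src.meta().itemsize() raw bytes per selected row, so no per-dtype instantiation is needed. A host-side NumPy sketch of that bytewise row gather, for illustration only (the helper name and the uint8 view are assumptions of this sketch, not part of the commit):

import numpy as np

def gather_rows_bytewise(src, indices):
    # Bytes per outer row, i.e. size_from_dim(1) * itemsize in the C++ code.
    row_bytes = src.itemsize * int(np.prod(src.shape[1:], dtype=np.int64))
    # Reinterpret the (contiguous) tensor as one row of raw bytes per outer index.
    flat = np.ascontiguousarray(src).view(np.uint8).reshape(src.shape[0], row_bytes)
    # Gather whole rows as bytes, then view the result back as the original dtype.
    out = flat[indices]
    return out.view(src.dtype).reshape((len(indices),) + src.shape[1:])

data = np.arange(12, dtype=np.float32).reshape(4, 3)
mask = np.array([True, False, True, False])
assert np.array_equal(gather_rows_bytewise(data, np.flatnonzero(mask)), data[mask])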
@@ -0,0 +1,21 @@
#ifndef BOOLEAN_MASK_OPS_H
#define BOOLEAN_MASK_OPS_H

#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"

namespace caffe2 {

// Keeps the entries of the first input (along its outer dimension) whose
// corresponding value in the 1-D boolean mask (second input) is true.
// RunOnDevice is specialized per device context.
template <class Context>
class BooleanMaskOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  BooleanMaskOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws) {}

  bool RunOnDevice() override;
};
} // namespace caffe2

#endif // BOOLEAN_MASK_OPS_H
@@ -0,0 +1,44 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
from hypothesis import given
import hypothesis.strategies as st
from caffe2.python import core
import caffe2.python.hypothesis_test_util as hu


class TestBooleanMaskOp(hu.HypothesisTestCase):

    @given(x=hu.tensor(min_dim=1,
                       max_dim=5,
                       elements=st.floats(min_value=0.5, max_value=1.0)),
           **hu.gcs)
    def test_boolean_mask(self, x, gc, dc):
        op = core.CreateOperator("BooleanMask",
                                 ["data", "mask"],
                                 "masked_data")
        mask = np.random.choice(a=[True, False], size=x.shape[0])

        def ref(x, mask):
            return (x[mask],)

        self.assertReferenceChecks(gc, op, [x, mask], ref)
        self.assertDeviceChecks(dc, op, [x, mask], [0])

    @given(x=hu.tensor(min_dim=1,
                       max_dim=5,
                       elements=st.floats(min_value=0.5, max_value=1.0)),
           **hu.gcs)
    def test_boolean_mask_indices(self, x, gc, dc):
        op = core.CreateOperator("BooleanMask",
                                 ["data", "mask"],
                                 ["masked_data", "masked_indices"])
        mask = np.random.choice(a=[True, False], size=x.shape[0])

        def ref(x, mask):
            return (x[mask], np.where(mask)[0])

        self.assertReferenceChecks(gc, op, [x, mask], ref)
        self.assertDeviceChecks(dc, op, [x, mask], [0])
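
Beyond the hypothesis-based checks, the operator can also be exercised by hand through the Caffe2 Python workspace. A standalone usage sketch (not part of the commit; blob names and shapes are arbitrary, and a CUDA device option can be attached to the operator to hit the new GPU path):

import numpy as np
from caffe2.python import core, workspace

data = np.random.rand(6, 3).astype(np.float32)
mask = np.array([True, False, True, True, False, False])

workspace.FeedBlob("data", data)
workspace.FeedBlob("mask", mask)
workspace.RunOperatorOnce(
    core.CreateOperator("BooleanMask",
                        ["data", "mask"],
                        ["masked_data", "masked_indices"]))

print(workspace.FetchBlob("masked_data"))     # equals data[mask]
print(workspace.FetchBlob("masked_indices"))  # equals np.where(mask)[0]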