todo: enable more things to narrow down the problem scope
Signed-off-by: YdrMaster <[email protected]>
YdrMaster committed Jan 26, 2024
1 parent a91cc98 commit ab270e4
Showing 3 changed files with 43 additions and 43 deletions.
1 change: 1 addition & 0 deletions 3rd-party/cccl
Submodule cccl added at b7d422
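(cccl is presumably NVIDIA's CUDA C++ Core Libraries, the home of Thrust, CUB, and libcu++; the pad kernel below relies on thrust::host_vector and thrust::device_vector, which would explain pulling the submodule in while narrowing the problem down.)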
53 changes: 27 additions & 26 deletions src/04kernel/src/kernels/pad/cuda_kernel.cu
@@ -8,32 +8,33 @@ namespace refactor::kernel {

    auto PadCuda::lower(Resources &) const noexcept -> RoutineWorkspace {
        thrust::host_vector<cuda::DimInfo> dims(info.dims.size());
        std::transform(info.dims.begin(), info.dims.end(),
                       dims.begin(),
                       [](auto const &d) {
                           return cuda::DimInfo{
                               d.strideI,
                               d.strideO,
                               d.padS,
                               d.dimI,
                           };
                       });
        return [dims = thrust::device_vector<cuda::DimInfo>(dims),
                params = cuda::ThreadsDistributer()(info.blockCount),
                blockSize = info.blockSize,
                value = this->valueLength](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
            auto src = reinterpret_cast<uint8_t const *>(inputs[0]);
            thrust::device_vector<uint8_t> defaultValue(blockSize, 0);
            if (value != 0) {
                auto constValue = reinterpret_cast<uint8_t const *>(inputs[2]);
                for (auto i : range0_(blockSize / value)) {
                    cudaMemcpy(defaultValue.data().get() + i * value, constValue, value, cudaMemcpyDeviceToDevice);
                }
            }
            cuda::launchPad(params, src, defaultValue.data().get(), dims.data().get(), outputs[0],
                            dims.size(),
                            blockSize);
        };
        exit(999);
        // std::transform(info.dims.begin(), info.dims.end(),
        //                dims.begin(),
        //                [](auto const &d) {
        //                    return cuda::DimInfo{
        //                        d.strideI,
        //                        d.strideO,
        //                        d.padS,
        //                        d.dimI,
        //                    };
        //                });
        // return [dims = thrust::device_vector<cuda::DimInfo>(dims),
        //         params = cuda::ThreadsDistributer()(info.blockCount),
        //         blockSize = info.blockSize,
        //         value = this->valueLength](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
        //     auto src = reinterpret_cast<uint8_t const *>(inputs[0]);
        //     thrust::device_vector<uint8_t> defaultValue(blockSize, 0);
        //     if (value != 0) {
        //         auto constValue = reinterpret_cast<uint8_t const *>(inputs[2]);
        //         for (auto i : range0_(blockSize / value)) {
        //             cudaMemcpy(defaultValue.data().get() + i * value, constValue, value, cudaMemcpyDeviceToDevice);
        //         }
        //     }
        //     cuda::launchPad(params, src, defaultValue.data().get(), dims.data().get(), outputs[0],
        //                     dims.size(),
        //                     blockSize);
        // };
    }

}// namespace refactor::kernel
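
Two details of the re-enabled body are worth noting. First, the `exit(999);` is now unreachable: the function returns the routine lambda just above it, so it reads as a leftover marker from this bisection. Second, the routine builds its padding constant by tiling the `value`-byte pattern across a `blockSize`-byte device buffer with repeated device-to-device copies. Below is a minimal standalone sketch of that fill pattern; the function and variable names are illustrative, not from the repository.

#include <cstdint>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>

// Tile a `value`-byte constant across a `blockSize`-byte device buffer,
// mirroring the cudaMemcpy loop in PadCuda::lower (sketch only).
static void fillDefaultValue(uint8_t *dst, uint8_t const *constValue,
                             size_t blockSize, size_t value) {
    for (size_t i = 0; i < blockSize / value; ++i) {
        cudaMemcpy(dst + i * value, constValue, value, cudaMemcpyDeviceToDevice);
    }
}

int main() {
    size_t const value = sizeof(float), blockSize = 8 * sizeof(float);
    float const pad = 1.2f;// the same constant the test below feeds as input 2
    thrust::device_vector<uint8_t> constValue(value), buffer(blockSize, 0);
    cudaMemcpy(constValue.data().get(), &pad, value, cudaMemcpyHostToDevice);
    fillDefaultValue(buffer.data().get(), constValue.data().get(), blockSize, value);
    return 0;
}

Each loop iteration is a separate runtime call, so when blockSize / value is large, a single fill kernel (or doubling copies of 1, 2, 4, … elements per memcpy) would cut the call count.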
32 changes: 15 additions & 17 deletions src/04kernel/test/kernels/pad/test_cuda.cpp
@@ -23,44 +23,42 @@ TEST(kernel, PadCuda) {
    auto yTensor = Tensor::share(DataType::F32, Shape{4, 5, 1, 8});
    PadType type = PadType::Constant;
    auto kCpu = PadCpu::build(PadInfo(dims, *t1Tensor), type, std::make_optional(std::reference_wrapper(*t3Tensor)));
    // auto kernel = PadCuda::build(PadInfo(dims, *t1Tensor), type, std::make_optional(std::reference_wrapper(*t3Tensor)));
    // ASSERT_TRUE(kernel && kCpu);
    auto kernel = PadCuda::build(PadInfo(dims, *t1Tensor), type, std::make_optional(std::reference_wrapper(*t3Tensor)));
    ASSERT_TRUE(kernel && kCpu);
    auto res = runtime::Resources();
    // auto routine = kernel->lower(res).routine,
    //      rCpu = kCpu->lower(res).routine;
    auto rCpu = kCpu->lower(res).routine;
    // malloc
    auto &dev = *device::init(Device::Type::Nvidia, 0, "");
    // auto gpuIn = dev.malloc(t1Tensor->bytesSize()),
    //      gpuIn2 = dev.malloc(t2Tensor->bytesSize()),
    //      gpuIn3 = dev.malloc(t3Tensor->bytesSize()),
    //      gpuOut = dev.malloc(yTensor->bytesSize());
    auto gpuIn = dev.malloc(t1Tensor->bytesSize()),
         gpuIn2 = dev.malloc(t2Tensor->bytesSize()),
         gpuIn3 = dev.malloc(t3Tensor->bytesSize()),
         gpuOut = dev.malloc(yTensor->bytesSize());
    // put input data
    std::vector<float> data(t1Tensor->elementsSize()),
        constvalue(1, 1.2f),
        cpuOut(yTensor->elementsSize());
    std::vector<int64_t> pads{1, 1, 0, 2, 1, 1, 0, 2};

    for (auto i : range0_(data.size())) { data[i] = i; }
    // gpuIn->copyFromHost(data.data(), t1Tensor->bytesSize());
    // gpuIn2->copyFromHost(pads.data(), t2Tensor->bytesSize());
    // gpuIn3->copyFromHost(constvalue.data(), t3Tensor->bytesSize());
    gpuIn->copyFromHost(data.data(), t1Tensor->bytesSize());
    gpuIn2->copyFromHost(pads.data(), t2Tensor->bytesSize());
    gpuIn3->copyFromHost(constvalue.data(), t3Tensor->bytesSize());
    // inference
    // {
    //     void const *inputs[]{*gpuIn, *gpuIn2, *gpuIn3};
    //     void *outputs[]{*gpuOut};
    //     routine(res, nullptr, inputs, outputs);
    // }
    {
        void const *inputs[]{*gpuIn, *gpuIn2, *gpuIn3};
        void *outputs[]{*gpuOut};
        // routine(res, nullptr, inputs, outputs);
    }
    {
        void const *inputs[]{data.data(), pads.data(), constvalue.data()};
        void *outputs[]{cpuOut.data()};
        rCpu(res, nullptr, inputs, outputs);
    }
    // take output data
    std::vector<float> result(yTensor->elementsSize());
    // gpuOut->copyToHost(result.data(), yTensor->bytesSize());
    gpuOut->copyToHost(result.data(), yTensor->bytesSize());
    // check
    for (auto i : range0_(cpuOut.size())) {
        // fmt::println("i = {}, cpuout = {}, gpuout = {}", i, cpuOut[i], result[i]);
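
Note that even with this commit the GPU path still never runs: `kernel->lower` remains commented out above, and the `routine(res, nullptr, inputs, outputs);` call inside the first inference block is commented too, so `gpuOut` is copied back into `result` while still holding whatever bytes `dev.malloc` returned. The tail of the check loop is truncated in this view; once the GPU routine is actually invoked, the comparison presumably reduces to an elementwise check like the following self-contained sketch (the gtest macros match the style above, but the exact assertions are my assumption):

#include <gtest/gtest.h>
#include <vector>

// Standalone sketch: compare CPU and GPU outputs elementwise.
TEST(kernel, PadCudaCompareSketch) {
    std::vector<float> cpuOut{0, 1, 2}, result{0, 1, 2};// stand-ins for the real buffers
    ASSERT_EQ(cpuOut.size(), result.size());
    for (size_t i = 0; i < cpuOut.size(); ++i) {
        EXPECT_FLOAT_EQ(cpuOut[i], result[i]);
    }
}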
