todo: enable more things to narrow down the problem scope
Signed-off-by: YdrMaster <[email protected]>
YdrMaster committed Jan 26, 2024
1 parent a91cc98 commit ab270e4
Showing 3 changed files with 43 additions and 43 deletions.
1 change: 1 addition & 0 deletions 3rd-party/cccl
Submodule cccl added at b7d422
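(cccl is presumably NVIDIA's CUDA C++ Core Libraries, the home of Thrust, CUB, and libcu++; the pad kernel below relies on thrust::host_vector and thrust::device_vector, which would explain pulling the submodule in while narrowing the problem down.)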
53 changes: 27 additions & 26 deletions src/04kernel/src/kernels/pad/cuda_kernel.cu
@@ -8,32 +8,33 @@ namespace refactor::kernel {

    auto PadCuda::lower(Resources &) const noexcept -> RoutineWorkspace {
        thrust::host_vector<cuda::DimInfo> dims(info.dims.size());
        std::transform(info.dims.begin(), info.dims.end(),
                       dims.begin(),
                       [](auto const &d) {
                           return cuda::DimInfo{
                               d.strideI,
                               d.strideO,
                               d.padS,
                               d.dimI,
                           };
                       });
        return [dims = thrust::device_vector<cuda::DimInfo>(dims),
                params = cuda::ThreadsDistributer()(info.blockCount),
                blockSize = info.blockSize,
                value = this->valueLength](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
            auto src = reinterpret_cast<uint8_t const *>(inputs[0]);
            thrust::device_vector<uint8_t> defaultValue(blockSize, 0);
            if (value != 0) {
                auto constValue = reinterpret_cast<uint8_t const *>(inputs[2]);
                for (auto i : range0_(blockSize / value)) {
                    cudaMemcpy(defaultValue.data().get() + i * value, constValue, value, cudaMemcpyDeviceToDevice);
                }
            }
            cuda::launchPad(params, src, defaultValue.data().get(), dims.data().get(), outputs[0],
                            dims.size(),
                            blockSize);
        };
        exit(999);
        // std::transform(info.dims.begin(), info.dims.end(),
        //                dims.begin(),
        //                [](auto const &d) {
        //                    return cuda::DimInfo{
        //                        d.strideI,
        //                        d.strideO,
        //                        d.padS,
        //                        d.dimI,
        //                    };
        //                });
        // return [dims = thrust::device_vector<cuda::DimInfo>(dims),
        //         params = cuda::ThreadsDistributer()(info.blockCount),
        //         blockSize = info.blockSize,
        //         value = this->valueLength](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
        //     auto src = reinterpret_cast<uint8_t const *>(inputs[0]);
        //     thrust::device_vector<uint8_t> defaultValue(blockSize, 0);
        //     if (value != 0) {
        //         auto constValue = reinterpret_cast<uint8_t const *>(inputs[2]);
        //         for (auto i : range0_(blockSize / value)) {
        //             cudaMemcpy(defaultValue.data().get() + i * value, constValue, value, cudaMemcpyDeviceToDevice);
        //         }
        //     }
        //     cuda::launchPad(params, src, defaultValue.data().get(), dims.data().get(), outputs[0],
        //                     dims.size(),
        //                     blockSize);
        // };
    }

}// namespace refactor::kernel
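
Two details of the re-enabled body are worth noting. First, the `exit(999);` is now unreachable: the function returns the routine lambda just above it, so it reads as a leftover marker from this bisection. Second, the routine builds its padding constant by tiling the `value`-byte pattern across a `blockSize`-byte device buffer with repeated device-to-device copies. Below is a minimal standalone sketch of that fill pattern; the function and variable names are illustrative, not from the repository.

#include <cstdint>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>

// Tile a `value`-byte constant across a `blockSize`-byte device buffer,
// mirroring the cudaMemcpy loop in PadCuda::lower (sketch only).
static void fillDefaultValue(uint8_t *dst, uint8_t const *constValue,
                             size_t blockSize, size_t value) {
    for (size_t i = 0; i < blockSize / value; ++i) {
        cudaMemcpy(dst + i * value, constValue, value, cudaMemcpyDeviceToDevice);
    }
}

int main() {
    size_t const value = sizeof(float), blockSize = 8 * sizeof(float);
    float const pad = 1.2f;// the same constant the test below feeds as input 2
    thrust::device_vector<uint8_t> constValue(value), buffer(blockSize, 0);
    cudaMemcpy(constValue.data().get(), &pad, value, cudaMemcpyHostToDevice);
    fillDefaultValue(buffer.data().get(), constValue.data().get(), blockSize, value);
    return 0;
}

Each loop iteration is a separate runtime call, so when blockSize / value is large, a single fill kernel (or doubling copies of 1, 2, 4, … elements per memcpy) would cut the call count.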
32 changes: 15 additions & 17 deletions src/04kernel/test/kernels/pad/test_cuda.cpp
@@ -23,44 +23,42 @@ TEST(kernel, PadCuda) {
    auto yTensor = Tensor::share(DataType::F32, Shape{4, 5, 1, 8});
    PadType type = PadType::Constant;
    auto kCpu = PadCpu::build(PadInfo(dims, *t1Tensor), type, std::make_optional(std::reference_wrapper(*t3Tensor)));
    // auto kernel = PadCuda::build(PadInfo(dims, *t1Tensor), type, std::make_optional(std::reference_wrapper(*t3Tensor)));
    // ASSERT_TRUE(kernel && kCpu);
    auto kernel = PadCuda::build(PadInfo(dims, *t1Tensor), type, std::make_optional(std::reference_wrapper(*t3Tensor)));
    ASSERT_TRUE(kernel && kCpu);
    auto res = runtime::Resources();
    // auto routine = kernel->lower(res).routine,
    //      rCpu = kCpu->lower(res).routine;
    auto rCpu = kCpu->lower(res).routine;
    // malloc
    auto &dev = *device::init(Device::Type::Nvidia, 0, "");
    // auto gpuIn = dev.malloc(t1Tensor->bytesSize()),
    //      gpuIn2 = dev.malloc(t2Tensor->bytesSize()),
    //      gpuIn3 = dev.malloc(t3Tensor->bytesSize()),
    //      gpuOut = dev.malloc(yTensor->bytesSize());
    auto gpuIn = dev.malloc(t1Tensor->bytesSize()),
         gpuIn2 = dev.malloc(t2Tensor->bytesSize()),
         gpuIn3 = dev.malloc(t3Tensor->bytesSize()),
         gpuOut = dev.malloc(yTensor->bytesSize());
    // put input data
    std::vector<float> data(t1Tensor->elementsSize()),
        constvalue(1, 1.2f),
        cpuOut(yTensor->elementsSize());
    std::vector<int64_t> pads{1, 1, 0, 2, 1, 1, 0, 2};

    for (auto i : range0_(data.size())) { data[i] = i; }
    // gpuIn->copyFromHost(data.data(), t1Tensor->bytesSize());
    // gpuIn2->copyFromHost(pads.data(), t2Tensor->bytesSize());
    // gpuIn3->copyFromHost(constvalue.data(), t3Tensor->bytesSize());
    gpuIn->copyFromHost(data.data(), t1Tensor->bytesSize());
    gpuIn2->copyFromHost(pads.data(), t2Tensor->bytesSize());
    gpuIn3->copyFromHost(constvalue.data(), t3Tensor->bytesSize());
    // inference
    // {
    //     void const *inputs[]{*gpuIn, *gpuIn2, *gpuIn3};
    //     void *outputs[]{*gpuOut};
    //     routine(res, nullptr, inputs, outputs);
    // }
    {
        void const *inputs[]{*gpuIn, *gpuIn2, *gpuIn3};
        void *outputs[]{*gpuOut};
        // routine(res, nullptr, inputs, outputs);
    }
    {
        void const *inputs[]{data.data(), pads.data(), constvalue.data()};
        void *outputs[]{cpuOut.data()};
        rCpu(res, nullptr, inputs, outputs);
    }
    // take output data
    std::vector<float> result(yTensor->elementsSize());
    // gpuOut->copyToHost(result.data(), yTensor->bytesSize());
    gpuOut->copyToHost(result.data(), yTensor->bytesSize());
    // check
    for (auto i : range0_(cpuOut.size())) {
        // fmt::println("i = {}, cpuout = {}, gpuout = {}", i, cpuOut[i], result[i]);
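
Note that even with this commit the GPU path still never runs: `kernel->lower` remains commented out above, and the `routine(res, nullptr, inputs, outputs);` call inside the first inference block is commented too, so `gpuOut` is copied back into `result` while still holding whatever bytes `dev.malloc` returned. The tail of the check loop is truncated in this view; once the GPU routine is actually invoked, the comparison presumably reduces to an elementwise check like the following self-contained sketch (the gtest macros match the style above, but the exact assertions are my assumption):

#include <gtest/gtest.h>
#include <vector>

// Standalone sketch: compare CPU and GPU outputs elementwise.
TEST(kernel, PadCudaCompareSketch) {
    std::vector<float> cpuOut{0, 1, 2}, result{0, 1, 2};// stand-ins for the real buffers
    ASSERT_EQ(cpuOut.size(), result.size());
    for (size_t i = 0; i < cpuOut.size(); ++i) {
        EXPECT_FLOAT_EQ(cpuOut[i], result[i]);
    }
}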
