-
Notifications
You must be signed in to change notification settings - Fork 111
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#0: Float32 support for Training mode in Batch Norm
- Loading branch information
1 parent
e1a028f
commit 922ddf2
Showing
7 changed files
with
423 additions
and
44 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
228 changes: 228 additions & 0 deletions
228
...ations/normalization/batch_norm/device/kernels/compute/running_statistics_sfpu_kernel.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,228 @@ | ||
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
#include <cstdint> | ||
#include "compute_kernel_api/eltwise_binary.h" | ||
#include "compute_kernel_api/tile_move_copy.h" | ||
#include "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/moreh_common.hpp" | ||
#include "compute_kernel_api/eltwise_binary_sfpu.h" | ||
#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" | ||
#include "compute_kernel_api/eltwise_unary/eltwise_unary.h" | ||
|
||
namespace NAMESPACE { | ||
// Compute kernel entry point: updates batch-norm running statistics tile by tile. | ||
// For each of `num_tiles` tiles it evaluates | ||
//   updated_running_stat = (1 - momentum) * running_stat + momentum * batch_stat | ||
// for the mean and/or the variance, depending on two compile-time flags. | ||
// | ||
// Runtime args:      0 -> num_tiles (number of loop iterations). | ||
// Compile-time args: 0 -> equals 1 to enable the running-mean update, | ||
//                    1 -> equals 1 to enable the running-variance update. | ||
void MAIN { | ||
uint32_t num_tiles = get_arg_val<uint32_t>(0); | ||
constexpr uint32_t old_running_mean_has_value = get_compile_time_arg_val(0) == 1; | ||
constexpr uint32_t old_running_var_has_value = get_compile_time_arg_val(1) == 1; | ||
|
||
// Circular-buffer (CB) indices used by this kernel. | ||
constexpr auto cb_batch_mean = tt::CBIndex::c_0; // batch mean | ||
constexpr auto cb_batch_var = tt::CBIndex::c_1; // batch var | ||
constexpr auto cb_out0 = tt::CBIndex::c_2; // only written once, after the main loop | ||
constexpr auto cb_old_running_mean = tt::CBIndex::c_3; // old running mean tensor | ||
constexpr auto cb_old_running_var = tt::CBIndex::c_4; // old running var tensor | ||
constexpr auto cb_updated_running_mean = tt::CBIndex::c_27; // updated running mean tensor | ||
constexpr auto cb_updated_running_var = tt::CBIndex::c_28; // updated running var tensor | ||
constexpr auto cb_momentum = tt::CBIndex::c_5; // momentum | ||
constexpr auto cb_one = tt::CBIndex::c_6; // stores 1 | ||
constexpr auto cb_tmp1 = tt::CBIndex::c_21; // tmp 1: holds (1 - momentum) | ||
constexpr auto cb_tmp2 = tt::CBIndex::c_22; // tmp 2: holds momentum * batch_stat | ||
constexpr auto cb_tmp3 = tt::CBIndex::c_23; // tmp 3: holds (1 - momentum) * running_stat | ||
|
||
unary_op_init_common(cb_batch_mean, cb_out0); | ||
// Every inner `for (i < onetile)` loop below therefore runs exactly once (i == 0), | ||
// so the destination-register slots used are always 0 (i * 2) and 1 (i * 2 + 1). | ||
constexpr uint32_t onetile = 1; | ||
|
||
// updated_running_stat = (1 − momentum) × running_stat + momentum × batch_stat | ||
for (uint32_t tile_id = 0; tile_id < num_tiles; ++tile_id) { | ||
// NOTE(review): this acquire has no commit/release inside the loop body; the only | ||
// unpaired tile_regs_commit() is after the loop. Each stage below does its own | ||
// acquire/wait/commit/release sequence - confirm the outer acquire is intended. | ||
tile_regs_acquire(); | ||
// cb_one and cb_momentum are waited on every iteration but popped only once, | ||
// after the loop, so the same front tile of each is reused for all tile_ids. | ||
cb_wait_front(cb_one, 1); | ||
cb_wait_front(cb_momentum, 1); | ||
|
||
// ---- Running-mean update (compiled in only when compile-time arg 0 == 1) ---- | ||
if constexpr (old_running_mean_has_value) { | ||
// Stage 1: cb_tmp1 <- 1 - momentum | ||
// Pattern shared by all stages: reserve the output CB, init the binary op, | ||
// copy operand A into dst slot 0 and operand B into dst slot 1, apply the op, | ||
// pack dst slot 0 into the output CB and push it. | ||
cb_reserve_back(cb_tmp1, onetile); | ||
sub_binary_tile_init(); | ||
tile_regs_acquire(); | ||
tile_regs_wait(); | ||
// The second argument is the CB read by the copy_tile that follows; presumably | ||
// this reconfigures the data format from the first CB to the second - TODO confirm. | ||
copy_tile_to_dst_init_short_with_dt(cb_momentum, cb_one); | ||
for (uint32_t i = 0; i < onetile; ++i) { | ||
copy_tile(cb_one, i, i * 2); | ||
} | ||
copy_tile_to_dst_init_short_with_dt(cb_one, cb_momentum); | ||
for (uint32_t i = 0; i < onetile; ++i) { | ||
copy_tile(cb_momentum, i, i * 2 + 1); | ||
// Presumably dst[0] = dst[0] - dst[1] (slot 0 is what gets packed) - TODO confirm. | ||
sub_binary_tile(i * 2, i * 2 + 1); | ||
tile_regs_commit(); | ||
pack_tile(i * 2, cb_tmp1); | ||
} | ||
tile_regs_release(); | ||
cb_push_back(cb_tmp1, onetile); | ||
|
||
// Stage 2: cb_tmp2 <- momentum * batch mean | ||
// Consumes one tile of cb_batch_mean per tile_id (popped at the end of the stage). | ||
cb_wait_front(cb_batch_mean, onetile); | ||
cb_reserve_back(cb_tmp2, onetile); | ||
mul_binary_tile_init(); | ||
tile_regs_acquire(); | ||
tile_regs_wait(); | ||
copy_tile_to_dst_init_short_with_dt(cb_momentum, cb_batch_mean); | ||
for (uint32_t i = 0; i < onetile; ++i) { | ||
copy_tile(cb_batch_mean, i, i * 2); | ||
} | ||
copy_tile_to_dst_init_short_with_dt(cb_batch_mean, cb_momentum); | ||
for (uint32_t i = 0; i < onetile; ++i) { | ||
copy_tile(cb_momentum, i, i * 2 + 1); | ||
mul_binary_tile(i * 2, i * 2 + 1); | ||
tile_regs_commit(); | ||
pack_tile(i * 2, cb_tmp2); | ||
} | ||
tile_regs_release(); | ||
cb_push_back(cb_tmp2, onetile); | ||
cb_pop_front(cb_batch_mean, onetile); | ||
|
||
// Stage 3: cb_tmp3 <- cb_tmp1 * old running mean, i.e. (1 - momentum) * running mean | ||
// Consumes the cb_tmp1 tile produced in stage 1 and one old-running-mean tile. | ||
cb_wait_front(cb_tmp1, onetile); | ||
cb_wait_front(cb_old_running_mean, onetile); | ||
cb_reserve_back(cb_tmp3, onetile); | ||
mul_binary_tile_init(); | ||
tile_regs_acquire(); | ||
tile_regs_wait(); | ||
copy_tile_to_dst_init_short_with_dt(cb_tmp1, cb_old_running_mean); | ||
for (uint32_t i = 0; i < onetile; ++i) { | ||
copy_tile(cb_old_running_mean, i, i * 2); | ||
} | ||
copy_tile_to_dst_init_short_with_dt(cb_old_running_mean, cb_tmp1); | ||
for (uint32_t i = 0; i < onetile; ++i) { | ||
copy_tile(cb_tmp1, i, i * 2 + 1); | ||
mul_binary_tile(i * 2, i * 2 + 1); | ||
tile_regs_commit(); | ||
pack_tile(i * 2, cb_tmp3); | ||
} | ||
tile_regs_release(); | ||
cb_push_back(cb_tmp3, onetile); | ||
cb_pop_front(cb_old_running_mean, onetile); | ||
cb_pop_front(cb_tmp1, onetile); | ||
|
||
// Stage 4: cb_updated_running_mean <- cb_tmp2 + cb_tmp3 | ||
// = (momentum * batch mean) + ((1 - momentum) * running mean) | ||
cb_wait_front(cb_tmp2, onetile); | ||
cb_wait_front(cb_tmp3, onetile); | ||
|
||
cb_reserve_back(cb_updated_running_mean, onetile); | ||
|
||
add_binary_tile_init(); | ||
tile_regs_acquire(); | ||
tile_regs_wait(); | ||
copy_tile_to_dst_init_short_with_dt(cb_tmp2, cb_tmp3); | ||
for (uint32_t i = 0; i < onetile; ++i) { | ||
copy_tile(cb_tmp3, i, i * 2); | ||
} | ||
copy_tile_to_dst_init_short_with_dt(cb_tmp3, cb_tmp2); | ||
for (uint32_t i = 0; i < onetile; ++i) { | ||
copy_tile(cb_tmp2, i, i * 2 + 1); | ||
add_binary_tile(i * 2, i * 2 + 1); | ||
tile_regs_commit(); | ||
pack_tile(i * 2, cb_updated_running_mean); | ||
} | ||
tile_regs_release(); | ||
cb_push_back(cb_updated_running_mean, onetile); | ||
// Both temporaries are fully consumed here, so the variance branch can reuse them. | ||
cb_pop_front(cb_tmp3, onetile); | ||
cb_pop_front(cb_tmp2, onetile); | ||
} | ||
// ---- Running-variance update (compiled in only when compile-time arg 1 == 1) ---- | ||
// Same four stages as the mean branch, reading cb_batch_var / cb_old_running_var | ||
// and writing cb_updated_running_var, reusing cb_tmp1..cb_tmp3 as scratch. | ||
if constexpr (old_running_var_has_value) { | ||
// Stage 1: cb_tmp1 <- 1 - momentum | ||
// NOTE(review): this recomputes the same value as the mean branch, on every | ||
// iteration; it looks loop-invariant - confirm before hoisting. | ||
cb_reserve_back(cb_tmp1, onetile); | ||
sub_binary_tile_init(); | ||
tile_regs_acquire(); | ||
tile_regs_wait(); | ||
copy_tile_to_dst_init_short_with_dt(cb_momentum, cb_one); | ||
for (uint32_t i = 0; i < onetile; ++i) { | ||
copy_tile(cb_one, i, i * 2); | ||
} | ||
copy_tile_to_dst_init_short_with_dt(cb_one, cb_momentum); | ||
for (uint32_t i = 0; i < onetile; ++i) { | ||
copy_tile(cb_momentum, i, i * 2 + 1); | ||
sub_binary_tile(i * 2, i * 2 + 1); | ||
tile_regs_commit(); | ||
pack_tile(i * 2, cb_tmp1); | ||
} | ||
tile_regs_release(); | ||
cb_push_back(cb_tmp1, onetile); | ||
|
||
// Stage 2: cb_tmp2 <- momentum * batch variance | ||
// Consumes one tile of cb_batch_var per tile_id (popped at the end of the stage). | ||
cb_wait_front(cb_batch_var, onetile); | ||
cb_reserve_back(cb_tmp2, onetile); | ||
mul_binary_tile_init(); | ||
tile_regs_acquire(); | ||
tile_regs_wait(); | ||
copy_tile_to_dst_init_short_with_dt(cb_momentum, cb_batch_var); | ||
for (uint32_t i = 0; i < onetile; ++i) { | ||
copy_tile(cb_batch_var, i, i * 2); | ||
} | ||
copy_tile_to_dst_init_short_with_dt(cb_batch_var, cb_momentum); | ||
for (uint32_t i = 0; i < onetile; ++i) { | ||
copy_tile(cb_momentum, i, i * 2 + 1); | ||
mul_binary_tile(i * 2, i * 2 + 1); | ||
tile_regs_commit(); | ||
pack_tile(i * 2, cb_tmp2); | ||
} | ||
tile_regs_release(); | ||
cb_push_back(cb_tmp2, onetile); | ||
cb_pop_front(cb_batch_var, onetile); | ||
|
||
// Stage 3: cb_tmp3 <- cb_tmp1 * old running variance, i.e. (1 - momentum) * running var | ||
cb_wait_front(cb_tmp1, onetile); | ||
cb_wait_front(cb_old_running_var, onetile); | ||
cb_reserve_back(cb_tmp3, onetile); | ||
mul_binary_tile_init(); | ||
tile_regs_acquire(); | ||
tile_regs_wait(); | ||
copy_tile_to_dst_init_short_with_dt(cb_tmp1, cb_old_running_var); | ||
for (uint32_t i = 0; i < onetile; ++i) { | ||
copy_tile(cb_old_running_var, i, i * 2); | ||
} | ||
copy_tile_to_dst_init_short_with_dt(cb_old_running_var, cb_tmp1); | ||
for (uint32_t i = 0; i < onetile; ++i) { | ||
copy_tile(cb_tmp1, i, i * 2 + 1); | ||
mul_binary_tile(i * 2, i * 2 + 1); | ||
tile_regs_commit(); | ||
pack_tile(i * 2, cb_tmp3); | ||
} | ||
tile_regs_release(); | ||
cb_push_back(cb_tmp3, onetile); | ||
cb_pop_front(cb_old_running_var, onetile); | ||
cb_pop_front(cb_tmp1, onetile); | ||
|
||
// Stage 4: cb_updated_running_var <- cb_tmp2 + cb_tmp3 | ||
// = (momentum * batch var) + ((1 - momentum) * running var) | ||
cb_wait_front(cb_tmp2, onetile); | ||
cb_wait_front(cb_tmp3, onetile); | ||
|
||
cb_reserve_back(cb_updated_running_var, onetile); | ||
|
||
add_binary_tile_init(); | ||
tile_regs_acquire(); | ||
tile_regs_wait(); | ||
copy_tile_to_dst_init_short_with_dt(cb_tmp2, cb_tmp3); | ||
for (uint32_t i = 0; i < onetile; ++i) { | ||
copy_tile(cb_tmp3, i, i * 2); | ||
} | ||
copy_tile_to_dst_init_short_with_dt(cb_tmp3, cb_tmp2); | ||
for (uint32_t i = 0; i < onetile; ++i) { | ||
copy_tile(cb_tmp2, i, i * 2 + 1); | ||
add_binary_tile(i * 2, i * 2 + 1); | ||
tile_regs_commit(); | ||
pack_tile(i * 2, cb_updated_running_var); | ||
} | ||
tile_regs_release(); | ||
cb_push_back(cb_updated_running_var, onetile); | ||
cb_pop_front(cb_tmp3, onetile); | ||
cb_pop_front(cb_tmp2, onetile); | ||
} | ||
} | ||
// Epilogue, executed once after all tiles: packs dst slot 0 into cb_out0 and releases | ||
// the single momentum / one tiles that were held across the whole loop. | ||
// NOTE(review): no cb_reserve_back(cb_out0, ...) is visible before this pack/push, and | ||
// nothing above loads dst slot 0 specifically for cb_out0 - confirm what a consumer of | ||
// cb_out0 is expected to receive. | ||
tile_regs_commit(); | ||
tile_regs_wait(); | ||
pack_tile(0, cb_out0); | ||
tile_regs_release(); | ||
cb_pop_front(cb_momentum, 1); | ||
cb_pop_front(cb_one, 1); | ||
cb_push_back(cb_out0, 1); | ||
} | ||
} // namespace NAMESPACE |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.