diff --git a/CMakeLists.txt b/CMakeLists.txt index b8f4379b9..afeeb4aba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,6 +116,7 @@ option(SIMENG_SANITIZE "Enable compiler sanitizers" OFF) option(SIMENG_OPTIMIZE "Enable Extra Compiler Optimizations" OFF) option(SIMENG_ENABLE_SST "Compile SimEng SST Wrapper" OFF) option(SIMENG_ENABLE_SST_TESTS "Enable testing for SST" OFF) +option(SIMENG_ENABLE_BF16 "Enable __bf16 instruction execution logic" OFF) # Set CXX flag for Apple Mac so that `binary_function` and `unary_function` types that are used in SST can be recognised. # They were deprecated in C++11 and removed in C++17, and Apple Clang v15 no longer supports these types without the following flag @@ -155,10 +156,9 @@ if(SIMENG_ENABLE_TESTS) # Print message containing if the full test suite will run if (${LLVM_PACKAGE_VERSION} VERSION_LESS "14.0") - message(STATUS "LLVM version does not support AArch64 extensions SME or SVE2. These test suites will be skipped.") - endif() - if (${LLVM_PACKAGE_VERSION} VERSION_LESS "18.0") - message(STATUS "LLVM version does not support AArch64 extensions SME2. These test suites will be skipped.") + message(STATUS "LLVM version does not support AArch64 extensions SVE2, SVE2.1, SME, or SME2. Related tests will fail.") + elseif (${LLVM_PACKAGE_VERSION} VERSION_LESS "18.0") + message(STATUS "LLVM version does not support AArch64 extensions SME2 or SVE2.1. Related test will fail.") endif() else() diff --git a/src/include/simeng/arch/aarch64/ArchInfo.hh b/src/include/simeng/arch/aarch64/ArchInfo.hh index 1403da08f..b7f274035 100644 --- a/src/include/simeng/arch/aarch64/ArchInfo.hh +++ b/src/include/simeng/arch/aarch64/ArchInfo.hh @@ -18,7 +18,8 @@ class ArchInfo : public simeng::arch::ArchInfo { aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, - aarch64_sysreg::AARCH64_SYSREG_SVCR}), + aarch64_sysreg::AARCH64_SYSREG_SVCR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR2_EL0}), zaSize_(config["Core"]["Streaming-Vector-Length"].as() / 8) { // Generate the architecture-defined architectural register structure archRegStruct_ = { diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh index d510c1f37..f37089219 100644 --- a/src/include/simeng/arch/aarch64/Instruction.hh +++ b/src/include/simeng/arch/aarch64/Instruction.hh @@ -283,6 +283,40 @@ enum class InsnType : uint32_t { isBranch = 1 << 14 }; +/** Convert Predicate-as-Counter to Predicate-as-Masks. + * T represents the element type (i.e. for pg.s, T = uint32_t). + * V represents the number of vectors the predicate-as-counter is being used + * for. */ +template +std::vector> predAsCounterToMasks( + const uint64_t predAsCounter, const uint16_t VL_bits) { + std::vector> out(V, {0, 0, 0, 0}); + + const uint16_t elemsPerVec = VL_bits / (sizeof(T) * 8); + // Get predicate-as-counter information + const bool invert = (predAsCounter & 0b1000000000000000) != 0; + const uint64_t predElemCount = + (predAsCounter & static_cast(0b0111111111111111)) >> + static_cast(std::log2f(sizeof(T)) + 1); + + for (int r = 0; r < V; r++) { + for (uint16_t i = 0; i < elemsPerVec; i++) { + // Move bit to next position based on element type + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + // If invert = True (invert bit = 1), predElemCount dictates number of + // initial inactive elements. + // If invert = False (invert bit = 0), it indicates the number of initial + // active elements. 
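      // Illustrative decoding, assuming VL_bits = 512, T = uint32_t, V = 2:
      // elemsPerVec = 16. With invert = 0 and predElemCount = 20, every
      // element of the first vector's mask is set and only elements 0-3 of
      // the second vector's mask are set; with invert = 1 the same count
      // leaves those first 20 elements inactive and activates the remainder.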
+ if (static_cast(r * elemsPerVec) + i < predElemCount) { + out[r][i / (64 / sizeof(T))] |= (invert) ? 0 : shifted_active; + } else { + out[r][i / (64 / sizeof(T))] |= (invert) ? shifted_active : 0; + } + } + } + return out; +} + /** A basic Armv9.2-a implementation of the `Instruction` interface. */ class Instruction : public simeng::Instruction { public: diff --git a/src/include/simeng/arch/aarch64/helpers/float.hh b/src/include/simeng/arch/aarch64/helpers/float.hh index 454f50070..0d198f926 100644 --- a/src/include/simeng/arch/aarch64/helpers/float.hh +++ b/src/include/simeng/arch/aarch64/helpers/float.hh @@ -194,6 +194,23 @@ D fcvtzu_integer(srcValContainer& sourceValues) { return result; } +/** Helper function for SCALAR/FP instructions with the format ucvtf rd, rn + * #fbits. + * D represents the destination register type (e.g. for Sd, D = float). + * N represents the source register type (e.g. for Xn, N = uint32_t). + * Returns single value of type D. */ +template +D ucvtf_fixedToFloat( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + // Convert Fixed-Point to FP + // Using algorithm from + // https://embeddedartistry.com/blog/2018/07/12/simple-fixed-point-conversion-in-c/ + const N xn = sourceValues[0].get(); + const N fbits = static_cast(metadata.operands[2].imm); + return (static_cast(xn) / static_cast(1ull << fbits)); +} + } // namespace aarch64 } // namespace arch } // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index cc9aa0346..2a9ac3d0f 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -951,6 +951,63 @@ RegisterValue vecUzp(srcValContainer& sourceValues, bool isUzp1) { return {out, 256}; } +/** Helper function for NEON instructions with the format `udot vd.s, vn.b, + * vm.b`. D represents the number of elements in the output vector to be updated + * (i.e. for vd.2s D = 2). Only 2 or 4 are valid. Returns correctly formatted + * RegisterValue. */ +template +RegisterValue vecUdot( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + // Check D and N are valid values + static_assert((D == 2 || D == 4) && + "D must be either 2 or 4 to align with vd.2s or vd.4s."); + + const uint32_t* vd = sourceValues[0].getAsVector(); + const uint8_t* vn = sourceValues[1].getAsVector(); + const uint8_t* vm = sourceValues[2].getAsVector(); + + uint32_t out[D] = {0}; + for (int i = 0; i < D; i++) { + out[i] = vd[i]; + for (int j = 0; j < 4; j++) { + out[i] += (static_cast(vn[(4 * i) + j]) * + static_cast(vm[(4 * i) + j])); + } + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `udot vd.s, vn.b, + * vm.4b[index]`. + * D represents the number of elements in the output vector to be updated (i.e. + * for vd.2s D = 2). Only 2 or 4 are valid. + * Returns correctly formatted RegisterValue. 
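 * Illustrative expansion (hypothetical values): with index = 0, each output
 * element becomes
 *   out[i] = vd[i] + vn[4i]*vm[0] + vn[4i+1]*vm[1] + vn[4i+2]*vm[2] + vn[4i+3]*vm[3].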
*/ +template +RegisterValue vecUdot_byElement( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + // Check D and N are valid values + static_assert((D == 2 || D == 4) && + "D must be either 2 or 4 to align with vd.2s or vd.4s."); + + const uint32_t* vd = sourceValues[0].getAsVector(); + const uint8_t* vn = sourceValues[1].getAsVector(); + const uint8_t* vm = sourceValues[2].getAsVector(); + const int index = metadata.operands[2].vector_index; + + uint32_t out[D] = {0}; + for (int i = 0; i < D; i++) { + uint32_t acc = vd[i]; + for (int j = 0; j < 4; j++) { + acc += (static_cast(vn[(4 * i) + j]) * + static_cast(vm[(4 * index) + j])); + } + out[i] = acc; + } + return {out, 256}; +} + /** Helper function for NEON instructions with the format `zip<1,2> vd.T, * vn.T, vm.T`. * T represents the type of sourceValues (e.g. for vn.d, T = uint64_t). diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 2c33ccfbe..2315021a1 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -626,6 +626,27 @@ std::enable_if_t, RegisterValue> sveFDivPredicated( return {out, 256}; } +/** Helper function for SVE instructions with the format `faddv rd, pg, zn. + * D represents the source vector element type and the destination scalar + * register type (i.e. for zn.s and sd, D = float). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFaddv_predicated(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const D* zn = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (8 * sizeof(D)); + D out[256 / sizeof(D)] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(D))) * sizeof(D)); + if (p[i / (64 / sizeof(D))] & shifted_active) { + out[0] += zn[i]; + } + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `fmad zd, pg/m, zn, * zm`. * T represents the type of sourceValues (e.g. for zn.d, T = double). @@ -1319,6 +1340,40 @@ std::array svePtrue( return out; } +/** Helper function for SVE instructions with the format `ptrue pnd. + * T represents the type of sourceValues (e.g. for pnd.d, T = uint64_t). + * Returns an array of 4 uint64_t elements. */ +template +std::array svePtrue_counter(const uint16_t VL_bits) { + // Predicate as counter is 16-bits and has the following encoding: + // - Up to first 4 bits (named LSZ) encode the element size (0b1, 0b10, + // 0b100, 0b1000 for b h s d respectively) + // - bits 0->LSZ + // - Bits LSZ -> 14 represent a uint of the number of consecutive elements + // from element 0 that are active / inactive + // - If invert bit = 0 it is number of active elements + // - If invert bit = 1 it is number of inactive elements + // - Bit 15 represents the invert bit + std::array out = {0, 0, 0, 0}; + + // Set invert bit to 1 and count to 0 so that the first 0 elements are FALSE. + // This is how the spec defines all true to be encoded. + out[0] |= 0b1000000000000000; + + // Set Element size field + if (sizeof(T) == 1) { + out[0] |= 0b1; + } else if (sizeof(T) == 2) { + out[0] |= 0b10; + } else if (sizeof(T) == 4) { + out[0] |= 0b100; + } else if (sizeof(T) == 8) { + out[0] |= 0b1000; + } + + return out; +} + /** Helper function for SVE instructions with the format `punpk pd.h, * pn.b`. * If `isHI` = false, then PUNPKLO is performed. 
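A minimal sketch (not part of the patch) of how the two new predicate-as-counter
helpers fit together, assuming a 512-bit vector length, that the headers defining
svePtrue_counter and predAsCounterToMasks are included, and that their template
parameters are ordered as in the doc comments (element type first):

  #include <cassert>
  #include <cstdint>

  void allTrueCounterRoundTrip() {
    constexpr uint16_t VL_bits = 512;  // assumed VL for the example
    // All-true encoding for .s elements: invert bit set, count of 0.
    auto png = simeng::arch::aarch64::svePtrue_counter<uint32_t>(VL_bits);
    // Expanding it for two vectors should mark every 32-bit element active,
    // i.e. every 4th bit of the first 64-bit mask word is set.
    auto masks = simeng::arch::aarch64::predAsCounterToMasks<uint32_t, 2>(
        png[0], VL_bits);
    for (int r = 0; r < 2; r++) {
      assert(masks[r][0] == 0x1111111111111111ull);
    }
  }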
@@ -1563,6 +1618,69 @@ RegisterValue sveTrn2_3vecs(srcValContainer& sourceValues, return {out, 256}; } +/** Helper function for SVE instructions with the format `udot zd, zn, zm`. + * D represents the element type of the destination register (i.e. for zd.s, + * D = uint32_t). + * N represents the element type of the source registers (i.e. for zn.b, N = + * uint8_t). + * W represents how many source elements are multiplied to form an output + * element (i.e. for 4-way, W = 4). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveUdot( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const D* zd = sourceValues[0].getAsVector(); + const N* zn = sourceValues[1].getAsVector(); + const N* zm = sourceValues[2].getAsVector(); + + D out[256 / sizeof(D)] = {0}; + for (size_t i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { + out[i] = zd[i]; + for (int j = 0; j < W; j++) { + out[i] += + (static_cast(zn[(W * i) + j]) * static_cast(zm[(W * i) + j])); + } + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `udot zd, zn, + * zm[index]`. + * D represents the element type of the destination register (i.e. for uint32_t, + * D = uint32_t). + * N represents the element type of the source registers (i.e. for uint8_t, N = + * uint8_t). + * W represents how many source elements are multiplied to form an output + * element (i.e. for 4-way, W = 4). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveUdot_indexed( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const D* zd = sourceValues[0].getAsVector(); + const N* zn = sourceValues[1].getAsVector(); + const N* zm = sourceValues[2].getAsVector(); + const int index = metadata.operands[2].vector_index; + + D out[256 / sizeof(D)] = {0}; + for (size_t i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { + D acc = zd[i]; + // Index into zm selects which D-type element within each 128-bit vector + // segment to use + int base = i - (i % (128 / (sizeof(D) * 8))); + int zmIndex = base + index; + for (int j = 0; j < W; j++) { + acc += (static_cast(zn[(W * i) + j]) * + static_cast(zm[(W * zmIndex) + j])); + } + out[i] = acc; + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `unpk>hi,lo> zd, * zn`. * D represents the type of the destination register (e.g. int32_t for diff --git a/src/include/simeng/arch/aarch64/operandContainer.hh b/src/include/simeng/arch/aarch64/operandContainer.hh index c73b8881d..996454b00 100644 --- a/src/include/simeng/arch/aarch64/operandContainer.hh +++ b/src/include/simeng/arch/aarch64/operandContainer.hh @@ -10,7 +10,7 @@ namespace arch { namespace aarch64 { /** The maximum number of source registers a non-SME instruction can have. */ -const uint8_t MAX_SOURCE_REGISTERS = 6; +const uint8_t MAX_SOURCE_REGISTERS = 7; /** The maximum number of destination registers a non-SME instruction can have. 
*/ diff --git a/src/include/simeng/version.hh.in b/src/include/simeng/version.hh.in index 5f1e8f410..f563e281f 100644 --- a/src/include/simeng/version.hh.in +++ b/src/include/simeng/version.hh.in @@ -9,5 +9,6 @@ #define SIMENG_LLVM_VERSION @SIMENG_LLVM_VERSION@ #define SIMENG_ENABLE_TESTS "${SIMENG_ENABLE_TESTS}" #define SIMENG_BUILD_DIR "${CMAKE_BINARY_DIR}" +#define SIMENG_ENABLE_BF16 ${SIMENG_ENABLE_BF16} #endif \ No newline at end of file diff --git a/src/lib/arch/aarch64/ExceptionHandler.cc b/src/lib/arch/aarch64/ExceptionHandler.cc index ae98dddb1..ff7375339 100644 --- a/src/lib/arch/aarch64/ExceptionHandler.cc +++ b/src/lib/arch/aarch64/ExceptionHandler.cc @@ -626,8 +626,7 @@ bool ExceptionHandler::init() { break; } - case 293: // rseq - { + case 293: { // rseq stateChange = {ChangeType::REPLACEMENT, {R0}, {0ull}}; break; } @@ -818,7 +817,7 @@ void ExceptionHandler::readLinkAt(span path) { for (size_t i = 0; i < bytesCopied; i += 256) { uint8_t size = std::min(bytesCopied - i, 256ul); stateChange.memoryAddresses.push_back({bufAddress + i, size}); - stateChange.memoryAddressValues.push_back(RegisterValue(bufPtr, size)); + stateChange.memoryAddressValues.push_back(RegisterValue(bufPtr + i, size)); } concludeSyscall(stateChange); diff --git a/src/lib/arch/aarch64/InstructionMetadata.cc b/src/lib/arch/aarch64/InstructionMetadata.cc index 56e438a3d..07deed41a 100644 --- a/src/lib/arch/aarch64/InstructionMetadata.cc +++ b/src/lib/arch/aarch64/InstructionMetadata.cc @@ -279,7 +279,7 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) if (isAlias) { exceptionString_ = "This instruction is an alias. The printed mnemonic and operand string " - "differ from what is expected of the Capstone opcode."; + "may differ from the underlying opcode."; } } diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index ec4f269a8..d0c792096 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -190,6 +190,18 @@ span Instruction::generateAddresses() { } break; } + case Opcode::AArch64_LD1RQ_B: { // ld1rqb {zd.b}, pg/z, [xn, xm] + uint64_t addr = + sourceValues_[1].get() + sourceValues_[2].get(); + setMemoryAddresses({addr, static_cast(16)}); + break; + } + case Opcode::AArch64_LD1RQ_B_IMM: { // ld1rqb {zd.b}, pg/z, [xn{, #imm}] + uint64_t addr = + sourceValues_[1].get() + metadata_.operands[2].mem.disp; + setMemoryAddresses({addr, static_cast(16)}); + break; + } case Opcode::AArch64_LD1RQ_D_IMM: { // ld1rqd {zd.d}, pg/z, [xn{, #imm}] uint64_t addr = sourceValues_[1].get() + metadata_.operands[2].mem.disp; @@ -292,6 +304,10 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[0].get(), 16}}); break; } + case Opcode::AArch64_LD1Onev8b_POST: { // ld1 {vt.8b}, [xn], <#imm|xm> + setMemoryAddresses({{sourceValues_[0].get(), 8}}); + break; + } case Opcode::AArch64_LD1Fourv16b: // ld1 {vt1.16b, vt2.16b, vt3.16b, // vt4.16b}, [xn] [[fallthrough]]; @@ -324,6 +340,9 @@ span Instruction::generateAddresses() { case Opcode::AArch64_LD1Twov2d_POST: // ld1 {vt1.2d, vt2.2d}, [xn], // <#imm|xm> [[fallthrough]]; + case Opcode::AArch64_LD1Twov8h_POST: // ld1 {vt1.8h, vt2.8h}, [xn], + // <#imm|xm> + [[fallthrough]]; case Opcode::AArch64_LD1Twov4s: // ld1 {vt1.4s, vt2.4s}, [xn] [[fallthrough]]; case Opcode::AArch64_LD1Twov4s_POST: { // ld1 {vt1.4s, vt2.4s}, [xn], @@ -349,6 +368,100 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + 
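The multi-vector LD1 cases added below all follow the same pattern: one VL-byte
access target per destination register, laid out contiguously from the resolved
base address. A condensed sketch of that pattern (illustrative only; the element
type memory::MemoryAccessTarget is the one used by the surrounding address code):

  // Build `count` contiguous VL-sized access targets starting at `addr`.
  std::vector<memory::MemoryAccessTarget> contiguousVectorTargets(
      uint64_t addr, uint16_t VL_bits, int count) {
    std::vector<memory::MemoryAccessTarget> targets;
    targets.reserve(count);
    const uint16_t blockSize = VL_bits / 8;  // bytes per vector register
    for (int i = 0; i < count; i++) {
      targets.push_back({addr + static_cast<uint64_t>(i) * blockSize, blockSize});
    }
    return targets;
  }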
case Opcode::AArch64_LD1B_2Z: { // ld1b {zt1.b, zt2.b}, png/z, [xn, xm] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + offset; + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1B_2Z_IMM: { // ld1b {zt1.b, zt2.b}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 8; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1B_4Z_STRIDED: { // ld1b {zt1.b, zt2.b, zt3.b, + // zt4.b}, png/z, [xn, xm] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + offset; + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1B_4Z_STRIDED_IMM: // ld1b {zt1.b, zt2.b, zt3.b, + // zt4.b}, png/z, [xn{, #imm, + // mul vl}] + [[fallthrough]]; + case Opcode::AArch64_LD1B_4Z_IMM: { // ld1b {zt1.b - zt4.b}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 8; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1B_4Z: { // ld1b {zt1.b - zt4.b}, png/z, [xn, xm] + const uint64_t base = sourceValues_[1].get(); + const int64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + offset; + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1D: { // ld1d {zt.d}, pg/z, [xn, xm, lsl #3] const uint64_t base = sourceValues_[1].get(); const uint64_t offset = sourceValues_[2].get(); @@ -357,6 +470,64 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1D_2Z_IMM: { // ld1d {zt1.d, zt2.d}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + 
static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 8); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1D_4Z: { // ld1d {zt1.d - zt4.d}, png/z, [xn, + // xm, lsl #3] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + (offset << 3); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 8); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1D_IMM: { // ld1d {zt.d}, pg/z, [xn{, #imm, // mul vl}] const uint16_t partition_num = VL_bits / 64; @@ -377,6 +548,52 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1H_IMM: { // ld1h {zt.h}, pg/z, [xn{, #imm, mul + // vl}] + const uint16_t partition_num = VL_bits / 16; + + const uint64_t base = sourceValues_[1].get(); + const int64_t offset = metadata_.operands[2].mem.disp; + const uint64_t addr = base + (offset * partition_num * 2); + + setMemoryAddresses({addr, static_cast(VL_bits / 8)}); + break; + } + case Opcode::AArch64_LD1H_2Z: { // ld1h {zt1.h, zt2.h}, png/z, [xn, xm, + // lsl #1] + const uint64_t base = sourceValues_[1].get(); + const int64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + (offset << 1); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1H_2Z_IMM: { // ld1h {zt1.h, zt2.h}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 16; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1W: { // ld1w {zt.s}, pg/z, [xn, xm, lsl #2] const uint64_t base = sourceValues_[1].get(); const uint64_t offset = sourceValues_[2].get(); @@ -397,6 +614,80 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, 
static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1W_2Z: { // ld1w {zt1.s, zt2.s}, png/z, [xn, + // xm, lsl #2] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + (offset << 2); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1W_2Z_IMM: { // ld1w {zt1.s, zt2.s}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1W_4Z: { // ld1w {zt1.s - zt4.s}, png/z, [xn, + // xm, lsl #2] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + (offset << 2); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1W_4Z_IMM: { // ld1w {zt1.s - zt4.s}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD2D: { // ld2d {zt1.d, zt2.d}, pg/z, [xn|sp, xm, // lsl #3] const uint64_t base = sourceValues_[1].get(); @@ -771,6 +1062,10 @@ span Instruction::generateAddresses() { setMemoryAddresses({{base, 4}, {base + 4, 4}}); break; } + case Opcode::AArch64_LDRSBWpost: { // ldrsb wt, [xn], #imm + setMemoryAddresses({{sourceValues_[0].get(), 1}}); + break; + } case Opcode::AArch64_LDRSBWroX: { // ldrsb wt, [xn, xm{, extend // {#amount}}] uint64_t offset = extendOffset(sourceValues_[1].get(), @@ -1031,6 +1326,74 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1D_2Z: { // st1d {zt1.d, zt2.d}, png, [xn, xm, lsl + // #3] + const uint64_t pn = sourceValues_[2].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[3].get(); + const uint64_t offset = sourceValues_[4].get(); + const uint64_t addr = base + (offset << 3); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 8, 8, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), 
+ partition_num, 8, 8, + preds[1].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1D_2Z_IMM: { // st1d {zt1.d, zt2.d}, png, [xn{, + // #imm, mul vl}] + const uint64_t pn = sourceValues_[2].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[3].get(); + const int64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 8); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 8, 8, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 8, 8, + preds[1].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1D_4Z_IMM: { // st1d {zt1.d - zt4.d}, png, [xn{, + // #imm, mul vl}] + const uint64_t pn = sourceValues_[4].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[5].get(); + const int64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 8); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 8, 8, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 8, 8, + preds[1].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + 2 * (VL_bits / 8), + partition_num, 8, 8, + preds[2].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + 3 * (VL_bits / 8), + partition_num, 8, 8, + preds[3].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_ST2D_IMM: { // st2d {zt1.d, zt2.d}, pg, [{, // #imm, mul vl}] const uint64_t* p = sourceValues_[2].getAsVector(); @@ -1045,8 +1408,84 @@ span Instruction::generateAddresses() { uint64_t addr = base + (offset * partition_num * 8); - generatePredicatedContiguousAddressBlocks(addr, partition_num, 16, 8, p, - addresses); + // As vectors are stored in an interleaved manner (i.e. zt1[0], zt2[0], + // zt1[1], zt2[1], ...) we must generate an address for each element (if + // the predicate is true for that element). This is because, if the + // predicate indicates that all elements are active, a single address + // and MemoryAccessTarget will be generated with a size of 2xVL. This + // could lead to issues for core models which have a maximum store + // bandwidth of 1xVL. + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (p[i / 8] & shifted_active) { + addresses.push_back({addr + (2 * i * 8), 8}); + addresses.push_back({addr + (2 * i * 8) + 8, 8}); + } + } + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST4W: { // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, + // pg, [, xm, lsl #2] + const uint64_t* p = sourceValues_[4].getAsVector(); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[5].get(); + const int64_t offset = sourceValues_[6].get(); + + std::vector addresses; + addresses.reserve(partition_num * 4); + + uint64_t addr = base + (offset << 2); + + // As vectors are stored in an interleaved manner (i.e. zt1[0], zt2[0], + // zt3[0], zt4[0], zt1[1], zt2[1], zt3[1], zt4[1] ...) we must generate + // an address for each element (if the predicate is true for that + // element). 
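        // (Illustrative layout, assuming a 512-bit VL: with all sixteen .s
        // elements active this yields 64 four-byte targets covering
        // addr, addr+4, ..., addr+252.)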
This is because, if the predicate indicates that all + // elements are active, a single address and MemoryAccessTarget will be + // generated with a size of 4xVL. This could lead to issues for core + // models which have a maximum store bandwidth of 1xVL. + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (p[i / 16] & shifted_active) { + addresses.push_back({addr + (4 * i * 4), 4}); + addresses.push_back({addr + (4 * i * 4) + 4, 4}); + addresses.push_back({addr + (4 * i * 4) + 8, 4}); + addresses.push_back({addr + (4 * i * 4) + 12, 4}); + } + } + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST4W_IMM: { // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, + // pg, [{, #imm, mul vl}] + const uint64_t* p = sourceValues_[4].getAsVector(); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[5].get(); + const int64_t offset = + static_cast(metadata_.operands[5].mem.disp); + + std::vector addresses; + addresses.reserve(partition_num * 4); + uint64_t addr = base + (offset * partition_num * 4); + + // As vectors are stored in an interleaved manner (i.e. zt1[0], zt2[0], + // zt3[0], zt4[0], zt1[1], zt2[1], zt3[1], zt4[1] ...) we must generate + // an address for each element (if the predicate is true for that + // element). This is because, if the predicate indicates that all + // elements are active, a single address and MemoryAccessTarget will be + // generated with a size of 4xVL. This could lead to issues for core + // models which have a maximum store bandwidth of 1xVL. + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (p[i / 16] & shifted_active) { + addresses.push_back({addr + (4 * i * 4), 4}); + addresses.push_back({addr + (4 * i * 4) + 4, 4}); + addresses.push_back({addr + (4 * i * 4) + 8, 4}); + addresses.push_back({addr + (4 * i * 4) + 12, 4}); + } + } setMemoryAddresses(std::move(addresses)); break; } @@ -1213,6 +1652,74 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1W_2Z: { // st1w {zt1.s, zt2.s}, png, [xn, xm, lsl + // #2] + const uint64_t pn = sourceValues_[2].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[3].get(); + const uint64_t offset = sourceValues_[4].get(); + const uint64_t addr = base + (offset << 2); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 4, 4, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 4, 4, + preds[1].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1W_2Z_IMM: { // st1w {zt1.s, zt2.s}, png, [xn{, + // #imm, mul vl}] + const uint64_t pn = sourceValues_[2].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[3].get(); + const int64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 4, 4, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 4, 4, + preds[1].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1W_4Z_IMM: { // 
st1w {zt1.s - zt4.s}, png, [xn{, + // #imm, mul vl}] + const uint64_t pn = sourceValues_[4].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[5].get(); + const int64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 4, 4, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 4, 4, + preds[1].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + 2 * (VL_bits / 8), + partition_num, 4, 4, + preds[2].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + 3 * (VL_bits / 8), + partition_num, 4, 4, + preds[3].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_SST1W_D_IMM: { // st1w {zt.d}, pg, [zn.d{, #imm}] const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; @@ -1442,6 +1949,11 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1Onev4s_POST: { // st1 {vt.4s}, [xn|sp], <#imm|xm> + const uint64_t base = sourceValues_[1].get(); + setMemoryAddresses({base, 16}); + break; + } case Opcode::AArch64_ST1Twov16b: // st1 {vt.16b, vt2.16b}, [xn] [[fallthrough]]; case Opcode::AArch64_ST1Twov16b_POST: // st1 {vt.16b, vt2.16b}, [xn], diff --git a/src/lib/arch/aarch64/Instruction_decode.cc b/src/lib/arch/aarch64/Instruction_decode.cc index 3535ce590..5e9987258 100644 --- a/src/lib/arch/aarch64/Instruction_decode.cc +++ b/src/lib/arch/aarch64/Instruction_decode.cc @@ -532,8 +532,8 @@ void Instruction::decode() { if (isInstruction(InsnType::isStoreData)) { // Identify store instruction group - if (AARCH64_REG_Z0 <= metadata_.operands[0].reg && - metadata_.operands[0].reg <= AARCH64_REG_Z31) { + if ((AARCH64_REG_Z0 <= metadata_.operands[0].reg && + metadata_.operands[0].reg <= AARCH64_REG_Z31)) { setInstructionType(InsnType::isSVEData); } else if ((metadata_.operands[0].reg <= AARCH64_REG_S31 && metadata_.operands[0].reg >= AARCH64_REG_Q0) || @@ -639,8 +639,8 @@ void Instruction::decode() { } } } else { - // For SME instructions, resize the following structures to have the - // exact amount of space required + // For SME instructions (not using ZT0), resize the following structures to + // have the exact amount of space required sourceRegisters_.resize(sourceRegisterCount_); destinationRegisters_.resize(destinationRegisterCount_); sourceValues_.resize(sourceRegisterCount_); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 8f4bc3814..f2d575673 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -470,6 +470,40 @@ void Instruction::execute() { results_[0] = vecAdd_3ops(sourceValues_); break; } + case Opcode::AArch64_ADD_VG2_M2Z_S: { // add za.s[wv, off, vgx2], {zn1.s, + // zn2.s} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + + // Get ZA stride between halves and index into each ZA half + const uint16_t zaStride = zaRowCount / 2; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + 
metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 2 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + for (int r = 0; r < 2; r++) { + const uint32_t* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const uint32_t* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + uint32_t out[64] = {0}; + for (int i = 0; i < elemCount; i++) { + out[i] = zaRow[i] + znr[i]; + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_ADR: { // adr xd, #imm results_[0] = instructionAddress_ + metadata_.operands[1].imm; break; @@ -628,6 +662,66 @@ void Instruction::execute() { branchAddress_ = instructionAddress_ + metadata_.operands[0].imm; break; } +#if SIMENG_ENABLE_BF16 == 1 + case Opcode::AArch64_BF16DOTlanev8bf16: { // bfdot vd.4s, vn.8h, + // vm.2h[index] + // BF16 -- EXPERIMENTAL + // Must be enabled at SimEng compile time + // Not verified to be working for all compilers or OSs. + // No Tests written + + const float* vd = sourceValues_[0].getAsVector(); + const __bf16* vn = sourceValues_[1].getAsVector<__bf16>(); + const __bf16* vm = sourceValues_[2].getAsVector<__bf16>(); + const int vmIndex = metadata_.operands[2].vector_index; + + float out[4] = {vd[0], vd[1], vd[2], vd[3]}; + for (int i = 0; i < 4; i++) { + out[i] += (static_cast(vn[2 * i]) * + static_cast(vm[2 * vmIndex])) + + (static_cast(vn[2 * i + 1]) * + static_cast(vm[2 * vmIndex + 1])); + } + results_[0] = RegisterValue(out, 256); + break; + } + case Opcode::AArch64_BFDOT_ZZI: { // bfdot zd.s, zn.h, zm.h[index] + // BF16 -- EXPERIMENTAL + // Must be enabled at SimEng compile time + // Not verified to be working for all compilers or OSs. + // No Tests written + + const uint16_t partition_num = VL_bits / 16; + + const float* zd = sourceValues_[0].getAsVector(); + // Extract data as uint16_t so that bytes-per-element is correct + const uint16_t* zn = sourceValues_[1].getAsVector(); + const uint16_t* zm = sourceValues_[2].getAsVector(); + const int index = metadata_.operands[2].vector_index; + + float out[64] = {0.0f}; + for (int i = 0; i < partition_num; i++) { + // MOD 4 as 4 32-bit elements in each 128-bit segment + const int zmBase = i - (i % 4); + const int zmIndex = zmBase + index; + + float zn1, zn2, zm1, zm2; + // Horrible hack in order to convert bf16 (currently stored in a + // uint16_t) into a float. + // Each bf16 is copied into the most significant 16-bits of each + // float variable; given IEEE FP32 and BF16 have the same width + // exponent and one sign bit. 
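            // (Note: copying into the upper half of each float relies on a
            // little-endian host; under that assumption the bf16 pattern
            // 0x3F80, for instance, becomes the float 1.0f.)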
+ memcpy((uint16_t*)&zn1 + 1, &zn[2 * i], 2); + memcpy((uint16_t*)&zn2 + 1, &zn[2 * i + 1], 2); + memcpy((uint16_t*)&zm1 + 1, &zm[2 * zmIndex], 2); + memcpy((uint16_t*)&zm2 + 1, &zm[2 * zmIndex + 1], 2); + + out[i] = zd[i] + ((zn1 * zm1) + (zn2 * zm2)); + } + results_[0] = RegisterValue(out, 256); + break; + } +#endif case Opcode::AArch64_BFMWri: { // bfm wd, wn, #immr, #imms results_[0] = { bfm_2imms(sourceValues_, metadata_, false, false), 8}; @@ -1757,6 +1851,80 @@ void Instruction::execute() { results_[0] = {add_3ops(sourceValues_), 256}; break; } + case Opcode::AArch64_FADD_VG2_M2Z_D: { // fadd za.d[wv, #off, vgx2], + // {zn1.d, zn2.d} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 64; + // Get ZA stride between halves and index into each ZA half + const uint16_t zaStride = zaRowCount / 2; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 2 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // For each source vector and ZA Row pair + for (int r = 0; r < 2; r++) { + // Get row in correct ZA half + const double* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + // Get current source vector + const double* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + double out[32] = {0.0}; + // Loop over all elements and destructively add + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e] + znr[e]; + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_FADD_VG2_M2Z_S: { // fadd za.s[wv, #off, vgx2], + // {zn1.s, zn2.s} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between halves and index into each ZA half + const uint16_t zaStride = zaRowCount / 2; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 2 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // For each source vector and ZA Row pair + for (int r = 0; r < 2; r++) { + // Get row in correct ZA half + const float* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + // Get current source vector + const float* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + float out[64] = {0.0f}; + // Loop over all elements and destructively add + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e] + znr[e]; + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_FADD_ZPmI_D: { // fadd zdn.d, pg/m, zdn.d, const results_[0] = sveAddPredicated_const(sourceValues_, metadata_, VL_bits); @@ -1795,6 +1963,16 @@ void Instruction::execute() { results_[0] = vecAdd_3ops(sourceValues_); break; } + case Opcode::AArch64_FADDV_VPZ_D: { // faddv dd, p0, zn.d + + results_[0] = sveFaddv_predicated(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_FADDV_VPZ_S: { // faddv sd, p0, zn.s + + results_[0] = sveFaddv_predicated(sourceValues_, VL_bits); + break; + } case 
Opcode::AArch64_FCADD_ZPmZ_D: { // fcadd zdn.d, pg/m, zdn.d, zm.d, // #imm results_[0] = @@ -2221,6 +2399,196 @@ void Instruction::execute() { [](double x, double y) -> double { return std::fmin(x, y); }); break; } + case Opcode::AArch64_FMLA_VG4_M4Z4Z_D: { // fmla za.d[wv, offs, vgx4], + // {zn1.d - zn4.d}, {zm1.d - + // zm4.d} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 64; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Get sourceValues_ index of first zn and zm regs + const uint16_t n = zaRowCount + 1; + const uint16_t m = zaRowCount + 5; + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. + const double* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const double* zn = sourceValues_[n + r].getAsVector(); + const double* zm = sourceValues_[m + r].getAsVector(); + double out[32] = {0.0}; + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e] + (zn[e] * zm[e]); + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_FMLA_VG4_M4Z4Z_S: { // fmla za.s[wv, offs, vgx4], + // {zn1.s - zn4.s}, {zm1.s - + // zm4.s} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Get sourceValues_ index of first zn and zm regs + const uint16_t n = zaRowCount + 1; + const uint16_t m = zaRowCount + 5; + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. 
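        // (For example, assuming VL_bits = 512: zaRowCount = 64 and
        // zaStride = 16, so Wv + off = 3 updates ZA rows 3, 19, 35 and 51.)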
+ const float* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const float* zn = sourceValues_[n + r].getAsVector(); + const float* zm = sourceValues_[m + r].getAsVector(); + float out[64] = {0.0f}; + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e] + (zn[e] * zm[e]); + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_FMLA_VG4_M4ZZI_D: { // fmla za.d[wv, offs, vgx4], + // {zn1.d - zn4.d}, zm.d[index] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 64; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + // Get zm vector and zm's index + const double* zm = sourceValues_[zaRowCount + 5].getAsVector(); + const int zmIndex = metadata_.operands[5].vector_index; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. + const double* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const double* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + double out[32] = {0.0}; + // Loop over all elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + // This instruction multiplies each element of the current `znr` by + // an indexed element of `zm` and destructively adds the result to + // the corresponding element in the current `zaRow`. + // + // The index for `zm` specifies which element in each 128-bit + // segment to use. The 128-bit segment of `zm` currently in use + // corresponds to the 128-bit segment that the current element of + // `znr` and `zaRow` is within. 
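            // (For instance, with .d elements and index = 1: element e = 5
            // sits in the segment starting at 5 - (5 % 2) = 4, so zm[5] is
            // the multiplier for znr[5].)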
+ + // MOD 2 as there are 2 64-bit elements per 128-bit segment of `zm` + const int zmSegBase = e - (e % 2); + out[e] = zaRow[e] + (znr[e] * zm[zmSegBase + zmIndex]); + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_FMLA_VG4_M4ZZI_S: { // fmla za.s[wv, offs, vgx4], + // {zn1.s - zn4.s}, zm.s[index] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + // Get zm vector and zm's index + const float* zm = sourceValues_[zaRowCount + 5].getAsVector(); + const int zmIndex = metadata_.operands[5].vector_index; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. + const float* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const float* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + float out[64] = {0.0f}; + // Loop over all elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + // This instruction multiplies each element of the current `znr` by + // an indexed element of `zm` and destructively adds the result to + // the corresponding element in the current `zaRow`. + // + // The index for `zm` specifies which element in each 128-bit + // segment to use. The 128-bit segment of `zm` currently in use + // corresponds to the 128-bit segment that the current element of + // `znr` and `zaRow` is within. + + // MOD 4 as there are 4 32-bit elements per 128-bit segment of `zm` + const int zmSegBase = e - (e % 4); + out[e] = zaRow[e] + (znr[e] * zm[zmSegBase + zmIndex]); + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_FMLA_ZPmZZ_D: { // fmla zd.d, pg/m, zn.d, zm.d results_[0] = sveMlaPredicated_vecs(sourceValues_, VL_bits); break; @@ -2291,6 +2659,63 @@ void Instruction::execute() { results_[0] = vecFmlsIndexed_3vecs(sourceValues_, metadata_); break; } +#if SIMENG_ENABLE_BF16 == 1 + case Opcode::AArch64_BFMOPA_MPPZZ: { // bfmopa zada.s, pn/m, pm/m, zn.h, + // zm.h + // SME + // BF16 -- EXPERIMENTAL + // Must be enabled at SimEng compile time + // Not verified to be working for all compilers or OSs. 
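      // Example configure step to opt in (assumed typical CMake usage):
      //   cmake -B build -DSIMENG_ENABLE_BF16=ON <other SimEng options...>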
+ // No Tests written + + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + // Use uint16_t to get 2-byte elements + const uint16_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + const uint16_t* zm = + sourceValues_[rowCount + 3].getAsVector(); + + // zn is row, zm is col + for (int row = 0; row < rowCount; row++) { + float outRow[64] = {0.0f}; + // Shifted active is for bf16 elements + uint64_t shifted_active_row = 1ull << ((row % 32) * 2); + const float* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < rowCount; col++) { + outRow[col] = zadaRow[col]; + // Shifted active is for bf16 elements + uint64_t shifted_active_col = 1ull << ((col % 32) * 2); + bool pred_row1 = pn[(2 * row) / 32] & shifted_active_row; + bool pred_row2 = pn[(2 * row + 1) / 32] & shifted_active_row; + bool pred_col1 = pm[(2 * col) / 32] & shifted_active_col; + bool pred_col2 = pm[(2 * col + 1) / 32] & shifted_active_col; + if ((pred_row1 && pred_col1) || (pred_row2 && pred_col2)) { + float zn1, zn2, zm1, zm2; + // Horrible hack in order to convert bf16 (currently stored in a + // uint16_t) into a float. + // Each bf16 is copied into the most significant 16-bits of each + // float variable; given IEEE FP32 and BF16 have the same width + // exponent and one sign bit. + memcpy((uint16_t*)&zn1 + 1, &zn[2 * row], 2); + memcpy((uint16_t*)&zn2 + 1, &zn[2 * row + 1], 2); + memcpy((uint16_t*)&zm1 + 1, &zm[2 * col], 2); + memcpy((uint16_t*)&zm2 + 1, &zm[2 * col + 1], 2); + outRow[col] += (pred_row1 && pred_col1) ? zn1 * zm1 : 0.0f; + outRow[col] += (pred_row2 && pred_col2) ? 
zn2 * zm2 : 0.0f; + } + } + results_[row] = {outRow, 256}; + } + break; + } +#endif case Opcode::AArch64_FMOPA_MPPZZ_D: { // fmopa zada.d, pn/m, pm/m, zn.d, // zm.d // SME @@ -3657,7 +4082,7 @@ void Instruction::execute() { } break; } - case Opcode::AArch64_LD1B: { // ld1b {zt.b}, pg/z, [xn, xm] + case Opcode::AArch64_LD1B: { // ld1b {zt.b}, pg/z, [xn, xm] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); @@ -3695,6 +4120,69 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1B_2Z: // ld1b {zt1.b, zt2.b}, png/z, [xn, xm] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1B_2Z_IMM: { // ld1b {zt1.b, zt2.b}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint8_t out[2][256] = {{0}, {0}}; + const uint16_t partition_num = VL_bits / 8; + + for (int r = 0; r < 2; r++) { + const uint8_t* data = memoryData_[r].getAsVector(); + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << (i % 64); + if (preds[r][i / 64] & shifted_active) { + out[r][i] = data[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + break; + } + case Opcode::AArch64_LD1B_4Z_STRIDED: // ld1b {zt1.b, zt2.b, zt3.b, + // zt4.b}, png/z, [xn, xm] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1B_4Z_STRIDED_IMM: // ld1b {zt1.b, zt2.b, zt3.b, + // zt4.b}, png/z, [xn{, #imm, + // mul vl}] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1B_4Z: // ld1b {zt1.b - zt4.b}, png/z, [xn, xm] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1B_4Z_IMM: { // ld1b {zt1.b - zt4.b}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint8_t out[4][256] = {{0}, {0}, {0}, {0}}; + const uint16_t partition_num = VL_bits / 8; + + for (int r = 0; r < 4; r++) { + const uint8_t* data = memoryData_[r].getAsVector(); + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << (i % 64); + if (preds[r][i / 64] & shifted_active) { + out[r][i] = data[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + results_[2] = {out[2], 256}; + results_[3] = {out[3], 256}; + break; + } case Opcode::AArch64_LD1D: { // ld1d {zt.d}, pg/z, [xn, xm, lsl #3] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); @@ -3714,6 +4202,58 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1D_2Z_IMM: { // ld1d {zt1.d, zt2.d}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint64_t out[2][32] = {{0}, {0}}; + const uint16_t partition_num = VL_bits / 64; + + for (int r = 0; r < 2; r++) { + const uint64_t* data = memoryData_[r].getAsVector(); + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (preds[r][i / 8] & shifted_active) { + out[r][i] = data[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + break; + } + case Opcode::AArch64_LD1D_4Z: // ld1d {zt1.d - zt4.d}, png/z, [xn, + // xm, lsl #3] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint64_t out[4][32] = {{0}, {0}, {0}, {0}}; + const uint16_t partition_num = VL_bits / 64; + + 
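      // Mask layout reminder: for .d elements each 64-bit mask word covers 8
      // elements, so element i is active when bit (i % 8) * 8 of
      // preds[r][i / 8] is set (with VL_bits = 512 only preds[r][0] is used).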
for (int r = 0; r < 4; r++) { + const uint64_t* data = memoryData_[r].getAsVector(); + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (preds[r][i / 8] & shifted_active) { + out[r][i] = data[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + results_[2] = {out[2], 256}; + results_[3] = {out[3], 256}; + break; + } case Opcode::AArch64_LD1D_IMM: { // ld1d {zt.d}, pg/z, [xn{, #imm, // mul vl}] // LOAD @@ -3734,6 +4274,10 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1H_IMM: // ld1h {zt.h}, pg/z, [xn{, #imm, mul + // vl}] + // LOAD + [[fallthrough]]; case Opcode::AArch64_LD1H: { // ld1h {zt.h}, pg/z, [xn, xm, lsl #1] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); @@ -3753,6 +4297,33 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1H_2Z: // ld1h {zt1.h, zt2.h}, png/z, [xn, xm, + // lsl #1] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1H_2Z_IMM: { // ld1h {zt1.h, zt2.h}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint16_t out[2][128] = {{0}, {0}}; + const uint16_t partition_num = VL_bits / 16; + + for (int r = 0; r < 2; r++) { + const uint16_t* data = memoryData_[r].getAsVector(); + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 32) * 2); + if (preds[r][i / 32] & shifted_active) { + out[r][i] = data[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + break; + } case Opcode::AArch64_LD1Onev16b: { // ld1 {vt.16b} [xn] results_[0] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); break; @@ -3767,6 +4338,16 @@ void Instruction::execute() { results_[1] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); break; } + case Opcode::AArch64_LD1Onev8b_POST: { // ld1 {vt.8b}, [xn], <#imm|xm> + // if #imm post-index, value can only be 8 + const uint64_t postIndex = + (metadata_.operands[2].type == AARCH64_OP_REG) + ? 
sourceValues_[1].get() + : 8; + results_[0] = sourceValues_[0].get() + postIndex; + results_[1] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); + break; + } case Opcode::AArch64_LD1RD_IMM: { // ld1rd {zt.d}, pg/z, [xn, #imm] // LOAD const uint16_t partition_num = VL_bits / 64; @@ -3794,6 +4375,30 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1RQ_B: // ld1rqb {zd.b}, pg/z, [xn, xm] + case Opcode::AArch64_LD1RQ_B_IMM: { // ld1rqb {zd.b}, pg/z, [xn{, #imm }] + // LOAD + const uint64_t* p = sourceValues_[0].getAsVector(); + const uint16_t partition_num = VL_bits / 8; + uint8_t out[256] = {0}; + const uint8_t* data = memoryData_[0].getAsVector(); + + // Get mini-vector (quadword) + uint8_t mini[16] = {0}; + for (int i = 0; i < 16; i++) { + uint64_t shifted_active = 1ull << (i % 64); + if (p[i / 64] & shifted_active) { + mini[i] = data[i]; + } + } + + // Duplicate mini-vector into output vector + for (int i = 0; i < partition_num; i++) { + out[i] = mini[i % 16]; + } + results_[0] = {out, 256}; + break; + } case Opcode::AArch64_LD1RQ_D_IMM: { // ld1rqd {zd.d}, pg/z, [xn{, #imm}] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); @@ -4078,6 +4683,9 @@ void Instruction::execute() { case Opcode::AArch64_LD1Twov2d_POST: // ld1 {vt1.2d, vt2.2d}, [xn], // <#imm|xm> [[fallthrough]]; + case Opcode::AArch64_LD1Twov8h_POST: // ld1 {vt1.8h, vt2.8h}, [xn], + // <#imm|xm> + [[fallthrough]]; case Opcode::AArch64_LD1Twov4s_POST: { // ld1 {vt1.4s, vt2.4s}, [xn], // <#imm|xm> // LOAD @@ -4130,6 +4738,62 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1W_2Z: // ld1w {zt1.s, zt2.s}, png/z, [xn, xm, + // lsl #2] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1W_2Z_IMM: { // ld1w {zt1.s, zt2.s}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint32_t out[2][64] = {{0}, {0}}; + const uint16_t partition_num = VL_bits / 32; + + for (int r = 0; r < 2; r++) { + const uint32_t* data = memoryData_[r].getAsVector(); + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (preds[r][i / 16] & shifted_active) { + out[r][i] = data[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + break; + } + case Opcode::AArch64_LD1W_4Z: // ld1w {zt1.s - zt4.s}, png/z, [xn, + // xm, lsl #2] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1W_4Z_IMM: { // ld1w {zt1.s - zt4.s}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint32_t out[4][64] = {{0}, {0}, {0}, {0}}; + const uint16_t partition_num = VL_bits / 32; + + for (int r = 0; r < 4; r++) { + const uint32_t* data = memoryData_[r].getAsVector(); + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (preds[r][i / 16] & shifted_active) { + out[r][i] = data[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + results_[2] = {out[2], 256}; + results_[3] = {out[3], 256}; + break; + } case Opcode::AArch64_LD1i32: { // ld1 {vt.s}[index], [xn] // LOAD const int index = metadata_.operands[0].vector_index; @@ -4494,6 +5158,15 @@ void Instruction::execute() { results_[0] = memoryData_[0].zeroExtend(16, 256); break; } + case Opcode::AArch64_LDRSBWpost: { // ldrsb wt, [xn], #imm + // LOAD + results_[1] = + 
RegisterValue(static_cast(memoryData_[0].get()), 4) + .zeroExtend(4, 8); + results_[0] = RegisterValue( + sourceValues_[0].get() + metadata_.operands[2].imm, 8); + break; + } case Opcode::AArch64_LDRSBWroX: { // ldrsb wt, [xn, xm{, extend // {#amount}}] // LOAD @@ -4768,6 +5441,65 @@ void Instruction::execute() { results_[0] = sveMlaPredicated_vecs(sourceValues_, VL_bits); break; } + case Opcode::AArch64_MOVA_4ZMXI_H_B: { // mova {zd1.b - zd4.b}, + // za0h.b[ws, offs1:offs4] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t sliceCount = VL_bits / 8; + + const uint32_t ws = sourceValues_[sliceCount].get(); + const uint8_t offs1 = + metadata_.operands[4].sme.slice_offset.imm_range.first; + const uint8_t offs4 = + metadata_.operands[4].sme.slice_offset.imm_range.offset; + + for (uint8_t i = offs1; i <= offs4; i++) { + const uint8_t index = i - offs1; + results_[index] = sourceValues_[(ws + i) % sliceCount]; + } + break; + } + case Opcode::AArch64_MOVA_VG2_2ZMXI: { // mova {zd1.d, zd2.d}, za.d[wv, + // offs, vgx2] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + // Get ZA stride between halves and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 2; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[2].sme.slice_offset.imm) % + zaStride; + + results_[0] = sourceValues_[zaIndex]; + results_[1] = sourceValues_[zaStride + zaIndex]; + break; + } + case Opcode::AArch64_MOVA_VG4_4ZMXI: { // mova {zd1.d - zd4.d}, za.d[wv, + // offs, vgx4] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[4].sme.slice_offset.imm) % + zaStride; + + results_[0] = sourceValues_[zaIndex]; + results_[1] = sourceValues_[zaStride + zaIndex]; + results_[2] = sourceValues_[(2 * zaStride) + zaIndex]; + results_[3] = sourceValues_[(3 * zaStride) + zaIndex]; + break; + } case Opcode::AArch64_MOVID: { // movi dd, #imm results_[0] = {static_cast(metadata_.operands[1].imm), 256}; break; @@ -5004,6 +5736,14 @@ void Instruction::execute() { [](uint8_t x, uint8_t y) -> uint8_t { return x | y; }); break; } + case Opcode::AArch64_AUTIASP: // autiasp + [[fallthrough]]; + case Opcode::AArch64_PACIASP: { // paciasp + const uint64_t x30 = sourceValues_[0].get(); + // Mimic execution by writing leaving x30 unmodified + results_[0] = {x30, 8}; + break; + } case Opcode::AArch64_PFALSE: { // pfalse pd.b uint64_t out[4] = {0, 0, 0, 0}; results_[0] = out; @@ -5053,6 +5793,22 @@ void Instruction::execute() { results_[0] = svePtrue(metadata_, VL_bits); break; } + case Opcode::AArch64_PTRUE_C_B: { // ptrue pnd.b + results_[0] = svePtrue_counter(VL_bits); + break; + } + case Opcode::AArch64_PTRUE_C_D: { // ptrue pnd.d + results_[0] = svePtrue_counter(VL_bits); + break; + } + case Opcode::AArch64_PTRUE_C_H: { // ptrue pnd.h + results_[0] = svePtrue_counter(VL_bits); + break; + } + case Opcode::AArch64_PTRUE_C_S: { // ptrue pnd.s + results_[0] = svePtrue_counter(VL_bits); + break; + } case Opcode::AArch64_PUNPKHI_PP: { 
// punpkhi pd.h, pn.b results_[0] = svePunpk(sourceValues_, VL_bits, true); break; @@ -5069,9 +5825,18 @@ void Instruction::execute() { results_[0] = rbit(sourceValues_, metadata_); break; } + case Opcode::AArch64_RDSVLI_XI: { // rdsvl xd, #imm + // Uses Streaming SVE vector register size, regardless of streaming mode + // state + int64_t imm = metadata_.operands[1].imm; + results_[0] = imm * static_cast( + architecture_.getStreamingVectorLength() / 8); + break; + } case Opcode::AArch64_RDVLI_XI: { // rdvl xd, #imm - int8_t imm = static_cast(metadata_.operands[1].imm); - results_[0] = (uint64_t)(imm * (VL_bits / 8)); + // Uses current vector register size + int64_t imm = metadata_.operands[1].imm; + results_[0] = imm * static_cast(VL_bits / 8); break; } case Opcode::AArch64_RET: { // ret {xr} @@ -5961,6 +6726,50 @@ void Instruction::execute() { memoryData_ = sve_merge_store_data(d, p, VL_bits); break; } + case Opcode::AArch64_ST1D_2Z: // st1d {zt1.d, zt2.d}, png, [xn, xm, lsl + // #3] + // STORE + [[fallthrough]]; + case Opcode::AArch64_ST1D_2Z_IMM: { // st1d {zt1.d, zt2.d}, png, [xn{, + // #imm, mul vl}] + // STORE + const uint64_t* t1 = sourceValues_[0].getAsVector(); + const uint64_t* t2 = sourceValues_[1].getAsVector(); + const uint64_t pn = sourceValues_[2].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + memoryData_ = + sve_merge_store_data(t1, preds[0].data(), VL_bits); + std::vector out2 = + sve_merge_store_data(t2, preds[1].data(), VL_bits); + memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); + break; + } + case Opcode::AArch64_ST1D_4Z_IMM: { // st1d {zt1.d - zt4.d}, png, [xn{, + // #imm, mul vl}] + // STORE + const uint64_t* t1 = sourceValues_[0].getAsVector(); + const uint64_t* t2 = sourceValues_[1].getAsVector(); + const uint64_t* t3 = sourceValues_[2].getAsVector(); + const uint64_t* t4 = sourceValues_[3].getAsVector(); + const uint64_t pn = sourceValues_[4].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + memoryData_ = + sve_merge_store_data(t1, preds[0].data(), VL_bits); + std::vector out2 = + sve_merge_store_data(t2, preds[1].data(), VL_bits); + std::vector out3 = + sve_merge_store_data(t3, preds[2].data(), VL_bits); + std::vector out4 = + sve_merge_store_data(t4, preds[3].data(), VL_bits); + memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); + memoryData_.insert(memoryData_.end(), out3.begin(), out3.end()); + memoryData_.insert(memoryData_.end(), out4.begin(), out4.end()); + break; + } case Opcode::AArch64_ST1Fourv16b: { // st1 {vt.16b, vt2.16b, vt3.16b, // vt4.16b}, [xn|sp] // STORE @@ -6056,6 +6865,19 @@ void Instruction::execute() { results_[0] = sourceValues_[4].get() + postIndex; break; } + case Opcode::AArch64_ST1Onev4s_POST: { // st1 {vt.4s}, [xn|sp], <#imm|xm> + // STORE + const uint32_t* vt = sourceValues_[0].getAsVector(); + memoryData_[0] = RegisterValue((char*)vt, 4 * sizeof(uint32_t)); + + // if #imm post-index, value can only be 16 + const uint64_t postIndex = + (metadata_.operands[2].type == AARCH64_OP_REG) + ? 
sourceValues_[2].get() + : 16; + results_[0] = sourceValues_[1].get() + postIndex; + break; + } case Opcode::AArch64_ST1Twov16b: { // st1 {vt.16b, vt2.16b}, [xn|sp] // STORE const uint8_t* t = sourceValues_[0].getAsVector(); @@ -6152,6 +6974,50 @@ void Instruction::execute() { memoryData_ = sve_merge_store_data(d, p, VL_bits); break; } + case Opcode::AArch64_ST1W_2Z: // st1w {zt1.s, zt2.s}, png, [xn, xm, lsl + // #2] + // STORE + [[fallthrough]]; + case Opcode::AArch64_ST1W_2Z_IMM: { // st1w {zt1.s, zt2.s}, png, [xn{, + // #imm, mul vl}] + // STORE + const uint32_t* t1 = sourceValues_[0].getAsVector(); + const uint32_t* t2 = sourceValues_[1].getAsVector(); + const uint64_t pn = sourceValues_[2].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + memoryData_ = + sve_merge_store_data(t1, preds[0].data(), VL_bits); + std::vector out2 = + sve_merge_store_data(t2, preds[1].data(), VL_bits); + memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); + break; + } + case Opcode::AArch64_ST1W_4Z_IMM: { // st1w {zt1.s - zt4.s}, png, [xn{, + // #imm, mul vl}] + // STORE + const uint32_t* t1 = sourceValues_[0].getAsVector(); + const uint32_t* t2 = sourceValues_[1].getAsVector(); + const uint32_t* t3 = sourceValues_[2].getAsVector(); + const uint32_t* t4 = sourceValues_[3].getAsVector(); + const uint64_t pn = sourceValues_[4].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + memoryData_ = + sve_merge_store_data(t1, preds[0].data(), VL_bits); + std::vector out2 = + sve_merge_store_data(t2, preds[1].data(), VL_bits); + std::vector out3 = + sve_merge_store_data(t3, preds[2].data(), VL_bits); + std::vector out4 = + sve_merge_store_data(t4, preds[3].data(), VL_bits); + memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); + memoryData_.insert(memoryData_.end(), out3.begin(), out3.end()); + memoryData_.insert(memoryData_.end(), out4.begin(), out4.end()); + break; + } case Opcode::AArch64_ST1i16: { // st1 {vt.h}[index], [xn] // STORE const uint16_t* t = sourceValues_[0].getAsVector(); @@ -6231,33 +7097,15 @@ void Instruction::execute() { const uint64_t* d2 = sourceValues_[1].getAsVector(); const uint64_t* p = sourceValues_[2].getAsVector(); - std::vector memData; - bool inActiveBlock = false; - const uint16_t partition_num = VL_bits / 64; uint16_t index = 0; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (p[i / 8] & shifted_active) { - // If active and not in active block, initialise - if (!inActiveBlock) { - memData.clear(); - inActiveBlock = true; - } - memData.push_back(d1[i]); - memData.push_back(d2[i]); - } else if (inActiveBlock) { - inActiveBlock = false; - memoryData_[index] = RegisterValue( - (char*)memData.data(), sizeof(uint64_t) * memData.size()); - index++; + memoryData_[index++] = RegisterValue(d1[i], 8); + memoryData_[index++] = RegisterValue(d2[i], 8); } } - // Add final block if needed - if (inActiveBlock) - memoryData_[index] = RegisterValue((char*)memData.data(), - sizeof(uint64_t) * memData.size()); - break; } case Opcode::AArch64_ST2Twov4s_POST: { // st2 {vt1.4s, vt2.4s}, [xn], @@ -6277,6 +7125,31 @@ void Instruction::execute() { results_[0] = sourceValues_[2].get() + postIndex; break; } + case Opcode::AArch64_ST4W: // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, + // pg, [, xm, lsl #2] + [[fallthrough]]; + case Opcode::AArch64_ST4W_IMM: { // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, + // pg, [{, #imm, mul vl}] + // STORE + const uint32_t* d1 = sourceValues_[0].getAsVector(); + const uint32_t* d2 = 
sourceValues_[1].getAsVector(); + const uint32_t* d3 = sourceValues_[2].getAsVector(); + const uint32_t* d4 = sourceValues_[3].getAsVector(); + const uint64_t* p = sourceValues_[4].getAsVector(); + + const uint16_t partition_num = VL_bits / 32; + uint16_t index = 0; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (p[i / 16] & shifted_active) { + memoryData_[index++] = RegisterValue(d1[i], 4); + memoryData_[index++] = RegisterValue(d2[i], 4); + memoryData_[index++] = RegisterValue(d3[i], 4); + memoryData_[index++] = RegisterValue(d4[i], 4); + } + } + break; + } case Opcode::AArch64_STLRB: { // stlrb wt, [xn] // STORE memoryData_[0] = sourceValues_[0]; @@ -7028,6 +7901,11 @@ void Instruction::execute() { bfm_2imms(sourceValues_, metadata_, false, true); break; } + case Opcode::AArch64_UCVTFSXSri: { // ucvtf sd, xn, #fbits + results_[0] = { + ucvtf_fixedToFloat(sourceValues_, metadata_), 256}; + break; + } case Opcode::AArch64_UCVTFUWDri: { // ucvtf dd, wn results_[0] = {static_cast(sourceValues_[0].get()), 256}; @@ -7066,6 +7944,200 @@ void Instruction::execute() { results_[0] = {div_3ops(sourceValues_), 8}; break; } + case Opcode::AArch64_UDOT_VG4_M4Z4Z_BtoS: { // udot za.s[wv, #off, vgx4], + // {zn1.b - zn4.b}, {zm1.b - + // zm4.b} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Get base zn and zm register indexed in sourceValues + const uint16_t znBase = zaRowCount + 1; + const uint16_t zmBase = zaRowCount + 5; + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. 
+ const uint32_t* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const uint8_t* znr = sourceValues_[znBase + r].getAsVector(); + const uint8_t* zmr = sourceValues_[zmBase + r].getAsVector(); + uint32_t out[64] = {0}; + // Loop over all 32-bit elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e]; + // There are 4 8-bit elements per 32-bit element of `znr` and `zmr` + for (int i = 0; i < 4; i++) { + out[e] += static_cast(znr[4 * e + i]) * + static_cast(zmr[4 * e + i]); + } + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_UDOT_VG4_M4ZZI_BtoS: { // udot za.s[wv, #off, vgx4], + // {zn1.b - zn4.b}, + // zm.b[#index] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + // Get zm vector and zm's index + const uint8_t* zm = + sourceValues_[zaRowCount + 5].getAsVector(); + const int zmIndex = metadata_.operands[5].vector_index; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. + const uint32_t* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const uint8_t* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + uint32_t out[64] = {0}; + // Loop over all 32-bit elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + // This instruction destructively adds the widened dot product + // (4x 8-bit --> 1x 32-bit) of the following to each 32-bit element + // in the current `zaRow`: + // - four 8-bit values in each corresponding 32-bit element of + // the current source `znr` vector + // - four 8-bit values from a 32-bit element of `zm`, selected + // from each 128-bit segment of `zm` using an index + // + // The 128-bit segment of `zm` currently in use corresponds to the + // 128-bit segment that the current 32-bit elements of `znr` + // and `zaRow` are within. 
+ // For example, with a SVL = 512-bits, elements `e` of `zaRow` in + // the range 0->15, and zmIndex = 1: + // - When `e` = 0 -> 3, the 32-bit element used from `zm` will be + // zm[1] (1st 32-bit element in 0th 128-bit + // segment) + // - When `e` = 4 -> 7, the 32-bit element used from `zm` will be + // zm[5] (1st 32-bit element in 1st 128-bit + // segment) + out[e] = zaRow[e]; + // MOD 4 as there are 4 32-bit elements per 128-bit segment of `zm` + const int zmSegBase = e - (e % 4); + const int s = zmSegBase + zmIndex; + // There are 4 8-bit elements per 32-bit element of `znr` and `zm` + for (int i = 0; i < 4; i++) { + out[e] += static_cast(znr[4 * e + i]) * + static_cast(zm[4 * s + i]); + } + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_UVDOT_VG4_M4ZZI_BtoS: { // uvdot za.s[wv, #off, + // vgx4], {zn1.b - zn4.b}, + // zm.b[#index] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + // Get zm vector and zm's index + const uint8_t* zm = + sourceValues_[zaRowCount + 5].getAsVector(); + const int zmIndex = metadata_.operands[5].vector_index; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. 
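+        // Unlike UDOT, UVDOT forms the dot product "vertically": for output
+        // element `e` of row `r`, the i-th 8-bit operand comes from byte
+        // (4 * e) + r of the i-th source vector (zn1 - zn4), i.e. the four
+        // source vectors are read as if transposed.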
+ const uint32_t* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + uint32_t out[64] = {0}; + // Loop over all 32-bit elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e]; + // MOD 4 as there are 4 32-bit elements per 128-bit segment of `zm` + const int zmSegBase = e - (e % 4); + const int s = zmSegBase + zmIndex; + // There are 4 8-bit elements per 32-bit element of `znr` and `zm` + for (int i = 0; i < 4; i++) { + const uint8_t* znr = + sourceValues_[zaRowCount + 1 + i].getAsVector(); + out[e] += static_cast(znr[4 * e + r]) * + static_cast(zm[4 * s + i]); + } + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_UDOT_ZZZ_S: { // udot zd.s, zn.b, zm.b + results_[0] = + sveUdot(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_UDOT_ZZZI_S: { // udot zd.s, zn.b, zm.b[index] + results_[0] = sveUdot_indexed(sourceValues_, + metadata_, VL_bits); + break; + } + case Opcode::AArch64_UDOTv16i8: { // udot vd.4s, vn.16b, vm.16b + results_[0] = vecUdot<4>(sourceValues_, metadata_); + break; + } + case Opcode::AArch64_UDOTlanev16i8: { // udot vd.4s, vn.16b, vm.4b[index] + results_[0] = vecUdot_byElement<4>(sourceValues_, metadata_); + break; + } + case Opcode::AArch64_UDOTlanev8i8: { // udot vd.2s, vn.8b, vm.4b[index] + results_[0] = vecUdot_byElement<2>(sourceValues_, metadata_); + break; + } case Opcode::AArch64_UMADDLrrr: { // umaddl xd, wn, wm, xa results_[0] = maddl_4ops(sourceValues_); break; @@ -7078,6 +8150,32 @@ void Instruction::execute() { results_[0] = vecUMinP(sourceValues_); break; } + case Opcode::AArch64_UMLALv2i32_indexed: { // umlal vd.2d, vn.2s, + // vm.s[index] + const uint64_t* vd = sourceValues_[0].getAsVector(); + const uint32_t* vn = sourceValues_[1].getAsVector(); + const uint32_t* vm = sourceValues_[2].getAsVector(); + const int64_t index = metadata_.operands[2].vector_index; + const uint64_t vm_idx_elem = static_cast(vm[index]); + + uint64_t out[2] = {vd[0] + static_cast(vn[0]) * vm_idx_elem, + vd[1] + static_cast(vn[1]) * vm_idx_elem}; + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_UMLALv4i32_indexed: { // umlal2 vd.2d, vn.4s, + // vm.s[index] + const uint64_t* vd = sourceValues_[0].getAsVector(); + const uint32_t* vn = sourceValues_[1].getAsVector(); + const uint32_t* vm = sourceValues_[2].getAsVector(); + const int64_t index = metadata_.operands[2].vector_index; + const uint64_t vm_idx_elem = static_cast(vm[index]); + + uint64_t out[2] = {vd[0] + static_cast(vn[2]) * vm_idx_elem, + vd[1] + static_cast(vn[3]) * vm_idx_elem}; + results_[0] = {out, 256}; + break; + } case Opcode::AArch64_UMOPA_MPPZZ_D: { // umopa zada.d, pn/m, pm/m, zn.h, // zm.h // SME @@ -7257,6 +8355,17 @@ void Instruction::execute() { sourceValues_[1].get()); break; } + case Opcode::AArch64_UMULLv4i16_v4i32: { // umull vd.4s, vn.4h, vm.4h + const uint16_t* vn = sourceValues_[0].getAsVector(); + const uint16_t* vm = sourceValues_[1].getAsVector(); + + uint32_t out[4] = {0}; + for (int i = 0; i < 4; i++) { + out[i] = static_cast(vn[i]) * static_cast(vm[i]); + } + results_[0] = {out, 256}; + break; + } case Opcode::AArch64_UQDECD_WPiI: { // uqdecd wd{, pattern{, MUL #imm}} results_[0] = sveUqdec(sourceValues_, metadata_, VL_bits); @@ -7655,6 +8764,10 @@ void Instruction::execute() { results_[0] = sveZip_preds(sourceValues_, VL_bits, false); break; } + case Opcode::AArch64_ZIP1_ZZZ_B: { // zip1 zd.b, zn.b, zm.b + 
results_[0] = sveZip_vecs(sourceValues_, VL_bits, false); + break; + } case Opcode::AArch64_ZIP1_ZZZ_D: { // zip1 zd.d, zn.d, zm.d results_[0] = sveZip_vecs(sourceValues_, VL_bits, false); break; @@ -7707,6 +8820,10 @@ void Instruction::execute() { results_[0] = sveZip_preds(sourceValues_, VL_bits, true); break; } + case Opcode::AArch64_ZIP2_ZZZ_B: { // zip2 zd.b, zn.b, zm.b + results_[0] = sveZip_vecs(sourceValues_, VL_bits, true); + break; + } case Opcode::AArch64_ZIP2_ZZZ_D: { // zip2 zd.d, zn.d, zm.d results_[0] = sveZip_vecs(sourceValues_, VL_bits, true); break; @@ -7743,6 +8860,29 @@ void Instruction::execute() { results_[0] = vecZip(sourceValues_, true); break; } + case Opcode::AArch64_ZIP_VG4_4Z4Z_S: { // zip {zd1.s - zd4.s}, {zn1.s - + // zn4.s} + const uint32_t* zn[4]; + zn[0] = sourceValues_[0].getAsVector(); + zn[1] = sourceValues_[1].getAsVector(); + zn[2] = sourceValues_[2].getAsVector(); + zn[3] = sourceValues_[3].getAsVector(); + + const uint16_t quads = VL_bits / (32 * 4); + + uint32_t out[4][64] = {{0}, {0}, {0}, {0}}; + for (int r = 0; r < 4; r++) { + const uint16_t base = r * quads; + for (int q = 0; q < quads; q++) { + out[r][4 * q] = zn[0][base + q]; + out[r][4 * q + 1] = zn[1][base + q]; + out[r][4 * q + 2] = zn[2][base + q]; + out[r][4 * q + 3] = zn[3][base + q]; + } + results_[r] = RegisterValue(out[r], 256); + } + break; + } case Opcode::AArch64_ZERO_M: { // zero {mask} // SME // Not in right context mode. Raise exception @@ -7753,6 +8893,15 @@ void Instruction::execute() { } break; } + case Opcode::AArch64_ZERO_T: { // zero {zt0} + // SME + // Not in right context mode. Raise exception + if (!ZAenabled) return ZAdisabled(); + + // ZT0 has a fixed width of 512-bits + results_[0] = RegisterValue(0, 64); + break; + } default: return executionNYI(); } diff --git a/test/integration/ConfigTest.cc b/test/integration/ConfigTest.cc index 48975eeac..49a028ebb 100644 --- a/test/integration/ConfigTest.cc +++ b/test/integration/ConfigTest.cc @@ -24,7 +24,8 @@ TEST(ConfigTest, Default) { aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, - aarch64_sysreg::AARCH64_SYSREG_SVCR}; + aarch64_sysreg::AARCH64_SYSREG_SVCR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR2_EL0}; EXPECT_EQ(simeng::config::SimInfo::getSysRegVec(), sysRegisterEnums); std::vector archRegStruct = { {8, 32}, @@ -384,7 +385,8 @@ TEST(ConfigTest, configFromFile) { aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, - aarch64_sysreg::AARCH64_SYSREG_SVCR}; + aarch64_sysreg::AARCH64_SYSREG_SVCR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR2_EL0}; EXPECT_EQ(simeng::config::SimInfo::getSysRegVec(), sysRegisterEnums); std::vector archRegStruct = { {8, 32}, diff --git a/test/regression/aarch64/AArch64RegressionTest.hh b/test/regression/aarch64/AArch64RegressionTest.hh index 32d975b09..6afdc47d2 100644 --- a/test/regression/aarch64/AArch64RegressionTest.hh +++ b/test/regression/aarch64/AArch64RegressionTest.hh @@ -190,6 +190,23 @@ inline std::vector> genCoreTypeSVLPairs( checkMatrixRegisterCol(tag, index, __VA_ARGS__); \ } +/** Check each element of the Lookup Table register ZT0 against expected values. + * + * The `type` argument is the C++ data type to use for value comparisons. The + * third argument should be an initializer list containing one value for each + * register element (for a total of `(64 / sizeof(type))` values). 
+ * + * For example: + * + * // Compare zt0 to some expected 32-bit uint64 values. + * CHECK_TABLE(0, uint32_t, {1, 2, 3, 4, ..., 16}); + */ +#define CHECK_TABLE(type, ...) \ + { \ + SCOPED_TRACE("<<== error generated here"); \ + checkTableRegister(__VA_ARGS__); \ + } + /** A helper macro to predecode the first instruction in a snippet of Armv9.2-a * assembly code and check the assigned group(s) for each micro-op matches the * expected group(s). Returns from the calling function if a fatal error occurs. @@ -239,13 +256,16 @@ class AArch64RegressionTest : public RegressionTest { /** Get the subtarget feature string based on LLVM version being used */ std::string getSubtargetFeaturesString() { -#if SIMENG_LLVM_VERSION < 14 - return "+sve,+lse"; -#elif SIMENG_LLVM_VERSION < 18 - return "+sve,+lse,+sve2,+sme,+sme-f64"; -#else - return "+sve,+lse,+sve2,+sme,+sme-f64f64,+sme-i16i64,+sme2"; + std::string features = "+dotprod,+sve,+lse"; +#if SIMENG_LLVM_VERSION > 13 + // "+dotprod,+sve,+lse,+sve2,+sme,+sme-f64"; + features += ",+sve2,+sme,+sme-f64"; +#endif +#if SIMENG_LLVM_VERSION > 17 + // "+dotprod,+sve,+lse,+sve2,+sme,+sme-f64f64,+sme-i16i64,+sme2"; + features += "f64,+sme-i16i64,+sme2"; #endif + return features; } /** Check the elements of a Neon register. @@ -358,6 +378,21 @@ class AArch64RegressionTest : public RegressionTest { } } + /** Check the elements of the ZT0 lookup table register. + * + * This should be invoked via the `CHECK_TABLE` macro in order to provide + * better diagnostic messages, rather than called directly from test code. + */ + template + void checkTableRegister(const std::array& values) const { + const T* data = RegressionTest::getVectorRegister( + {simeng::arch::aarch64::RegisterType::TABLE, 0}); + for (unsigned i = 0; i < (64 / sizeof(T)); i++) { + EXPECT_NEAR(data[i], values[i], 0.0005) + << "Mismatch for element " << i << "."; + } + } + /** Get the value of a general purpose register. 
*/ template T getGeneralRegister(uint8_t tag) const { diff --git a/test/regression/aarch64/instructions/bitmanip.cc b/test/regression/aarch64/instructions/bitmanip.cc index a72dcb64d..30eb27fce 100644 --- a/test/regression/aarch64/instructions/bitmanip.cc +++ b/test/regression/aarch64/instructions/bitmanip.cc @@ -274,11 +274,20 @@ TEST_P(InstBitmanip, ubfm) { ubfm w2, w0, #16, #31 ubfm w3, w0, #28, #23 ubfm w4, w0, #30, #27 + + # check alias + mov w10, #-1 + mov w11, #-1 + mov w12, #128 + lsl w10, w12, #1 + lsr w11, w12, #1 )"); EXPECT_EQ(getGeneralRegister(1), 0x000007A0ull); EXPECT_EQ(getGeneralRegister(2), 0x0000007Aull); EXPECT_EQ(getGeneralRegister(3), 0x07A00000ull); EXPECT_EQ(getGeneralRegister(4), 0x01E80000ull); + EXPECT_EQ(getGeneralRegister(10), 256); + EXPECT_EQ(getGeneralRegister(11), 64); RUN_AARCH64(R"( # Fill destination registers with 1s @@ -295,11 +304,20 @@ TEST_P(InstBitmanip, ubfm) { ubfm x2, x0, #16, #63 ubfm x3, x0, #32, #23 ubfm x4, x0, #60, #55 + + # check alias + mov x10, #-1 + mov x11, #-1 + mov x12, #128 + lsl x10, x12, #1 + lsr x11, x12, #1 )"); EXPECT_EQ(getGeneralRegister(1), 0x00000000000007A0ull); EXPECT_EQ(getGeneralRegister(2), 0x000000000000007Aull); EXPECT_EQ(getGeneralRegister(3), 0x007A000000000000ull); EXPECT_EQ(getGeneralRegister(4), 0x0000000007A00000ull); + EXPECT_EQ(getGeneralRegister(10), 256); + EXPECT_EQ(getGeneralRegister(11), 64); } INSTANTIATE_TEST_SUITE_P(AArch64, InstBitmanip, diff --git a/test/regression/aarch64/instructions/float.cc b/test/regression/aarch64/instructions/float.cc index 03f3f799d..627e710e7 100644 --- a/test/regression/aarch64/instructions/float.cc +++ b/test/regression/aarch64/instructions/float.cc @@ -1453,6 +1453,37 @@ TEST_P(InstFloat, ucvtf) { CHECK_NEON(9, float, {static_cast(UINT64_C(1) << 48), 0.f, 0.f, 0.f}); CHECK_NEON(10, float, {static_cast(UINT64_MAX), 0.f, 0.f, 0.f}); CHECK_NEON(11, float, {0.f, 0.f, 0.f, 0.f}); + + // 32-bit unsigned fixed-point to float + // Numbers have been chosen to have less than 0.0005 fixed-point + // representation error to ensure tests pass + initialHeapData_.resize(12); + heap32 = reinterpret_cast(initialHeapData_.data()); + heap32[0] = 0x000001EE; // 123.5 (2 fraction bits) + heap32[1] = 0x00021F3B; // 543.23 (8 fraction bits) + heap32[2] = 0x32FE6B75; // 101.987654321 (23 fraction bits) + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + # 2 fraction-bits (123.5) + ldr w1, [x0], #4 + ucvtf s1, x1, #0x2 + + # 8 fraction-bits (543.23) + ldr w2, [x0], #4 + ucvtf s2, x2, #0x8 + + + # 23 fraction-bits (101.987654321) + ldr w3, [x0] + ucvtf s3, x3, #0x17 + )"); + CHECK_NEON(1, float, {123.5f, 0.0f, 0.0f, 0.0f}); + CHECK_NEON(2, float, {543.23f, 0.0f, 0.0f, 0.0f}); + CHECK_NEON(3, float, {101.987654321f, 0.0f, 0.0f, 0.0f}); } TEST_P(InstFloat, frintp) { diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc index 09269eebb..b98013d2a 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -231,6 +231,41 @@ TEST_P(InstLoad, ld1_multi_struct) { EXPECT_EQ(getGeneralRegister(12), getGeneralRegister(10) + 16); + // One reg, 8b elements + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #8 + + # save heap address before post index + mov x10, x0 + + # Load values from heap with imm post-index + ld1 {v1.8b}, [x0], #8 + + # save heap address after post index + mov x11, x0 + + # Load values from heap with reg post-index + ld1 
{v2.8b}, [x0], x1 + + mov x12, x0 + )"); + + CHECK_NEON(1, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}); + CHECK_NEON(2, uint8_t, + {0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}); + EXPECT_EQ(getGeneralRegister(11), + getGeneralRegister(10) + 8); + EXPECT_EQ(getGeneralRegister(12), + getGeneralRegister(10) + 16); + // Two reg, 16b elements RUN_AARCH64(R"( # Get heap address @@ -282,6 +317,53 @@ TEST_P(InstLoad, ld1_multi_struct) { EXPECT_EQ(getGeneralRegister(12), getGeneralRegister(10) + 32); + // Two reg, 8h elements + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #32 + + # Load values from heap + # ld1 {v0.8h, v1.8h}, [x0] + + # save heap address before post index + mov x10, x0 + + # Load values from heap with imm post-index + ld1 {v2.8h, v3.8h}, [x0], #32 + + # save heap address after post index + mov x11, x0 + sub x0, x0, #32 + + # Load values from heap with reg post-index + ld1 {v4.8h, v5.8h}, [x0], x1 + + mov x12, x0 + )"); + + // CHECK_NEON(0, uint16_t, + // {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, + // 0xEEDD}); + // CHECK_NEON(1, uint16_t, + // {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, + // 0xEEDD}); + CHECK_NEON(2, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(3, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(4, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(5, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + EXPECT_EQ(getGeneralRegister(11), + getGeneralRegister(10) + 32); + EXPECT_EQ(getGeneralRegister(12), + getGeneralRegister(10) + 32); + // Two reg, 2d elements RUN_AARCH64(R"( # Get heap address @@ -1222,14 +1304,23 @@ TEST_P(InstLoad, ldrsb) { mov x5, 1 # Load 8-bit values from heap and sign-extend to 32-bits ldrsb w1, [x0, x5, sxtx] + # Post Index + mov x20, x0 + ldrsb w2, [x20], #16 + # Load 8-bit values from heap and sign-extend to 64-bits - ldrsb x2, [x0] - ldrsb x3, [x0, #3] + ldrsb x3, [x0] + ldrsb x4, [x0, #3] + )"); EXPECT_EQ(getGeneralRegister(1), INT8_MAX); - EXPECT_EQ(getGeneralRegister(2), -2); - EXPECT_EQ(getGeneralRegister(3), 64); + EXPECT_EQ(getGeneralRegister(2), -2); + EXPECT_EQ(getGeneralRegister(20), + getGeneralRegister(0) + 16); + + EXPECT_EQ(getGeneralRegister(3), -2); + EXPECT_EQ(getGeneralRegister(4), 64); } TEST_P(InstLoad, ldrsh) { diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 96d23590a..f3341e23f 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -3648,6 +3648,65 @@ TEST_P(InstNeon, trn) { CHECK_NEON(8, uint64_t, {0x1e1c1a1816141210, 0x1f1d1b1917151311}); } +TEST_P(InstNeon, udot) { + // udot by element + initialHeapData_.resize(128); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0xDEADBEEFFFFF00FF; + heap64[1] = 0x01234567ABBACAFE; + heap64[2] = 0xFEDCBA9876543210; + heap64[3] = 0xDEADCAFEABBABEEF; + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + + movi v2.4s, #3 + movi v3.4s, #4 + movi v4.4s, #5 + movi v5.4s, #6 + + udot v2.2s, v1.8b, v0.4b[0] + udot v3.4s, v1.16b, v0.4b[1] + udot v4.2s, v1.8b, v0.4b[2] + udot v5.4s, v1.16b, v0.4b[3] + )"); + CHECK_NEON(0, uint64_t, 
{0xDEADBEEFFFFF00FF, 0x01234567ABBACAFE}); + CHECK_NEON(1, uint64_t, {0xFEDCBA9876543210, 0xDEADCAFEABBABEEF}); + CHECK_NEON(2, uint32_t, {0xd929, 0x26f91, 0x0, 0x0}); + CHECK_NEON(3, uint32_t, {0xd328, 0x288e8, 0x27e25, 0x2b87f}); + CHECK_NEON(4, uint32_t, {0xc333, 0x2731b, 0x0, 0x0}); + CHECK_NEON(5, uint32_t, {0x1fe2, 0x8e62, 0xad7e, 0xb52f}); + + // udot by vector + initialHeapData_.resize(128); + heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0xDEADBEEFFFFFFFFF; + heap64[1] = 0x01234567ABBACAFE; + heap64[2] = 0xFEDCBA98FFFFFFFF; + heap64[3] = 0xDEADCAFEABBABEEF; + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + + movi v2.4s, #3 + + udot v2.4s, v1.16b, v0.16b + )"); + CHECK_NEON(0, uint64_t, {0xDEADBEEFFFFFFFFF, 0x01234567ABBACAFE}); + CHECK_NEON(1, uint64_t, {0xFEDCBA98FFFFFFFF, 0xDEADCAFEABBABEEF}); + CHECK_NEON(2, uint32_t, {0x3F807, 0x288E7, 0x27C6E, 0xB52C}); +} + TEST_P(InstNeon, uzp) { initialHeapData_.resize(128); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); @@ -3723,6 +3782,92 @@ TEST_P(InstNeon, uzp) { CHECK_NEON(8, uint64_t, {0x1e1c1a1816141210, 0x1f1d1b1917151311}); } +TEST_P(InstNeon, umlal) { + // uint32 to uint64, lower half + RUN_AARCH64(R"( + mov w0, #-1 + mov w1, #344 + mov v0.s[0], w0 + mov v0.s[3], w1 + + mov w2, #-1 + mov w3, #3 + mov v1.s[0], w2 + mov v1.s[1], w3 + + mov v2.d[0], xzr + mov v2.d[1], xzr + mov v3.d[0], xzr + mov v3.d[1], xzr + + umlal v2.2d, v1.2s, v0.s[0] + umlal v3.2d, v1.2s, v0.s[3] + )"); + CHECK_NEON(0, uint32_t, {UINT32_MAX, 0, 0, 344}); + CHECK_NEON(2, uint64_t, {18446744065119617025ull, 12884901885ull}); + CHECK_NEON(3, uint64_t, {1477468749480ull, 1032ull}); + + // uint32 to uint64, upper half + RUN_AARCH64(R"( + mov w0, #-1 + mov w1, #344 + mov v0.s[0], w0 + mov v0.s[3], w1 + + mov w2, #-1 + mov w3, #3 + mov v1.s[2], w2 + mov v1.s[3], w3 + + mov v2.d[0], xzr + mov v2.d[1], xzr + mov v3.d[0], xzr + mov v3.d[1], xzr + + umlal2 v2.2d, v1.4s, v0.s[0] + umlal2 v3.2d, v1.4s, v0.s[3] + )"); + CHECK_NEON(0, uint32_t, {UINT32_MAX, 0, 0, 344}); + CHECK_NEON(2, uint64_t, {18446744065119617025ull, 12884901885ull}); + CHECK_NEON(3, uint64_t, {1477468749480ull, 1032ull}); +} + +TEST_P(InstNeon, umull) { + // uint16_t to uint32_t + initialHeapData_.resize(32); + uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + heap16[0] = UINT16_MAX; + heap16[1] = 0; + heap16[2] = 1234; + heap16[3] = 0xBEEF; + heap16[4] = 0xABBA; + heap16[5] = 0xCAFE; + heap16[6] = 0xDEAD; + heap16[7] = 0xACDC; + + heap16[8] = UINT16_MAX; + heap16[9] = 0xACDC; + heap16[10] = 0xCAFE; + heap16[11] = 0xABBA; + heap16[12] = 0xBEEF; + heap16[13] = 0xDEAD; + heap16[14] = 9876; + heap16[15] = 0; + + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + + umull v2.4s, v0.4h, v1.4h + )"); + CHECK_NEON(2, uint32_t, {4294836225u, 0, 64126044u, 2148818598u}); +} + TEST_P(InstNeon, zip) { initialHeapData_.resize(128); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index a54c0c981..9a7c3b4ec 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -8,6 +8,52 @@ namespace { using InstSme = AArch64RegressionTest; #if SIMENG_LLVM_VERSION >= 14 + +TEST_P(InstSme, add) { + // uint32_T, vgx2, vecs with ZA + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 
214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z0.b, #8 + dup z1.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z0.b, z1.b + umopa za1.s, p0/m, p1/m, z0.b, z1.b + umopa za2.s, p0/m, p1/m, z0.b, z1.b + umopa za3.s, p0/m, p1/m, z0.b, z1.b + + # Set 2 of the za rows + mov w8, #1 + dup z0.s, #8 + dup z1.s, #3 + add za.s[w8, #1, vgx2], {z0.s, z1.s} + )"); + const uint16_t zaStride = (SVL / 8) / 2; + const uint16_t zaHalfIndex = 2; + for (uint16_t i = 0; i < (SVL / 8); i++) { + if (i == zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({104}, (SVL / 8))); + } else if (i == zaStride + zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({99}, (SVL / 8))); + } else { + // un-effected rows should still be 96 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({96}, (SVL / 8))); + } + } +} + TEST_P(InstSme, addha) { // 32-bit RUN_AARCH64(R"( @@ -136,6 +182,496 @@ TEST_P(InstSme, addha) { } } +TEST_P(InstSme, mova_zaToVecs) { + // 2 vectors + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # Set 4 of the za rows + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z10.b}, p0/z, [x0] + udot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + + # Extravt un-updated values + mov w9, #0 + mova {z20.d, z21.d}, za.d[w9, #0, vgx2] + # Extract 0th and 2nd updated rows + mov {z24.d, z25.d}, za.d[w8, #1, vgx2] + # Extract 1st and 3rd updated rows (get new offset into each half) + addvl x10, x10, #1 + mov x20, #4 + udiv x10, x10, x20 + mov {z26.d, z27.d}, za.d[w10, #2, vgx2] + )"); + // Check extracted un-effected rows (two uint32_t values of 96 equal one + // uint64_t value of 412316860512) + CHECK_NEON(20, uint64_t, fillNeon({412316860512}, SVL / 8)); + CHECK_NEON(21, uint64_t, fillNeon({412316860512}, SVL / 8)); + // Check extracted effected rows (two uint32_t values concatonated into one + // uint64_t value) + CHECK_NEON(24, uint64_t, fillNeon({2044404433372}, SVL / 8)); + CHECK_NEON(25, uint64_t, fillNeon({2370821947944}, SVL / 8)); + CHECK_NEON(26, uint64_t, fillNeon({2207613190658}, SVL / 8)); + CHECK_NEON(27, uint64_t, fillNeon({2534030705230}, SVL / 8)); + + // 4 vectors + initialHeapData_.resize(SVL / 8); + heap8 = reinterpret_cast(initialHeapData_.data()); + src = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # Set 4 of the za rows + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z10.b}, p0/z, [x0] + udot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + + mov w9, #0 + mova {z20.d - z23.d}, za.d[w9, #0, vgx4] + mov {z24.d - z27.d}, za.d[w8, 
#1, vgx4] + )"); + // Check extracted un-effected rows (two uint32_t values of 96 equal one + // uint64_t value of 412316860512) + CHECK_NEON(20, uint64_t, fillNeon({412316860512}, SVL / 8)); + CHECK_NEON(21, uint64_t, fillNeon({412316860512}, SVL / 8)); + CHECK_NEON(22, uint64_t, fillNeon({412316860512}, SVL / 8)); + CHECK_NEON(23, uint64_t, fillNeon({412316860512}, SVL / 8)); + // Check extracted effected rows (two uint32_t values concatonated into one + // uint64_t value) + CHECK_NEON(24, uint64_t, fillNeon({2044404433372}, SVL / 8)); + CHECK_NEON(25, uint64_t, fillNeon({2207613190658}, SVL / 8)); + CHECK_NEON(26, uint64_t, fillNeon({2370821947944}, SVL / 8)); + CHECK_NEON(27, uint64_t, fillNeon({2534030705230}, SVL / 8)); +} + +TEST_P(InstSme, mova_tilesToVecs) { + // uint8_t; 4 vectors + initialHeapData_.resize(SVL / 4); + uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap32, src, SVL / 16); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + mov w12, #0 + ptrue p0.s + + # Pre-fill first 4 rows of za0.b + ld1w {za0h.s[w12, 0]}, p0/z, [x0] + ld1w {za1h.s[w12, 0]}, p0/z, [x0] + ld1w {za2h.s[w12, 0]}, p0/z, [x0] + ld1w {za3h.s[w12, 0]}, p0/z, [x0] + + + mova {z4.b-z7.b}, za0h.b[w12, 0:3] + + # Test Alias + mov w13, #1 + dup z11.b, #3 + mov {z8.b-z11.b}, za0h.b[w13, 0:3] + )"); + for (int i = 4; i <= 10; i++) { + CHECK_NEON( + i, uint8_t, + fillNeon({0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, 0x32, + 0x54, 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}, + SVL / 8)); + } + CHECK_NEON(11, uint8_t, fillNeon({0x00}, SVL / 8)); +} + +TEST_P(InstSme, fadd) { + // Float, VGx2 + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0f + fdup z1.s, #3.0 + fdup z2.s, #8.0 + ptrue p0.s + ptrue p1.s + fmopa za0.s, p0/m, p1/m, z1.s, z2.s + fmopa za1.s, p0/m, p1/m, z1.s, z2.s + fmopa za2.s, p0/m, p1/m, z1.s, z2.s + fmopa za3.s, p0/m, p1/m, z1.s, z2.s + + # initialise registers + mov w8, #1 + fdup z4.s, #-2.5 + fdup z5.s, #3.0 + + fadd za.s[w8, #1, vgx2], {z4.s, z5.s} + )"); + const uint16_t zaStride = (SVL / 8) / 2; + const uint16_t zaHalfIndex = 2; + for (uint16_t i = 0; i < (SVL / 8); i++) { + if (i == zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({21.5f}, (SVL / 8))); + } else if (i == zaStride + zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({27.0f}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0f throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({24.0f}, (SVL / 8))); + } + } + + // Double, VGx2 + initialHeapData_.resize(SVL / 8); + heap8 = reinterpret_cast(initialHeapData_.data()); + src = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0 + fdup z1.d, #3.0 + fdup z2.d, #8.0 + ptrue p0.d + ptrue p1.d + fmopa za0.d, p0/m, p1/m, z1.d, z2.d + fmopa za1.d, p0/m, p1/m, z1.d, z2.d + fmopa za2.d, p0/m, p1/m, z1.d, z2.d + fmopa za3.d, p0/m, p1/m, z1.d, z2.d + fmopa za4.d, p0/m, p1/m, z1.d, z2.d + fmopa za5.d, p0/m, p1/m, z1.d, z2.d + fmopa za6.d, p0/m, 
p1/m, z1.d, z2.d + fmopa za7.d, p0/m, p1/m, z1.d, z2.d + + + # initialise registers + mov w8, #1 + fdup z4.d, #-2.5 + fdup z5.d, #3.0 + + fadd za.d[w8, #1, vgx2], {z4.d, z5.d} + )"); + for (uint16_t i = 0; i < (SVL / 8); i++) { + if (i == zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({21.5}, (SVL / 8))); + } else if (i == zaStride + zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({27.0}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0f throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.0}, (SVL / 8))); + } + } +} + +TEST_P(InstSme, fmla_multiVecs) { + // float, vgx4 + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0f + fdup z1.s, #3.0 + fdup z2.s, #8.0 + ptrue p0.s + ptrue p1.s + fmopa za0.s, p0/m, p1/m, z1.s, z2.s + fmopa za1.s, p0/m, p1/m, z1.s, z2.s + fmopa za2.s, p0/m, p1/m, z1.s, z2.s + fmopa za3.s, p0/m, p1/m, z1.s, z2.s + + # initialise registers + mov w8, #1 + fdup z4.s, #0.25 + fdup z5.s, #1.5 + fdup z6.s, #-0.5 + fdup z7.s, #-2.5 + fdup z8.s, #3.0 + fdup z9.s, #4.0 + fdup z10.s, #5.0 + fdup z11.s, #6.0 + + fmla za.s[w8, #1, vgx4], {z4.s - z7.s}, {z8.s - z11.s} + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint16_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm value of 2.0f + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({24.75f}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({30.0f}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({21.5f}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({9.0f}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0f throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({24.0f}, (SVL / 8))); + } + } + + // double, vgx4 + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0 + fdup z1.d, #3.0 + fdup z2.d, #8.0 + ptrue p0.d + ptrue p1.d + fmopa za0.d, p0/m, p1/m, z1.d, z2.d + fmopa za1.d, p0/m, p1/m, z1.d, z2.d + fmopa za2.d, p0/m, p1/m, z1.d, z2.d + fmopa za3.d, p0/m, p1/m, z1.d, z2.d + fmopa za4.d, p0/m, p1/m, z1.d, z2.d + fmopa za5.d, p0/m, p1/m, z1.d, z2.d + fmopa za6.d, p0/m, p1/m, z1.d, z2.d + fmopa za7.d, p0/m, p1/m, z1.d, z2.d + + # initialise registers + mov w8, #1 + fdup z4.d, #0.25 + fdup z5.d, #1.5 + fdup z6.d, #-0.5 + fdup z7.d, #-2.5 + fdup z8.d, #3.0 + fdup z9.d, #4.0 + fdup z10.d, #5.0 + fdup z11.d, #6.0 + + fmla za.d[w8, #1, vgx4], {z4.d - z7.d}, {z8.d - z11.d} + )"); + for (uint16_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm value of 2.0 + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.75}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({30.0}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({21.5}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({9.0}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.0}, (SVL / 8))); + } + } +} + 
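+// The checks above follow directly from the vgx4 semantics: the r-th selected
+// ZA row accumulates zn[r] * zm[r] element-wise on top of the pre-filled 24.0.
+// A minimal sketch of that arithmetic (illustrative only, not part of the
+// test suite):
+//
+//   const double zn[4] = {0.25, 1.5, -0.5, -2.5};
+//   const double zm[4] = {3.0, 4.0, 5.0, 6.0};
+//   double expected[4];
+//   for (int r = 0; r < 4; r++)
+//     expected[r] = 24.0 + zn[r] * zm[r];  // 24.75, 30.0, 21.5, 9.0
+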
+TEST_P(InstSme, fmla_indexed_vgx4) { + // float + initialHeapData_.resize(SVL); + float* heapf = reinterpret_cast(initialHeapData_.data()); + std::vector srcf = {0.0f, 1.0f, 2.0f, 3.0f}; + fillHeap(heapf, srcf, SVL / 4); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0f + fdup z1.s, #3.0 + fdup z2.s, #8.0 + ptrue p0.s + ptrue p1.s + fmopa za0.s, p0/m, p1/m, z1.s, z2.s + fmopa za1.s, p0/m, p1/m, z1.s, z2.s + fmopa za2.s, p0/m, p1/m, z1.s, z2.s + fmopa za3.s, p0/m, p1/m, z1.s, z2.s + + # initialise registers + mov w8, #1 + fdup z4.s, #0.25 + fdup z5.s, #1.5 + fdup z6.s, #-0.5 + fdup z7.s, #-2.5 + ld1w {z10.s}, p0/z, [x0] + + fmla za.s[w8, #1, vgx4], {z4.s - z7.s}, z10.s[2] + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint16_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm value of 2.0f + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({24.5f}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({27.0f}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({23.0f}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({19.0f}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0f throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({24.0f}, (SVL / 8))); + } + } + + // double + initialHeapData_.resize(SVL); + double* heapd = reinterpret_cast(initialHeapData_.data()); + std::vector srcd = {2.0f, 3.0f}; + fillHeap(heapd, srcd, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0f + fdup z1.d, #3.0 + fdup z2.d, #8.0 + ptrue p0.d + ptrue p1.d + fmopa za0.d, p0/m, p1/m, z1.d, z2.d + fmopa za1.d, p0/m, p1/m, z1.d, z2.d + fmopa za2.d, p0/m, p1/m, z1.d, z2.d + fmopa za3.d, p0/m, p1/m, z1.d, z2.d + fmopa za4.d, p0/m, p1/m, z1.d, z2.d + fmopa za5.d, p0/m, p1/m, z1.d, z2.d + fmopa za6.d, p0/m, p1/m, z1.d, z2.d + fmopa za7.d, p0/m, p1/m, z1.d, z2.d + + # initialise registers + mov w8, #1 + fdup z4.d, #0.25 + fdup z5.d, #1.5 + fdup z6.d, #-0.5 + fdup z7.d, #-2.5 + ld1d {z10.d}, p0/z, [x0] + + fmla za.d[w8, #1, vgx4], {z4.d - z7.d}, z10.d[0] + )"); + for (uint16_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm value of 2.0f + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.5}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({27.0}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({23.0}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({19.0}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.0}, (SVL / 8))); + } + } +} TEST_P(InstSme, addva) { // 32-bit RUN_AARCH64(R"( @@ -1170,6 +1706,21 @@ TEST_P(InstSme, mova_q_vecToTile) { } } +TEST_P(InstSme, rdsvl) { + RUN_AARCH64(R"( + rdsvl x0, #-32 + rdsvl x1, #-3 + rdsvl x2, #0 + rdsvl x3, #3 + rdsvl x4, #31 +)"); + EXPECT_EQ(getGeneralRegister(0), (SVL / 8) * -32); + EXPECT_EQ(getGeneralRegister(1), (SVL / 8) * -3); + EXPECT_EQ(getGeneralRegister(2), 0); 
+ EXPECT_EQ(getGeneralRegister(3), (SVL / 8) * 3); + EXPECT_EQ(getGeneralRegister(4), (SVL / 8) * 31); +} + TEST_P(InstSme, fmopa) { // 32-bit RUN_AARCH64(R"( @@ -3410,13 +3961,207 @@ TEST_P(InstSme, usmops) { } } +TEST_P(InstSme, udot_Indexed_vgx4) { + // 8-bit to 32-bit widening + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # initialise registers + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z10.b}, p0/z, [x0] + + udot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint16_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm values of {8, 9, 10, 11} + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({476}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({514}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({552}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({590}, (SVL / 8))); + } else { + // un-effected rows should still be 96 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({96}, (SVL / 8))); + } + } +} + +TEST_P(InstSme, udot_vgx4) { + // 8-bit to 32-bit widening + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # initialise registers + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z8.b}, p0/z, [x0] + ld1b {z9.b}, p0/z, [x0] + ld1b {z10.b}, p0/z, [x0] + ld1b {z11.b}, p0/z, [x0] + + udot za.s[w8, #1, vgx4], {z4.b - z7.b}, {z8.b - z11.b} + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint16_t i = 0; i < (SVL / 8); i++) { + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({156, 316, 476, 636}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({162, 338, 514, 690}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({168, 360, 552, 744}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({174, 382, 590, 798}, (SVL / 8))); + } else { + // un-effected rows should still be 96 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({96}, (SVL / 8))); + } + } +} + 
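+// In the udot test above, every byte of a 32-bit zm element is paired with the
+// single zn value of that ZA row; in the uvdot test below, the i-th selected
+// zm byte is instead paired with the i-th zn vector, so all four rows
+// accumulate the same total. A minimal sketch of the expected value used
+// below (illustrative only; index 2 selects the {8, 9, 10, 11} bytes of each
+// 128-bit zm segment):
+//
+//   const uint32_t zn[4] = {10, 11, 12, 13};
+//   const uint32_t zm[4] = {8, 9, 10, 11};
+//   uint32_t expected = 96;  // pre-filled ZA value
+//   for (int i = 0; i < 4; i++) expected += zn[i] * zm[i];  // 538
+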
+TEST_P(InstSme, uvdot_indexed_vgx4) { + // 8-bit to 32-bit widening + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # initialise registers + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z10.b}, p0/z, [x0] + + uvdot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint16_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm values of {8, 9, 10, 11} + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({538}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({538}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({538}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({538}, (SVL / 8))); + } else { + // un-effected rows should still be 96 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({96}, (SVL / 8))); + } + } +} + TEST_P(InstSme, zero) { + // ZT0 + RUN_AARCH64(R"( + smstart + + zero {zt0} + )"); + CHECK_TABLE(uint64_t, {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}); + + // ZA tiles RUN_AARCH64(R"( smstart zero {za} )"); - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint64_t, fillNeon({0}, SVL / 8)); } @@ -3453,7 +4198,7 @@ TEST_P(InstSme, zero) { zero {za0.s, za2.s} )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { + for (uint16_t i = 0; i < (SVL / 32); i++) { CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, fillNeon({0}, SVL / 8)); CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, @@ -3467,6 +4212,7 @@ TEST_P(InstSme, zero) { INSTANTIATE_TEST_SUITE_P(AArch64, InstSme, ::testing::ValuesIn(genCoreTypeSVLPairs(EMULATION)), paramToString); + #else GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InstSme); #endif diff --git a/test/regression/aarch64/instructions/store.cc b/test/regression/aarch64/instructions/store.cc index 6d6876b49..6a8136da3 100644 --- a/test/regression/aarch64/instructions/store.cc +++ b/test/regression/aarch64/instructions/store.cc @@ -437,6 +437,26 @@ TEST_P(InstStore, st1_multi_struct) { } } + // one reg, 4s elements (post offset only) + RUN_AARCH64(R"( + mov x0, #32 + movi v0.4s, #1 + sub sp, sp, #96 + st1 {v0.4s}, [sp], #16 + st1 {v0.4s}, [sp], x0 + )"); + const uint64_t sp = process_->getInitialStackPointer(); + EXPECT_EQ(getGeneralRegister(31), sp - 48); + EXPECT_EQ(getMemoryValue(sp - 96), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 92), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 88), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 84), static_cast(1)); + + EXPECT_EQ(getMemoryValue(sp - 80), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 76), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 72), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 68), static_cast(1)); + // two reg, 4s elements 
RUN_AARCH64(R"( mov x0, #32 diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 6a52d46b9..9411ef008 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -2852,6 +2852,84 @@ TEST_P(InstSve, fadda) { CHECK_NEON(3, double, {resultB, 0}); } +TEST_P(InstSve, faddv) { + // float + initialHeapData_.resize(VL / 8); + float* fheap = reinterpret_cast(initialHeapData_.data()); + std::vector fsrc = { + 1.0f, -42.76f, -0.125f, 0.0f, 40.26f, -684.72f, -0.15f, 107.86f, + -34.71f, -0.917f, 0.0f, 80.72f, -125.67f, -0.01f, 701.90f, 7.0f}; + fillHeap(fheap, fsrc, VL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x2, xzr + mov x3, xzr + mov x4, #4 + mov x5, #2 + addvl x3, x3, #1 + sdiv x3, x3, x4 + sdiv x2, x3, x5 + + ptrue p0.s + whilelo p1.s, xzr, x2 + + ld1w {z0.s}, p0/z, [x0] + + faddv s3, p0, z0.s + faddv s4, p1, z0.s + )"); + float s3 = 0.0f; + float s4 = 0.0f; + for (uint64_t i = 0; i < VL / 32; i++) { + s3 += fsrc[i % (fsrc.size())]; + if (i < (VL / 64)) s4 += fsrc[i % (fsrc.size())]; + } + CHECK_NEON(3, float, {s3, 0.0f, 0.0f, 0.0f}); + CHECK_NEON(4, float, {s4, 0.0f, 0.0f, 0.0f}); + + // double + initialHeapData_.resize(VL); + double* dheap = reinterpret_cast(initialHeapData_.data()); + std::vector dsrc = {1.0, -42.76, -0.125, 0.0, 40.26, -684.72, + -0.15, 107.86, -34.71, -0.917, 0.0, 80.72, + -125.67, -0.01, 701.90, 7.0}; + fillHeap(dheap, dsrc, VL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x2, xzr + mov x3, xzr + mov x4, #8 + mov x5, #2 + addvl x3, x3, #1 + sdiv x3, x3, x4 + sdiv x2, x3, x5 + + ptrue p0.d + whilelo p1.d, xzr, x2 + + ld1d {z0.d}, p0/z, [x0] + + faddv d3, p0, z0.d + faddv d4, p1, z0.d + )"); + double d3 = 0.0; + double d4 = 0.0; + for (uint64_t i = 0; i < (VL / 64); i++) { + d3 += dsrc[i % (dsrc.size())]; + if (i < (VL / 128)) d4 += dsrc[i % (dsrc.size())]; + } + CHECK_NEON(3, double, {d3, 0.0}); + CHECK_NEON(4, double, {d4, 0.0}); +} + TEST_P(InstSve, fcmge) { // double initialHeapData_.resize(VL / 16); @@ -4641,6 +4719,84 @@ TEST_P(InstSve, ld1rd) { CHECK_NEON(3, uint64_t, fillNeon({0x12345678}, VL / 16)); } +TEST_P(InstSve, ld1rqb) { + initialHeapData_.resize(32); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap64, + {0x12345678DEADBEEF, 0xABCDEF0198765432, + 0xABBACAFEFEDCBA98, 0xFEEDABCDBEADCABB}, + 4); + // Imm offset + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + # Load and broadcast values from heap + ptrue p0.b + ld1rqb {z0.b}, p0/z, [x0] + ld1rqb {z1.b}, p0/z, [x0, #16] + + # Test for inactive lanes + ptrue p1.b, vl1 + ld1rqb {z2.b}, p1/z, [x0] + add x0, x0, #32 + ld1rqb {z3.b}, p1/z, [x0, #-16] + )"); + CHECK_NEON(0, uint8_t, + fillNeon({0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, + 0x32, 0x54, 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}, + VL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon({0x98, 0xBA, 0xDC, 0xFE, 0xFE, 0xCA, 0xBA, 0xAB, + 0xBB, 0xCA, 0xAD, 0xBE, 0xCD, 0xAB, 0xED, 0xFE}, + VL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon({0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + VL / 8)); + CHECK_NEON(3, uint8_t, + fillNeon({0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + VL / 8)); + + // Reg offset + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + # Load and broadcast values from heap 
+ ptrue p0.b + mov x1, #16 + ld1rqb {z0.b}, p0/z, [x0] + ld1rqb {z1.b}, p0/z, [x0, x1] + + # Test for inactive lanes + ptrue p1.b, vl1 + ld1rqb {z2.b}, p1/z, [x0] + ld1rqb {z3.b}, p1/z, [x0, x1] + )"); + CHECK_NEON(0, uint8_t, + fillNeon({0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, + 0x32, 0x54, 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}, + VL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon({0x98, 0xBA, 0xDC, 0xFE, 0xFE, 0xCA, 0xBA, 0xAB, + 0xBB, 0xCA, 0xAD, 0xBE, 0xCD, 0xAB, 0xED, 0xFE}, + VL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon({0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + VL / 8)); + CHECK_NEON(3, uint8_t, + fillNeon({0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + VL / 8)); +} + TEST_P(InstSve, ld1rqd) { initialHeapData_.resize(32); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); @@ -4737,6 +4893,7 @@ TEST_P(InstSve, ld1rw) { } TEST_P(InstSve, ld1b) { + // Single vector initialHeapData_.resize(VL / 4); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); std::vector src = {0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, @@ -4774,6 +4931,460 @@ TEST_P(InstSve, ld1b) { VL / 16)); std::rotate(src.begin(), src.begin() + ((VL / 8) % 16), src.end()); CHECK_NEON(2, uint8_t, fillNeon(src, VL / 16)); + + // Multi vector + initialHeapData_.resize(VL); + uint8_t* heap8_multi = reinterpret_cast(initialHeapData_.data()); + std::vector src_multi = {0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, + 0x34, 0x12, 0x32, 0x54, 0x76, 0x98, + 0x01, 0xEF, 0xCD, 0xAB}; + fillHeap(heap8_multi, src_multi, VL); + + // Two vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.b, #1 + dup z1.b, #2 + dup z2.b, #3 + dup z3.b, #4 + + ptrue pn8.b + mov x1, #2 + + ld1b {z0.b, z1.b}, pn8/z, [x0, #2, mul vl] + ld1b {z2.b, z3.b}, pn8/z, [x0, x1] + )"); + uint16_t base = (VL / 8) * 2; + uint16_t offset = (VL / 8); + CHECK_NEON(0, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon({src[2], src[3], src[4], src[5], src[6], src[7], + src[8], src[9], src[10], src[11], src[12], + src[13], src[14], src[15], src[0], src[1]}, + VL / 8)); + CHECK_NEON( + 3, uint8_t, + fillNeon({src[(2 + offset) % 16], src[(3 + offset) % 16], + src[(4 + offset) % 16], src[(5 + offset) % 16], + src[(6 + offset) % 16], src[(7 + offset) % 16], + src[(8 + offset) % 16], src[(9 + offset) % 16], + src[(10 + offset) % 16], src[(11 + offset) % 16], + src[(12 + offset) % 16], src[(13 + offset) % 
16], + src[(14 + offset) % 16], src[(15 + offset) % 16], + src[(0 + offset) % 16], src[(1 + offset) % 16]}, + VL / 8)); + + // Four vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.b, #1 + dup z1.b, #2 + dup z2.b, #3 + dup z3.b, #4 + + ptrue pn8.b + + mov x1, #4 + ld1b {z0.b - z3.b}, pn8/z, [x0, #4, mul vl] + ld1b {z4.b - z7.b}, pn8/z, [x0, x1] + ld1b {z16.b, z20.b, z24.b, z28.b}, pn8/z, [x0, #4, mul vl] + ld1b {z17.b, z21.b, z25.b, z29.b}, pn8/z, [x0, x1] + )"); + base = (VL / 8) * 4; + offset = (VL / 8); + // Consecutive vectors + CHECK_NEON(0, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon( + { + src[((base + (2 * offset))) % 16], + src[((base + (2 * offset)) + 1) % 16], + src[((base + (2 * offset)) + 2) % 16], + src[((base + (2 * offset)) + 3) % 16], + src[((base + (2 * offset)) + 4) % 16], + src[((base + (2 * offset)) + 5) % 16], + src[((base + (2 * offset)) + 6) % 16], + src[((base + (2 * offset)) + 7) % 16], + src[((base + (2 * offset)) + 8) % 16], + src[((base + (2 * offset)) + 9) % 16], + src[((base + (2 * offset)) + 10) % 16], + src[((base + (2 * offset)) + 11) % 16], + src[((base + (2 * offset)) + 12) % 16], + src[((base + (2 * offset)) + 13) % 16], + src[((base + (2 * offset)) + 14) % 16], + src[((base + (2 * offset)) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(3, uint8_t, + fillNeon( + { + src[((base + (3 * offset))) % 16], + src[((base + (3 * offset)) + 1) % 16], + src[((base + (3 * offset)) + 2) % 16], + src[((base + (3 * offset)) + 3) % 16], + src[((base + (3 * offset)) + 4) % 16], + src[((base + (3 * offset)) + 5) % 16], + src[((base + (3 * offset)) + 6) % 16], + src[((base + (3 * offset)) + 7) % 16], + src[((base + (3 * offset)) + 8) % 16], + src[((base + (3 * offset)) + 9) % 16], + src[((base + (3 * offset)) + 10) % 16], + src[((base + (3 * offset)) + 11) % 16], + src[((base + (3 * offset)) + 12) % 16], + src[((base + (3 * offset)) + 13) % 16], + src[((base + (3 * offset)) + 14) % 16], + src[((base + (3 * offset)) + 15) % 16], + }, + VL / 8)); + base = 4; + offset = (VL / 8); + CHECK_NEON(4, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + 
CHECK_NEON(4, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(6, uint8_t, + fillNeon( + { + src[((base + (2 * offset))) % 16], + src[((base + (2 * offset)) + 1) % 16], + src[((base + (2 * offset)) + 2) % 16], + src[((base + (2 * offset)) + 3) % 16], + src[((base + (2 * offset)) + 4) % 16], + src[((base + (2 * offset)) + 5) % 16], + src[((base + (2 * offset)) + 6) % 16], + src[((base + (2 * offset)) + 7) % 16], + src[((base + (2 * offset)) + 8) % 16], + src[((base + (2 * offset)) + 9) % 16], + src[((base + (2 * offset)) + 10) % 16], + src[((base + (2 * offset)) + 11) % 16], + src[((base + (2 * offset)) + 12) % 16], + src[((base + (2 * offset)) + 13) % 16], + src[((base + (2 * offset)) + 14) % 16], + src[((base + (2 * offset)) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(7, uint8_t, + fillNeon( + { + src[((base + (3 * offset))) % 16], + src[((base + (3 * offset)) + 1) % 16], + src[((base + (3 * offset)) + 2) % 16], + src[((base + (3 * offset)) + 3) % 16], + src[((base + (3 * offset)) + 4) % 16], + src[((base + (3 * offset)) + 5) % 16], + src[((base + (3 * offset)) + 6) % 16], + src[((base + (3 * offset)) + 7) % 16], + src[((base + (3 * offset)) + 8) % 16], + src[((base + (3 * offset)) + 9) % 16], + src[((base + (3 * offset)) + 10) % 16], + src[((base + (3 * offset)) + 11) % 16], + src[((base + (3 * offset)) + 12) % 16], + src[((base + (3 * offset)) + 13) % 16], + src[((base + (3 * offset)) + 14) % 16], + src[((base + (3 * offset)) + 15) % 16], + }, + VL / 8)); + // Strided (4-stride) vectors + base = (VL / 8) * 4; + offset = (VL / 8); + CHECK_NEON(16, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(20, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(24, uint8_t, + fillNeon( + { + src[((base + (2 * offset))) % 16], + src[((base + (2 * offset)) + 1) % 16], + src[((base + (2 * offset)) + 2) % 16], + src[((base + (2 * offset)) + 3) % 16], + src[((base + (2 * offset)) + 4) % 16], + src[((base + (2 * offset)) + 5) % 16], + src[((base + (2 * offset)) + 6) % 16], + src[((base + (2 * offset)) + 7) % 
16], + src[((base + (2 * offset)) + 8) % 16], + src[((base + (2 * offset)) + 9) % 16], + src[((base + (2 * offset)) + 10) % 16], + src[((base + (2 * offset)) + 11) % 16], + src[((base + (2 * offset)) + 12) % 16], + src[((base + (2 * offset)) + 13) % 16], + src[((base + (2 * offset)) + 14) % 16], + src[((base + (2 * offset)) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(28, uint8_t, + fillNeon( + { + src[((base + (3 * offset))) % 16], + src[((base + (3 * offset)) + 1) % 16], + src[((base + (3 * offset)) + 2) % 16], + src[((base + (3 * offset)) + 3) % 16], + src[((base + (3 * offset)) + 4) % 16], + src[((base + (3 * offset)) + 5) % 16], + src[((base + (3 * offset)) + 6) % 16], + src[((base + (3 * offset)) + 7) % 16], + src[((base + (3 * offset)) + 8) % 16], + src[((base + (3 * offset)) + 9) % 16], + src[((base + (3 * offset)) + 10) % 16], + src[((base + (3 * offset)) + 11) % 16], + src[((base + (3 * offset)) + 12) % 16], + src[((base + (3 * offset)) + 13) % 16], + src[((base + (3 * offset)) + 14) % 16], + src[((base + (3 * offset)) + 15) % 16], + }, + VL / 8)); + base = 4; + offset = (VL / 8); + CHECK_NEON(17, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(21, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(25, uint8_t, + fillNeon( + { + src[((base + (2 * offset))) % 16], + src[((base + (2 * offset)) + 1) % 16], + src[((base + (2 * offset)) + 2) % 16], + src[((base + (2 * offset)) + 3) % 16], + src[((base + (2 * offset)) + 4) % 16], + src[((base + (2 * offset)) + 5) % 16], + src[((base + (2 * offset)) + 6) % 16], + src[((base + (2 * offset)) + 7) % 16], + src[((base + (2 * offset)) + 8) % 16], + src[((base + (2 * offset)) + 9) % 16], + src[((base + (2 * offset)) + 10) % 16], + src[((base + (2 * offset)) + 11) % 16], + src[((base + (2 * offset)) + 12) % 16], + src[((base + (2 * offset)) + 13) % 16], + src[((base + (2 * offset)) + 14) % 16], + src[((base + (2 * offset)) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(29, uint8_t, + fillNeon( + { + src[((base + (3 * offset))) % 16], + src[((base + (3 * offset)) + 1) % 16], + src[((base + (3 * offset)) + 2) % 16], + src[((base + (3 * offset)) + 3) % 16], + src[((base + (3 * offset)) + 4) % 16], + src[((base + (3 * offset)) + 5) % 16], + src[((base + (3 * offset)) + 6) % 16], + src[((base + (3 * offset)) + 7) % 16], + src[((base + (3 * offset)) + 8) % 16], + src[((base + (3 * offset)) + 9) % 16], + src[((base + (3 * offset)) + 10) % 16], + src[((base + (3 * offset)) + 11) % 16], + src[((base + (3 * offset)) + 12) % 16], + src[((base + (3 * offset)) + 13) % 16], + src[((base + (3 * offset)) + 14) % 16], + 
src[((base + (3 * offset)) + 15) % 16], + }, + VL / 8)); } TEST_P(InstSve, ld1sw_gather) { @@ -4907,6 +5518,7 @@ TEST_P(InstSve, ld1d_gather) { } TEST_P(InstSve, ld1d) { + // Single vector initialHeapData_.resize(VL / 4); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; @@ -4948,9 +5560,111 @@ TEST_P(InstSve, ld1d) { fillNeon({src[(base) % 4], src[(base + 1) % 4], src[(base + 2) % 4], src[(base + 3) % 4]}, VL / 16)); + + // Multi vector + initialHeapData_.resize(VL); + uint64_t* heap64_multi = reinterpret_cast(initialHeapData_.data()); + std::vector src_multi = {0xDEADBEEF, 0x12345678, 0x98765432, + 0xABCDEF01}; + fillHeap(heap64_multi, src_multi, VL / 8); + + // Two vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.d, #1 + dup z1.d, #2 + + ptrue pn8.d + + ld1d {z0.d, z1.d}, pn8/z, [x0, #2, mul vl] + )"); + base = (VL / 64) * 2; + uint16_t offset = (VL / 64); + CHECK_NEON(0, uint64_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 1, uint64_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + + // Four vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.d, #1 + dup z1.d, #2 + dup z2.d, #3 + dup z3.d, #4 + + ptrue pn8.d + + ld1d {z0.d - z3.d}, pn8/z, [x0, #4, mul vl] + addvl x1, x1, #1 + mov x2, #2 + udiv x1, x1, x2 + ld1d {z4.d - z7.d}, pn8/z, [x0, x1, lsl #3] + )"); + base = (VL / 64) * 4; + offset = (VL / 64); + CHECK_NEON(0, uint64_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 1, uint64_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + CHECK_NEON(2, uint64_t, + fillNeon({src[((base + (offset * 2))) % 4], + src[((base + (offset * 2)) + 1) % 4], + src[((base + (offset * 2)) + 2) % 4], + src[((base + (offset * 2)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(3, uint64_t, + fillNeon({src[((base + (offset * 3))) % 4], + src[((base + (offset * 3)) + 1) % 4], + src[((base + (offset * 3)) + 2) % 4], + src[((base + (offset * 3)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(4, uint64_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 5, uint64_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + CHECK_NEON(6, uint64_t, + fillNeon({src[((base + (offset * 2))) % 4], + src[((base + (offset * 2)) + 1) % 4], + src[((base + (offset * 2)) + 2) % 4], + src[((base + (offset * 2)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(7, uint64_t, + fillNeon({src[((base + (offset * 3))) % 4], + src[((base + (offset * 3)) + 1) % 4], + src[((base + (offset * 3)) + 2) % 4], + src[((base + (offset * 3)) + 3) % 4]}, + VL / 8)); } TEST_P(InstSve, ld1h) { + // Single vector initialHeapData_.resize(VL / 4); uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); fillHeap( @@ -4968,6 +5682,7 @@ TEST_P(InstSve, ld1h) { ptrue p0.h # Load and broadcast values from heap ld1h {z0.h}, p0/z, [x0, x1, lsl #1] + ld1h {z2.h}, p0/z, [x0] # Test for inactive lanes mov x1, #0 @@ -4977,6 +5692,10 @@ TEST_P(InstSve, ld1h) { mov x2, #0 whilelo p1.h, xzr, x1 
ld1h {z1.h}, p1/z, [x0, x2, lsl #1] + + addvl x10, x10, #1 + add x10, x10, x0 + ld1h {z3.h}, p1/z, [x10, #-1, mul vl] )"); CHECK_NEON(0, uint16_t, fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, @@ -4986,14 +5705,67 @@ TEST_P(InstSve, ld1h) { fillNeonCombined({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, 0xABCD}, {0}, VL / 8)); + CHECK_NEON(2, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); + CHECK_NEON(3, uint16_t, + fillNeonCombined({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, + 0x9876, 0xEF01, 0xABCD}, + {0}, VL / 8)); + + // Multi vector + + // Two vector + initialHeapData_.resize(VL); + heap16 = reinterpret_cast(initialHeapData_.data()); + fillHeap( + heap16, {0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, 0xABCD}, + VL / 2); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ptrue pn8.h + mov x1, #1 + ld1h {z0.h, z1.h}, pn8/z, [x0] + ld1h {z2.h, z3.h}, pn8/z, [x0, x1, lsl #1] + ld1h {z4.h, z5.h}, pn8/z, [x0, #2, mul vl] + )"); + CHECK_NEON(0, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); + CHECK_NEON(1, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); + CHECK_NEON(2, uint16_t, + fillNeon({0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, + 0xABCD, 0xBEEF}, + VL / 8)); + CHECK_NEON(3, uint16_t, + fillNeon({0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, + 0xABCD, 0xBEEF}, + VL / 8)); + CHECK_NEON(4, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); + CHECK_NEON(5, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); } TEST_P(InstSve, ld1w) { + // Single vector initialHeapData_.resize(VL / 4); uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; fillHeap(heap32, src, VL / 16); - RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -5028,6 +5800,116 @@ TEST_P(InstSve, ld1w) { CHECK_NEON(3, uint64_t, fillNeonCombined( {0x12345678DEADBEEF, 0xABCDEF0198765432}, {0}, VL / 8)); + + // Multi vector + initialHeapData_.resize(VL); + uint32_t* heap32_multi = reinterpret_cast(initialHeapData_.data()); + std::vector src_multi = {0xDEADBEEF, 0x12345678, 0x98765432, + 0xABCDEF01}; + fillHeap(heap32_multi, src_multi, VL / 4); + + // Two vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.s, #1 + dup z1.s, #2 + dup z2.s, #3 + dup z3.s, #4 + + ptrue pn8.s + mov x1, #2 + + ld1w {z0.s, z1.s}, pn8/z, [x0, #2, mul vl] + ld1w {z2.s, z3.s}, pn8/z, [x0, x1, lsl #2] + )"); + uint16_t base = (VL / 32) * 2; + uint16_t offset = (VL / 32); + CHECK_NEON(0, uint32_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 1, uint32_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + + CHECK_NEON(2, uint32_t, + fillNeon({src[2], src[3], src[0], src[1]}, VL / 8)); + CHECK_NEON(3, uint32_t, + fillNeon({src[(2 + offset) % 4], src[(3 + offset) % 4], + src[(0 + offset) % 4], src[(1 + offset) % 4]}, + VL / 8)); + + // Four vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.s, #1 + dup z1.s, #2 + dup z2.s, #3 + dup z3.s, #4 + + ptrue pn8.s + addvl x1, x1, #1 + + ld1w {z0.s - z3.s}, pn8/z, [x0, #4, 
mul vl] + ld1w {z4.s - z7.s}, pn8/z, [x0, x1, lsl #2] + )"); + base = (VL / 32) * 4; + offset = (VL / 32); + CHECK_NEON(0, uint32_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 1, uint32_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + CHECK_NEON(2, uint32_t, + fillNeon({src[((base + (offset * 2))) % 4], + src[((base + (offset * 2)) + 1) % 4], + src[((base + (offset * 2)) + 2) % 4], + src[((base + (offset * 2)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(3, uint32_t, + fillNeon({src[((base + (offset * 3))) % 4], + src[((base + (offset * 3)) + 1) % 4], + src[((base + (offset * 3)) + 2) % 4], + src[((base + (offset * 3)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(4, uint32_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 5, uint32_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + CHECK_NEON(6, uint32_t, + fillNeon({src[((base + (offset * 2))) % 4], + src[((base + (offset * 2)) + 1) % 4], + src[((base + (offset * 2)) + 2) % 4], + src[((base + (offset * 2)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(7, uint32_t, + fillNeon({src[((base + (offset * 3))) % 4], + src[((base + (offset * 3)) + 1) % 4], + src[((base + (offset * 3)) + 2) % 4], + src[((base + (offset * 3)) + 3) % 4]}, + VL / 8)); } TEST_P(InstSve, ld2d) { @@ -5660,6 +6542,27 @@ TEST_P(InstSve, ptrue) { CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 2)); } +TEST_P(InstSve, ptrue_counter) { + RUN_AARCH64(R"( + ptrue pn8.s + ptrue pn9.d + ptrue pn10.b + ptrue pn11.h + )"); + const uint64_t ps = + 0b0000000000000000000000000000000000000000000000001000000000000100; + const uint64_t pd = + 0b0000000000000000000000000000000000000000000000001000000000001000; + const uint64_t pb = + 0b0000000000000000000000000000000000000000000000001000000000000001; + const uint64_t ph = + 0b0000000000000000000000000000000000000000000000001000000000000010; + CHECK_PREDICATE(8, uint64_t, {ps, 0x0, 0x0, 0x0}); + CHECK_PREDICATE(9, uint64_t, {pd, 0x0, 0x0, 0x0}); + CHECK_PREDICATE(10, uint64_t, {pb, 0x0, 0x0, 0x0}); + CHECK_PREDICATE(11, uint64_t, {ph, 0x0, 0x0, 0x0}); +} + TEST_P(InstSve, punpk) { RUN_AARCH64(R"( ptrue p0.b @@ -6385,8 +7288,73 @@ TEST_P(InstSve, st1d) { } } +TEST_P(InstSve, st1d_multivec) { + // Two vectors + initialHeapData_.resize(VL / 4); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap64, src, VL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + sub sp, sp, #4095 + mov x1, #1 + mov x4, #256 + madd x4, x4, x4, x4 + ptrue p0.d + ptrue pn8.d + ld1d {z0.d}, p0/z, [x0] + ld1d {z1.d}, p0/z, [x0, #1, mul vl] + st1d {z0.d, z1.d}, pn8, [sp] + st1d {z0.d, z1.d}, pn8, [x4, #4, mul vl] + st1d {z0.d, z1.d}, pn8, [x4, x1, lsl #3] + )"); + + for (uint64_t i = 0; i < (VL / 32); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 8)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + (4 * (VL / 8)) + (i * 8)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + 8 + (i * 8)), src[i % 4]); + } + + // Four vectors + initialHeapData_.resize(VL); + heap64 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap64, src, VL / 8); + 
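+  // Note: `madd x4, x4, x4, x4` with x4 = 256 gives x4 = 256 * 256 + 256 = 65792, so the
+  // `#8, mul vl` form below stores the four vectors starting at address 65792 + 8 * (VL / 8),
+  // which is the base used by the checks that follow.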
RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + sub sp, sp, #4095 + mov x1, #2 + mov x4, #256 + madd x4, x4, x4, x4 + ptrue p0.d + ptrue pn8.d + ld1d {z0.d}, p0/z, [x0] + ld1d {z1.d}, p0/z, [x0, #1, mul vl] + ld1d {z2.d}, p0/z, [x0, #2, mul vl] + ld1d {z3.d}, p0/z, [x0, #3, mul vl] + st1d {z0.d - z3.d}, pn8, [sp] + st1d {z0.d - z3.d}, pn8, [x4, #8, mul vl] + )"); + for (uint64_t i = 0; i < (VL / 16); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 8)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + (8 * (VL / 8)) + (i * 8)), + src[i % 4]); + } +} + TEST_P(InstSve, st2d) { - // 32-bit RUN_AARCH64(R"( ptrue p0.d mov x0, #0 @@ -6423,6 +7391,62 @@ TEST_P(InstSve, st2d) { } } +TEST_P(InstSve, st4w) { + // 32-bit + RUN_AARCH64(R"( + ptrue p0.s + mov x0, #0 + addvl x1, x0, #1 + mov x2, #8 + udiv x3, x1, x2 + whilelo p1.s, xzr, x3 + + sub sp, sp, #4095 + mov x6, #300 + + dup z0.s, #3 + dup z1.s, #4 + dup z2.s, #5 + dup z3.s, #6 + + st4w {z0.s - z3.s}, p0, [sp] + st4w {z0.s - z3.s}, p1, [x6, #4, mul vl] + addvl x7, x7, #3 + st4w {z0.s - z3.s}, p1, [x6, x7, lsl #2] + )"); + + for (uint64_t i = 0; i < (VL / 32); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (4 * i * 4)), + 3); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (4 * i * 4) + 4), + 4); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (4 * i * 4) + 8), + 5); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (4 * i * 4) + 12), + 6); + } + + int index = 4 * (VL / 8); + for (uint64_t i = 0; i < (VL / 64); i++) { + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4)), 3); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 4), 4); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 8), 5); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 12), 6); + } + + index = 12 * (VL / 8); + for (uint64_t i = 0; i < (VL / 64); i++) { + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4)), 3); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 4), 4); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 8), 5); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 12), 6); + } +} + TEST_P(InstSve, st1w_scatter) { // 32-bit RUN_AARCH64(R"( @@ -6603,6 +7627,71 @@ TEST_P(InstSve, st1w) { } } +TEST_P(InstSve, st1w_multivec) { + // Two vectors + initialHeapData_.resize(VL / 4); + uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap32, src, VL / 16); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + sub sp, sp, #4095 + mov x1, #2 + mov x4, #256 + madd x4, x4, x4, x4 + ptrue p0.s + ptrue pn8.s + ld1w {z0.s}, p0/z, [x0] + ld1w {z1.s}, p0/z, [x0, #1, mul vl] + st1w {z0.s, z1.s}, pn8, [sp] + st1w {z0.s, z1.s}, pn8, [x4, #4, mul vl] + st1w {z0.s, z1.s}, pn8, [x4, x1, lsl #2] + )"); + for (uint64_t i = 0; i < (VL / 16); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 4)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + (4 * (VL / 8)) + (i * 4)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + 8 + (i * 4)), src[i % 4]); + } + + // Four vectors + initialHeapData_.resize(VL); + heap32 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap32, src, VL / 4); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + sub sp, sp, #4095 + mov x1, #2 + mov x4, #256 + madd x4, x4, x4, x4 + ptrue p0.s + 
ptrue pn8.s + ld1w {z0.s}, p0/z, [x0] + ld1w {z1.s}, p0/z, [x0, #1, mul vl] + ld1w {z2.s}, p0/z, [x0, #2, mul vl] + ld1w {z3.s}, p0/z, [x0, #3, mul vl] + st1w {z0.s - z3.s}, pn8, [sp] + st1w {z0.s - z3.s}, pn8, [x4, #8, mul vl] + )"); + for (uint64_t i = 0; i < (VL / 8); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 4)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + (8 * (VL / 8)) + (i * 4)), + src[i % 4]); + } +} + TEST_P(InstSve, str_predicate) { initialHeapData_.resize(VL / 64); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); @@ -7020,6 +8109,54 @@ TEST_P(InstSve, uaddv) { CHECK_NEON(3, uint64_t, {(9 * (VL / 128)), 0}); } +TEST_P(InstSve, udot) { + // udot by element + initialHeapData_.resize(16); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0xDEADBEEFFFFF00FF; + heap64[1] = 0x01234567ABBACAFE; + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ptrue p0.b + ld1rqb { z0.b }, p0/z, [x0] + + dup z2.b, #2 + dup z3.b, #3 + dup z4.s, #4 + dup z5.s, #5 + + udot z4.s, z2.b, z0.b[0] + udot z5.s, z3.b, z0.b[3] + )"); + CHECK_NEON(4, uint32_t, fillNeon({1534}, VL / 8)); + CHECK_NEON(5, uint32_t, fillNeon({629}, VL / 8)); + + // udot by vector - 4-way + initialHeapData_.resize(16); + heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0xDEADBEEFFFFF00FF; + heap64[1] = 0x01234567ABBACAFE; + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ptrue p0.b + ld1rqb { z0.b }, p0/z, [x0] + + dup z2.b, #2 + dup z4.s, #4 + + udot z4.s, z2.b, z0.b + )"); + CHECK_NEON(4, uint32_t, fillNeon({1534, 1652, 1630, 420}, VL / 8)); +} + TEST_P(InstSve, uqdec) { // d arrangement RUN_AARCH64(R"( @@ -7983,14 +9120,12 @@ TEST_P(InstSve, zip_pred) { } TEST_P(InstSve, zip) { - // d arrangement RUN_AARCH64(R"( # 64-bit fdup z0.d, #0.5 fdup z1.d, #-0.5 fdup z2.d, #0.75 fdup z3.d, #-0.75 - zip1 z4.d, z0.d, z1.d zip2 z5.d, z2.d, z3.d @@ -8001,16 +9136,37 @@ TEST_P(InstSve, zip) { fdup z9.s, #0.75 zip1 z10.s, z6.s, z7.s zip2 z11.s, z8.s, z9.s - )"); + # 8-bit + dup z12.b, #1 + dup z13.b, #-2 + dup z14.b, #-1 + dup z15.b, #2 + zip1 z16.b, z12.b, z13.b + zip2 z17.b, z14.b, z15.b + )"); CHECK_NEON(4, double, fillNeon({0.5, -0.5}, VL / 8)); CHECK_NEON(5, double, fillNeon({0.75, -0.75}, VL / 8)); CHECK_NEON(10, float, fillNeon({0.5, -0.75}, VL / 8)); CHECK_NEON(11, float, fillNeon({-0.5, 0.75}, VL / 8)); + CHECK_NEON(16, int8_t, fillNeon({1, -2}, VL / 8)); + CHECK_NEON(17, int8_t, fillNeon({-1, 2}, VL / 8)); + + // Multi-vector + RUN_AARCH64(R"( + #32-bit + dup z0.s, #5 + dup z1.s, #6 + dup z2.s, #7 + dup z3.s, #8 + zip {z4.s - z7.s}, {z0.s - z3.s} + )"); + CHECK_NEON(4, uint32_t, fillNeon({5, 6, 7, 8}, VL / 8)); + CHECK_NEON(5, uint32_t, fillNeon({5, 6, 7, 8}, VL / 8)); + CHECK_NEON(6, uint32_t, fillNeon({5, 6, 7, 8}, VL / 8)); + CHECK_NEON(7, uint32_t, fillNeon({5, 6, 7, 8}, VL / 8)); } -#if SIMENG_LLVM_VERSION >= 14 -// If LLVM version supports SVE2 : TEST_P(InstSve, psel) { RUN_AARCH64(R"( mov w13, #0 @@ -8044,7 +9200,6 @@ TEST_P(InstSve, psel) { CHECK_PREDICATE(14, uint64_t, fillPred(VL / 8, {0}, 4)); CHECK_PREDICATE(15, uint64_t, fillPred(VL / 8, {0}, 8)); } -#endif INSTANTIATE_TEST_SUITE_P(AArch64, InstSve, ::testing::ValuesIn(genCoreTypeVLPairs(EMULATION)), diff --git a/test/unit/aarch64/ArchInfoTest.cc b/test/unit/aarch64/ArchInfoTest.cc index 39e25a0bd..a2b41a9ec 100644 --- a/test/unit/aarch64/ArchInfoTest.cc +++ b/test/unit/aarch64/ArchInfoTest.cc @@ -23,7 +23,8 @@ 
class AArch64ArchInfoTest : public ::testing::Test {
       aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1,
       aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0,
       aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0,
-      aarch64_sysreg::AARCH64_SYSREG_SVCR};
+      aarch64_sysreg::AARCH64_SYSREG_SVCR,
+      aarch64_sysreg::AARCH64_SYSREG_TPIDR2_EL0};
 
   const std::vector archRegStruct = {
       {8, 32},
diff --git a/test/unit/aarch64/InstructionTest.cc b/test/unit/aarch64/InstructionTest.cc
index 8d4b0d87f..53041905e 100644
--- a/test/unit/aarch64/InstructionTest.cc
+++ b/test/unit/aarch64/InstructionTest.cc
@@ -602,6 +602,39 @@ TEST_F(AArch64InstructionTest, setters) {
   EXPECT_TRUE(insn.isWaitingCommit());
 }
 
+// Test predAsCounterToMasks function.
+TEST_F(AArch64InstructionTest, predAsCounterToMasks_test) {
+  // 1.5 full vectors from start, VL = 128b, uint8_t elem size
+  std::vector<std::array<uint64_t, 4>> ref(2, {0, 0, 0, 0});
+  ref[0][0] =
+      0b0000000000000000000000000000000000000000000000001111111111111111;
+  ref[1][0] =
+      0b0000000000000000000000000000000000000000000000000000000011111111;
+  // invert = 0, num active Elems = 24
+  uint64_t pn =
+      0b0000000000000000000000000000000000000000000000000000000000110001;
+  auto out = predAsCounterToMasks<uint8_t, 2>(pn, 128);
+  EXPECT_EQ(out[0][0], ref[0][0]);
+  EXPECT_EQ(out[1][0], ref[1][0]);
+
+  // 0.5 of last vector, VL = 1024b, uint64_t elem size
+  std::vector<std::array<uint64_t, 4>> ref2(4, {0, 0, 0, 0});
+  ref2[3][1] =
+      0b0000000100000001000000010000000100000001000000010000000100000001;
+  // Invert = 1, num inactive Elems = 56
+  uint64_t pn2 =
+      0b0000000000000000000000000000000000000000000000001000001110001000;
+  auto out2 = predAsCounterToMasks<uint64_t, 4>(pn2, 1024);
+  EXPECT_EQ(out2[0][0], ref2[0][0]);
+  EXPECT_EQ(out2[0][1], ref2[0][1]);
+  EXPECT_EQ(out2[1][0], ref2[1][0]);
+  EXPECT_EQ(out2[1][1], ref2[1][1]);
+  EXPECT_EQ(out2[2][0], ref2[2][0]);
+  EXPECT_EQ(out2[2][1], ref2[2][1]);
+  EXPECT_EQ(out2[3][0], ref2[3][0]);
+  EXPECT_EQ(out2[3][1], ref2[3][1]);
+}
+
 } // namespace aarch64
 } // namespace arch
 } // namespace simeng
\ No newline at end of file
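As a cross-check of the first reference case in the test above (invert = 0, 24 active uint8_t elements spread over two 128-bit vectors), the expected masks can be rebuilt with a few lines of standalone C++. The helper below is purely illustrative (the name refMasks and its signature are not part of SimEng); it only encodes the convention visible in the reference values, i.e. one predicate bit per element byte, with active elements counted from the start.

#include <array>
#include <cstdint>
#include <cstdio>
#include <vector>

// Rebuild the expected predicate masks for `numActive` leading active elements
// spread across `numVecs` vectors of `elemsPerVec` elements, where each element
// occupies `elemBytes` bytes (one predicate bit per byte). Illustrative only.
std::vector<std::array<uint64_t, 4>> refMasks(unsigned numActive, unsigned numVecs,
                                              unsigned elemsPerVec, unsigned elemBytes) {
  std::vector<std::array<uint64_t, 4>> out(numVecs, {0, 0, 0, 0});
  for (unsigned v = 0; v < numVecs; v++) {
    for (unsigned i = 0; i < elemsPerVec; i++) {
      if (v * elemsPerVec + i < numActive)
        out[v][(i * elemBytes) / 64] |= 1ull << ((i * elemBytes) % 64);
    }
  }
  return out;
}

int main() {
  // First case above: VL = 128 bits, uint8_t elements, 24 active elements over 2 vectors.
  auto masks = refMasks(24, 2, 16, 1);
  std::printf("%#llx %#llx\n", (unsigned long long)masks[0][0],
              (unsigned long long)masks[1][0]);
  return 0;  // prints 0xffff 0xff, matching ref[0][0] and ref[1][0]
}

The inverted case works the same way except that the count gives the number of leading inactive elements, so only the trailing eight uint64_t elements (the upper half of the fourth vector) end up active, which is the 0x0101010101010101 pattern in ref2[3][1].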