diff --git a/CMakeLists.txt b/CMakeLists.txt index b8f4379b9..afeeb4aba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,6 +116,7 @@ option(SIMENG_SANITIZE "Enable compiler sanitizers" OFF) option(SIMENG_OPTIMIZE "Enable Extra Compiler Optimizations" OFF) option(SIMENG_ENABLE_SST "Compile SimEng SST Wrapper" OFF) option(SIMENG_ENABLE_SST_TESTS "Enable testing for SST" OFF) +option(SIMENG_ENABLE_BF16 "Enable __bf16 instruction execution logic" OFF) # Set CXX flag for Apple Mac so that `binary_function` and `unary_function` types that are used in SST can be recognised. # They were deprecated in C++11 and removed in C++17, and Apple Clang v15 no longer supports these types without the following flag @@ -155,10 +156,9 @@ if(SIMENG_ENABLE_TESTS) # Print message containing if the full test suite will run if (${LLVM_PACKAGE_VERSION} VERSION_LESS "14.0") - message(STATUS "LLVM version does not support AArch64 extensions SME or SVE2. These test suites will be skipped.") - endif() - if (${LLVM_PACKAGE_VERSION} VERSION_LESS "18.0") - message(STATUS "LLVM version does not support AArch64 extensions SME2. These test suites will be skipped.") + message(STATUS "LLVM version does not support AArch64 extensions SVE2, SVE2.1, SME, or SME2. Related tests will fail.") + elseif (${LLVM_PACKAGE_VERSION} VERSION_LESS "18.0") + message(STATUS "LLVM version does not support AArch64 extensions SME2 or SVE2.1. Related test will fail.") endif() else() diff --git a/src/include/simeng/arch/aarch64/ArchInfo.hh b/src/include/simeng/arch/aarch64/ArchInfo.hh index 1403da08f..b7f274035 100644 --- a/src/include/simeng/arch/aarch64/ArchInfo.hh +++ b/src/include/simeng/arch/aarch64/ArchInfo.hh @@ -18,7 +18,8 @@ class ArchInfo : public simeng::arch::ArchInfo { aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, - aarch64_sysreg::AARCH64_SYSREG_SVCR}), + aarch64_sysreg::AARCH64_SYSREG_SVCR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR2_EL0}), zaSize_(config["Core"]["Streaming-Vector-Length"].as() / 8) { // Generate the architecture-defined architectural register structure archRegStruct_ = { diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh index d510c1f37..f37089219 100644 --- a/src/include/simeng/arch/aarch64/Instruction.hh +++ b/src/include/simeng/arch/aarch64/Instruction.hh @@ -283,6 +283,40 @@ enum class InsnType : uint32_t { isBranch = 1 << 14 }; +/** Convert Predicate-as-Counter to Predicate-as-Masks. + * T represents the element type (i.e. for pg.s, T = uint32_t). + * V represents the number of vectors the predicate-as-counter is being used + * for. */ +template +std::vector> predAsCounterToMasks( + const uint64_t predAsCounter, const uint16_t VL_bits) { + std::vector> out(V, {0, 0, 0, 0}); + + const uint16_t elemsPerVec = VL_bits / (sizeof(T) * 8); + // Get predicate-as-counter information + const bool invert = (predAsCounter & 0b1000000000000000) != 0; + const uint64_t predElemCount = + (predAsCounter & static_cast(0b0111111111111111)) >> + static_cast(std::log2f(sizeof(T)) + 1); + + for (int r = 0; r < V; r++) { + for (uint16_t i = 0; i < elemsPerVec; i++) { + // Move bit to next position based on element type + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + // If invert = True (invert bit = 1), predElemCount dictates number of + // initial inactive elements. + // If invert = False (invert bit = 0), it indicates the number of initial + // active elements. 
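      // Illustrative decoding, assuming VL_bits = 512, T = uint32_t, V = 2:
      // elemsPerVec = 16. With invert = 0 and predElemCount = 20, every
      // element of the first vector's mask is set and only elements 0-3 of
      // the second vector's mask are set; with invert = 1 the same count
      // leaves those first 20 elements inactive and activates the remainder.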
+ if (static_cast(r * elemsPerVec) + i < predElemCount) { + out[r][i / (64 / sizeof(T))] |= (invert) ? 0 : shifted_active; + } else { + out[r][i / (64 / sizeof(T))] |= (invert) ? shifted_active : 0; + } + } + } + return out; +} + /** A basic Armv9.2-a implementation of the `Instruction` interface. */ class Instruction : public simeng::Instruction { public: diff --git a/src/include/simeng/arch/aarch64/helpers/float.hh b/src/include/simeng/arch/aarch64/helpers/float.hh index 454f50070..0d198f926 100644 --- a/src/include/simeng/arch/aarch64/helpers/float.hh +++ b/src/include/simeng/arch/aarch64/helpers/float.hh @@ -194,6 +194,23 @@ D fcvtzu_integer(srcValContainer& sourceValues) { return result; } +/** Helper function for SCALAR/FP instructions with the format ucvtf rd, rn + * #fbits. + * D represents the destination register type (e.g. for Sd, D = float). + * N represents the source register type (e.g. for Xn, N = uint32_t). + * Returns single value of type D. */ +template +D ucvtf_fixedToFloat( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + // Convert Fixed-Point to FP + // Using algorithm from + // https://embeddedartistry.com/blog/2018/07/12/simple-fixed-point-conversion-in-c/ + const N xn = sourceValues[0].get(); + const N fbits = static_cast(metadata.operands[2].imm); + return (static_cast(xn) / static_cast(1ull << fbits)); +} + } // namespace aarch64 } // namespace arch } // namespace simeng \ No newline at end of file diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index cc9aa0346..2a9ac3d0f 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -951,6 +951,63 @@ RegisterValue vecUzp(srcValContainer& sourceValues, bool isUzp1) { return {out, 256}; } +/** Helper function for NEON instructions with the format `udot vd.s, vn.b, + * vm.b`. D represents the number of elements in the output vector to be updated + * (i.e. for vd.2s D = 2). Only 2 or 4 are valid. Returns correctly formatted + * RegisterValue. */ +template +RegisterValue vecUdot( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + // Check D and N are valid values + static_assert((D == 2 || D == 4) && + "D must be either 2 or 4 to align with vd.2s or vd.4s."); + + const uint32_t* vd = sourceValues[0].getAsVector(); + const uint8_t* vn = sourceValues[1].getAsVector(); + const uint8_t* vm = sourceValues[2].getAsVector(); + + uint32_t out[D] = {0}; + for (int i = 0; i < D; i++) { + out[i] = vd[i]; + for (int j = 0; j < 4; j++) { + out[i] += (static_cast(vn[(4 * i) + j]) * + static_cast(vm[(4 * i) + j])); + } + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `udot vd.s, vn.b, + * vm.4b[index]`. + * D represents the number of elements in the output vector to be updated (i.e. + * for vd.2s D = 2). Only 2 or 4 are valid. + * Returns correctly formatted RegisterValue. 
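 * Illustrative expansion (hypothetical values): with index = 0, each output
 * element becomes
 *   out[i] = vd[i] + vn[4i]*vm[0] + vn[4i+1]*vm[1] + vn[4i+2]*vm[2] + vn[4i+3]*vm[3].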
*/ +template +RegisterValue vecUdot_byElement( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + // Check D and N are valid values + static_assert((D == 2 || D == 4) && + "D must be either 2 or 4 to align with vd.2s or vd.4s."); + + const uint32_t* vd = sourceValues[0].getAsVector(); + const uint8_t* vn = sourceValues[1].getAsVector(); + const uint8_t* vm = sourceValues[2].getAsVector(); + const int index = metadata.operands[2].vector_index; + + uint32_t out[D] = {0}; + for (int i = 0; i < D; i++) { + uint32_t acc = vd[i]; + for (int j = 0; j < 4; j++) { + acc += (static_cast(vn[(4 * i) + j]) * + static_cast(vm[(4 * index) + j])); + } + out[i] = acc; + } + return {out, 256}; +} + /** Helper function for NEON instructions with the format `zip<1,2> vd.T, * vn.T, vm.T`. * T represents the type of sourceValues (e.g. for vn.d, T = uint64_t). diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 2c33ccfbe..2315021a1 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -626,6 +626,27 @@ std::enable_if_t, RegisterValue> sveFDivPredicated( return {out, 256}; } +/** Helper function for SVE instructions with the format `faddv rd, pg, zn. + * D represents the source vector element type and the destination scalar + * register type (i.e. for zn.s and sd, D = float). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFaddv_predicated(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const D* zn = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (8 * sizeof(D)); + D out[256 / sizeof(D)] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(D))) * sizeof(D)); + if (p[i / (64 / sizeof(D))] & shifted_active) { + out[0] += zn[i]; + } + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `fmad zd, pg/m, zn, * zm`. * T represents the type of sourceValues (e.g. for zn.d, T = double). @@ -1319,6 +1340,40 @@ std::array svePtrue( return out; } +/** Helper function for SVE instructions with the format `ptrue pnd. + * T represents the type of sourceValues (e.g. for pnd.d, T = uint64_t). + * Returns an array of 4 uint64_t elements. */ +template +std::array svePtrue_counter(const uint16_t VL_bits) { + // Predicate as counter is 16-bits and has the following encoding: + // - Up to first 4 bits (named LSZ) encode the element size (0b1, 0b10, + // 0b100, 0b1000 for b h s d respectively) + // - bits 0->LSZ + // - Bits LSZ -> 14 represent a uint of the number of consecutive elements + // from element 0 that are active / inactive + // - If invert bit = 0 it is number of active elements + // - If invert bit = 1 it is number of inactive elements + // - Bit 15 represents the invert bit + std::array out = {0, 0, 0, 0}; + + // Set invert bit to 1 and count to 0 so that the first 0 elements are FALSE. + // This is how the spec defines all true to be encoded. + out[0] |= 0b1000000000000000; + + // Set Element size field + if (sizeof(T) == 1) { + out[0] |= 0b1; + } else if (sizeof(T) == 2) { + out[0] |= 0b10; + } else if (sizeof(T) == 4) { + out[0] |= 0b100; + } else if (sizeof(T) == 8) { + out[0] |= 0b1000; + } + + return out; +} + /** Helper function for SVE instructions with the format `punpk pd.h, * pn.b`. * If `isHI` = false, then PUNPKLO is performed. 
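A minimal sketch (not part of the patch) of how the two new predicate-as-counter
helpers fit together, assuming a 512-bit vector length, that the headers defining
svePtrue_counter and predAsCounterToMasks are included, and that their template
parameters are ordered as in the doc comments (element type first):

  #include <cassert>
  #include <cstdint>

  void allTrueCounterRoundTrip() {
    constexpr uint16_t VL_bits = 512;  // assumed VL for the example
    // All-true encoding for .s elements: invert bit set, count of 0.
    auto png = simeng::arch::aarch64::svePtrue_counter<uint32_t>(VL_bits);
    // Expanding it for two vectors should mark every 32-bit element active,
    // i.e. every 4th bit of the first 64-bit mask word is set.
    auto masks = simeng::arch::aarch64::predAsCounterToMasks<uint32_t, 2>(
        png[0], VL_bits);
    for (int r = 0; r < 2; r++) {
      assert(masks[r][0] == 0x1111111111111111ull);
    }
  }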
@@ -1563,6 +1618,69 @@ RegisterValue sveTrn2_3vecs(srcValContainer& sourceValues, return {out, 256}; } +/** Helper function for SVE instructions with the format `udot zd, zn, zm`. + * D represents the element type of the destination register (i.e. for zd.s, + * D = uint32_t). + * N represents the element type of the source registers (i.e. for zn.b, N = + * uint8_t). + * W represents how many source elements are multiplied to form an output + * element (i.e. for 4-way, W = 4). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveUdot( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const D* zd = sourceValues[0].getAsVector(); + const N* zn = sourceValues[1].getAsVector(); + const N* zm = sourceValues[2].getAsVector(); + + D out[256 / sizeof(D)] = {0}; + for (size_t i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { + out[i] = zd[i]; + for (int j = 0; j < W; j++) { + out[i] += + (static_cast(zn[(W * i) + j]) * static_cast(zm[(W * i) + j])); + } + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `udot zd, zn, + * zm[index]`. + * D represents the element type of the destination register (i.e. for uint32_t, + * D = uint32_t). + * N represents the element type of the source registers (i.e. for uint8_t, N = + * uint8_t). + * W represents how many source elements are multiplied to form an output + * element (i.e. for 4-way, W = 4). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveUdot_indexed( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const D* zd = sourceValues[0].getAsVector(); + const N* zn = sourceValues[1].getAsVector(); + const N* zm = sourceValues[2].getAsVector(); + const int index = metadata.operands[2].vector_index; + + D out[256 / sizeof(D)] = {0}; + for (size_t i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { + D acc = zd[i]; + // Index into zm selects which D-type element within each 128-bit vector + // segment to use + int base = i - (i % (128 / (sizeof(D) * 8))); + int zmIndex = base + index; + for (int j = 0; j < W; j++) { + acc += (static_cast(zn[(W * i) + j]) * + static_cast(zm[(W * zmIndex) + j])); + } + out[i] = acc; + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `unpk>hi,lo> zd, * zn`. * D represents the type of the destination register (e.g. int32_t for diff --git a/src/include/simeng/arch/aarch64/operandContainer.hh b/src/include/simeng/arch/aarch64/operandContainer.hh index c73b8881d..996454b00 100644 --- a/src/include/simeng/arch/aarch64/operandContainer.hh +++ b/src/include/simeng/arch/aarch64/operandContainer.hh @@ -10,7 +10,7 @@ namespace arch { namespace aarch64 { /** The maximum number of source registers a non-SME instruction can have. */ -const uint8_t MAX_SOURCE_REGISTERS = 6; +const uint8_t MAX_SOURCE_REGISTERS = 7; /** The maximum number of destination registers a non-SME instruction can have. 
*/ diff --git a/src/include/simeng/version.hh.in b/src/include/simeng/version.hh.in index 5f1e8f410..f563e281f 100644 --- a/src/include/simeng/version.hh.in +++ b/src/include/simeng/version.hh.in @@ -9,5 +9,6 @@ #define SIMENG_LLVM_VERSION @SIMENG_LLVM_VERSION@ #define SIMENG_ENABLE_TESTS "${SIMENG_ENABLE_TESTS}" #define SIMENG_BUILD_DIR "${CMAKE_BINARY_DIR}" +#define SIMENG_ENABLE_BF16 ${SIMENG_ENABLE_BF16} #endif \ No newline at end of file diff --git a/src/lib/arch/aarch64/ExceptionHandler.cc b/src/lib/arch/aarch64/ExceptionHandler.cc index ae98dddb1..ff7375339 100644 --- a/src/lib/arch/aarch64/ExceptionHandler.cc +++ b/src/lib/arch/aarch64/ExceptionHandler.cc @@ -626,8 +626,7 @@ bool ExceptionHandler::init() { break; } - case 293: // rseq - { + case 293: { // rseq stateChange = {ChangeType::REPLACEMENT, {R0}, {0ull}}; break; } @@ -818,7 +817,7 @@ void ExceptionHandler::readLinkAt(span path) { for (size_t i = 0; i < bytesCopied; i += 256) { uint8_t size = std::min(bytesCopied - i, 256ul); stateChange.memoryAddresses.push_back({bufAddress + i, size}); - stateChange.memoryAddressValues.push_back(RegisterValue(bufPtr, size)); + stateChange.memoryAddressValues.push_back(RegisterValue(bufPtr + i, size)); } concludeSyscall(stateChange); diff --git a/src/lib/arch/aarch64/InstructionMetadata.cc b/src/lib/arch/aarch64/InstructionMetadata.cc index 56e438a3d..07deed41a 100644 --- a/src/lib/arch/aarch64/InstructionMetadata.cc +++ b/src/lib/arch/aarch64/InstructionMetadata.cc @@ -279,7 +279,7 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) if (isAlias) { exceptionString_ = "This instruction is an alias. The printed mnemonic and operand string " - "differ from what is expected of the Capstone opcode."; + "may differ from the underlying opcode."; } } diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index ec4f269a8..d0c792096 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -190,6 +190,18 @@ span Instruction::generateAddresses() { } break; } + case Opcode::AArch64_LD1RQ_B: { // ld1rqb {zd.b}, pg/z, [xn, xm] + uint64_t addr = + sourceValues_[1].get() + sourceValues_[2].get(); + setMemoryAddresses({addr, static_cast(16)}); + break; + } + case Opcode::AArch64_LD1RQ_B_IMM: { // ld1rqb {zd.b}, pg/z, [xn{, #imm}] + uint64_t addr = + sourceValues_[1].get() + metadata_.operands[2].mem.disp; + setMemoryAddresses({addr, static_cast(16)}); + break; + } case Opcode::AArch64_LD1RQ_D_IMM: { // ld1rqd {zd.d}, pg/z, [xn{, #imm}] uint64_t addr = sourceValues_[1].get() + metadata_.operands[2].mem.disp; @@ -292,6 +304,10 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[0].get(), 16}}); break; } + case Opcode::AArch64_LD1Onev8b_POST: { // ld1 {vt.8b}, [xn], <#imm|xm> + setMemoryAddresses({{sourceValues_[0].get(), 8}}); + break; + } case Opcode::AArch64_LD1Fourv16b: // ld1 {vt1.16b, vt2.16b, vt3.16b, // vt4.16b}, [xn] [[fallthrough]]; @@ -324,6 +340,9 @@ span Instruction::generateAddresses() { case Opcode::AArch64_LD1Twov2d_POST: // ld1 {vt1.2d, vt2.2d}, [xn], // <#imm|xm> [[fallthrough]]; + case Opcode::AArch64_LD1Twov8h_POST: // ld1 {vt1.8h, vt2.8h}, [xn], + // <#imm|xm> + [[fallthrough]]; case Opcode::AArch64_LD1Twov4s: // ld1 {vt1.4s, vt2.4s}, [xn] [[fallthrough]]; case Opcode::AArch64_LD1Twov4s_POST: { // ld1 {vt1.4s, vt2.4s}, [xn], @@ -349,6 +368,100 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + 
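The multi-vector LD1 cases added below all follow the same pattern: one VL-byte
access target per destination register, laid out contiguously from the resolved
base address. A condensed sketch of that pattern (illustrative only; the element
type memory::MemoryAccessTarget is the one used by the surrounding address code):

  // Build `count` contiguous VL-sized access targets starting at `addr`.
  std::vector<memory::MemoryAccessTarget> contiguousVectorTargets(
      uint64_t addr, uint16_t VL_bits, int count) {
    std::vector<memory::MemoryAccessTarget> targets;
    targets.reserve(count);
    const uint16_t blockSize = VL_bits / 8;  // bytes per vector register
    for (int i = 0; i < count; i++) {
      targets.push_back({addr + static_cast<uint64_t>(i) * blockSize, blockSize});
    }
    return targets;
  }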
case Opcode::AArch64_LD1B_2Z: { // ld1b {zt1.b, zt2.b}, png/z, [xn, xm] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + offset; + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1B_2Z_IMM: { // ld1b {zt1.b, zt2.b}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 8; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1B_4Z_STRIDED: { // ld1b {zt1.b, zt2.b, zt3.b, + // zt4.b}, png/z, [xn, xm] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + offset; + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1B_4Z_STRIDED_IMM: // ld1b {zt1.b, zt2.b, zt3.b, + // zt4.b}, png/z, [xn{, #imm, + // mul vl}] + [[fallthrough]]; + case Opcode::AArch64_LD1B_4Z_IMM: { // ld1b {zt1.b - zt4.b}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 8; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1B_4Z: { // ld1b {zt1.b - zt4.b}, png/z, [xn, xm] + const uint64_t base = sourceValues_[1].get(); + const int64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + offset; + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1D: { // ld1d {zt.d}, pg/z, [xn, xm, lsl #3] const uint64_t base = sourceValues_[1].get(); const uint64_t offset = sourceValues_[2].get(); @@ -357,6 +470,64 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1D_2Z_IMM: { // ld1d {zt1.d, zt2.d}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + 
static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 8); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1D_4Z: { // ld1d {zt1.d - zt4.d}, png/z, [xn, + // xm, lsl #3] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + (offset << 3); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 8); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1D_IMM: { // ld1d {zt.d}, pg/z, [xn{, #imm, // mul vl}] const uint16_t partition_num = VL_bits / 64; @@ -377,6 +548,52 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1H_IMM: { // ld1h {zt.h}, pg/z, [xn{, #imm, mul + // vl}] + const uint16_t partition_num = VL_bits / 16; + + const uint64_t base = sourceValues_[1].get(); + const int64_t offset = metadata_.operands[2].mem.disp; + const uint64_t addr = base + (offset * partition_num * 2); + + setMemoryAddresses({addr, static_cast(VL_bits / 8)}); + break; + } + case Opcode::AArch64_LD1H_2Z: { // ld1h {zt1.h, zt2.h}, png/z, [xn, xm, + // lsl #1] + const uint64_t base = sourceValues_[1].get(); + const int64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + (offset << 1); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1H_2Z_IMM: { // ld1h {zt1.h, zt2.h}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 16; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1W: { // ld1w {zt.s}, pg/z, [xn, xm, lsl #2] const uint64_t base = sourceValues_[1].get(); const uint64_t offset = sourceValues_[2].get(); @@ -397,6 +614,80 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, 
static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1W_2Z: { // ld1w {zt1.s, zt2.s}, png/z, [xn, + // xm, lsl #2] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + (offset << 2); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1W_2Z_IMM: { // ld1w {zt1.s, zt2.s}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1W_4Z: { // ld1w {zt1.s - zt4.s}, png/z, [xn, + // xm, lsl #2] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + (offset << 2); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1W_4Z_IMM: { // ld1w {zt1.s - zt4.s}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD2D: { // ld2d {zt1.d, zt2.d}, pg/z, [xn|sp, xm, // lsl #3] const uint64_t base = sourceValues_[1].get(); @@ -771,6 +1062,10 @@ span Instruction::generateAddresses() { setMemoryAddresses({{base, 4}, {base + 4, 4}}); break; } + case Opcode::AArch64_LDRSBWpost: { // ldrsb wt, [xn], #imm + setMemoryAddresses({{sourceValues_[0].get(), 1}}); + break; + } case Opcode::AArch64_LDRSBWroX: { // ldrsb wt, [xn, xm{, extend // {#amount}}] uint64_t offset = extendOffset(sourceValues_[1].get(), @@ -1031,6 +1326,74 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1D_2Z: { // st1d {zt1.d, zt2.d}, png, [xn, xm, lsl + // #3] + const uint64_t pn = sourceValues_[2].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[3].get(); + const uint64_t offset = sourceValues_[4].get(); + const uint64_t addr = base + (offset << 3); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 8, 8, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), 
+ partition_num, 8, 8, + preds[1].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1D_2Z_IMM: { // st1d {zt1.d, zt2.d}, png, [xn{, + // #imm, mul vl}] + const uint64_t pn = sourceValues_[2].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[3].get(); + const int64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 8); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 8, 8, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 8, 8, + preds[1].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1D_4Z_IMM: { // st1d {zt1.d - zt4.d}, png, [xn{, + // #imm, mul vl}] + const uint64_t pn = sourceValues_[4].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[5].get(); + const int64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 8); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 8, 8, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 8, 8, + preds[1].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + 2 * (VL_bits / 8), + partition_num, 8, 8, + preds[2].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + 3 * (VL_bits / 8), + partition_num, 8, 8, + preds[3].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_ST2D_IMM: { // st2d {zt1.d, zt2.d}, pg, [{, // #imm, mul vl}] const uint64_t* p = sourceValues_[2].getAsVector(); @@ -1045,8 +1408,84 @@ span Instruction::generateAddresses() { uint64_t addr = base + (offset * partition_num * 8); - generatePredicatedContiguousAddressBlocks(addr, partition_num, 16, 8, p, - addresses); + // As vectors are stored in an interleaved manner (i.e. zt1[0], zt2[0], + // zt1[1], zt2[1], ...) we must generate an address for each element (if + // the predicate is true for that element). This is because, if the + // predicate indicates that all elements are active, a single address + // and MemoryAccessTarget will be generated with a size of 2xVL. This + // could lead to issues for core models which have a maximum store + // bandwidth of 1xVL. + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (p[i / 8] & shifted_active) { + addresses.push_back({addr + (2 * i * 8), 8}); + addresses.push_back({addr + (2 * i * 8) + 8, 8}); + } + } + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST4W: { // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, + // pg, [, xm, lsl #2] + const uint64_t* p = sourceValues_[4].getAsVector(); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[5].get(); + const int64_t offset = sourceValues_[6].get(); + + std::vector addresses; + addresses.reserve(partition_num * 4); + + uint64_t addr = base + (offset << 2); + + // As vectors are stored in an interleaved manner (i.e. zt1[0], zt2[0], + // zt3[0], zt4[0], zt1[1], zt2[1], zt3[1], zt4[1] ...) we must generate + // an address for each element (if the predicate is true for that + // element). 
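        // (Illustrative layout, assuming a 512-bit VL: with all sixteen .s
        // elements active this yields 64 four-byte targets covering
        // addr, addr+4, ..., addr+252.)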
This is because, if the predicate indicates that all + // elements are active, a single address and MemoryAccessTarget will be + // generated with a size of 4xVL. This could lead to issues for core + // models which have a maximum store bandwidth of 1xVL. + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (p[i / 16] & shifted_active) { + addresses.push_back({addr + (4 * i * 4), 4}); + addresses.push_back({addr + (4 * i * 4) + 4, 4}); + addresses.push_back({addr + (4 * i * 4) + 8, 4}); + addresses.push_back({addr + (4 * i * 4) + 12, 4}); + } + } + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST4W_IMM: { // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, + // pg, [{, #imm, mul vl}] + const uint64_t* p = sourceValues_[4].getAsVector(); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[5].get(); + const int64_t offset = + static_cast(metadata_.operands[5].mem.disp); + + std::vector addresses; + addresses.reserve(partition_num * 4); + uint64_t addr = base + (offset * partition_num * 4); + + // As vectors are stored in an interleaved manner (i.e. zt1[0], zt2[0], + // zt3[0], zt4[0], zt1[1], zt2[1], zt3[1], zt4[1] ...) we must generate + // an address for each element (if the predicate is true for that + // element). This is because, if the predicate indicates that all + // elements are active, a single address and MemoryAccessTarget will be + // generated with a size of 4xVL. This could lead to issues for core + // models which have a maximum store bandwidth of 1xVL. + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (p[i / 16] & shifted_active) { + addresses.push_back({addr + (4 * i * 4), 4}); + addresses.push_back({addr + (4 * i * 4) + 4, 4}); + addresses.push_back({addr + (4 * i * 4) + 8, 4}); + addresses.push_back({addr + (4 * i * 4) + 12, 4}); + } + } setMemoryAddresses(std::move(addresses)); break; } @@ -1213,6 +1652,74 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1W_2Z: { // st1w {zt1.s, zt2.s}, png, [xn, xm, lsl + // #2] + const uint64_t pn = sourceValues_[2].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[3].get(); + const uint64_t offset = sourceValues_[4].get(); + const uint64_t addr = base + (offset << 2); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 4, 4, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 4, 4, + preds[1].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1W_2Z_IMM: { // st1w {zt1.s, zt2.s}, png, [xn{, + // #imm, mul vl}] + const uint64_t pn = sourceValues_[2].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[3].get(); + const int64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 4, 4, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 4, 4, + preds[1].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1W_4Z_IMM: { // 
st1w {zt1.s - zt4.s}, png, [xn{, + // #imm, mul vl}] + const uint64_t pn = sourceValues_[4].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[5].get(); + const int64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 4, 4, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 4, 4, + preds[1].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + 2 * (VL_bits / 8), + partition_num, 4, 4, + preds[2].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + 3 * (VL_bits / 8), + partition_num, 4, 4, + preds[3].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_SST1W_D_IMM: { // st1w {zt.d}, pg, [zn.d{, #imm}] const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; @@ -1442,6 +1949,11 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1Onev4s_POST: { // st1 {vt.4s}, [xn|sp], <#imm|xm> + const uint64_t base = sourceValues_[1].get(); + setMemoryAddresses({base, 16}); + break; + } case Opcode::AArch64_ST1Twov16b: // st1 {vt.16b, vt2.16b}, [xn] [[fallthrough]]; case Opcode::AArch64_ST1Twov16b_POST: // st1 {vt.16b, vt2.16b}, [xn], diff --git a/src/lib/arch/aarch64/Instruction_decode.cc b/src/lib/arch/aarch64/Instruction_decode.cc index 3535ce590..5e9987258 100644 --- a/src/lib/arch/aarch64/Instruction_decode.cc +++ b/src/lib/arch/aarch64/Instruction_decode.cc @@ -532,8 +532,8 @@ void Instruction::decode() { if (isInstruction(InsnType::isStoreData)) { // Identify store instruction group - if (AARCH64_REG_Z0 <= metadata_.operands[0].reg && - metadata_.operands[0].reg <= AARCH64_REG_Z31) { + if ((AARCH64_REG_Z0 <= metadata_.operands[0].reg && + metadata_.operands[0].reg <= AARCH64_REG_Z31)) { setInstructionType(InsnType::isSVEData); } else if ((metadata_.operands[0].reg <= AARCH64_REG_S31 && metadata_.operands[0].reg >= AARCH64_REG_Q0) || @@ -639,8 +639,8 @@ void Instruction::decode() { } } } else { - // For SME instructions, resize the following structures to have the - // exact amount of space required + // For SME instructions (not using ZT0), resize the following structures to + // have the exact amount of space required sourceRegisters_.resize(sourceRegisterCount_); destinationRegisters_.resize(destinationRegisterCount_); sourceValues_.resize(sourceRegisterCount_); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 8f4bc3814..f2d575673 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -470,6 +470,40 @@ void Instruction::execute() { results_[0] = vecAdd_3ops(sourceValues_); break; } + case Opcode::AArch64_ADD_VG2_M2Z_S: { // add za.s[wv, off, vgx2], {zn1.s, + // zn2.s} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + + // Get ZA stride between halves and index into each ZA half + const uint16_t zaStride = zaRowCount / 2; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + 
metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 2 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + for (int r = 0; r < 2; r++) { + const uint32_t* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const uint32_t* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + uint32_t out[64] = {0}; + for (int i = 0; i < elemCount; i++) { + out[i] = zaRow[i] + znr[i]; + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_ADR: { // adr xd, #imm results_[0] = instructionAddress_ + metadata_.operands[1].imm; break; @@ -628,6 +662,66 @@ void Instruction::execute() { branchAddress_ = instructionAddress_ + metadata_.operands[0].imm; break; } +#if SIMENG_ENABLE_BF16 == 1 + case Opcode::AArch64_BF16DOTlanev8bf16: { // bfdot vd.4s, vn.8h, + // vm.2h[index] + // BF16 -- EXPERIMENTAL + // Must be enabled at SimEng compile time + // Not verified to be working for all compilers or OSs. + // No Tests written + + const float* vd = sourceValues_[0].getAsVector(); + const __bf16* vn = sourceValues_[1].getAsVector<__bf16>(); + const __bf16* vm = sourceValues_[2].getAsVector<__bf16>(); + const int vmIndex = metadata_.operands[2].vector_index; + + float out[4] = {vd[0], vd[1], vd[2], vd[3]}; + for (int i = 0; i < 4; i++) { + out[i] += (static_cast(vn[2 * i]) * + static_cast(vm[2 * vmIndex])) + + (static_cast(vn[2 * i + 1]) * + static_cast(vm[2 * vmIndex + 1])); + } + results_[0] = RegisterValue(out, 256); + break; + } + case Opcode::AArch64_BFDOT_ZZI: { // bfdot zd.s, zn.h, zm.h[index] + // BF16 -- EXPERIMENTAL + // Must be enabled at SimEng compile time + // Not verified to be working for all compilers or OSs. + // No Tests written + + const uint16_t partition_num = VL_bits / 16; + + const float* zd = sourceValues_[0].getAsVector(); + // Extract data as uint16_t so that bytes-per-element is correct + const uint16_t* zn = sourceValues_[1].getAsVector(); + const uint16_t* zm = sourceValues_[2].getAsVector(); + const int index = metadata_.operands[2].vector_index; + + float out[64] = {0.0f}; + for (int i = 0; i < partition_num; i++) { + // MOD 4 as 4 32-bit elements in each 128-bit segment + const int zmBase = i - (i % 4); + const int zmIndex = zmBase + index; + + float zn1, zn2, zm1, zm2; + // Horrible hack in order to convert bf16 (currently stored in a + // uint16_t) into a float. + // Each bf16 is copied into the most significant 16-bits of each + // float variable; given IEEE FP32 and BF16 have the same width + // exponent and one sign bit. 
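            // (Note: copying into the upper half of each float relies on a
            // little-endian host; under that assumption the bf16 pattern
            // 0x3F80, for instance, becomes the float 1.0f.)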
+ memcpy((uint16_t*)&zn1 + 1, &zn[2 * i], 2); + memcpy((uint16_t*)&zn2 + 1, &zn[2 * i + 1], 2); + memcpy((uint16_t*)&zm1 + 1, &zm[2 * zmIndex], 2); + memcpy((uint16_t*)&zm2 + 1, &zm[2 * zmIndex + 1], 2); + + out[i] = zd[i] + ((zn1 * zm1) + (zn2 * zm2)); + } + results_[0] = RegisterValue(out, 256); + break; + } +#endif case Opcode::AArch64_BFMWri: { // bfm wd, wn, #immr, #imms results_[0] = { bfm_2imms(sourceValues_, metadata_, false, false), 8}; @@ -1757,6 +1851,80 @@ void Instruction::execute() { results_[0] = {add_3ops(sourceValues_), 256}; break; } + case Opcode::AArch64_FADD_VG2_M2Z_D: { // fadd za.d[wv, #off, vgx2], + // {zn1.d, zn2.d} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 64; + // Get ZA stride between halves and index into each ZA half + const uint16_t zaStride = zaRowCount / 2; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 2 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // For each source vector and ZA Row pair + for (int r = 0; r < 2; r++) { + // Get row in correct ZA half + const double* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + // Get current source vector + const double* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + double out[32] = {0.0}; + // Loop over all elements and destructively add + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e] + znr[e]; + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_FADD_VG2_M2Z_S: { // fadd za.s[wv, #off, vgx2], + // {zn1.s, zn2.s} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between halves and index into each ZA half + const uint16_t zaStride = zaRowCount / 2; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 2 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // For each source vector and ZA Row pair + for (int r = 0; r < 2; r++) { + // Get row in correct ZA half + const float* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + // Get current source vector + const float* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + float out[64] = {0.0f}; + // Loop over all elements and destructively add + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e] + znr[e]; + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_FADD_ZPmI_D: { // fadd zdn.d, pg/m, zdn.d, const results_[0] = sveAddPredicated_const(sourceValues_, metadata_, VL_bits); @@ -1795,6 +1963,16 @@ void Instruction::execute() { results_[0] = vecAdd_3ops(sourceValues_); break; } + case Opcode::AArch64_FADDV_VPZ_D: { // faddv dd, p0, zn.d + + results_[0] = sveFaddv_predicated(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_FADDV_VPZ_S: { // faddv sd, p0, zn.s + + results_[0] = sveFaddv_predicated(sourceValues_, VL_bits); + break; + } case 
Opcode::AArch64_FCADD_ZPmZ_D: { // fcadd zdn.d, pg/m, zdn.d, zm.d, // #imm results_[0] = @@ -2221,6 +2399,196 @@ void Instruction::execute() { [](double x, double y) -> double { return std::fmin(x, y); }); break; } + case Opcode::AArch64_FMLA_VG4_M4Z4Z_D: { // fmla za.d[wv, offs, vgx4], + // {zn1.d - zn4.d}, {zm1.d - + // zm4.d} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 64; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Get sourceValues_ index of first zn and zm regs + const uint16_t n = zaRowCount + 1; + const uint16_t m = zaRowCount + 5; + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. + const double* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const double* zn = sourceValues_[n + r].getAsVector(); + const double* zm = sourceValues_[m + r].getAsVector(); + double out[32] = {0.0}; + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e] + (zn[e] * zm[e]); + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_FMLA_VG4_M4Z4Z_S: { // fmla za.s[wv, offs, vgx4], + // {zn1.s - zn4.s}, {zm1.s - + // zm4.s} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Get sourceValues_ index of first zn and zm regs + const uint16_t n = zaRowCount + 1; + const uint16_t m = zaRowCount + 5; + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. 
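        // (For example, assuming VL_bits = 512: zaRowCount = 64 and
        // zaStride = 16, so Wv + off = 3 updates ZA rows 3, 19, 35 and 51.)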
+ const float* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const float* zn = sourceValues_[n + r].getAsVector(); + const float* zm = sourceValues_[m + r].getAsVector(); + float out[64] = {0.0f}; + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e] + (zn[e] * zm[e]); + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_FMLA_VG4_M4ZZI_D: { // fmla za.d[wv, offs, vgx4], + // {zn1.d - zn4.d}, zm.d[index] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 64; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + // Get zm vector and zm's index + const double* zm = sourceValues_[zaRowCount + 5].getAsVector(); + const int zmIndex = metadata_.operands[5].vector_index; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. + const double* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const double* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + double out[32] = {0.0}; + // Loop over all elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + // This instruction multiplies each element of the current `znr` by + // an indexed element of `zm` and destructively adds the result to + // the corresponding element in the current `zaRow`. + // + // The index for `zm` specifies which element in each 128-bit + // segment to use. The 128-bit segment of `zm` currently in use + // corresponds to the 128-bit segment that the current element of + // `znr` and `zaRow` is within. 
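            // (For instance, with .d elements and index = 1: element e = 5
            // sits in the segment starting at 5 - (5 % 2) = 4, so zm[5] is
            // the multiplier for znr[5].)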
+ + // MOD 2 as there are 2 64-bit elements per 128-bit segment of `zm` + const int zmSegBase = e - (e % 2); + out[e] = zaRow[e] + (znr[e] * zm[zmSegBase + zmIndex]); + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_FMLA_VG4_M4ZZI_S: { // fmla za.s[wv, offs, vgx4], + // {zn1.s - zn4.s}, zm.s[index] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + // Get zm vector and zm's index + const float* zm = sourceValues_[zaRowCount + 5].getAsVector(); + const int zmIndex = metadata_.operands[5].vector_index; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. + const float* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const float* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + float out[64] = {0.0f}; + // Loop over all elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + // This instruction multiplies each element of the current `znr` by + // an indexed element of `zm` and destructively adds the result to + // the corresponding element in the current `zaRow`. + // + // The index for `zm` specifies which element in each 128-bit + // segment to use. The 128-bit segment of `zm` currently in use + // corresponds to the 128-bit segment that the current element of + // `znr` and `zaRow` is within. + + // MOD 4 as there are 4 32-bit elements per 128-bit segment of `zm` + const int zmSegBase = e - (e % 4); + out[e] = zaRow[e] + (znr[e] * zm[zmSegBase + zmIndex]); + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_FMLA_ZPmZZ_D: { // fmla zd.d, pg/m, zn.d, zm.d results_[0] = sveMlaPredicated_vecs(sourceValues_, VL_bits); break; @@ -2291,6 +2659,63 @@ void Instruction::execute() { results_[0] = vecFmlsIndexed_3vecs(sourceValues_, metadata_); break; } +#if SIMENG_ENABLE_BF16 == 1 + case Opcode::AArch64_BFMOPA_MPPZZ: { // bfmopa zada.s, pn/m, pm/m, zn.h, + // zm.h + // SME + // BF16 -- EXPERIMENTAL + // Must be enabled at SimEng compile time + // Not verified to be working for all compilers or OSs. 
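      // Example configure step to opt in (assumed typical CMake usage):
      //   cmake -B build -DSIMENG_ENABLE_BF16=ON <other SimEng options...>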
+ // No Tests written + + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + // Use uint16_t to get 2-byte elements + const uint16_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + const uint16_t* zm = + sourceValues_[rowCount + 3].getAsVector(); + + // zn is row, zm is col + for (int row = 0; row < rowCount; row++) { + float outRow[64] = {0.0f}; + // Shifted active is for bf16 elements + uint64_t shifted_active_row = 1ull << ((row % 32) * 2); + const float* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < rowCount; col++) { + outRow[col] = zadaRow[col]; + // Shifted active is for bf16 elements + uint64_t shifted_active_col = 1ull << ((col % 32) * 2); + bool pred_row1 = pn[(2 * row) / 32] & shifted_active_row; + bool pred_row2 = pn[(2 * row + 1) / 32] & shifted_active_row; + bool pred_col1 = pm[(2 * col) / 32] & shifted_active_col; + bool pred_col2 = pm[(2 * col + 1) / 32] & shifted_active_col; + if ((pred_row1 && pred_col1) || (pred_row2 && pred_col2)) { + float zn1, zn2, zm1, zm2; + // Horrible hack in order to convert bf16 (currently stored in a + // uint16_t) into a float. + // Each bf16 is copied into the most significant 16-bits of each + // float variable; given IEEE FP32 and BF16 have the same width + // exponent and one sign bit. + memcpy((uint16_t*)&zn1 + 1, &zn[2 * row], 2); + memcpy((uint16_t*)&zn2 + 1, &zn[2 * row + 1], 2); + memcpy((uint16_t*)&zm1 + 1, &zm[2 * col], 2); + memcpy((uint16_t*)&zm2 + 1, &zm[2 * col + 1], 2); + outRow[col] += (pred_row1 && pred_col1) ? zn1 * zm1 : 0.0f; + outRow[col] += (pred_row2 && pred_col2) ? 
zn2 * zm2 : 0.0f; + } + } + results_[row] = {outRow, 256}; + } + break; + } +#endif case Opcode::AArch64_FMOPA_MPPZZ_D: { // fmopa zada.d, pn/m, pm/m, zn.d, // zm.d // SME @@ -3657,7 +4082,7 @@ void Instruction::execute() { } break; } - case Opcode::AArch64_LD1B: { // ld1b {zt.b}, pg/z, [xn, xm] + case Opcode::AArch64_LD1B: { // ld1b {zt.b}, pg/z, [xn, xm] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); @@ -3695,6 +4120,69 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1B_2Z: // ld1b {zt1.b, zt2.b}, png/z, [xn, xm] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1B_2Z_IMM: { // ld1b {zt1.b, zt2.b}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint8_t out[2][256] = {{0}, {0}}; + const uint16_t partition_num = VL_bits / 8; + + for (int r = 0; r < 2; r++) { + const uint8_t* data = memoryData_[r].getAsVector(); + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << (i % 64); + if (preds[r][i / 64] & shifted_active) { + out[r][i] = data[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + break; + } + case Opcode::AArch64_LD1B_4Z_STRIDED: // ld1b {zt1.b, zt2.b, zt3.b, + // zt4.b}, png/z, [xn, xm] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1B_4Z_STRIDED_IMM: // ld1b {zt1.b, zt2.b, zt3.b, + // zt4.b}, png/z, [xn{, #imm, + // mul vl}] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1B_4Z: // ld1b {zt1.b - zt4.b}, png/z, [xn, xm] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1B_4Z_IMM: { // ld1b {zt1.b - zt4.b}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint8_t out[4][256] = {{0}, {0}, {0}, {0}}; + const uint16_t partition_num = VL_bits / 8; + + for (int r = 0; r < 4; r++) { + const uint8_t* data = memoryData_[r].getAsVector(); + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << (i % 64); + if (preds[r][i / 64] & shifted_active) { + out[r][i] = data[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + results_[2] = {out[2], 256}; + results_[3] = {out[3], 256}; + break; + } case Opcode::AArch64_LD1D: { // ld1d {zt.d}, pg/z, [xn, xm, lsl #3] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); @@ -3714,6 +4202,58 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1D_2Z_IMM: { // ld1d {zt1.d, zt2.d}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint64_t out[2][32] = {{0}, {0}}; + const uint16_t partition_num = VL_bits / 64; + + for (int r = 0; r < 2; r++) { + const uint64_t* data = memoryData_[r].getAsVector(); + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (preds[r][i / 8] & shifted_active) { + out[r][i] = data[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + break; + } + case Opcode::AArch64_LD1D_4Z: // ld1d {zt1.d - zt4.d}, png/z, [xn, + // xm, lsl #3] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint64_t out[4][32] = {{0}, {0}, {0}, {0}}; + const uint16_t partition_num = VL_bits / 64; + + 
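      // Mask layout reminder: for .d elements each 64-bit mask word covers 8
      // elements, so element i is active when bit (i % 8) * 8 of
      // preds[r][i / 8] is set (with VL_bits = 512 only preds[r][0] is used).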
for (int r = 0; r < 4; r++) { + const uint64_t* data = memoryData_[r].getAsVector(); + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (preds[r][i / 8] & shifted_active) { + out[r][i] = data[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + results_[2] = {out[2], 256}; + results_[3] = {out[3], 256}; + break; + } case Opcode::AArch64_LD1D_IMM: { // ld1d {zt.d}, pg/z, [xn{, #imm, // mul vl}] // LOAD @@ -3734,6 +4274,10 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1H_IMM: // ld1h {zt.h}, pg/z, [xn{, #imm, mul + // vl}] + // LOAD + [[fallthrough]]; case Opcode::AArch64_LD1H: { // ld1h {zt.h}, pg/z, [xn, xm, lsl #1] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); @@ -3753,6 +4297,33 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1H_2Z: // ld1h {zt1.h, zt2.h}, png/z, [xn, xm, + // lsl #1] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1H_2Z_IMM: { // ld1h {zt1.h, zt2.h}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint16_t out[2][128] = {{0}, {0}}; + const uint16_t partition_num = VL_bits / 16; + + for (int r = 0; r < 2; r++) { + const uint16_t* data = memoryData_[r].getAsVector(); + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 32) * 2); + if (preds[r][i / 32] & shifted_active) { + out[r][i] = data[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + break; + } case Opcode::AArch64_LD1Onev16b: { // ld1 {vt.16b} [xn] results_[0] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); break; @@ -3767,6 +4338,16 @@ void Instruction::execute() { results_[1] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); break; } + case Opcode::AArch64_LD1Onev8b_POST: { // ld1 {vt.8b}, [xn], <#imm|xm> + // if #imm post-index, value can only be 8 + const uint64_t postIndex = + (metadata_.operands[2].type == AARCH64_OP_REG) + ? 
sourceValues_[1].get() + : 8; + results_[0] = sourceValues_[0].get() + postIndex; + results_[1] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); + break; + } case Opcode::AArch64_LD1RD_IMM: { // ld1rd {zt.d}, pg/z, [xn, #imm] // LOAD const uint16_t partition_num = VL_bits / 64; @@ -3794,6 +4375,30 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1RQ_B: // ld1rqb {zd.b}, pg/z, [xn, xm] + case Opcode::AArch64_LD1RQ_B_IMM: { // ld1rqb {zd.b}, pg/z, [xn{, #imm }] + // LOAD + const uint64_t* p = sourceValues_[0].getAsVector(); + const uint16_t partition_num = VL_bits / 8; + uint8_t out[256] = {0}; + const uint8_t* data = memoryData_[0].getAsVector(); + + // Get mini-vector (quadword) + uint8_t mini[16] = {0}; + for (int i = 0; i < 16; i++) { + uint64_t shifted_active = 1ull << (i % 64); + if (p[i / 64] & shifted_active) { + mini[i] = data[i]; + } + } + + // Duplicate mini-vector into output vector + for (int i = 0; i < partition_num; i++) { + out[i] = mini[i % 16]; + } + results_[0] = {out, 256}; + break; + } case Opcode::AArch64_LD1RQ_D_IMM: { // ld1rqd {zd.d}, pg/z, [xn{, #imm}] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); @@ -4078,6 +4683,9 @@ void Instruction::execute() { case Opcode::AArch64_LD1Twov2d_POST: // ld1 {vt1.2d, vt2.2d}, [xn], // <#imm|xm> [[fallthrough]]; + case Opcode::AArch64_LD1Twov8h_POST: // ld1 {vt1.8h, vt2.8h}, [xn], + // <#imm|xm> + [[fallthrough]]; case Opcode::AArch64_LD1Twov4s_POST: { // ld1 {vt1.4s, vt2.4s}, [xn], // <#imm|xm> // LOAD @@ -4130,6 +4738,62 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1W_2Z: // ld1w {zt1.s, zt2.s}, png/z, [xn, xm, + // lsl #2] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1W_2Z_IMM: { // ld1w {zt1.s, zt2.s}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint32_t out[2][64] = {{0}, {0}}; + const uint16_t partition_num = VL_bits / 32; + + for (int r = 0; r < 2; r++) { + const uint32_t* data = memoryData_[r].getAsVector(); + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (preds[r][i / 16] & shifted_active) { + out[r][i] = data[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + break; + } + case Opcode::AArch64_LD1W_4Z: // ld1w {zt1.s - zt4.s}, png/z, [xn, + // xm, lsl #2] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1W_4Z_IMM: { // ld1w {zt1.s - zt4.s}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint32_t out[4][64] = {{0}, {0}, {0}, {0}}; + const uint16_t partition_num = VL_bits / 32; + + for (int r = 0; r < 4; r++) { + const uint32_t* data = memoryData_[r].getAsVector(); + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (preds[r][i / 16] & shifted_active) { + out[r][i] = data[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + results_[2] = {out[2], 256}; + results_[3] = {out[3], 256}; + break; + } case Opcode::AArch64_LD1i32: { // ld1 {vt.s}[index], [xn] // LOAD const int index = metadata_.operands[0].vector_index; @@ -4494,6 +5158,15 @@ void Instruction::execute() { results_[0] = memoryData_[0].zeroExtend(16, 256); break; } + case Opcode::AArch64_LDRSBWpost: { // ldrsb wt, [xn], #imm + // LOAD + results_[1] = + 
RegisterValue(static_cast(memoryData_[0].get()), 4) + .zeroExtend(4, 8); + results_[0] = RegisterValue( + sourceValues_[0].get() + metadata_.operands[2].imm, 8); + break; + } case Opcode::AArch64_LDRSBWroX: { // ldrsb wt, [xn, xm{, extend // {#amount}}] // LOAD @@ -4768,6 +5441,65 @@ void Instruction::execute() { results_[0] = sveMlaPredicated_vecs(sourceValues_, VL_bits); break; } + case Opcode::AArch64_MOVA_4ZMXI_H_B: { // mova {zd1.b - zd4.b}, + // za0h.b[ws, offs1:offs4] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t sliceCount = VL_bits / 8; + + const uint32_t ws = sourceValues_[sliceCount].get(); + const uint8_t offs1 = + metadata_.operands[4].sme.slice_offset.imm_range.first; + const uint8_t offs4 = + metadata_.operands[4].sme.slice_offset.imm_range.offset; + + for (uint8_t i = offs1; i <= offs4; i++) { + const uint8_t index = i - offs1; + results_[index] = sourceValues_[(ws + i) % sliceCount]; + } + break; + } + case Opcode::AArch64_MOVA_VG2_2ZMXI: { // mova {zd1.d, zd2.d}, za.d[wv, + // offs, vgx2] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + // Get ZA stride between halves and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 2; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[2].sme.slice_offset.imm) % + zaStride; + + results_[0] = sourceValues_[zaIndex]; + results_[1] = sourceValues_[zaStride + zaIndex]; + break; + } + case Opcode::AArch64_MOVA_VG4_4ZMXI: { // mova {zd1.d - zd4.d}, za.d[wv, + // offs, vgx4] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[4].sme.slice_offset.imm) % + zaStride; + + results_[0] = sourceValues_[zaIndex]; + results_[1] = sourceValues_[zaStride + zaIndex]; + results_[2] = sourceValues_[(2 * zaStride) + zaIndex]; + results_[3] = sourceValues_[(3 * zaStride) + zaIndex]; + break; + } case Opcode::AArch64_MOVID: { // movi dd, #imm results_[0] = {static_cast(metadata_.operands[1].imm), 256}; break; @@ -5004,6 +5736,14 @@ void Instruction::execute() { [](uint8_t x, uint8_t y) -> uint8_t { return x | y; }); break; } + case Opcode::AArch64_AUTIASP: // autiasp + [[fallthrough]]; + case Opcode::AArch64_PACIASP: { // paciasp + const uint64_t x30 = sourceValues_[0].get(); + // Mimic execution by writing leaving x30 unmodified + results_[0] = {x30, 8}; + break; + } case Opcode::AArch64_PFALSE: { // pfalse pd.b uint64_t out[4] = {0, 0, 0, 0}; results_[0] = out; @@ -5053,6 +5793,22 @@ void Instruction::execute() { results_[0] = svePtrue(metadata_, VL_bits); break; } + case Opcode::AArch64_PTRUE_C_B: { // ptrue pnd.b + results_[0] = svePtrue_counter(VL_bits); + break; + } + case Opcode::AArch64_PTRUE_C_D: { // ptrue pnd.d + results_[0] = svePtrue_counter(VL_bits); + break; + } + case Opcode::AArch64_PTRUE_C_H: { // ptrue pnd.h + results_[0] = svePtrue_counter(VL_bits); + break; + } + case Opcode::AArch64_PTRUE_C_S: { // ptrue pnd.s + results_[0] = svePtrue_counter(VL_bits); + break; + } case Opcode::AArch64_PUNPKHI_PP: { 
// punpkhi pd.h, pn.b results_[0] = svePunpk(sourceValues_, VL_bits, true); break; @@ -5069,9 +5825,18 @@ void Instruction::execute() { results_[0] = rbit(sourceValues_, metadata_); break; } + case Opcode::AArch64_RDSVLI_XI: { // rdsvl xd, #imm + // Uses Streaming SVE vector register size, regardless of streaming mode + // state + int64_t imm = metadata_.operands[1].imm; + results_[0] = imm * static_cast( + architecture_.getStreamingVectorLength() / 8); + break; + } case Opcode::AArch64_RDVLI_XI: { // rdvl xd, #imm - int8_t imm = static_cast(metadata_.operands[1].imm); - results_[0] = (uint64_t)(imm * (VL_bits / 8)); + // Uses current vector register size + int64_t imm = metadata_.operands[1].imm; + results_[0] = imm * static_cast(VL_bits / 8); break; } case Opcode::AArch64_RET: { // ret {xr} @@ -5961,6 +6726,50 @@ void Instruction::execute() { memoryData_ = sve_merge_store_data(d, p, VL_bits); break; } + case Opcode::AArch64_ST1D_2Z: // st1d {zt1.d, zt2.d}, png, [xn, xm, lsl + // #3] + // STORE + [[fallthrough]]; + case Opcode::AArch64_ST1D_2Z_IMM: { // st1d {zt1.d, zt2.d}, png, [xn{, + // #imm, mul vl}] + // STORE + const uint64_t* t1 = sourceValues_[0].getAsVector(); + const uint64_t* t2 = sourceValues_[1].getAsVector(); + const uint64_t pn = sourceValues_[2].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + memoryData_ = + sve_merge_store_data(t1, preds[0].data(), VL_bits); + std::vector out2 = + sve_merge_store_data(t2, preds[1].data(), VL_bits); + memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); + break; + } + case Opcode::AArch64_ST1D_4Z_IMM: { // st1d {zt1.d - zt4.d}, png, [xn{, + // #imm, mul vl}] + // STORE + const uint64_t* t1 = sourceValues_[0].getAsVector(); + const uint64_t* t2 = sourceValues_[1].getAsVector(); + const uint64_t* t3 = sourceValues_[2].getAsVector(); + const uint64_t* t4 = sourceValues_[3].getAsVector(); + const uint64_t pn = sourceValues_[4].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + memoryData_ = + sve_merge_store_data(t1, preds[0].data(), VL_bits); + std::vector out2 = + sve_merge_store_data(t2, preds[1].data(), VL_bits); + std::vector out3 = + sve_merge_store_data(t3, preds[2].data(), VL_bits); + std::vector out4 = + sve_merge_store_data(t4, preds[3].data(), VL_bits); + memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); + memoryData_.insert(memoryData_.end(), out3.begin(), out3.end()); + memoryData_.insert(memoryData_.end(), out4.begin(), out4.end()); + break; + } case Opcode::AArch64_ST1Fourv16b: { // st1 {vt.16b, vt2.16b, vt3.16b, // vt4.16b}, [xn|sp] // STORE @@ -6056,6 +6865,19 @@ void Instruction::execute() { results_[0] = sourceValues_[4].get() + postIndex; break; } + case Opcode::AArch64_ST1Onev4s_POST: { // st1 {vt.4s}, [xn|sp], <#imm|xm> + // STORE + const uint32_t* vt = sourceValues_[0].getAsVector(); + memoryData_[0] = RegisterValue((char*)vt, 4 * sizeof(uint32_t)); + + // if #imm post-index, value can only be 16 + const uint64_t postIndex = + (metadata_.operands[2].type == AARCH64_OP_REG) + ? 
sourceValues_[2].get() + : 16; + results_[0] = sourceValues_[1].get() + postIndex; + break; + } case Opcode::AArch64_ST1Twov16b: { // st1 {vt.16b, vt2.16b}, [xn|sp] // STORE const uint8_t* t = sourceValues_[0].getAsVector(); @@ -6152,6 +6974,50 @@ void Instruction::execute() { memoryData_ = sve_merge_store_data(d, p, VL_bits); break; } + case Opcode::AArch64_ST1W_2Z: // st1w {zt1.s, zt2.s}, png, [xn, xm, lsl + // #2] + // STORE + [[fallthrough]]; + case Opcode::AArch64_ST1W_2Z_IMM: { // st1w {zt1.s, zt2.s}, png, [xn{, + // #imm, mul vl}] + // STORE + const uint32_t* t1 = sourceValues_[0].getAsVector(); + const uint32_t* t2 = sourceValues_[1].getAsVector(); + const uint64_t pn = sourceValues_[2].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + memoryData_ = + sve_merge_store_data(t1, preds[0].data(), VL_bits); + std::vector out2 = + sve_merge_store_data(t2, preds[1].data(), VL_bits); + memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); + break; + } + case Opcode::AArch64_ST1W_4Z_IMM: { // st1w {zt1.s - zt4.s}, png, [xn{, + // #imm, mul vl}] + // STORE + const uint32_t* t1 = sourceValues_[0].getAsVector(); + const uint32_t* t2 = sourceValues_[1].getAsVector(); + const uint32_t* t3 = sourceValues_[2].getAsVector(); + const uint32_t* t4 = sourceValues_[3].getAsVector(); + const uint64_t pn = sourceValues_[4].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + memoryData_ = + sve_merge_store_data(t1, preds[0].data(), VL_bits); + std::vector out2 = + sve_merge_store_data(t2, preds[1].data(), VL_bits); + std::vector out3 = + sve_merge_store_data(t3, preds[2].data(), VL_bits); + std::vector out4 = + sve_merge_store_data(t4, preds[3].data(), VL_bits); + memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); + memoryData_.insert(memoryData_.end(), out3.begin(), out3.end()); + memoryData_.insert(memoryData_.end(), out4.begin(), out4.end()); + break; + } case Opcode::AArch64_ST1i16: { // st1 {vt.h}[index], [xn] // STORE const uint16_t* t = sourceValues_[0].getAsVector(); @@ -6231,33 +7097,15 @@ void Instruction::execute() { const uint64_t* d2 = sourceValues_[1].getAsVector(); const uint64_t* p = sourceValues_[2].getAsVector(); - std::vector memData; - bool inActiveBlock = false; - const uint16_t partition_num = VL_bits / 64; uint16_t index = 0; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (p[i / 8] & shifted_active) { - // If active and not in active block, initialise - if (!inActiveBlock) { - memData.clear(); - inActiveBlock = true; - } - memData.push_back(d1[i]); - memData.push_back(d2[i]); - } else if (inActiveBlock) { - inActiveBlock = false; - memoryData_[index] = RegisterValue( - (char*)memData.data(), sizeof(uint64_t) * memData.size()); - index++; + memoryData_[index++] = RegisterValue(d1[i], 8); + memoryData_[index++] = RegisterValue(d2[i], 8); } } - // Add final block if needed - if (inActiveBlock) - memoryData_[index] = RegisterValue((char*)memData.data(), - sizeof(uint64_t) * memData.size()); - break; } case Opcode::AArch64_ST2Twov4s_POST: { // st2 {vt1.4s, vt2.4s}, [xn], @@ -6277,6 +7125,31 @@ void Instruction::execute() { results_[0] = sourceValues_[2].get() + postIndex; break; } + case Opcode::AArch64_ST4W: // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, + // pg, [, xm, lsl #2] + [[fallthrough]]; + case Opcode::AArch64_ST4W_IMM: { // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, + // pg, [{, #imm, mul vl}] + // STORE + const uint32_t* d1 = sourceValues_[0].getAsVector(); + const uint32_t* d2 = 
sourceValues_[1].getAsVector(); + const uint32_t* d3 = sourceValues_[2].getAsVector(); + const uint32_t* d4 = sourceValues_[3].getAsVector(); + const uint64_t* p = sourceValues_[4].getAsVector(); + + const uint16_t partition_num = VL_bits / 32; + uint16_t index = 0; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (p[i / 16] & shifted_active) { + memoryData_[index++] = RegisterValue(d1[i], 4); + memoryData_[index++] = RegisterValue(d2[i], 4); + memoryData_[index++] = RegisterValue(d3[i], 4); + memoryData_[index++] = RegisterValue(d4[i], 4); + } + } + break; + } case Opcode::AArch64_STLRB: { // stlrb wt, [xn] // STORE memoryData_[0] = sourceValues_[0]; @@ -7028,6 +7901,11 @@ void Instruction::execute() { bfm_2imms(sourceValues_, metadata_, false, true); break; } + case Opcode::AArch64_UCVTFSXSri: { // ucvtf sd, xn, #fbits + results_[0] = { + ucvtf_fixedToFloat(sourceValues_, metadata_), 256}; + break; + } case Opcode::AArch64_UCVTFUWDri: { // ucvtf dd, wn results_[0] = {static_cast(sourceValues_[0].get()), 256}; @@ -7066,6 +7944,200 @@ void Instruction::execute() { results_[0] = {div_3ops(sourceValues_), 8}; break; } + case Opcode::AArch64_UDOT_VG4_M4Z4Z_BtoS: { // udot za.s[wv, #off, vgx4], + // {zn1.b - zn4.b}, {zm1.b - + // zm4.b} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Get base zn and zm register indexed in sourceValues + const uint16_t znBase = zaRowCount + 1; + const uint16_t zmBase = zaRowCount + 5; + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. 
+ const uint32_t* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const uint8_t* znr = sourceValues_[znBase + r].getAsVector(); + const uint8_t* zmr = sourceValues_[zmBase + r].getAsVector(); + uint32_t out[64] = {0}; + // Loop over all 32-bit elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e]; + // There are 4 8-bit elements per 32-bit element of `znr` and `zmr` + for (int i = 0; i < 4; i++) { + out[e] += static_cast(znr[4 * e + i]) * + static_cast(zmr[4 * e + i]); + } + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_UDOT_VG4_M4ZZI_BtoS: { // udot za.s[wv, #off, vgx4], + // {zn1.b - zn4.b}, + // zm.b[#index] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + // Get zm vector and zm's index + const uint8_t* zm = + sourceValues_[zaRowCount + 5].getAsVector(); + const int zmIndex = metadata_.operands[5].vector_index; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. + const uint32_t* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const uint8_t* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + uint32_t out[64] = {0}; + // Loop over all 32-bit elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + // This instruction destructively adds the widened dot product + // (4x 8-bit --> 1x 32-bit) of the following to each 32-bit element + // in the current `zaRow`: + // - four 8-bit values in each corresponding 32-bit element of + // the current source `znr` vector + // - four 8-bit values from a 32-bit element of `zm`, selected + // from each 128-bit segment of `zm` using an index + // + // The 128-bit segment of `zm` currently in use corresponds to the + // 128-bit segment that the current 32-bit elements of `znr` + // and `zaRow` are within. 
+ // For example, with a SVL = 512-bits, elements `e` of `zaRow` in + // the range 0->15, and zmIndex = 1: + // - When `e` = 0 -> 3, the 32-bit element used from `zm` will be + // zm[1] (1st 32-bit element in 0th 128-bit + // segment) + // - When `e` = 4 -> 7, the 32-bit element used from `zm` will be + // zm[5] (1st 32-bit element in 1st 128-bit + // segment) + out[e] = zaRow[e]; + // MOD 4 as there are 4 32-bit elements per 128-bit segment of `zm` + const int zmSegBase = e - (e % 4); + const int s = zmSegBase + zmIndex; + // There are 4 8-bit elements per 32-bit element of `znr` and `zm` + for (int i = 0; i < 4; i++) { + out[e] += static_cast(znr[4 * e + i]) * + static_cast(zm[4 * s + i]); + } + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_UVDOT_VG4_M4ZZI_BtoS: { // uvdot za.s[wv, #off, + // vgx4], {zn1.b - zn4.b}, + // zm.b[#index] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + // Get zm vector and zm's index + const uint8_t* zm = + sourceValues_[zaRowCount + 5].getAsVector(); + const int zmIndex = metadata_.operands[5].vector_index; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. 
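+        // Unlike UDOT, UVDOT forms the dot product "vertically": for output
+        // element `e` of row `r`, the i-th 8-bit operand comes from byte
+        // (4 * e) + r of the i-th source vector (zn1 - zn4), i.e. the four
+        // source vectors are read as if transposed.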
+ const uint32_t* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + uint32_t out[64] = {0}; + // Loop over all 32-bit elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e]; + // MOD 4 as there are 4 32-bit elements per 128-bit segment of `zm` + const int zmSegBase = e - (e % 4); + const int s = zmSegBase + zmIndex; + // There are 4 8-bit elements per 32-bit element of `znr` and `zm` + for (int i = 0; i < 4; i++) { + const uint8_t* znr = + sourceValues_[zaRowCount + 1 + i].getAsVector(); + out[e] += static_cast(znr[4 * e + r]) * + static_cast(zm[4 * s + i]); + } + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_UDOT_ZZZ_S: { // udot zd.s, zn.b, zm.b + results_[0] = + sveUdot(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_UDOT_ZZZI_S: { // udot zd.s, zn.b, zm.b[index] + results_[0] = sveUdot_indexed(sourceValues_, + metadata_, VL_bits); + break; + } + case Opcode::AArch64_UDOTv16i8: { // udot vd.4s, vn.16b, vm.16b + results_[0] = vecUdot<4>(sourceValues_, metadata_); + break; + } + case Opcode::AArch64_UDOTlanev16i8: { // udot vd.4s, vn.16b, vm.4b[index] + results_[0] = vecUdot_byElement<4>(sourceValues_, metadata_); + break; + } + case Opcode::AArch64_UDOTlanev8i8: { // udot vd.2s, vn.8b, vm.4b[index] + results_[0] = vecUdot_byElement<2>(sourceValues_, metadata_); + break; + } case Opcode::AArch64_UMADDLrrr: { // umaddl xd, wn, wm, xa results_[0] = maddl_4ops(sourceValues_); break; @@ -7078,6 +8150,32 @@ void Instruction::execute() { results_[0] = vecUMinP(sourceValues_); break; } + case Opcode::AArch64_UMLALv2i32_indexed: { // umlal vd.2d, vn.2s, + // vm.s[index] + const uint64_t* vd = sourceValues_[0].getAsVector(); + const uint32_t* vn = sourceValues_[1].getAsVector(); + const uint32_t* vm = sourceValues_[2].getAsVector(); + const int64_t index = metadata_.operands[2].vector_index; + const uint64_t vm_idx_elem = static_cast(vm[index]); + + uint64_t out[2] = {vd[0] + static_cast(vn[0]) * vm_idx_elem, + vd[1] + static_cast(vn[1]) * vm_idx_elem}; + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_UMLALv4i32_indexed: { // umlal2 vd.2d, vn.4s, + // vm.s[index] + const uint64_t* vd = sourceValues_[0].getAsVector(); + const uint32_t* vn = sourceValues_[1].getAsVector(); + const uint32_t* vm = sourceValues_[2].getAsVector(); + const int64_t index = metadata_.operands[2].vector_index; + const uint64_t vm_idx_elem = static_cast(vm[index]); + + uint64_t out[2] = {vd[0] + static_cast(vn[2]) * vm_idx_elem, + vd[1] + static_cast(vn[3]) * vm_idx_elem}; + results_[0] = {out, 256}; + break; + } case Opcode::AArch64_UMOPA_MPPZZ_D: { // umopa zada.d, pn/m, pm/m, zn.h, // zm.h // SME @@ -7257,6 +8355,17 @@ void Instruction::execute() { sourceValues_[1].get()); break; } + case Opcode::AArch64_UMULLv4i16_v4i32: { // umull vd.4s, vn.4h, vm.4h + const uint16_t* vn = sourceValues_[0].getAsVector(); + const uint16_t* vm = sourceValues_[1].getAsVector(); + + uint32_t out[4] = {0}; + for (int i = 0; i < 4; i++) { + out[i] = static_cast(vn[i]) * static_cast(vm[i]); + } + results_[0] = {out, 256}; + break; + } case Opcode::AArch64_UQDECD_WPiI: { // uqdecd wd{, pattern{, MUL #imm}} results_[0] = sveUqdec(sourceValues_, metadata_, VL_bits); @@ -7655,6 +8764,10 @@ void Instruction::execute() { results_[0] = sveZip_preds(sourceValues_, VL_bits, false); break; } + case Opcode::AArch64_ZIP1_ZZZ_B: { // zip1 zd.b, zn.b, zm.b + 
results_[0] = sveZip_vecs(sourceValues_, VL_bits, false); + break; + } case Opcode::AArch64_ZIP1_ZZZ_D: { // zip1 zd.d, zn.d, zm.d results_[0] = sveZip_vecs(sourceValues_, VL_bits, false); break; @@ -7707,6 +8820,10 @@ void Instruction::execute() { results_[0] = sveZip_preds(sourceValues_, VL_bits, true); break; } + case Opcode::AArch64_ZIP2_ZZZ_B: { // zip2 zd.b, zn.b, zm.b + results_[0] = sveZip_vecs(sourceValues_, VL_bits, true); + break; + } case Opcode::AArch64_ZIP2_ZZZ_D: { // zip2 zd.d, zn.d, zm.d results_[0] = sveZip_vecs(sourceValues_, VL_bits, true); break; @@ -7743,6 +8860,29 @@ void Instruction::execute() { results_[0] = vecZip(sourceValues_, true); break; } + case Opcode::AArch64_ZIP_VG4_4Z4Z_S: { // zip {zd1.s - zd4.s}, {zn1.s - + // zn4.s} + const uint32_t* zn[4]; + zn[0] = sourceValues_[0].getAsVector(); + zn[1] = sourceValues_[1].getAsVector(); + zn[2] = sourceValues_[2].getAsVector(); + zn[3] = sourceValues_[3].getAsVector(); + + const uint16_t quads = VL_bits / (32 * 4); + + uint32_t out[4][64] = {{0}, {0}, {0}, {0}}; + for (int r = 0; r < 4; r++) { + const uint16_t base = r * quads; + for (int q = 0; q < quads; q++) { + out[r][4 * q] = zn[0][base + q]; + out[r][4 * q + 1] = zn[1][base + q]; + out[r][4 * q + 2] = zn[2][base + q]; + out[r][4 * q + 3] = zn[3][base + q]; + } + results_[r] = RegisterValue(out[r], 256); + } + break; + } case Opcode::AArch64_ZERO_M: { // zero {mask} // SME // Not in right context mode. Raise exception @@ -7753,6 +8893,15 @@ void Instruction::execute() { } break; } + case Opcode::AArch64_ZERO_T: { // zero {zt0} + // SME + // Not in right context mode. Raise exception + if (!ZAenabled) return ZAdisabled(); + + // ZT0 has a fixed width of 512-bits + results_[0] = RegisterValue(0, 64); + break; + } default: return executionNYI(); } diff --git a/test/integration/ConfigTest.cc b/test/integration/ConfigTest.cc index 48975eeac..49a028ebb 100644 --- a/test/integration/ConfigTest.cc +++ b/test/integration/ConfigTest.cc @@ -24,7 +24,8 @@ TEST(ConfigTest, Default) { aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, - aarch64_sysreg::AARCH64_SYSREG_SVCR}; + aarch64_sysreg::AARCH64_SYSREG_SVCR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR2_EL0}; EXPECT_EQ(simeng::config::SimInfo::getSysRegVec(), sysRegisterEnums); std::vector archRegStruct = { {8, 32}, @@ -384,7 +385,8 @@ TEST(ConfigTest, configFromFile) { aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, - aarch64_sysreg::AARCH64_SYSREG_SVCR}; + aarch64_sysreg::AARCH64_SYSREG_SVCR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR2_EL0}; EXPECT_EQ(simeng::config::SimInfo::getSysRegVec(), sysRegisterEnums); std::vector archRegStruct = { {8, 32}, diff --git a/test/regression/aarch64/AArch64RegressionTest.hh b/test/regression/aarch64/AArch64RegressionTest.hh index 32d975b09..6afdc47d2 100644 --- a/test/regression/aarch64/AArch64RegressionTest.hh +++ b/test/regression/aarch64/AArch64RegressionTest.hh @@ -190,6 +190,23 @@ inline std::vector> genCoreTypeSVLPairs( checkMatrixRegisterCol(tag, index, __VA_ARGS__); \ } +/** Check each element of the Lookup Table register ZT0 against expected values. + * + * The `type` argument is the C++ data type to use for value comparisons. The + * third argument should be an initializer list containing one value for each + * register element (for a total of `(64 / sizeof(type))` values). 
+ * + * For example: + * + * // Compare zt0 to some expected 32-bit uint64 values. + * CHECK_TABLE(0, uint32_t, {1, 2, 3, 4, ..., 16}); + */ +#define CHECK_TABLE(type, ...) \ + { \ + SCOPED_TRACE("<<== error generated here"); \ + checkTableRegister(__VA_ARGS__); \ + } + /** A helper macro to predecode the first instruction in a snippet of Armv9.2-a * assembly code and check the assigned group(s) for each micro-op matches the * expected group(s). Returns from the calling function if a fatal error occurs. @@ -239,13 +256,16 @@ class AArch64RegressionTest : public RegressionTest { /** Get the subtarget feature string based on LLVM version being used */ std::string getSubtargetFeaturesString() { -#if SIMENG_LLVM_VERSION < 14 - return "+sve,+lse"; -#elif SIMENG_LLVM_VERSION < 18 - return "+sve,+lse,+sve2,+sme,+sme-f64"; -#else - return "+sve,+lse,+sve2,+sme,+sme-f64f64,+sme-i16i64,+sme2"; + std::string features = "+dotprod,+sve,+lse"; +#if SIMENG_LLVM_VERSION > 13 + // "+dotprod,+sve,+lse,+sve2,+sme,+sme-f64"; + features += ",+sve2,+sme,+sme-f64"; +#endif +#if SIMENG_LLVM_VERSION > 17 + // "+dotprod,+sve,+lse,+sve2,+sme,+sme-f64f64,+sme-i16i64,+sme2"; + features += "f64,+sme-i16i64,+sme2"; #endif + return features; } /** Check the elements of a Neon register. @@ -358,6 +378,21 @@ class AArch64RegressionTest : public RegressionTest { } } + /** Check the elements of the ZT0 lookup table register. + * + * This should be invoked via the `CHECK_TABLE` macro in order to provide + * better diagnostic messages, rather than called directly from test code. + */ + template + void checkTableRegister(const std::array& values) const { + const T* data = RegressionTest::getVectorRegister( + {simeng::arch::aarch64::RegisterType::TABLE, 0}); + for (unsigned i = 0; i < (64 / sizeof(T)); i++) { + EXPECT_NEAR(data[i], values[i], 0.0005) + << "Mismatch for element " << i << "."; + } + } + /** Get the value of a general purpose register. 
*/ template T getGeneralRegister(uint8_t tag) const { diff --git a/test/regression/aarch64/instructions/bitmanip.cc b/test/regression/aarch64/instructions/bitmanip.cc index a72dcb64d..30eb27fce 100644 --- a/test/regression/aarch64/instructions/bitmanip.cc +++ b/test/regression/aarch64/instructions/bitmanip.cc @@ -274,11 +274,20 @@ TEST_P(InstBitmanip, ubfm) { ubfm w2, w0, #16, #31 ubfm w3, w0, #28, #23 ubfm w4, w0, #30, #27 + + # check alias + mov w10, #-1 + mov w11, #-1 + mov w12, #128 + lsl w10, w12, #1 + lsr w11, w12, #1 )"); EXPECT_EQ(getGeneralRegister(1), 0x000007A0ull); EXPECT_EQ(getGeneralRegister(2), 0x0000007Aull); EXPECT_EQ(getGeneralRegister(3), 0x07A00000ull); EXPECT_EQ(getGeneralRegister(4), 0x01E80000ull); + EXPECT_EQ(getGeneralRegister(10), 256); + EXPECT_EQ(getGeneralRegister(11), 64); RUN_AARCH64(R"( # Fill destination registers with 1s @@ -295,11 +304,20 @@ TEST_P(InstBitmanip, ubfm) { ubfm x2, x0, #16, #63 ubfm x3, x0, #32, #23 ubfm x4, x0, #60, #55 + + # check alias + mov x10, #-1 + mov x11, #-1 + mov x12, #128 + lsl x10, x12, #1 + lsr x11, x12, #1 )"); EXPECT_EQ(getGeneralRegister(1), 0x00000000000007A0ull); EXPECT_EQ(getGeneralRegister(2), 0x000000000000007Aull); EXPECT_EQ(getGeneralRegister(3), 0x007A000000000000ull); EXPECT_EQ(getGeneralRegister(4), 0x0000000007A00000ull); + EXPECT_EQ(getGeneralRegister(10), 256); + EXPECT_EQ(getGeneralRegister(11), 64); } INSTANTIATE_TEST_SUITE_P(AArch64, InstBitmanip, diff --git a/test/regression/aarch64/instructions/float.cc b/test/regression/aarch64/instructions/float.cc index 03f3f799d..627e710e7 100644 --- a/test/regression/aarch64/instructions/float.cc +++ b/test/regression/aarch64/instructions/float.cc @@ -1453,6 +1453,37 @@ TEST_P(InstFloat, ucvtf) { CHECK_NEON(9, float, {static_cast(UINT64_C(1) << 48), 0.f, 0.f, 0.f}); CHECK_NEON(10, float, {static_cast(UINT64_MAX), 0.f, 0.f, 0.f}); CHECK_NEON(11, float, {0.f, 0.f, 0.f, 0.f}); + + // 32-bit unsigned fixed-point to float + // Numbers have been chosen to have less than 0.0005 fixed-point + // representation error to ensure tests pass + initialHeapData_.resize(12); + heap32 = reinterpret_cast(initialHeapData_.data()); + heap32[0] = 0x000001EE; // 123.5 (2 fraction bits) + heap32[1] = 0x00021F3B; // 543.23 (8 fraction bits) + heap32[2] = 0x32FE6B75; // 101.987654321 (23 fraction bits) + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + # 2 fraction-bits (123.5) + ldr w1, [x0], #4 + ucvtf s1, x1, #0x2 + + # 8 fraction-bits (543.23) + ldr w2, [x0], #4 + ucvtf s2, x2, #0x8 + + + # 23 fraction-bits (101.987654321) + ldr w3, [x0] + ucvtf s3, x3, #0x17 + )"); + CHECK_NEON(1, float, {123.5f, 0.0f, 0.0f, 0.0f}); + CHECK_NEON(2, float, {543.23f, 0.0f, 0.0f, 0.0f}); + CHECK_NEON(3, float, {101.987654321f, 0.0f, 0.0f, 0.0f}); } TEST_P(InstFloat, frintp) { diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc index 09269eebb..b98013d2a 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -231,6 +231,41 @@ TEST_P(InstLoad, ld1_multi_struct) { EXPECT_EQ(getGeneralRegister(12), getGeneralRegister(10) + 16); + // One reg, 8b elements + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #8 + + # save heap address before post index + mov x10, x0 + + # Load values from heap with imm post-index + ld1 {v1.8b}, [x0], #8 + + # save heap address after post index + mov x11, x0 + + # Load values from heap with reg post-index + ld1 
{v2.8b}, [x0], x1 + + mov x12, x0 + )"); + + CHECK_NEON(1, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}); + CHECK_NEON(2, uint8_t, + {0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}); + EXPECT_EQ(getGeneralRegister(11), + getGeneralRegister(10) + 8); + EXPECT_EQ(getGeneralRegister(12), + getGeneralRegister(10) + 16); + // Two reg, 16b elements RUN_AARCH64(R"( # Get heap address @@ -282,6 +317,53 @@ TEST_P(InstLoad, ld1_multi_struct) { EXPECT_EQ(getGeneralRegister(12), getGeneralRegister(10) + 32); + // Two reg, 8h elements + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #32 + + # Load values from heap + # ld1 {v0.8h, v1.8h}, [x0] + + # save heap address before post index + mov x10, x0 + + # Load values from heap with imm post-index + ld1 {v2.8h, v3.8h}, [x0], #32 + + # save heap address after post index + mov x11, x0 + sub x0, x0, #32 + + # Load values from heap with reg post-index + ld1 {v4.8h, v5.8h}, [x0], x1 + + mov x12, x0 + )"); + + // CHECK_NEON(0, uint16_t, + // {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, + // 0xEEDD}); + // CHECK_NEON(1, uint16_t, + // {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, + // 0xEEDD}); + CHECK_NEON(2, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(3, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(4, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(5, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + EXPECT_EQ(getGeneralRegister(11), + getGeneralRegister(10) + 32); + EXPECT_EQ(getGeneralRegister(12), + getGeneralRegister(10) + 32); + // Two reg, 2d elements RUN_AARCH64(R"( # Get heap address @@ -1222,14 +1304,23 @@ TEST_P(InstLoad, ldrsb) { mov x5, 1 # Load 8-bit values from heap and sign-extend to 32-bits ldrsb w1, [x0, x5, sxtx] + # Post Index + mov x20, x0 + ldrsb w2, [x20], #16 + # Load 8-bit values from heap and sign-extend to 64-bits - ldrsb x2, [x0] - ldrsb x3, [x0, #3] + ldrsb x3, [x0] + ldrsb x4, [x0, #3] + )"); EXPECT_EQ(getGeneralRegister(1), INT8_MAX); - EXPECT_EQ(getGeneralRegister(2), -2); - EXPECT_EQ(getGeneralRegister(3), 64); + EXPECT_EQ(getGeneralRegister(2), -2); + EXPECT_EQ(getGeneralRegister(20), + getGeneralRegister(0) + 16); + + EXPECT_EQ(getGeneralRegister(3), -2); + EXPECT_EQ(getGeneralRegister(4), 64); } TEST_P(InstLoad, ldrsh) { diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 96d23590a..f3341e23f 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -3648,6 +3648,65 @@ TEST_P(InstNeon, trn) { CHECK_NEON(8, uint64_t, {0x1e1c1a1816141210, 0x1f1d1b1917151311}); } +TEST_P(InstNeon, udot) { + // udot by element + initialHeapData_.resize(128); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0xDEADBEEFFFFF00FF; + heap64[1] = 0x01234567ABBACAFE; + heap64[2] = 0xFEDCBA9876543210; + heap64[3] = 0xDEADCAFEABBABEEF; + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + + movi v2.4s, #3 + movi v3.4s, #4 + movi v4.4s, #5 + movi v5.4s, #6 + + udot v2.2s, v1.8b, v0.4b[0] + udot v3.4s, v1.16b, v0.4b[1] + udot v4.2s, v1.8b, v0.4b[2] + udot v5.4s, v1.16b, v0.4b[3] + )"); + CHECK_NEON(0, uint64_t, 
{0xDEADBEEFFFFF00FF, 0x01234567ABBACAFE}); + CHECK_NEON(1, uint64_t, {0xFEDCBA9876543210, 0xDEADCAFEABBABEEF}); + CHECK_NEON(2, uint32_t, {0xd929, 0x26f91, 0x0, 0x0}); + CHECK_NEON(3, uint32_t, {0xd328, 0x288e8, 0x27e25, 0x2b87f}); + CHECK_NEON(4, uint32_t, {0xc333, 0x2731b, 0x0, 0x0}); + CHECK_NEON(5, uint32_t, {0x1fe2, 0x8e62, 0xad7e, 0xb52f}); + + // udot by vector + initialHeapData_.resize(128); + heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0xDEADBEEFFFFFFFFF; + heap64[1] = 0x01234567ABBACAFE; + heap64[2] = 0xFEDCBA98FFFFFFFF; + heap64[3] = 0xDEADCAFEABBABEEF; + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + + movi v2.4s, #3 + + udot v2.4s, v1.16b, v0.16b + )"); + CHECK_NEON(0, uint64_t, {0xDEADBEEFFFFFFFFF, 0x01234567ABBACAFE}); + CHECK_NEON(1, uint64_t, {0xFEDCBA98FFFFFFFF, 0xDEADCAFEABBABEEF}); + CHECK_NEON(2, uint32_t, {0x3F807, 0x288E7, 0x27C6E, 0xB52C}); +} + TEST_P(InstNeon, uzp) { initialHeapData_.resize(128); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); @@ -3723,6 +3782,92 @@ TEST_P(InstNeon, uzp) { CHECK_NEON(8, uint64_t, {0x1e1c1a1816141210, 0x1f1d1b1917151311}); } +TEST_P(InstNeon, umlal) { + // uint32 to uint64, lower half + RUN_AARCH64(R"( + mov w0, #-1 + mov w1, #344 + mov v0.s[0], w0 + mov v0.s[3], w1 + + mov w2, #-1 + mov w3, #3 + mov v1.s[0], w2 + mov v1.s[1], w3 + + mov v2.d[0], xzr + mov v2.d[1], xzr + mov v3.d[0], xzr + mov v3.d[1], xzr + + umlal v2.2d, v1.2s, v0.s[0] + umlal v3.2d, v1.2s, v0.s[3] + )"); + CHECK_NEON(0, uint32_t, {UINT32_MAX, 0, 0, 344}); + CHECK_NEON(2, uint64_t, {18446744065119617025ull, 12884901885ull}); + CHECK_NEON(3, uint64_t, {1477468749480ull, 1032ull}); + + // uint32 to uint64, upper half + RUN_AARCH64(R"( + mov w0, #-1 + mov w1, #344 + mov v0.s[0], w0 + mov v0.s[3], w1 + + mov w2, #-1 + mov w3, #3 + mov v1.s[2], w2 + mov v1.s[3], w3 + + mov v2.d[0], xzr + mov v2.d[1], xzr + mov v3.d[0], xzr + mov v3.d[1], xzr + + umlal2 v2.2d, v1.4s, v0.s[0] + umlal2 v3.2d, v1.4s, v0.s[3] + )"); + CHECK_NEON(0, uint32_t, {UINT32_MAX, 0, 0, 344}); + CHECK_NEON(2, uint64_t, {18446744065119617025ull, 12884901885ull}); + CHECK_NEON(3, uint64_t, {1477468749480ull, 1032ull}); +} + +TEST_P(InstNeon, umull) { + // uint16_t to uint32_t + initialHeapData_.resize(32); + uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + heap16[0] = UINT16_MAX; + heap16[1] = 0; + heap16[2] = 1234; + heap16[3] = 0xBEEF; + heap16[4] = 0xABBA; + heap16[5] = 0xCAFE; + heap16[6] = 0xDEAD; + heap16[7] = 0xACDC; + + heap16[8] = UINT16_MAX; + heap16[9] = 0xACDC; + heap16[10] = 0xCAFE; + heap16[11] = 0xABBA; + heap16[12] = 0xBEEF; + heap16[13] = 0xDEAD; + heap16[14] = 9876; + heap16[15] = 0; + + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + + umull v2.4s, v0.4h, v1.4h + )"); + CHECK_NEON(2, uint32_t, {4294836225u, 0, 64126044u, 2148818598u}); +} + TEST_P(InstNeon, zip) { initialHeapData_.resize(128); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index a54c0c981..9a7c3b4ec 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -8,6 +8,52 @@ namespace { using InstSme = AArch64RegressionTest; #if SIMENG_LLVM_VERSION >= 14 + +TEST_P(InstSme, add) { + // uint32_T, vgx2, vecs with ZA + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 
214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z0.b, #8 + dup z1.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z0.b, z1.b + umopa za1.s, p0/m, p1/m, z0.b, z1.b + umopa za2.s, p0/m, p1/m, z0.b, z1.b + umopa za3.s, p0/m, p1/m, z0.b, z1.b + + # Set 2 of the za rows + mov w8, #1 + dup z0.s, #8 + dup z1.s, #3 + add za.s[w8, #1, vgx2], {z0.s, z1.s} + )"); + const uint16_t zaStride = (SVL / 8) / 2; + const uint16_t zaHalfIndex = 2; + for (uint16_t i = 0; i < (SVL / 8); i++) { + if (i == zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({104}, (SVL / 8))); + } else if (i == zaStride + zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({99}, (SVL / 8))); + } else { + // un-effected rows should still be 96 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({96}, (SVL / 8))); + } + } +} + TEST_P(InstSme, addha) { // 32-bit RUN_AARCH64(R"( @@ -136,6 +182,496 @@ TEST_P(InstSme, addha) { } } +TEST_P(InstSme, mova_zaToVecs) { + // 2 vectors + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # Set 4 of the za rows + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z10.b}, p0/z, [x0] + udot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + + # Extravt un-updated values + mov w9, #0 + mova {z20.d, z21.d}, za.d[w9, #0, vgx2] + # Extract 0th and 2nd updated rows + mov {z24.d, z25.d}, za.d[w8, #1, vgx2] + # Extract 1st and 3rd updated rows (get new offset into each half) + addvl x10, x10, #1 + mov x20, #4 + udiv x10, x10, x20 + mov {z26.d, z27.d}, za.d[w10, #2, vgx2] + )"); + // Check extracted un-effected rows (two uint32_t values of 96 equal one + // uint64_t value of 412316860512) + CHECK_NEON(20, uint64_t, fillNeon({412316860512}, SVL / 8)); + CHECK_NEON(21, uint64_t, fillNeon({412316860512}, SVL / 8)); + // Check extracted effected rows (two uint32_t values concatonated into one + // uint64_t value) + CHECK_NEON(24, uint64_t, fillNeon({2044404433372}, SVL / 8)); + CHECK_NEON(25, uint64_t, fillNeon({2370821947944}, SVL / 8)); + CHECK_NEON(26, uint64_t, fillNeon({2207613190658}, SVL / 8)); + CHECK_NEON(27, uint64_t, fillNeon({2534030705230}, SVL / 8)); + + // 4 vectors + initialHeapData_.resize(SVL / 8); + heap8 = reinterpret_cast(initialHeapData_.data()); + src = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # Set 4 of the za rows + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z10.b}, p0/z, [x0] + udot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + + mov w9, #0 + mova {z20.d - z23.d}, za.d[w9, #0, vgx4] + mov {z24.d - z27.d}, za.d[w8, 
#1, vgx4] + )"); + // Check extracted un-effected rows (two uint32_t values of 96 equal one + // uint64_t value of 412316860512) + CHECK_NEON(20, uint64_t, fillNeon({412316860512}, SVL / 8)); + CHECK_NEON(21, uint64_t, fillNeon({412316860512}, SVL / 8)); + CHECK_NEON(22, uint64_t, fillNeon({412316860512}, SVL / 8)); + CHECK_NEON(23, uint64_t, fillNeon({412316860512}, SVL / 8)); + // Check extracted effected rows (two uint32_t values concatonated into one + // uint64_t value) + CHECK_NEON(24, uint64_t, fillNeon({2044404433372}, SVL / 8)); + CHECK_NEON(25, uint64_t, fillNeon({2207613190658}, SVL / 8)); + CHECK_NEON(26, uint64_t, fillNeon({2370821947944}, SVL / 8)); + CHECK_NEON(27, uint64_t, fillNeon({2534030705230}, SVL / 8)); +} + +TEST_P(InstSme, mova_tilesToVecs) { + // uint8_t; 4 vectors + initialHeapData_.resize(SVL / 4); + uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap32, src, SVL / 16); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + mov w12, #0 + ptrue p0.s + + # Pre-fill first 4 rows of za0.b + ld1w {za0h.s[w12, 0]}, p0/z, [x0] + ld1w {za1h.s[w12, 0]}, p0/z, [x0] + ld1w {za2h.s[w12, 0]}, p0/z, [x0] + ld1w {za3h.s[w12, 0]}, p0/z, [x0] + + + mova {z4.b-z7.b}, za0h.b[w12, 0:3] + + # Test Alias + mov w13, #1 + dup z11.b, #3 + mov {z8.b-z11.b}, za0h.b[w13, 0:3] + )"); + for (int i = 4; i <= 10; i++) { + CHECK_NEON( + i, uint8_t, + fillNeon({0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, 0x32, + 0x54, 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}, + SVL / 8)); + } + CHECK_NEON(11, uint8_t, fillNeon({0x00}, SVL / 8)); +} + +TEST_P(InstSme, fadd) { + // Float, VGx2 + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0f + fdup z1.s, #3.0 + fdup z2.s, #8.0 + ptrue p0.s + ptrue p1.s + fmopa za0.s, p0/m, p1/m, z1.s, z2.s + fmopa za1.s, p0/m, p1/m, z1.s, z2.s + fmopa za2.s, p0/m, p1/m, z1.s, z2.s + fmopa za3.s, p0/m, p1/m, z1.s, z2.s + + # initialise registers + mov w8, #1 + fdup z4.s, #-2.5 + fdup z5.s, #3.0 + + fadd za.s[w8, #1, vgx2], {z4.s, z5.s} + )"); + const uint16_t zaStride = (SVL / 8) / 2; + const uint16_t zaHalfIndex = 2; + for (uint16_t i = 0; i < (SVL / 8); i++) { + if (i == zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({21.5f}, (SVL / 8))); + } else if (i == zaStride + zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({27.0f}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0f throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({24.0f}, (SVL / 8))); + } + } + + // Double, VGx2 + initialHeapData_.resize(SVL / 8); + heap8 = reinterpret_cast(initialHeapData_.data()); + src = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0 + fdup z1.d, #3.0 + fdup z2.d, #8.0 + ptrue p0.d + ptrue p1.d + fmopa za0.d, p0/m, p1/m, z1.d, z2.d + fmopa za1.d, p0/m, p1/m, z1.d, z2.d + fmopa za2.d, p0/m, p1/m, z1.d, z2.d + fmopa za3.d, p0/m, p1/m, z1.d, z2.d + fmopa za4.d, p0/m, p1/m, z1.d, z2.d + fmopa za5.d, p0/m, p1/m, z1.d, z2.d + fmopa za6.d, p0/m, 
p1/m, z1.d, z2.d + fmopa za7.d, p0/m, p1/m, z1.d, z2.d + + + # initialise registers + mov w8, #1 + fdup z4.d, #-2.5 + fdup z5.d, #3.0 + + fadd za.d[w8, #1, vgx2], {z4.d, z5.d} + )"); + for (uint16_t i = 0; i < (SVL / 8); i++) { + if (i == zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({21.5}, (SVL / 8))); + } else if (i == zaStride + zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({27.0}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0f throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.0}, (SVL / 8))); + } + } +} + +TEST_P(InstSme, fmla_multiVecs) { + // float, vgx4 + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0f + fdup z1.s, #3.0 + fdup z2.s, #8.0 + ptrue p0.s + ptrue p1.s + fmopa za0.s, p0/m, p1/m, z1.s, z2.s + fmopa za1.s, p0/m, p1/m, z1.s, z2.s + fmopa za2.s, p0/m, p1/m, z1.s, z2.s + fmopa za3.s, p0/m, p1/m, z1.s, z2.s + + # initialise registers + mov w8, #1 + fdup z4.s, #0.25 + fdup z5.s, #1.5 + fdup z6.s, #-0.5 + fdup z7.s, #-2.5 + fdup z8.s, #3.0 + fdup z9.s, #4.0 + fdup z10.s, #5.0 + fdup z11.s, #6.0 + + fmla za.s[w8, #1, vgx4], {z4.s - z7.s}, {z8.s - z11.s} + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint16_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm value of 2.0f + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({24.75f}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({30.0f}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({21.5f}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({9.0f}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0f throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({24.0f}, (SVL / 8))); + } + } + + // double, vgx4 + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0 + fdup z1.d, #3.0 + fdup z2.d, #8.0 + ptrue p0.d + ptrue p1.d + fmopa za0.d, p0/m, p1/m, z1.d, z2.d + fmopa za1.d, p0/m, p1/m, z1.d, z2.d + fmopa za2.d, p0/m, p1/m, z1.d, z2.d + fmopa za3.d, p0/m, p1/m, z1.d, z2.d + fmopa za4.d, p0/m, p1/m, z1.d, z2.d + fmopa za5.d, p0/m, p1/m, z1.d, z2.d + fmopa za6.d, p0/m, p1/m, z1.d, z2.d + fmopa za7.d, p0/m, p1/m, z1.d, z2.d + + # initialise registers + mov w8, #1 + fdup z4.d, #0.25 + fdup z5.d, #1.5 + fdup z6.d, #-0.5 + fdup z7.d, #-2.5 + fdup z8.d, #3.0 + fdup z9.d, #4.0 + fdup z10.d, #5.0 + fdup z11.d, #6.0 + + fmla za.d[w8, #1, vgx4], {z4.d - z7.d}, {z8.d - z11.d} + )"); + for (uint16_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm value of 2.0 + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.75}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({30.0}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({21.5}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({9.0}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.0}, (SVL / 8))); + } + } +} + 
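+// The checks above follow directly from the vgx4 semantics: the r-th selected
+// ZA row accumulates zn[r] * zm[r] element-wise on top of the pre-filled 24.0.
+// A minimal sketch of that arithmetic (illustrative only, not part of the
+// test suite):
+//
+//   const double zn[4] = {0.25, 1.5, -0.5, -2.5};
+//   const double zm[4] = {3.0, 4.0, 5.0, 6.0};
+//   double expected[4];
+//   for (int r = 0; r < 4; r++)
+//     expected[r] = 24.0 + zn[r] * zm[r];  // 24.75, 30.0, 21.5, 9.0
+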
+TEST_P(InstSme, fmla_indexed_vgx4) { + // float + initialHeapData_.resize(SVL); + float* heapf = reinterpret_cast(initialHeapData_.data()); + std::vector srcf = {0.0f, 1.0f, 2.0f, 3.0f}; + fillHeap(heapf, srcf, SVL / 4); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0f + fdup z1.s, #3.0 + fdup z2.s, #8.0 + ptrue p0.s + ptrue p1.s + fmopa za0.s, p0/m, p1/m, z1.s, z2.s + fmopa za1.s, p0/m, p1/m, z1.s, z2.s + fmopa za2.s, p0/m, p1/m, z1.s, z2.s + fmopa za3.s, p0/m, p1/m, z1.s, z2.s + + # initialise registers + mov w8, #1 + fdup z4.s, #0.25 + fdup z5.s, #1.5 + fdup z6.s, #-0.5 + fdup z7.s, #-2.5 + ld1w {z10.s}, p0/z, [x0] + + fmla za.s[w8, #1, vgx4], {z4.s - z7.s}, z10.s[2] + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint16_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm value of 2.0f + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({24.5f}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({27.0f}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({23.0f}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({19.0f}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0f throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({24.0f}, (SVL / 8))); + } + } + + // double + initialHeapData_.resize(SVL); + double* heapd = reinterpret_cast(initialHeapData_.data()); + std::vector srcd = {2.0f, 3.0f}; + fillHeap(heapd, srcd, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0f + fdup z1.d, #3.0 + fdup z2.d, #8.0 + ptrue p0.d + ptrue p1.d + fmopa za0.d, p0/m, p1/m, z1.d, z2.d + fmopa za1.d, p0/m, p1/m, z1.d, z2.d + fmopa za2.d, p0/m, p1/m, z1.d, z2.d + fmopa za3.d, p0/m, p1/m, z1.d, z2.d + fmopa za4.d, p0/m, p1/m, z1.d, z2.d + fmopa za5.d, p0/m, p1/m, z1.d, z2.d + fmopa za6.d, p0/m, p1/m, z1.d, z2.d + fmopa za7.d, p0/m, p1/m, z1.d, z2.d + + # initialise registers + mov w8, #1 + fdup z4.d, #0.25 + fdup z5.d, #1.5 + fdup z6.d, #-0.5 + fdup z7.d, #-2.5 + ld1d {z10.d}, p0/z, [x0] + + fmla za.d[w8, #1, vgx4], {z4.d - z7.d}, z10.d[0] + )"); + for (uint16_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm value of 2.0f + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.5}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({27.0}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({23.0}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({19.0}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.0}, (SVL / 8))); + } + } +} TEST_P(InstSme, addva) { // 32-bit RUN_AARCH64(R"( @@ -1170,6 +1706,21 @@ TEST_P(InstSme, mova_q_vecToTile) { } } +TEST_P(InstSme, rdsvl) { + RUN_AARCH64(R"( + rdsvl x0, #-32 + rdsvl x1, #-3 + rdsvl x2, #0 + rdsvl x3, #3 + rdsvl x4, #31 +)"); + EXPECT_EQ(getGeneralRegister(0), (SVL / 8) * -32); + EXPECT_EQ(getGeneralRegister(1), (SVL / 8) * -3); + EXPECT_EQ(getGeneralRegister(2), 0); 
+ EXPECT_EQ(getGeneralRegister(3), (SVL / 8) * 3); + EXPECT_EQ(getGeneralRegister(4), (SVL / 8) * 31); +} + TEST_P(InstSme, fmopa) { // 32-bit RUN_AARCH64(R"( @@ -3410,13 +3961,207 @@ TEST_P(InstSme, usmops) { } } +TEST_P(InstSme, udot_Indexed_vgx4) { + // 8-bit to 32-bit widening + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # initialise registers + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z10.b}, p0/z, [x0] + + udot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint16_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm values of {8, 9, 10, 11} + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({476}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({514}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({552}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({590}, (SVL / 8))); + } else { + // un-effected rows should still be 96 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({96}, (SVL / 8))); + } + } +} + +TEST_P(InstSme, udot_vgx4) { + // 8-bit to 32-bit widening + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # initialise registers + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z8.b}, p0/z, [x0] + ld1b {z9.b}, p0/z, [x0] + ld1b {z10.b}, p0/z, [x0] + ld1b {z11.b}, p0/z, [x0] + + udot za.s[w8, #1, vgx4], {z4.b - z7.b}, {z8.b - z11.b} + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint16_t i = 0; i < (SVL / 8); i++) { + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({156, 316, 476, 636}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({162, 338, 514, 690}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({168, 360, 552, 744}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({174, 382, 590, 798}, (SVL / 8))); + } else { + // un-effected rows should still be 96 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({96}, (SVL / 8))); + } + } +} + 
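+// In the udot test above, every byte of a 32-bit zm element is paired with the
+// single zn value of that ZA row; in the uvdot test below, the i-th selected
+// zm byte is instead paired with the i-th zn vector, so all four rows
+// accumulate the same total. A minimal sketch of the expected value used
+// below (illustrative only; index 2 selects the {8, 9, 10, 11} bytes of each
+// 128-bit zm segment):
+//
+//   const uint32_t zn[4] = {10, 11, 12, 13};
+//   const uint32_t zm[4] = {8, 9, 10, 11};
+//   uint32_t expected = 96;  // pre-filled ZA value
+//   for (int i = 0; i < 4; i++) expected += zn[i] * zm[i];  // 538
+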
+TEST_P(InstSme, uvdot_indexed_vgx4) { + // 8-bit to 32-bit widening + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # initialise registers + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z10.b}, p0/z, [x0] + + uvdot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint16_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm values of {8, 9, 10, 11} + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({538}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({538}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({538}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({538}, (SVL / 8))); + } else { + // un-effected rows should still be 96 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({96}, (SVL / 8))); + } + } +} + TEST_P(InstSme, zero) { + // ZT0 + RUN_AARCH64(R"( + smstart + + zero {zt0} + )"); + CHECK_TABLE(uint64_t, {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}); + + // ZA tiles RUN_AARCH64(R"( smstart zero {za} )"); - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint64_t, fillNeon({0}, SVL / 8)); } @@ -3453,7 +4198,7 @@ TEST_P(InstSme, zero) { zero {za0.s, za2.s} )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { + for (uint16_t i = 0; i < (SVL / 32); i++) { CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, fillNeon({0}, SVL / 8)); CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, @@ -3467,6 +4212,7 @@ TEST_P(InstSme, zero) { INSTANTIATE_TEST_SUITE_P(AArch64, InstSme, ::testing::ValuesIn(genCoreTypeSVLPairs(EMULATION)), paramToString); + #else GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InstSme); #endif diff --git a/test/regression/aarch64/instructions/store.cc b/test/regression/aarch64/instructions/store.cc index 6d6876b49..6a8136da3 100644 --- a/test/regression/aarch64/instructions/store.cc +++ b/test/regression/aarch64/instructions/store.cc @@ -437,6 +437,26 @@ TEST_P(InstStore, st1_multi_struct) { } } + // one reg, 4s elements (post offset only) + RUN_AARCH64(R"( + mov x0, #32 + movi v0.4s, #1 + sub sp, sp, #96 + st1 {v0.4s}, [sp], #16 + st1 {v0.4s}, [sp], x0 + )"); + const uint64_t sp = process_->getInitialStackPointer(); + EXPECT_EQ(getGeneralRegister(31), sp - 48); + EXPECT_EQ(getMemoryValue(sp - 96), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 92), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 88), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 84), static_cast(1)); + + EXPECT_EQ(getMemoryValue(sp - 80), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 76), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 72), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 68), static_cast(1)); + // two reg, 4s elements 
RUN_AARCH64(R"( mov x0, #32 diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 6a52d46b9..9411ef008 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -2852,6 +2852,84 @@ TEST_P(InstSve, fadda) { CHECK_NEON(3, double, {resultB, 0}); } +TEST_P(InstSve, faddv) { + // float + initialHeapData_.resize(VL / 8); + float* fheap = reinterpret_cast(initialHeapData_.data()); + std::vector fsrc = { + 1.0f, -42.76f, -0.125f, 0.0f, 40.26f, -684.72f, -0.15f, 107.86f, + -34.71f, -0.917f, 0.0f, 80.72f, -125.67f, -0.01f, 701.90f, 7.0f}; + fillHeap(fheap, fsrc, VL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x2, xzr + mov x3, xzr + mov x4, #4 + mov x5, #2 + addvl x3, x3, #1 + sdiv x3, x3, x4 + sdiv x2, x3, x5 + + ptrue p0.s + whilelo p1.s, xzr, x2 + + ld1w {z0.s}, p0/z, [x0] + + faddv s3, p0, z0.s + faddv s4, p1, z0.s + )"); + float s3 = 0.0f; + float s4 = 0.0f; + for (uint64_t i = 0; i < VL / 32; i++) { + s3 += fsrc[i % (fsrc.size())]; + if (i < (VL / 64)) s4 += fsrc[i % (fsrc.size())]; + } + CHECK_NEON(3, float, {s3, 0.0f, 0.0f, 0.0f}); + CHECK_NEON(4, float, {s4, 0.0f, 0.0f, 0.0f}); + + // double + initialHeapData_.resize(VL); + double* dheap = reinterpret_cast(initialHeapData_.data()); + std::vector dsrc = {1.0, -42.76, -0.125, 0.0, 40.26, -684.72, + -0.15, 107.86, -34.71, -0.917, 0.0, 80.72, + -125.67, -0.01, 701.90, 7.0}; + fillHeap(dheap, dsrc, VL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x2, xzr + mov x3, xzr + mov x4, #8 + mov x5, #2 + addvl x3, x3, #1 + sdiv x3, x3, x4 + sdiv x2, x3, x5 + + ptrue p0.d + whilelo p1.d, xzr, x2 + + ld1d {z0.d}, p0/z, [x0] + + faddv d3, p0, z0.d + faddv d4, p1, z0.d + )"); + double d3 = 0.0; + double d4 = 0.0; + for (uint64_t i = 0; i < (VL / 64); i++) { + d3 += dsrc[i % (dsrc.size())]; + if (i < (VL / 128)) d4 += dsrc[i % (dsrc.size())]; + } + CHECK_NEON(3, double, {d3, 0.0}); + CHECK_NEON(4, double, {d4, 0.0}); +} + TEST_P(InstSve, fcmge) { // double initialHeapData_.resize(VL / 16); @@ -4641,6 +4719,84 @@ TEST_P(InstSve, ld1rd) { CHECK_NEON(3, uint64_t, fillNeon({0x12345678}, VL / 16)); } +TEST_P(InstSve, ld1rqb) { + initialHeapData_.resize(32); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap64, + {0x12345678DEADBEEF, 0xABCDEF0198765432, + 0xABBACAFEFEDCBA98, 0xFEEDABCDBEADCABB}, + 4); + // Imm offset + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + # Load and broadcast values from heap + ptrue p0.b + ld1rqb {z0.b}, p0/z, [x0] + ld1rqb {z1.b}, p0/z, [x0, #16] + + # Test for inactive lanes + ptrue p1.b, vl1 + ld1rqb {z2.b}, p1/z, [x0] + add x0, x0, #32 + ld1rqb {z3.b}, p1/z, [x0, #-16] + )"); + CHECK_NEON(0, uint8_t, + fillNeon({0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, + 0x32, 0x54, 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}, + VL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon({0x98, 0xBA, 0xDC, 0xFE, 0xFE, 0xCA, 0xBA, 0xAB, + 0xBB, 0xCA, 0xAD, 0xBE, 0xCD, 0xAB, 0xED, 0xFE}, + VL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon({0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + VL / 8)); + CHECK_NEON(3, uint8_t, + fillNeon({0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + VL / 8)); + + // Reg offset + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + # Load and broadcast values from heap 
+ ptrue p0.b + mov x1, #16 + ld1rqb {z0.b}, p0/z, [x0] + ld1rqb {z1.b}, p0/z, [x0, x1] + + # Test for inactive lanes + ptrue p1.b, vl1 + ld1rqb {z2.b}, p1/z, [x0] + ld1rqb {z3.b}, p1/z, [x0, x1] + )"); + CHECK_NEON(0, uint8_t, + fillNeon({0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, + 0x32, 0x54, 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}, + VL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon({0x98, 0xBA, 0xDC, 0xFE, 0xFE, 0xCA, 0xBA, 0xAB, + 0xBB, 0xCA, 0xAD, 0xBE, 0xCD, 0xAB, 0xED, 0xFE}, + VL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon({0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + VL / 8)); + CHECK_NEON(3, uint8_t, + fillNeon({0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + VL / 8)); +} + TEST_P(InstSve, ld1rqd) { initialHeapData_.resize(32); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); @@ -4737,6 +4893,7 @@ TEST_P(InstSve, ld1rw) { } TEST_P(InstSve, ld1b) { + // Single vector initialHeapData_.resize(VL / 4); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); std::vector src = {0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, @@ -4774,6 +4931,460 @@ TEST_P(InstSve, ld1b) { VL / 16)); std::rotate(src.begin(), src.begin() + ((VL / 8) % 16), src.end()); CHECK_NEON(2, uint8_t, fillNeon(src, VL / 16)); + + // Multi vector + initialHeapData_.resize(VL); + uint8_t* heap8_multi = reinterpret_cast(initialHeapData_.data()); + std::vector src_multi = {0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, + 0x34, 0x12, 0x32, 0x54, 0x76, 0x98, + 0x01, 0xEF, 0xCD, 0xAB}; + fillHeap(heap8_multi, src_multi, VL); + + // Two vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.b, #1 + dup z1.b, #2 + dup z2.b, #3 + dup z3.b, #4 + + ptrue pn8.b + mov x1, #2 + + ld1b {z0.b, z1.b}, pn8/z, [x0, #2, mul vl] + ld1b {z2.b, z3.b}, pn8/z, [x0, x1] + )"); + uint16_t base = (VL / 8) * 2; + uint16_t offset = (VL / 8); + CHECK_NEON(0, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon({src[2], src[3], src[4], src[5], src[6], src[7], + src[8], src[9], src[10], src[11], src[12], + src[13], src[14], src[15], src[0], src[1]}, + VL / 8)); + CHECK_NEON( + 3, uint8_t, + fillNeon({src[(2 + offset) % 16], src[(3 + offset) % 16], + src[(4 + offset) % 16], src[(5 + offset) % 16], + src[(6 + offset) % 16], src[(7 + offset) % 16], + src[(8 + offset) % 16], src[(9 + offset) % 16], + src[(10 + offset) % 16], src[(11 + offset) % 16], + src[(12 + offset) % 16], src[(13 + offset) % 
16], + src[(14 + offset) % 16], src[(15 + offset) % 16], + src[(0 + offset) % 16], src[(1 + offset) % 16]}, + VL / 8)); + + // Four vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.b, #1 + dup z1.b, #2 + dup z2.b, #3 + dup z3.b, #4 + + ptrue pn8.b + + mov x1, #4 + ld1b {z0.b - z3.b}, pn8/z, [x0, #4, mul vl] + ld1b {z4.b - z7.b}, pn8/z, [x0, x1] + ld1b {z16.b, z20.b, z24.b, z28.b}, pn8/z, [x0, #4, mul vl] + ld1b {z17.b, z21.b, z25.b, z29.b}, pn8/z, [x0, x1] + )"); + base = (VL / 8) * 4; + offset = (VL / 8); + // Consecutive vectors + CHECK_NEON(0, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon( + { + src[((base + (2 * offset))) % 16], + src[((base + (2 * offset)) + 1) % 16], + src[((base + (2 * offset)) + 2) % 16], + src[((base + (2 * offset)) + 3) % 16], + src[((base + (2 * offset)) + 4) % 16], + src[((base + (2 * offset)) + 5) % 16], + src[((base + (2 * offset)) + 6) % 16], + src[((base + (2 * offset)) + 7) % 16], + src[((base + (2 * offset)) + 8) % 16], + src[((base + (2 * offset)) + 9) % 16], + src[((base + (2 * offset)) + 10) % 16], + src[((base + (2 * offset)) + 11) % 16], + src[((base + (2 * offset)) + 12) % 16], + src[((base + (2 * offset)) + 13) % 16], + src[((base + (2 * offset)) + 14) % 16], + src[((base + (2 * offset)) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(3, uint8_t, + fillNeon( + { + src[((base + (3 * offset))) % 16], + src[((base + (3 * offset)) + 1) % 16], + src[((base + (3 * offset)) + 2) % 16], + src[((base + (3 * offset)) + 3) % 16], + src[((base + (3 * offset)) + 4) % 16], + src[((base + (3 * offset)) + 5) % 16], + src[((base + (3 * offset)) + 6) % 16], + src[((base + (3 * offset)) + 7) % 16], + src[((base + (3 * offset)) + 8) % 16], + src[((base + (3 * offset)) + 9) % 16], + src[((base + (3 * offset)) + 10) % 16], + src[((base + (3 * offset)) + 11) % 16], + src[((base + (3 * offset)) + 12) % 16], + src[((base + (3 * offset)) + 13) % 16], + src[((base + (3 * offset)) + 14) % 16], + src[((base + (3 * offset)) + 15) % 16], + }, + VL / 8)); + base = 4; + offset = (VL / 8); + CHECK_NEON(4, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + 
CHECK_NEON(4, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(6, uint8_t, + fillNeon( + { + src[((base + (2 * offset))) % 16], + src[((base + (2 * offset)) + 1) % 16], + src[((base + (2 * offset)) + 2) % 16], + src[((base + (2 * offset)) + 3) % 16], + src[((base + (2 * offset)) + 4) % 16], + src[((base + (2 * offset)) + 5) % 16], + src[((base + (2 * offset)) + 6) % 16], + src[((base + (2 * offset)) + 7) % 16], + src[((base + (2 * offset)) + 8) % 16], + src[((base + (2 * offset)) + 9) % 16], + src[((base + (2 * offset)) + 10) % 16], + src[((base + (2 * offset)) + 11) % 16], + src[((base + (2 * offset)) + 12) % 16], + src[((base + (2 * offset)) + 13) % 16], + src[((base + (2 * offset)) + 14) % 16], + src[((base + (2 * offset)) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(7, uint8_t, + fillNeon( + { + src[((base + (3 * offset))) % 16], + src[((base + (3 * offset)) + 1) % 16], + src[((base + (3 * offset)) + 2) % 16], + src[((base + (3 * offset)) + 3) % 16], + src[((base + (3 * offset)) + 4) % 16], + src[((base + (3 * offset)) + 5) % 16], + src[((base + (3 * offset)) + 6) % 16], + src[((base + (3 * offset)) + 7) % 16], + src[((base + (3 * offset)) + 8) % 16], + src[((base + (3 * offset)) + 9) % 16], + src[((base + (3 * offset)) + 10) % 16], + src[((base + (3 * offset)) + 11) % 16], + src[((base + (3 * offset)) + 12) % 16], + src[((base + (3 * offset)) + 13) % 16], + src[((base + (3 * offset)) + 14) % 16], + src[((base + (3 * offset)) + 15) % 16], + }, + VL / 8)); + // Strided (4-stride) vectors + base = (VL / 8) * 4; + offset = (VL / 8); + CHECK_NEON(16, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(20, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(24, uint8_t, + fillNeon( + { + src[((base + (2 * offset))) % 16], + src[((base + (2 * offset)) + 1) % 16], + src[((base + (2 * offset)) + 2) % 16], + src[((base + (2 * offset)) + 3) % 16], + src[((base + (2 * offset)) + 4) % 16], + src[((base + (2 * offset)) + 5) % 16], + src[((base + (2 * offset)) + 6) % 16], + src[((base + (2 * offset)) + 7) % 
16], + src[((base + (2 * offset)) + 8) % 16], + src[((base + (2 * offset)) + 9) % 16], + src[((base + (2 * offset)) + 10) % 16], + src[((base + (2 * offset)) + 11) % 16], + src[((base + (2 * offset)) + 12) % 16], + src[((base + (2 * offset)) + 13) % 16], + src[((base + (2 * offset)) + 14) % 16], + src[((base + (2 * offset)) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(28, uint8_t, + fillNeon( + { + src[((base + (3 * offset))) % 16], + src[((base + (3 * offset)) + 1) % 16], + src[((base + (3 * offset)) + 2) % 16], + src[((base + (3 * offset)) + 3) % 16], + src[((base + (3 * offset)) + 4) % 16], + src[((base + (3 * offset)) + 5) % 16], + src[((base + (3 * offset)) + 6) % 16], + src[((base + (3 * offset)) + 7) % 16], + src[((base + (3 * offset)) + 8) % 16], + src[((base + (3 * offset)) + 9) % 16], + src[((base + (3 * offset)) + 10) % 16], + src[((base + (3 * offset)) + 11) % 16], + src[((base + (3 * offset)) + 12) % 16], + src[((base + (3 * offset)) + 13) % 16], + src[((base + (3 * offset)) + 14) % 16], + src[((base + (3 * offset)) + 15) % 16], + }, + VL / 8)); + base = 4; + offset = (VL / 8); + CHECK_NEON(17, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(21, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(25, uint8_t, + fillNeon( + { + src[((base + (2 * offset))) % 16], + src[((base + (2 * offset)) + 1) % 16], + src[((base + (2 * offset)) + 2) % 16], + src[((base + (2 * offset)) + 3) % 16], + src[((base + (2 * offset)) + 4) % 16], + src[((base + (2 * offset)) + 5) % 16], + src[((base + (2 * offset)) + 6) % 16], + src[((base + (2 * offset)) + 7) % 16], + src[((base + (2 * offset)) + 8) % 16], + src[((base + (2 * offset)) + 9) % 16], + src[((base + (2 * offset)) + 10) % 16], + src[((base + (2 * offset)) + 11) % 16], + src[((base + (2 * offset)) + 12) % 16], + src[((base + (2 * offset)) + 13) % 16], + src[((base + (2 * offset)) + 14) % 16], + src[((base + (2 * offset)) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(29, uint8_t, + fillNeon( + { + src[((base + (3 * offset))) % 16], + src[((base + (3 * offset)) + 1) % 16], + src[((base + (3 * offset)) + 2) % 16], + src[((base + (3 * offset)) + 3) % 16], + src[((base + (3 * offset)) + 4) % 16], + src[((base + (3 * offset)) + 5) % 16], + src[((base + (3 * offset)) + 6) % 16], + src[((base + (3 * offset)) + 7) % 16], + src[((base + (3 * offset)) + 8) % 16], + src[((base + (3 * offset)) + 9) % 16], + src[((base + (3 * offset)) + 10) % 16], + src[((base + (3 * offset)) + 11) % 16], + src[((base + (3 * offset)) + 12) % 16], + src[((base + (3 * offset)) + 13) % 16], + src[((base + (3 * offset)) + 14) % 16], + 
src[((base + (3 * offset)) + 15) % 16], + }, + VL / 8)); } TEST_P(InstSve, ld1sw_gather) { @@ -4907,6 +5518,7 @@ TEST_P(InstSve, ld1d_gather) { } TEST_P(InstSve, ld1d) { + // Single vector initialHeapData_.resize(VL / 4); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; @@ -4948,9 +5560,111 @@ TEST_P(InstSve, ld1d) { fillNeon({src[(base) % 4], src[(base + 1) % 4], src[(base + 2) % 4], src[(base + 3) % 4]}, VL / 16)); + + // Multi vector + initialHeapData_.resize(VL); + uint64_t* heap64_multi = reinterpret_cast(initialHeapData_.data()); + std::vector src_multi = {0xDEADBEEF, 0x12345678, 0x98765432, + 0xABCDEF01}; + fillHeap(heap64_multi, src_multi, VL / 8); + + // Two vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.d, #1 + dup z1.d, #2 + + ptrue pn8.d + + ld1d {z0.d, z1.d}, pn8/z, [x0, #2, mul vl] + )"); + base = (VL / 64) * 2; + uint16_t offset = (VL / 64); + CHECK_NEON(0, uint64_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 1, uint64_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + + // Four vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.d, #1 + dup z1.d, #2 + dup z2.d, #3 + dup z3.d, #4 + + ptrue pn8.d + + ld1d {z0.d - z3.d}, pn8/z, [x0, #4, mul vl] + addvl x1, x1, #1 + mov x2, #2 + udiv x1, x1, x2 + ld1d {z4.d - z7.d}, pn8/z, [x0, x1, lsl #3] + )"); + base = (VL / 64) * 4; + offset = (VL / 64); + CHECK_NEON(0, uint64_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 1, uint64_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + CHECK_NEON(2, uint64_t, + fillNeon({src[((base + (offset * 2))) % 4], + src[((base + (offset * 2)) + 1) % 4], + src[((base + (offset * 2)) + 2) % 4], + src[((base + (offset * 2)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(3, uint64_t, + fillNeon({src[((base + (offset * 3))) % 4], + src[((base + (offset * 3)) + 1) % 4], + src[((base + (offset * 3)) + 2) % 4], + src[((base + (offset * 3)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(4, uint64_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 5, uint64_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + CHECK_NEON(6, uint64_t, + fillNeon({src[((base + (offset * 2))) % 4], + src[((base + (offset * 2)) + 1) % 4], + src[((base + (offset * 2)) + 2) % 4], + src[((base + (offset * 2)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(7, uint64_t, + fillNeon({src[((base + (offset * 3))) % 4], + src[((base + (offset * 3)) + 1) % 4], + src[((base + (offset * 3)) + 2) % 4], + src[((base + (offset * 3)) + 3) % 4]}, + VL / 8)); } TEST_P(InstSve, ld1h) { + // Single vector initialHeapData_.resize(VL / 4); uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); fillHeap( @@ -4968,6 +5682,7 @@ TEST_P(InstSve, ld1h) { ptrue p0.h # Load and broadcast values from heap ld1h {z0.h}, p0/z, [x0, x1, lsl #1] + ld1h {z2.h}, p0/z, [x0] # Test for inactive lanes mov x1, #0 @@ -4977,6 +5692,10 @@ TEST_P(InstSve, ld1h) { mov x2, #0 whilelo p1.h, xzr, x1 
ld1h {z1.h}, p1/z, [x0, x2, lsl #1] + + addvl x10, x10, #1 + add x10, x10, x0 + ld1h {z3.h}, p1/z, [x10, #-1, mul vl] )"); CHECK_NEON(0, uint16_t, fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, @@ -4986,14 +5705,67 @@ TEST_P(InstSve, ld1h) { fillNeonCombined({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, 0xABCD}, {0}, VL / 8)); + CHECK_NEON(2, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); + CHECK_NEON(3, uint16_t, + fillNeonCombined({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, + 0x9876, 0xEF01, 0xABCD}, + {0}, VL / 8)); + + // Multi vector + + // Two vector + initialHeapData_.resize(VL); + heap16 = reinterpret_cast(initialHeapData_.data()); + fillHeap( + heap16, {0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, 0xABCD}, + VL / 2); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ptrue pn8.h + mov x1, #1 + ld1h {z0.h, z1.h}, pn8/z, [x0] + ld1h {z2.h, z3.h}, pn8/z, [x0, x1, lsl #1] + ld1h {z4.h, z5.h}, pn8/z, [x0, #2, mul vl] + )"); + CHECK_NEON(0, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); + CHECK_NEON(1, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); + CHECK_NEON(2, uint16_t, + fillNeon({0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, + 0xABCD, 0xBEEF}, + VL / 8)); + CHECK_NEON(3, uint16_t, + fillNeon({0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, + 0xABCD, 0xBEEF}, + VL / 8)); + CHECK_NEON(4, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); + CHECK_NEON(5, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); } TEST_P(InstSve, ld1w) { + // Single vector initialHeapData_.resize(VL / 4); uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; fillHeap(heap32, src, VL / 16); - RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -5028,6 +5800,116 @@ TEST_P(InstSve, ld1w) { CHECK_NEON(3, uint64_t, fillNeonCombined( {0x12345678DEADBEEF, 0xABCDEF0198765432}, {0}, VL / 8)); + + // Multi vector + initialHeapData_.resize(VL); + uint32_t* heap32_multi = reinterpret_cast(initialHeapData_.data()); + std::vector src_multi = {0xDEADBEEF, 0x12345678, 0x98765432, + 0xABCDEF01}; + fillHeap(heap32_multi, src_multi, VL / 4); + + // Two vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.s, #1 + dup z1.s, #2 + dup z2.s, #3 + dup z3.s, #4 + + ptrue pn8.s + mov x1, #2 + + ld1w {z0.s, z1.s}, pn8/z, [x0, #2, mul vl] + ld1w {z2.s, z3.s}, pn8/z, [x0, x1, lsl #2] + )"); + uint16_t base = (VL / 32) * 2; + uint16_t offset = (VL / 32); + CHECK_NEON(0, uint32_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 1, uint32_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + + CHECK_NEON(2, uint32_t, + fillNeon({src[2], src[3], src[0], src[1]}, VL / 8)); + CHECK_NEON(3, uint32_t, + fillNeon({src[(2 + offset) % 4], src[(3 + offset) % 4], + src[(0 + offset) % 4], src[(1 + offset) % 4]}, + VL / 8)); + + // Four vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.s, #1 + dup z1.s, #2 + dup z2.s, #3 + dup z3.s, #4 + + ptrue pn8.s + addvl x1, x1, #1 + + ld1w {z0.s - z3.s}, pn8/z, [x0, #4, 
mul vl] + ld1w {z4.s - z7.s}, pn8/z, [x0, x1, lsl #2] + )"); + base = (VL / 32) * 4; + offset = (VL / 32); + CHECK_NEON(0, uint32_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 1, uint32_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + CHECK_NEON(2, uint32_t, + fillNeon({src[((base + (offset * 2))) % 4], + src[((base + (offset * 2)) + 1) % 4], + src[((base + (offset * 2)) + 2) % 4], + src[((base + (offset * 2)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(3, uint32_t, + fillNeon({src[((base + (offset * 3))) % 4], + src[((base + (offset * 3)) + 1) % 4], + src[((base + (offset * 3)) + 2) % 4], + src[((base + (offset * 3)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(4, uint32_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 5, uint32_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + CHECK_NEON(6, uint32_t, + fillNeon({src[((base + (offset * 2))) % 4], + src[((base + (offset * 2)) + 1) % 4], + src[((base + (offset * 2)) + 2) % 4], + src[((base + (offset * 2)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(7, uint32_t, + fillNeon({src[((base + (offset * 3))) % 4], + src[((base + (offset * 3)) + 1) % 4], + src[((base + (offset * 3)) + 2) % 4], + src[((base + (offset * 3)) + 3) % 4]}, + VL / 8)); } TEST_P(InstSve, ld2d) { @@ -5660,6 +6542,27 @@ TEST_P(InstSve, ptrue) { CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 2)); } +TEST_P(InstSve, ptrue_counter) { + RUN_AARCH64(R"( + ptrue pn8.s + ptrue pn9.d + ptrue pn10.b + ptrue pn11.h + )"); + const uint64_t ps = + 0b0000000000000000000000000000000000000000000000001000000000000100; + const uint64_t pd = + 0b0000000000000000000000000000000000000000000000001000000000001000; + const uint64_t pb = + 0b0000000000000000000000000000000000000000000000001000000000000001; + const uint64_t ph = + 0b0000000000000000000000000000000000000000000000001000000000000010; + CHECK_PREDICATE(8, uint64_t, {ps, 0x0, 0x0, 0x0}); + CHECK_PREDICATE(9, uint64_t, {pd, 0x0, 0x0, 0x0}); + CHECK_PREDICATE(10, uint64_t, {pb, 0x0, 0x0, 0x0}); + CHECK_PREDICATE(11, uint64_t, {ph, 0x0, 0x0, 0x0}); +} + TEST_P(InstSve, punpk) { RUN_AARCH64(R"( ptrue p0.b @@ -6385,8 +7288,73 @@ TEST_P(InstSve, st1d) { } } +TEST_P(InstSve, st1d_multivec) { + // Two vectors + initialHeapData_.resize(VL / 4); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap64, src, VL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + sub sp, sp, #4095 + mov x1, #1 + mov x4, #256 + madd x4, x4, x4, x4 + ptrue p0.d + ptrue pn8.d + ld1d {z0.d}, p0/z, [x0] + ld1d {z1.d}, p0/z, [x0, #1, mul vl] + st1d {z0.d, z1.d}, pn8, [sp] + st1d {z0.d, z1.d}, pn8, [x4, #4, mul vl] + st1d {z0.d, z1.d}, pn8, [x4, x1, lsl #3] + )"); + + for (uint64_t i = 0; i < (VL / 32); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 8)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + (4 * (VL / 8)) + (i * 8)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + 8 + (i * 8)), src[i % 4]); + } + + // Four vectors + initialHeapData_.resize(VL); + heap64 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap64, src, VL / 8); + 
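+  // Note: `madd x4, x4, x4, x4` with x4 = 256 gives x4 = 256 * 256 + 256 = 65792, so the
+  // `#8, mul vl` form below stores the four vectors starting at address 65792 + 8 * (VL / 8),
+  // which is the base used by the checks that follow.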
RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + sub sp, sp, #4095 + mov x1, #2 + mov x4, #256 + madd x4, x4, x4, x4 + ptrue p0.d + ptrue pn8.d + ld1d {z0.d}, p0/z, [x0] + ld1d {z1.d}, p0/z, [x0, #1, mul vl] + ld1d {z2.d}, p0/z, [x0, #2, mul vl] + ld1d {z3.d}, p0/z, [x0, #3, mul vl] + st1d {z0.d - z3.d}, pn8, [sp] + st1d {z0.d - z3.d}, pn8, [x4, #8, mul vl] + )"); + for (uint64_t i = 0; i < (VL / 16); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 8)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + (8 * (VL / 8)) + (i * 8)), + src[i % 4]); + } +} + TEST_P(InstSve, st2d) { - // 32-bit RUN_AARCH64(R"( ptrue p0.d mov x0, #0 @@ -6423,6 +7391,62 @@ TEST_P(InstSve, st2d) { } } +TEST_P(InstSve, st4w) { + // 32-bit + RUN_AARCH64(R"( + ptrue p0.s + mov x0, #0 + addvl x1, x0, #1 + mov x2, #8 + udiv x3, x1, x2 + whilelo p1.s, xzr, x3 + + sub sp, sp, #4095 + mov x6, #300 + + dup z0.s, #3 + dup z1.s, #4 + dup z2.s, #5 + dup z3.s, #6 + + st4w {z0.s - z3.s}, p0, [sp] + st4w {z0.s - z3.s}, p1, [x6, #4, mul vl] + addvl x7, x7, #3 + st4w {z0.s - z3.s}, p1, [x6, x7, lsl #2] + )"); + + for (uint64_t i = 0; i < (VL / 32); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (4 * i * 4)), + 3); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (4 * i * 4) + 4), + 4); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (4 * i * 4) + 8), + 5); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (4 * i * 4) + 12), + 6); + } + + int index = 4 * (VL / 8); + for (uint64_t i = 0; i < (VL / 64); i++) { + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4)), 3); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 4), 4); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 8), 5); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 12), 6); + } + + index = 12 * (VL / 8); + for (uint64_t i = 0; i < (VL / 64); i++) { + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4)), 3); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 4), 4); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 8), 5); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 12), 6); + } +} + TEST_P(InstSve, st1w_scatter) { // 32-bit RUN_AARCH64(R"( @@ -6603,6 +7627,71 @@ TEST_P(InstSve, st1w) { } } +TEST_P(InstSve, st1w_multivec) { + // Two vectors + initialHeapData_.resize(VL / 4); + uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap32, src, VL / 16); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + sub sp, sp, #4095 + mov x1, #2 + mov x4, #256 + madd x4, x4, x4, x4 + ptrue p0.s + ptrue pn8.s + ld1w {z0.s}, p0/z, [x0] + ld1w {z1.s}, p0/z, [x0, #1, mul vl] + st1w {z0.s, z1.s}, pn8, [sp] + st1w {z0.s, z1.s}, pn8, [x4, #4, mul vl] + st1w {z0.s, z1.s}, pn8, [x4, x1, lsl #2] + )"); + for (uint64_t i = 0; i < (VL / 16); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 4)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + (4 * (VL / 8)) + (i * 4)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + 8 + (i * 4)), src[i % 4]); + } + + // Four vectors + initialHeapData_.resize(VL); + heap32 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap32, src, VL / 4); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + sub sp, sp, #4095 + mov x1, #2 + mov x4, #256 + madd x4, x4, x4, x4 + ptrue p0.s + 
ptrue pn8.s + ld1w {z0.s}, p0/z, [x0] + ld1w {z1.s}, p0/z, [x0, #1, mul vl] + ld1w {z2.s}, p0/z, [x0, #2, mul vl] + ld1w {z3.s}, p0/z, [x0, #3, mul vl] + st1w {z0.s - z3.s}, pn8, [sp] + st1w {z0.s - z3.s}, pn8, [x4, #8, mul vl] + )"); + for (uint64_t i = 0; i < (VL / 8); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 4)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + (8 * (VL / 8)) + (i * 4)), + src[i % 4]); + } +} + TEST_P(InstSve, str_predicate) { initialHeapData_.resize(VL / 64); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); @@ -7020,6 +8109,54 @@ TEST_P(InstSve, uaddv) { CHECK_NEON(3, uint64_t, {(9 * (VL / 128)), 0}); } +TEST_P(InstSve, udot) { + // udot by element + initialHeapData_.resize(16); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0xDEADBEEFFFFF00FF; + heap64[1] = 0x01234567ABBACAFE; + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ptrue p0.b + ld1rqb { z0.b }, p0/z, [x0] + + dup z2.b, #2 + dup z3.b, #3 + dup z4.s, #4 + dup z5.s, #5 + + udot z4.s, z2.b, z0.b[0] + udot z5.s, z3.b, z0.b[3] + )"); + CHECK_NEON(4, uint32_t, fillNeon({1534}, VL / 8)); + CHECK_NEON(5, uint32_t, fillNeon({629}, VL / 8)); + + // udot by vector - 4-way + initialHeapData_.resize(16); + heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0xDEADBEEFFFFF00FF; + heap64[1] = 0x01234567ABBACAFE; + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ptrue p0.b + ld1rqb { z0.b }, p0/z, [x0] + + dup z2.b, #2 + dup z4.s, #4 + + udot z4.s, z2.b, z0.b + )"); + CHECK_NEON(4, uint32_t, fillNeon({1534, 1652, 1630, 420}, VL / 8)); +} + TEST_P(InstSve, uqdec) { // d arrangement RUN_AARCH64(R"( @@ -7983,14 +9120,12 @@ TEST_P(InstSve, zip_pred) { } TEST_P(InstSve, zip) { - // d arrangement RUN_AARCH64(R"( # 64-bit fdup z0.d, #0.5 fdup z1.d, #-0.5 fdup z2.d, #0.75 fdup z3.d, #-0.75 - zip1 z4.d, z0.d, z1.d zip2 z5.d, z2.d, z3.d @@ -8001,16 +9136,37 @@ TEST_P(InstSve, zip) { fdup z9.s, #0.75 zip1 z10.s, z6.s, z7.s zip2 z11.s, z8.s, z9.s - )"); + # 8-bit + dup z12.b, #1 + dup z13.b, #-2 + dup z14.b, #-1 + dup z15.b, #2 + zip1 z16.b, z12.b, z13.b + zip2 z17.b, z14.b, z15.b + )"); CHECK_NEON(4, double, fillNeon({0.5, -0.5}, VL / 8)); CHECK_NEON(5, double, fillNeon({0.75, -0.75}, VL / 8)); CHECK_NEON(10, float, fillNeon({0.5, -0.75}, VL / 8)); CHECK_NEON(11, float, fillNeon({-0.5, 0.75}, VL / 8)); + CHECK_NEON(16, int8_t, fillNeon({1, -2}, VL / 8)); + CHECK_NEON(17, int8_t, fillNeon({-1, 2}, VL / 8)); + + // Multi-vector + RUN_AARCH64(R"( + #32-bit + dup z0.s, #5 + dup z1.s, #6 + dup z2.s, #7 + dup z3.s, #8 + zip {z4.s - z7.s}, {z0.s - z3.s} + )"); + CHECK_NEON(4, uint32_t, fillNeon({5, 6, 7, 8}, VL / 8)); + CHECK_NEON(5, uint32_t, fillNeon({5, 6, 7, 8}, VL / 8)); + CHECK_NEON(6, uint32_t, fillNeon({5, 6, 7, 8}, VL / 8)); + CHECK_NEON(7, uint32_t, fillNeon({5, 6, 7, 8}, VL / 8)); } -#if SIMENG_LLVM_VERSION >= 14 -// If LLVM version supports SVE2 : TEST_P(InstSve, psel) { RUN_AARCH64(R"( mov w13, #0 @@ -8044,7 +9200,6 @@ TEST_P(InstSve, psel) { CHECK_PREDICATE(14, uint64_t, fillPred(VL / 8, {0}, 4)); CHECK_PREDICATE(15, uint64_t, fillPred(VL / 8, {0}, 8)); } -#endif INSTANTIATE_TEST_SUITE_P(AArch64, InstSve, ::testing::ValuesIn(genCoreTypeVLPairs(EMULATION)), diff --git a/test/unit/aarch64/ArchInfoTest.cc b/test/unit/aarch64/ArchInfoTest.cc index 39e25a0bd..a2b41a9ec 100644 --- a/test/unit/aarch64/ArchInfoTest.cc +++ b/test/unit/aarch64/ArchInfoTest.cc @@ -23,7 +23,8 @@ 
class AArch64ArchInfoTest : public ::testing::Test {
       aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1,
       aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0,
       aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0,
-      aarch64_sysreg::AARCH64_SYSREG_SVCR};
+      aarch64_sysreg::AARCH64_SYSREG_SVCR,
+      aarch64_sysreg::AARCH64_SYSREG_TPIDR2_EL0};
 
   const std::vector archRegStruct = {
       {8, 32},
diff --git a/test/unit/aarch64/InstructionTest.cc b/test/unit/aarch64/InstructionTest.cc
index 8d4b0d87f..53041905e 100644
--- a/test/unit/aarch64/InstructionTest.cc
+++ b/test/unit/aarch64/InstructionTest.cc
@@ -602,6 +602,39 @@ TEST_F(AArch64InstructionTest, setters) {
   EXPECT_TRUE(insn.isWaitingCommit());
 }
 
+// Test predAsCounterToMasks function.
+TEST_F(AArch64InstructionTest, predAsCounterToMasks_test) {
+  // 1.5 full vectors from start, VL = 128b, uint8_t elem size
+  std::vector<std::array<uint64_t, 4>> ref(2, {0, 0, 0, 0});
+  ref[0][0] =
+      0b0000000000000000000000000000000000000000000000001111111111111111;
+  ref[1][0] =
+      0b0000000000000000000000000000000000000000000000000000000011111111;
+  // invert = 0, num active Elems = 24
+  uint64_t pn =
+      0b0000000000000000000000000000000000000000000000000000000000110001;
+  auto out = predAsCounterToMasks<uint8_t, 2>(pn, 128);
+  EXPECT_EQ(out[0][0], ref[0][0]);
+  EXPECT_EQ(out[1][0], ref[1][0]);
+
+  // 0.5 of last vector, VL = 1024b, uint64_t elem size
+  std::vector<std::array<uint64_t, 4>> ref2(4, {0, 0, 0, 0});
+  ref2[3][1] =
+      0b0000000100000001000000010000000100000001000000010000000100000001;
+  // Invert = 1, num inactive Elems = 56
+  uint64_t pn2 =
+      0b0000000000000000000000000000000000000000000000001000001110001000;
+  auto out2 = predAsCounterToMasks<uint64_t, 4>(pn2, 1024);
+  EXPECT_EQ(out2[0][0], ref2[0][0]);
+  EXPECT_EQ(out2[0][1], ref2[0][1]);
+  EXPECT_EQ(out2[1][0], ref2[1][0]);
+  EXPECT_EQ(out2[1][1], ref2[1][1]);
+  EXPECT_EQ(out2[2][0], ref2[2][0]);
+  EXPECT_EQ(out2[2][1], ref2[2][1]);
+  EXPECT_EQ(out2[3][0], ref2[3][0]);
+  EXPECT_EQ(out2[3][1], ref2[3][1]);
+}
+
 } // namespace aarch64
 } // namespace arch
 } // namespace simeng
\ No newline at end of file
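As a cross-check of the first reference case in the test above (invert = 0, 24 active uint8_t elements spread over two 128-bit vectors), the expected masks can be rebuilt with a few lines of standalone C++. The helper below is purely illustrative (the name refMasks and its signature are not part of SimEng); it only encodes the convention visible in the reference values, i.e. one predicate bit per element byte, with active elements counted from the start.

#include <array>
#include <cstdint>
#include <cstdio>
#include <vector>

// Rebuild the expected predicate masks for `numActive` leading active elements
// spread across `numVecs` vectors of `elemsPerVec` elements, where each element
// occupies `elemBytes` bytes (one predicate bit per byte). Illustrative only.
std::vector<std::array<uint64_t, 4>> refMasks(unsigned numActive, unsigned numVecs,
                                              unsigned elemsPerVec, unsigned elemBytes) {
  std::vector<std::array<uint64_t, 4>> out(numVecs, {0, 0, 0, 0});
  for (unsigned v = 0; v < numVecs; v++) {
    for (unsigned i = 0; i < elemsPerVec; i++) {
      if (v * elemsPerVec + i < numActive)
        out[v][(i * elemBytes) / 64] |= 1ull << ((i * elemBytes) % 64);
    }
  }
  return out;
}

int main() {
  // First case above: VL = 128 bits, uint8_t elements, 24 active elements over 2 vectors.
  auto masks = refMasks(24, 2, 16, 1);
  std::printf("%#llx %#llx\n", (unsigned long long)masks[0][0],
              (unsigned long long)masks[1][0]);
  return 0;  // prints 0xffff 0xff, matching ref[0][0] and ref[1][0]
}

The inverted case works the same way except that the count gives the number of leading inactive elements, so only the trailing eight uint64_t elements (the upper half of the fourth vector) end up active, which is the 0x0101010101010101 pattern in ref2[3][1].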