Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#MERGE: fix some compilation errors I saw #121

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Root CMake file for the openGPMP project
# *************************************************************************/

cmake_minimum_required(VERSION 3.25)
cmake_minimum_required(VERSION 3.20)
set (CMAKE_CXX_STANDARD 20)
include(CheckIncludeFileCXX)

Expand Down Expand Up @@ -196,7 +196,7 @@ if(NOT BUILD_TINYGPMP AND NOT BUILD_PYGPMP OR BUILD_OPENGPMP)

# run C++ and Fortran unit tests
add_dependencies(${PROJECT_NAME} RUN_CPP_TESTS)
add_dependencies(${PROJECT_NAME} RUN_FORTRAN_TESTS)
add_dependencies(${PROJECT_NAME} RUN_FORTRAN_TESTS)
endif()

# uninstall target
Expand Down
115 changes: 115 additions & 0 deletions experiment/gemm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#include <immintrin.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <iostream>
#include <vector>

// Edge length (in elements) of the square tiles used by the blocked GEMM.
// constexpr (rather than const) so the packed-buffer index arithmetic can be
// folded at compile time and the constant is usable in constant expressions.
constexpr size_t BLOCK_SIZE = 256;

// Naive GEMM implementation (no optimizations)
void gemm_naive(const float* A, const float* B, float* C, size_t M, size_t N, size_t K) {
for (size_t i = 0; i < M; ++i) {
for (size_t j = 0; j < N; ++j) {
C[i * N + j] = 0.0f;
for (size_t k = 0; k < K; ++k) {
C[i * N + j] += A[i * K + k] * B[k * N + j];
}
}
}
}

// Tiled GEMM implementation (optimized with blocking and packing)

// Copy one BLOCK_SIZE x BLOCK_SIZE tile of A, whose top-left corner is
// (block_start_row, block_start_col), into the contiguous buffer packed_A.
// BUG FIX: entries that fall outside A's bounds are now explicitly
// zero-filled. Previously only in-bounds entries were written, so a partial
// edge tile (when M or K is not a multiple of BLOCK_SIZE) reused stale
// values left in packed_A by the previous tile, corrupting the product.
void pack_matrix_A(const float* A, float* packed_A, size_t M, size_t K, size_t block_start_row, size_t block_start_col) {
    for (size_t i = 0; i < BLOCK_SIZE; ++i) {
        for (size_t j = 0; j < BLOCK_SIZE; ++j) {
            const bool in_bounds =
                (block_start_row + i) < M && (block_start_col + j) < K;
            packed_A[i * BLOCK_SIZE + j] =
                in_bounds ? A[(block_start_row + i) * K + block_start_col + j]
                          : 0.0f;
        }
    }
}

// Copy one BLOCK_SIZE x BLOCK_SIZE tile of B, whose top-left corner is
// (block_start_row, block_start_col), into the contiguous buffer packed_B.
// BUG FIX: out-of-bounds entries are now explicitly zero-filled. Previously
// only in-bounds entries were written, so a partial edge tile (when K or N
// is not a multiple of BLOCK_SIZE) reused stale values left in packed_B by
// the previous tile, corrupting the product.
void pack_matrix_B(const float* B, float* packed_B, size_t K, size_t N, size_t block_start_row, size_t block_start_col) {
    for (size_t i = 0; i < BLOCK_SIZE; ++i) {
        for (size_t j = 0; j < BLOCK_SIZE; ++j) {
            const bool in_bounds =
                (block_start_row + i) < K && (block_start_col + j) < N;
            packed_B[i * BLOCK_SIZE + j] =
                in_bounds ? B[(block_start_row + i) * N + block_start_col + j]
                          : 0.0f;
        }
    }
}

// Multiply one packed tile of A by one packed tile of B and accumulate the
// result into the (block_row, block_col) region of C. Bounds checks clamp
// the tile to the real matrix edges for M and N; the k loop runs over the
// full padded tile width whenever K > BLOCK_SIZE.
// NOTE(review): this assumes the padded tail of the packed buffers is
// zero-filled — if the pack routines leave stale data there, partial edge
// tiles accumulate garbage. Confirm the packing contract before reuse.
void gemm_block(const float* packed_A, const float* packed_B, float* C, size_t M, size_t N, size_t K, size_t block_row, size_t block_col) {
    for (size_t r = 0; r < BLOCK_SIZE; ++r) {
        const size_t row = block_row + r;
        if (row >= M) {
            break;
        }
        for (size_t c = 0; c < BLOCK_SIZE; ++c) {
            const size_t col = block_col + c;
            if (col >= N) {
                break;
            }
            float acc = 0.0f;
            for (size_t k = 0; k < BLOCK_SIZE; ++k) {
                if (k >= K) {
                    break;
                }
                acc += packed_A[r * BLOCK_SIZE + k] * packed_B[k * BLOCK_SIZE + c];
            }
            C[row * N + col] += acc;
        }
    }
}

// Blocked GEMM: C += A * B computed tile by tile, packing each tile of A
// and B into contiguous scratch buffers before the inner product.
// NOTE: C must be zero-initialized by the caller (main does this); this
// routine accumulates into C rather than overwriting it.
void gemm_tiled(const float* A, const float* B, float* C, size_t M, size_t N, size_t K) {
    std::vector<float> packed_A(BLOCK_SIZE * BLOCK_SIZE, 0.0f);
    std::vector<float> packed_B(BLOCK_SIZE * BLOCK_SIZE, 0.0f);

    for (size_t block_row = 0; block_row < M; block_row += BLOCK_SIZE) {
        for (size_t block_col = 0; block_col < N; block_col += BLOCK_SIZE) {
            for (size_t block_k = 0; block_k < K; block_k += BLOCK_SIZE) {
                // BUG FIX: clear the scratch buffers before refilling them.
                // The pack routines only write entries that lie inside the
                // matrix bounds, so a partial edge tile (dimensions not a
                // multiple of BLOCK_SIZE) would otherwise leave stale values
                // from the previous iteration in the buffers, and gemm_block
                // would accumulate that garbage into C.
                std::fill(packed_A.begin(), packed_A.end(), 0.0f);
                std::fill(packed_B.begin(), packed_B.end(), 0.0f);
                pack_matrix_A(A, packed_A.data(), M, K, block_row, block_k);
                pack_matrix_B(B, packed_B.data(), K, N, block_k, block_col);
                gemm_block(packed_A.data(), packed_B.data(), C, M, N, K, block_row, block_col);
            }
        }
    }
}

// Write a row-major rows x cols matrix to stdout, one row per line,
// with every element followed by a single space.
void print_matrix(const std::vector<float>& mat, size_t rows, size_t cols) {
    for (size_t r = 0; r < rows; ++r) {
        const size_t base = r * cols;
        for (size_t c = 0; c < cols; ++c) {
            std::cout << mat[base + c] << " ";
        }
        std::cout << "\n";
    }
}

// Benchmark driver: times the naive and the tiled GEMM on square matrices
// and cross-checks their outputs element by element. Returns 0 regardless
// of the comparison outcome (the result is only printed).
int main() {
    // Matrix dimensions. 384 is deliberately NOT a multiple of BLOCK_SIZE
    // (256), so the partial-edge-tile paths are exercised too.
    size_t M = 384, N = 384, K = 384;

    // Initialize matrices with a known value (all ones), despite what a
    // random generator would give: with all-ones inputs every element of
    // C must equal exactly K, which makes verification exact in float.
    std::vector<float> A(M * K, 1.0f);
    std::vector<float> B(K * N, 1.0f);
    std::vector<float> C_naive(M * N, 0.0f);
    std::vector<float> C_optimized(M * N, 0.0f);

    // Measure time for naive GEMM
    auto start_naive = std::chrono::high_resolution_clock::now();
    gemm_naive(A.data(), B.data(), C_naive.data(), M, N, K);
    auto end_naive = std::chrono::high_resolution_clock::now();
    std::chrono::duration<float> duration_naive = end_naive - start_naive;
    std::cout << "Naive GEMM Time: " << duration_naive.count() << " seconds\n";

    // Measure time for tiled GEMM (C_optimized starts zeroed; gemm_tiled
    // accumulates into it)
    auto start_optimized = std::chrono::high_resolution_clock::now();
    gemm_tiled(A.data(), B.data(), C_optimized.data(), M, N, K);
    auto end_optimized = std::chrono::high_resolution_clock::now();
    std::chrono::duration<float> duration_optimized = end_optimized - start_optimized;
    std::cout << "Optimized GEMM Time: " << duration_optimized.count() << " seconds\n";

    // Verify results. std::fabs (from <cmath>) guarantees the float
    // overload; plain std::abs without <cmath> may resolve to the int
    // overload and silently truncate the difference.
    bool correct = true;
    for (size_t i = 0; i < M * N; ++i) {
        if (std::fabs(C_naive[i] - C_optimized[i]) > 1e-6f) {
            correct = false;
            break;
        }
    }

    if (correct) {
        std::cout << "Results match between naive and optimized GEMM.\n";
    } else {
        std::cout << "Results do not match between naive and optimized GEMM.\n";
    }

    return 0;
}

Binary file added experiment/gemm_comparison
Binary file not shown.
2 changes: 2 additions & 0 deletions modules/linalg/dgemm_arr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ void gpmp::linalg::DGEMM::dgemm_micro_kernel(long kc,
long kb = kc / 4;
long kl = kc % 4;

#ifdef __x86_64__
dgemm_kernel_asm(A,
B,
C,
Expand All @@ -109,6 +110,7 @@ void gpmp::linalg::DGEMM::dgemm_micro_kernel(long kc,
incColC,
alpha,
beta);
#endif
}

// MATRIX BUFFERS
Expand Down
3 changes: 2 additions & 1 deletion modules/optim/function.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@
*
************************************************************************/
#include <iostream>
#include <openGPMP/optim/function.hpp>
#include <algorithm>
#include <vector>
#include <openGPMP/optim/function.hpp>

std::vector<double> gpmp::optim::Func::generate_random_point(
const std::vector<double> &lower_bounds,
Expand Down
15 changes: 0 additions & 15 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,29 +45,14 @@ set(CPP_TEST_FILES
linalg/t_eigen.cpp
linalg/t_linsys.cpp

linalg/t_matrix_vector_i32.cpp
linalg/t_matrix_vector_f64.cpp

linalg/t_matrix_arr_naive.cpp

linalg/t_matrix_arr_i8.cpp
linalg/t_matrix_arr_i16.cpp
linalg/t_matrix_arr_i32.cpp
linalg/t_matrix_arr_f64.cpp

linalg/t_matrix_arr_f90.cpp

linalg/t_igemm_arr.cpp
linalg/t_sgemm_arr.cpp
linalg/t_dgemm_arr.cpp

linalg/t_vector_vector_naive.cpp

linalg/t_vector_vector_i8.cpp
linalg/t_vector_vector_i32.cpp
linalg/t_vector_vector_f64.cpp


nt/t_cipher.cpp
nt/t_rc4.cpp
nt/t_primes.cpp
Expand Down
2 changes: 2 additions & 0 deletions tests/linalg/t_eigen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include <openGPMP/linalg/eigen.hpp>
#include <stdexcept>
#include <vector>
#include <algorithm>


const double TOLERANCE = 1e-3;

Expand Down
5 changes: 5 additions & 0 deletions tinygpmp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,8 @@
`tinygpmp` aims to support low-voltage and resource constrained devices,
primarily microcontrollers. For now, focus on support for AVR and STM32
series devices is planned.

## Testing

Check notice on line 6 in tinygpmp/README.md

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

tinygpmp/README.md#L6

Expected: 1; Actual: 0; Below
The `fixture` directory contains instructions and source code for an example application,
along with steps for debugging via the UART pins on an STM32 and monitoring that
connection using `screen` or `minicom` on the corresponding `/dev` device.
2 changes: 2 additions & 0 deletions tinygpmp/include/fec/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Forward Error Correction

Check notice on line 1 in tinygpmp/include/fec/README.md

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

tinygpmp/include/fec/README.md#L1

Expected: [None]; Actual: # Forward Error Correction

2 changes: 2 additions & 0 deletions tinygpmp/modules/fec/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Forward Error Correction

Check notice on line 1 in tinygpmp/modules/fec/README.md

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

tinygpmp/modules/fec/README.md#L1

Expected: [None]; Actual: # Forward Error Correction

Loading