From 5b37d15d885c232a8cdf88ddeea6236b5f06576d Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Thu, 5 Sep 2024 20:49:20 +0200
Subject: [PATCH 01/44] added missing cast function

---
 .../include/rmagine/math/types/Matrix.hpp           | 11 +++++++++++
 .../include/rmagine/math/types/Matrix.tcc           | 12 ++++++++++++
 .../include/rmagine/math/types/Transform.hpp        | 13 ++++++-------
 3 files changed, 29 insertions(+), 7 deletions(-)
diff --git a/src/rmagine_core/include/rmagine/math/types/Matrix.hpp b/src/rmagine_core/include/rmagine/math/types/Matrix.hpp
index 46cdc56..22e824b 100644
--- a/src/rmagine_core/include/rmagine/math/types/Matrix.hpp
+++ b/src/rmagine_core/include/rmagine/math/types/Matrix.hpp
@@ -333,6 +333,17 @@ struct Matrix_ {
     RMAGINE_INLINE_FUNCTION
     operator EulerAngles_<DataT>() const;
 
+    /**
+     * @brief Transformation Matrix -> Transform
+     * WARNING: The matrix has to be isometric, i.e. composed only of
+     * rotational and translational components. If it has e.g. scalar
+     * components use the "decompose" function instead
+     * 
+     * @return Transform_<DataT>
+     */
+    RMAGINE_INLINE_FUNCTION
+    operator Transform_<DataT>() const;
+
     /**
      * @brief Data Type Cast to ConvT
      * 
diff --git a/src/rmagine_core/include/rmagine/math/types/Matrix.tcc b/src/rmagine_core/include/rmagine/math/types/Matrix.tcc
index 5973ce9..47ee5a1 100644
--- a/src/rmagine_core/include/rmagine/math/types/Matrix.tcc
+++ b/src/rmagine_core/include/rmagine/math/types/Matrix.tcc
@@ -1267,6 +1267,18 @@ Matrix_<DataT, Rows, Cols>::operator EulerAngles_<DataT>() const
     return e;
 }
 
+template<typename DataT, unsigned int Rows, unsigned int Cols> 
+RMAGINE_INLINE_FUNCTION
+Matrix_<DataT, Rows, Cols>::operator Transform_<DataT>() const 
+{
+    static_assert(Rows == 4 && Cols == 4);
+    
+    Transform_<DataT> T;
+    T.set(*this);
+    return T;
+}
+
+
 
 template<typename DataT, unsigned int Rows, unsigned int Cols> 
 template<typename ConvT>
diff --git a/src/rmagine_core/include/rmagine/math/types/Transform.hpp b/src/rmagine_core/include/rmagine/math/types/Transform.hpp
index 2ad90aa..08a94aa 100644
--- a/src/rmagine_core/include/rmagine/math/types/Transform.hpp
+++ b/src/rmagine_core/include/rmagine/math/types/Transform.hpp
@@ -38,6 +38,12 @@ struct Transform_
     RMAGINE_INLINE_FUNCTION
     void setIdentity();
 
+    /**
+     * @brief Setting the transform from an 4x4 transformation matrix
+     * WARNING matrix must be isometric, i.e. must only contain
+     * rotational and translational parts (not scale). Otherwise,
+     * use "decompose" function
+     */
     RMAGINE_INLINE_FUNCTION
     void set(const Matrix_<DataT, 4, 4>& M);
 
@@ -76,13 +82,6 @@ struct Transform_
     RMAGINE_INLINE_FUNCTION
     Transform_<DataT> pow(const DataT& exp) const;
 
-    // OPERATORS
-    RMAGINE_INLINE_FUNCTION
-    void operator=(const Matrix_<DataT, 4, 4>& M)
-    {
-        set(M);
-    }
-
     RMAGINE_INLINE_FUNCTION
     Transform_<DataT> operator~() const
     {

From 1d12ea37d7c35cfab29225d5bf284697cd231a87 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Thu, 5 Sep 2024 21:03:43 +0200
Subject: [PATCH 02/44] using already existing math constants

---
 .../include/rmagine/math/types/Matrix.tcc          |  2 +-
 .../include/rmagine/math/types/Quaternion.tcc      |  4 +---
 .../include/rmagine/math/types/Transform.hpp       | 14 ++++++++++++++
 .../include/rmagine/math/types/Transform.tcc       |  9 +++++++++
 4 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/src/rmagine_core/include/rmagine/math/types/Matrix.tcc b/src/rmagine_core/include/rmagine/math/types/Matrix.tcc
index 47ee5a1..5fe5271 100644
--- a/src/rmagine_core/include/rmagine/math/types/Matrix.tcc
+++ b/src/rmagine_core/include/rmagine/math/types/Matrix.tcc
@@ -1256,7 +1256,7 @@ Matrix_<DataT, Rows, Cols>::operator EulerAngles_<DataT>() const
     // pitch (y-axis)
     if (fabs(sB) >= 1.0)
     {
-        e.pitch = copysignf(M_PI / 2, sB); // use 90 degrees if out of range
+        e.pitch = copysignf(M_PI_2, sB); // use 90 degrees if out of range
     } else {
         e.pitch = asinf(sB);
     }
diff --git a/src/rmagine_core/include/rmagine/math/types/Quaternion.tcc b/src/rmagine_core/include/rmagine/math/types/Quaternion.tcc
index a374265..b4dac7d 100644
--- a/src/rmagine_core/include/rmagine/math/types/Quaternion.tcc
+++ b/src/rmagine_core/include/rmagine/math/types/Quaternion.tcc
@@ -182,8 +182,6 @@ template<typename DataT>
 RMAGINE_INLINE_FUNCTION
 Quaternion_<DataT>::operator EulerAngles_<DataT>() const 
 {
-    constexpr DataT PI_HALF = M_PI / 2.0;
-
     // TODO: check
     // https://en.wikipedia.org/wiki/Conversion_between_quaternions_and_Euler_angles
     // checked once
@@ -215,7 +213,7 @@ Quaternion_<DataT>::operator EulerAngles_<DataT>() const
     // pitch (y-axis)
     if (fabs(sinp) >= 1.0f)
     {
-        e.pitch = copysign(PI_HALF, sinp); // use 90 degrees if out of range
+        e.pitch = copysign(M_PI_2, sinp); // use 90 degrees if out of range
     } else {
         e.pitch = asin(sinp);
     }
diff --git a/src/rmagine_core/include/rmagine/math/types/Transform.hpp b/src/rmagine_core/include/rmagine/math/types/Transform.hpp
index 08a94aa..baef438 100644
--- a/src/rmagine_core/include/rmagine/math/types/Transform.hpp
+++ b/src/rmagine_core/include/rmagine/math/types/Transform.hpp
@@ -107,6 +107,20 @@ struct Transform_
         return mult(v);
     }
 
+    /////////////////////
+    // CASTING
+
+    /**
+     * @brief Transform -> Matrix4x4
+     * 
+     * @return Matrix_<DataT> 
+     */
+    RMAGINE_INLINE_FUNCTION
+    operator Matrix_<DataT, 4, 4>() const;
+
+    /**
+     * @brief Internal data type cast
+     */
     template<typename ConvT>
     Transform_<ConvT> cast() const
     {
diff --git a/src/rmagine_core/include/rmagine/math/types/Transform.tcc b/src/rmagine_core/include/rmagine/math/types/Transform.tcc
index 02fd878..8e8d6b5 100644
--- a/src/rmagine_core/include/rmagine/math/types/Transform.tcc
+++ b/src/rmagine_core/include/rmagine/math/types/Transform.tcc
@@ -78,4 +78,13 @@ Transform_<DataT> Transform_<DataT>::pow(const DataT& exp) const
     return res;
 }
 
+template<typename DataT>
+RMAGINE_INLINE_FUNCTION
+Transform_<DataT>::operator Matrix_<DataT, 4, 4>() const 
+{
+    Matrix_<DataT, 4, 4> M;
+    M.set(*this);
+    return M;
+}
+
 } // namespace rmagine
\ No newline at end of file

From 55c18f77bceae7dd72601fc5b625415ae9a442e5 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Thu, 5 Sep 2024 21:15:33 +0200
Subject: [PATCH 03/44] added access function to face struct

---
 .../include/rmagine/types/mesh_types.h        | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/rmagine_core/include/rmagine/types/mesh_types.h b/src/rmagine_core/include/rmagine/types/mesh_types.h
index 210c7fc..5d96e5b 100644
--- a/src/rmagine_core/include/rmagine/types/mesh_types.h
+++ b/src/rmagine_core/include/rmagine/types/mesh_types.h
@@ -53,8 +53,30 @@ struct Face {
     unsigned int v0;
     unsigned int v1;
     unsigned int v2;
+
+    // Other access functions
+    // use with care! No out of range checks
+    RMAGINE_INLINE_FUNCTION
+    unsigned int operator[](const size_t& idx) const;
+
+    RMAGINE_INLINE_FUNCTION
+    unsigned int& operator[](const size_t& idx);
 };
 
+// IMPL
+RMAGINE_INLINE_FUNCTION
+unsigned int Face::operator[](const size_t& idx) const
+{
+    return *((&v0)+idx);
+}
+
+RMAGINE_INLINE_FUNCTION
+unsigned int& Face::operator[](const size_t& idx)
+{
+    return *((&v0)+idx);
+}
+
+
 } // namespace rmagine
 
 #endif // RMAGINE_TYPES_MESH_TYPES_H
\ No newline at end of file

From 0a347f83f05fabb3ff7405a632cc091feea4b00f Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Thu, 5 Sep 2024 21:52:09 +0200
Subject: [PATCH 04/44] added mesh types tcc

---
 .../include/rmagine/types/mesh_types.h        | 19 +++++++++----------
 .../include/rmagine/types/mesh_types.tcc      | 18 ++++++++++++++++++
 2 files changed, 27 insertions(+), 10 deletions(-)
 create mode 100644 src/rmagine_core/include/rmagine/types/mesh_types.tcc

diff --git a/src/rmagine_core/include/rmagine/types/mesh_types.h b/src/rmagine_core/include/rmagine/types/mesh_types.h
index 5d96e5b..2e93485 100644
--- a/src/rmagine_core/include/rmagine/types/mesh_types.h
+++ b/src/rmagine_core/include/rmagine/types/mesh_types.h
@@ -63,20 +63,19 @@ struct Face {
     unsigned int& operator[](const size_t& idx);
 };
 
-// IMPL
-RMAGINE_INLINE_FUNCTION
-unsigned int Face::operator[](const size_t& idx) const
-{
-    return *((&v0)+idx);
-}
+} // namespace rmagine
 
-RMAGINE_INLINE_FUNCTION
-unsigned int& Face::operator[](const size_t& idx)
+
+// Polyscope field access
+static unsigned int adaptorF_custom_accessVector3Value(
+  const rmagine::Face& f, 
+  unsigned int ind) 
 {
-    return *((&v0)+idx);
+  return f[ind];
 }
 
 
-} // namespace rmagine
+#include "mesh_types.tcc"
+
 
 #endif // RMAGINE_TYPES_MESH_TYPES_H
\ No newline at end of file
diff --git a/src/rmagine_core/include/rmagine/types/mesh_types.tcc b/src/rmagine_core/include/rmagine/types/mesh_types.tcc
new file mode 100644
index 0000000..7a0d07e
--- /dev/null
+++ b/src/rmagine_core/include/rmagine/types/mesh_types.tcc
@@ -0,0 +1,18 @@
+#include "mesh_types.h"
+
+namespace rmagine
+{
+
+RMAGINE_INLINE_FUNCTION
+unsigned int Face::operator[](const size_t& idx) const
+{
+    return *((&v0)+idx);
+}
+
+RMAGINE_INLINE_FUNCTION
+unsigned int& Face::operator[](const size_t& idx)
+{
+    return *((&v0)+idx);
+}
+
+} // namespace rmagine
\ No newline at end of file

From 9b8d97b5201e9474e70702815526d3ae922f9022 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Thu, 5 Sep 2024 21:55:44 +0200
Subject: [PATCH 05/44] type adaptor to core library

---
 src/rmagine_core/CMakeLists.txt                    |  1 +
 .../include/rmagine/types/mesh_types.h             |  7 ++-----
 src/rmagine_core/src/types/mesh_types.cpp          | 14 ++++++++++++++
 3 files changed, 17 insertions(+), 5 deletions(-)
 create mode 100644 src/rmagine_core/src/types/mesh_types.cpp

diff --git a/src/rmagine_core/CMakeLists.txt b/src/rmagine_core/CMakeLists.txt
index 9bcfe03..9ab8b04 100644
--- a/src/rmagine_core/CMakeLists.txt
+++ b/src/rmagine_core/CMakeLists.txt
@@ -12,6 +12,7 @@ set(RMAGINE_CORE_SRCS
     src/types/Memory.cpp
     src/types/conversions.cpp
     src/types/sensors.cpp
+    src/types/mesh_types.cpp
     # Util
     src/util/synthetic.cpp
     src/util/assimp/helper.cpp
diff --git a/src/rmagine_core/include/rmagine/types/mesh_types.h b/src/rmagine_core/include/rmagine/types/mesh_types.h
index 2e93485..58919da 100644
--- a/src/rmagine_core/include/rmagine/types/mesh_types.h
+++ b/src/rmagine_core/include/rmagine/types/mesh_types.h
@@ -67,12 +67,9 @@ struct Face {
 
 
 // Polyscope field access
-static unsigned int adaptorF_custom_accessVector3Value(
+unsigned int adaptorF_custom_accessVector3Value(
   const rmagine::Face& f, 
-  unsigned int ind) 
-{
-  return f[ind];
-}
+  unsigned int ind);
 
 
 #include "mesh_types.tcc"
diff --git a/src/rmagine_core/src/types/mesh_types.cpp b/src/rmagine_core/src/types/mesh_types.cpp
new file mode 100644
index 0000000..5b717dd
--- /dev/null
+++ b/src/rmagine_core/src/types/mesh_types.cpp
@@ -0,0 +1,14 @@
+#include "rmagine/types/mesh_types.h"
+
+namespace rmagine
+{
+
+} // namespace rmagine
+
+
+unsigned int adaptorF_custom_accessVector3Value(
+  const rmagine::Face& f, 
+  unsigned int ind) 
+{
+  return f[ind];
+}
\ No newline at end of file

From 066a3388ba3f2aa9d7b5cd801c7bdba8ba4e5b87 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Thu, 5 Sep 2024 22:02:25 +0200
Subject: [PATCH 06/44] added size adaptor

---
 src/rmagine_core/include/rmagine/types/mesh_types.h | 2 ++
 src/rmagine_core/src/types/mesh_types.cpp           | 5 +++++
 2 files changed, 7 insertions(+)

diff --git a/src/rmagine_core/include/rmagine/types/mesh_types.h b/src/rmagine_core/include/rmagine/types/mesh_types.h
index 58919da..f6c0b64 100644
--- a/src/rmagine_core/include/rmagine/types/mesh_types.h
+++ b/src/rmagine_core/include/rmagine/types/mesh_types.h
@@ -71,6 +71,8 @@ unsigned int adaptorF_custom_accessVector3Value(
   const rmagine::Face& f, 
   unsigned int ind);
 
+size_t adaptorF_size(const rmagine::Face& f);
+
 
 #include "mesh_types.tcc"
 
diff --git a/src/rmagine_core/src/types/mesh_types.cpp b/src/rmagine_core/src/types/mesh_types.cpp
index 5b717dd..fe429e7 100644
--- a/src/rmagine_core/src/types/mesh_types.cpp
+++ b/src/rmagine_core/src/types/mesh_types.cpp
@@ -11,4 +11,9 @@ unsigned int adaptorF_custom_accessVector3Value(
   unsigned int ind) 
 {
   return f[ind];
+}
+
+size_t adaptorF_size(const rmagine::Face& f)
+{
+  return 3;
 }
\ No newline at end of file

From 3283e345db5bb1d2a8d842919c5bc1ac1013a288 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Thu, 5 Sep 2024 22:15:10 +0200
Subject: [PATCH 07/44] get function overload

---
 .../include/rmagine/types/mesh_types.h        | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/src/rmagine_core/include/rmagine/types/mesh_types.h b/src/rmagine_core/include/rmagine/types/mesh_types.h
index f6c0b64..b48c20e 100644
--- a/src/rmagine_core/include/rmagine/types/mesh_types.h
+++ b/src/rmagine_core/include/rmagine/types/mesh_types.h
@@ -43,6 +43,8 @@
 #define RMAGINE_TYPES_MESH_TYPES_H
 
 #include <rmagine/math/types.h>
+#include <tuple>
+#include <type_traits>
 
 namespace rmagine
 {
@@ -73,6 +75,37 @@ unsigned int adaptorF_custom_accessVector3Value(
 
 size_t adaptorF_size(const rmagine::Face& f);
 
+namespace std
+{
+
+// Custom get function for MyStruct, index 0 (for `int`)
+template <std::size_t I>
+decltype(auto) get(rmagine::Face& f) {
+    if constexpr (I == 0) return (f.v0);
+    else if constexpr (I == 1) return (f.v1);
+    else if constexpr (I == 2) return (f.v2);
+    else static_assert(I < 3, "Index out of bounds");
+}
+
+template <std::size_t I>
+decltype(auto) get(const rmagine::Face& f) {
+    if constexpr (I == 0) return (f.v0);
+    else if constexpr (I == 1) return (f.v1);
+    else if constexpr (I == 2) return (f.v2);
+    else static_assert(I < 3, "Index out of bounds");
+}
+
+
+} // namespace std
+
+
+
+
+// namespace std
+// {
+
+
+// } // namespace std
 
 #include "mesh_types.tcc"
 

From 600ce17007c77d2d19830bbdb287e347a4b34283 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Thu, 5 Sep 2024 22:24:57 +0200
Subject: [PATCH 08/44] removed old types adaptors

---
 .../include/rmagine/types/mesh_types.h        | 32 ++-----------------
 .../include/rmagine/types/mesh_types.tcc      | 25 ++++++++++++++-
 2 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/src/rmagine_core/include/rmagine/types/mesh_types.h b/src/rmagine_core/include/rmagine/types/mesh_types.h
index b48c20e..0e9df85 100644
--- a/src/rmagine_core/include/rmagine/types/mesh_types.h
+++ b/src/rmagine_core/include/rmagine/types/mesh_types.h
@@ -67,46 +67,18 @@ struct Face {
 
 } // namespace rmagine
 
-
-// Polyscope field access
-unsigned int adaptorF_custom_accessVector3Value(
-  const rmagine::Face& f, 
-  unsigned int ind);
-
-size_t adaptorF_size(const rmagine::Face& f);
-
 namespace std
 {
 
-// Custom get function for MyStruct, index 0 (for `int`)
 template <std::size_t I>
-decltype(auto) get(rmagine::Face& f) {
-    if constexpr (I == 0) return (f.v0);
-    else if constexpr (I == 1) return (f.v1);
-    else if constexpr (I == 2) return (f.v2);
-    else static_assert(I < 3, "Index out of bounds");
-}
+unsigned int& get(rmagine::Face& f);
 
 template <std::size_t I>
-decltype(auto) get(const rmagine::Face& f) {
-    if constexpr (I == 0) return (f.v0);
-    else if constexpr (I == 1) return (f.v1);
-    else if constexpr (I == 2) return (f.v2);
-    else static_assert(I < 3, "Index out of bounds");
-}
-
+const unsigned int& get(const rmagine::Face& f);
 
 } // namespace std
 
 
-
-
-// namespace std
-// {
-
-
-// } // namespace std
-
 #include "mesh_types.tcc"
 
 
diff --git a/src/rmagine_core/include/rmagine/types/mesh_types.tcc b/src/rmagine_core/include/rmagine/types/mesh_types.tcc
index 7a0d07e..6042b1c 100644
--- a/src/rmagine_core/include/rmagine/types/mesh_types.tcc
+++ b/src/rmagine_core/include/rmagine/types/mesh_types.tcc
@@ -15,4 +15,27 @@ unsigned int& Face::operator[](const size_t& idx)
     return *((&v0)+idx);
 }
 
-} // namespace rmagine
\ No newline at end of file
+
+} // namespace rmagine
+
+namespace std
+{
+
+template <std::size_t I>
+unsigned int& get(rmagine::Face& f) {
+    if constexpr (I == 0) return (f.v0);
+    else if constexpr (I == 1) return (f.v1);
+    else if constexpr (I == 2) return (f.v2);
+    else static_assert(I < 3, "Index out of bounds");
+}
+
+template <std::size_t I>
+const unsigned int& get(const rmagine::Face& f) {
+    if constexpr (I == 0) return (f.v0);
+    else if constexpr (I == 1) return (f.v1);
+    else if constexpr (I == 2) return (f.v2);
+    else static_assert(I < 3, "Index out of bounds");
+}
+
+
+} // namespace std
\ No newline at end of file

From 329353eed97380bc4929edd5eae06a950af8023c Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Sat, 7 Sep 2024 17:24:40 +0200
Subject: [PATCH 09/44] prepared face to be compatible with polyscope

---
 .../include/rmagine/types/mesh_types.h        | 22 +++++++-----
 .../include/rmagine/types/mesh_types.tcc      | 36 +++++++++----------
 src/rmagine_core/src/types/mesh_types.cpp     | 22 ++++++------
 3 files changed, 44 insertions(+), 36 deletions(-)

diff --git a/src/rmagine_core/include/rmagine/types/mesh_types.h b/src/rmagine_core/include/rmagine/types/mesh_types.h
index 0e9df85..69a3350 100644
--- a/src/rmagine_core/include/rmagine/types/mesh_types.h
+++ b/src/rmagine_core/include/rmagine/types/mesh_types.h
@@ -59,24 +59,30 @@ struct Face {
     // Other access functions
     // use with care! No out of range checks
     RMAGINE_INLINE_FUNCTION
-    unsigned int operator[](const size_t& idx) const;
+    const unsigned int& operator[](const size_t& idx) const;
 
     RMAGINE_INLINE_FUNCTION
     unsigned int& operator[](const size_t& idx);
+
+    RMAGINE_INLINE_FUNCTION
+    constexpr size_t size() const
+    {
+      return 3;
+    }
 };
 
 } // namespace rmagine
 
-namespace std
-{
+// namespace std
+// {
 
-template <std::size_t I>
-unsigned int& get(rmagine::Face& f);
+// template <std::size_t I>
+// unsigned int& get(rmagine::Face& f);
 
-template <std::size_t I>
-const unsigned int& get(const rmagine::Face& f);
+// template <std::size_t I>
+// const unsigned int& get(const rmagine::Face& f);
 
-} // namespace std
+// } // namespace std
 
 
 #include "mesh_types.tcc"
diff --git a/src/rmagine_core/include/rmagine/types/mesh_types.tcc b/src/rmagine_core/include/rmagine/types/mesh_types.tcc
index 6042b1c..330594d 100644
--- a/src/rmagine_core/include/rmagine/types/mesh_types.tcc
+++ b/src/rmagine_core/include/rmagine/types/mesh_types.tcc
@@ -4,7 +4,7 @@ namespace rmagine
 {
 
 RMAGINE_INLINE_FUNCTION
-unsigned int Face::operator[](const size_t& idx) const
+const unsigned int& Face::operator[](const size_t& idx) const
 {
     return *((&v0)+idx);
 }
@@ -18,24 +18,24 @@ unsigned int& Face::operator[](const size_t& idx)
 
 } // namespace rmagine
 
-namespace std
-{
+// namespace std
+// {
 
-template <std::size_t I>
-unsigned int& get(rmagine::Face& f) {
-    if constexpr (I == 0) return (f.v0);
-    else if constexpr (I == 1) return (f.v1);
-    else if constexpr (I == 2) return (f.v2);
-    else static_assert(I < 3, "Index out of bounds");
-}
+// template <std::size_t I>
+// unsigned int& get(rmagine::Face& f) {
+//     if constexpr (I == 0) return (f.v0);
+//     else if constexpr (I == 1) return (f.v1);
+//     else if constexpr (I == 2) return (f.v2);
+//     else static_assert(I < 3, "Index out of bounds");
+// }
 
-template <std::size_t I>
-const unsigned int& get(const rmagine::Face& f) {
-    if constexpr (I == 0) return (f.v0);
-    else if constexpr (I == 1) return (f.v1);
-    else if constexpr (I == 2) return (f.v2);
-    else static_assert(I < 3, "Index out of bounds");
-}
+// template <std::size_t I>
+// const unsigned int& get(const rmagine::Face& f) {
+//     if constexpr (I == 0) return (f.v0);
+//     else if constexpr (I == 1) return (f.v1);
+//     else if constexpr (I == 2) return (f.v2);
+//     else static_assert(I < 3, "Index out of bounds");
+// }
 
 
-} // namespace std
\ No newline at end of file
+// } // namespace std
\ No newline at end of file
diff --git a/src/rmagine_core/src/types/mesh_types.cpp b/src/rmagine_core/src/types/mesh_types.cpp
index fe429e7..85b7114 100644
--- a/src/rmagine_core/src/types/mesh_types.cpp
+++ b/src/rmagine_core/src/types/mesh_types.cpp
@@ -3,17 +3,19 @@
 namespace rmagine
 {
 
+
+
 } // namespace rmagine
 
 
-unsigned int adaptorF_custom_accessVector3Value(
-  const rmagine::Face& f, 
-  unsigned int ind) 
-{
-  return f[ind];
-}
+// unsigned int adaptorF_custom_accessVector3Value(
+//   const rmagine::Face& f, 
+//   unsigned int ind) 
+// {
+//   return f[ind];
+// }
 
-size_t adaptorF_size(const rmagine::Face& f)
-{
-  return 3;
-}
\ No newline at end of file
+// size_t adaptorF_size(const rmagine::Face& f)
+// {
+//   return 3;
+// }
\ No newline at end of file

From 2441aaa76fd8f5b7549d58d9445a912f46b37027 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Sat, 7 Sep 2024 17:47:32 +0200
Subject: [PATCH 10/44] removed comments

---
 .../include/rmagine/types/mesh_types.h        | 13 -----------
 .../include/rmagine/types/mesh_types.tcc      | 23 -------------------
 src/rmagine_core/src/types/mesh_types.cpp     | 15 ------------
 3 files changed, 51 deletions(-)

diff --git a/src/rmagine_core/include/rmagine/types/mesh_types.h b/src/rmagine_core/include/rmagine/types/mesh_types.h
index 69a3350..4e094a4 100644
--- a/src/rmagine_core/include/rmagine/types/mesh_types.h
+++ b/src/rmagine_core/include/rmagine/types/mesh_types.h
@@ -73,19 +73,6 @@ struct Face {
 
 } // namespace rmagine
 
-// namespace std
-// {
-
-// template <std::size_t I>
-// unsigned int& get(rmagine::Face& f);
-
-// template <std::size_t I>
-// const unsigned int& get(const rmagine::Face& f);
-
-// } // namespace std
-
-
 #include "mesh_types.tcc"
 
-
 #endif // RMAGINE_TYPES_MESH_TYPES_H
\ No newline at end of file
diff --git a/src/rmagine_core/include/rmagine/types/mesh_types.tcc b/src/rmagine_core/include/rmagine/types/mesh_types.tcc
index 330594d..fa9d07c 100644
--- a/src/rmagine_core/include/rmagine/types/mesh_types.tcc
+++ b/src/rmagine_core/include/rmagine/types/mesh_types.tcc
@@ -15,27 +15,4 @@ unsigned int& Face::operator[](const size_t& idx)
     return *((&v0)+idx);
 }
 
-
 } // namespace rmagine
-
-// namespace std
-// {
-
-// template <std::size_t I>
-// unsigned int& get(rmagine::Face& f) {
-//     if constexpr (I == 0) return (f.v0);
-//     else if constexpr (I == 1) return (f.v1);
-//     else if constexpr (I == 2) return (f.v2);
-//     else static_assert(I < 3, "Index out of bounds");
-// }
-
-// template <std::size_t I>
-// const unsigned int& get(const rmagine::Face& f) {
-//     if constexpr (I == 0) return (f.v0);
-//     else if constexpr (I == 1) return (f.v1);
-//     else if constexpr (I == 2) return (f.v2);
-//     else static_assert(I < 3, "Index out of bounds");
-// }
-
-
-// } // namespace std
\ No newline at end of file
diff --git a/src/rmagine_core/src/types/mesh_types.cpp b/src/rmagine_core/src/types/mesh_types.cpp
index 85b7114..dbc2e29 100644
--- a/src/rmagine_core/src/types/mesh_types.cpp
+++ b/src/rmagine_core/src/types/mesh_types.cpp
@@ -3,19 +3,4 @@
 namespace rmagine
 {
 
-
-
 } // namespace rmagine
-
-
-// unsigned int adaptorF_custom_accessVector3Value(
-//   const rmagine::Face& f, 
-//   unsigned int ind) 
-// {
-//   return f[ind];
-// }
-
-// size_t adaptorF_size(const rmagine::Face& f)
-// {
-//   return 3;
-// }
\ No newline at end of file

From ce4a4ac603e7671e1a3c5f689c31f4889f264510 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Tue, 10 Sep 2024 21:30:47 +0200
Subject: [PATCH 11/44] integrated min and max range in raycast. should improve
 speed a bit

---
 .../rmagine/simulation/O1DnSimulatorEmbree.tcc     | 14 +++++++++++---
 .../rmagine/simulation/OnDnSimulatorEmbree.tcc     | 12 ++++++++++--
 .../rmagine/simulation/PinholeSimulatorEmbree.tcc  | 14 +++++++++++---
 .../rmagine/simulation/SphereSimulatorEmbree.tcc   | 14 +++++++++++---
 4 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
index 7e1598c..9ef3298 100644
--- a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
@@ -21,6 +21,9 @@ void O1DnSimulatorEmbree::simulate(
     SimulationFlags flags = SimulationFlags::Zero();
     set_simulation_flags_<RAM>(ret, flags);
 
+    const float range_min = m_model->range.min;
+    const float range_max = m_model->range.max;
+
     #pragma omp parallel for
     for(size_t pid = 0; pid < Tbm.size(); pid++)
     {
@@ -52,8 +55,8 @@ void O1DnSimulatorEmbree::simulate(
                 rayhit.ray.dir_x = ray_dir_m.x;
                 rayhit.ray.dir_y = ray_dir_m.y;
                 rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
+                rayhit.ray.tnear = 0; // if set to m_model->range.min we would scan through near occlusions
+                rayhit.ray.tfar = range_max;
                 rayhit.ray.mask = -1;
                 rayhit.ray.flags = 0;
                 rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
@@ -67,7 +70,12 @@ void O1DnSimulatorEmbree::simulate(
                     {
                         if(flags.hits)
                         {
-                            ret.Hits<RAM>::hits[glob_id] = 1;
+                            if(rayhit.ray.tfar >= range_min)
+                            {
+                                ret.Hits<RAM>::hits[glob_id] = 1;
+                            } else {
+                                ret.Hits<RAM>::hits[glob_id] = 0;
+                            }
                         }
                     }
 
diff --git a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
index e003ff6..05a9606 100644
--- a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
@@ -20,6 +20,9 @@ void OnDnSimulatorEmbree::simulate(
 {
     SimulationFlags flags = SimulationFlags::Zero();
     set_simulation_flags_<RAM>(ret, flags);
+    
+    const float range_min = m_model->range.min;
+    const float range_max = m_model->range.max;
 
     #pragma omp parallel for
     for(size_t pid = 0; pid < Tbm.size(); pid++)
@@ -53,7 +56,7 @@ void OnDnSimulatorEmbree::simulate(
                 rayhit.ray.dir_y = ray_dir_m.y;
                 rayhit.ray.dir_z = ray_dir_m.z;
                 rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
+                rayhit.ray.tfar = range_max;
                 rayhit.ray.mask = -1;
                 rayhit.ray.flags = 0;
                 rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
@@ -67,7 +70,12 @@ void OnDnSimulatorEmbree::simulate(
                     {
                         if(flags.hits)
                         {
-                            ret.Hits<RAM>::hits[glob_id] = 1;
+                            if(rayhit.ray.tfar >= range_min)
+                            {
+                                ret.Hits<RAM>::hits[glob_id] = 1;
+                            } else {
+                                ret.Hits<RAM>::hits[glob_id] = 0;
+                            }
                         }
                     }
 
diff --git a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
index 956953c..9f57be4 100644
--- a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
@@ -21,6 +21,9 @@ void PinholeSimulatorEmbree::simulate(
     SimulationFlags flags = SimulationFlags::Zero();
     set_simulation_flags_<RAM>(ret, flags);
 
+    const float range_min = m_model->range.min;
+    const float range_max = m_model->range.max;
+
     #pragma omp parallel for
     for(size_t pid = 0; pid < Tbm.size(); pid++)
     {
@@ -49,8 +52,8 @@ void PinholeSimulatorEmbree::simulate(
                 rayhit.ray.dir_x = ray_dir_m.x;
                 rayhit.ray.dir_y = ray_dir_m.y;
                 rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
+                rayhit.ray.tnear = 0; // if set to m_model->range.min we would scan through near occlusions
+                rayhit.ray.tfar = range_max;
                 rayhit.ray.mask = -1;
                 rayhit.ray.flags = 0;
                 rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
@@ -64,7 +67,12 @@ void PinholeSimulatorEmbree::simulate(
                     {
                         if(flags.hits)
                         {
-                            ret.Hits<RAM>::hits[glob_id] = 1;
+                            if(rayhit.ray.tfar >= range_min)
+                            {
+                                ret.Hits<RAM>::hits[glob_id] = 1;
+                            } else {
+                                ret.Hits<RAM>::hits[glob_id] = 0;
+                            }
                         }
                     }
 
diff --git a/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc
index ac800af..d9c1ace 100644
--- a/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc
@@ -21,6 +21,9 @@ void SphereSimulatorEmbree::simulate(
     SimulationFlags flags = SimulationFlags::Zero();
     set_simulation_flags_<RAM>(ret, flags);
 
+    const float range_min = m_model->range.min;
+    const float range_max = m_model->range.max;
+
     #pragma omp parallel for
     for(size_t pid = 0; pid < Tbm.size(); pid++)
     {
@@ -49,8 +52,8 @@ void SphereSimulatorEmbree::simulate(
                 rayhit.ray.dir_x = ray_dir_m.x;
                 rayhit.ray.dir_y = ray_dir_m.y;
                 rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
+                rayhit.ray.tnear = 0.0; // if set to m_model->range.min we would scan through near occlusions
+                rayhit.ray.tfar = range_max;
                 rayhit.ray.mask = -1;
                 rayhit.ray.flags = 0;
                 rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
@@ -64,7 +67,12 @@ void SphereSimulatorEmbree::simulate(
                     {
                         if(flags.hits)
                         {
-                            ret.Hits<RAM>::hits[glob_id] = 1;
+                            if(rayhit.ray.tfar >= range_min)
+                            {
+                                ret.Hits<RAM>::hits[glob_id] = 1;
+                            } else {
+                                ret.Hits<RAM>::hits[glob_id] = 0;
+                            }
                         }
                     }
 

From e7f6039dfdcc21d4feb8b76b30ef9d1766fa5ac1 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Tue, 10 Sep 2024 22:54:16 +0200
Subject: [PATCH 12/44] added ouster component

---
 CMakeLists.txt                                |  8 ++
 src/rmagine_ouster/CMakeLists.txt             | 96 +++++++++++++++++++
 .../cmake/rmagine-ouster-config.cmake.in      | 12 +++
 .../include/rmagine/types/ouster_sensors.h    | 52 ++++++++++
 .../src/types/ouster_sensors.cpp              | 23 +++++
 5 files changed, 191 insertions(+)
 create mode 100644 src/rmagine_ouster/CMakeLists.txt
 create mode 100644 src/rmagine_ouster/cmake/rmagine-ouster-config.cmake.in
 create mode 100644 src/rmagine_ouster/include/rmagine/types/ouster_sensors.h
 create mode 100644 src/rmagine_ouster/src/types/ouster_sensors.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9ddff22..2cf0396 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -114,6 +114,9 @@ endif(OPENMP_FOUND)
 ########################################
 ## Optional Deps
 
+# for ouster config loading
+find_package(jsoncpp)
+
 ######################################
 ## CUDA: For Optix                  ##
 ######################################
@@ -178,6 +181,11 @@ set(RMAGINE_STATIC_LIBRARIES)
 ### RMAGINE CORE LIB
 add_subdirectory(src/rmagine_core)
 
+### RMAGINE OUSTER LIB
+if(jsoncpp_FOUND)
+    add_subdirectory(src/rmagine_ouster)
+endif(jsoncpp_FOUND)
+
 ### RMAGINE EMBREE LIB
 if(embree_FOUND)
   add_subdirectory(src/rmagine_embree)
diff --git a/src/rmagine_ouster/CMakeLists.txt b/src/rmagine_ouster/CMakeLists.txt
new file mode 100644
index 0000000..6570914
--- /dev/null
+++ b/src/rmagine_ouster/CMakeLists.txt
@@ -0,0 +1,96 @@
+
+message(STATUS "Building Core. Library: rmagine")
+
+find_package(jsoncpp REQUIRED)
+
+set(RMAGINE_CORE_SRCS
+    # Types
+    src/types/ouster_sensors.cpp
+)
+
+add_library(rmagine-ouster SHARED
+    ${RMAGINE_CORE_SRCS}
+)
+
+target_include_directories(rmagine-ouster
+  PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/rmagine-${rmagine_VERSION}>
+    ${ASSIMP_INCLUDE_DIRS}
+    ${Boost_INCLUDE_DIRS}
+)
+
+target_link_libraries(rmagine-ouster
+    rmagine-core
+    ${ASSIMP_LIBRARIES}
+    Eigen3::Eigen
+    ${Boost_LIBRARIES}
+    ${OpenMP_CXX_LIBRARIES}
+    jsoncpp
+)
+
+set_target_properties(rmagine-ouster
+  PROPERTIES
+    EXPORT_NAME ouster
+    SOVERSION ${rmagine_VERSION_MAJOR}
+    VERSION ${rmagine_VERSION}
+    COMPONENT ouster
+    CXX_STANDARD 17
+)
+
+# TODO: do this:
+add_library(rmagine ALIAS rmagine-ouster)
+add_library(rmagine::ouster ALIAS rmagine-ouster)
+
+
+list(APPEND RMAGINE_LIBRARIES rmagine-ouster)
+set(RMAGINE_LIBRARIES ${RMAGINE_LIBRARIES} PARENT_SCOPE)
+
+
+# CMAKE FIND SCRIPT
+include(CMakePackageConfigHelpers)
+
+###########
+## CORE
+############
+install(TARGETS rmagine-ouster 
+  EXPORT rmagine-ouster-targets
+  COMPONENT ouster)
+
+install(EXPORT rmagine-ouster-targets
+    FILE rmagine-ouster-targets.cmake
+    COMPONENT ouster
+    NAMESPACE rmagine::
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/rmagine-${rmagine_VERSION}
+)
+
+write_basic_package_version_file(
+  ${CMAKE_CURRENT_BINARY_DIR}/rmagine-ouster-config-version.cmake
+  VERSION ${rmagine_VERSION}
+  COMPATIBILITY SameMajorVersion
+)
+
+configure_package_config_file(cmake/rmagine-ouster-config.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/rmagine-ouster-config.cmake
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/rmagine-${rmagine_VERSION}
+)
+
+install(
+  FILES
+    ${CMAKE_CURRENT_BINARY_DIR}/rmagine-ouster-config.cmake
+    ${CMAKE_CURRENT_BINARY_DIR}/rmagine-ouster-config-version.cmake
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/rmagine-${rmagine_VERSION}
+  COMPONENT ouster
+)
+
+install(
+  DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/rmagine
+  COMPONENT ouster
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rmagine-${rmagine_VERSION}
+)
+
+set(CPACK_DEBIAN_CORE_PACKAGE_DEPENDS "libeigen3-dev, libassimp-dev" PARENT_SCOPE)
+
+list(APPEND CPACK_COMPONENTS_ALL ouster)
+set(CPACK_COMPONENTS_ALL ${CPACK_COMPONENTS_ALL} PARENT_SCOPE)
+
diff --git a/src/rmagine_ouster/cmake/rmagine-ouster-config.cmake.in b/src/rmagine_ouster/cmake/rmagine-ouster-config.cmake.in
new file mode 100644
index 0000000..a2d0340
--- /dev/null
+++ b/src/rmagine_ouster/cmake/rmagine-ouster-config.cmake.in
@@ -0,0 +1,12 @@
+@PACKAGE_INIT@
+
+include(${CMAKE_CURRENT_LIST_DIR}/rmagine-ouster-config-version.cmake)
+include(${CMAKE_CURRENT_LIST_DIR}/rmagine-ouster-targets.cmake)
+
+include(CMakeFindDependencyMacro)
+
+find_dependency(jsoncpp)
+
+check_required_components(ouster)
+
+set(rmagine_ouster_FOUND 1)
\ No newline at end of file
diff --git a/src/rmagine_ouster/include/rmagine/types/ouster_sensors.h b/src/rmagine_ouster/include/rmagine/types/ouster_sensors.h
new file mode 100644
index 0000000..afcaf92
--- /dev/null
+++ b/src/rmagine_ouster/include/rmagine/types/ouster_sensors.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2024, University Osnabrück
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the University Osnabrück nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL University Osnabrück BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * 
+ * @brief Predefined Sensor Models
+ *
+ * @date 10.09.2024
+ * @author Alexander Mock
+ * 
+ * @copyright Copyright (c) 2024, University Osnabrück. All rights reserved.
+ * This project is released under the 3-Clause BSD License.
+ * 
+ */
+#ifndef RMAGINE_TYPES_OUSTER_SENSORS_H
+#define RMAGINE_TYPES_OUSTER_SENSORS_H
+
+#include <rmagine/types/sensor_models.h>
+
+namespace rmagine
+{
+
+O1DnModel o1dn_from_ouster_config(std::string filename);
+
+} // namespace rmagine
+
+#endif // RMAGINE_TYPES_OUSTER_SENSORS_H
diff --git a/src/rmagine_ouster/src/types/ouster_sensors.cpp b/src/rmagine_ouster/src/types/ouster_sensors.cpp
new file mode 100644
index 0000000..a9fb132
--- /dev/null
+++ b/src/rmagine_ouster/src/types/ouster_sensors.cpp
@@ -0,0 +1,23 @@
+#include "rmagine/types/ouster_sensors.h"
+
+#include <jsoncpp/json/json.h>
+#include <fstream>
+
+namespace rmagine {
+
+O1DnModel o1dn_from_ouster_config(std::string filename)
+{
+  O1DnModel ret;
+
+  std::ifstream ouster_file(filename, std::ifstream::binary);
+
+  Json::Value ouster_config;
+  ouster_file >> ouster_config;
+
+  std::cout << ouster_config << std::endl;
+
+
+  return ret;
+}
+
+} // namespace rmagine
\ No newline at end of file

From d46fc28417ada5116423c2a5ae742ad13ef065c6 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Tue, 10 Sep 2024 23:01:16 +0200
Subject: [PATCH 13/44] renamed function added example ouster meta file

---
 dat/ouster_config/os0-128x1024x20-meta.json   | 461 ++++++++++++++++++
 .../include/rmagine/types/ouster_sensors.h    |   2 +-
 .../src/types/ouster_sensors.cpp              |   2 +-
 3 files changed, 463 insertions(+), 2 deletions(-)
 create mode 100644 dat/ouster_config/os0-128x1024x20-meta.json

diff --git a/dat/ouster_config/os0-128x1024x20-meta.json b/dat/ouster_config/os0-128x1024x20-meta.json
new file mode 100644
index 0000000..60026a7
--- /dev/null
+++ b/dat/ouster_config/os0-128x1024x20-meta.json
@@ -0,0 +1,461 @@
+{
+    "beam_altitude_angles": 
+    [
+        45.31,
+        44.31,
+        43.58,
+        43.16,
+        42.31,
+        41.32,
+        40.61,
+        40.19,
+        39.32,
+        38.35,
+        37.66,
+        37.22,
+        36.36,
+        35.42,
+        34.73,
+        34.28,
+        33.4,
+        32.48,
+        31.81,
+        31.34,
+        30.47,
+        29.58,
+        28.9,
+        28.44,
+        27.55,
+        26.68,
+        26.01,
+        25.53,
+        24.64,
+        23.8,
+        23.15,
+        22.64,
+        21.76,
+        20.93,
+        20.28,
+        19.77,
+        18.88,
+        18.09,
+        17.44,
+        16.91,
+        16.01,
+        15.25,
+        14.61,
+        14.06,
+        13.16,
+        12.42,
+        11.78,
+        11.25,
+        10.32,
+        9.61,
+        8.97,
+        8.4,
+        7.49,
+        6.81,
+        6.18,
+        5.58,
+        4.66,
+        4.01,
+        3.38,
+        2.76,
+        1.84,
+        1.2,
+        0.57,
+        -0.06,
+        -0.98,
+        -1.6,
+        -2.22,
+        -2.89,
+        -3.8,
+        -4.4,
+        -5.03,
+        -5.7,
+        -6.62,
+        -7.2,
+        -7.83,
+        -8.52,
+        -9.43,
+        -10,
+        -10.65,
+        -11.37,
+        -12.27,
+        -12.83,
+        -13.47,
+        -14.21,
+        -15.12,
+        -15.65,
+        -16.3,
+        -17.06,
+        -17.97,
+        -18.47,
+        -19.12,
+        -19.91,
+        -20.83,
+        -21.32,
+        -21.98,
+        -22.78,
+        -23.71,
+        -24.18,
+        -24.83,
+        -25.67,
+        -26.59,
+        -27.05,
+        -27.7,
+        -28.56,
+        -29.48,
+        -29.92,
+        -30.6,
+        -31.48,
+        -32.4,
+        -32.84,
+        -33.52,
+        -34.4,
+        -35.32,
+        -35.76,
+        -36.44,
+        -37.36,
+        -38.28,
+        -38.71,
+        -39.4,
+        -40.36,
+        -41.27,
+        -41.69,
+        -42.38,
+        -43.36,
+        -44.29,
+        -44.69,
+        -45.4,
+        -46.42
+    ],
+    "beam_azimuth_angles": 
+    [
+        11.58,
+        4.26,
+        -2.91,
+        -9.95,
+        11.11,
+        4.08,
+        -2.85,
+        -9.65,
+        10.69,
+        3.9,
+        -2.8,
+        -9.39,
+        10.34,
+        3.75,
+        -2.75,
+        -9.18,
+        10.03,
+        3.62,
+        -2.72,
+        -8.99,
+        9.77,
+        3.5,
+        -2.71,
+        -8.84,
+        9.53,
+        3.4,
+        -2.68,
+        -8.72,
+        9.32,
+        3.3,
+        -2.68,
+        -8.6,
+        9.14,
+        3.21,
+        -2.69,
+        -8.52,
+        8.99,
+        3.13,
+        -2.69,
+        -8.45,
+        8.84,
+        3.06,
+        -2.69,
+        -8.41,
+        8.73,
+        2.99,
+        -2.7,
+        -8.37,
+        8.63,
+        2.94,
+        -2.73,
+        -8.36,
+        8.54,
+        2.9,
+        -2.75,
+        -8.35,
+        8.48,
+        2.85,
+        -2.77,
+        -8.37,
+        8.43,
+        2.83,
+        -2.81,
+        -8.4,
+        8.4,
+        2.78,
+        -2.84,
+        -8.43,
+        8.38,
+        2.76,
+        -2.87,
+        -8.51,
+        8.37,
+        2.72,
+        -2.91,
+        -8.57,
+        8.38,
+        2.72,
+        -2.96,
+        -8.65,
+        8.4,
+        2.7,
+        -3.03,
+        -8.76,
+        8.45,
+        2.7,
+        -3.08,
+        -8.87,
+        8.49,
+        2.69,
+        -3.15,
+        -9.01,
+        8.57,
+        2.69,
+        -3.22,
+        -9.18,
+        8.67,
+        2.7,
+        -3.31,
+        -9.36,
+        8.79,
+        2.71,
+        -3.41,
+        -9.58,
+        8.92,
+        2.73,
+        -3.52,
+        -9.81,
+        9.09,
+        2.77,
+        -3.63,
+        -10.1,
+        9.27,
+        2.79,
+        -3.78,
+        -10.42,
+        9.5,
+        2.84,
+        -3.93,
+        -10.8,
+        9.78,
+        2.9,
+        -4.11,
+        -11.23,
+        10.11,
+        2.95,
+        -4.34,
+        -11.75
+    ],
+    "build_date": "2022-09-21T17:47:45Z",
+    "build_rev": "v2.4.0",
+    "client_version": "ouster_client 0.7.1",
+    "data_format": 
+    {
+        "column_window": 
+        [
+            0,
+            1023
+        ],
+        "columns_per_frame": 1024,
+        "columns_per_packet": 16,
+        "pixel_shift_by_row": 
+        [
+            33,
+            12,
+            -8,
+            -28,
+            32,
+            12,
+            -8,
+            -27,
+            30,
+            11,
+            -8,
+            -27,
+            29,
+            11,
+            -8,
+            -26,
+            29,
+            10,
+            -8,
+            -26,
+            28,
+            10,
+            -8,
+            -25,
+            27,
+            10,
+            -8,
+            -25,
+            27,
+            9,
+            -8,
+            -24,
+            26,
+            9,
+            -8,
+            -24,
+            26,
+            9,
+            -8,
+            -24,
+            25,
+            9,
+            -8,
+            -24,
+            25,
+            9,
+            -8,
+            -24,
+            25,
+            8,
+            -8,
+            -24,
+            24,
+            8,
+            -8,
+            -24,
+            24,
+            8,
+            -8,
+            -24,
+            24,
+            8,
+            -8,
+            -24,
+            24,
+            8,
+            -8,
+            -24,
+            24,
+            8,
+            -8,
+            -24,
+            24,
+            8,
+            -8,
+            -24,
+            24,
+            8,
+            -8,
+            -25,
+            24,
+            8,
+            -9,
+            -25,
+            24,
+            8,
+            -9,
+            -25,
+            24,
+            8,
+            -9,
+            -26,
+            24,
+            8,
+            -9,
+            -26,
+            25,
+            8,
+            -9,
+            -27,
+            25,
+            8,
+            -10,
+            -27,
+            25,
+            8,
+            -10,
+            -28,
+            26,
+            8,
+            -10,
+            -29,
+            26,
+            8,
+            -11,
+            -30,
+            27,
+            8,
+            -11,
+            -31,
+            28,
+            8,
+            -12,
+            -32,
+            29,
+            8,
+            -12,
+            -33
+        ],
+        "pixels_per_column": 128,
+        "udp_profile_imu": "LEGACY",
+        "udp_profile_lidar": "LEGACY"
+    },
+    "hostname": "",
+    "image_rev": "ousteros-image-prod-aries-v2.4.0+20220921174636",
+    "imu_to_sensor_transform": 
+    [
+        1,
+        0,
+        0,
+        6.253,
+        0,
+        1,
+        0,
+        -11.775,
+        0,
+        0,
+        1,
+        7.645,
+        0,
+        0,
+        0,
+        1
+    ],
+    "initialization_id": 7109749,
+    "json_calibration_version": 4,
+    "lidar_mode": "1024x20",
+    "lidar_origin_to_beam_origin_mm": 27.67,
+    "lidar_to_sensor_transform": 
+    [
+        -1,
+        0,
+        0,
+        0,
+        0,
+        -1,
+        0,
+        0,
+        0,
+        0,
+        1,
+        36.18,
+        0,
+        0,
+        0,
+        1
+    ],
+    "prod_line": "OS-0-128",
+    "prod_pn": "840-103574-06",
+    "prod_sn": "122219001184",
+    "status": "RUNNING",
+    "udp_port_imu": 55235,
+    "udp_port_lidar": 52031
+}
diff --git a/src/rmagine_ouster/include/rmagine/types/ouster_sensors.h b/src/rmagine_ouster/include/rmagine/types/ouster_sensors.h
index afcaf92..9ec65e9 100644
--- a/src/rmagine_ouster/include/rmagine/types/ouster_sensors.h
+++ b/src/rmagine_ouster/include/rmagine/types/ouster_sensors.h
@@ -45,7 +45,7 @@
 namespace rmagine
 {
 
-O1DnModel o1dn_from_ouster_config(std::string filename);
+O1DnModel o1dn_from_ouster_meta_file(std::string filename);
 
 } // namespace rmagine
 
diff --git a/src/rmagine_ouster/src/types/ouster_sensors.cpp b/src/rmagine_ouster/src/types/ouster_sensors.cpp
index a9fb132..7aaa557 100644
--- a/src/rmagine_ouster/src/types/ouster_sensors.cpp
+++ b/src/rmagine_ouster/src/types/ouster_sensors.cpp
@@ -5,7 +5,7 @@
 
 namespace rmagine {
 
-O1DnModel o1dn_from_ouster_config(std::string filename)
+O1DnModel o1dn_from_ouster_meta_file(std::string filename)
 {
   O1DnModel ret;
 

From bd1b6969b82414e8c447ddd31934197a97c01b95 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Tue, 10 Sep 2024 23:55:09 +0200
Subject: [PATCH 14/44] added ouster

---
 .../src/types/ouster_sensors.cpp              | 71 +++++++++++++++++--
 1 file changed, 65 insertions(+), 6 deletions(-)

diff --git a/src/rmagine_ouster/src/types/ouster_sensors.cpp b/src/rmagine_ouster/src/types/ouster_sensors.cpp
index 7aaa557..de2d09c 100644
--- a/src/rmagine_ouster/src/types/ouster_sensors.cpp
+++ b/src/rmagine_ouster/src/types/ouster_sensors.cpp
@@ -7,17 +7,76 @@ namespace rmagine {
 
 O1DnModel o1dn_from_ouster_meta_file(std::string filename)
 {
-  O1DnModel ret;
+    O1DnModel model;
 
-  std::ifstream ouster_file(filename, std::ifstream::binary);
+    std::ifstream ouster_file(filename, std::ifstream::binary);
 
-  Json::Value ouster_config;
-  ouster_file >> ouster_config;
+    Json::Value ouster_config;
+    ouster_file >> ouster_config;
 
-  std::cout << ouster_config << std::endl;
+    std::string lidar_mode = ouster_config["lidar_mode"].asString();
 
+    size_t x_id = lidar_mode.find("x");
+    unsigned int lidar_width = std::stoi(lidar_mode.substr(0, x_id));
+    unsigned int lidar_frequency = std::stoi(lidar_mode.substr(x_id+1));
 
-  return ret;
+    std::vector<float> beam_altitude_angles;
+    for(auto val : ouster_config["beam_altitude_angles"])
+    {
+        beam_altitude_angles.push_back(val.asFloat());
+    }
+
+    unsigned int lidar_height = beam_altitude_angles.size();
+
+    std::vector<float> beam_azimuth_angles;
+    for(auto val : ouster_config["beam_azimuth_angles"])
+    {
+        beam_azimuth_angles.push_back(val.asFloat());
+    }
+
+    const float lidar_origin_to_beam_origin_mm = ouster_config["lidar_origin_to_beam_origin_mm"].asFloat();
+
+    std::cout << "Loading Ouster Lidar: " << lidar_width << "x" << lidar_height << ". " << lidar_frequency << "hz" << std::endl;
+
+    model.width = lidar_width;
+    model.height = lidar_height;
+    model.dirs.resize(lidar_width * lidar_height);
+    
+    const float hor_start_angle = 0.0;
+    const float hor_angle_inc = (2.0 * M_PI) / static_cast<float>(model.getWidth());
+
+    const float ver_start_angle = 0.0;
+
+    
+
+    model.orig = {0.0, 0.0, 0.0};
+
+    model.range.min = 0.1;
+    model.range.max = 80.0;
+
+    for(unsigned int vid = 0; vid < model.getHeight(); vid++)
+    {
+        // vertical angle
+        const float phi = beam_altitude_angles[vid] * DEG_TO_RAD_F;
+
+        for(unsigned int hid = 0; hid < model.getWidth(); hid++)
+        {
+            // horizontal angle
+            const float theta_offset = beam_azimuth_angles[vid] * DEG_TO_RAD_F;
+            const float theta = hor_start_angle + hor_angle_inc * static_cast<float>(hid) + theta_offset;
+
+            // compute direction: polar to cartesian
+            Vector dir{
+                cosf(phi) * cosf(theta), 
+                cosf(phi) * sinf(theta), 
+                sinf(phi)};
+
+            const unsigned int buf_id = model.getBufferId(vid, hid);
+            model.dirs[buf_id] = dir;
+        }
+    }
+
+    return model;
 }
 
 } // namespace rmagine
\ No newline at end of file

From 7a23b415be6b024221c2988da2327426e93ad371 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 00:02:01 +0200
Subject: [PATCH 15/44] removed cout

---
 src/rmagine_ouster/src/types/ouster_sensors.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/rmagine_ouster/src/types/ouster_sensors.cpp b/src/rmagine_ouster/src/types/ouster_sensors.cpp
index de2d09c..b1af656 100644
--- a/src/rmagine_ouster/src/types/ouster_sensors.cpp
+++ b/src/rmagine_ouster/src/types/ouster_sensors.cpp
@@ -36,7 +36,7 @@ O1DnModel o1dn_from_ouster_meta_file(std::string filename)
 
     const float lidar_origin_to_beam_origin_mm = ouster_config["lidar_origin_to_beam_origin_mm"].asFloat();
 
-    std::cout << "Loading Ouster Lidar: " << lidar_width << "x" << lidar_height << ". " << lidar_frequency << "hz" << std::endl;
+    // std::cout << "Loading Ouster Lidar: " << lidar_width << "x" << lidar_height << ". " << lidar_frequency << "hz" << std::endl;
 
     model.width = lidar_width;
     model.height = lidar_height;

From 5e3efe0acb4cb377dd36a8855312844862b8ccb2 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 09:46:30 +0200
Subject: [PATCH 16/44] added convenience functions for single scan simulation.
 removed deprecated function resizeMemoryBundle

---
 .../rmagine/simulation/SimulationResults.hpp  | 21 +------------------
 .../include/rmagine/types/Bundle.hpp          |  2 --
 .../include/rmagine/types/Memory.hpp          | 16 ++++++++++++++
 .../simulation/O1DnSimulatorEmbree.hpp        | 21 ++++++++++++++++++-
 .../simulation/O1DnSimulatorEmbree.tcc        | 19 +++++++++++++++++
 .../simulation/OnDnSimulatorEmbree.hpp        | 21 ++++++++++++++++++-
 .../simulation/OnDnSimulatorEmbree.tcc        | 20 ++++++++++++++++++
 .../simulation/PinholeSimulatorEmbree.hpp     | 21 ++++++++++++++++++-
 .../simulation/PinholeSimulatorEmbree.tcc     | 20 ++++++++++++++++++
 .../simulation/SphereSimulatorEmbree.hpp      | 21 ++++++++++++++++++-
 .../simulation/SphereSimulatorEmbree.tcc      | 20 ++++++++++++++++++
 11 files changed, 176 insertions(+), 26 deletions(-)

diff --git a/src/rmagine_core/include/rmagine/simulation/SimulationResults.hpp b/src/rmagine_core/include/rmagine/simulation/SimulationResults.hpp
index aa8fe2c..2072087 100644
--- a/src/rmagine_core/include/rmagine/simulation/SimulationResults.hpp
+++ b/src/rmagine_core/include/rmagine/simulation/SimulationResults.hpp
@@ -97,7 +97,6 @@ struct FaceIds {
     Memory<unsigned int, MemT> face_ids;
 };
 
-
 /**
  * @brief GeomIds computed by the simulators
  * 
@@ -156,7 +155,7 @@ template<typename MemT, typename BundleT>
 static void resize_memory_bundle(BundleT& res, 
     unsigned int W,
     unsigned int H,
-    unsigned int N )
+    unsigned int N = 1 )
 {
     if constexpr(BundleT::template has<Hits<MemT> >())
     {
@@ -194,24 +193,6 @@ static void resize_memory_bundle(BundleT& res,
     }
 }
 
-// template<typename BundleT>
-// static void resize_memory_bundle(BundleT& res, 
-//     unsigned int W,
-//     unsigned int H,
-//     unsigned int N )
-// {
-//     resize_memory_bundle_<RAM>(res, W, H, N);
-// }
-
-template<typename MemT, typename BundleT>
-[[deprecated("Use resize_memory_bundle() instead.")]]
-void resizeMemoryBundle(BundleT& res, 
-    unsigned int W,
-    unsigned int H,
-    unsigned int N )
-{
-    resize_memory_bundle<BundleT>(res, W, H, N);
-}
 
 } // namespace rmagine
 
diff --git a/src/rmagine_core/include/rmagine/types/Bundle.hpp b/src/rmagine_core/include/rmagine/types/Bundle.hpp
index cd01581..2495263 100644
--- a/src/rmagine_core/include/rmagine/types/Bundle.hpp
+++ b/src/rmagine_core/include/rmagine/types/Bundle.hpp
@@ -93,8 +93,6 @@ struct Bundle : public Tp...
         static constexpr bool value = has_type<F, elems>::type::value;
     };
 
-
-
     template <typename T, typename Tuple>
     struct Index;
 
diff --git a/src/rmagine_core/include/rmagine/types/Memory.hpp b/src/rmagine_core/include/rmagine/types/Memory.hpp
index 3b0279c..ed7b170 100644
--- a/src/rmagine_core/include/rmagine/types/Memory.hpp
+++ b/src/rmagine_core/include/rmagine/types/Memory.hpp
@@ -39,6 +39,7 @@
 #include <iostream>
 #include <cstring>
 #include <type_traits>
+#include <exception>
 
 #include <rmagine/types/shared_functions.h>
 
@@ -46,6 +47,15 @@ namespace rmagine {
 
 struct RAM;
 
+class MemoryResizeError : public std::runtime_error {
+public:
+  MemoryResizeError()
+  :std::runtime_error("rmagine: cannot resize memory view!")
+  {
+
+  }
+};
+
 template<typename DataT, typename MemT = RAM>
 class MemoryView {
 public:
@@ -126,6 +136,11 @@ class MemoryView {
         return m_size;
     }
 
+    // Shall we introduce this? 
+    // virtual void resize(size_t N) {
+    //     throw MemoryResizeError();
+    // }
+
     MemoryView<DataT, MemT> slice(size_t idx_start, size_t idx_end)
     {
         return MemoryView<DataT, MemT>(m_mem + idx_start, idx_end - idx_start);
@@ -173,6 +188,7 @@ class Memory : public MemoryView<DataT, MemT> {
 
     ~Memory();
 
+    // virtual void resize(size_t N);
     void resize(size_t N);
 
     // Copy for assignment of same MemT
diff --git a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.hpp b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.hpp
index 714942e..1d6da43 100644
--- a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.hpp
+++ b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.hpp
@@ -112,7 +112,26 @@ class O1DnSimulatorEmbree {
     void setModel(const MemoryView<O1DnModel_<RAM>, RAM>& model);
     void setModel(const O1DnModel_<RAM>& model);
 
-    // Generic Version
+    /**
+     * @brief Simulate from one pose
+     * 
+     * @tparam BundleT 
+     * @param Tbm 
+     * @param ret 
+     */
+    template<typename BundleT>
+    void simulate(const Transform& Tbm, BundleT& ret) const;
+
+    template<typename BundleT>
+    BundleT simulate(const Transform& Tbm) const;
+
+    /**
+     * @brief Simulate for multiple poses at once
+     * 
+     * @tparam BundleT 
+     * @param Tbm 
+     * @param ret 
+     */
     template<typename BundleT>
     void simulate(const MemoryView<Transform, RAM>& Tbm,
         BundleT& ret) const;
diff --git a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
index 9ef3298..dc0b500 100644
--- a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
@@ -12,6 +12,25 @@
 namespace rmagine
 {
 
+template<typename BundleT>
+void O1DnSimulatorEmbree::simulate(
+    const Transform& Tbm,
+    BundleT& ret) const
+{
+    MemoryView<Transform, RAM> Tbm_mem(&Tbm, 1);
+    // TODO: change parallelization scheme for single simulations?
+    simulate(Tbm_mem, ret);
+}
+
+template<typename BundleT>
+BundleT O1DnSimulatorEmbree::simulate(
+    const Transform& Tbm) const
+{
+    BundleT res;
+    resize_memory_bundle<RAM>(res, m_model->getWidth(), m_model->getHeight(), 1);
+    simulate(Tbm, res);
+    return res;
+}
 
 template<typename BundleT>
 void O1DnSimulatorEmbree::simulate(
diff --git a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.hpp b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.hpp
index ccfe7ff..6a7754f 100644
--- a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.hpp
+++ b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.hpp
@@ -112,8 +112,26 @@ class OnDnSimulatorEmbree {
     void setModel(const OnDnModel_<RAM>& model);
     void setModel(const MemoryView<OnDnModel_<RAM>, RAM>& model);
 
+    /**
+     * @brief Simulate from one pose
+     * 
+     * @tparam BundleT 
+     * @param Tbm 
+     * @param ret 
+     */
+    template<typename BundleT>
+    void simulate(const Transform& Tbm, BundleT& ret) const;
 
-    // Generic Version
+    template<typename BundleT>
+    BundleT simulate(const Transform& Tbm) const;
+
+    /**
+     * @brief Simulate for multiple poses at once
+     * 
+     * @tparam BundleT 
+     * @param Tbm 
+     * @param ret 
+     */
     template<typename BundleT>
     void simulate(const MemoryView<Transform, RAM>& Tbm,
         BundleT& ret) const;
@@ -121,6 +139,7 @@ class OnDnSimulatorEmbree {
     template<typename BundleT>
     BundleT simulate(const MemoryView<Transform, RAM>& Tbm) const;
 
+
     // [[deprecated("Use simulate<AttrT>() instead.")]]
     void simulateRanges(
         const MemoryView<Transform, RAM>& Tbm, 
diff --git a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
index 05a9606..5e42782 100644
--- a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
@@ -13,6 +13,26 @@
 namespace rmagine
 {
 
+template<typename BundleT>
+void OnDnSimulatorEmbree::simulate(
+    const Transform& Tbm,
+    BundleT& ret) const
+{
+    MemoryView<Transform, RAM> Tbm_mem(&Tbm, 1);
+    // TODO: change parallelization scheme for single simulations?
+    simulate(Tbm_mem, ret);
+}
+
+template<typename BundleT>
+BundleT OnDnSimulatorEmbree::simulate(
+    const Transform& Tbm) const
+{
+    BundleT res;
+    resize_memory_bundle<RAM>(res, m_model->getWidth(), m_model->getHeight(), 1);
+    simulate(Tbm, res);
+    return res;
+}
+
 template<typename BundleT>
 void OnDnSimulatorEmbree::simulate(
     const MemoryView<Transform, RAM>& Tbm,
diff --git a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.hpp b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.hpp
index d940fb5..b1af509 100644
--- a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.hpp
+++ b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.hpp
@@ -116,7 +116,26 @@ class PinholeSimulatorEmbree {
     void setModel(const MemoryView<PinholeModel, RAM>& model);
     void setModel(const PinholeModel& model);
 
-    // Generic Version
+    /**
+     * @brief Simulate from one pose
+     * 
+     * @tparam BundleT 
+     * @param Tbm 
+     * @param ret 
+     */
+    template<typename BundleT>
+    void simulate(const Transform& Tbm, BundleT& ret) const;
+
+    template<typename BundleT>
+    BundleT simulate(const Transform& Tbm) const;
+
+    /**
+     * @brief Simulate for multiple poses at once
+     * 
+     * @tparam BundleT 
+     * @param Tbm 
+     * @param ret 
+     */
     template<typename BundleT>
     void simulate(const MemoryView<Transform, RAM>& Tbm,
         BundleT& ret) const;
diff --git a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
index 9f57be4..ce9b4af 100644
--- a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
@@ -13,6 +13,26 @@
 namespace rmagine
 {
 
+template<typename BundleT>
+void PinholeSimulatorEmbree::simulate(
+    const Transform& Tbm,
+    BundleT& ret) const
+{
+    MemoryView<Transform, RAM> Tbm_mem(&Tbm, 1);
+    // TODO: change parallelization scheme for single simulations?
+    simulate(Tbm_mem, ret);
+}
+
+template<typename BundleT>
+BundleT PinholeSimulatorEmbree::simulate(
+    const Transform& Tbm) const
+{
+    BundleT res;
+    resize_memory_bundle<RAM>(res, m_model->getWidth(), m_model->getHeight(), 1);
+    simulate(Tbm, res);
+    return res;
+}
+
 template<typename BundleT>
 void PinholeSimulatorEmbree::simulate(
     const MemoryView<Transform, RAM>& Tbm,
diff --git a/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.hpp b/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.hpp
index b60736e..2ab9fae 100644
--- a/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.hpp
+++ b/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.hpp
@@ -116,7 +116,26 @@ class SphereSimulatorEmbree {
     void setModel(const MemoryView<SphericalModel, RAM>& model);
     void setModel(const SphericalModel& model);
 
-    // Generic Version
+    /**
+     * @brief Simulate from one pose
+     * 
+     * @tparam BundleT 
+     * @param Tbm 
+     * @param ret 
+     */
+    template<typename BundleT>
+    void simulate(const Transform& Tbm, BundleT& ret) const;
+
+    template<typename BundleT>
+    BundleT simulate(const Transform& Tbm) const;
+
+    /**
+     * @brief Simulate for multiple poses at once
+     * 
+     * @tparam BundleT 
+     * @param Tbm 
+     * @param ret 
+     */
     template<typename BundleT>
     void simulate(const MemoryView<Transform, RAM>& Tbm,
         BundleT& ret) const;
diff --git a/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc
index d9c1ace..160c635 100644
--- a/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc
@@ -13,6 +13,26 @@
 namespace rmagine
 {
 
+template<typename BundleT>
+void SphereSimulatorEmbree::simulate(
+    const Transform& Tbm,
+    BundleT& ret) const
+{
+    MemoryView<Transform, RAM> Tbm_mem(&Tbm, 1);
+    // TODO: change parallelization scheme for single simulations?
+    simulate(Tbm_mem, ret);
+}
+
+template<typename BundleT>
+BundleT SphereSimulatorEmbree::simulate(
+    const Transform& Tbm) const
+{
+    BundleT res;
+    resize_memory_bundle<RAM>(res, m_model->getWidth(), m_model->getHeight(), 1);
+    simulate(Tbm, res);
+    return res;
+}
+
 template<typename BundleT>
 void SphereSimulatorEmbree::simulate(
     const MemoryView<Transform, RAM>& Tbm,

From 3724c5325f8a2e7726033ea636164e9884aedb62 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 10:07:45 +0200
Subject: [PATCH 17/44] cleanup

---
 .../include/rmagine/map/embree/EmbreeMesh.hpp            | 9 ---------
 src/rmagine_embree/src/map/embree/EmbreeMesh.cpp         | 2 --
 2 files changed, 11 deletions(-)

diff --git a/src/rmagine_embree/include/rmagine/map/embree/EmbreeMesh.hpp b/src/rmagine_embree/include/rmagine/map/embree/EmbreeMesh.hpp
index ea0a580..fe7925c 100644
--- a/src/rmagine_embree/include/rmagine/map/embree/EmbreeMesh.hpp
+++ b/src/rmagine_embree/include/rmagine/map/embree/EmbreeMesh.hpp
@@ -92,8 +92,6 @@ class EmbreeMesh
 
     void initVertexNormals();
 
-    bool closestPointFunc2(RTCPointQueryFunctionArguments* args);
-
     // PUBLIC ATTRIBUTES
     MemoryView<Face, RAM> faces() const;
     MemoryView<Vertex, RAM> vertices() const;
@@ -103,9 +101,6 @@ class EmbreeMesh
     MemoryView<const Vertex, RAM> verticesTransformed() const;
     MemoryView<const Vector, RAM> faceNormalsTransformed() const;
     
-    
-    
-
     void computeFaceNormals();
 
     /**
@@ -137,10 +132,6 @@ class EmbreeMesh
     Vertex* m_vertices_transformed;
     Memory<Vector, RAM> m_face_normals_transformed;
     Memory<Vector, RAM> m_vertex_normals_transformed;
-
-    // boost::function<bool (RTCPointQueryFunctionArguments*)> m_closest_point_func;
-    // RTCPointQueryFunction* m_closest_point_func_raw;
-
 };
 
 using EmbreeMeshPtr = std::shared_ptr<EmbreeMesh>;
diff --git a/src/rmagine_embree/src/map/embree/EmbreeMesh.cpp b/src/rmagine_embree/src/map/embree/EmbreeMesh.cpp
index cc29067..8a87011 100644
--- a/src/rmagine_embree/src/map/embree/EmbreeMesh.cpp
+++ b/src/rmagine_embree/src/map/embree/EmbreeMesh.cpp
@@ -322,6 +322,4 @@ void EmbreeMesh::apply()
     }
 }
 
-// pt2ConstMember = &EmbreeMesh::closestPointFunc2; 
-
 } // namespace rmagine
\ No newline at end of file

From 957f8b484373b6e3419d8e16adde3a60053a102c Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 10:38:42 +0200
Subject: [PATCH 18/44] added const functions

---
 apps/rmagine_benchmark/Main.cpp               |   2 +-
 .../simulation/O1DnSimulatorEmbree.hpp        |  46 +++----
 .../simulation/O1DnSimulatorEmbree.tcc        |  21 ++-
 .../simulation/OnDnSimulatorEmbree.hpp        |  45 +++---
 .../simulation/OnDnSimulatorEmbree.tcc        |  21 ++-
 .../simulation/PinholeSimulatorEmbree.hpp     |  43 +++---
 .../simulation/PinholeSimulatorEmbree.tcc     |  11 +-
 .../simulation/SphereSimulatorEmbree.hpp      |  44 +++---
 .../simulation/SphereSimulatorEmbree.tcc      |  21 ++-
 .../src/simulation/O1DnSimulatorEmbree.cpp    | 130 +-----------------
 .../src/simulation/OnDnSimulatorEmbree.cpp    | 128 -----------------
 .../src/simulation/PinholeSimulatorEmbree.cpp | 114 ---------------
 .../src/simulation/SphereSimulatorEmbree.cpp  | 116 ----------------
 13 files changed, 138 insertions(+), 604 deletions(-)

diff --git a/apps/rmagine_benchmark/Main.cpp b/apps/rmagine_benchmark/Main.cpp
index 3cfc533..91e8359 100644
--- a/apps/rmagine_benchmark/Main.cpp
+++ b/apps/rmagine_benchmark/Main.cpp
@@ -116,7 +116,7 @@ int main(int argc, char** argv)
         // Define what to simulate
         double velos_per_second_mean = 0.0;
 
-        std::cout << "- range of last ray: " << cpu_sim->simulateRanges(Tbm)[Tbm.size() * model->phi.size * model->theta.size - 1] << std::endl;
+        // std::cout << "- range of last ray: " << cpu_sim->simulateRanges(Tbm)[Tbm.size() * model->phi.size * model->theta.size - 1] << std::endl;
         std::cout << "-- Starting Benchmark --" << std::endl;
 
         // predefine result buffer
diff --git a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.hpp b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.hpp
index 1d6da43..9892808 100644
--- a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.hpp
+++ b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.hpp
@@ -112,6 +112,17 @@ class O1DnSimulatorEmbree {
     void setModel(const MemoryView<O1DnModel_<RAM>, RAM>& model);
     void setModel(const O1DnModel_<RAM>& model);
 
+    inline Memory<O1DnModel_<RAM>, RAM> model() const
+    {
+        return m_model;
+    }
+
+    inline EmbreeMapPtr map() const 
+    {
+        return m_map;
+    }
+
+
     /**
      * @brief Simulate from one pose
      * 
@@ -137,37 +148,14 @@ class O1DnSimulatorEmbree {
         BundleT& ret) const;
 
     template<typename BundleT>
-    BundleT simulate(const MemoryView<Transform, RAM>& Tbm) const;
-
-
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    void simulateRanges(
-        const MemoryView<Transform, RAM>& Tbm, 
-        MemoryView<float, RAM>& ranges) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    Memory<float, RAM> simulateRanges(
-        const MemoryView<Transform, RAM>& Tbm) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    void simulateHits(
-        const MemoryView<Transform, RAM>& Tbm, 
-        MemoryView<uint8_t, RAM>& hits) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    Memory<uint8_t, RAM> simulateHits(
-        const MemoryView<Transform, RAM>& Tbm) const;
+    void simulate(const MemoryView<const Transform, RAM>& Tbm,
+        BundleT& ret) const;
 
-    inline Memory<O1DnModel_<RAM>, RAM> model() const
-    {
-        return m_model;
-    }
+    template<typename BundleT>
+    BundleT simulate(const MemoryView<Transform, RAM>& Tbm) const;
 
-    inline EmbreeMapPtr map() const 
-    {
-        return m_map;
-    }
+    template<typename BundleT>
+    BundleT simulate(const MemoryView<const Transform, RAM>& Tbm) const;
 
 protected:
     EmbreeMapPtr m_map;
diff --git a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
index dc0b500..27a5cee 100644
--- a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
@@ -17,8 +17,8 @@ void O1DnSimulatorEmbree::simulate(
     const Transform& Tbm,
     BundleT& ret) const
 {
-    MemoryView<Transform, RAM> Tbm_mem(&Tbm, 1);
     // TODO: change parallelization scheme for single simulations?
+    const MemoryView<const Transform, RAM> Tbm_mem(&Tbm, 1);
     simulate(Tbm_mem, ret);
 }
 
@@ -36,6 +36,15 @@ template<typename BundleT>
 void O1DnSimulatorEmbree::simulate(
     const MemoryView<Transform, RAM>& Tbm,
     BundleT& ret) const
+{
+    const MemoryView<const Transform, RAM> Tbm_const(Tbm.raw(), Tbm.size());
+    simulate(Tbm, ret);
+}
+
+template<typename BundleT>
+void O1DnSimulatorEmbree::simulate(
+    const MemoryView<const Transform, RAM>& Tbm,
+    BundleT& ret) const
 {
     SimulationFlags flags = SimulationFlags::Zero();
     set_simulation_flags_<RAM>(ret, flags);
@@ -244,4 +253,14 @@ BundleT O1DnSimulatorEmbree::simulate(
     return res;
 }
 
+template<typename BundleT>
+BundleT O1DnSimulatorEmbree::simulate(
+    const MemoryView<const Transform, RAM>& Tbm) const
+{
+    BundleT res;
+    resize_memory_bundle<RAM>(res, m_model->getWidth(), m_model->getHeight(), Tbm.size());
+    simulate(Tbm, res);
+    return res;
+}
+
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.hpp b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.hpp
index 6a7754f..af2015c 100644
--- a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.hpp
+++ b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.hpp
@@ -112,6 +112,16 @@ class OnDnSimulatorEmbree {
     void setModel(const OnDnModel_<RAM>& model);
     void setModel(const MemoryView<OnDnModel_<RAM>, RAM>& model);
 
+    inline Memory<OnDnModel_<RAM>, RAM> model() const
+    {
+      return m_model;
+    }
+
+    inline EmbreeMapPtr map() const 
+    {
+        return m_map;
+    }
+
     /**
      * @brief Simulate from one pose
      * 
@@ -137,37 +147,16 @@ class OnDnSimulatorEmbree {
         BundleT& ret) const;
 
     template<typename BundleT>
-    BundleT simulate(const MemoryView<Transform, RAM>& Tbm) const;
-
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    void simulateRanges(
-        const MemoryView<Transform, RAM>& Tbm, 
-        MemoryView<float, RAM>& ranges) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    Memory<float, RAM> simulateRanges(
-        const MemoryView<Transform, RAM>& Tbm) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    void simulateHits(
-        const MemoryView<Transform, RAM>& Tbm, 
-        MemoryView<uint8_t, RAM>& hits) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    Memory<uint8_t, RAM> simulateHits(
-        const MemoryView<Transform, RAM>& Tbm) const;
+    void simulate(const MemoryView<const Transform, RAM>& Tbm,
+        BundleT& ret) const;
 
-    inline Memory<OnDnModel_<RAM>, RAM> model() const
-    {
-      return m_model;
-    }
+    template<typename BundleT>
+    BundleT simulate(const MemoryView<Transform, RAM>& Tbm) const;
 
-    inline EmbreeMapPtr map() const 
-    {
-        return m_map;
-    }
+    template<typename BundleT>
+    BundleT simulate(const MemoryView<const Transform, RAM>& Tbm) const;
 
+    
 protected:
     EmbreeMapPtr m_map;
     
diff --git a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
index 5e42782..7b88ee5 100644
--- a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
@@ -18,8 +18,8 @@ void OnDnSimulatorEmbree::simulate(
     const Transform& Tbm,
     BundleT& ret) const
 {
-    MemoryView<Transform, RAM> Tbm_mem(&Tbm, 1);
     // TODO: change parallelization scheme for single simulations?
+    const MemoryView<const Transform, RAM> Tbm_mem(&Tbm, 1);
     simulate(Tbm_mem, ret);
 }
 
@@ -37,6 +37,15 @@ template<typename BundleT>
 void OnDnSimulatorEmbree::simulate(
     const MemoryView<Transform, RAM>& Tbm,
     BundleT& ret) const
+{
+    const MemoryView<const Transform, RAM> Tbm_const(Tbm.raw(), Tbm.size());
+    simulate(Tbm, ret);
+}
+
+template<typename BundleT>
+void OnDnSimulatorEmbree::simulate(
+    const MemoryView<const Transform, RAM>& Tbm,
+    BundleT& ret) const
 {
     SimulationFlags flags = SimulationFlags::Zero();
     set_simulation_flags_<RAM>(ret, flags);
@@ -245,4 +254,14 @@ BundleT OnDnSimulatorEmbree::simulate(
     return res;
 }
 
+template<typename BundleT>
+BundleT OnDnSimulatorEmbree::simulate(
+    const MemoryView<const Transform, RAM>& Tbm) const
+{
+    BundleT res;
+    resize_memory_bundle<RAM>(res, m_model->getWidth(), m_model->getHeight(), Tbm.size());
+    simulate(Tbm, res);
+    return res;
+}
+
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.hpp b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.hpp
index b1af509..04e7aab 100644
--- a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.hpp
+++ b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.hpp
@@ -116,6 +116,16 @@ class PinholeSimulatorEmbree {
     void setModel(const MemoryView<PinholeModel, RAM>& model);
     void setModel(const PinholeModel& model);
 
+    inline Memory<PinholeModel, RAM> model() const
+    {
+      return m_model;
+    }
+
+    inline EmbreeMapPtr map() const 
+    {
+        return m_map;
+    }
+
     /**
      * @brief Simulate from one pose
      * 
@@ -141,35 +151,14 @@ class PinholeSimulatorEmbree {
         BundleT& ret) const;
 
     template<typename BundleT>
-    BundleT simulate(const MemoryView<Transform, RAM>& Tbm) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    void simulateRanges(
-        const MemoryView<Transform, RAM>& Tbm, 
-        MemoryView<float, RAM>& ranges) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    Memory<float, RAM> simulateRanges(
-        const MemoryView<Transform, RAM>& Tbm) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    void simulateHits(
-        const MemoryView<Transform, RAM>& Tbm, 
-        MemoryView<uint8_t, RAM>& hits) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    Memory<uint8_t, RAM> simulateHits(
-        const MemoryView<Transform, RAM>& Tbm) const;
+    void simulate(const MemoryView<const Transform, RAM>& Tbm,
+        BundleT& ret) const;
 
-    inline Memory<PinholeModel, RAM> model() const
-    {
-      return m_model;
-    }
+    template<typename BundleT>
+    BundleT simulate(const MemoryView<Transform, RAM>& Tbm) const;
 
-    inline EmbreeMapPtr map() const 
-    {
-        return m_map;
-    }
+    template<typename BundleT>
+    BundleT simulate(const MemoryView<const Transform, RAM>& Tbm) const;
 
 protected:
     EmbreeMapPtr m_map;
diff --git a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
index ce9b4af..857b080 100644
--- a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
@@ -18,8 +18,8 @@ void PinholeSimulatorEmbree::simulate(
     const Transform& Tbm,
     BundleT& ret) const
 {
-    MemoryView<Transform, RAM> Tbm_mem(&Tbm, 1);
     // TODO: change parallelization scheme for single simulations?
+    const MemoryView<const Transform, RAM> Tbm_mem(&Tbm, 1);
     simulate(Tbm_mem, ret);
 }
 
@@ -37,6 +37,15 @@ template<typename BundleT>
 void PinholeSimulatorEmbree::simulate(
     const MemoryView<Transform, RAM>& Tbm,
     BundleT& ret) const
+{
+    const MemoryView<const Transform, RAM> Tbm_const(Tbm.raw(), Tbm.size());
+    simulate(Tbm, ret);
+}
+
+template<typename BundleT>
+void PinholeSimulatorEmbree::simulate(
+    const MemoryView<const Transform, RAM>& Tbm,
+    BundleT& ret) const
 {
     SimulationFlags flags = SimulationFlags::Zero();
     set_simulation_flags_<RAM>(ret, flags);
diff --git a/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.hpp b/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.hpp
index 2ab9fae..2902561 100644
--- a/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.hpp
+++ b/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.hpp
@@ -116,6 +116,16 @@ class SphereSimulatorEmbree {
     void setModel(const MemoryView<SphericalModel, RAM>& model);
     void setModel(const SphericalModel& model);
 
+    inline Memory<SphericalModel, RAM> model() const
+    {
+      return m_model;
+    }
+
+    inline EmbreeMapPtr map() const 
+    {
+        return m_map;
+    }
+
     /**
      * @brief Simulate from one pose
      * 
@@ -141,36 +151,14 @@ class SphereSimulatorEmbree {
         BundleT& ret) const;
 
     template<typename BundleT>
-    BundleT simulate(const MemoryView<Transform, RAM>& Tbm) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    void simulateRanges(
-        const MemoryView<Transform, RAM>& Tbm, 
-        MemoryView<float, RAM>& ranges) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    Memory<float, RAM> simulateRanges(
-        const MemoryView<Transform, RAM>& Tbm) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    void simulateHits(
-        const MemoryView<Transform, RAM>& Tbm, 
-        MemoryView<uint8_t, RAM>& hits) const;
-
-    // [[deprecated("Use simulate<AttrT>() instead.")]]
-    Memory<uint8_t, RAM> simulateHits(
-        const MemoryView<Transform, RAM>& Tbm) const;
-    
+    void simulate(const MemoryView<const Transform, RAM>& Tbm,
+        BundleT& ret) const;
 
-    inline Memory<SphericalModel, RAM> model() const
-    {
-      return m_model;
-    }
+    template<typename BundleT>
+    BundleT simulate(const MemoryView<Transform, RAM>& Tbm) const;
 
-    inline EmbreeMapPtr map() const 
-    {
-        return m_map;
-    }
+    template<typename BundleT>
+    BundleT simulate(const MemoryView<const Transform, RAM>& Tbm) const;
 
 protected:
     EmbreeMapPtr m_map;
diff --git a/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc
index 160c635..683e44c 100644
--- a/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc
@@ -18,8 +18,8 @@ void SphereSimulatorEmbree::simulate(
     const Transform& Tbm,
     BundleT& ret) const
 {
-    MemoryView<Transform, RAM> Tbm_mem(&Tbm, 1);
     // TODO: change parallelization scheme for single simulations?
+    const MemoryView<const Transform, RAM> Tbm_mem(&Tbm, 1);
     simulate(Tbm_mem, ret);
 }
 
@@ -37,6 +37,15 @@ template<typename BundleT>
 void SphereSimulatorEmbree::simulate(
     const MemoryView<Transform, RAM>& Tbm,
     BundleT& ret) const
+{
+    const MemoryView<const Transform, RAM> Tbm_const(Tbm.raw(), Tbm.size());
+    simulate(Tbm, ret);
+}
+
+template<typename BundleT>
+void SphereSimulatorEmbree::simulate(
+    const MemoryView<const Transform, RAM>& Tbm,
+    BundleT& ret) const
 {
     SimulationFlags flags = SimulationFlags::Zero();
     set_simulation_flags_<RAM>(ret, flags);
@@ -242,4 +251,14 @@ BundleT SphereSimulatorEmbree::simulate(
     return res;
 }
 
+template<typename BundleT>
+BundleT SphereSimulatorEmbree::simulate(
+    const MemoryView<const Transform, RAM>& Tbm) const
+{
+    BundleT res;
+    resize_memory_bundle<RAM>(res, m_model->getWidth(), m_model->getHeight(), Tbm.size());
+    simulate(Tbm, res);
+    return res;
+}
+
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_embree/src/simulation/O1DnSimulatorEmbree.cpp b/src/rmagine_embree/src/simulation/O1DnSimulatorEmbree.cpp
index c99741c..92d5793 100644
--- a/src/rmagine_embree/src/simulation/O1DnSimulatorEmbree.cpp
+++ b/src/rmagine_embree/src/simulation/O1DnSimulatorEmbree.cpp
@@ -19,7 +19,7 @@ O1DnSimulatorEmbree::O1DnSimulatorEmbree(EmbreeMapPtr map)
 
 O1DnSimulatorEmbree::~O1DnSimulatorEmbree()
 {
-    // std::cout << "O1DnSimulatorEmbree - Destructor " << std::endl;
+    
 }
 
 void O1DnSimulatorEmbree::setMap(EmbreeMapPtr map)
@@ -50,134 +50,6 @@ void O1DnSimulatorEmbree::setModel(
     m_model->range = model->range;
     m_model->orig = model->orig;
     m_model->dirs = model->dirs;
-
-    // this is shallow copy becauce pointer is copied
-    // m_model = model;
-    // copy(model[0].dirs, m_model[0].dirs);
-    // std::cout << "Set Model" << std::endl;
-    // std::cout << model[0].dirs.raw() << " -> " << m_model[0].dirs.raw() << std::endl; 
-}
-
-void O1DnSimulatorEmbree::simulateRanges(
-    const MemoryView<Transform, RAM>& Tbm,
-    MemoryView<float, RAM>& ranges) const
-{
-    #pragma omp parallel for
-    for(size_t pid = 0; pid < Tbm.size(); pid++)
-    {
-        const Transform Tbm_ = Tbm[pid];
-        const Transform Tsm_ = Tbm_ * m_Tsb[0];
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
-        {
-            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
-            {
-                const unsigned int loc_id = m_model->getBufferId(vid, hid);
-                const unsigned int glob_id = glob_shift + loc_id;
-
-                const Vector ray_dir_s = m_model->getDirection(vid, hid);
-                const Vector ray_dir_m = Tsm_.R * ray_dir_s;
-
-                const Vector ray_orig_s = m_model->getOrigin(vid, hid);
-                const Vector ray_orig_m = Tsm_ * ray_orig_s;
-
-                // std::cout << ray_dir_s.x << " " << ray_dir_s.y << " " << ray_dir_s.z << std::endl;
-
-                RTCRayHit rayhit;
-                rayhit.ray.org_x = ray_orig_m.x;
-                rayhit.ray.org_y = ray_orig_m.y;
-                rayhit.ray.org_z = ray_orig_m.z;
-                rayhit.ray.dir_x = ray_dir_m.x;
-                rayhit.ray.dir_y = ray_dir_m.y;
-                rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
-                rayhit.ray.mask = -1;
-                rayhit.ray.flags = 0;
-                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-
-                rtcIntersect1(m_map->scene->handle(), &rayhit);
-                
-                if(rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID)
-                {
-                    ranges[glob_id] = rayhit.ray.tfar;
-                } else {
-                    ranges[glob_id] = m_model->range.max + 1.0;
-                }
-            }
-        }
-    }
-}
-
-Memory<float, RAM> O1DnSimulatorEmbree::simulateRanges(
-    const MemoryView<Transform, RAM>& Tbm) const
-{
-    Memory<float, RAM> res(m_model->size() * Tbm.size());
-    simulateRanges(Tbm, res);
-    return res;
-}
-
-void O1DnSimulatorEmbree::simulateHits(
-    const MemoryView<Transform, RAM>& Tbm, 
-    MemoryView<uint8_t, RAM>& hits) const
-{
-    #pragma omp parallel for
-    for(size_t pid = 0; pid < Tbm.size(); pid++)
-    {
-        const Transform Tbm_ = Tbm[pid];
-        const Transform Tsm_ = Tbm_ * m_Tsb[0];
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
-        {
-            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
-            {
-                const unsigned int loc_id = m_model->getBufferId(vid, hid);
-                const unsigned int glob_id = glob_shift + loc_id;
-
-                const Vector ray_dir_s = m_model->getDirection(vid, hid);
-                const Vector ray_dir_m = Tsm_.R * ray_dir_s;
-
-                const Vector ray_orig_s = m_model->getOrigin(vid, hid);
-                const Vector ray_orig_m = Tsm_ * ray_orig_s;
-
-                RTCRayHit rayhit;
-                rayhit.ray.org_x = ray_orig_m.x;
-                rayhit.ray.org_y = ray_orig_m.y;
-                rayhit.ray.org_z = ray_orig_m.z;
-                rayhit.ray.dir_x = ray_dir_m.x;
-                rayhit.ray.dir_y = ray_dir_m.y;
-                rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
-                rayhit.ray.mask = -1;
-                rayhit.ray.flags = 0;
-                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-
-                rtcIntersect1(m_map->scene->handle(), &rayhit);
-                
-                if(rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID)
-                {
-                    hits[glob_id] = 1;
-                } else {
-                    hits[glob_id] = 0;
-                }
-            }
-        }
-    }
-}
-
-Memory<uint8_t, RAM> O1DnSimulatorEmbree::simulateHits(
-    const MemoryView<Transform, RAM>& Tbm) const
-{
-    Memory<uint8_t, RAM> res(m_model->size() * Tbm.size());
-    simulateHits(Tbm, res);
-    return res;
 }
 
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_embree/src/simulation/OnDnSimulatorEmbree.cpp b/src/rmagine_embree/src/simulation/OnDnSimulatorEmbree.cpp
index 5e4ee7b..702831d 100644
--- a/src/rmagine_embree/src/simulation/OnDnSimulatorEmbree.cpp
+++ b/src/rmagine_embree/src/simulation/OnDnSimulatorEmbree.cpp
@@ -54,134 +54,6 @@ void OnDnSimulatorEmbree::setModel(
     m_model->range = model->range;
     m_model->origs = model->origs;
     m_model->dirs = model->dirs;
-
-    // this is shallow copy becauce pointer is copied
-    // m_model = model;
-    // copy(model[0].dirs, m_model[0].dirs);
-    // std::cout << "Set Model" << std::endl;
-    // std::cout << model[0].dirs.raw() << " -> " << m_model[0].dirs.raw() << std::endl; 
-}
-
-void OnDnSimulatorEmbree::simulateRanges(
-    const MemoryView<Transform, RAM>& Tbm,
-    MemoryView<float, RAM>& ranges) const
-{
-    #pragma omp parallel for
-    for(size_t pid = 0; pid < Tbm.size(); pid++)
-    {
-        const Transform Tbm_ = Tbm[pid];
-        const Transform Tsm_ = Tbm_ * m_Tsb[0];
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
-        {
-            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
-            {
-                const unsigned int loc_id = m_model->getBufferId(vid, hid);
-                const unsigned int glob_id = glob_shift + loc_id;
-
-                const Vector ray_dir_s = m_model->getDirection(vid, hid);
-                const Vector ray_dir_m = Tsm_.R * ray_dir_s;
-
-                const Vector ray_orig_s = m_model->getOrigin(vid, hid);
-                const Vector ray_orig_m = Tsm_ * ray_orig_s;
-
-                // std::cout << ray_dir_s.x << " " << ray_dir_s.y << " " << ray_dir_s.z << std::endl;
-
-                RTCRayHit rayhit;
-                rayhit.ray.org_x = ray_orig_m.x;
-                rayhit.ray.org_y = ray_orig_m.y;
-                rayhit.ray.org_z = ray_orig_m.z;
-                rayhit.ray.dir_x = ray_dir_m.x;
-                rayhit.ray.dir_y = ray_dir_m.y;
-                rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
-                rayhit.ray.mask = -1;
-                rayhit.ray.flags = 0;
-                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-
-                rtcIntersect1(m_map->scene->handle(), &rayhit);
-                
-                if(rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID)
-                {
-                    ranges[glob_id] = rayhit.ray.tfar;
-                } else {
-                    ranges[glob_id] = m_model->range.max + 1.0;
-                }
-            }
-        }
-    }
-}
-
-Memory<float, RAM> OnDnSimulatorEmbree::simulateRanges(
-    const MemoryView<Transform, RAM>& Tbm) const
-{
-    Memory<float, RAM> res(m_model->size() * Tbm.size());
-    simulateRanges(Tbm, res);
-    return res;
-}
-
-void OnDnSimulatorEmbree::simulateHits(
-    const MemoryView<Transform, RAM>& Tbm, 
-    MemoryView<uint8_t, RAM>& hits) const
-{
-    #pragma omp parallel for
-    for(size_t pid = 0; pid < Tbm.size(); pid++)
-    {
-        const Transform Tbm_ = Tbm[pid];
-        const Transform Tsm_ = Tbm_ * m_Tsb[0];
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
-        {
-            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
-            {
-                const unsigned int loc_id = m_model->getBufferId(vid, hid);
-                const unsigned int glob_id = glob_shift + loc_id;
-
-                const Vector ray_dir_s = m_model->getDirection(vid, hid);
-                const Vector ray_dir_m = Tsm_.R * ray_dir_s;
-
-                const Vector ray_orig_s = m_model->getOrigin(vid, hid);
-                const Vector ray_orig_m = Tsm_ * ray_orig_s;
-
-                RTCRayHit rayhit;
-                rayhit.ray.org_x = ray_orig_m.x;
-                rayhit.ray.org_y = ray_orig_m.y;
-                rayhit.ray.org_z = ray_orig_m.z;
-                rayhit.ray.dir_x = ray_dir_m.x;
-                rayhit.ray.dir_y = ray_dir_m.y;
-                rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
-                rayhit.ray.mask = -1;
-                rayhit.ray.flags = 0;
-                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-
-                rtcIntersect1(m_map->scene->handle(), &rayhit);
-                
-                if(rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID)
-                {
-                    hits[glob_id] = 1;
-                } else {
-                    hits[glob_id] = 0;
-                }
-            }
-        }
-    }
-}
-
-Memory<uint8_t, RAM> OnDnSimulatorEmbree::simulateHits(
-    const MemoryView<Transform, RAM>& Tbm) const
-{
-    Memory<uint8_t, RAM> res(m_model->size() * Tbm.size());
-    simulateHits(Tbm, res);
-    return res;
 }
 
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_embree/src/simulation/PinholeSimulatorEmbree.cpp b/src/rmagine_embree/src/simulation/PinholeSimulatorEmbree.cpp
index 48db4f4..c93b867 100644
--- a/src/rmagine_embree/src/simulation/PinholeSimulatorEmbree.cpp
+++ b/src/rmagine_embree/src/simulation/PinholeSimulatorEmbree.cpp
@@ -49,118 +49,4 @@ void PinholeSimulatorEmbree::setModel(const PinholeModel& model)
     m_model[0] = model;
 }
 
-void PinholeSimulatorEmbree::simulateRanges(
-    const MemoryView<Transform, RAM>& Tbm,
-    MemoryView<float, RAM>& ranges) const
-{
-    #pragma omp parallel for
-    for(size_t pid = 0; pid < Tbm.size(); pid++)
-    {
-        const Transform Tbm_ = Tbm[pid];
-        const Transform Tsm_ = Tbm_ * m_Tsb[0];
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
-        {
-            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
-            {
-                const unsigned int loc_id = m_model->getBufferId(vid, hid);
-                const unsigned int glob_id = glob_shift + loc_id;
-
-                const Vector ray_dir_s = m_model->getDirection(vid, hid);
-                const Vector ray_dir_m = Tsm_.R * ray_dir_s;
-
-                RTCRayHit rayhit;
-                rayhit.ray.org_x = Tsm_.t.x;
-                rayhit.ray.org_y = Tsm_.t.y;
-                rayhit.ray.org_z = Tsm_.t.z;
-                rayhit.ray.dir_x = ray_dir_m.x;
-                rayhit.ray.dir_y = ray_dir_m.y;
-                rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
-                rayhit.ray.mask = -1;
-                rayhit.ray.flags = 0;
-                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-
-                rtcIntersect1(m_map->scene->handle(), &rayhit);
-                
-                if(rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID)
-                {
-                    ranges[glob_id] = rayhit.ray.tfar;
-                } else {
-                    ranges[glob_id] = m_model->range.max + 1.0;
-                }
-            }
-        }
-    }
-}
-
-Memory<float, RAM> PinholeSimulatorEmbree::simulateRanges(
-    const MemoryView<Transform, RAM>& Tbm) const
-{
-    Memory<float, RAM> res(m_model->size() * Tbm.size());
-    simulateRanges(Tbm, res);
-    return res;
-}
-
-void PinholeSimulatorEmbree::simulateHits(
-    const MemoryView<Transform, RAM>& Tbm, 
-    MemoryView<uint8_t, RAM>& hits) const
-{
-    #pragma omp parallel for
-    for(size_t pid = 0; pid < Tbm.size(); pid++)
-    {
-        const Transform Tbm_ = Tbm[pid];
-        const Transform Tsm_ = Tbm_ * m_Tsb[0];
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
-        {
-            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
-            {
-                const unsigned int loc_id = m_model->getBufferId(vid, hid);
-                const unsigned int glob_id = glob_shift + loc_id;
-
-                const Vector ray_dir_s = m_model->getDirection(vid, hid);
-                const Vector ray_dir_m = Tsm_.R * ray_dir_s;
-
-                RTCRayHit rayhit;
-                rayhit.ray.org_x = Tsm_.t.x;
-                rayhit.ray.org_y = Tsm_.t.y;
-                rayhit.ray.org_z = Tsm_.t.z;
-                rayhit.ray.dir_x = ray_dir_m.x;
-                rayhit.ray.dir_y = ray_dir_m.y;
-                rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
-                rayhit.ray.mask = -1;
-                rayhit.ray.flags = 0;
-                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-
-                rtcIntersect1(m_map->scene->handle(), &rayhit);
-                
-                if(rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID)
-                {
-                    hits[glob_id] = 1;
-                } else {
-                    hits[glob_id] = 0;
-                }
-            }
-        }
-    }
-}
-
-Memory<uint8_t, RAM> PinholeSimulatorEmbree::simulateHits(
-    const MemoryView<Transform, RAM>& Tbm) const
-{
-    Memory<uint8_t, RAM> res(m_model->size() * Tbm.size());
-    simulateHits(Tbm, res);
-    return res;
-}
-
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_embree/src/simulation/SphereSimulatorEmbree.cpp b/src/rmagine_embree/src/simulation/SphereSimulatorEmbree.cpp
index eabbf70..a14bcf2 100644
--- a/src/rmagine_embree/src/simulation/SphereSimulatorEmbree.cpp
+++ b/src/rmagine_embree/src/simulation/SphereSimulatorEmbree.cpp
@@ -56,120 +56,4 @@ void SphereSimulatorEmbree::setModel(
     m_model[0] = model;
 }
 
-void SphereSimulatorEmbree::simulateRanges(
-    const MemoryView<Transform, RAM>& Tbm,
-    MemoryView<float, RAM>& ranges) const
-{
-    auto handle = m_map->scene->handle();
-
-    #pragma omp parallel for
-    for(size_t pid = 0; pid < Tbm.size(); pid++)
-    {
-        const Transform Tbm_ = Tbm[pid];
-        const Transform Tsm_ = Tbm_ * m_Tsb[0];
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
-        {
-            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
-            {
-                const unsigned int loc_id = m_model->getBufferId(vid, hid);
-                const unsigned int glob_id = glob_shift + loc_id;
-
-                const Vector ray_dir_s = m_model->getDirection(vid, hid);
-                const Vector ray_dir_m = Tsm_.R * ray_dir_s;
-
-                RTCRayHit rayhit;
-                rayhit.ray.org_x = Tsm_.t.x;
-                rayhit.ray.org_y = Tsm_.t.y;
-                rayhit.ray.org_z = Tsm_.t.z;
-                rayhit.ray.dir_x = ray_dir_m.x;
-                rayhit.ray.dir_y = ray_dir_m.y;
-                rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
-                rayhit.ray.mask = -1;
-                rayhit.ray.flags = 0;
-                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-
-                rtcIntersect1(m_map->scene->handle(), &rayhit);
-                
-                if(rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID)
-                {
-                    ranges[glob_id] = rayhit.ray.tfar;
-                } else {
-                    ranges[glob_id] = m_model->range.max + 1.0;
-                }
-            }
-        }
-    }
-}
-
-Memory<float, RAM> SphereSimulatorEmbree::simulateRanges(
-    const MemoryView<Transform, RAM>& Tbm) const
-{
-    Memory<float, RAM> res(m_model->phi.size * m_model->theta.size * Tbm.size());
-    simulateRanges(Tbm, res);
-    return res;
-}
-
-void SphereSimulatorEmbree::simulateHits(
-    const MemoryView<Transform, RAM>& Tbm, 
-    MemoryView<uint8_t, RAM>& hits) const
-{
-    #pragma omp parallel for
-    for(size_t pid = 0; pid < Tbm.size(); pid++)
-    {
-        const Transform Tbm_ = Tbm[pid];
-        const Transform Tsm_ = Tbm_ * m_Tsb[0];
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
-        {
-            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
-            {
-                const unsigned int loc_id = m_model->getBufferId(vid, hid);
-                const unsigned int glob_id = glob_shift + loc_id;
-
-                const Vector ray_dir_s = m_model->getDirection(vid, hid);
-                const Vector ray_dir_m = Tsm_.R * ray_dir_s;
-
-                RTCRayHit rayhit;
-                rayhit.ray.org_x = Tsm_.t.x;
-                rayhit.ray.org_y = Tsm_.t.y;
-                rayhit.ray.org_z = Tsm_.t.z;
-                rayhit.ray.dir_x = ray_dir_m.x;
-                rayhit.ray.dir_y = ray_dir_m.y;
-                rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = std::numeric_limits<float>::infinity();
-                rayhit.ray.mask = -1;
-                rayhit.ray.flags = 0;
-                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-
-                rtcIntersect1(m_map->scene->handle(), &rayhit);
-                
-                if(rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID)
-                {
-                    hits[glob_id] = 1;
-                } else {
-                    hits[glob_id] = 0;
-                }
-            }
-        }
-    }
-}
-
-Memory<uint8_t, RAM> SphereSimulatorEmbree::simulateHits(
-    const MemoryView<Transform, RAM>& Tbm) const
-{
-    Memory<uint8_t, RAM> res(m_model->phi.size * m_model->theta.size * Tbm.size());
-    simulateHits(Tbm, res);
-    return res;
-}
-
 } // namespace rmagine
\ No newline at end of file

From 6d3a9b018a6589024c2b749c7fc9ac0d1f7e0775 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 10:55:30 +0200
Subject: [PATCH 19/44] fix points

---
 .../include/rmagine/simulation/OnDnSimulatorEmbree.tcc        | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
index 7b88ee5..5b6dd51 100644
--- a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
@@ -63,7 +63,7 @@ void OnDnSimulatorEmbree::simulate(
         const Transform Tms_ = Tsm_.inv();
 
         const unsigned int glob_shift = pid * m_model->size();
-
+        
         for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
         {
             for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
@@ -120,7 +120,7 @@ void OnDnSimulatorEmbree::simulate(
                     {
                         if(flags.points)
                         {
-                            Vector pint = ray_dir_s * rayhit.ray.tfar;
+                            Vector pint = ray_dir_s * rayhit.ray.tfar + ray_orig_s;
                             ret.Points<RAM>::points[glob_id] = pint;
                         }
                     }

From 60f80e4682a2654d58e48ad9ac5366a74b3d845b Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 10:59:15 +0200
Subject: [PATCH 20/44] fixed O1Dn points

---
 .../include/rmagine/simulation/O1DnSimulatorEmbree.tcc          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
index 27a5cee..2e8468c 100644
--- a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
@@ -119,7 +119,7 @@ void O1DnSimulatorEmbree::simulate(
                     {
                         if(flags.points)
                         {
-                            Vector pint = ray_dir_s * rayhit.ray.tfar;
+                            Vector pint = ray_dir_s * rayhit.ray.tfar + ray_orig_s;
                             ret.Points<RAM>::points[glob_id] = pint;
                         }
                     }

From 35e9c280a717aacab51542f178c7562ab1306fa2 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 11:04:15 +0200
Subject: [PATCH 21/44] added comments

---
 .../include/rmagine/simulation/PinholeSimulatorEmbree.tcc      | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
index 857b080..7cba6a4 100644
--- a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
@@ -131,9 +131,10 @@ void PinholeSimulatorEmbree::simulate(
                                     rayhit.hit.Ng_y,
                                     rayhit.hit.Ng_z
                                 };
-                            
+                            // nint in map frame
                             nint.normalizeInplace();
                             nint = Tms_.R * nint;
+                            // nint in sensor frame
 
                             // flip?
                             if(ray_dir_s.dot(nint) > 0.0)

From f69045fce3b0ebf3dae2da782e8c8e03758722f3 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 11:21:03 +0200
Subject: [PATCH 22/44] introduced getPixelCoord to sensor models which is the
 inverse of getBufferId

---
 .../include/rmagine/math/types/definitions.h  | 15 ++---
 .../include/rmagine/types/sensor_models.h     | 65 ++++++++++++-------
 2 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/src/rmagine_core/include/rmagine/math/types/definitions.h b/src/rmagine_core/include/rmagine/math/types/definitions.h
index 39de285..15dcc17 100644
--- a/src/rmagine_core/include/rmagine/math/types/definitions.h
+++ b/src/rmagine_core/include/rmagine/math/types/definitions.h
@@ -41,6 +41,8 @@ struct AABB_;
 
 
 using Vector2f = Vector2_<float>;
+using Vector2u = Vector2_<uint32_t>;
+using Vector2i = Vector2_<int32_t>;
 using Vector3f = Vector3_<float>;
 using Matrix2x2f = Matrix_<float, 2, 2>;
 using Matrix3x3f = Matrix_<float, 3, 3>;
@@ -79,17 +81,12 @@ using AABB = AABB_<DefaultFloatType>;
 using Vector = Vector3;
 using Point = Vector;
 
+// @amock TODO: how to define a pixel? unsigned or signed?
+// - projection operations can result in negative pixels
+// using Pixel = Vector2u;
+// using Pixel = Vector2i;
 
 
-// struct Vector2;
-// struct Vector3;
-// struct EulerAngles;
-// struct Quaternion;
-// struct Transform;
-// struct Matrix3x3;
-// struct Matrix4x4;
-// struct AABB;
-
 } // namespace rmagine
 
 #endif // RMAGINE_MATH_DEFINITIONS_H
\ No newline at end of file
diff --git a/src/rmagine_core/include/rmagine/types/sensor_models.h b/src/rmagine_core/include/rmagine/types/sensor_models.h
index 4c69ee3..dbf190f 100644
--- a/src/rmagine_core/include/rmagine/types/sensor_models.h
+++ b/src/rmagine_core/include/rmagine/types/sensor_models.h
@@ -179,6 +179,12 @@ struct SphericalModel
     {
         return phi_id * theta.size + theta_id;
     }
+
+    RMAGINE_INLINE_FUNCTION
+    Vector2u getPixelCoord(uint32_t buffer_id)
+    {
+        return {buffer_id % theta.size, buffer_id / theta.size};
+    }
 };
 
 using LiDARModel = SphericalModel;
@@ -250,6 +256,11 @@ struct PinholeModel {
         return vid * width + hid;
     }
 
+    RMAGINE_INLINE_FUNCTION
+    Vector2u getPixelCoord(uint32_t buffer_id)
+    {
+        return {buffer_id % width, buffer_id / width};
+    }
 };
 // Distortion? Fisheye / radial-tangential ? 
 
@@ -257,20 +268,20 @@ using CameraModel = PinholeModel;
 using DepthCameraModel = PinholeModel;
 
 // TODO: distortion
-struct RadialTangentialDistortion {
-    // TODO
-};
+// struct RadialTangentialDistortion {
+//     // TODO
+// };
 
-struct FisheyeDistortion {
+// struct FisheyeDistortion {
 
-};
+// };
 
 
-struct CylindricModel {
-    static constexpr char name[] = "Cylinder";
-    // TODO
+// struct CylindricModel {
+//     static constexpr char name[] = "Cylinder";
+//     // TODO
     
-};
+// };
 
 template<typename MemT>
 struct O1DnModel_ {
@@ -303,12 +314,6 @@ struct O1DnModel_ {
         return getWidth() * getHeight();
     }
 
-    RMAGINE_INLINE_FUNCTION
-    uint32_t getBufferId(uint32_t vid, uint32_t hid) const 
-    {
-        return vid * getWidth() + hid;
-    }
-
     RMAGINE_INLINE_FUNCTION
     Vector getOrigin(uint32_t vid, uint32_t hid) const 
     {
@@ -320,6 +325,18 @@ struct O1DnModel_ {
     {
         return dirs[getBufferId(vid, hid)];
     }
+
+    RMAGINE_INLINE_FUNCTION
+    uint32_t getBufferId(uint32_t vid, uint32_t hid) const 
+    {
+        return vid * getWidth() + hid;
+    }
+
+    RMAGINE_INLINE_FUNCTION
+    Vector2u getPixelCoord(uint32_t buffer_id)
+    {
+        return {buffer_id % width, buffer_id / width};
+    }
 };
 
 using O1DnModel = O1DnModel_<RAM>;
@@ -356,13 +373,6 @@ struct OnDnModel_ {
         return getWidth() * getHeight();
     }
 
-
-    RMAGINE_INLINE_FUNCTION
-    uint32_t getBufferId(uint32_t vid, uint32_t hid) const 
-    {
-        return vid * getWidth() + hid;
-    }
-
     RMAGINE_INLINE_FUNCTION
     Vector getOrigin(uint32_t vid, uint32_t hid) const 
     {
@@ -375,6 +385,17 @@ struct OnDnModel_ {
         return dirs[getBufferId(vid, hid)];
     }
 
+    RMAGINE_INLINE_FUNCTION
+    uint32_t getBufferId(uint32_t vid, uint32_t hid) const 
+    {
+        return vid * getWidth() + hid;
+    }
+
+    RMAGINE_INLINE_FUNCTION
+    Vector2u getPixelCoord(uint32_t buffer_id)
+    {
+        return {buffer_id % width, buffer_id / width};
+    }
 };
 
 using OnDnModel = OnDnModel_<RAM>;

From d533ecd9899d857792992f7105d4e49c6149cb8a Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 11:35:28 +0200
Subject: [PATCH 23/44] added comments

---
 .../include/rmagine/simulation/SimulationResults.hpp  |  7 +++++++
 .../include/rmagine/types/sensor_models.h             | 11 +++++++++++
 2 files changed, 18 insertions(+)

diff --git a/src/rmagine_core/include/rmagine/simulation/SimulationResults.hpp b/src/rmagine_core/include/rmagine/simulation/SimulationResults.hpp
index 2072087..9942a1e 100644
--- a/src/rmagine_core/include/rmagine/simulation/SimulationResults.hpp
+++ b/src/rmagine_core/include/rmagine/simulation/SimulationResults.hpp
@@ -126,6 +126,13 @@ struct ObjectIds {
 };
 
 
+/**
+ * @brief Convenience object if we want to access all attributes at intersection
+ * 
+ * WARNING: use with care; It causes slower runtime in contrast to a more specific 
+ *          choice of attributes
+ * 
+ */
 template<typename MemT>
 using IntAttrAll = Bundle<
     Hits<MemT>,
diff --git a/src/rmagine_core/include/rmagine/types/sensor_models.h b/src/rmagine_core/include/rmagine/types/sensor_models.h
index dbf190f..042274b 100644
--- a/src/rmagine_core/include/rmagine/types/sensor_models.h
+++ b/src/rmagine_core/include/rmagine/types/sensor_models.h
@@ -396,11 +396,22 @@ struct OnDnModel_ {
     {
         return {buffer_id % width, buffer_id / width};
     }
+
+    // shall we implement buffer access here?
+    // template<typename DataT> 
+    // MemoryView<DataT> get(MemoryView<DataT>& data, uint32_t id)
+    // {
+    //     return data(id * size(), id * size() + size());
+    // }
 };
 
 using OnDnModel = OnDnModel_<RAM>;
 
 
+// template<typename ModelT, typename MemT>
+// MemoryView<MemT::DataT> slice(const ModelT& model, MemT )
+
+
 } // namespace rmagine
 
 #endif // RMAGINE_TYPES_SENSOR_MODELS_H
\ No newline at end of file

From 5e778be6fb318a34585991627c89d3bea949da30 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 14:12:03 +0200
Subject: [PATCH 24/44] removed old definitions file. added new buffer access
 functions to sensor models

---
 .../include/rmagine/math/definitions.h        |  30 ----
 .../include/rmagine/math/types/definitions.h  |   1 -
 .../include/rmagine/types/sensor_models.h     | 128 ++++++++++++++++--
 3 files changed, 115 insertions(+), 44 deletions(-)
 delete mode 100644 src/rmagine_core/include/rmagine/math/definitions.h

diff --git a/src/rmagine_core/include/rmagine/math/definitions.h b/src/rmagine_core/include/rmagine/math/definitions.h
deleted file mode 100644
index 6b585dc..0000000
--- a/src/rmagine_core/include/rmagine/math/definitions.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef RMAGINE_MATH_DEFINITIONS_OLD_H
-#define RMAGINE_MATH_DEFINITIONS_OLD_H
-
-#include <math.h>
-#include <float.h>
-#include <stdint.h>
-
-namespace rmagine
-{
-
-#define __UINT_MAX__ (__INT_MAX__ * 2U + 1U)
-
-#define DEG_TO_RAD      0.017453292519943295
-#define DEG_TO_RAD_F    0.017453292519943295f
-#define RAD_TO_DEG      57.29577951308232
-#define RAD_TO_DEG_F    57.29577951308232f
-
-// Forward declarations
-struct Vector2;
-struct Vector3;
-struct EulerAngles;
-struct Quaternion;
-struct Transform;
-struct Matrix3x3;
-struct Matrix4x4;
-struct AABB;
-
-} // namespace rmagine
-
-#endif // RMAGINE_MATH_DEFINITIONS_H
\ No newline at end of file
diff --git a/src/rmagine_core/include/rmagine/math/types/definitions.h b/src/rmagine_core/include/rmagine/math/types/definitions.h
index 15dcc17..3991316 100644
--- a/src/rmagine_core/include/rmagine/math/types/definitions.h
+++ b/src/rmagine_core/include/rmagine/math/types/definitions.h
@@ -17,7 +17,6 @@ namespace rmagine
 
 // Forward declarations
 
-
 template<typename DataT>
 struct Vector2_;
 
diff --git a/src/rmagine_core/include/rmagine/types/sensor_models.h b/src/rmagine_core/include/rmagine/types/sensor_models.h
index 042274b..d54a15b 100644
--- a/src/rmagine_core/include/rmagine/types/sensor_models.h
+++ b/src/rmagine_core/include/rmagine/types/sensor_models.h
@@ -148,6 +148,12 @@ struct SphericalModel
         return getWidth() * getHeight();
     }
 
+    RMAGINE_INLINE_FUNCTION
+    uint32_t getSize() const 
+    {
+        return getWidth() * getHeight();
+    }
+
     RMAGINE_INLINE_FUNCTION
     float getPhi(uint32_t phi_id) const
     {
@@ -181,10 +187,30 @@ struct SphericalModel
     }
 
     RMAGINE_INLINE_FUNCTION
-    Vector2u getPixelCoord(uint32_t buffer_id)
+    Vector2u getPixelCoord(uint32_t buffer_id) const
     {
         return {buffer_id % theta.size, buffer_id / theta.size};
     }
+
+    // slice horizontal line. vertical is not currently not possible because of memory layout
+    template<typename DataT, typename MemT>
+    MemoryView<DataT, MemT> getRow(const MemoryView<DataT, MemT>& mem, uint32_t vid) const 
+    {
+        return mem.slice(vid * getWidth(), (vid+1) * getWidth());
+    }
+
+    // for RAM we can access single elements of a buffer
+    template<typename DataT>
+    DataT& getPixelValue(MemoryView<DataT, RAM>& mem, uint32_t vid, uint32_t hid) const 
+    {
+        return mem[getBufferId(vid, hid)];
+    }
+
+    template<typename DataT>
+    DataT getPixelValue(const MemoryView<DataT, RAM>& mem, uint32_t vid, uint32_t hid) const 
+    {
+        return mem[getBufferId(vid, hid)];
+    }
 };
 
 using LiDARModel = SphericalModel;
@@ -221,6 +247,12 @@ struct PinholeModel {
         return getWidth() * getHeight();
     }
 
+    RMAGINE_INLINE_FUNCTION
+    uint32_t getSize() const 
+    {
+        return getWidth() * getHeight();
+    }
+
     RMAGINE_INLINE_FUNCTION
     Vector getDirectionOptical(uint32_t vid, uint32_t hid) const
     {
@@ -257,10 +289,30 @@ struct PinholeModel {
     }
 
     RMAGINE_INLINE_FUNCTION
-    Vector2u getPixelCoord(uint32_t buffer_id)
+    Vector2u getPixelCoord(uint32_t buffer_id) const
     {
         return {buffer_id % width, buffer_id / width};
     }
+
+    // slice horizontal line. vertical is not currently not possible because of memory layout
+    template<typename DataT, typename MemT>
+    MemoryView<DataT, MemT> getRow(const MemoryView<DataT, MemT>& mem, uint32_t vid) const 
+    {
+        return mem.slice(vid * getWidth(), (vid+1) * getWidth());
+    }
+
+    // for RAM we can access single elements of a buffer
+    template<typename DataT>
+    DataT& getPixelValue(MemoryView<DataT, RAM>& mem, uint32_t vid, uint32_t hid) const 
+    {
+        return mem[getBufferId(vid, hid)];
+    }
+
+    template<typename DataT>
+    DataT getPixelValue(const MemoryView<DataT, RAM>& mem, uint32_t vid, uint32_t hid) const 
+    {
+        return mem[getBufferId(vid, hid)];
+    }
 };
 // Distortion? Fisheye / radial-tangential ? 
 
@@ -308,6 +360,12 @@ struct O1DnModel_ {
         return height;
     }
 
+    RMAGINE_INLINE_FUNCTION
+    uint32_t getSize() const 
+    {
+        return getWidth() * getHeight();
+    }
+
     RMAGINE_INLINE_FUNCTION
     uint32_t size() const 
     {
@@ -333,10 +391,30 @@ struct O1DnModel_ {
     }
 
     RMAGINE_INLINE_FUNCTION
-    Vector2u getPixelCoord(uint32_t buffer_id)
+    Vector2u getPixelCoord(uint32_t buffer_id) const
     {
         return {buffer_id % width, buffer_id / width};
     }
+
+    // slice horizontal line. vertical is not currently not possible because of memory layout
+    template<typename DataT, typename MemT_>
+    MemoryView<DataT, MemT_> getRow(const MemoryView<DataT, MemT_>& mem, uint32_t vid) const 
+    {
+        return mem.slice(vid * getWidth(), (vid+1) * getWidth());
+    }
+
+    // for RAM we can access single elements of a buffer
+    template<typename DataT>
+    DataT& getPixelValue(MemoryView<DataT, RAM>& mem, uint32_t vid, uint32_t hid) const 
+    {
+        return mem[getBufferId(vid, hid)];
+    }
+
+    template<typename DataT>
+    DataT getPixelValue(const MemoryView<DataT, RAM>& mem, uint32_t vid, uint32_t hid) const 
+    {
+        return mem[getBufferId(vid, hid)];
+    }
 };
 
 using O1DnModel = O1DnModel_<RAM>;
@@ -367,6 +445,12 @@ struct OnDnModel_ {
         return height;
     }
 
+    RMAGINE_INLINE_FUNCTION
+    uint32_t getSize() const 
+    {
+        return getWidth() * getHeight();
+    }
+
     RMAGINE_INLINE_FUNCTION
     uint32_t size() const 
     {
@@ -392,25 +476,43 @@ struct OnDnModel_ {
     }
 
     RMAGINE_INLINE_FUNCTION
-    Vector2u getPixelCoord(uint32_t buffer_id)
+    Vector2u getPixelCoord(uint32_t buffer_id) const
     {
         return {buffer_id % width, buffer_id / width};
     }
 
-    // shall we implement buffer access here?
-    // template<typename DataT> 
-    // MemoryView<DataT> get(MemoryView<DataT>& data, uint32_t id)
-    // {
-    //     return data(id * size(), id * size() + size());
-    // }
+    // slice horizontal line. vertical is not currently not possible because of memory layout
+    template<typename DataT, typename MemT_>
+    MemoryView<DataT, MemT_> getRow(MemoryView<DataT, MemT_>& mem, uint32_t vid) const 
+    {
+        return mem.slice(vid * getWidth(), (vid+1) * getWidth());
+    }
+
+    // for CPU we can access single elements of a buffer
+    template<typename DataT>
+    DataT& getPixelValue(MemoryView<DataT, RAM>& mem, uint32_t vid, uint32_t hid) const 
+    {
+        return mem[getBufferId(vid, hid)];
+    }
+
+    template<typename DataT>
+    DataT getPixelValue(const MemoryView<DataT, RAM>& mem, uint32_t vid, uint32_t hid) const 
+    {
+        return mem[getBufferId(vid, hid)];
+    }
 };
 
 using OnDnModel = OnDnModel_<RAM>;
 
 
-// template<typename ModelT, typename MemT>
-// MemoryView<MemT::DataT> slice(const ModelT& model, MemT )
-
+template<typename ModelT, typename DataT, typename MemT>
+MemoryView<DataT, MemT> slice( 
+    const MemoryView<DataT, MemT>& mem,
+    const ModelT& model,
+    const uint32_t pose_id)
+{
+    return mem.slice(model.getSize() * pose_id, model.getSize() * (pose_id + 1));
+}
 
 } // namespace rmagine
 

From 7c0de91667e3edff125f34d97f9c9e32f1769e2e Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 15:33:55 +0200
Subject: [PATCH 25/44] fixed infinite recursion

---
 .../include/rmagine/simulation/O1DnSimulatorEmbree.tcc         | 2 +-
 .../include/rmagine/simulation/OnDnSimulatorEmbree.tcc         | 2 +-
 .../include/rmagine/simulation/PinholeSimulatorEmbree.tcc      | 2 +-
 .../include/rmagine/simulation/SphereSimulatorEmbree.tcc       | 3 ++-
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
index 2e8468c..63ccd22 100644
--- a/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/O1DnSimulatorEmbree.tcc
@@ -38,7 +38,7 @@ void O1DnSimulatorEmbree::simulate(
     BundleT& ret) const
 {
     const MemoryView<const Transform, RAM> Tbm_const(Tbm.raw(), Tbm.size());
-    simulate(Tbm, ret);
+    simulate(Tbm_const, ret);
 }
 
 template<typename BundleT>
diff --git a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
index 5b6dd51..9a875be 100644
--- a/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/OnDnSimulatorEmbree.tcc
@@ -39,7 +39,7 @@ void OnDnSimulatorEmbree::simulate(
     BundleT& ret) const
 {
     const MemoryView<const Transform, RAM> Tbm_const(Tbm.raw(), Tbm.size());
-    simulate(Tbm, ret);
+    simulate(Tbm_const, ret);
 }
 
 template<typename BundleT>
diff --git a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
index 7cba6a4..0e563b2 100644
--- a/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/PinholeSimulatorEmbree.tcc
@@ -39,7 +39,7 @@ void PinholeSimulatorEmbree::simulate(
     BundleT& ret) const
 {
     const MemoryView<const Transform, RAM> Tbm_const(Tbm.raw(), Tbm.size());
-    simulate(Tbm, ret);
+    simulate(Tbm_const, ret);
 }
 
 template<typename BundleT>
diff --git a/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc b/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc
index 683e44c..3511838 100644
--- a/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc
+++ b/src/rmagine_embree/include/rmagine/simulation/SphereSimulatorEmbree.tcc
@@ -39,7 +39,7 @@ void SphereSimulatorEmbree::simulate(
     BundleT& ret) const
 {
     const MemoryView<const Transform, RAM> Tbm_const(Tbm.raw(), Tbm.size());
-    simulate(Tbm, ret);
+    simulate(Tbm_const, ret);
 }
 
 template<typename BundleT>
@@ -52,6 +52,7 @@ void SphereSimulatorEmbree::simulate(
 
     const float range_min = m_model->range.min;
     const float range_max = m_model->range.max;
+    
 
     #pragma omp parallel for
     for(size_t pid = 0; pid < Tbm.size(); pid++)

From 889822d50e700ece8f2400719f5044539d742d6f Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 16:23:55 +0200
Subject: [PATCH 26/44] svd2 diminesions fixed. tests are running succesfully

---
 src/rmagine_core/CMakeLists.txt               |   1 +
 .../include/rmagine/math/SVD2.hpp             | 161 +++++
 .../include/rmagine/math/SVD2.tcc             | 581 ++++++++++++++++++
 .../include/rmagine/math/types/Vector3.hpp    |  25 +
 src/rmagine_core/src/math/SVD2.cpp            | 420 +++++++++++++
 tests/core/CMakeLists.txt                     |   8 +
 tests/core/math_svd.cpp                       | 226 +++++++
 7 files changed, 1422 insertions(+)
 create mode 100644 src/rmagine_core/include/rmagine/math/SVD2.hpp
 create mode 100644 src/rmagine_core/include/rmagine/math/SVD2.tcc
 create mode 100644 src/rmagine_core/src/math/SVD2.cpp
 create mode 100644 tests/core/math_svd.cpp

diff --git a/src/rmagine_core/CMakeLists.txt b/src/rmagine_core/CMakeLists.txt
index 9ab8b04..20db4f3 100644
--- a/src/rmagine_core/CMakeLists.txt
+++ b/src/rmagine_core/CMakeLists.txt
@@ -8,6 +8,7 @@ set(RMAGINE_CORE_SRCS
     src/math/math.cpp
     src/math/linalg.cpp
     src/math/SVD.cpp
+    src/math/SVD2.cpp
     # Types
     src/types/Memory.cpp
     src/types/conversions.cpp
diff --git a/src/rmagine_core/include/rmagine/math/SVD2.hpp b/src/rmagine_core/include/rmagine/math/SVD2.hpp
new file mode 100644
index 0000000..8773efc
--- /dev/null
+++ b/src/rmagine_core/include/rmagine/math/SVD2.hpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2022, University Osnabrück
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the University Osnabrück nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL University Osnabrück BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * 
+ * @brief SVD solver for CPU Memory
+ *
+ * @date 03.10.2022
+ * @author Alexander Mock
+ * 
+ * @copyright Copyright (c) 2022, University Osnabrück. All rights reserved.
+ * This project is released under the 3-Clause BSD License.
+ * 
+ */
+
+#ifndef RMAGINE_MATH_SVD2_HPP
+#define RMAGINE_MATH_SVD2_HPP
+
+#include <rmagine/types/Memory.hpp>
+#include <rmagine/math/types.h>
+#include <memory>
+
+namespace rmagine {
+
+
+template<class T>
+inline T SQR(const T a) {return a*a;}
+
+template<class T>
+inline const T &MAX(const T &a, const T &b)
+    {return b > a ? (b) : (a);}
+
+inline float MAX(const double &a, const float &b)
+    {return b > a ? (b) : float(a);}
+
+inline float MAX(const float &a, const double &b)
+    {return b > a ? float(b) : (a);}
+
+template<class T>
+inline const T &MIN(const T &a, const T &b)
+    {return b < a ? (b) : (a);}
+
+inline float MIN(const double &a, const float &b)
+    {return b < a ? (b) : float(a);}
+
+inline float MIN(const float &a, const double &b)
+    {return b < a ? float(b) : (a);}
+
+template<class T>
+inline T SIGN(const T &a, const T &b)
+    {return b >= 0 ? (a >= 0 ? a : -a) : (a >= 0 ? -a : a);}
+
+inline float SIGN(const float &a, const double &b)
+    {return b >= 0 ? (a >= 0 ? a : -a) : (a >= 0 ? -a : a);}
+
+inline float SIGN(const double &a, const float &b)
+    {return (float)(b >= 0 ? (a >= 0 ? a : -a) : (a >= 0 ? -a : a));}
+
+template<class T>
+inline void SWAP(T &a, T &b)
+    {T dum=a; a=b; b=dum;}
+
+template<typename T>
+inline T PYTHAG(const T a, const T b)
+{
+    T absa = abs(a);
+    T absb = abs(b);
+    return (absa > absb ? absa * sqrt(1.0+SQR(absb/absa)) :
+        (absb == 0.0 ? 0.0 : absb * sqrt(1.0+SQR(absa/absb))));
+}
+
+
+class SVD2
+{
+public:
+    SVD2();
+    ~SVD2();
+
+    void decompose(const Matrix3x3& a);
+
+    Matrix3x3 u,v;
+	Vector3 w;
+private:
+    
+    void reorder();
+    float pythag(const float a, const float b);
+
+    // int m, n;
+	float eps, tsh;
+};
+
+// Numerical Recipes
+// M = MatrixT::rows()
+// N = MatrixT::cols()
+template<typename MatrixT>
+struct svd_dims {
+    using U = MatrixT; // same as input
+    using w = Matrix_<typename MatrixT::Type, MatrixT::cols(), 1>;
+    using W = Matrix_<typename MatrixT::Type, MatrixT::cols(), MatrixT::cols()>;
+    using V = Matrix_<typename MatrixT::Type, MatrixT::cols(), MatrixT::cols()>;
+};
+
+// Wikipedia: M = USV*
+// - M: mxn
+// - U: mxm
+// - S: mxn
+// - V*: nxn - V: nxn
+
+/**
+ *
+ * @brief own SVD implementation 
+ *
+ */
+template<typename DataT, unsigned int Rows, unsigned int Cols>
+void svd(
+    const Matrix_<DataT, Rows, Cols>& A,
+    Matrix_<DataT, Rows, Cols>& U,
+    Matrix_<DataT, Cols, 1>& w, // vector version (Cols should be something with max)
+    Matrix_<DataT, Cols, Cols>& V
+);
+
+template<typename DataT, unsigned int Rows, unsigned int Cols>
+void svd(
+    const Matrix_<DataT, Rows, Cols>& A, 
+    Matrix_<DataT, Rows, Cols>& U,
+    Matrix_<DataT, Cols, Cols>& W, // matrix
+    Matrix_<DataT, Cols, Cols>& V
+);
+
+using SVD2Ptr = std::shared_ptr<SVD2>;
+
+} // namespace rmagine
+
+#include "SVD2.tcc"
+
+#endif // RMAGINE_MATH_SVD2_HPP
diff --git a/src/rmagine_core/include/rmagine/math/SVD2.tcc b/src/rmagine_core/include/rmagine/math/SVD2.tcc
new file mode 100644
index 0000000..af71130
--- /dev/null
+++ b/src/rmagine_core/include/rmagine/math/SVD2.tcc
@@ -0,0 +1,581 @@
+#include <rmagine/math/types.h>
+
+namespace rmagine
+{
+
+template<typename DataT, unsigned int Rows, unsigned int Cols>
+void svd(
+    const Matrix_<DataT, Rows, Cols>& a, 
+    Matrix_<DataT, Rows, Cols>& u,
+    Matrix_<DataT, Cols, Cols>& w,
+    Matrix_<DataT, Cols, Cols>& v)
+{
+    constexpr unsigned int m = Rows;
+    constexpr unsigned int n = Cols;
+
+    constexpr unsigned int max_iterations = 30;
+
+    // extra memory required
+    bool flag;
+    int i, its, j, jj, k, l, nm;
+    float anorm, c, f, g, h, s, scale, x, y, z;
+    DataT rv1[n];
+
+    g = scale = anorm = 0.0;
+    const float eps = std::numeric_limits<float>::epsilon();
+    u = a;
+
+    for(i=0; i < n; i++) 
+    {
+        l = i + 2;
+        rv1[i] = scale * g;
+        g = s = scale = 0.0;
+        if(i < m) 
+        {
+            for(k=i; k<m; k++) 
+            {
+                scale += abs(u(k,i));
+            }
+            if(scale != 0.0) 
+            {
+                for(k=i; k<m; k++) 
+                {
+                    u(k,i) /= scale;
+                    s += u(k,i) * u(k,i);
+                }
+                f = u(i,i);
+                g = -SIGN(sqrt(s),f);
+                h = f*g-s;
+                u(i,i) = f-g;
+                for(j=l-1;j<n;j++) 
+                {
+                    for (s=0.0,k=i;k<m;k++) 
+                    {
+                        s += u(k,i) * u(k,j);
+                    }
+                    f = s/h;
+                    for (k=i;k<m;k++)
+                    {
+                        u(k,j) += f * u(k,i);
+                    }
+                }
+                for (k=i;k<m;k++)
+                {
+                    u(k,i) *= scale;
+                }
+            }
+        }
+        w(i, i) = scale * g;
+        g = s = scale = 0.0;
+        if(i+1 <= m && i+1 != n)
+        {
+            for(k=l-1; k<n; k++)
+            {
+                scale += abs(u(i,k));
+            }
+            if(scale != 0.0)
+            {
+                for(k=l-1;k<n;k++) 
+                {
+                    u(i, k) /= scale;
+                    s += u(i,k) * u(i,k);
+                }
+                f = u(i, l-1);
+                g = -SIGN(sqrt(s),f);
+                h = f*g-s;
+                u(i,l-1) = f-g;
+                for(k=l-1;k<n;k++)
+                {
+                    rv1[k] = u(i,k) / h;
+                }
+                for(j=l-1; j<m; j++)
+                {
+                    for (s=0.0,k=l-1; k<n; k++)
+                    {
+                        s += u(j,k) * u(i,k);
+                    }
+                    for (k=l-1; k<n;k++)
+                    {
+                        u(j,k) += s * rv1[k];
+                    }
+                }
+                for(k=l-1; k<n; k++)
+                {
+                    u(i,k) *= scale;
+                }
+            }
+        }
+        anorm = MAX(anorm, (abs(w(i, i))+abs(rv1[i])));
+    }
+    for(i=n-1; i>=0; i--)
+    {
+        if(i < n-1) 
+        {
+            if(g != 0.0)
+            {
+                for(j=l; j<n; j++)
+                {
+                    v(j,i) = (u(i,j)/u(i,l)) / g;
+                }
+                for(j=l; j<n; j++)
+                {
+                    for (s=0.0,k=l;k<n;k++) 
+                    {
+                        s += u(i,k) * v(k,j);
+                    }
+                    for (k=l; k<n; k++)
+                    {
+                        v(k,j) += s * v(k,i);
+                    }
+                }
+            }
+            for (j=l; j<n; j++) 
+            {
+                v(i,j) = 0.0;
+                v(j,i) = 0.0;
+            }
+        }
+        v(i,i) = 1.0;
+        g = rv1[i];
+        l = i;
+    }
+    for(i=MIN(m,n)-1; i>=0; i--)
+    {
+        l = i+1;
+        g = w(i, i);
+        for(j=l;j<n;j++) 
+        {
+            u(i,j) = 0.0;
+        }
+        if(g != 0.0)
+        {
+            g = 1.0/g;
+            for(j=l; j<n; j++)
+            {
+                for (s=0.0,k=l; k<m; k++) 
+                {
+                    s += u(k,i)*u(k,j);
+                }
+                f = (s/u(i,i)) * g;
+                for (k=i; k<m; k++) 
+                {
+                    u(k,j) += f * u(k,i);
+                }
+            }
+            for(j=i;j<m;j++) 
+            {
+                u(j,i) *= g;
+            }
+        } else {
+            for(j=i;j<m;j++) 
+            {
+                u(j,i) = 0.0;
+            } 
+        }
+        ++u(i,i);
+    }
+    for(k=n-1; k>=0; k--) 
+    {
+        for(its=0; its<max_iterations; its++) 
+        {
+            flag=true;
+            for(l=k; l>=0; l--) 
+            {
+                nm=l-1;
+                if (l == 0 || abs(rv1[l]) <= eps*anorm) {
+                    flag=false;
+                    break;
+                }
+                if (abs(w(nm, nm)) <= eps*anorm) 
+                {
+                    break;
+                }
+            }
+            if(flag) 
+            {
+                c=0.0;
+                s=1.0;
+                for(i=l; i<k+1; i++) 
+                {
+                    f = s*rv1[i];
+                    rv1[i] = c*rv1[i];
+                    if(abs(f) <= eps*anorm) 
+                    {
+                        break;
+                    }
+                    g = w(i, i);
+                    h = PYTHAG(f,g);
+                    w(i, i) = h;
+                    h = 1.0/h;
+                    c = g*h;
+                    s = -f*h;
+                    for(j=0; j<m; j++)
+                    {
+                        y = u(j,nm);
+                        z = u(j,i);
+                        u(j,nm) = y*c+z*s;
+                        u(j,i) = z*c-y*s;
+                    }
+                }
+            }
+            z = w(k, k);
+            if (l == k)
+            {
+                if (z < 0.0)
+                {
+                    w(k,k) = -z;
+                    for (j=0;j<n;j++) 
+                    {
+                        v(j,k) = -v(j,k);
+                    }
+                }
+                break;
+            }
+            if (its == 29) 
+            {
+                throw std::runtime_error("no convergence in 30 svdcmp iterations");
+            }
+            x = w(l, l);
+            nm = k-1;
+            y = w(nm, nm);
+            g = rv1[nm];
+            h = rv1[k];
+            f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.0*h*y);
+            g = PYTHAG(f, (DataT)1.0);
+            f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+            c = s = 1.0;
+            for (j=l; j<=nm; j++) 
+            {
+                i = j+1;
+                g = rv1[i];
+                y = w(i, i);
+                h = s*g;
+                g = c*g;
+                z = PYTHAG(f,h);
+                rv1[j] = z;
+                c = f/z;
+                s = h/z;
+                f = x*c+g*s;
+                g = g*c-x*s;
+                h = y*s;
+                y *= c;
+                for (jj=0;jj<n;jj++)
+                {
+                    x = v(jj,j);
+                    z = v(jj,i);
+                    v(jj,j) = x*c+z*s;
+                    v(jj,i) = z*c-x*s;
+                }
+                z = PYTHAG(f,h);
+                w(j, j) = z;
+                if (z) 
+                {
+                    z = 1.0/z;
+                    c = f*z;
+                    s = h*z;
+                }
+                f = c*g+s*y;
+                x = c*y-s*g;
+                for (jj=0;jj<m;jj++)
+                {
+                    y = u(jj,j);
+                    z = u(jj,i);
+                    u(jj,j) = y*c+z*s;
+                    u(jj,i) = z*c-y*s;
+                }
+            }
+            rv1[l] = 0.0;
+            rv1[k] = f;
+            w(k, k) = x;
+        }
+    }
+}
+
+template<typename DataT, unsigned int Rows, unsigned int Cols>
+void svd(
+    const Matrix_<DataT, Rows, Cols>& a, 
+    Matrix_<DataT, Rows, Cols>& u,
+    Matrix_<DataT, Cols, 1>& w, // vector version
+    Matrix_<DataT, Cols, Cols>& v)
+{
+    constexpr unsigned int m = Rows;
+    constexpr unsigned int n = Cols;
+    constexpr unsigned int max_iterations = 30;
+    
+    // additional memory required
+    bool flag;
+    int i, its, j, jj, k, l, nm;
+    float anorm, c, f, g, h, s, scale, x, y, z;
+    DataT rv1[n];
+    
+    g = scale = anorm = 0.0;
+    float eps = std::numeric_limits<float>::epsilon();
+    u = a;
+
+    for(i=0; i < n; i++) 
+    {
+        l = i+2;
+        rv1[i] = scale*g;
+        g = s = scale = 0.0;
+        if(i < m) 
+        {
+            for(k=i; k<m; k++) 
+            {
+                scale += abs(u(k,i));
+            }
+            if(scale != 0.0) 
+            {
+                for(k=i; k<m; k++) 
+                {
+                    u(k,i) /= scale;
+                    s += u(k,i) * u(k,i);
+                }
+                f = u(i,i);
+                g = -SIGN(sqrt(s),f);
+                h = f*g-s;
+                u(i,i) = f-g;
+                for(j=l-1;j<n;j++) 
+                {
+                    for (s=0.0,k=i;k<m;k++) 
+                    {
+                        s += u(k,i) * u(k,j);
+                    }
+                    f = s/h;
+                    for (k=i;k<m;k++)
+                    {
+                        u(k,j) += f * u(k,i);
+                    }
+                }
+                for (k=i;k<m;k++)
+                {
+                    u(k,i) *= scale;
+                }
+            }
+        }
+        w(i, 0) = scale * g;
+        g = s = scale = 0.0;
+        if(i+1 <= m && i+1 != n)
+        {
+            for(k=l-1; k<n; k++)
+            {
+                scale += abs(u(i,k));
+            }
+            if(scale != 0.0)
+            {
+                for(k=l-1;k<n;k++) 
+                {
+                    u(i, k) /= scale;
+                    s += u(i,k) * u(i,k);
+                }
+                f = u(i, l-1);
+                g = -SIGN(sqrt(s),f);
+                h = f*g-s;
+                u(i,l-1) = f-g;
+                for(k=l-1;k<n;k++)
+                {
+                    rv1[k] = u(i,k) / h;
+                }
+                for(j=l-1; j<m; j++)
+                {
+                    for (s=0.0,k=l-1; k<n; k++)
+                    {
+                        s += u(j,k) * u(i,k);
+                    }
+                    for (k=l-1; k<n;k++)
+                    {
+                        u(j,k) += s * rv1[k];
+                    }
+                }
+                for(k=l-1; k<n; k++)
+                {
+                    u(i,k) *= scale;
+                }
+            }
+        }
+        anorm = MAX(anorm, (abs(w(i, 0))+abs(rv1[i])));
+    }
+    for(i=n-1; i>=0; i--)
+    {
+        if(i < n-1) 
+        {
+            if(g != 0.0)
+            {
+                for(j=l; j<n; j++)
+                {
+                    v(j,i) = (u(i,j)/u(i,l)) / g;
+                }
+                for(j=l; j<n; j++)
+                {
+                    for (s=0.0,k=l;k<n;k++) 
+                    {
+                        s += u(i,k) * v(k,j);
+                    }
+                    for (k=l; k<n; k++)
+                    {
+                        v(k,j) += s * v(k,i);
+                    }
+                }
+            }
+            for (j=l; j<n; j++) 
+            {
+                v(i,j) = 0.0;
+                v(j,i) = 0.0;
+            }
+        }
+        v(i,i) = 1.0;
+        g = rv1[i];
+        l = i;
+    }
+    for(i=MIN(m,n)-1; i>=0; i--)
+    {
+        l = i+1;
+        g = w(i, 0);
+        for(j=l;j<n;j++) 
+        {
+            u(i,j) = 0.0;
+        }
+        if(g != 0.0)
+        {
+            g = 1.0/g;
+            for(j=l; j<n; j++)
+            {
+                for (s=0.0,k=l; k<m; k++) 
+                {
+                    s += u(k,i)*u(k,j);
+                }
+                f = (s/u(i,i)) * g;
+                for (k=i; k<m; k++) 
+                {
+                    u(k,j) += f * u(k,i);
+                }
+            }
+            for(j=i;j<m;j++) 
+            {
+                u(j,i) *= g;
+            }
+        } else {
+            for(j=i;j<m;j++) 
+            {
+                u(j,i) = 0.0;
+            } 
+        }
+        ++u(i,i);
+    }
+    for(k=n-1; k>=0; k--) 
+    {
+        for(its=0; its<30; its++) 
+        {
+            flag=true;
+            for(l=k; l>=0; l--) 
+            {
+                nm=l-1;
+                if (l == 0 || abs(rv1[l]) <= eps*anorm) {
+                    flag=false;
+                    break;
+                }
+                if (abs(w(nm, 0)) <= eps*anorm) 
+                {
+                    break;
+                }
+            }
+            if(flag) 
+            {
+                c=0.0;
+                s=1.0;
+                for(i=l; i<k+1; i++) 
+                {
+                    f = s*rv1[i];
+                    rv1[i] = c*rv1[i];
+                    if(abs(f) <= eps*anorm) 
+                    {
+                        break;
+                    }
+                    g = w(i, 0);
+                    h = PYTHAG(f,g);
+                    w(i, 0) = h;
+                    h = 1.0/h;
+                    c = g*h;
+                    s = -f*h;
+                    for(j=0; j<m; j++)
+                    {
+                        y = u(j,nm);
+                        z = u(j,i);
+                        u(j,nm) = y*c+z*s;
+                        u(j,i) = z*c-y*s;
+                    }
+                }
+            }
+            z = w(k, 0);
+            if (l == k)
+            {
+                if (z < 0.0)
+                {
+                    w(k,0) = -z;
+                    for (j=0;j<n;j++) 
+                    {
+                        v(j,k) = -v(j,k);
+                    }
+                }
+                break;
+            }
+            if (its == 29) 
+            {
+                throw std::runtime_error("no convergence in 30 svdcmp iterations");
+            }
+            x = w(l, 0);
+            nm = k-1;
+            y = w(nm, 0);
+            g = rv1[nm];
+            h = rv1[k];
+            f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.0*h*y);
+            g = PYTHAG(f, (DataT)1.0);
+            f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+            c = s = 1.0;
+            for (j=l;j<=nm;j++) 
+            {
+                i = j+1;
+                g = rv1[i];
+                y = w(i, 0);
+                h = s*g;
+                g = c*g;
+                z = PYTHAG(f,h);
+                rv1[j] = z;
+                c = f/z;
+                s = h/z;
+                f = x*c+g*s;
+                g = g*c-x*s;
+                h = y*s;
+                y *= c;
+                for (jj=0;jj<n;jj++)
+                {
+                    x = v(jj,j);
+                    z = v(jj,i);
+                    v(jj,j) = x*c+z*s;
+                    v(jj,i) = z*c-x*s;
+                }
+                z = PYTHAG(f,h);
+                w(j, 0) = z;
+                if (z) 
+                {
+                    z = 1.0/z;
+                    c = f*z;
+                    s = h*z;
+                }
+                f = c*g+s*y;
+                x = c*y-s*g;
+                for (jj=0;jj<m;jj++)
+                {
+                    y = u(jj,j);
+                    z = u(jj,i);
+                    u(jj,j) = y*c+z*s;
+                    u(jj,i) = z*c-y*s;
+                }
+            }
+            rv1[l] = 0.0;
+            rv1[k] = f;
+            w(k, 0) = x;
+        }
+    }
+}
+
+} // namespace rmagine
diff --git a/src/rmagine_core/include/rmagine/math/types/Vector3.hpp b/src/rmagine_core/include/rmagine/math/types/Vector3.hpp
index e4a0b66..6a24951 100644
--- a/src/rmagine_core/include/rmagine/math/types/Vector3.hpp
+++ b/src/rmagine_core/include/rmagine/math/types/Vector3.hpp
@@ -245,6 +245,31 @@ struct Vector3_
     {   
         return multEwise(b);
     }
+
+    // Use with care!
+    RMAGINE_INLINE_FUNCTION
+    DataT& operator[](const size_t& idx)
+    {
+        return *(&x + idx);
+    }
+
+    RMAGINE_INLINE_FUNCTION
+    const DataT& operator[](const size_t& idx) const
+    {
+        return *(&x + idx);
+    }
+
+    RMAGINE_INLINE_FUNCTION
+    DataT& operator()(const size_t& idx)
+    {
+        return *(&x + idx);
+    }
+
+    RMAGINE_INLINE_FUNCTION
+    const DataT& operator()(const size_t& idx) const
+    {
+        return *(&x + idx);
+    }
 };
 
 } // namespace rmagine
diff --git a/src/rmagine_core/src/math/SVD2.cpp b/src/rmagine_core/src/math/SVD2.cpp
new file mode 100644
index 0000000..d804b57
--- /dev/null
+++ b/src/rmagine_core/src/math/SVD2.cpp
@@ -0,0 +1,420 @@
+#include "rmagine/math/SVD2.hpp"
+#include "rmagine/types/Memory.hpp"
+#include <assert.h>
+// #include <Eigen/Dense>
+
+namespace rmagine
+{
+
+SVD2::SVD2()
+{
+    // eps = std::numeric_limits<float>::epsilon();
+    // decompose();
+    // reorder();
+    // tsh = 0.5 * sqrt(m + n + 1.) * w(0) * eps;
+}
+
+SVD2::~SVD2()
+{
+    
+}
+
+void SVD2::decompose(
+    const Matrix3x3& a)
+{
+    bool flag;
+    int i, its, j, jj, k, l, nm;
+    float anorm,c,f,g,h,s,scale,x,y,z;
+    Vector3 rv1;
+    g = scale = anorm = 0.0;
+
+    const int m = 3;
+    const int n = 3;
+
+    eps = std::numeric_limits<float>::epsilon();
+    u = a;
+
+    for(i=0; i < n; i++) 
+    {
+        l = i+2;
+        rv1(i) = scale*g;
+        g = s = scale = 0.0;
+        if(i < m) 
+        {
+            for(k=i; k<m; k++) 
+            {
+                scale += abs(u(k,i));
+            }
+            if(scale != 0.0) 
+            {
+                for(k=i; k<m; k++) 
+                {
+                    u(k,i) /= scale;
+                    s += u(k,i) * u(k,i);
+                }
+                f = u(i,i);
+                g = -SIGN(sqrt(s),f);
+                h = f*g-s;
+                u(i,i) = f-g;
+                for(j=l-1;j<n;j++) 
+                {
+                    for (s=0.0,k=i;k<m;k++) 
+                    {
+                        s += u(k,i) * u(k,j);
+                    }
+                    f = s/h;
+                    for (k=i;k<m;k++)
+                    {
+                        u(k,j) += f * u(k,i);
+                    }
+                }
+                for (k=i;k<m;k++)
+                {
+                    u(k,i) *= scale;
+                }
+            }
+        }
+        w(i) = scale * g;
+        g = s = scale = 0.0;
+        if(i+1 <= m && i+1 != n)
+        {
+            for(k=l-1; k<n; k++) 
+            {
+                scale += abs(u(i,k));
+            }
+            if(scale != 0.0)
+            {
+                for(k=l-1;k<n;k++) 
+                {
+                    u(i, k) /= scale;
+                    s += u(i,k) * u(i,k);
+                }
+                f = u(i, l-1);
+                g = -SIGN(sqrt(s),f);
+                h = f*g-s;
+                u(i,l-1) = f-g;
+                for(k=l-1;k<n;k++)
+                {
+                    rv1(k) = u(i,k) / h;
+                }
+                for(j=l-1; j<m; j++)
+                {
+                    for (s=0.0,k=l-1; k<n; k++)
+                    {
+                        s += u(j,k) * u(i,k);
+                    }
+                    for (k=l-1; k<n;k++)
+                    {
+                        u(j,k) += s * rv1(k);
+                    }
+                }
+                for(k=l-1; k<n; k++)
+                {
+                    u(i,k) *= scale;
+                }
+            }
+        }
+        anorm = MAX(anorm, (abs(w(i))+abs(rv1(i))));
+    }
+    for(i=n-1; i>=0; i--)
+    {
+        if(i < n-1) 
+        {
+            if(g != 0.0)
+            {
+                for(j=l; j<n; j++)
+                {
+                    v(j,i) = (u(i,j)/u(i,l)) / g;
+                }
+                for(j=l; j<n; j++)
+                {
+                    for (s=0.0,k=l;k<n;k++) 
+                    {
+                        s += u(i,k) * v(k,j);
+                    }
+                    for (k=l; k<n; k++)
+                    {
+                        v(k,j) += s * v(k,i);
+                    }
+                }
+            }
+            for (j=l; j<n; j++) 
+            {
+                v(i,j) = 0.0;
+                v(j,i) = 0.0;
+            }
+        }
+        v(i,i) = 1.0;
+        g = rv1(i);
+        l = i;
+    }
+    for(i=MIN(m,n)-1; i>=0; i--)
+    {
+        l = i+1;
+        g = w(i);
+        for(j=l;j<n;j++) 
+        {
+            u(i,j) = 0.0;
+        }
+        if(g != 0.0) 
+        {
+            g = 1.0/g;
+            for(j=l;j<n;j++)
+            {
+                for (s=0.0,k=l; k<m; k++) 
+                {
+                    s += u(k,i)*u(k,j);
+                }
+                f = (s/u(i,i)) * g;
+                for (k=i; k<m; k++) 
+                {
+                    u(k,j) += f * u(k,i);
+                }
+            }
+            for(j=i;j<m;j++) 
+            {
+                u(j,i) *= g;
+            }
+        } else {
+            for(j=i;j<m;j++) 
+            {
+                u(j,i) = 0.0;
+            } 
+        }
+        ++u(i,i);
+    }
+    for(k=n-1; k>=0; k--) 
+    {
+        for(its=0; its<30; its++) 
+        {
+            flag=true;
+            for(l=k; l>=0; l--) 
+            {
+                nm=l-1;
+                if (l == 0 || abs(rv1(l)) <= eps*anorm) {
+                    flag=false;
+                    break;
+                }
+                if (abs(w(nm)) <= eps*anorm) 
+                {
+                    break;
+                }
+            }
+            if(flag) 
+            {
+                c=0.0;
+                s=1.0;
+                for(i=l; i<k+1; i++) 
+                {
+                    f = s*rv1(i);
+                    rv1(i) = c*rv1(i);
+                    if(abs(f) <= eps*anorm) 
+                    {
+                        break;
+                    }
+                    g = w(i);
+                    h = PYTHAG(f,g);
+                    w(i) = h;
+                    h = 1.0/h;
+                    c = g*h;
+                    s = -f*h;
+                    for(j=0; j<m; j++)
+                    {
+                        y = u(j,nm);
+                        z = u(j,i);
+                        u(j,nm) = y*c+z*s;
+                        u(j,i) = z*c-y*s;
+                    }
+                }
+            }
+            z = w(k);
+            if (l == k)
+            {
+                if (z < 0.0)
+                {
+                    w(k) = -z;
+                    for (j=0;j<n;j++) 
+                    {
+                        v(j,k) = -v(j,k);
+                    }
+                }
+                break;
+            }
+            if (its == 29) 
+            {
+                throw std::runtime_error("no convergence in 30 svdcmp iterations");
+            }
+            x = w(l);
+            nm = k-1;
+            y = w(nm);
+            g = rv1(nm);
+            h = rv1(k);
+            f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.0*h*y);
+            g = PYTHAG(f, 1.0f);
+            f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+            c = s = 1.0;
+            for (j=l;j<=nm;j++) 
+            {
+                i = j+1;
+                g = rv1(i);
+                y = w(i);
+                h = s*g;
+                g = c*g;
+                z = PYTHAG(f,h);
+                rv1(j) = z;
+                c = f/z;
+                s = h/z;
+                f = x*c+g*s;
+                g = g*c-x*s;
+                h = y*s;
+                y *= c;
+                for (jj=0;jj<n;jj++)
+                {
+                    x = v(jj,j);
+                    z = v(jj,i);
+                    v(jj,j) = x*c+z*s;
+                    v(jj,i) = z*c-x*s;
+                }
+                z = PYTHAG(f,h);
+                w(j) = z;
+                if (z) 
+                {
+                    z = 1.0/z;
+                    c = f*z;
+                    s = h*z;
+                }
+                f = c*g+s*y;
+                x = c*y-s*g;
+                for (jj=0;jj<m;jj++)
+                {
+                    y = u(jj,j);
+                    z = u(jj,i);
+                    u(jj,j) = y*c+z*s;
+                    u(jj,i) = z*c-y*s;
+                }
+            }
+            rv1(l) = 0.0;
+            rv1(k) = f;
+            w(k) = x;
+        }
+    }
+
+    reorder();
+    tsh = 0.5 * sqrt(m + n + 1.) * w(0) * eps;
+}
+
+void SVD2::reorder()
+{
+    int i,j,k,s, inc=1;
+    float sw;
+    Vector3 su, sv;
+
+    const int m = 3;
+    const int n = 3;
+
+    do 
+    { 
+        inc *= 3; 
+        inc++; 
+    } while (inc <= n);
+
+    do 
+    {
+        inc /= 3;
+        for (i=inc; i<n; i++)
+        {
+            sw = w(i);
+            for (k=0;k<m;k++) 
+            {
+                su(k) = u(k,i);
+            }
+            for (k=0;k<n;k++) 
+            {
+                sv(k) = v(k,i);
+            }
+            j = i;
+            while (w(j-inc) < sw) 
+            {
+                w(j) = w(j-inc);
+                for(k=0; k<m; k++) 
+                {
+                    u(k,j) = u(k,j-inc);
+                }
+                for (k=0;k<n;k++)
+                {
+                    v(k,j) = v(k,j-inc);
+                }
+                j -= inc;
+                if (j < inc)
+                {
+                    break;
+                }
+            }
+            w(j) = sw;
+            for (k=0; k<m; k++)
+            {
+                u(k,j) = su(k);
+            }
+            for (k=0; k<n; k++)
+            {
+                v(k,j) = sv(k);
+            }
+        }
+    } while (inc > 1);
+
+    for(k=0; k<n; k++) 
+    {
+        s=0;
+        for(i=0; i<m; i++) 
+        {
+            if (u(i,k) < 0.)
+            {
+                s++;
+            }
+        }
+        for(j=0; j<n; j++) 
+        {
+            if (v(j,k) < 0.) 
+            { 
+                s++;
+            }
+        }
+        if(s > (m+n)/2) 
+        {
+            for (i=0; i<m; i++) 
+            {
+                u(i,k) = -u(i,k);
+            }
+            for (j=0;j<n;j++) 
+            {
+                v(j,k) = -v(j,k);
+            }
+        }
+    }
+}
+
+// int SVD2::rank(float thresh)
+// {
+
+//     return 0;
+// }
+
+// int SVD2::nullity(float thresh)
+// {
+    
+//     return 0;
+// }
+
+// Matrix3x3 SVD2::range(float thresh)
+// {
+
+//     return Matrix3x3::Zeros();
+// }
+
+// Matrix3x3 SVD2::nullspace(float thresh)
+// {
+
+//     return Matrix3x3::Zeros();
+// }
+
+} // namespace rmagine
diff --git a/tests/core/CMakeLists.txt b/tests/core/CMakeLists.txt
index 2809af9..2977723 100644
--- a/tests/core/CMakeLists.txt
+++ b/tests/core/CMakeLists.txt
@@ -35,3 +35,11 @@ target_link_libraries(rmagine_tests_core_quaternion
 )
 
 add_test(NAME core_quaternion COMMAND rmagine_tests_core_quaternion)
+
+# 4. SVD
+add_executable(rmagine_tests_core_math_svd math_svd.cpp)
+target_link_libraries(rmagine_tests_core_math_svd
+    rmagine::core
+)
+
+add_test(NAME core_math_svd COMMAND rmagine_tests_core_math_svd)
diff --git a/tests/core/math_svd.cpp b/tests/core/math_svd.cpp
new file mode 100644
index 0000000..e32039c
--- /dev/null
+++ b/tests/core/math_svd.cpp
@@ -0,0 +1,226 @@
+#include <iostream>
+
+
+#include "rmagine/math/types.h"
+
+#include <rmagine/math/math.h>
+#include <rmagine/util/StopWatch.hpp>
+
+#include <rmagine/util/exceptions.h>
+#include <rmagine/util/prints.h>
+
+#include <rmagine/math/SVD2.hpp>
+
+#include <Eigen/Dense>
+
+#include <stdint.h>
+#include <string.h>
+
+namespace rm = rmagine;
+
+Eigen::Matrix3f& eigen_view(rm::Matrix3x3& M)
+{
+    return *reinterpret_cast<Eigen::Matrix3f*>( &M );
+}
+
+template<typename DataT, int Rows, int Cols>
+void testSVD(const Eigen::Matrix<DataT, Rows, Cols>& A)
+{
+    Eigen::JacobiSVD<Eigen::Matrix<DataT, Rows, Cols> > svd(A, Eigen::ComputeFullU | Eigen::ComputeFullV);
+}
+
+template<typename DataT, unsigned Rows, unsigned Cols>
+void testSVD(const rm::Matrix_<DataT, Rows, Cols>& A)
+{
+    using AMatT = rm::Matrix_<DataT, Rows, Cols>;
+    using UMatT = typename rm::svd_dims<AMatT>::U;
+    using WMatT = typename rm::svd_dims<AMatT>::W;
+    using VMatT = typename rm::svd_dims<AMatT>::V;
+
+
+
+    UMatT U = UMatT::Zeros();
+    WMatT W = WMatT::Zeros();
+    VMatT V = VMatT::Zeros();
+
+    rm::svd(A, U, W, V);
+}
+
+void svdTestWithPrints()
+{
+    rm::Matrix3x3 Arm;
+
+    Eigen::Matrix3f Aeig = Eigen::Matrix3f::Random(3, 3);
+    for(size_t i=0; i<3; i++)
+    {
+        for(size_t j=0; j<3; j++)
+        {
+            Arm(i, j) = Aeig(i, j);
+        }
+    }
+    
+
+    std::cout << "A: " << std::endl << Aeig << std::endl;
+
+    std::cout << std::endl;
+    std::cout << "Eigen: " << std::endl;
+    
+
+    Eigen::JacobiSVD<Eigen::Matrix3f> svdeig(Aeig, Eigen::ComputeFullU | Eigen::ComputeFullV);
+    // std::cout << "U: " << std::endl << svdeig.matrixU() << std::endl;
+    // std::cout << "V: " << std::endl << svdeig.matrixV() << std::endl;
+
+    std::cout << "U * V.T" << std::endl;
+    std::cout << svdeig.matrixU() * svdeig.matrixV().transpose() << std::endl;
+
+    // rm::SVD2 svdrm;
+    // svdrm.decompose(Arm);
+
+    std::cout << std::endl;
+    std::cout << "Rmagine: " << std::endl;
+    // std::cout  << "U: " << svdrm.u << std::endl;
+    // std::cout << "V: " << svdrm.v << std::endl;
+    // std::cout << "w: " << svdrm.w << std::endl;
+
+    std::cout << "U * V.T" << std::endl;
+    // std::cout << svdrm.u * svdrm.v.transpose() << std::endl;
+
+    // std::cout << svdrm.w << std::endl;
+
+    using AMat = rm::Matrix_<float, 3, 3>;
+    using UMat = typename rm::svd_dims<AMat>::U;
+    using WMat = typename rm::svd_dims<AMat>::W;
+    using VMat = typename rm::svd_dims<AMat>::V;
+
+    UMat Urm = UMat::Zeros();
+    WMat Wrm = WMat::Zeros();
+    VMat Vrm = VMat::Zeros();
+
+    rm::svd(Arm, Urm, Wrm, Vrm);
+
+    std::cout << Urm * Vrm.T() << std::endl;
+}
+
+template<int N, int M>
+void runtimeTest()
+{
+    rm::Matrix_<float, N, M> Arm;
+
+    Eigen::Matrix<float, N, M> Aeig = Eigen::Matrix<float, N, M>::Random(N, M);
+    for(size_t i=0; i<N; i++)
+    {
+        for(size_t j=0; j<M; j++)
+        {
+            Arm(i, j) = Aeig(i, j);
+        }
+    }
+    
+    rm::StopWatch sw;
+    double el_eig, el_rm;
+
+    sw();
+    testSVD(Aeig);
+    el_eig = sw();
+
+    sw();
+    testSVD(Arm);
+    el_rm = sw();
+
+    std::cout << "N = " << N << ", M = " << M << std::endl;
+    std::cout << "- eigen: " << el_eig*1000.0 << "ms" << std::endl;
+    std::cout << "- rmagine: " << el_rm*1000.0 << "ms" << std::endl;
+}
+
+
+template<int N, int M>
+void equalityTest()
+{
+    rm::Matrix_<float, N, M> Arm;
+
+    Eigen::Matrix<float, N, M> Aeig = Eigen::Matrix<float, N, M>::Random(N, M);
+    for(size_t i=0; i<N; i++)
+    {
+        for(size_t j=0; j<M; j++)
+        {
+            Arm(i, j) = Aeig(i, j);
+        }
+    }
+    
+    rm::StopWatch sw;
+    double el_eig, el_rm;
+
+    std::cout << "A: " << std::endl;
+    std::cout << Aeig << std::endl;
+
+    Eigen::JacobiSVD<Eigen::Matrix<float, N, M> > svdeig(Aeig, Eigen::ComputeFullU | Eigen::ComputeFullV);
+
+    auto s = svdeig.singularValues();
+
+    Eigen::Matrix<float, N, M> Seig = Eigen::Matrix<float, N, M>::Zero();
+    for(size_t i=0; i<s.rows(); i++)
+    {
+        Seig(i, i) = s(i);
+    }
+    // std::cout << "Seig: " << std::endl;
+    // std::cout << Seig << std::endl;
+
+    std::cout << "Eigen: " << std::endl;
+    auto uvt_eig = svdeig.matrixU() * Seig * svdeig.matrixV().transpose();
+
+    std::cout << uvt_eig << std::endl;
+
+    float error_eig = 0.0;
+    for(size_t i=0; i<N; i++)
+    {
+        for(size_t j=0; j<M; j++)
+        {
+            error_eig += abs(uvt_eig(i, j) - Aeig(i, j));
+        }
+    }
+
+    std::cout << "- error: " << error_eig << std::endl;
+
+    using AMat = rm::Matrix_<float, N, M>;
+    using UMat = typename rm::svd_dims<AMat>::U;
+    using WMat = typename rm::svd_dims<AMat>::W;
+    using VMat = typename rm::svd_dims<AMat>::V;
+
+    UMat Urm = UMat::Zeros();
+    WMat Wrm = WMat::Zeros();
+    VMat Vrm = VMat::Zeros();
+
+    rm::svd(Arm, Urm, Wrm, Vrm);
+
+    auto uvt_rm = Urm * Wrm * Vrm.T();
+
+    std::cout << "Rmagine: " << std::endl;
+    std::cout << uvt_rm << std::endl;
+
+    float error_rm = 0.0;
+    for(size_t i=0; i<N; i++)
+    {
+        for(size_t j=0; j<M; j++)
+        {
+            error_rm += abs(uvt_rm(i, j) - Arm(i, j));
+        }
+    }
+
+    std::cout << "- error: " << error_eig << std::endl;
+
+}
+
+int main(int argc, char** argv)
+{
+    srand((unsigned int) time(0));
+
+    
+    svdTestWithPrints();
+
+    runtimeTest<20, 30>();
+    equalityTest<5, 10>();
+
+
+
+    return 0;
+}
+

From beb9f521d3f74a16527e95c5b07c82281dd6a0ae Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 17:44:40 +0200
Subject: [PATCH 27/44] added test for parallel computation of SVD

---
 .../include/rmagine/math/SVD2.hpp             |  48 +-
 .../include/rmagine/math/SVD2.tcc             |   8 +-
 src/rmagine_core/src/math/SVD2.cpp            | 580 ++++++++++++++++++
 tests/core/math_svd.cpp                       | 151 ++++-
 4 files changed, 753 insertions(+), 34 deletions(-)

diff --git a/src/rmagine_core/include/rmagine/math/SVD2.hpp b/src/rmagine_core/include/rmagine/math/SVD2.hpp
index 8773efc..5080a0b 100644
--- a/src/rmagine_core/include/rmagine/math/SVD2.hpp
+++ b/src/rmagine_core/include/rmagine/math/SVD2.hpp
@@ -41,6 +41,7 @@
 #ifndef RMAGINE_MATH_SVD2_HPP
 #define RMAGINE_MATH_SVD2_HPP
 
+#include <rmagine/types/shared_functions.h>
 #include <rmagine/types/Memory.hpp>
 #include <rmagine/math/types.h>
 #include <memory>
@@ -94,7 +95,6 @@ inline T PYTHAG(const T a, const T b)
         (absb == 0.0 ? 0.0 : absb * sqrt(1.0+SQR(absa/absb))));
 }
 
-
 class SVD2
 {
 public:
@@ -114,9 +114,14 @@ class SVD2
 	float eps, tsh;
 };
 
+using SVD2Ptr = std::shared_ptr<SVD2>;
+
 // Numerical Recipes
 // M = MatrixT::rows()
 // N = MatrixT::cols()
+// 
+// Warning: Numerical Recipes has different SVD matrix shapes
+// than Wikipedia
 template<typename MatrixT>
 struct svd_dims {
     using U = MatrixT; // same as input
@@ -125,34 +130,49 @@ struct svd_dims {
     using V = Matrix_<typename MatrixT::Type, MatrixT::cols(), MatrixT::cols()>;
 };
 
-// Wikipedia: M = USV*
-// - M: mxn
-// - U: mxm
-// - S: mxn
-// - V*: nxn - V: nxn
-
 /**
- *
- * @brief own SVD implementation 
+ * @brief own SVD implementation. 
+ * Why use it? 
+ * - ~2x faster than Eigen
+ * - SOON: Works insided of CUDA kernels
  *
  */
 template<typename DataT, unsigned int Rows, unsigned int Cols>
 void svd(
-    const Matrix_<DataT, Rows, Cols>& A,
+    const Matrix_<DataT, Rows, Cols>& A, 
     Matrix_<DataT, Rows, Cols>& U,
-    Matrix_<DataT, Cols, 1>& w, // vector version (Cols should be something with max)
+    Matrix_<DataT, Cols, Cols>& W, // matrix
     Matrix_<DataT, Cols, Cols>& V
 );
 
 template<typename DataT, unsigned int Rows, unsigned int Cols>
 void svd(
-    const Matrix_<DataT, Rows, Cols>& A, 
+    const Matrix_<DataT, Rows, Cols>& A,
     Matrix_<DataT, Rows, Cols>& U,
-    Matrix_<DataT, Cols, Cols>& W, // matrix
+    Matrix_<DataT, Cols, 1>& w, // vector version (Cols should be something with max)
     Matrix_<DataT, Cols, Cols>& V
 );
 
-using SVD2Ptr = std::shared_ptr<SVD2>;
+/**
+ * @brief SVD that can be used for both CPU and GPU (Cuda kernels)
+ *
+ */
+// RMAGINE_FUNCTION
+void mysvd(
+    const Matrix3x3& A,
+    Matrix3x3& U,
+    Matrix3x3& W,
+    Matrix3x3& V
+);
+
+// RMAGINE_FUNCTION
+// void svd(
+//     const Matrix3x3& A, 
+//     Matrix3x3& U,
+//     Vector3& w,
+//     Matrix3x3& V
+// );
+
 
 } // namespace rmagine
 
diff --git a/src/rmagine_core/include/rmagine/math/SVD2.tcc b/src/rmagine_core/include/rmagine/math/SVD2.tcc
index af71130..b515e84 100644
--- a/src/rmagine_core/include/rmagine/math/SVD2.tcc
+++ b/src/rmagine_core/include/rmagine/math/SVD2.tcc
@@ -231,9 +231,9 @@ void svd(
                 }
                 break;
             }
-            if (its == 29) 
+            if (its == max_iterations - 1) 
             {
-                throw std::runtime_error("no convergence in 30 svdcmp iterations");
+                throw std::runtime_error("no convergence after max svdcmp iterations");
             }
             x = w(l, l);
             nm = k-1;
@@ -463,7 +463,7 @@ void svd(
     }
     for(k=n-1; k>=0; k--) 
     {
-        for(its=0; its<30; its++) 
+        for(its=0; its<max_iterations; its++) 
         {
             flag=true;
             for(l=k; l>=0; l--) 
@@ -518,7 +518,7 @@ void svd(
                 }
                 break;
             }
-            if (its == 29) 
+            if (its == max_iterations - 1) 
             {
                 throw std::runtime_error("no convergence in 30 svdcmp iterations");
             }
diff --git a/src/rmagine_core/src/math/SVD2.cpp b/src/rmagine_core/src/math/SVD2.cpp
index d804b57..0908702 100644
--- a/src/rmagine_core/src/math/SVD2.cpp
+++ b/src/rmagine_core/src/math/SVD2.cpp
@@ -417,4 +417,584 @@ void SVD2::reorder()
 //     return Matrix3x3::Zeros();
 // }
 
+// RMAGINE_FUNCTION
+void mysvd(
+    const Matrix3x3& a, 
+    Matrix3x3& u,
+    Matrix3x3& w,
+    Matrix3x3& v)
+{
+    
+    // TODO: test
+    constexpr unsigned int m = 3;
+    constexpr unsigned int n = 3;
+    constexpr unsigned int max_iterations = 30;
+    
+    // additional memory required
+    bool flag;
+    int i, its, j, jj, k, l, nm;
+    float anorm, c, f, g, h, s, scale, x, y, z;
+    
+    Vector3 rv1;
+    
+    g = scale = anorm = 0.0;
+    float eps = std::numeric_limits<float>::epsilon();
+    u = a;
+
+    for(i=0; i < n; i++) 
+    {
+        l = i+2;
+        rv1[i] = scale*g;
+        g = s = scale = 0.0;
+        if(i < m) 
+        {
+            for(k=i; k<m; k++) 
+            {
+                scale += abs(u(k,i));
+            }
+            if(scale != 0.0) 
+            {
+                for(k=i; k<m; k++) 
+                {
+                    u(k,i) /= scale;
+                    s += u(k,i) * u(k,i);
+                }
+                f = u(i,i);
+                g = -SIGN(sqrt(s),f);
+                h = f*g-s;
+                u(i,i) = f-g;
+                for(j=l-1;j<n;j++) 
+                {
+                    for (s=0.0,k=i;k<m;k++) 
+                    {
+                        s += u(k,i) * u(k,j);
+                    }
+                    f = s/h;
+                    for (k=i;k<m;k++)
+                    {
+                        u(k,j) += f * u(k,i);
+                    }
+                }
+                for (k=i;k<m;k++)
+                {
+                    u(k,i) *= scale;
+                }
+            }
+        }
+        w(i, i) = scale * g;
+        g = s = scale = 0.0;
+        if(i+1 <= m && i+1 != n)
+        {
+            for(k=l-1; k<n; k++)
+            {
+                scale += abs(u(i,k));
+            }
+            if(scale != 0.0)
+            {
+                for(k=l-1;k<n;k++) 
+                {
+                    u(i, k) /= scale;
+                    s += u(i,k) * u(i,k);
+                }
+                f = u(i, l-1);
+                g = -SIGN(sqrt(s),f);
+                h = f*g-s;
+                u(i,l-1) = f-g;
+                for(k=l-1;k<n;k++)
+                {
+                    rv1[k] = u(i,k) / h;
+                }
+                for(j=l-1; j<m; j++)
+                {
+                    for (s=0.0,k=l-1; k<n; k++)
+                    {
+                        s += u(j,k) * u(i,k);
+                    }
+                    for (k=l-1; k<n;k++)
+                    {
+                        u(j,k) += s * rv1[k];
+                    }
+                }
+                for(k=l-1; k<n; k++)
+                {
+                    u(i,k) *= scale;
+                }
+            }
+        }
+        anorm = MAX(anorm, (abs(w(i, i))+abs(rv1[i])));
+    }
+    for(i=n-1; i>=0; i--)
+    {
+        if(i < n-1) 
+        {
+            if(g != 0.0)
+            {
+                for(j=l; j<n; j++)
+                {
+                    v(j,i) = (u(i,j)/u(i,l)) / g;
+                }
+                for(j=l; j<n; j++)
+                {
+                    for (s=0.0,k=l;k<n;k++) 
+                    {
+                        s += u(i,k) * v(k,j);
+                    }
+                    for (k=l; k<n; k++)
+                    {
+                        v(k,j) += s * v(k,i);
+                    }
+                }
+            }
+            for (j=l; j<n; j++) 
+            {
+                v(i,j) = 0.0;
+                v(j,i) = 0.0;
+            }
+        }
+        v(i,i) = 1.0;
+        g = rv1[i];
+        l = i;
+    }
+    for(i=MIN(m,n)-1; i>=0; i--)
+    {
+        l = i+1;
+        g = w(i, i);
+        for(j=l;j<n;j++) 
+        {
+            u(i,j) = 0.0;
+        }
+        if(g != 0.0)
+        {
+            g = 1.0/g;
+            for(j=l; j<n; j++)
+            {
+                for (s=0.0,k=l; k<m; k++) 
+                {
+                    s += u(k,i)*u(k,j);
+                }
+                f = (s/u(i,i)) * g;
+                for (k=i; k<m; k++) 
+                {
+                    u(k,j) += f * u(k,i);
+                }
+            }
+            for(j=i;j<m;j++) 
+            {
+                u(j,i) *= g;
+            }
+        } else {
+            for(j=i;j<m;j++) 
+            {
+                u(j,i) = 0.0;
+            } 
+        }
+        ++u(i,i);
+    }
+    for(k=n-1; k>=0; k--) 
+    {
+        for(its=0; its<max_iterations; its++) 
+        {
+            flag=true;
+            for(l=k; l>=0; l--) 
+            {
+                nm=l-1;
+                if (l == 0 || abs(rv1[l]) <= eps*anorm) {
+                    flag=false;
+                    break;
+                }
+                if (abs(w(nm, nm)) <= eps*anorm) 
+                {
+                    break;
+                }
+            }
+            if(flag) 
+            {
+                c=0.0;
+                s=1.0;
+                for(i=l; i<k+1; i++) 
+                {
+                    f = s*rv1[i];
+                    rv1[i] = c*rv1[i];
+                    if(abs(f) <= eps*anorm) 
+                    {
+                        break;
+                    }
+                    g = w(i, i);
+                    h = PYTHAG(f,g);
+                    w(i, i) = h;
+                    h = 1.0/h;
+                    c = g*h;
+                    s = -f*h;
+                    for(j=0; j<m; j++)
+                    {
+                        y = u(j,nm);
+                        z = u(j,i);
+                        u(j,nm) = y*c+z*s;
+                        u(j,i) = z*c-y*s;
+                    }
+                }
+            }
+            z = w(k, k);
+            if (l == k)
+            {
+                if (z < 0.0)
+                {
+                    w(k, k) = -z;
+                    for (j=0;j<n;j++) 
+                    {
+                        v(j,k) = -v(j,k);
+                    }
+                }
+                break;
+            }
+            if (its == max_iterations - 1) 
+            {
+                throw std::runtime_error("no convergence in 30 svdcmp iterations");
+            }
+            x = w(l, l);
+            nm = k-1;
+            y = w(nm, nm);
+            g = rv1[nm];
+            h = rv1[k];
+            f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+            g = PYTHAG(f, 1.f);
+            f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+            c = s = 1.f;
+            for (j=l;j<=nm;j++) 
+            {
+                i = j+1;
+                g = rv1[i];
+                y = w(i, i);
+                h = s*g;
+                g = c*g;
+                z = PYTHAG(f,h);
+                rv1[j] = z;
+                c = f/z;
+                s = h/z;
+                f = x*c+g*s;
+                g = g*c-x*s;
+                h = y*s;
+                y *= c;
+                for (jj=0;jj<n;jj++)
+                {
+                    x = v(jj,j);
+                    z = v(jj,i);
+                    v(jj,j) = x*c+z*s;
+                    v(jj,i) = z*c-x*s;
+                }
+                z = PYTHAG(f,h);
+                w(j, j) = z;
+                if (z) 
+                {
+                    z = 1.f/z;
+                    c = f*z;
+                    s = h*z;
+                }
+                f = c*g+s*y;
+                x = c*y-s*g;
+                for (jj=0;jj<m;jj++)
+                {
+                    y = u(jj,j);
+                    z = u(jj,i);
+                    u(jj,j) = y*c+z*s;
+                    u(jj,i) = z*c-y*s;
+                }
+            }
+            rv1[l] = 0.f;
+            rv1[k] = f;
+            w(k, k) = x;
+        }
+    }
+}
+
+
+// RMAGINE_FUNCTION
+// void svd(
+//     const Matrix3x3& a, 
+//     Matrix3x3& u,
+//     Vector3& w, // matrix
+//     Matrix3x3& v)
+// {
+//     // TODO: test
+//     constexpr unsigned int m = 3;
+//     constexpr unsigned int n = 3;
+//     constexpr unsigned int max_iterations = 30;
+    
+//     // additional memory required
+//     bool flag;
+//     int i, its, j, jj, k, l, nm;
+//     float anorm, c, f, g, h, s, scale, x, y, z;
+    
+//     Vector3 rv1;
+    
+//     g = scale = anorm = 0.0;
+//     float eps = std::numeric_limits<float>::epsilon();
+//     u = a;
+
+//     for(i=0; i < n; i++) 
+//     {
+//         l = i+2;
+//         rv1[i] = scale*g;
+//         g = s = scale = 0.0;
+//         if(i < m) 
+//         {
+//             for(k=i; k<m; k++) 
+//             {
+//                 scale += abs(u(k,i));
+//             }
+//             if(scale != 0.0) 
+//             {
+//                 for(k=i; k<m; k++) 
+//                 {
+//                     u(k,i) /= scale;
+//                     s += u(k,i) * u(k,i);
+//                 }
+//                 f = u(i,i);
+//                 g = -SIGN(sqrt(s),f);
+//                 h = f*g-s;
+//                 u(i,i) = f-g;
+//                 for(j=l-1;j<n;j++) 
+//                 {
+//                     for (s=0.0,k=i;k<m;k++) 
+//                     {
+//                         s += u(k,i) * u(k,j);
+//                     }
+//                     f = s/h;
+//                     for (k=i;k<m;k++)
+//                     {
+//                         u(k,j) += f * u(k,i);
+//                     }
+//                 }
+//                 for (k=i;k<m;k++)
+//                 {
+//                     u(k,i) *= scale;
+//                 }
+//             }
+//         }
+//         w(i) = scale * g;
+//         g = s = scale = 0.0;
+//         if(i+1 <= m && i+1 != n)
+//         {
+//             for(k=l-1; k<n; k++)
+//             {
+//                 scale += abs(u(i,k));
+//             }
+//             if(scale != 0.0)
+//             {
+//                 for(k=l-1;k<n;k++) 
+//                 {
+//                     u(i, k) /= scale;
+//                     s += u(i,k) * u(i,k);
+//                 }
+//                 f = u(i, l-1);
+//                 g = -SIGN(sqrt(s),f);
+//                 h = f*g-s;
+//                 u(i,l-1) = f-g;
+//                 for(k=l-1;k<n;k++)
+//                 {
+//                     rv1[k] = u(i,k) / h;
+//                 }
+//                 for(j=l-1; j<m; j++)
+//                 {
+//                     for (s=0.0,k=l-1; k<n; k++)
+//                     {
+//                         s += u(j,k) * u(i,k);
+//                     }
+//                     for (k=l-1; k<n;k++)
+//                     {
+//                         u(j,k) += s * rv1[k];
+//                     }
+//                 }
+//                 for(k=l-1; k<n; k++)
+//                 {
+//                     u(i,k) *= scale;
+//                 }
+//             }
+//         }
+//         anorm = MAX(anorm, (abs(w(i))+abs(rv1[i])));
+//     }
+//     for(i=n-1; i>=0; i--)
+//     {
+//         if(i < n-1) 
+//         {
+//             if(g != 0.0)
+//             {
+//                 for(j=l; j<n; j++)
+//                 {
+//                     v(j,i) = (u(i,j)/u(i,l)) / g;
+//                 }
+//                 for(j=l; j<n; j++)
+//                 {
+//                     for (s=0.0,k=l;k<n;k++) 
+//                     {
+//                         s += u(i,k) * v(k,j);
+//                     }
+//                     for (k=l; k<n; k++)
+//                     {
+//                         v(k,j) += s * v(k,i);
+//                     }
+//                 }
+//             }
+//             for (j=l; j<n; j++) 
+//             {
+//                 v(i,j) = 0.0;
+//                 v(j,i) = 0.0;
+//             }
+//         }
+//         v(i,i) = 1.0;
+//         g = rv1[i];
+//         l = i;
+//     }
+//     for(i=MIN(m,n)-1; i>=0; i--)
+//     {
+//         l = i+1;
+//         g = w(i);
+//         for(j=l;j<n;j++) 
+//         {
+//             u(i,j) = 0.0;
+//         }
+//         if(g != 0.0)
+//         {
+//             g = 1.0/g;
+//             for(j=l; j<n; j++)
+//             {
+//                 for (s=0.0,k=l; k<m; k++) 
+//                 {
+//                     s += u(k,i)*u(k,j);
+//                 }
+//                 f = (s/u(i,i)) * g;
+//                 for (k=i; k<m; k++) 
+//                 {
+//                     u(k,j) += f * u(k,i);
+//                 }
+//             }
+//             for(j=i;j<m;j++) 
+//             {
+//                 u(j,i) *= g;
+//             }
+//         } else {
+//             for(j=i;j<m;j++) 
+//             {
+//                 u(j,i) = 0.0;
+//             } 
+//         }
+//         ++u(i,i);
+//     }
+//     for(k=n-1; k>=0; k--) 
+//     {
+//         for(its=0; its<max_iterations; its++) 
+//         {
+//             flag=true;
+//             for(l=k; l>=0; l--) 
+//             {
+//                 nm=l-1;
+//                 if (l == 0 || abs(rv1[l]) <= eps*anorm) {
+//                     flag=false;
+//                     break;
+//                 }
+//                 if (abs(w(nm)) <= eps*anorm) 
+//                 {
+//                     break;
+//                 }
+//             }
+//             if(flag) 
+//             {
+//                 c=0.0;
+//                 s=1.0;
+//                 for(i=l; i<k+1; i++) 
+//                 {
+//                     f = s*rv1[i];
+//                     rv1[i] = c*rv1[i];
+//                     if(abs(f) <= eps*anorm) 
+//                     {
+//                         break;
+//                     }
+//                     g = w(i);
+//                     h = PYTHAG(f,g);
+//                     w(i) = h;
+//                     h = 1.0/h;
+//                     c = g*h;
+//                     s = -f*h;
+//                     for(j=0; j<m; j++)
+//                     {
+//                         y = u(j,nm);
+//                         z = u(j,i);
+//                         u(j,nm) = y*c+z*s;
+//                         u(j,i) = z*c-y*s;
+//                     }
+//                 }
+//             }
+//             z = w(k);
+//             if (l == k)
+//             {
+//                 if (z < 0.0)
+//                 {
+//                     w(k) = -z;
+//                     for (j=0;j<n;j++) 
+//                     {
+//                         v(j,k) = -v(j,k);
+//                     }
+//                 }
+//                 break;
+//             }
+//             if (its == max_iterations - 1) 
+//             {
+//                 throw std::runtime_error("no convergence in 30 svdcmp iterations");
+//             }
+//             x = w(l);
+//             nm = k-1;
+//             y = w(nm);
+//             g = rv1[nm];
+//             h = rv1[k];
+//             f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+//             g = PYTHAG(f, 1.f);
+//             f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+//             c = s = 1.f;
+//             for (j=l;j<=nm;j++) 
+//             {
+//                 i = j+1;
+//                 g = rv1[i];
+//                 y = w(i);
+//                 h = s*g;
+//                 g = c*g;
+//                 z = PYTHAG(f,h);
+//                 rv1[j] = z;
+//                 c = f/z;
+//                 s = h/z;
+//                 f = x*c+g*s;
+//                 g = g*c-x*s;
+//                 h = y*s;
+//                 y *= c;
+//                 for (jj=0;jj<n;jj++)
+//                 {
+//                     x = v(jj,j);
+//                     z = v(jj,i);
+//                     v(jj,j) = x*c+z*s;
+//                     v(jj,i) = z*c-x*s;
+//                 }
+//                 z = PYTHAG(f,h);
+//                 w(j) = z;
+//                 if (z) 
+//                 {
+//                     z = 1.f/z;
+//                     c = f*z;
+//                     s = h*z;
+//                 }
+//                 f = c*g+s*y;
+//                 x = c*y-s*g;
+//                 for (jj=0;jj<m;jj++)
+//                 {
+//                     y = u(jj,j);
+//                     z = u(jj,i);
+//                     u(jj,j) = y*c+z*s;
+//                     u(jj,i) = z*c-y*s;
+//                 }
+//             }
+//             rv1[l] = 0.f;
+//             rv1[k] = f;
+//             w(k) = x;
+//         }
+//     }
+// }
+
 } // namespace rmagine
diff --git a/tests/core/math_svd.cpp b/tests/core/math_svd.cpp
index e32039c..fffb58f 100644
--- a/tests/core/math_svd.cpp
+++ b/tests/core/math_svd.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 
 
-#include "rmagine/math/types.h"
+#include <rmagine/math/types.h>
 
 #include <rmagine/math/math.h>
 #include <rmagine/util/StopWatch.hpp>
@@ -18,6 +18,33 @@
 
 namespace rm = rmagine;
 
+
+float compute_error(Eigen::Matrix3f gt, Eigen::Matrix3f m)
+{
+    float ret = 0.0;
+    for(size_t i=0; i<3; i++)
+    {
+        for(size_t j=0; j<3; j++)
+        {
+            ret += abs(gt(i, j) - m(i, j));
+        }
+    }
+    return ret;
+}
+
+float compute_error(rm::Matrix3x3 gt, rm::Matrix3x3 m)
+{
+    float ret = 0.0;
+    for(size_t i=0; i<3; i++)
+    {
+        for(size_t j=0; j<3; j++)
+        {
+            ret += abs(gt(i, j) - m(i, j));
+        }
+    }
+    return ret;
+}
+
 Eigen::Matrix3f& eigen_view(rm::Matrix3x3& M)
 {
     return *reinterpret_cast<Eigen::Matrix3f*>( &M );
@@ -37,8 +64,6 @@ void testSVD(const rm::Matrix_<DataT, Rows, Cols>& A)
     using WMatT = typename rm::svd_dims<AMatT>::W;
     using VMatT = typename rm::svd_dims<AMatT>::V;
 
-
-
     UMatT U = UMatT::Zeros();
     WMatT W = WMatT::Zeros();
     VMatT V = VMatT::Zeros();
@@ -133,8 +158,9 @@ void runtimeTest()
 
 
 template<int N, int M>
-void equalityTest()
+void accuracyTest()
 {
+    std::cout << "Accuracy Test" << std::endl;
     rm::Matrix_<float, N, M> Arm;
 
     Eigen::Matrix<float, N, M> Aeig = Eigen::Matrix<float, N, M>::Random(N, M);
@@ -149,8 +175,8 @@ void equalityTest()
     rm::StopWatch sw;
     double el_eig, el_rm;
 
-    std::cout << "A: " << std::endl;
-    std::cout << Aeig << std::endl;
+    // std::cout << "A: " << std::endl;
+    // std::cout << Aeig << std::endl;
 
     Eigen::JacobiSVD<Eigen::Matrix<float, N, M> > svdeig(Aeig, Eigen::ComputeFullU | Eigen::ComputeFullV);
 
@@ -164,10 +190,11 @@ void equalityTest()
     // std::cout << "Seig: " << std::endl;
     // std::cout << Seig << std::endl;
 
-    std::cout << "Eigen: " << std::endl;
+
+    // std::cout << "Eigen: " << std::endl;
     auto uvt_eig = svdeig.matrixU() * Seig * svdeig.matrixV().transpose();
 
-    std::cout << uvt_eig << std::endl;
+    // std::cout << uvt_eig << std::endl;
 
     float error_eig = 0.0;
     for(size_t i=0; i<N; i++)
@@ -178,7 +205,7 @@ void equalityTest()
         }
     }
 
-    std::cout << "- error: " << error_eig << std::endl;
+    std::cout << "- Eigen JacobiSVD error: " << error_eig << std::endl;
 
     using AMat = rm::Matrix_<float, N, M>;
     using UMat = typename rm::svd_dims<AMat>::U;
@@ -193,8 +220,8 @@ void equalityTest()
 
     auto uvt_rm = Urm * Wrm * Vrm.T();
 
-    std::cout << "Rmagine: " << std::endl;
-    std::cout << uvt_rm << std::endl;
+    // std::cout << "Rmagine: " << std::endl;
+    // std::cout << uvt_rm << std::endl;
 
     float error_rm = 0.0;
     for(size_t i=0; i<N; i++)
@@ -205,7 +232,99 @@ void equalityTest()
         }
     }
 
-    std::cout << "- error: " << error_eig << std::endl;
+    std::cout << "- Rmagine error: " << error_eig << std::endl;
+}
+
+
+void parallelTest()
+{
+    std::cout << "parallelTest" << std::endl;
+    // correct num_objects objects in parallel
+    size_t num_objects = 1000000;
+
+    std::vector<Eigen::Matrix3f> covs_eigen(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> covs_rm(num_objects);
+
+    for(size_t obj_id=0; obj_id<num_objects; obj_id++)
+    {
+        rm::Matrix3x3 Arm;
+        Eigen::Matrix3f Aeig = Eigen::Matrix3f::Random(3, 3);
+        for(size_t i=0; i<3; i++)
+        {
+            for(size_t j=0; j<3; j++)
+            {
+                Arm(i, j) = Aeig(i, j);
+            }
+        }
+
+        covs_eigen[obj_id] = Aeig;
+        covs_rm[obj_id] = Arm;
+    }
+
+
+    // C -> SVD -> UWT* -> U * W * T* -> C
+
+    std::cout << "Start computing SVD of " << num_objects << " 3x3 matrices" << std::endl;
+
+    std::vector<Eigen::Matrix3f> res_eigen(num_objects);
+
+    rm::StopWatch sw;
+    double el_eigen, el_rmagine;
+
+    sw();
+    #pragma omp parallel for
+    for(size_t obj_id=0; obj_id<num_objects; obj_id++)
+    {
+        Eigen::JacobiSVD<Eigen::Matrix3f> svdeig(covs_eigen[obj_id], 
+            Eigen::ComputeFullU | Eigen::ComputeFullV);
+        auto s = svdeig.singularValues();
+        Eigen::Matrix3f Seig = Eigen::Matrix3f::Zero();
+        for(size_t i=0; i<s.rows(); i++)
+        {
+            Seig(i, i) = s(i);
+        }
+        auto uvt_eig = svdeig.matrixU() * Seig * svdeig.matrixV().transpose();
+        res_eigen[obj_id] = uvt_eig;
+    }
+    el_eigen = sw();
+
+    float err_eigen = 0.0;
+    for(size_t obj_id = 0; obj_id < num_objects; obj_id++)
+    {
+        err_eigen += compute_error(covs_eigen[obj_id], res_eigen[obj_id]);
+    }
+
+    std::cout << "Eigen:" << std::endl;
+    std::cout << "- run time: " << el_eigen << " s" << std::endl;
+    std::cout << "- summed error: " << err_eigen << std::endl;
+
+
+    rm::Memory<rm::Matrix3x3> res_rm(num_objects);
+
+    sw();
+    #pragma omp parallel for
+    for(size_t obj_id=0; obj_id<num_objects; obj_id++)
+    {
+        rm::Matrix3x3 Urm = rm::Matrix3x3::Zeros();
+        rm::Matrix3x3 Wrm = rm::Matrix3x3::Zeros();
+        rm::Matrix3x3 Vrm = rm::Matrix3x3::Zeros();
+        rm::mysvd(covs_rm[obj_id], Urm, Wrm, Vrm);
+        auto uvt_rm = Urm * Wrm * Vrm.T();
+        res_rm[obj_id] = uvt_rm;
+    }
+    el_rmagine = sw();
+    
+    
+    float err_rmagine = 0.0;
+    for(size_t obj_id = 0; obj_id < num_objects; obj_id++)
+    {
+        err_rmagine += compute_error(covs_rm[obj_id], res_rm[obj_id]);
+    }
+    
+    std::cout << "Rmagine:" << std::endl;
+    std::cout << "- run time: " << el_rmagine << " s" << std::endl;
+    std::cout << "- summed error: " << err_rmagine << std::endl;
+
 
 }
 
@@ -213,12 +332,12 @@ int main(int argc, char** argv)
 {
     srand((unsigned int) time(0));
 
-    
-    svdTestWithPrints();
-
+    // svdTestWithPrints();
     runtimeTest<20, 30>();
-    equalityTest<5, 10>();
+    accuracyTest<20, 30>();
+
 
+    parallelTest();
 
 
     return 0;

From daa051a90ebdd5705ba21a07d1c45b767bffab70 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 17:48:04 +0200
Subject: [PATCH 28/44] tested new functions

---
 .../include/rmagine/math/SVD2.hpp             |  4 ++--
 src/rmagine_core/src/math/SVD2.cpp            | 20 ++++++++++++++++---
 tests/core/math_svd.cpp                       |  2 +-
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/src/rmagine_core/include/rmagine/math/SVD2.hpp b/src/rmagine_core/include/rmagine/math/SVD2.hpp
index 5080a0b..1cdbc9d 100644
--- a/src/rmagine_core/include/rmagine/math/SVD2.hpp
+++ b/src/rmagine_core/include/rmagine/math/SVD2.hpp
@@ -157,8 +157,8 @@ void svd(
  * @brief SVD that can be used for both CPU and GPU (Cuda kernels)
  *
  */
-// RMAGINE_FUNCTION
-void mysvd(
+RMAGINE_FUNCTION
+void svd(
     const Matrix3x3& A,
     Matrix3x3& U,
     Matrix3x3& W,
diff --git a/src/rmagine_core/src/math/SVD2.cpp b/src/rmagine_core/src/math/SVD2.cpp
index 0908702..3f4665d 100644
--- a/src/rmagine_core/src/math/SVD2.cpp
+++ b/src/rmagine_core/src/math/SVD2.cpp
@@ -417,14 +417,13 @@ void SVD2::reorder()
 //     return Matrix3x3::Zeros();
 // }
 
-// RMAGINE_FUNCTION
-void mysvd(
+RMAGINE_FUNCTION
+void svd(
     const Matrix3x3& a, 
     Matrix3x3& u,
     Matrix3x3& w,
     Matrix3x3& v)
 {
-    
     // TODO: test
     constexpr unsigned int m = 3;
     constexpr unsigned int n = 3;
@@ -441,6 +440,21 @@ void mysvd(
     float eps = std::numeric_limits<float>::epsilon();
     u = a;
 
+    // i = 0;
+    // {
+
+    // }
+
+    // i = 1;
+    // {
+
+    // }
+
+    // i = 2;
+    // {
+
+    // }
+
     for(i=0; i < n; i++) 
     {
         l = i+2;
diff --git a/tests/core/math_svd.cpp b/tests/core/math_svd.cpp
index fffb58f..8182016 100644
--- a/tests/core/math_svd.cpp
+++ b/tests/core/math_svd.cpp
@@ -308,7 +308,7 @@ void parallelTest()
         rm::Matrix3x3 Urm = rm::Matrix3x3::Zeros();
         rm::Matrix3x3 Wrm = rm::Matrix3x3::Zeros();
         rm::Matrix3x3 Vrm = rm::Matrix3x3::Zeros();
-        rm::mysvd(covs_rm[obj_id], Urm, Wrm, Vrm);
+        rm::svd(covs_rm[obj_id], Urm, Wrm, Vrm);
         auto uvt_rm = Urm * Wrm * Vrm.T();
         res_rm[obj_id] = uvt_rm;
     }

From a19fa723bb6ceeea7cc240fdbe1186ba54c65f55 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 18:30:02 +0200
Subject: [PATCH 29/44] started to implement hard wired SVD for 3x3 matrices

---
 .../include/rmagine/math/SVD2.hpp             |   8 +
 src/rmagine_core/src/math/SVD2.cpp            | 446 ++++++++++++++++++
 tests/core/math_svd.cpp                       |  33 +-
 3 files changed, 484 insertions(+), 3 deletions(-)

diff --git a/src/rmagine_core/include/rmagine/math/SVD2.hpp b/src/rmagine_core/include/rmagine/math/SVD2.hpp
index 1cdbc9d..c963749 100644
--- a/src/rmagine_core/include/rmagine/math/SVD2.hpp
+++ b/src/rmagine_core/include/rmagine/math/SVD2.hpp
@@ -165,6 +165,14 @@ void svd(
     Matrix3x3& V
 );
 
+RMAGINE_FUNCTION
+void svd2(
+    const Matrix3x3& A,
+    Matrix3x3& U,
+    Matrix3x3& W,
+    Matrix3x3& V
+);
+
 // RMAGINE_FUNCTION
 // void svd(
 //     const Matrix3x3& A, 
diff --git a/src/rmagine_core/src/math/SVD2.cpp b/src/rmagine_core/src/math/SVD2.cpp
index 3f4665d..918fcb2 100644
--- a/src/rmagine_core/src/math/SVD2.cpp
+++ b/src/rmagine_core/src/math/SVD2.cpp
@@ -722,6 +722,452 @@ void svd(
 }
 
 
+RMAGINE_FUNCTION
+void svd2(
+    const Matrix3x3& a, 
+    Matrix3x3& u,
+    Matrix3x3& w,
+    Matrix3x3& v)
+{
+    // TODO: test
+    constexpr unsigned int m = 3;
+    constexpr unsigned int n = 3;
+    constexpr unsigned int max_iterations = 30;
+    
+    // additional memory required
+    bool flag;
+    int its, j, jj, k, nm;
+    float anorm, c, f, g, h, s, scale, x, y, z;
+    
+    Vector3 rv1 = Vector3::Zeros();
+    
+    g = s = scale = anorm = 0.0;
+    float eps = std::numeric_limits<float>::epsilon();
+    u = a;
+
+    // i = 0;
+    {
+        // l = 2;
+        scale = fabs(u(0,0)) + fabs(u(1,0)) + fabs(u(2,0));
+        if(scale > 0.0)
+        {
+            u(0, 0) /= scale;
+            u(1, 0) /= scale;
+            u(2, 0) /= scale;
+
+            s = u(0,0) * u(0,0) + u(1,0) * u(1,0) + u(2,0) * u(2,0);
+            f = u(0,0);
+            g = -SIGN(sqrt(s), f);
+            h = f * g - s;
+
+            u(0, 0) = f - g;
+
+            f = (u(0, 0) * u(0, 1) + u(1, 0) * u(1, 1) + u(2, 0) * u(2, 1)) / h;
+            u(0, 1) += f * u(0, 0);
+            u(1, 1) += f * u(1, 0);
+            u(2, 1) += f * u(2, 0);
+
+            f = (u(0, 0) * u(0, 2) + u(1, 0) * u(1, 2) + u(2, 0) * u(2, 2)) / h;
+            u(0, 2) += f * u(0, 0);
+            u(1, 2) += f * u(1, 0);
+            u(2, 2) += f * u(2, 0);
+
+            u(0, 0) *= scale;
+            u(1, 0) *= scale;
+            u(2, 0) *= scale;
+        }
+        
+        w(0, 0) = scale * g;
+        g = s = scale = 0.0;
+        
+        scale = abs(u(0,0)) + abs(u(0,1)) + abs(u(0,2));
+        
+        if(scale > 0.0)
+        {
+            u(0, 1) /= scale;
+            u(0, 2) /= scale;
+            s = u(0,1) * u(0,1) + u(0,2) * u(0,2);
+
+            f = u(0, 1);
+            g = -SIGN(sqrt(s),f);
+            h = f * g-s;
+            u(0, 1) = f - g;
+
+            rv1.y = u(0, 1) / h;
+            rv1.z = u(0, 2) / h;
+
+            s = u(1,1) * u(0,1) + u(1,2) * u(0,2);
+            u(1, 1) += s * rv1.y;
+            u(1, 2) += s * rv1.z;
+        
+            s = u(2,1) * u(0,1) + u(2,2) * u(0,2);
+            u(2, 1) += s * rv1.y;
+            u(2, 2) += s * rv1.z;            
+
+            u(0, 1) *= scale;
+            u(0, 2) *= scale;
+        }
+        
+        anorm = MAX(anorm, (fabs(w(0, 0)) + fabs(rv1.x)));
+    }
+    int i, l;
+
+    i = 1;
+    {
+        l = i+2;
+        rv1[i] = scale*g;
+        g = s = scale = 0.0;
+        if(i < m) 
+        {
+            for(k=i; k<m; k++) 
+            {
+                scale += abs(u(k,i));
+            }
+            if(scale != 0.0) 
+            {
+                for(k=i; k<m; k++) 
+                {
+                    u(k,i) /= scale;
+                    s += u(k,i) * u(k,i);
+                }
+                f = u(i,i);
+                g = -SIGN(sqrt(s),f);
+                h = f*g-s;
+                u(i,i) = f-g;
+                for(j=l-1;j<n;j++) 
+                {
+                    for (s=0.0,k=i;k<m;k++) 
+                    {
+                        s += u(k,i) * u(k,j);
+                    }
+                    f = s/h;
+                    for (k=i;k<m;k++)
+                    {
+                        u(k,j) += f * u(k,i);
+                    }
+                }
+                for (k=i;k<m;k++)
+                {
+                    u(k,i) *= scale;
+                }
+            }
+        }
+        w(i, i) = scale * g;
+        g = s = scale = 0.0;
+        if(i+1 <= m && i+1 != n)
+        {
+            for(k=l-1; k<n; k++)
+            {
+                scale += abs(u(i,k));
+            }
+            if(scale != 0.0)
+            {
+                for(k=l-1;k<n;k++) 
+                {
+                    u(i, k) /= scale;
+                    s += u(i,k) * u(i,k);
+                }
+                f = u(i, l-1);
+                g = -SIGN(sqrt(s),f);
+                h = f*g-s;
+                u(i,l-1) = f-g;
+                for(k=l-1;k<n;k++)
+                {
+                    rv1[k] = u(i,k) / h;
+                }
+                for(j=l-1; j<m; j++)
+                {
+                    for (s=0.0,k=l-1; k<n; k++)
+                    {
+                        s += u(j,k) * u(i,k);
+                    }
+                    for (k=l-1; k<n;k++)
+                    {
+                        u(j,k) += s * rv1[k];
+                    }
+                }
+                for(k=l-1; k<n; k++)
+                {
+                    u(i,k) *= scale;
+                }
+            }
+        }
+        anorm = MAX(anorm, (abs(w(i, i))+abs(rv1[i])));
+    }
+
+    i = 2;
+    {
+        l = i+2;
+        rv1[i] = scale*g;
+        g = s = scale = 0.0;
+        if(i < m) 
+        {
+            for(k=i; k<m; k++) 
+            {
+                scale += abs(u(k,i));
+            }
+            if(scale != 0.0) 
+            {
+                for(k=i; k<m; k++) 
+                {
+                    u(k,i) /= scale;
+                    s += u(k,i) * u(k,i);
+                }
+                f = u(i,i);
+                g = -SIGN(sqrt(s),f);
+                h = f*g-s;
+                u(i,i) = f-g;
+                for(j=l-1;j<n;j++) 
+                {
+                    for (s=0.0,k=i;k<m;k++) 
+                    {
+                        s += u(k,i) * u(k,j);
+                    }
+                    f = s/h;
+                    for (k=i;k<m;k++)
+                    {
+                        u(k,j) += f * u(k,i);
+                    }
+                }
+                for (k=i;k<m;k++)
+                {
+                    u(k,i) *= scale;
+                }
+            }
+        }
+        w(i, i) = scale * g;
+        g = s = scale = 0.0;
+        if(i+1 <= m && i+1 != n)
+        {
+            for(k=l-1; k<n; k++)
+            {
+                scale += abs(u(i,k));
+            }
+            if(scale != 0.0)
+            {
+                for(k=l-1;k<n;k++) 
+                {
+                    u(i, k) /= scale;
+                    s += u(i,k) * u(i,k);
+                }
+                f = u(i, l-1);
+                g = -SIGN(sqrt(s),f);
+                h = f*g-s;
+                u(i,l-1) = f-g;
+                for(k=l-1;k<n;k++)
+                {
+                    rv1[k] = u(i,k) / h;
+                }
+                for(j=l-1; j<m; j++)
+                {
+                    for (s=0.0,k=l-1; k<n; k++)
+                    {
+                        s += u(j,k) * u(i,k);
+                    }
+                    for (k=l-1; k<n;k++)
+                    {
+                        u(j,k) += s * rv1[k];
+                    }
+                }
+                for(k=l-1; k<n; k++)
+                {
+                    u(i,k) *= scale;
+                }
+            }
+        }
+        anorm = MAX(anorm, (abs(w(i, i))+abs(rv1[i])));
+    }
+
+    // for(i=0; i < n; i++) 
+    // {
+        
+    // }
+    for(i=n-1; i>=0; i--)
+    {
+        if(i < n-1) 
+        {
+            if(g != 0.0)
+            {
+                for(j=l; j<n; j++)
+                {
+                    v(j,i) = (u(i,j)/u(i,l)) / g;
+                }
+                for(j=l; j<n; j++)
+                {
+                    for (s=0.0,k=l;k<n;k++) 
+                    {
+                        s += u(i,k) * v(k,j);
+                    }
+                    for (k=l; k<n; k++)
+                    {
+                        v(k,j) += s * v(k,i);
+                    }
+                }
+            }
+            for (j=l; j<n; j++) 
+            {
+                v(i,j) = 0.0;
+                v(j,i) = 0.0;
+            }
+        }
+        v(i,i) = 1.0;
+        g = rv1[i];
+        l = i;
+    }
+    for(i=MIN(m,n)-1; i>=0; i--)
+    {
+        l = i+1;
+        g = w(i, i);
+        for(j=l;j<n;j++) 
+        {
+            u(i,j) = 0.0;
+        }
+        if(g != 0.0)
+        {
+            g = 1.0/g;
+            for(j=l; j<n; j++)
+            {
+                for (s=0.0,k=l; k<m; k++) 
+                {
+                    s += u(k,i)*u(k,j);
+                }
+                f = (s/u(i,i)) * g;
+                for (k=i; k<m; k++) 
+                {
+                    u(k,j) += f * u(k,i);
+                }
+            }
+            for(j=i;j<m;j++) 
+            {
+                u(j,i) *= g;
+            }
+        } else {
+            for(j=i;j<m;j++) 
+            {
+                u(j,i) = 0.0;
+            } 
+        }
+        ++u(i,i);
+    }
+    for(k=n-1; k>=0; k--) 
+    {
+        for(its=0; its<max_iterations; its++) 
+        {
+            flag=true;
+            for(l=k; l>=0; l--) 
+            {
+                nm=l-1;
+                if (l == 0 || abs(rv1[l]) <= eps*anorm) {
+                    flag=false;
+                    break;
+                }
+                if (abs(w(nm, nm)) <= eps*anorm) 
+                {
+                    break;
+                }
+            }
+            if(flag) 
+            {
+                c=0.0;
+                s=1.0;
+                for(i=l; i<k+1; i++) 
+                {
+                    f = s*rv1[i];
+                    rv1[i] = c*rv1[i];
+                    if(abs(f) <= eps*anorm) 
+                    {
+                        break;
+                    }
+                    g = w(i, i);
+                    h = PYTHAG(f,g);
+                    w(i, i) = h;
+                    h = 1.0/h;
+                    c = g*h;
+                    s = -f*h;
+                    for(j=0; j<m; j++)
+                    {
+                        y = u(j,nm);
+                        z = u(j,i);
+                        u(j,nm) = y*c+z*s;
+                        u(j,i) = z*c-y*s;
+                    }
+                }
+            }
+            z = w(k, k);
+            if (l == k)
+            {
+                if (z < 0.0)
+                {
+                    w(k, k) = -z;
+                    for (j=0;j<n;j++) 
+                    {
+                        v(j,k) = -v(j,k);
+                    }
+                }
+                break;
+            }
+            if (its == max_iterations - 1) 
+            {
+                throw std::runtime_error("no convergence in 30 svdcmp iterations");
+            }
+            x = w(l, l);
+            nm = k-1;
+            y = w(nm, nm);
+            g = rv1[nm];
+            h = rv1[k];
+            f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+            g = PYTHAG(f, 1.f);
+            f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+            c = s = 1.f;
+            for (j=l;j<=nm;j++) 
+            {
+                i = j+1;
+                g = rv1[i];
+                y = w(i, i);
+                h = s*g;
+                g = c*g;
+                z = PYTHAG(f,h);
+                rv1[j] = z;
+                c = f/z;
+                s = h/z;
+                f = x*c+g*s;
+                g = g*c-x*s;
+                h = y*s;
+                y *= c;
+                for (jj=0;jj<n;jj++)
+                {
+                    x = v(jj,j);
+                    z = v(jj,i);
+                    v(jj,j) = x*c+z*s;
+                    v(jj,i) = z*c-x*s;
+                }
+                z = PYTHAG(f,h);
+                w(j, j) = z;
+                if (z) 
+                {
+                    z = 1.f/z;
+                    c = f*z;
+                    s = h*z;
+                }
+                f = c*g+s*y;
+                x = c*y-s*g;
+                for (jj=0;jj<m;jj++)
+                {
+                    y = u(jj,j);
+                    z = u(jj,i);
+                    u(jj,j) = y*c+z*s;
+                    u(jj,i) = z*c-y*s;
+                }
+            }
+            rv1[l] = 0.f;
+            rv1[k] = f;
+            w(k, k) = x;
+        }
+    }
+}
+
+
+
 // RMAGINE_FUNCTION
 // void svd(
 //     const Matrix3x3& a, 
diff --git a/tests/core/math_svd.cpp b/tests/core/math_svd.cpp
index 8182016..4e17822 100644
--- a/tests/core/math_svd.cpp
+++ b/tests/core/math_svd.cpp
@@ -269,10 +269,10 @@ void parallelTest()
     std::vector<Eigen::Matrix3f> res_eigen(num_objects);
 
     rm::StopWatch sw;
-    double el_eigen, el_rmagine;
+    double el_eigen, el_rmagine, el_rmagine2;
 
     sw();
-    #pragma omp parallel for
+    // #pragma omp parallel for
     for(size_t obj_id=0; obj_id<num_objects; obj_id++)
     {
         Eigen::JacobiSVD<Eigen::Matrix3f> svdeig(covs_eigen[obj_id], 
@@ -302,7 +302,7 @@ void parallelTest()
     rm::Memory<rm::Matrix3x3> res_rm(num_objects);
 
     sw();
-    #pragma omp parallel for
+    // #pragma omp parallel for
     for(size_t obj_id=0; obj_id<num_objects; obj_id++)
     {
         rm::Matrix3x3 Urm = rm::Matrix3x3::Zeros();
@@ -326,6 +326,33 @@ void parallelTest()
     std::cout << "- summed error: " << err_rmagine << std::endl;
 
 
+    rm::Memory<rm::Matrix3x3> res_rm2(num_objects);
+
+    sw();
+    // #pragma omp parallel for
+    for(size_t obj_id=0; obj_id<num_objects; obj_id++)
+    {
+        rm::Matrix3x3 Urm = rm::Matrix3x3::Zeros();
+        rm::Matrix3x3 Wrm = rm::Matrix3x3::Zeros();
+        rm::Matrix3x3 Vrm = rm::Matrix3x3::Zeros();
+        rm::svd2(covs_rm[obj_id], Urm, Wrm, Vrm);
+        auto uvt_rm = Urm * Wrm * Vrm.T();
+        res_rm2[obj_id] = uvt_rm;
+    }
+    el_rmagine2 = sw();
+
+    float err_rmagine2 = 0.0;
+    for(size_t obj_id = 0; obj_id < num_objects; obj_id++)
+    {
+        err_rmagine2 += compute_error(covs_rm[obj_id], res_rm2[obj_id]);
+    }
+
+
+    std::cout << "Rmagine Opti:" << std::endl;
+    std::cout << "- run time: " << el_rmagine2 << " s" << std::endl;
+    std::cout << "- summed error: " << err_rmagine2 << std::endl;
+
+
 }
 
 int main(int argc, char** argv)

From d7da9291eb15dc8077188766afd5cbcdef734f7b Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 19:19:55 +0200
Subject: [PATCH 30/44] fixed error for including jsoncpp twice

---
 CMakeLists.txt                    | 2 +-
 src/rmagine_ouster/CMakeLists.txt | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2cf0396..0de0e27 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -201,7 +201,7 @@ endif(CUDA_FOUND)
 
 message(STATUS "${BoldCyan}Components build:${ColourReset}")
 foreach(LIBRARY ${RMAGINE_LIBRARIES})
-  message(STATUS "- ${BoldGreen}${LIBRARY}${ColourReset}")
+    message(STATUS "- ${BoldGreen}${LIBRARY}${ColourReset}")
 endforeach()
 
 #### TESTS
diff --git a/src/rmagine_ouster/CMakeLists.txt b/src/rmagine_ouster/CMakeLists.txt
index 6570914..8e3e830 100644
--- a/src/rmagine_ouster/CMakeLists.txt
+++ b/src/rmagine_ouster/CMakeLists.txt
@@ -1,8 +1,6 @@
 
 message(STATUS "Building Core. Library: rmagine")
 
-find_package(jsoncpp REQUIRED)
-
 set(RMAGINE_CORE_SRCS
     # Types
     src/types/ouster_sensors.cpp
@@ -26,7 +24,7 @@ target_link_libraries(rmagine-ouster
     Eigen3::Eigen
     ${Boost_LIBRARIES}
     ${OpenMP_CXX_LIBRARIES}
-    jsoncpp
+    JsonCpp::JsonCpp
 )
 
 set_target_properties(rmagine-ouster

From fd656dfd94eda1ef03491c74cf4f322c205cd23f Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 19:49:42 +0200
Subject: [PATCH 31/44] update

---
 src/rmagine_core/src/math/SVD2.cpp | 245 +++++++++++++----------------
 1 file changed, 105 insertions(+), 140 deletions(-)

diff --git a/src/rmagine_core/src/math/SVD2.cpp b/src/rmagine_core/src/math/SVD2.cpp
index 918fcb2..d9d858d 100644
--- a/src/rmagine_core/src/math/SVD2.cpp
+++ b/src/rmagine_core/src/math/SVD2.cpp
@@ -440,21 +440,6 @@ void svd(
     float eps = std::numeric_limits<float>::epsilon();
     u = a;
 
-    // i = 0;
-    // {
-
-    // }
-
-    // i = 1;
-    // {
-
-    // }
-
-    // i = 2;
-    // {
-
-    // }
-
     for(i=0; i < n; i++) 
     {
         l = i+2;
@@ -746,155 +731,139 @@ void svd2(
     u = a;
 
     // i = 0;
+    // l = 2;
+    scale = fabs(u(0,0)) + fabs(u(1,0)) + fabs(u(2,0));
+    if(scale > 0.0)
     {
-        // l = 2;
-        scale = fabs(u(0,0)) + fabs(u(1,0)) + fabs(u(2,0));
-        if(scale > 0.0)
-        {
-            u(0, 0) /= scale;
-            u(1, 0) /= scale;
-            u(2, 0) /= scale;
+        u(0, 0) /= scale;
+        u(1, 0) /= scale;
+        u(2, 0) /= scale;
 
-            s = u(0,0) * u(0,0) + u(1,0) * u(1,0) + u(2,0) * u(2,0);
-            f = u(0,0);
-            g = -SIGN(sqrt(s), f);
-            h = f * g - s;
+        s = u(0,0) * u(0,0) + u(1,0) * u(1,0) + u(2,0) * u(2,0);
+        f = u(0,0);
+        g = -SIGN(sqrt(s), f);
+        h = f * g - s;
 
-            u(0, 0) = f - g;
+        u(0, 0) = f - g;
 
-            f = (u(0, 0) * u(0, 1) + u(1, 0) * u(1, 1) + u(2, 0) * u(2, 1)) / h;
-            u(0, 1) += f * u(0, 0);
-            u(1, 1) += f * u(1, 0);
-            u(2, 1) += f * u(2, 0);
+        f = (u(0, 0) * u(0, 1) + u(1, 0) * u(1, 1) + u(2, 0) * u(2, 1)) / h;
+        u(0, 1) += f * u(0, 0);
+        u(1, 1) += f * u(1, 0);
+        u(2, 1) += f * u(2, 0);
 
-            f = (u(0, 0) * u(0, 2) + u(1, 0) * u(1, 2) + u(2, 0) * u(2, 2)) / h;
-            u(0, 2) += f * u(0, 0);
-            u(1, 2) += f * u(1, 0);
-            u(2, 2) += f * u(2, 0);
+        f = (u(0, 0) * u(0, 2) + u(1, 0) * u(1, 2) + u(2, 0) * u(2, 2)) / h;
+        u(0, 2) += f * u(0, 0);
+        u(1, 2) += f * u(1, 0);
+        u(2, 2) += f * u(2, 0);
 
-            u(0, 0) *= scale;
-            u(1, 0) *= scale;
-            u(2, 0) *= scale;
-        }
-        
-        w(0, 0) = scale * g;
-        g = s = scale = 0.0;
-        
-        scale = abs(u(0,0)) + abs(u(0,1)) + abs(u(0,2));
-        
-        if(scale > 0.0)
-        {
-            u(0, 1) /= scale;
-            u(0, 2) /= scale;
-            s = u(0,1) * u(0,1) + u(0,2) * u(0,2);
+        u(0, 0) *= scale;
+        u(1, 0) *= scale;
+        u(2, 0) *= scale;
+    }
+    
+    w(0, 0) = scale * g;
+    g = s = scale = 0.0;
+    
+    scale = abs(u(0,0)) + abs(u(0,1)) + abs(u(0,2));
+    
+    if(scale > 0.0)
+    {
+        u(0, 1) /= scale;
+        u(0, 2) /= scale;
+        s = u(0,1) * u(0,1) + u(0,2) * u(0,2);
 
-            f = u(0, 1);
-            g = -SIGN(sqrt(s),f);
-            h = f * g-s;
-            u(0, 1) = f - g;
+        f = u(0, 1);
+        g = -SIGN(sqrt(s),f);
+        h = f * g-s;
+        u(0, 1) = f - g;
+
+        rv1.y = u(0, 1) / h;
+        rv1.z = u(0, 2) / h;
 
-            rv1.y = u(0, 1) / h;
-            rv1.z = u(0, 2) / h;
+        s = u(1,1) * u(0,1) + u(1,2) * u(0,2);
+        u(1, 1) += s * rv1.y;
+        u(1, 2) += s * rv1.z;
+    
+        s = u(2,1) * u(0,1) + u(2,2) * u(0,2);
+        u(2, 1) += s * rv1.y;
+        u(2, 2) += s * rv1.z;            
+
+        u(0, 1) *= scale;
+        u(0, 2) *= scale;
+    }
+    
+    anorm = fabs(w(0, 0));
+    // anorm = MAX(anorm, (fabs(w(0, 0)) + fabs(rv1.x))); // rv1.x is always 0 here, anorm too. fabs(X) >= 0
+    
+    
 
-            s = u(1,1) * u(0,1) + u(1,2) * u(0,2);
-            u(1, 1) += s * rv1.y;
-            u(1, 2) += s * rv1.z;
+    // i = 1;
+    {
+        // l = 3; // l = 3
+        rv1.y = scale * g;
+        g = 0.0;
+        scale = fabs(u(1, 1)) + fabs(u(2, 1));
         
-            s = u(2,1) * u(0,1) + u(2,2) * u(0,2);
-            u(2, 1) += s * rv1.y;
-            u(2, 2) += s * rv1.z;            
+        if(scale > 0.0)
+        {
+            u(1,1) /= scale;
+            u(2,1) /= scale;
 
-            u(0, 1) *= scale;
-            u(0, 2) *= scale;
+            s = u(1,1) * u(1,1) + u(2,1) * u(2,1);
+            f = u(1,1);
+            g = -SIGN(sqrt(s),f);
+            h = f*g-s;
+            u(1,1) = f-g;
+            
+            f = (u(1,1) * u(1,2) + u(2,1) * u(2,2)) / h;
+            u(1,2) += f * u(1,1);
+            u(2,2) += f * u(2,1);
+            
+            u(1,1) *= scale;
+            u(2,1) *= scale;
         }
         
-        anorm = MAX(anorm, (fabs(w(0, 0)) + fabs(rv1.x)));
-    }
-    int i, l;
+        w(1, 1) = scale * g;
+        
+        scale = abs(u(1,2));
 
-    i = 1;
-    {
-        l = i+2;
-        rv1[i] = scale*g;
-        g = s = scale = 0.0;
-        if(i < m) 
+        if(scale > 0.0)
         {
-            for(k=i; k<m; k++) 
+            u(1, 2) /= scale;
+            s = u(1, 2) * u(1, 2);
+            
+            f = u(1, 2);
+            g = -SIGN(sqrt(s), f);
+            h = f * g - s;
+            u(1, 2) = f-g;
+
+            for(k=2; k<n; k++)
             {
-                scale += abs(u(k,i));
+                rv1[k] = u(1,k) / h;
             }
-            if(scale != 0.0) 
+            for(j=2; j<3; j++)
             {
-                for(k=i; k<m; k++) 
-                {
-                    u(k,i) /= scale;
-                    s += u(k,i) * u(k,i);
-                }
-                f = u(i,i);
-                g = -SIGN(sqrt(s),f);
-                h = f*g-s;
-                u(i,i) = f-g;
-                for(j=l-1;j<n;j++) 
+                s = 0.0;
+                for (k=2; k<3; k++)
                 {
-                    for (s=0.0,k=i;k<m;k++) 
-                    {
-                        s += u(k,i) * u(k,j);
-                    }
-                    f = s/h;
-                    for (k=i;k<m;k++)
-                    {
-                        u(k,j) += f * u(k,i);
-                    }
+                    s += u(j,k) * u(1,k);
                 }
-                for (k=i;k<m;k++)
+                for (k=2; k<3; k++)
                 {
-                    u(k,i) *= scale;
+                    u(j,k) += s * rv1[k];
                 }
             }
-        }
-        w(i, i) = scale * g;
-        g = s = scale = 0.0;
-        if(i+1 <= m && i+1 != n)
-        {
-            for(k=l-1; k<n; k++)
-            {
-                scale += abs(u(i,k));
-            }
-            if(scale != 0.0)
+            for(k=2; k<3; k++)
             {
-                for(k=l-1;k<n;k++) 
-                {
-                    u(i, k) /= scale;
-                    s += u(i,k) * u(i,k);
-                }
-                f = u(i, l-1);
-                g = -SIGN(sqrt(s),f);
-                h = f*g-s;
-                u(i,l-1) = f-g;
-                for(k=l-1;k<n;k++)
-                {
-                    rv1[k] = u(i,k) / h;
-                }
-                for(j=l-1; j<m; j++)
-                {
-                    for (s=0.0,k=l-1; k<n; k++)
-                    {
-                        s += u(j,k) * u(i,k);
-                    }
-                    for (k=l-1; k<n;k++)
-                    {
-                        u(j,k) += s * rv1[k];
-                    }
-                }
-                for(k=l-1; k<n; k++)
-                {
-                    u(i,k) *= scale;
-                }
+                u(1,k) *= scale;
             }
         }
-        anorm = MAX(anorm, (abs(w(i, i))+abs(rv1[i])));
+    
+        anorm = MAX(anorm, (abs(w(1, 1))+abs(rv1.y)));
     }
 
+    int i, l;
+
     i = 2;
     {
         l = i+2;
@@ -978,10 +947,6 @@ void svd2(
         anorm = MAX(anorm, (abs(w(i, i))+abs(rv1[i])));
     }
 
-    // for(i=0; i < n; i++) 
-    // {
-        
-    // }
     for(i=n-1; i>=0; i--)
     {
         if(i < n-1) 

From 9fa3a140fb7da0ac13755d4b7397065ccae0a258 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 20:03:47 +0200
Subject: [PATCH 32/44] hardwired first part

---
 src/rmagine_core/src/math/SVD2.cpp | 194 +++++++++--------------------
 1 file changed, 60 insertions(+), 134 deletions(-)

diff --git a/src/rmagine_core/src/math/SVD2.cpp b/src/rmagine_core/src/math/SVD2.cpp
index d9d858d..5131a8a 100644
--- a/src/rmagine_core/src/math/SVD2.cpp
+++ b/src/rmagine_core/src/math/SVD2.cpp
@@ -798,155 +798,81 @@ void svd2(
     
 
     // i = 1;
+    // l = 3;
+    rv1.y = scale * g;
+    g = 0.0;
+    scale = fabs(u(1, 1)) + fabs(u(2, 1));
+    
+    if(scale > 0.0)
     {
-        // l = 3; // l = 3
-        rv1.y = scale * g;
-        g = 0.0;
-        scale = fabs(u(1, 1)) + fabs(u(2, 1));
-        
-        if(scale > 0.0)
-        {
-            u(1,1) /= scale;
-            u(2,1) /= scale;
+        u(1,1) /= scale;
+        u(2,1) /= scale;
 
-            s = u(1,1) * u(1,1) + u(2,1) * u(2,1);
-            f = u(1,1);
-            g = -SIGN(sqrt(s),f);
-            h = f*g-s;
-            u(1,1) = f-g;
-            
-            f = (u(1,1) * u(1,2) + u(2,1) * u(2,2)) / h;
-            u(1,2) += f * u(1,1);
-            u(2,2) += f * u(2,1);
-            
-            u(1,1) *= scale;
-            u(2,1) *= scale;
-        }
+        s = u(1,1) * u(1,1) + u(2,1) * u(2,1);
+        f = u(1,1);
+        g = -SIGN(sqrt(s),f);
+        h = f*g-s;
+        u(1,1) = f-g;
         
-        w(1, 1) = scale * g;
+        f = (u(1,1) * u(1,2) + u(2,1) * u(2,2)) / h;
+        u(1,2) += f * u(1,1);
+        u(2,2) += f * u(2,1);
         
-        scale = abs(u(1,2));
+        u(1,1) *= scale;
+        u(2,1) *= scale;
+    }
+    
+    w(1, 1) = scale * g;
+    
+    scale = abs(u(1,2));
+    if(scale > 0.0)
+    {
+        u(1, 2) /= scale;
+        s = u(1, 2) * u(1, 2);
+        
+        f = u(1, 2);
+        g = -SIGN(sqrt(s), f);
+        h = f * g - s;
+        u(1, 2) = f - g;
 
-        if(scale > 0.0)
-        {
-            u(1, 2) /= scale;
-            s = u(1, 2) * u(1, 2);
-            
-            f = u(1, 2);
-            g = -SIGN(sqrt(s), f);
-            h = f * g - s;
-            u(1, 2) = f-g;
+        rv1.z = u(1,2) / h;
+        s = u(2, 2) * u(1, 2);
 
-            for(k=2; k<n; k++)
-            {
-                rv1[k] = u(1,k) / h;
-            }
-            for(j=2; j<3; j++)
-            {
-                s = 0.0;
-                for (k=2; k<3; k++)
-                {
-                    s += u(j,k) * u(1,k);
-                }
-                for (k=2; k<3; k++)
-                {
-                    u(j,k) += s * rv1[k];
-                }
-            }
-            for(k=2; k<3; k++)
-            {
-                u(1,k) *= scale;
-            }
-        }
-    
-        anorm = MAX(anorm, (abs(w(1, 1))+abs(rv1.y)));
+        u(2, 2) += s * rv1.z;
+        u(1, 2) *= scale;
     }
 
-    int i, l;
+    anorm = MAX(anorm, (abs(w(1, 1)) + abs(rv1.y)));
+    
+
 
-    i = 2;
+    // i = 2;
     {
-        l = i+2;
-        rv1[i] = scale*g;
-        g = s = scale = 0.0;
-        if(i < m) 
+        // l = 4;
+        rv1.z = scale * g;
+        
+        scale = abs(u(2, 2));
+
+        if(scale != 0.0) 
         {
-            for(k=i; k<m; k++) 
-            {
-                scale += abs(u(k,i));
-            }
-            if(scale != 0.0) 
-            {
-                for(k=i; k<m; k++) 
-                {
-                    u(k,i) /= scale;
-                    s += u(k,i) * u(k,i);
-                }
-                f = u(i,i);
-                g = -SIGN(sqrt(s),f);
-                h = f*g-s;
-                u(i,i) = f-g;
-                for(j=l-1;j<n;j++) 
-                {
-                    for (s=0.0,k=i;k<m;k++) 
-                    {
-                        s += u(k,i) * u(k,j);
-                    }
-                    f = s/h;
-                    for (k=i;k<m;k++)
-                    {
-                        u(k,j) += f * u(k,i);
-                    }
-                }
-                for (k=i;k<m;k++)
-                {
-                    u(k,i) *= scale;
-                }
-            }
+            u(2, 2) /= scale;
+            s = u(2, 2) * u(2, 2);
+            f = u(2, 2);
+            g = -SIGN(sqrt(s),f);
+            h = f * g - s;
+
+            u(2, 2) = f - g;
+            u(2, 2) *= scale;
         }
-        w(i, i) = scale * g;
+        
+        w(2, 2) = scale * g;
         g = s = scale = 0.0;
-        if(i+1 <= m && i+1 != n)
-        {
-            for(k=l-1; k<n; k++)
-            {
-                scale += abs(u(i,k));
-            }
-            if(scale != 0.0)
-            {
-                for(k=l-1;k<n;k++) 
-                {
-                    u(i, k) /= scale;
-                    s += u(i,k) * u(i,k);
-                }
-                f = u(i, l-1);
-                g = -SIGN(sqrt(s),f);
-                h = f*g-s;
-                u(i,l-1) = f-g;
-                for(k=l-1;k<n;k++)
-                {
-                    rv1[k] = u(i,k) / h;
-                }
-                for(j=l-1; j<m; j++)
-                {
-                    for (s=0.0,k=l-1; k<n; k++)
-                    {
-                        s += u(j,k) * u(i,k);
-                    }
-                    for (k=l-1; k<n;k++)
-                    {
-                        u(j,k) += s * rv1[k];
-                    }
-                }
-                for(k=l-1; k<n; k++)
-                {
-                    u(i,k) *= scale;
-                }
-            }
-        }
-        anorm = MAX(anorm, (abs(w(i, i))+abs(rv1[i])));
+        
+        anorm = MAX(anorm, (abs(w(2, 2))+abs(rv1.z)));
     }
 
+    int i, l;
+
     for(i=n-1; i>=0; i--)
     {
         if(i < n-1) 

From 36d28f8711bb547734ce1c4dc274d7a2a1d6552c Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 20:30:21 +0200
Subject: [PATCH 33/44] second part hard wired

---
 src/rmagine_core/src/math/SVD2.cpp | 127 +++++++++++++++--------------
 1 file changed, 66 insertions(+), 61 deletions(-)

diff --git a/src/rmagine_core/src/math/SVD2.cpp b/src/rmagine_core/src/math/SVD2.cpp
index 5131a8a..4d55226 100644
--- a/src/rmagine_core/src/math/SVD2.cpp
+++ b/src/rmagine_core/src/math/SVD2.cpp
@@ -730,6 +730,8 @@ void svd2(
     float eps = std::numeric_limits<float>::epsilon();
     u = a;
 
+    // FIRST PART
+
     // i = 0;
     // l = 2;
     scale = fabs(u(0,0)) + fabs(u(1,0)) + fabs(u(2,0));
@@ -795,8 +797,6 @@ void svd2(
     anorm = fabs(w(0, 0));
     // anorm = MAX(anorm, (fabs(w(0, 0)) + fabs(rv1.x))); // rv1.x is always 0 here, anorm too. fabs(X) >= 0
     
-    
-
     // i = 1;
     // l = 3;
     rv1.y = scale * g;
@@ -811,7 +811,7 @@ void svd2(
         s = u(1,1) * u(1,1) + u(2,1) * u(2,1);
         f = u(1,1);
         g = -SIGN(sqrt(s),f);
-        h = f*g-s;
+        h = f * g - s;
         u(1,1) = f-g;
         
         f = (u(1,1) * u(1,2) + u(2,1) * u(2,2)) / h;
@@ -823,88 +823,93 @@ void svd2(
     }
     
     w(1, 1) = scale * g;
+    g = s = scale = 0.0;
     
     scale = abs(u(1,2));
     if(scale > 0.0)
     {
-        u(1, 2) /= scale;
-        s = u(1, 2) * u(1, 2);
+        u(1,2) /= scale;
+        s = u(1,2) * u(1,2);
         
         f = u(1, 2);
         g = -SIGN(sqrt(s), f);
         h = f * g - s;
-        u(1, 2) = f - g;
+        u(1,2) = f - g;
 
         rv1.z = u(1,2) / h;
-        s = u(2, 2) * u(1, 2);
+        s = u(2,2) * u(1,2);
 
-        u(2, 2) += s * rv1.z;
-        u(1, 2) *= scale;
+        u(2,2) += s * rv1.z;
+        u(1,2) *= scale;
     }
 
     anorm = MAX(anorm, (abs(w(1, 1)) + abs(rv1.y)));
     
+    rv1.z = scale * g;
 
-
-    // i = 2;
+    scale = abs(u(2, 2));
+    if(scale > 0.0) 
     {
-        // l = 4;
-        rv1.z = scale * g;
-        
-        scale = abs(u(2, 2));
+        u(2, 2) /= scale;
+        s = u(2, 2) * u(2, 2);
+        f = u(2, 2);
+        g = -SIGN(sqrt(s),f);
+        h = f * g - s;
 
-        if(scale != 0.0) 
-        {
-            u(2, 2) /= scale;
-            s = u(2, 2) * u(2, 2);
-            f = u(2, 2);
-            g = -SIGN(sqrt(s),f);
-            h = f * g - s;
+        u(2, 2) = f - g;
+        u(2, 2) *= scale;
+    }
+    
+    w(2, 2) = scale * g;
+    g = s = scale = 0.0;
+    
+    anorm = MAX(anorm, (abs(w(2, 2))+abs(rv1.z)));
 
-            u(2, 2) = f - g;
-            u(2, 2) *= scale;
-        }
-        
-        w(2, 2) = scale * g;
-        g = s = scale = 0.0;
-        
-        anorm = MAX(anorm, (abs(w(2, 2))+abs(rv1.z)));
+    // SECOND PART    
+    v(2, 2) = 1.0;
+    g = rv1.z;
+
+    // i = 1;
+    // l = 2;
+    if(fabs(g) > 0.0)
+    {
+        v(2,1) = (u(1,2) / u(1,2)) / g;
+        s = u(1,2) * v(2,2);
+        v(2,2) += s * v(2,1);
     }
+    v(1,2) = 0.0;
+    v(2,1) = 0.0;
+    v(1,1) = 1.0;
 
-    int i, l;
+    g = rv1.y;
 
-    for(i=n-1; i>=0; i--)
+    // l = 1;
+    // i = 0;
+    if(fabs(g) > 0.0)
     {
-        if(i < n-1) 
-        {
-            if(g != 0.0)
-            {
-                for(j=l; j<n; j++)
-                {
-                    v(j,i) = (u(i,j)/u(i,l)) / g;
-                }
-                for(j=l; j<n; j++)
-                {
-                    for (s=0.0,k=l;k<n;k++) 
-                    {
-                        s += u(i,k) * v(k,j);
-                    }
-                    for (k=l; k<n; k++)
-                    {
-                        v(k,j) += s * v(k,i);
-                    }
-                }
-            }
-            for (j=l; j<n; j++) 
-            {
-                v(i,j) = 0.0;
-                v(j,i) = 0.0;
-            }
-        }
-        v(i,i) = 1.0;
-        g = rv1[i];
-        l = i;
+        v(1,0) = (u(0,1) / u(0,1)) / g;
+        v(2,0) = (u(0,2) / u(0,1)) / g;
+
+        s = u(0,1) * v(1,1) + u(0,2) * v(2,1);
+        v(1,1) += s * v(1,0);
+        v(2,1) += s * v(2,0);
+
+        s = u(0,1) * v(1,2) + u(0,2) * v(2,2);
+        v(1,2) += s * v(1,0);
+        v(2,2) += s * v(2,0);
     }
+    v(0,1) = 0.0;
+    v(1,0) = 0.0;
+    v(0,2) = 0.0;
+    v(2,0) = 0.0;
+    v(0,0) = 1.0;
+    g = rv1.x;
+
+
+    int i, l;
+
+    l = 0;
+
     for(i=MIN(m,n)-1; i>=0; i--)
     {
         l = i+1;

From e676ee95e9439c69bccbd8cd0c04d810d604c191 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 20:57:42 +0200
Subject: [PATCH 34/44] part 3 finished

---
 src/rmagine_core/src/math/SVD2.cpp | 100 +++++++++++++++++++----------
 tests/core/math_svd.cpp            |   3 +
 2 files changed, 68 insertions(+), 35 deletions(-)

diff --git a/src/rmagine_core/src/math/SVD2.cpp b/src/rmagine_core/src/math/SVD2.cpp
index 4d55226..10a0881 100644
--- a/src/rmagine_core/src/math/SVD2.cpp
+++ b/src/rmagine_core/src/math/SVD2.cpp
@@ -906,45 +906,75 @@ void svd2(
     g = rv1.x;
 
 
-    int i, l;
 
-    l = 0;
+    // THIRD PART
+    
+    // i = 2;
+    // l = 3;
+    g = w(2, 2);
+    if(fabs(g) > 0.0)
+    {
+        u(2,2) /= g;
+    } else { 
+        // TODO(amock): shouldnt this be a large number?
+        u(2,2) = 0.0;   
+    }
+    u(2,2) += 1.0;
+    
+    // i = 1;
+    // l = 2;
 
-    for(i=MIN(m,n)-1; i>=0; i--)
+    g = w(1, 1);
+    u(1,2) = 0.0;
+    
+    if(fabs(g) > 0.0)
     {
-        l = i+1;
-        g = w(i, i);
-        for(j=l;j<n;j++) 
-        {
-            u(i,j) = 0.0;
-        }
-        if(g != 0.0)
-        {
-            g = 1.0/g;
-            for(j=l; j<n; j++)
-            {
-                for (s=0.0,k=l; k<m; k++) 
-                {
-                    s += u(k,i)*u(k,j);
-                }
-                f = (s/u(i,i)) * g;
-                for (k=i; k<m; k++) 
-                {
-                    u(k,j) += f * u(k,i);
-                }
-            }
-            for(j=i;j<m;j++) 
-            {
-                u(j,i) *= g;
-            }
-        } else {
-            for(j=i;j<m;j++) 
-            {
-                u(j,i) = 0.0;
-            } 
-        }
-        ++u(i,i);
+        g = 1.0/g;
+        s = u(2,1) * u(2,2);
+        f = (s/u(1,1)) * g;
+
+        u(1,2) += f * u(1,1);
+        u(2,2) += f * u(2,1);
+    
+        u(1,1) *= g;
+        u(2,1) *= g;
+    } else {
+        u(1,1) = 0.0;
+        u(2,1) = 0.0;
     }
+    u(1,1) += 1.0;
+    
+    // i = 0;
+    // l = 1;
+    g = w(0, 0);
+    u(0,1) = 0.0;
+    u(0,2) = 0.0;
+    
+    if(fabs(g) > 0.0)
+    {
+        f = (u(1,0) * u(1,1) + u(2,0) * u(2,1)) / (g * u(0,0));
+        u(0,1) += f * u(0,0);
+        u(1,1) += f * u(1,0);
+        u(2,1) += f * u(2,0);
+
+        f = (u(1,0) * u(1,2) + u(2,0) * u(2,2)) / (g * u(0,0));
+        u(0,2) += f * u(0,0);
+        u(1,2) += f * u(1,0);
+        u(2,2) += f * u(2,0);
+
+        u(0,0) /= g;
+        u(1,0) /= g;
+        u(2,0) /= g;
+    } else {
+        u(0,0) = 0.0;
+        u(1,0) = 0.0;
+        u(2,0) = 0.0;
+    }
+    u(0,0) += 1.0;
+
+    int i, l;
+
+    // PART 4: Opti
     for(k=n-1; k>=0; k--) 
     {
         for(its=0; its<max_iterations; its++) 
diff --git a/tests/core/math_svd.cpp b/tests/core/math_svd.cpp
index 4e17822..8f22f40 100644
--- a/tests/core/math_svd.cpp
+++ b/tests/core/math_svd.cpp
@@ -261,6 +261,9 @@ void parallelTest()
         covs_rm[obj_id] = Arm;
     }
 
+    std::cout << "First Mat: " << std::endl;
+    std::cout << covs_rm[0] << std::endl;
+
 
     // C -> SVD -> UWT* -> U * W * T* -> C
 

From e44f18559dc7619d8623f01bbf2c4c038a8788bb Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 21:07:38 +0200
Subject: [PATCH 35/44] first opti

---
 src/rmagine_core/src/math/SVD2.cpp | 237 ++++++++++++++++++++++++++++-
 1 file changed, 235 insertions(+), 2 deletions(-)

diff --git a/src/rmagine_core/src/math/SVD2.cpp b/src/rmagine_core/src/math/SVD2.cpp
index 10a0881..eafcdfa 100644
--- a/src/rmagine_core/src/math/SVD2.cpp
+++ b/src/rmagine_core/src/math/SVD2.cpp
@@ -975,7 +975,124 @@ void svd2(
     int i, l;
 
     // PART 4: Opti
-    for(k=n-1; k>=0; k--) 
+
+    k = 2;
+    {
+        for(its=0; its<max_iterations; its++) 
+        {
+            flag=true;
+            for(l=2; l>=0; l--) 
+            {
+                nm = l-1;
+                if(l == 0 || abs(rv1[l]) <= eps*anorm) {
+                    flag=false;
+                    break;
+                }
+                if(abs(w(nm, nm)) <= eps*anorm) 
+                {
+                    break;
+                }
+            }
+            if(flag) 
+            {
+                c=0.0;
+                s=1.0;
+                for(i=l; i<3; i++) 
+                {
+                    f = s*rv1[i];
+                    rv1[i] = c*rv1[i];
+                    if(abs(f) <= eps*anorm) 
+                    {
+                        break;
+                    }
+                    g = w(i, i);
+                    h = PYTHAG(f,g);
+                    w(i, i) = h;
+                    h = 1.0/h;
+                    c = g*h;
+                    s = -f*h;
+                    for(j=0; j<m; j++)
+                    {
+                        y = u(j,nm);
+                        z = u(j,i);
+                        u(j,nm) = y*c+z*s;
+                        u(j,i) = z*c-y*s;
+                    }
+                }
+            }
+            z = w(2, 2);
+            if (l == 2)
+            {
+                if (z < 0.0)
+                {
+                    w(2, 2) = -z;
+                    for (j=0; j<3; j++) 
+                    {
+                        v(j,2) = -v(j,2);
+                    }
+                }
+                break;
+            }
+            if (its == max_iterations - 1) 
+            {
+                throw std::runtime_error("no convergence in 30 svdcmp iterations");
+            }
+            x = w(l, l);
+            nm = 1;
+            y = w(1, 1);
+            g = rv1.y;
+            h = rv1.z;
+            f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+            g = PYTHAG(f, 1.f);
+            f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+            c = s = 1.f;
+            for (j=l; j<2; j++) 
+            {
+                i = j+1;
+                g = rv1[i];
+                y = w(i, i);
+                h = s*g;
+                g = c*g;
+                z = PYTHAG(f,h);
+                rv1[j] = z;
+                c = f/z;
+                s = h/z;
+                f = x*c+g*s;
+                g = g*c-x*s;
+                h = y*s;
+                y *= c;
+                for (jj=0;jj<n;jj++)
+                {
+                    x = v(jj,j);
+                    z = v(jj,i);
+                    v(jj,j) = x*c+z*s;
+                    v(jj,i) = z*c-x*s;
+                }
+                z = PYTHAG(f,h);
+                w(j, j) = z;
+                if (z) 
+                {
+                    z = 1.f/z;
+                    c = f*z;
+                    s = h*z;
+                }
+                f = c*g+s*y;
+                x = c*y-s*g;
+                for (jj=0;jj<m;jj++)
+                {
+                    y = u(jj,j);
+                    z = u(jj,i);
+                    u(jj,j) = y*c+z*s;
+                    u(jj,i) = z*c-y*s;
+                }
+            }
+            rv1[l] = 0.f;
+            rv1[k] = f;
+            w(k, k) = x;
+        }
+    }
+
+    k = 1;
     {
         for(its=0; its<max_iterations; its++) 
         {
@@ -1025,7 +1142,123 @@ void svd2(
                 if (z < 0.0)
                 {
                     w(k, k) = -z;
-                    for (j=0;j<n;j++) 
+                    for (j=0;j<3;j++) 
+                    {
+                        v(j,k) = -v(j,k);
+                    }
+                }
+                break;
+            }
+            if (its == max_iterations - 1) 
+            {
+                throw std::runtime_error("no convergence in 30 svdcmp iterations");
+            }
+            x = w(l, l);
+            nm = k-1;
+            y = w(nm, nm);
+            g = rv1[nm];
+            h = rv1[k];
+            f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+            g = PYTHAG(f, 1.f);
+            f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+            c = s = 1.f;
+            for (j=l;j<=nm;j++) 
+            {
+                i = j+1;
+                g = rv1[i];
+                y = w(i, i);
+                h = s*g;
+                g = c*g;
+                z = PYTHAG(f,h);
+                rv1[j] = z;
+                c = f/z;
+                s = h/z;
+                f = x*c+g*s;
+                g = g*c-x*s;
+                h = y*s;
+                y *= c;
+                for (jj=0;jj<n;jj++)
+                {
+                    x = v(jj,j);
+                    z = v(jj,i);
+                    v(jj,j) = x*c+z*s;
+                    v(jj,i) = z*c-x*s;
+                }
+                z = PYTHAG(f,h);
+                w(j, j) = z;
+                if (z) 
+                {
+                    z = 1.f/z;
+                    c = f*z;
+                    s = h*z;
+                }
+                f = c*g+s*y;
+                x = c*y-s*g;
+                for (jj=0;jj<m;jj++)
+                {
+                    y = u(jj,j);
+                    z = u(jj,i);
+                    u(jj,j) = y*c+z*s;
+                    u(jj,i) = z*c-y*s;
+                }
+            }
+            rv1[l] = 0.f;
+            rv1[k] = f;
+            w(k, k) = x;
+        }
+    }
+
+    k = 0;
+    {
+        for(its=0; its<max_iterations; its++) 
+        {
+            flag=true;
+            for(l=k; l>=0; l--) 
+            {
+                nm=l-1;
+                if (l == 0 || abs(rv1[l]) <= eps*anorm) {
+                    flag=false;
+                    break;
+                }
+                if (abs(w(nm, nm)) <= eps*anorm) 
+                {
+                    break;
+                }
+            }
+            if(flag) 
+            {
+                c=0.0;
+                s=1.0;
+                for(i=l; i<k+1; i++) 
+                {
+                    f = s*rv1[i];
+                    rv1[i] = c*rv1[i];
+                    if(abs(f) <= eps*anorm) 
+                    {
+                        break;
+                    }
+                    g = w(i, i);
+                    h = PYTHAG(f,g);
+                    w(i, i) = h;
+                    h = 1.0/h;
+                    c = g*h;
+                    s = -f*h;
+                    for(j=0; j<m; j++)
+                    {
+                        y = u(j,nm);
+                        z = u(j,i);
+                        u(j,nm) = y*c+z*s;
+                        u(j,i) = z*c-y*s;
+                    }
+                }
+            }
+            z = w(k, k);
+            if (l == k)
+            {
+                if (z < 0.0)
+                {
+                    w(k, k) = -z;
+                    for (j=0;j<3;j++) 
                     {
                         v(j,k) = -v(j,k);
                     }

From 90c74d32614d88651fd09a950aff5e42faef6dc9 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 22:00:21 +0200
Subject: [PATCH 36/44] smaller opti

---
 src/rmagine_core/src/math/SVD2.cpp | 38 ++++++++++++++++++------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/src/rmagine_core/src/math/SVD2.cpp b/src/rmagine_core/src/math/SVD2.cpp
index eafcdfa..3502793 100644
--- a/src/rmagine_core/src/math/SVD2.cpp
+++ b/src/rmagine_core/src/math/SVD2.cpp
@@ -981,19 +981,28 @@ void svd2(
         for(its=0; its<max_iterations; its++) 
         {
             flag=true;
-            for(l=2; l>=0; l--) 
+
+            l=2;
+            if(abs(rv1.z) <= eps*anorm)
+            {
+                l=2;
+                flag=false;
+            }
+            else if(abs(w(1, 1)) > eps*anorm)
             {
-                nm = l-1;
-                if(l == 0 || abs(rv1[l]) <= eps*anorm) {
+                l=1;
+                if(abs(rv1.y) <= eps*anorm) 
+                {
                     flag=false;
-                    break;
                 }
-                if(abs(w(nm, nm)) <= eps*anorm) 
+                else if(abs(w(0, 0)) > eps*anorm) 
                 {
-                    break;
+                    l=0;
+                    flag = false;
                 }
             }
-            if(flag) 
+            
+            if(flag)
             {
                 c=0.0;
                 s=1.0;
@@ -1013,17 +1022,17 @@ void svd2(
                     s = -f*h;
                     for(j=0; j<m; j++)
                     {
-                        y = u(j,nm);
+                        y = u(j,l-1);
                         z = u(j,i);
-                        u(j,nm) = y*c+z*s;
+                        u(j,l-1) = y*c+z*s;
                         u(j,i) = z*c-y*s;
                     }
                 }
             }
             z = w(2, 2);
-            if (l == 2)
+            if(l == 2)
             {
-                if (z < 0.0)
+                if(z < 0.0)
                 {
                     w(2, 2) = -z;
                     for (j=0; j<3; j++) 
@@ -1033,12 +1042,11 @@ void svd2(
                 }
                 break;
             }
-            if (its == max_iterations - 1) 
+            if(its == max_iterations - 1) 
             {
                 throw std::runtime_error("no convergence in 30 svdcmp iterations");
             }
             x = w(l, l);
-            nm = 1;
             y = w(1, 1);
             g = rv1.y;
             h = rv1.z;
@@ -1087,8 +1095,8 @@ void svd2(
                 }
             }
             rv1[l] = 0.f;
-            rv1[k] = f;
-            w(k, k) = x;
+            rv1.z = f;
+            w(2, 2) = x;
         }
     }
 

From 4a23f3523042253ef78dc9bf79c2f4ba468cba40 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 22:41:03 +0200
Subject: [PATCH 37/44] more optimization

---
 src/rmagine_core/src/math/SVD2.cpp | 233 +++++++++--------------------
 1 file changed, 73 insertions(+), 160 deletions(-)

diff --git a/src/rmagine_core/src/math/SVD2.cpp b/src/rmagine_core/src/math/SVD2.cpp
index 3502793..ddf9807 100644
--- a/src/rmagine_core/src/math/SVD2.cpp
+++ b/src/rmagine_core/src/math/SVD2.cpp
@@ -980,22 +980,31 @@ void svd2(
     {
         for(its=0; its<max_iterations; its++) 
         {
-            flag=true;
+            // flag=true;
+            // l = 2;
+            // if(MIN(fabs(rv1.z), fabs(w(1,1))) > eps*anorm)
+            // {
+            //     l = 1;
+            //     if(MIN(fabs(rv1.y),abs(w(0,0))) > eps*anorm)
+            //     {
+            //         l = 0;
+            //     }
+            // }
 
+            flag=true;
             l=2;
             if(abs(rv1.z) <= eps*anorm)
             {
-                l=2;
                 flag=false;
             }
-            else if(abs(w(1, 1)) > eps*anorm)
+            else if(abs(w(1,1)) > eps*anorm)
             {
                 l=1;
                 if(abs(rv1.y) <= eps*anorm) 
                 {
                     flag=false;
                 }
-                else if(abs(w(0, 0)) > eps*anorm) 
+                else if(abs(w(0,0)) > eps*anorm) 
                 {
                     l=0;
                     flag = false;
@@ -1105,139 +1114,22 @@ void svd2(
         for(its=0; its<max_iterations; its++) 
         {
             flag=true;
-            for(l=k; l>=0; l--) 
-            {
-                nm=l-1;
-                if (l == 0 || abs(rv1[l]) <= eps*anorm) {
-                    flag=false;
-                    break;
-                }
-                if (abs(w(nm, nm)) <= eps*anorm) 
-                {
-                    break;
-                }
-            }
-            if(flag) 
-            {
-                c=0.0;
-                s=1.0;
-                for(i=l; i<k+1; i++) 
-                {
-                    f = s*rv1[i];
-                    rv1[i] = c*rv1[i];
-                    if(abs(f) <= eps*anorm) 
-                    {
-                        break;
-                    }
-                    g = w(i, i);
-                    h = PYTHAG(f,g);
-                    w(i, i) = h;
-                    h = 1.0/h;
-                    c = g*h;
-                    s = -f*h;
-                    for(j=0; j<m; j++)
-                    {
-                        y = u(j,nm);
-                        z = u(j,i);
-                        u(j,nm) = y*c+z*s;
-                        u(j,i) = z*c-y*s;
-                    }
-                }
-            }
-            z = w(k, k);
-            if (l == k)
+            l=1;
+            if(abs(rv1.y) <= eps*anorm) 
             {
-                if (z < 0.0)
-                {
-                    w(k, k) = -z;
-                    for (j=0;j<3;j++) 
-                    {
-                        v(j,k) = -v(j,k);
-                    }
-                }
-                break;
-            }
-            if (its == max_iterations - 1) 
-            {
-                throw std::runtime_error("no convergence in 30 svdcmp iterations");
+                flag=false;
             }
-            x = w(l, l);
-            nm = k-1;
-            y = w(nm, nm);
-            g = rv1[nm];
-            h = rv1[k];
-            f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
-            g = PYTHAG(f, 1.f);
-            f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
-            c = s = 1.f;
-            for (j=l;j<=nm;j++) 
+            else if(abs(w(0,0)) > eps*anorm) 
             {
-                i = j+1;
-                g = rv1[i];
-                y = w(i, i);
-                h = s*g;
-                g = c*g;
-                z = PYTHAG(f,h);
-                rv1[j] = z;
-                c = f/z;
-                s = h/z;
-                f = x*c+g*s;
-                g = g*c-x*s;
-                h = y*s;
-                y *= c;
-                for (jj=0;jj<n;jj++)
-                {
-                    x = v(jj,j);
-                    z = v(jj,i);
-                    v(jj,j) = x*c+z*s;
-                    v(jj,i) = z*c-x*s;
-                }
-                z = PYTHAG(f,h);
-                w(j, j) = z;
-                if (z) 
-                {
-                    z = 1.f/z;
-                    c = f*z;
-                    s = h*z;
-                }
-                f = c*g+s*y;
-                x = c*y-s*g;
-                for (jj=0;jj<m;jj++)
-                {
-                    y = u(jj,j);
-                    z = u(jj,i);
-                    u(jj,j) = y*c+z*s;
-                    u(jj,i) = z*c-y*s;
-                }
+                l=0;
+                flag = false;
             }
-            rv1[l] = 0.f;
-            rv1[k] = f;
-            w(k, k) = x;
-        }
-    }
 
-    k = 0;
-    {
-        for(its=0; its<max_iterations; its++) 
-        {
-            flag=true;
-            for(l=k; l>=0; l--) 
-            {
-                nm=l-1;
-                if (l == 0 || abs(rv1[l]) <= eps*anorm) {
-                    flag=false;
-                    break;
-                }
-                if (abs(w(nm, nm)) <= eps*anorm) 
-                {
-                    break;
-                }
-            }
-            if(flag) 
+            if(flag)
             {
                 c=0.0;
                 s=1.0;
-                for(i=l; i<k+1; i++) 
+                for(i=l; i<2; i++) 
                 {
                     f = s*rv1[i];
                     rv1[i] = c*rv1[i];
@@ -1253,64 +1145,75 @@ void svd2(
                     s = -f*h;
                     for(j=0; j<m; j++)
                     {
-                        y = u(j,nm);
+                        y = u(j,l-1);
                         z = u(j,i);
-                        u(j,nm) = y*c+z*s;
+                        u(j,l-1) = y*c+z*s;
                         u(j,i) = z*c-y*s;
                     }
                 }
             }
-            z = w(k, k);
-            if (l == k)
+            z = w(1,1);
+            if(l == 1)
             {
-                if (z < 0.0)
+                if(z < 0.0)
                 {
                     w(k, k) = -z;
-                    for (j=0;j<3;j++) 
+                    for (j=0; j<3; j++) 
                     {
                         v(j,k) = -v(j,k);
                     }
                 }
                 break;
             }
-            if (its == max_iterations - 1) 
+            if(its == max_iterations - 1) 
             {
                 throw std::runtime_error("no convergence in 30 svdcmp iterations");
             }
+
             x = w(l, l);
-            nm = k-1;
-            y = w(nm, nm);
-            g = rv1[nm];
+            y = w(0, 0);
+            g = rv1.x;
             h = rv1[k];
             f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
             g = PYTHAG(f, 1.f);
             f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
             c = s = 1.f;
-            for (j=l;j<=nm;j++) 
+
+            if(l == 0)
             {
-                i = j+1;
-                g = rv1[i];
-                y = w(i, i);
+                g = rv1.y;
+                y = w(1,1);
                 h = s*g;
                 g = c*g;
                 z = PYTHAG(f,h);
-                rv1[j] = z;
+                rv1.x = z;
                 c = f/z;
                 s = h/z;
                 f = x*c+g*s;
                 g = g*c-x*s;
                 h = y*s;
                 y *= c;
-                for (jj=0;jj<n;jj++)
-                {
-                    x = v(jj,j);
-                    z = v(jj,i);
-                    v(jj,j) = x*c+z*s;
-                    v(jj,i) = z*c-x*s;
-                }
+
+
+                x = v(0,0);
+                z = v(0,1);
+                v(0,0) = x*c+z*s;
+                v(0,1) = z*c-x*s;
+
+                x = v(1,0);
+                z = v(1,1);
+                v(1,0) = x*c+z*s;
+                v(1,1) = z*c-x*s;
+
+                x = v(2,0);
+                z = v(2,1);
+                v(2,0) = x*c+z*s;
+                v(2,1) = z*c-x*s;
+
+
                 z = PYTHAG(f,h);
-                w(j, j) = z;
-                if (z) 
+                w(0,0) = z;
+                if(z>0.f)
                 {
                     z = 1.f/z;
                     c = f*z;
@@ -1318,19 +1221,29 @@ void svd2(
                 }
                 f = c*g+s*y;
                 x = c*y-s*g;
-                for (jj=0;jj<m;jj++)
+                for(jj=0; jj<3; jj++)
                 {
-                    y = u(jj,j);
-                    z = u(jj,i);
-                    u(jj,j) = y*c+z*s;
-                    u(jj,i) = z*c-y*s;
+                    y = u(jj,0);
+                    z = u(jj,1);
+                    u(jj,0) = y*c+z*s;
+                    u(jj,1) = z*c-y*s;
                 }
             }
+            
             rv1[l] = 0.f;
-            rv1[k] = f;
-            w(k, k) = x;
+            rv1.y = f;
+            w(1,1) = x;
         }
     }
+
+    z = w(0, 0);
+    if (z < 0.0)
+    {
+        w(0,0) = -z;
+        v(0,0) = -v(0,0);
+        v(1,0) = -v(1,0);
+        v(2,0) = -v(2,0);
+    }
 }
 
 

From 1f407392e277413226e2c75cf004fde982f1ef3d Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 22:56:06 +0200
Subject: [PATCH 38/44] first stable state

---
 src/rmagine_core/src/math/SVD2.cpp | 440 ++++++++++++++---------------
 1 file changed, 219 insertions(+), 221 deletions(-)

diff --git a/src/rmagine_core/src/math/SVD2.cpp b/src/rmagine_core/src/math/SVD2.cpp
index ddf9807..d010ace 100644
--- a/src/rmagine_core/src/math/SVD2.cpp
+++ b/src/rmagine_core/src/math/SVD2.cpp
@@ -717,7 +717,7 @@ void svd2(
     // TODO: test
     constexpr unsigned int m = 3;
     constexpr unsigned int n = 3;
-    constexpr unsigned int max_iterations = 30;
+    constexpr unsigned int max_iterations = 20;
     
     // additional memory required
     bool flag;
@@ -906,7 +906,6 @@ void svd2(
     g = rv1.x;
 
 
-
     // THIRD PART
     
     // i = 2;
@@ -976,267 +975,266 @@ void svd2(
 
     // PART 4: Opti
 
-    k = 2;
+    // k = 2;
+    for(its=0; its<max_iterations; its++) 
     {
-        for(its=0; its<max_iterations; its++) 
+        // flag=true;
+        // l = 2;
+        // if(MIN(fabs(rv1.z), fabs(w(1,1))) > eps*anorm)
+        // {
+        //     l = 1;
+        //     if(MIN(fabs(rv1.y),abs(w(0,0))) > eps*anorm)
+        //     {
+        //         l = 0;
+        //     }
+        // }
+
+        flag=true;
+        l=2;
+        if(abs(rv1.z) <= eps*anorm)
         {
-            // flag=true;
-            // l = 2;
-            // if(MIN(fabs(rv1.z), fabs(w(1,1))) > eps*anorm)
-            // {
-            //     l = 1;
-            //     if(MIN(fabs(rv1.y),abs(w(0,0))) > eps*anorm)
-            //     {
-            //         l = 0;
-            //     }
-            // }
-
-            flag=true;
-            l=2;
-            if(abs(rv1.z) <= eps*anorm)
+            flag=false;
+        }
+        else if(abs(w(1,1)) > eps*anorm)
+        {
+            l=1;
+            if(abs(rv1.y) <= eps*anorm) 
             {
                 flag=false;
             }
-            else if(abs(w(1,1)) > eps*anorm)
+            else if(abs(w(0,0)) > eps*anorm) 
             {
-                l=1;
-                if(abs(rv1.y) <= eps*anorm) 
+                l=0;
+                flag = false;
+            }
+        }
+        
+        if(flag)
+        {
+            c=0.0;
+            s=1.0;
+            for(i=l; i<3; i++) 
+            {
+                f = s*rv1[i];
+                rv1[i] = c*rv1[i];
+                if(abs(f) <= eps*anorm) 
                 {
-                    flag=false;
+                    break;
                 }
-                else if(abs(w(0,0)) > eps*anorm) 
+                g = w(i, i);
+                h = PYTHAG(f,g);
+                w(i, i) = h;
+                h = 1.0/h;
+                c = g*h;
+                s = -f*h;
+                for(j=0; j<m; j++)
                 {
-                    l=0;
-                    flag = false;
+                    y = u(j,l-1);
+                    z = u(j,i);
+                    u(j,l-1) = y*c+z*s;
+                    u(j,i) = z*c-y*s;
                 }
             }
-            
-            if(flag)
+        }
+        z = w(2,2);
+        if(l == 2)
+        {
+            if(z < 0.0)
             {
-                c=0.0;
-                s=1.0;
-                for(i=l; i<3; i++) 
+                w(2, 2) = -z;
+                for (j=0; j<3; j++) 
                 {
-                    f = s*rv1[i];
-                    rv1[i] = c*rv1[i];
-                    if(abs(f) <= eps*anorm) 
-                    {
-                        break;
-                    }
-                    g = w(i, i);
-                    h = PYTHAG(f,g);
-                    w(i, i) = h;
-                    h = 1.0/h;
-                    c = g*h;
-                    s = -f*h;
-                    for(j=0; j<m; j++)
-                    {
-                        y = u(j,l-1);
-                        z = u(j,i);
-                        u(j,l-1) = y*c+z*s;
-                        u(j,i) = z*c-y*s;
-                    }
+                    v(j,2) = -v(j,2);
                 }
             }
-            z = w(2, 2);
-            if(l == 2)
+            break;
+        }
+        if(its == max_iterations - 1) 
+        {
+            // std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+            throw std::runtime_error("no convergence in max svdcmp iterations");
+        }
+        x = w(l,l);
+        y = w(1,1);
+        g = rv1.y;
+        h = rv1.z;
+        f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+        g = PYTHAG(f, 1.f);
+        f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+        c = s = 1.f;
+        for (j=l; j<2; j++) 
+        {
+            i = j+1;
+            g = rv1[i];
+            y = w(i, i);
+            h = s*g;
+            g = c*g;
+            z = PYTHAG(f,h);
+            rv1[j] = z;
+            c = f/z;
+            s = h/z;
+            f = x*c+g*s;
+            g = g*c-x*s;
+            h = y*s;
+            y *= c;
+            for(jj=0;jj<3;jj++)
             {
-                if(z < 0.0)
-                {
-                    w(2, 2) = -z;
-                    for (j=0; j<3; j++) 
-                    {
-                        v(j,2) = -v(j,2);
-                    }
-                }
-                break;
+                x = v(jj,j);
+                z = v(jj,i);
+                v(jj,j) = x*c+z*s;
+                v(jj,i) = z*c-x*s;
             }
-            if(its == max_iterations - 1) 
+            z = PYTHAG(f,h);
+            w(j,j) = z;
+            if(z>0.0) 
             {
-                throw std::runtime_error("no convergence in 30 svdcmp iterations");
+                z = 1.f/z;
+                c = f*z;
+                s = h*z;
             }
-            x = w(l, l);
-            y = w(1, 1);
-            g = rv1.y;
-            h = rv1.z;
-            f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
-            g = PYTHAG(f, 1.f);
-            f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
-            c = s = 1.f;
-            for (j=l; j<2; j++) 
+            f = c*g+s*y;
+            x = c*y-s*g;
+            for (jj=0;jj<m;jj++)
             {
-                i = j+1;
-                g = rv1[i];
-                y = w(i, i);
-                h = s*g;
-                g = c*g;
-                z = PYTHAG(f,h);
-                rv1[j] = z;
-                c = f/z;
-                s = h/z;
-                f = x*c+g*s;
-                g = g*c-x*s;
-                h = y*s;
-                y *= c;
-                for (jj=0;jj<n;jj++)
-                {
-                    x = v(jj,j);
-                    z = v(jj,i);
-                    v(jj,j) = x*c+z*s;
-                    v(jj,i) = z*c-x*s;
-                }
-                z = PYTHAG(f,h);
-                w(j, j) = z;
-                if (z) 
-                {
-                    z = 1.f/z;
-                    c = f*z;
-                    s = h*z;
-                }
-                f = c*g+s*y;
-                x = c*y-s*g;
-                for (jj=0;jj<m;jj++)
-                {
-                    y = u(jj,j);
-                    z = u(jj,i);
-                    u(jj,j) = y*c+z*s;
-                    u(jj,i) = z*c-y*s;
-                }
+                y = u(jj,j);
+                z = u(jj,i);
+                u(jj,j) = y*c+z*s;
+                u(jj,i) = z*c-y*s;
             }
-            rv1[l] = 0.f;
-            rv1.z = f;
-            w(2, 2) = x;
         }
+        rv1[l] = 0.f;
+        rv1.z = f;
+        w(2,2) = x;
     }
 
-    k = 1;
+
+    for(its=0; its<max_iterations; its++) 
     {
-        for(its=0; its<max_iterations; its++) 
+        flag=true;
+        l=1;
+        if(abs(rv1.y) <= eps*anorm) 
         {
-            flag=true;
-            l=1;
-            if(abs(rv1.y) <= eps*anorm) 
-            {
-                flag=false;
-            }
-            else if(abs(w(0,0)) > eps*anorm) 
-            {
-                l=0;
-                flag = false;
-            }
+            flag=false;
+        }
+        else if(abs(w(0,0)) > eps*anorm) 
+        {
+            l=0;
+            flag = false;
+        }
 
-            if(flag)
+        if(flag)
+        {
+            c=0.0;
+            s=1.0;
+            for(i=l; i<2; i++) 
             {
-                c=0.0;
-                s=1.0;
-                for(i=l; i<2; i++) 
+                f = s*rv1[i];
+                rv1[i] = c*rv1[i];
+                if(abs(f) <= eps*anorm) 
                 {
-                    f = s*rv1[i];
-                    rv1[i] = c*rv1[i];
-                    if(abs(f) <= eps*anorm) 
-                    {
-                        break;
-                    }
-                    g = w(i, i);
-                    h = PYTHAG(f,g);
-                    w(i, i) = h;
-                    h = 1.0/h;
-                    c = g*h;
-                    s = -f*h;
-                    for(j=0; j<m; j++)
-                    {
-                        y = u(j,l-1);
-                        z = u(j,i);
-                        u(j,l-1) = y*c+z*s;
-                        u(j,i) = z*c-y*s;
-                    }
+                    break;
                 }
-            }
-            z = w(1,1);
-            if(l == 1)
-            {
-                if(z < 0.0)
+                g = w(i,i);
+                h = PYTHAG(f,g);
+                w(i, i) = h;
+                h = 1.0/h;
+                c = g*h;
+                s = -f*h;
+                for(j=0; j<m; j++)
                 {
-                    w(k, k) = -z;
-                    for (j=0; j<3; j++) 
-                    {
-                        v(j,k) = -v(j,k);
-                    }
+                    y = u(j,l-1);
+                    z = u(j,i);
+                    u(j,l-1) = y*c+z*s;
+                    u(j,i) = z*c-y*s;
                 }
-                break;
             }
-            if(its == max_iterations - 1) 
+        }
+        z = w(1,1);
+        if(l == 1)
+        {
+            if(z < 0.0)
             {
-                throw std::runtime_error("no convergence in 30 svdcmp iterations");
+                w(1,1) = -z;
+                for (j=0; j<3; j++) 
+                {
+                    v(j,1) = -v(j,1);
+                }
             }
+            break;
+        }
+        if(its == max_iterations - 1) 
+        {
+            std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+            throw std::runtime_error("no convergence in max svdcmp iterations");
+        }
 
-            x = w(l, l);
-            y = w(0, 0);
-            g = rv1.x;
-            h = rv1[k];
-            f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
-            g = PYTHAG(f, 1.f);
-            f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
-            c = s = 1.f;
+        x = w(l, l);
+        y = w(0, 0);
+        g = rv1.x;
+        h = rv1.y;
+        f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+        g = PYTHAG(f, 1.f);
+        f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+        c = s = 1.f;
 
-            if(l == 0)
+        if(l == 0)
+        {
+            g = rv1.y;
+            y = w(1,1);
+            h = s*g;
+            g = c*g;
+            z = PYTHAG(f,h);
+            rv1.x = z;
+            c = f/z;
+            s = h/z;
+            f = x*c+g*s;
+            g = g*c-x*s;
+            h = y*s;
+            y *= c;
+
+
+            x = v(0,0);
+            z = v(0,1);
+            v(0,0) = x*c+z*s;
+            v(0,1) = z*c-x*s;
+
+            x = v(1,0);
+            z = v(1,1);
+            v(1,0) = x*c+z*s;
+            v(1,1) = z*c-x*s;
+
+            x = v(2,0);
+            z = v(2,1);
+            v(2,0) = x*c+z*s;
+            v(2,1) = z*c-x*s;
+
+
+            z = PYTHAG(f,h);
+            w(0,0) = z;
+            if(z>0.f)
             {
-                g = rv1.y;
-                y = w(1,1);
-                h = s*g;
-                g = c*g;
-                z = PYTHAG(f,h);
-                rv1.x = z;
-                c = f/z;
-                s = h/z;
-                f = x*c+g*s;
-                g = g*c-x*s;
-                h = y*s;
-                y *= c;
-
-
-                x = v(0,0);
-                z = v(0,1);
-                v(0,0) = x*c+z*s;
-                v(0,1) = z*c-x*s;
-
-                x = v(1,0);
-                z = v(1,1);
-                v(1,0) = x*c+z*s;
-                v(1,1) = z*c-x*s;
-
-                x = v(2,0);
-                z = v(2,1);
-                v(2,0) = x*c+z*s;
-                v(2,1) = z*c-x*s;
-
-
-                z = PYTHAG(f,h);
-                w(0,0) = z;
-                if(z>0.f)
-                {
-                    z = 1.f/z;
-                    c = f*z;
-                    s = h*z;
-                }
-                f = c*g+s*y;
-                x = c*y-s*g;
-                for(jj=0; jj<3; jj++)
-                {
-                    y = u(jj,0);
-                    z = u(jj,1);
-                    u(jj,0) = y*c+z*s;
-                    u(jj,1) = z*c-y*s;
-                }
+                z = 1.f/z;
+                c = f*z;
+                s = h*z;
+            }
+            f = c*g+s*y;
+            x = c*y-s*g;
+            for(jj=0; jj<3; jj++)
+            {
+                y = u(jj,0);
+                z = u(jj,1);
+                u(jj,0) = y*c+z*s;
+                u(jj,1) = z*c-y*s;
             }
-            
-            rv1[l] = 0.f;
-            rv1.y = f;
-            w(1,1) = x;
         }
+        
+        rv1[l] = 0.f;
+        rv1.y = f;
+        w(1,1) = x;
     }
 
-    z = w(0, 0);
+
+    z = w(0,0);
     if (z < 0.0)
     {
         w(0,0) = -z;

From cfa67a969b5715761ddb2d88c11db57e6b7b6d44 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 23:07:11 +0200
Subject: [PATCH 39/44] added vector version

---
 .../include/rmagine/math/SVD2.hpp             |   14 +-
 src/rmagine_core/src/math/SVD2.cpp            | 1110 ++++++++---------
 tests/core/math_svd.cpp                       |   50 +-
 3 files changed, 561 insertions(+), 613 deletions(-)

diff --git a/src/rmagine_core/include/rmagine/math/SVD2.hpp b/src/rmagine_core/include/rmagine/math/SVD2.hpp
index c963749..5c10692 100644
--- a/src/rmagine_core/include/rmagine/math/SVD2.hpp
+++ b/src/rmagine_core/include/rmagine/math/SVD2.hpp
@@ -166,21 +166,13 @@ void svd(
 );
 
 RMAGINE_FUNCTION
-void svd2(
-    const Matrix3x3& A,
+void svd(
+    const Matrix3x3& A, 
     Matrix3x3& U,
-    Matrix3x3& W,
+    Vector3& w,
     Matrix3x3& V
 );
 
-// RMAGINE_FUNCTION
-// void svd(
-//     const Matrix3x3& A, 
-//     Matrix3x3& U,
-//     Vector3& w,
-//     Matrix3x3& V
-// );
-
 
 } // namespace rmagine
 
diff --git a/src/rmagine_core/src/math/SVD2.cpp b/src/rmagine_core/src/math/SVD2.cpp
index d010ace..ffc2bd0 100644
--- a/src/rmagine_core/src/math/SVD2.cpp
+++ b/src/rmagine_core/src/math/SVD2.cpp
@@ -423,296 +423,6 @@ void svd(
     Matrix3x3& u,
     Matrix3x3& w,
     Matrix3x3& v)
-{
-    // TODO: test
-    constexpr unsigned int m = 3;
-    constexpr unsigned int n = 3;
-    constexpr unsigned int max_iterations = 30;
-    
-    // additional memory required
-    bool flag;
-    int i, its, j, jj, k, l, nm;
-    float anorm, c, f, g, h, s, scale, x, y, z;
-    
-    Vector3 rv1;
-    
-    g = scale = anorm = 0.0;
-    float eps = std::numeric_limits<float>::epsilon();
-    u = a;
-
-    for(i=0; i < n; i++) 
-    {
-        l = i+2;
-        rv1[i] = scale*g;
-        g = s = scale = 0.0;
-        if(i < m) 
-        {
-            for(k=i; k<m; k++) 
-            {
-                scale += abs(u(k,i));
-            }
-            if(scale != 0.0) 
-            {
-                for(k=i; k<m; k++) 
-                {
-                    u(k,i) /= scale;
-                    s += u(k,i) * u(k,i);
-                }
-                f = u(i,i);
-                g = -SIGN(sqrt(s),f);
-                h = f*g-s;
-                u(i,i) = f-g;
-                for(j=l-1;j<n;j++) 
-                {
-                    for (s=0.0,k=i;k<m;k++) 
-                    {
-                        s += u(k,i) * u(k,j);
-                    }
-                    f = s/h;
-                    for (k=i;k<m;k++)
-                    {
-                        u(k,j) += f * u(k,i);
-                    }
-                }
-                for (k=i;k<m;k++)
-                {
-                    u(k,i) *= scale;
-                }
-            }
-        }
-        w(i, i) = scale * g;
-        g = s = scale = 0.0;
-        if(i+1 <= m && i+1 != n)
-        {
-            for(k=l-1; k<n; k++)
-            {
-                scale += abs(u(i,k));
-            }
-            if(scale != 0.0)
-            {
-                for(k=l-1;k<n;k++) 
-                {
-                    u(i, k) /= scale;
-                    s += u(i,k) * u(i,k);
-                }
-                f = u(i, l-1);
-                g = -SIGN(sqrt(s),f);
-                h = f*g-s;
-                u(i,l-1) = f-g;
-                for(k=l-1;k<n;k++)
-                {
-                    rv1[k] = u(i,k) / h;
-                }
-                for(j=l-1; j<m; j++)
-                {
-                    for (s=0.0,k=l-1; k<n; k++)
-                    {
-                        s += u(j,k) * u(i,k);
-                    }
-                    for (k=l-1; k<n;k++)
-                    {
-                        u(j,k) += s * rv1[k];
-                    }
-                }
-                for(k=l-1; k<n; k++)
-                {
-                    u(i,k) *= scale;
-                }
-            }
-        }
-        anorm = MAX(anorm, (abs(w(i, i))+abs(rv1[i])));
-    }
-    for(i=n-1; i>=0; i--)
-    {
-        if(i < n-1) 
-        {
-            if(g != 0.0)
-            {
-                for(j=l; j<n; j++)
-                {
-                    v(j,i) = (u(i,j)/u(i,l)) / g;
-                }
-                for(j=l; j<n; j++)
-                {
-                    for (s=0.0,k=l;k<n;k++) 
-                    {
-                        s += u(i,k) * v(k,j);
-                    }
-                    for (k=l; k<n; k++)
-                    {
-                        v(k,j) += s * v(k,i);
-                    }
-                }
-            }
-            for (j=l; j<n; j++) 
-            {
-                v(i,j) = 0.0;
-                v(j,i) = 0.0;
-            }
-        }
-        v(i,i) = 1.0;
-        g = rv1[i];
-        l = i;
-    }
-    for(i=MIN(m,n)-1; i>=0; i--)
-    {
-        l = i+1;
-        g = w(i, i);
-        for(j=l;j<n;j++) 
-        {
-            u(i,j) = 0.0;
-        }
-        if(g != 0.0)
-        {
-            g = 1.0/g;
-            for(j=l; j<n; j++)
-            {
-                for (s=0.0,k=l; k<m; k++) 
-                {
-                    s += u(k,i)*u(k,j);
-                }
-                f = (s/u(i,i)) * g;
-                for (k=i; k<m; k++) 
-                {
-                    u(k,j) += f * u(k,i);
-                }
-            }
-            for(j=i;j<m;j++) 
-            {
-                u(j,i) *= g;
-            }
-        } else {
-            for(j=i;j<m;j++) 
-            {
-                u(j,i) = 0.0;
-            } 
-        }
-        ++u(i,i);
-    }
-    for(k=n-1; k>=0; k--) 
-    {
-        for(its=0; its<max_iterations; its++) 
-        {
-            flag=true;
-            for(l=k; l>=0; l--) 
-            {
-                nm=l-1;
-                if (l == 0 || abs(rv1[l]) <= eps*anorm) {
-                    flag=false;
-                    break;
-                }
-                if (abs(w(nm, nm)) <= eps*anorm) 
-                {
-                    break;
-                }
-            }
-            if(flag) 
-            {
-                c=0.0;
-                s=1.0;
-                for(i=l; i<k+1; i++) 
-                {
-                    f = s*rv1[i];
-                    rv1[i] = c*rv1[i];
-                    if(abs(f) <= eps*anorm) 
-                    {
-                        break;
-                    }
-                    g = w(i, i);
-                    h = PYTHAG(f,g);
-                    w(i, i) = h;
-                    h = 1.0/h;
-                    c = g*h;
-                    s = -f*h;
-                    for(j=0; j<m; j++)
-                    {
-                        y = u(j,nm);
-                        z = u(j,i);
-                        u(j,nm) = y*c+z*s;
-                        u(j,i) = z*c-y*s;
-                    }
-                }
-            }
-            z = w(k, k);
-            if (l == k)
-            {
-                if (z < 0.0)
-                {
-                    w(k, k) = -z;
-                    for (j=0;j<n;j++) 
-                    {
-                        v(j,k) = -v(j,k);
-                    }
-                }
-                break;
-            }
-            if (its == max_iterations - 1) 
-            {
-                throw std::runtime_error("no convergence in 30 svdcmp iterations");
-            }
-            x = w(l, l);
-            nm = k-1;
-            y = w(nm, nm);
-            g = rv1[nm];
-            h = rv1[k];
-            f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
-            g = PYTHAG(f, 1.f);
-            f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
-            c = s = 1.f;
-            for (j=l;j<=nm;j++) 
-            {
-                i = j+1;
-                g = rv1[i];
-                y = w(i, i);
-                h = s*g;
-                g = c*g;
-                z = PYTHAG(f,h);
-                rv1[j] = z;
-                c = f/z;
-                s = h/z;
-                f = x*c+g*s;
-                g = g*c-x*s;
-                h = y*s;
-                y *= c;
-                for (jj=0;jj<n;jj++)
-                {
-                    x = v(jj,j);
-                    z = v(jj,i);
-                    v(jj,j) = x*c+z*s;
-                    v(jj,i) = z*c-x*s;
-                }
-                z = PYTHAG(f,h);
-                w(j, j) = z;
-                if (z) 
-                {
-                    z = 1.f/z;
-                    c = f*z;
-                    s = h*z;
-                }
-                f = c*g+s*y;
-                x = c*y-s*g;
-                for (jj=0;jj<m;jj++)
-                {
-                    y = u(jj,j);
-                    z = u(jj,i);
-                    u(jj,j) = y*c+z*s;
-                    u(jj,i) = z*c-y*s;
-                }
-            }
-            rv1[l] = 0.f;
-            rv1[k] = f;
-            w(k, k) = x;
-        }
-    }
-}
-
-
-RMAGINE_FUNCTION
-void svd2(
-    const Matrix3x3& a, 
-    Matrix3x3& u,
-    Matrix3x3& w,
-    Matrix3x3& v)
 {
     // TODO: test
     constexpr unsigned int m = 3;
@@ -1245,294 +955,540 @@ void svd2(
 }
 
 
-
-// RMAGINE_FUNCTION
-// void svd(
-//     const Matrix3x3& a, 
-//     Matrix3x3& u,
-//     Vector3& w, // matrix
-//     Matrix3x3& v)
-// {
-//     // TODO: test
-//     constexpr unsigned int m = 3;
-//     constexpr unsigned int n = 3;
-//     constexpr unsigned int max_iterations = 30;
-    
-//     // additional memory required
-//     bool flag;
-//     int i, its, j, jj, k, l, nm;
-//     float anorm, c, f, g, h, s, scale, x, y, z;
+RMAGINE_FUNCTION
+void svd(
+    const Matrix3x3& a, 
+    Matrix3x3& u,
+    Vector3& w,
+    Matrix3x3& v)
+{
+    // TODO: test
+    constexpr unsigned int m = 3;
+    constexpr unsigned int n = 3;
+    constexpr unsigned int max_iterations = 20;
     
-//     Vector3 rv1;
+    // additional memory required
+    bool flag;
+    int its, j, jj, k, nm;
+    float anorm, c, f, g, h, s, scale, x, y, z;
     
-//     g = scale = anorm = 0.0;
-//     float eps = std::numeric_limits<float>::epsilon();
-//     u = a;
-
-//     for(i=0; i < n; i++) 
-//     {
-//         l = i+2;
-//         rv1[i] = scale*g;
-//         g = s = scale = 0.0;
-//         if(i < m) 
-//         {
-//             for(k=i; k<m; k++) 
-//             {
-//                 scale += abs(u(k,i));
-//             }
-//             if(scale != 0.0) 
-//             {
-//                 for(k=i; k<m; k++) 
-//                 {
-//                     u(k,i) /= scale;
-//                     s += u(k,i) * u(k,i);
-//                 }
-//                 f = u(i,i);
-//                 g = -SIGN(sqrt(s),f);
-//                 h = f*g-s;
-//                 u(i,i) = f-g;
-//                 for(j=l-1;j<n;j++) 
-//                 {
-//                     for (s=0.0,k=i;k<m;k++) 
-//                     {
-//                         s += u(k,i) * u(k,j);
-//                     }
-//                     f = s/h;
-//                     for (k=i;k<m;k++)
-//                     {
-//                         u(k,j) += f * u(k,i);
-//                     }
-//                 }
-//                 for (k=i;k<m;k++)
-//                 {
-//                     u(k,i) *= scale;
-//                 }
-//             }
-//         }
-//         w(i) = scale * g;
-//         g = s = scale = 0.0;
-//         if(i+1 <= m && i+1 != n)
-//         {
-//             for(k=l-1; k<n; k++)
-//             {
-//                 scale += abs(u(i,k));
-//             }
-//             if(scale != 0.0)
-//             {
-//                 for(k=l-1;k<n;k++) 
-//                 {
-//                     u(i, k) /= scale;
-//                     s += u(i,k) * u(i,k);
-//                 }
-//                 f = u(i, l-1);
-//                 g = -SIGN(sqrt(s),f);
-//                 h = f*g-s;
-//                 u(i,l-1) = f-g;
-//                 for(k=l-1;k<n;k++)
-//                 {
-//                     rv1[k] = u(i,k) / h;
-//                 }
-//                 for(j=l-1; j<m; j++)
-//                 {
-//                     for (s=0.0,k=l-1; k<n; k++)
-//                     {
-//                         s += u(j,k) * u(i,k);
-//                     }
-//                     for (k=l-1; k<n;k++)
-//                     {
-//                         u(j,k) += s * rv1[k];
-//                     }
-//                 }
-//                 for(k=l-1; k<n; k++)
-//                 {
-//                     u(i,k) *= scale;
-//                 }
-//             }
-//         }
-//         anorm = MAX(anorm, (abs(w(i))+abs(rv1[i])));
-//     }
-//     for(i=n-1; i>=0; i--)
-//     {
-//         if(i < n-1) 
-//         {
-//             if(g != 0.0)
-//             {
-//                 for(j=l; j<n; j++)
-//                 {
-//                     v(j,i) = (u(i,j)/u(i,l)) / g;
-//                 }
-//                 for(j=l; j<n; j++)
-//                 {
-//                     for (s=0.0,k=l;k<n;k++) 
-//                     {
-//                         s += u(i,k) * v(k,j);
-//                     }
-//                     for (k=l; k<n; k++)
-//                     {
-//                         v(k,j) += s * v(k,i);
-//                     }
-//                 }
-//             }
-//             for (j=l; j<n; j++) 
-//             {
-//                 v(i,j) = 0.0;
-//                 v(j,i) = 0.0;
-//             }
-//         }
-//         v(i,i) = 1.0;
-//         g = rv1[i];
-//         l = i;
-//     }
-//     for(i=MIN(m,n)-1; i>=0; i--)
-//     {
-//         l = i+1;
-//         g = w(i);
-//         for(j=l;j<n;j++) 
-//         {
-//             u(i,j) = 0.0;
-//         }
-//         if(g != 0.0)
-//         {
-//             g = 1.0/g;
-//             for(j=l; j<n; j++)
-//             {
-//                 for (s=0.0,k=l; k<m; k++) 
-//                 {
-//                     s += u(k,i)*u(k,j);
-//                 }
-//                 f = (s/u(i,i)) * g;
-//                 for (k=i; k<m; k++) 
-//                 {
-//                     u(k,j) += f * u(k,i);
-//                 }
-//             }
-//             for(j=i;j<m;j++) 
-//             {
-//                 u(j,i) *= g;
-//             }
-//         } else {
-//             for(j=i;j<m;j++) 
-//             {
-//                 u(j,i) = 0.0;
-//             } 
-//         }
-//         ++u(i,i);
-//     }
-//     for(k=n-1; k>=0; k--) 
-//     {
-//         for(its=0; its<max_iterations; its++) 
-//         {
-//             flag=true;
-//             for(l=k; l>=0; l--) 
-//             {
-//                 nm=l-1;
-//                 if (l == 0 || abs(rv1[l]) <= eps*anorm) {
-//                     flag=false;
-//                     break;
-//                 }
-//                 if (abs(w(nm)) <= eps*anorm) 
-//                 {
-//                     break;
-//                 }
-//             }
-//             if(flag) 
-//             {
-//                 c=0.0;
-//                 s=1.0;
-//                 for(i=l; i<k+1; i++) 
-//                 {
-//                     f = s*rv1[i];
-//                     rv1[i] = c*rv1[i];
-//                     if(abs(f) <= eps*anorm) 
-//                     {
-//                         break;
-//                     }
-//                     g = w(i);
-//                     h = PYTHAG(f,g);
-//                     w(i) = h;
-//                     h = 1.0/h;
-//                     c = g*h;
-//                     s = -f*h;
-//                     for(j=0; j<m; j++)
-//                     {
-//                         y = u(j,nm);
-//                         z = u(j,i);
-//                         u(j,nm) = y*c+z*s;
-//                         u(j,i) = z*c-y*s;
-//                     }
-//                 }
-//             }
-//             z = w(k);
-//             if (l == k)
-//             {
-//                 if (z < 0.0)
-//                 {
-//                     w(k) = -z;
-//                     for (j=0;j<n;j++) 
-//                     {
-//                         v(j,k) = -v(j,k);
-//                     }
-//                 }
-//                 break;
-//             }
-//             if (its == max_iterations - 1) 
-//             {
-//                 throw std::runtime_error("no convergence in 30 svdcmp iterations");
-//             }
-//             x = w(l);
-//             nm = k-1;
-//             y = w(nm);
-//             g = rv1[nm];
-//             h = rv1[k];
-//             f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
-//             g = PYTHAG(f, 1.f);
-//             f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
-//             c = s = 1.f;
-//             for (j=l;j<=nm;j++) 
-//             {
-//                 i = j+1;
-//                 g = rv1[i];
-//                 y = w(i);
-//                 h = s*g;
-//                 g = c*g;
-//                 z = PYTHAG(f,h);
-//                 rv1[j] = z;
-//                 c = f/z;
-//                 s = h/z;
-//                 f = x*c+g*s;
-//                 g = g*c-x*s;
-//                 h = y*s;
-//                 y *= c;
-//                 for (jj=0;jj<n;jj++)
-//                 {
-//                     x = v(jj,j);
-//                     z = v(jj,i);
-//                     v(jj,j) = x*c+z*s;
-//                     v(jj,i) = z*c-x*s;
-//                 }
-//                 z = PYTHAG(f,h);
-//                 w(j) = z;
-//                 if (z) 
-//                 {
-//                     z = 1.f/z;
-//                     c = f*z;
-//                     s = h*z;
-//                 }
-//                 f = c*g+s*y;
-//                 x = c*y-s*g;
-//                 for (jj=0;jj<m;jj++)
-//                 {
-//                     y = u(jj,j);
-//                     z = u(jj,i);
-//                     u(jj,j) = y*c+z*s;
-//                     u(jj,i) = z*c-y*s;
-//                 }
-//             }
-//             rv1[l] = 0.f;
-//             rv1[k] = f;
-//             w(k) = x;
-//         }
-//     }
-// }
+    Vector3 rv1 = Vector3::Zeros();
+    
+    g = s = scale = anorm = 0.0;
+    float eps = std::numeric_limits<float>::epsilon();
+    u = a;
+
+    // FIRST PART
+
+    // i = 0;
+    // l = 2;
+    scale = fabs(u(0,0)) + fabs(u(1,0)) + fabs(u(2,0));
+    if(scale > 0.0)
+    {
+        u(0, 0) /= scale;
+        u(1, 0) /= scale;
+        u(2, 0) /= scale;
+
+        s = u(0,0) * u(0,0) + u(1,0) * u(1,0) + u(2,0) * u(2,0);
+        f = u(0,0);
+        g = -SIGN(sqrt(s), f);
+        h = f * g - s;
+
+        u(0, 0) = f - g;
+
+        f = (u(0, 0) * u(0, 1) + u(1, 0) * u(1, 1) + u(2, 0) * u(2, 1)) / h;
+        u(0, 1) += f * u(0, 0);
+        u(1, 1) += f * u(1, 0);
+        u(2, 1) += f * u(2, 0);
+
+        f = (u(0, 0) * u(0, 2) + u(1, 0) * u(1, 2) + u(2, 0) * u(2, 2)) / h;
+        u(0, 2) += f * u(0, 0);
+        u(1, 2) += f * u(1, 0);
+        u(2, 2) += f * u(2, 0);
+
+        u(0, 0) *= scale;
+        u(1, 0) *= scale;
+        u(2, 0) *= scale;
+    }
+    
+    w.x = scale * g;
+    g = s = scale = 0.0;
+    
+    scale = abs(u(0,0)) + abs(u(0,1)) + abs(u(0,2));
+    
+    if(scale > 0.0)
+    {
+        u(0, 1) /= scale;
+        u(0, 2) /= scale;
+        s = u(0,1) * u(0,1) + u(0,2) * u(0,2);
+
+        f = u(0, 1);
+        g = -SIGN(sqrt(s),f);
+        h = f * g-s;
+        u(0, 1) = f - g;
+
+        rv1.y = u(0, 1) / h;
+        rv1.z = u(0, 2) / h;
+
+        s = u(1,1) * u(0,1) + u(1,2) * u(0,2);
+        u(1, 1) += s * rv1.y;
+        u(1, 2) += s * rv1.z;
+    
+        s = u(2,1) * u(0,1) + u(2,2) * u(0,2);
+        u(2, 1) += s * rv1.y;
+        u(2, 2) += s * rv1.z;            
+
+        u(0, 1) *= scale;
+        u(0, 2) *= scale;
+    }
+    
+    anorm = fabs(w.x);
+    // anorm = MAX(anorm, (fabs(w(0, 0)) + fabs(rv1.x))); // rv1.x is always 0 here, anorm too. fabs(X) >= 0
+    
+    // i = 1;
+    // l = 3;
+    rv1.y = scale * g;
+    g = 0.0;
+    scale = fabs(u(1, 1)) + fabs(u(2, 1));
+    
+    if(scale > 0.0)
+    {
+        u(1,1) /= scale;
+        u(2,1) /= scale;
+
+        s = u(1,1) * u(1,1) + u(2,1) * u(2,1);
+        f = u(1,1);
+        g = -SIGN(sqrt(s),f);
+        h = f * g - s;
+        u(1,1) = f-g;
+        
+        f = (u(1,1) * u(1,2) + u(2,1) * u(2,2)) / h;
+        u(1,2) += f * u(1,1);
+        u(2,2) += f * u(2,1);
+        
+        u(1,1) *= scale;
+        u(2,1) *= scale;
+    }
+    
+    w.y = scale * g;
+    g = s = scale = 0.0;
+    
+    scale = abs(u(1,2));
+    if(scale > 0.0)
+    {
+        u(1,2) /= scale;
+        s = u(1,2) * u(1,2);
+        
+        f = u(1, 2);
+        g = -SIGN(sqrt(s), f);
+        h = f * g - s;
+        u(1,2) = f - g;
+
+        rv1.z = u(1,2) / h;
+        s = u(2,2) * u(1,2);
+
+        u(2,2) += s * rv1.z;
+        u(1,2) *= scale;
+    }
+
+    anorm = MAX(anorm, (abs(w.y) + abs(rv1.y)));
+    
+    rv1.z = scale * g;
+
+    scale = abs(u(2, 2));
+    if(scale > 0.0) 
+    {
+        u(2, 2) /= scale;
+        s = u(2, 2) * u(2, 2);
+        f = u(2, 2);
+        g = -SIGN(sqrt(s),f);
+        h = f * g - s;
+
+        u(2, 2) = f - g;
+        u(2, 2) *= scale;
+    }
+    
+    w.z = scale * g;
+    g = s = scale = 0.0;
+    
+    anorm = MAX(anorm, (abs(w.z)+abs(rv1.z)));
+
+    // SECOND PART    
+    v(2, 2) = 1.0;
+    g = rv1.z;
+
+    // i = 1;
+    // l = 2;
+    if(fabs(g) > 0.0)
+    {
+        v(2,1) = (u(1,2) / u(1,2)) / g;
+        s = u(1,2) * v(2,2);
+        v(2,2) += s * v(2,1);
+    }
+    v(1,2) = 0.0;
+    v(2,1) = 0.0;
+    v(1,1) = 1.0;
+
+    g = rv1.y;
+
+    // l = 1;
+    // i = 0;
+    if(fabs(g) > 0.0)
+    {
+        v(1,0) = (u(0,1) / u(0,1)) / g;
+        v(2,0) = (u(0,2) / u(0,1)) / g;
+
+        s = u(0,1) * v(1,1) + u(0,2) * v(2,1);
+        v(1,1) += s * v(1,0);
+        v(2,1) += s * v(2,0);
+
+        s = u(0,1) * v(1,2) + u(0,2) * v(2,2);
+        v(1,2) += s * v(1,0);
+        v(2,2) += s * v(2,0);
+    }
+    v(0,1) = 0.0;
+    v(1,0) = 0.0;
+    v(0,2) = 0.0;
+    v(2,0) = 0.0;
+    v(0,0) = 1.0;
+    g = rv1.x;
+
+
+    // THIRD PART
+    
+    // i = 2;
+    // l = 3;
+    g = w.z;
+    if(fabs(g) > 0.0)
+    {
+        u(2,2) /= g;
+    } else { 
+        // TODO(amock): shouldnt this be a large number?
+        u(2,2) = 0.0;   
+    }
+    u(2,2) += 1.0;
+    
+    // i = 1;
+    // l = 2;
+
+    g = w.y;
+    u(1,2) = 0.0;
+    
+    if(fabs(g) > 0.0)
+    {
+        g = 1.0/g;
+        s = u(2,1) * u(2,2);
+        f = (s/u(1,1)) * g;
+
+        u(1,2) += f * u(1,1);
+        u(2,2) += f * u(2,1);
+    
+        u(1,1) *= g;
+        u(2,1) *= g;
+    } else {
+        u(1,1) = 0.0;
+        u(2,1) = 0.0;
+    }
+    u(1,1) += 1.0;
+    
+    // i = 0;
+    // l = 1;
+    g = w.x;
+    u(0,1) = 0.0;
+    u(0,2) = 0.0;
+    
+    if(fabs(g) > 0.0)
+    {
+        f = (u(1,0) * u(1,1) + u(2,0) * u(2,1)) / (g * u(0,0));
+        u(0,1) += f * u(0,0);
+        u(1,1) += f * u(1,0);
+        u(2,1) += f * u(2,0);
+
+        f = (u(1,0) * u(1,2) + u(2,0) * u(2,2)) / (g * u(0,0));
+        u(0,2) += f * u(0,0);
+        u(1,2) += f * u(1,0);
+        u(2,2) += f * u(2,0);
+
+        u(0,0) /= g;
+        u(1,0) /= g;
+        u(2,0) /= g;
+    } else {
+        u(0,0) = 0.0;
+        u(1,0) = 0.0;
+        u(2,0) = 0.0;
+    }
+    u(0,0) += 1.0;
+
+    int i, l;
+
+    // PART 4: Opti
+
+    // k = 2;
+    for(its=0; its<max_iterations; its++) 
+    {
+        // flag=true;
+        // l = 2;
+        // if(MIN(fabs(rv1.z), fabs(w(1,1))) > eps*anorm)
+        // {
+        //     l = 1;
+        //     if(MIN(fabs(rv1.y),abs(w(0,0))) > eps*anorm)
+        //     {
+        //         l = 0;
+        //     }
+        // }
+
+        flag=true;
+        l=2;
+        if(abs(rv1.z) <= eps*anorm)
+        {
+            flag=false;
+        }
+        else if(abs(w.y) > eps*anorm)
+        {
+            l=1;
+            if(abs(rv1.y) <= eps*anorm) 
+            {
+                flag=false;
+            }
+            else if(abs(w.x) > eps*anorm) 
+            {
+                l=0;
+                flag = false;
+            }
+        }
+        
+        if(flag)
+        {
+            c=0.0;
+            s=1.0;
+            for(i=l; i<3; i++) 
+            {
+                f = s*rv1[i];
+                rv1[i] = c*rv1[i];
+                if(abs(f) <= eps*anorm) 
+                {
+                    break;
+                }
+                g = w(i);
+                h = PYTHAG(f,g);
+                w(i) = h;
+                h = 1.0/h;
+                c = g*h;
+                s = -f*h;
+                for(j=0; j<m; j++)
+                {
+                    y = u(j,l-1);
+                    z = u(j,i);
+                    u(j,l-1) = y*c+z*s;
+                    u(j,i) = z*c-y*s;
+                }
+            }
+        }
+        z = w.z;
+        if(l == 2)
+        {
+            if(z < 0.0)
+            {
+                w.z = -z;
+                v(0,2) = -v(0,2);
+                v(1,2) = -v(1,2);
+                v(2,2) = -v(2,2);
+            }
+            break;
+        }
+        if(its == max_iterations - 1) 
+        {
+            // std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+            throw std::runtime_error("no convergence in max svdcmp iterations");
+        }
+        x = w(l);
+        y = w.y;
+        g = rv1.y;
+        h = rv1.z;
+        f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+        g = PYTHAG(f, 1.f);
+        f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+        c = s = 1.f;
+        for (j=l; j<2; j++)
+        {
+            i = j+1;
+            g = rv1[i];
+            y = w(i);
+            h = s*g;
+            g = c*g;
+            z = PYTHAG(f,h);
+            rv1[j] = z;
+            c = f/z;
+            s = h/z;
+            f = x*c+g*s;
+            g = g*c-x*s;
+            h = y*s;
+            y *= c;
+            for(jj=0;jj<3;jj++)
+            {
+                x = v(jj,j);
+                z = v(jj,i);
+                v(jj,j) = x*c+z*s;
+                v(jj,i) = z*c-x*s;
+            }
+            z = PYTHAG(f,h);
+            w(j) = z;
+            if(z>0.0) 
+            {
+                z = 1.f/z;
+                c = f*z;
+                s = h*z;
+            }
+            f = c*g+s*y;
+            x = c*y-s*g;
+            for (jj=0;jj<m;jj++)
+            {
+                y = u(jj,j);
+                z = u(jj,i);
+                u(jj,j) = y*c+z*s;
+                u(jj,i) = z*c-y*s;
+            }
+        }
+        rv1[l] = 0.f;
+        rv1.z = f;
+        w.z = x;
+    }
+
+
+    for(its=0; its<max_iterations; its++) 
+    {
+        flag=true;
+        l=1;
+        if(abs(rv1.y) <= eps*anorm) 
+        {
+            flag=false;
+        }
+        else if(abs(w.x) > eps*anorm) 
+        {
+            l=0;
+            flag = false;
+        }
+
+        if(flag)
+        {
+            c=0.0;
+            s=1.0;
+            for(i=l; i<2; i++) 
+            {
+                f = s*rv1[i];
+                rv1[i] = c*rv1[i];
+                if(abs(f) <= eps*anorm) 
+                {
+                    break;
+                }
+                g = w(i);
+                h = PYTHAG(f,g);
+                w(i) = h;
+                h = 1.0/h;
+                c = g*h;
+                s = -f*h;
+                for(j=0; j<m; j++)
+                {
+                    y = u(j,l-1);
+                    z = u(j,i);
+                    u(j,l-1) = y*c+z*s;
+                    u(j,i) = z*c-y*s;
+                }
+            }
+        }
+        z = w.y;
+        if(l == 1)
+        {
+            if(z < 0.0)
+            {
+                w.y = -z;
+                for (j=0; j<3; j++) 
+                {
+                    v(j,1) = -v(j,1);
+                }
+            }
+            break;
+        }
+        if(its == max_iterations - 1) 
+        {
+            std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+            throw std::runtime_error("no convergence in max svdcmp iterations");
+        }
+
+        x = w(l);
+        y = w.x;
+        g = rv1.x;
+        h = rv1.y;
+        f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+        g = PYTHAG(f, 1.f);
+        f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+        c = s = 1.f;
+
+        if(l == 0)
+        {
+            g = rv1.y;
+            y = w.y;
+            h = s*g;
+            g = c*g;
+            z = PYTHAG(f,h);
+            rv1.x = z;
+            c = f/z;
+            s = h/z;
+            f = x*c+g*s;
+            g = g*c-x*s;
+            h = y*s;
+            y *= c;
+
+
+            x = v(0,0);
+            z = v(0,1);
+            v(0,0) = x*c+z*s;
+            v(0,1) = z*c-x*s;
+
+            x = v(1,0);
+            z = v(1,1);
+            v(1,0) = x*c+z*s;
+            v(1,1) = z*c-x*s;
+
+            x = v(2,0);
+            z = v(2,1);
+            v(2,0) = x*c+z*s;
+            v(2,1) = z*c-x*s;
+
+
+            z = PYTHAG(f,h);
+            w.x = z;
+            if(z>0.f)
+            {
+                z = 1.f/z;
+                c = f*z;
+                s = h*z;
+            }
+            f = c*g+s*y;
+            x = c*y-s*g;
+            for(jj=0; jj<3; jj++)
+            {
+                y = u(jj,0);
+                z = u(jj,1);
+                u(jj,0) = y*c+z*s;
+                u(jj,1) = z*c-y*s;
+            }
+        }
+        
+        rv1[l] = 0.f;
+        rv1.y = f;
+        w.y = x;
+    }
+
+
+    z = w.x;
+    if (z < 0.0)
+    {
+        w.x = -z;
+        v(0,0) = -v(0,0);
+        v(1,0) = -v(1,0);
+        v(2,0) = -v(2,0);
+    }
+}
 
 } // namespace rmagine
diff --git a/tests/core/math_svd.cpp b/tests/core/math_svd.cpp
index 8f22f40..e39c915 100644
--- a/tests/core/math_svd.cpp
+++ b/tests/core/math_svd.cpp
@@ -329,31 +329,31 @@ void parallelTest()
     std::cout << "- summed error: " << err_rmagine << std::endl;
 
 
-    rm::Memory<rm::Matrix3x3> res_rm2(num_objects);
-
-    sw();
-    // #pragma omp parallel for
-    for(size_t obj_id=0; obj_id<num_objects; obj_id++)
-    {
-        rm::Matrix3x3 Urm = rm::Matrix3x3::Zeros();
-        rm::Matrix3x3 Wrm = rm::Matrix3x3::Zeros();
-        rm::Matrix3x3 Vrm = rm::Matrix3x3::Zeros();
-        rm::svd2(covs_rm[obj_id], Urm, Wrm, Vrm);
-        auto uvt_rm = Urm * Wrm * Vrm.T();
-        res_rm2[obj_id] = uvt_rm;
-    }
-    el_rmagine2 = sw();
-
-    float err_rmagine2 = 0.0;
-    for(size_t obj_id = 0; obj_id < num_objects; obj_id++)
-    {
-        err_rmagine2 += compute_error(covs_rm[obj_id], res_rm2[obj_id]);
-    }
-
-
-    std::cout << "Rmagine Opti:" << std::endl;
-    std::cout << "- run time: " << el_rmagine2 << " s" << std::endl;
-    std::cout << "- summed error: " << err_rmagine2 << std::endl;
+    // rm::Memory<rm::Matrix3x3> res_rm2(num_objects);
+
+    // sw();
+    // // #pragma omp parallel for
+    // for(size_t obj_id=0; obj_id<num_objects; obj_id++)
+    // {
+    //     rm::Matrix3x3 Urm = rm::Matrix3x3::Zeros();
+    //     rm::Matrix3x3 Wrm = rm::Matrix3x3::Zeros();
+    //     rm::Matrix3x3 Vrm = rm::Matrix3x3::Zeros();
+    //     rm::svd2(covs_rm[obj_id], Urm, Wrm, Vrm);
+    //     auto uvt_rm = Urm * Wrm * Vrm.T();
+    //     res_rm2[obj_id] = uvt_rm;
+    // }
+    // el_rmagine2 = sw();
+
+    // float err_rmagine2 = 0.0;
+    // for(size_t obj_id = 0; obj_id < num_objects; obj_id++)
+    // {
+    //     err_rmagine2 += compute_error(covs_rm[obj_id], res_rm2[obj_id]);
+    // }
+
+
+    // std::cout << "Rmagine Opti:" << std::endl;
+    // std::cout << "- run time: " << el_rmagine2 << " s" << std::endl;
+    // std::cout << "- summed error: " << err_rmagine2 << std::endl;
 
 
 }

From a797adfc98379f8fa2aa43ffd13c038dfb17b1db Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Wed, 11 Sep 2024 23:17:47 +0200
Subject: [PATCH 40/44] started to implement cuda tests. but my laptop has no
 gpu

---
 .../include/rmagine/math/math.cuh             | 10 ++++
 src/rmagine_cuda/src/math/math.cu             | 10 ++++
 tests/core/math_svd.cpp                       | 29 ----------
 tests/cuda/CMakeLists.txt                     |  8 +++
 tests/cuda/math_svd.cpp                       | 53 +++++++++++++++++++
 5 files changed, 81 insertions(+), 29 deletions(-)
 create mode 100644 tests/cuda/math_svd.cpp

diff --git a/src/rmagine_cuda/include/rmagine/math/math.cuh b/src/rmagine_cuda/include/rmagine/math/math.cuh
index 7439487..c7987e0 100644
--- a/src/rmagine_cuda/include/rmagine/math/math.cuh
+++ b/src/rmagine_cuda/include/rmagine/math/math.cuh
@@ -543,6 +543,16 @@ Memory<Matrix3x3, VRAM_CUDA> cov(
     const MemoryView<Vector, VRAM_CUDA>& v2
 );
 
+/**
+ * @brief decompose A = UWV* using singular velue decomposition
+ */
+void svd(
+    const MemoryView<rm::Matrix3x3, VRAM_CUDA>& As,
+    const MemoryView<rm::Matrix3x3, VRAM_CUDA>& Us,
+    const MemoryView<rm::Matrix3x3, VRAM_CUDA>& Ws,
+    const MemoryView<rm::Matrix3x3, VRAM_CUDA>& Vs
+    );
+
 
 } // namespace rmagine
 
diff --git a/src/rmagine_cuda/src/math/math.cu b/src/rmagine_cuda/src/math/math.cu
index e06d3ba..f550290 100644
--- a/src/rmagine_cuda/src/math/math.cu
+++ b/src/rmagine_cuda/src/math/math.cu
@@ -1429,5 +1429,15 @@ Memory<Matrix3x3, VRAM_CUDA> cov(
     return C;
 }
 
+void svd(
+    const MemoryView<rm::Matrix3x3, VRAM_CUDA>& As,
+    const MemoryView<rm::Matrix3x3, VRAM_CUDA>& Us,
+    const MemoryView<rm::Matrix3x3, VRAM_CUDA>& Ws,
+    const MemoryView<rm::Matrix3x3, VRAM_CUDA>& Vs
+)
+{
+    std::cout << "CUDA SVD!" << std::endl;
+}
+
 
 } // namespace rmagine
\ No newline at end of file
diff --git a/tests/core/math_svd.cpp b/tests/core/math_svd.cpp
index e39c915..07da8e9 100644
--- a/tests/core/math_svd.cpp
+++ b/tests/core/math_svd.cpp
@@ -327,35 +327,6 @@ void parallelTest()
     std::cout << "Rmagine:" << std::endl;
     std::cout << "- run time: " << el_rmagine << " s" << std::endl;
     std::cout << "- summed error: " << err_rmagine << std::endl;
-
-
-    // rm::Memory<rm::Matrix3x3> res_rm2(num_objects);
-
-    // sw();
-    // // #pragma omp parallel for
-    // for(size_t obj_id=0; obj_id<num_objects; obj_id++)
-    // {
-    //     rm::Matrix3x3 Urm = rm::Matrix3x3::Zeros();
-    //     rm::Matrix3x3 Wrm = rm::Matrix3x3::Zeros();
-    //     rm::Matrix3x3 Vrm = rm::Matrix3x3::Zeros();
-    //     rm::svd2(covs_rm[obj_id], Urm, Wrm, Vrm);
-    //     auto uvt_rm = Urm * Wrm * Vrm.T();
-    //     res_rm2[obj_id] = uvt_rm;
-    // }
-    // el_rmagine2 = sw();
-
-    // float err_rmagine2 = 0.0;
-    // for(size_t obj_id = 0; obj_id < num_objects; obj_id++)
-    // {
-    //     err_rmagine2 += compute_error(covs_rm[obj_id], res_rm2[obj_id]);
-    // }
-
-
-    // std::cout << "Rmagine Opti:" << std::endl;
-    // std::cout << "- run time: " << el_rmagine2 << " s" << std::endl;
-    // std::cout << "- summed error: " << err_rmagine2 << std::endl;
-
-
 }
 
 int main(int argc, char** argv)
diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt
index f1bb3c4..2998f89 100644
--- a/tests/cuda/CMakeLists.txt
+++ b/tests/cuda/CMakeLists.txt
@@ -26,3 +26,11 @@ target_link_libraries(rmagine_tests_cuda_memory_slicing
 
 add_test(NAME cuda_memory_slicing COMMAND rmagine_tests_cuda_memory_slicing)
 
+
+# 4. SVD
+add_executable(rmagine_tests_cuda_math_svd math_svd.cpp)
+target_link_libraries(rmagine_tests_cuda_math_svd
+    rmagine::cuda
+)
+
+add_test(NAME cuda_math_svd COMMAND rmagine_tests_cuda_math_svd)
diff --git a/tests/cuda/math_svd.cpp b/tests/cuda/math_svd.cpp
new file mode 100644
index 0000000..873e400
--- /dev/null
+++ b/tests/cuda/math_svd.cpp
@@ -0,0 +1,53 @@
+#include <rmagine/math/SVD2.hpp>
+
+#include <rmagine/math/math_batched.cuh>
+
+#include <Eigen/Dense>
+
+namespace rm = rmagine;
+
+
+void parallelTest()
+{
+    std::cout << "parallelTest" << std::endl;
+    // correct num_objects objects in parallel
+    size_t num_objects = 1000000;
+
+    std::vector<Eigen::Matrix3f> covs_eigen(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> covs_rm(num_objects);
+
+    for(size_t obj_id=0; obj_id<num_objects; obj_id++)
+    {
+        rm::Matrix3x3 Arm;
+        Eigen::Matrix3f Aeig = Eigen::Matrix3f::Random(3, 3);
+        for(size_t i=0; i<3; i++)
+        {
+            for(size_t j=0; j<3; j++)
+            {
+                Arm(i, j) = Aeig(i, j);
+            }
+        }
+
+        covs_eigen[obj_id] = Aeig;
+        covs_rm[obj_id] = Arm;
+    }
+
+    std::cout << "First Mat: " << std::endl;
+    std::cout << covs_rm[0] << std::endl;
+
+
+    rm::Memory<rm::Matrix3x3, rm::VRAM_CUDA> covs_rm_ = covs_rm;
+
+
+
+
+}
+
+int main(int argc, char** argv)
+{
+    srand((unsigned int) time(0));
+
+    parallelTest();
+
+    return 0;
+}
\ No newline at end of file

From 0d533800903bee6aea4edc9f841c471aeea53995 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Thu, 12 Sep 2024 01:44:34 +0200
Subject: [PATCH 41/44] finished impl. we can start to test it now

---
 src/rmagine_core/CMakeLists.txt               |    1 -
 .../include/rmagine/math/SVD2.hpp             |  181 ---
 .../include/rmagine/math/linalg.h             |   67 +
 .../rmagine/math/{SVD2.tcc => linalg.tcc}     |    1 +
 src/rmagine_core/include/rmagine/math/math.h  |   85 +-
 .../include/rmagine/types/shared_functions.h  |    8 +
 src/rmagine_core/src/math/linalg.cpp          | 1081 +++++++++++++++++
 src/rmagine_core/src/math/math.cpp            |   36 +
 src/rmagine_cuda/CMakeLists.txt               |    4 +
 .../include/rmagine/math/math.cuh             |   24 +-
 .../include/rmagine/util/cuda/CudaDebug.hpp   |   12 +
 .../src/math/linalg.cu}                       |  501 +-------
 src/rmagine_cuda/src/math/math.cu             |  110 +-
 src/rmagine_ouster/CMakeLists.txt             |    2 +-
 tests/core/math_svd.cpp                       |   72 +-
 tests/cuda/math_svd.cpp                       |   81 +-
 16 files changed, 1605 insertions(+), 661 deletions(-)
 delete mode 100644 src/rmagine_core/include/rmagine/math/SVD2.hpp
 rename src/rmagine_core/include/rmagine/math/{SVD2.tcc => linalg.tcc} (99%)
 rename src/{rmagine_core/src/math/SVD2.cpp => rmagine_cuda/src/math/linalg.cu} (66%)

diff --git a/src/rmagine_core/CMakeLists.txt b/src/rmagine_core/CMakeLists.txt
index 20db4f3..9ab8b04 100644
--- a/src/rmagine_core/CMakeLists.txt
+++ b/src/rmagine_core/CMakeLists.txt
@@ -8,7 +8,6 @@ set(RMAGINE_CORE_SRCS
     src/math/math.cpp
     src/math/linalg.cpp
     src/math/SVD.cpp
-    src/math/SVD2.cpp
     # Types
     src/types/Memory.cpp
     src/types/conversions.cpp
diff --git a/src/rmagine_core/include/rmagine/math/SVD2.hpp b/src/rmagine_core/include/rmagine/math/SVD2.hpp
deleted file mode 100644
index 5c10692..0000000
--- a/src/rmagine_core/include/rmagine/math/SVD2.hpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2022, University Osnabrück
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the University Osnabrück nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL University Osnabrück BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/**
- * @file
- * 
- * @brief SVD solver for CPU Memory
- *
- * @date 03.10.2022
- * @author Alexander Mock
- * 
- * @copyright Copyright (c) 2022, University Osnabrück. All rights reserved.
- * This project is released under the 3-Clause BSD License.
- * 
- */
-
-#ifndef RMAGINE_MATH_SVD2_HPP
-#define RMAGINE_MATH_SVD2_HPP
-
-#include <rmagine/types/shared_functions.h>
-#include <rmagine/types/Memory.hpp>
-#include <rmagine/math/types.h>
-#include <memory>
-
-namespace rmagine {
-
-
-template<class T>
-inline T SQR(const T a) {return a*a;}
-
-template<class T>
-inline const T &MAX(const T &a, const T &b)
-    {return b > a ? (b) : (a);}
-
-inline float MAX(const double &a, const float &b)
-    {return b > a ? (b) : float(a);}
-
-inline float MAX(const float &a, const double &b)
-    {return b > a ? float(b) : (a);}
-
-template<class T>
-inline const T &MIN(const T &a, const T &b)
-    {return b < a ? (b) : (a);}
-
-inline float MIN(const double &a, const float &b)
-    {return b < a ? (b) : float(a);}
-
-inline float MIN(const float &a, const double &b)
-    {return b < a ? float(b) : (a);}
-
-template<class T>
-inline T SIGN(const T &a, const T &b)
-    {return b >= 0 ? (a >= 0 ? a : -a) : (a >= 0 ? -a : a);}
-
-inline float SIGN(const float &a, const double &b)
-    {return b >= 0 ? (a >= 0 ? a : -a) : (a >= 0 ? -a : a);}
-
-inline float SIGN(const double &a, const float &b)
-    {return (float)(b >= 0 ? (a >= 0 ? a : -a) : (a >= 0 ? -a : a));}
-
-template<class T>
-inline void SWAP(T &a, T &b)
-    {T dum=a; a=b; b=dum;}
-
-template<typename T>
-inline T PYTHAG(const T a, const T b)
-{
-    T absa = abs(a);
-    T absb = abs(b);
-    return (absa > absb ? absa * sqrt(1.0+SQR(absb/absa)) :
-        (absb == 0.0 ? 0.0 : absb * sqrt(1.0+SQR(absa/absb))));
-}
-
-class SVD2
-{
-public:
-    SVD2();
-    ~SVD2();
-
-    void decompose(const Matrix3x3& a);
-
-    Matrix3x3 u,v;
-	Vector3 w;
-private:
-    
-    void reorder();
-    float pythag(const float a, const float b);
-
-    // int m, n;
-	float eps, tsh;
-};
-
-using SVD2Ptr = std::shared_ptr<SVD2>;
-
-// Numerical Recipes
-// M = MatrixT::rows()
-// N = MatrixT::cols()
-// 
-// Warning: Numerical Recipes has different SVD matrix shapes
-// than Wikipedia
-template<typename MatrixT>
-struct svd_dims {
-    using U = MatrixT; // same as input
-    using w = Matrix_<typename MatrixT::Type, MatrixT::cols(), 1>;
-    using W = Matrix_<typename MatrixT::Type, MatrixT::cols(), MatrixT::cols()>;
-    using V = Matrix_<typename MatrixT::Type, MatrixT::cols(), MatrixT::cols()>;
-};
-
-/**
- * @brief own SVD implementation. 
- * Why use it? 
- * - ~2x faster than Eigen
- * - SOON: Works insided of CUDA kernels
- *
- */
-template<typename DataT, unsigned int Rows, unsigned int Cols>
-void svd(
-    const Matrix_<DataT, Rows, Cols>& A, 
-    Matrix_<DataT, Rows, Cols>& U,
-    Matrix_<DataT, Cols, Cols>& W, // matrix
-    Matrix_<DataT, Cols, Cols>& V
-);
-
-template<typename DataT, unsigned int Rows, unsigned int Cols>
-void svd(
-    const Matrix_<DataT, Rows, Cols>& A,
-    Matrix_<DataT, Rows, Cols>& U,
-    Matrix_<DataT, Cols, 1>& w, // vector version (Cols should be something with max)
-    Matrix_<DataT, Cols, Cols>& V
-);
-
-/**
- * @brief SVD that can be used for both CPU and GPU (Cuda kernels)
- *
- */
-RMAGINE_FUNCTION
-void svd(
-    const Matrix3x3& A,
-    Matrix3x3& U,
-    Matrix3x3& W,
-    Matrix3x3& V
-);
-
-RMAGINE_FUNCTION
-void svd(
-    const Matrix3x3& A, 
-    Matrix3x3& U,
-    Vector3& w,
-    Matrix3x3& V
-);
-
-
-} // namespace rmagine
-
-#include "SVD2.tcc"
-
-#endif // RMAGINE_MATH_SVD2_HPP
diff --git a/src/rmagine_core/include/rmagine/math/linalg.h b/src/rmagine_core/include/rmagine/math/linalg.h
index 5a6b60b..58bd0a3 100644
--- a/src/rmagine_core/include/rmagine/math/linalg.h
+++ b/src/rmagine_core/include/rmagine/math/linalg.h
@@ -41,6 +41,7 @@
 #define RMAGINE_MATH_LINALG_H
 
 #include "types.h"
+#include <rmagine/types/shared_functions.h>
 
 namespace rmagine
 {
@@ -53,8 +54,10 @@ namespace rmagine
  * @param scale   scale vector
  * @return Matrix4x4   composed 4x4 transformation matrix
  */
+RMAGINE_FUNCTION
 Matrix4x4 compose(const Transform& T, const Vector3& scale);
 
+RMAGINE_FUNCTION
 Matrix4x4 compose(const Transform& T, const Matrix3x3& S);
 
 /**
@@ -65,6 +68,7 @@ Matrix4x4 compose(const Transform& T, const Matrix3x3& S);
  * @param T transform object consisting of translational and rotational parts
  * @param S 3x3 scale matrix
  */
+RMAGINE_FUNCTION
 void decompose(const Matrix4x4& M, Transform& T, Matrix3x3& S);
 
 /**
@@ -75,6 +79,7 @@ void decompose(const Matrix4x4& M, Transform& T, Matrix3x3& S);
  * @param T transform object consisting of translational and rotational parts
  * @param scale scale vector
  */
+RMAGINE_FUNCTION
 void decompose(const Matrix4x4& M, Transform& T, Vector3& scale);
 
 /**
@@ -86,6 +91,7 @@ void decompose(const Matrix4x4& M, Transform& T, Vector3& scale);
  * - if fac=1.0 it is exactly B
  * - if fac=2.0 it it goes on a (B-A) length from B (extrapolation)
 */
+RMAGINE_FUNCTION
 Quaternion polate(const Quaternion& A, const Quaternion& B, float fac);
 
 /**
@@ -97,8 +103,69 @@ Quaternion polate(const Quaternion& A, const Quaternion& B, float fac);
  * - if fac=1.0 it is exactly B
  * - if fac=2.0 it it goes on a (B-A) length from B (extrapolation)
 */
+RMAGINE_FUNCTION
 Transform polate(const Transform& A, const Transform& B, float fac);
 
+
+// Numerical Recipes
+// M = MatrixT::rows()
+// N = MatrixT::cols()
+// 
+// Warning: Numerical Recipes has different SVD matrix shapes
+// than Wikipedia
+template<typename MatrixT>
+struct svd_dims {
+    using U = MatrixT; // same as input
+    using w = Matrix_<typename MatrixT::Type, MatrixT::cols(), 1>;
+    using W = Matrix_<typename MatrixT::Type, MatrixT::cols(), MatrixT::cols()>;
+    using V = Matrix_<typename MatrixT::Type, MatrixT::cols(), MatrixT::cols()>;
+};
+
+/**
+ * @brief own SVD implementation. 
+ * Why use it? 
+ * - ~2x faster than Eigen
+ * - SOON: Works insided of CUDA kernels
+ *
+ */
+template<typename DataT, unsigned int Rows, unsigned int Cols>
+void svd(
+    const Matrix_<DataT, Rows, Cols>& A, 
+    Matrix_<DataT, Rows, Cols>& U,
+    Matrix_<DataT, Cols, Cols>& W, // matrix
+    Matrix_<DataT, Cols, Cols>& V
+);
+
+template<typename DataT, unsigned int Rows, unsigned int Cols>
+void svd(
+    const Matrix_<DataT, Rows, Cols>& A,
+    Matrix_<DataT, Rows, Cols>& U,
+    Matrix_<DataT, Cols, 1>& w, // vector version (Cols should be something with max)
+    Matrix_<DataT, Cols, Cols>& V
+);
+
+/**
+ * @brief SVD that can be used for both CPU and GPU (Cuda kernels)
+ *
+ */
+RMAGINE_FUNCTION
+void svd(
+    const Matrix3x3& A,
+    Matrix3x3& U,
+    Matrix3x3& W,
+    Matrix3x3& V
+);
+
+RMAGINE_FUNCTION
+void svd(
+    const Matrix3x3& A, 
+    Matrix3x3& U,
+    Vector3& w,
+    Matrix3x3& V
+);
+
 } // namespace rmagine
 
+#include "linalg.tcc"
+
 #endif // RMAGINE_MATH_MATH_LINALG_H
\ No newline at end of file
diff --git a/src/rmagine_core/include/rmagine/math/SVD2.tcc b/src/rmagine_core/include/rmagine/math/linalg.tcc
similarity index 99%
rename from src/rmagine_core/include/rmagine/math/SVD2.tcc
rename to src/rmagine_core/include/rmagine/math/linalg.tcc
index b515e84..ef7a376 100644
--- a/src/rmagine_core/include/rmagine/math/SVD2.tcc
+++ b/src/rmagine_core/include/rmagine/math/linalg.tcc
@@ -1,4 +1,5 @@
 #include <rmagine/math/types.h>
+#include <rmagine/math/math.h>
 
 namespace rmagine
 {
diff --git a/src/rmagine_core/include/rmagine/math/math.h b/src/rmagine_core/include/rmagine/math/math.h
index 39c341f..eb42079 100644
--- a/src/rmagine_core/include/rmagine/math/math.h
+++ b/src/rmagine_core/include/rmagine/math/math.h
@@ -49,6 +49,67 @@
 namespace rmagine
 {
 
+///////
+// TODO: Integrate the following functions better / somewhere else
+//////
+template<class T>
+RMAGINE_INLINE_FUNCTION
+T SQR(const T a) {return a*a;}
+
+template<class T>
+RMAGINE_INLINE_FUNCTION
+const T &MAX(const T &a, const T &b)
+    {return b > a ? (b) : (a);}
+
+RMAGINE_INLINE_FUNCTION
+float MAX(const double &a, const float &b)
+    {return b > a ? (b) : float(a);}
+
+RMAGINE_INLINE_FUNCTION
+float MAX(const float &a, const double &b)
+    {return b > a ? float(b) : (a);}
+
+template<class T>
+RMAGINE_INLINE_FUNCTION
+const T &MIN(const T &a, const T &b)
+    {return b < a ? (b) : (a);}
+
+RMAGINE_INLINE_FUNCTION
+float MIN(const double &a, const float &b)
+    {return b < a ? (b) : float(a);}
+
+RMAGINE_INLINE_FUNCTION
+float MIN(const float &a, const double &b)
+    {return b < a ? float(b) : (a);}
+
+template<class T>
+RMAGINE_INLINE_FUNCTION
+T SIGN(const T &a, const T &b)
+    {return b >= 0 ? (a >= 0 ? a : -a) : (a >= 0 ? -a : a);}
+
+RMAGINE_INLINE_FUNCTION
+float SIGN(const float &a, const double &b)
+    {return b >= 0 ? (a >= 0 ? a : -a) : (a >= 0 ? -a : a);}
+
+RMAGINE_INLINE_FUNCTION
+float SIGN(const double &a, const float &b)
+    {return (float)(b >= 0 ? (a >= 0 ? a : -a) : (a >= 0 ? -a : a));}
+
+template<class T>
+RMAGINE_INLINE_FUNCTION
+void SWAP(T &a, T &b)
+    {T dum=a; a=b; b=dum;}
+
+template<typename T>
+RMAGINE_INLINE_FUNCTION
+T PYTHAG(const T a, const T b)
+{
+    T absa = abs(a);
+    T absb = abs(b);
+    return (absa > absb ? absa * sqrt(1.0+SQR(absb/absa)) :
+        (absb == 0.0 ? 0.0 : absb * sqrt(1.0+SQR(absa/absb))));
+}
+
 
 template<typename DataT>
 Vector3_<DataT> min(const Vector3_<DataT>& a, const Vector3_<DataT>& b);
@@ -56,7 +117,6 @@ Vector3_<DataT> min(const Vector3_<DataT>& a, const Vector3_<DataT>& b);
 template<typename DataT>
 Vector3_<DataT> max(const Vector3_<DataT>& a, const Vector3_<DataT>& b);
 
-
 /////////////
 // #multNxN
 ////////
@@ -371,6 +431,29 @@ Memory<Matrix3x3, RAM> cov(
     const MemoryView<Vector, RAM>& v2
 );
 
+/**
+ * @brief decompose A = UWV* using singular value decomposition
+ */
+void svd(
+    const MemoryView<Matrix3x3, RAM>& As,
+    MemoryView<Matrix3x3, RAM>& Us,
+    MemoryView<Matrix3x3, RAM>& Ws,
+    MemoryView<Matrix3x3, RAM>& Vs
+);
+
+/**
+ * @brief decompose A = UWV* using singular value decomposition
+ * 
+ * w is a vector which is the diagonal of matrix W
+ */
+void svd(
+    const MemoryView<Matrix3x3, RAM>& As,
+    MemoryView<Matrix3x3, RAM>& Us,
+    MemoryView<Vector3, RAM>& ws,
+    MemoryView<Matrix3x3, RAM>& Vs
+);
+
+
 } // namespace rmagine
 
 #include "math.tcc"
diff --git a/src/rmagine_core/include/rmagine/types/shared_functions.h b/src/rmagine_core/include/rmagine/types/shared_functions.h
index a8e1178..c4bdb96 100644
--- a/src/rmagine_core/include/rmagine/types/shared_functions.h
+++ b/src/rmagine_core/include/rmagine/types/shared_functions.h
@@ -44,9 +44,17 @@
 #ifdef __CUDA_ARCH__
 #define RMAGINE_FUNCTION __host__ __device__
 #define RMAGINE_INLINE_FUNCTION __inline__ __host__ __device__ 
+#define RMAGINE_HOST_FUNCTION __host__
+#define RMAGINE_INLINE_HOST_FUNCTION __inline__ __host__
+#define RMAGINE_DEVICE_FUNCTION __device__
+#define RMAGINE_INLINE_DEVICE_FUNCTION __inline__ __device__
 #else
 #define RMAGINE_FUNCTION
 #define RMAGINE_INLINE_FUNCTION inline
+#define RMAGINE_HOST_FUNCTION 
+#define RMAGINE_INLINE_HOST_FUNCTION inline
+#define RMAGINE_DEVICE_FUNCTION 
+#define RMAGINE_INLINE_DEVICE_FUNCTION inline
 #endif
 
 #endif // RMAGINE_TYPES_SHARED_FUNCTIONS_H
\ No newline at end of file
diff --git a/src/rmagine_core/src/math/linalg.cpp b/src/rmagine_core/src/math/linalg.cpp
index 739b24f..148d199 100644
--- a/src/rmagine_core/src/math/linalg.cpp
+++ b/src/rmagine_core/src/math/linalg.cpp
@@ -1,10 +1,15 @@
 #include "rmagine/math/linalg.h"
+#include "rmagine/types/Memory.hpp"
+#include <assert.h>
 
 #include <Eigen/Dense>
 
+#include "rmagine/math/math.h"
+
 namespace rmagine
 {
 
+RMAGINE_HOST_FUNCTION
 Matrix4x4 compose(const Transform& T, const Vector3& scale)
 {
     Matrix4x4 M;
@@ -19,6 +24,7 @@ Matrix4x4 compose(const Transform& T, const Vector3& scale)
     return M * S;
 }
 
+RMAGINE_HOST_FUNCTION
 Matrix4x4 compose(const Transform& T, const Matrix3x3& S)
 {
     Matrix4x4 M;
@@ -38,6 +44,7 @@ Matrix4x4 compose(const Transform& T, const Matrix3x3& S)
     return M * S_;
 }
 
+RMAGINE_HOST_FUNCTION
 void decompose(const Matrix4x4& M, Transform& T, Matrix3x3& S)
 {
     Eigen::Matrix4f Meig;
@@ -73,6 +80,7 @@ void decompose(const Matrix4x4& M, Transform& T, Matrix3x3& S)
     T.R.set(R);
 }
 
+RMAGINE_HOST_FUNCTION
 void decompose(const Matrix4x4& M, Transform& T, Vector3& scale)
 {
     Matrix3x3 S;
@@ -85,15 +93,1088 @@ void decompose(const Matrix4x4& M, Transform& T, Vector3& scale)
     scale.z = S(2,2);
 }
 
+RMAGINE_HOST_FUNCTION
 Quaternion polate(const Quaternion& A, const Quaternion& B, float fac)
 {
     return A * A.to(B).pow(fac);
 }
 
+RMAGINE_HOST_FUNCTION
 Transform polate(const Transform& A, const Transform& B, float fac)
 {
     return A * A.to(B).pow(fac);
 }
 
 
+RMAGINE_HOST_FUNCTION
+void svd(
+    const Matrix3x3& a, 
+    Matrix3x3& u,
+    Matrix3x3& w,
+    Matrix3x3& v)
+{
+    // TODO: test
+    constexpr unsigned int max_iterations = 20;
+    
+    // additional memory required
+    bool flag;
+    int its, j, jj;
+    float anorm, c, f, g, h, s, scale, x, y, z;
+    
+    Vector3 rv1 = Vector3::Zeros();
+    
+    g = s = scale = anorm = 0.0;
+    float eps = __FLT_EPSILON__;
+    u = a;
+
+    // FIRST PART
+
+    // i = 0;
+    // l = 2;
+    scale = fabs(u(0,0)) + fabs(u(1,0)) + fabs(u(2,0));
+    if(scale > 0.0)
+    {
+        u(0, 0) /= scale;
+        u(1, 0) /= scale;
+        u(2, 0) /= scale;
+
+        s = u(0,0) * u(0,0) + u(1,0) * u(1,0) + u(2,0) * u(2,0);
+        f = u(0,0);
+        g = -SIGN(sqrt(s), f);
+        h = f * g - s;
+
+        u(0, 0) = f - g;
+
+        f = (u(0, 0) * u(0, 1) + u(1, 0) * u(1, 1) + u(2, 0) * u(2, 1)) / h;
+        u(0, 1) += f * u(0, 0);
+        u(1, 1) += f * u(1, 0);
+        u(2, 1) += f * u(2, 0);
+
+        f = (u(0, 0) * u(0, 2) + u(1, 0) * u(1, 2) + u(2, 0) * u(2, 2)) / h;
+        u(0, 2) += f * u(0, 0);
+        u(1, 2) += f * u(1, 0);
+        u(2, 2) += f * u(2, 0);
+
+        u(0, 0) *= scale;
+        u(1, 0) *= scale;
+        u(2, 0) *= scale;
+    }
+    
+    w(0, 0) = scale * g;
+    g = s = scale = 0.0;
+    
+    scale = abs(u(0,0)) + abs(u(0,1)) + abs(u(0,2));
+    
+    if(scale > 0.0)
+    {
+        u(0, 1) /= scale;
+        u(0, 2) /= scale;
+        s = u(0,1) * u(0,1) + u(0,2) * u(0,2);
+
+        f = u(0, 1);
+        g = -SIGN(sqrt(s),f);
+        h = f * g-s;
+        u(0, 1) = f - g;
+
+        rv1.y = u(0, 1) / h;
+        rv1.z = u(0, 2) / h;
+
+        s = u(1,1) * u(0,1) + u(1,2) * u(0,2);
+        u(1, 1) += s * rv1.y;
+        u(1, 2) += s * rv1.z;
+    
+        s = u(2,1) * u(0,1) + u(2,2) * u(0,2);
+        u(2, 1) += s * rv1.y;
+        u(2, 2) += s * rv1.z;            
+
+        u(0, 1) *= scale;
+        u(0, 2) *= scale;
+    }
+    
+    anorm = fabs(w(0, 0));
+    // anorm = MAX(anorm, (fabs(w(0, 0)) + fabs(rv1.x))); // rv1.x is always 0 here, anorm too. fabs(X) >= 0
+    
+    // i = 1;
+    // l = 3;
+    rv1.y = scale * g;
+    g = 0.0;
+    scale = fabs(u(1, 1)) + fabs(u(2, 1));
+    
+    if(scale > 0.0)
+    {
+        u(1,1) /= scale;
+        u(2,1) /= scale;
+
+        s = u(1,1) * u(1,1) + u(2,1) * u(2,1);
+        f = u(1,1);
+        g = -SIGN(sqrt(s),f);
+        h = f * g - s;
+        u(1,1) = f-g;
+        
+        f = (u(1,1) * u(1,2) + u(2,1) * u(2,2)) / h;
+        u(1,2) += f * u(1,1);
+        u(2,2) += f * u(2,1);
+        
+        u(1,1) *= scale;
+        u(2,1) *= scale;
+    }
+    
+    w(1, 1) = scale * g;
+    g = s = scale = 0.0;
+    
+    scale = abs(u(1,2));
+    if(scale > 0.0)
+    {
+        u(1,2) /= scale;
+        s = u(1,2) * u(1,2);
+        
+        f = u(1, 2);
+        g = -SIGN(sqrt(s), f);
+        h = f * g - s;
+        u(1,2) = f - g;
+
+        rv1.z = u(1,2) / h;
+        s = u(2,2) * u(1,2);
+
+        u(2,2) += s * rv1.z;
+        u(1,2) *= scale;
+    }
+
+    anorm = MAX(anorm, (abs(w(1, 1)) + abs(rv1.y)));
+    
+    rv1.z = scale * g;
+
+    scale = abs(u(2, 2));
+    if(scale > 0.0) 
+    {
+        u(2, 2) /= scale;
+        s = u(2, 2) * u(2, 2);
+        f = u(2, 2);
+        g = -SIGN(sqrt(s),f);
+        h = f * g - s;
+
+        u(2, 2) = f - g;
+        u(2, 2) *= scale;
+    }
+    
+    w(2, 2) = scale * g;
+    g = s = scale = 0.0;
+    
+    anorm = MAX(anorm, (abs(w(2, 2))+abs(rv1.z)));
+
+    // SECOND PART    
+    v(2, 2) = 1.0;
+    g = rv1.z;
+
+    // i = 1;
+    // l = 2;
+    if(fabs(g) > 0.0)
+    {
+        v(2,1) = (u(1,2) / u(1,2)) / g;
+        s = u(1,2) * v(2,2);
+        v(2,2) += s * v(2,1);
+    }
+    v(1,2) = 0.0;
+    v(2,1) = 0.0;
+    v(1,1) = 1.0;
+
+    g = rv1.y;
+
+    // l = 1;
+    // i = 0;
+    if(fabs(g) > 0.0)
+    {
+        v(1,0) = (u(0,1) / u(0,1)) / g;
+        v(2,0) = (u(0,2) / u(0,1)) / g;
+
+        s = u(0,1) * v(1,1) + u(0,2) * v(2,1);
+        v(1,1) += s * v(1,0);
+        v(2,1) += s * v(2,0);
+
+        s = u(0,1) * v(1,2) + u(0,2) * v(2,2);
+        v(1,2) += s * v(1,0);
+        v(2,2) += s * v(2,0);
+    }
+    v(0,1) = 0.0;
+    v(1,0) = 0.0;
+    v(0,2) = 0.0;
+    v(2,0) = 0.0;
+    v(0,0) = 1.0;
+    g = rv1.x;
+
+
+    // THIRD PART
+    
+    // i = 2;
+    // l = 3;
+    g = w(2, 2);
+    if(fabs(g) > 0.0)
+    {
+        u(2,2) /= g;
+    } else { 
+        // TODO(amock): shouldnt this be a large number?
+        u(2,2) = 0.0;   
+    }
+    u(2,2) += 1.0;
+    
+    // i = 1;
+    // l = 2;
+
+    g = w(1, 1);
+    u(1,2) = 0.0;
+    
+    if(fabs(g) > 0.0)
+    {
+        g = 1.0/g;
+        s = u(2,1) * u(2,2);
+        f = (s/u(1,1)) * g;
+
+        u(1,2) += f * u(1,1);
+        u(2,2) += f * u(2,1);
+    
+        u(1,1) *= g;
+        u(2,1) *= g;
+    } else {
+        u(1,1) = 0.0;
+        u(2,1) = 0.0;
+    }
+    u(1,1) += 1.0;
+    
+    // i = 0;
+    // l = 1;
+    g = w(0, 0);
+    u(0,1) = 0.0;
+    u(0,2) = 0.0;
+    
+    if(fabs(g) > 0.0)
+    {
+        f = (u(1,0) * u(1,1) + u(2,0) * u(2,1)) / (g * u(0,0));
+        u(0,1) += f * u(0,0);
+        u(1,1) += f * u(1,0);
+        u(2,1) += f * u(2,0);
+
+        f = (u(1,0) * u(1,2) + u(2,0) * u(2,2)) / (g * u(0,0));
+        u(0,2) += f * u(0,0);
+        u(1,2) += f * u(1,0);
+        u(2,2) += f * u(2,0);
+
+        u(0,0) /= g;
+        u(1,0) /= g;
+        u(2,0) /= g;
+    } else {
+        u(0,0) = 0.0;
+        u(1,0) = 0.0;
+        u(2,0) = 0.0;
+    }
+    u(0,0) += 1.0;
+
+    int i, l;
+
+    // PART 4: Opti
+
+    // k = 2;
+    for(its=0; its<max_iterations; its++) 
+    {
+        // flag=true;
+        // l = 2;
+        // if(MIN(fabs(rv1.z), fabs(w(1,1))) > eps*anorm)
+        // {
+        //     l = 1;
+        //     if(MIN(fabs(rv1.y),abs(w(0,0))) > eps*anorm)
+        //     {
+        //         l = 0;
+        //     }
+        // }
+
+        flag=true;
+        l=2;
+        if(abs(rv1.z) <= eps*anorm)
+        {
+            flag=false;
+        }
+        else if(abs(w(1,1)) > eps*anorm)
+        {
+            l=1;
+            if(abs(rv1.y) <= eps*anorm) 
+            {
+                flag=false;
+            }
+            else if(abs(w(0,0)) > eps*anorm) 
+            {
+                l=0;
+                flag = false;
+            }
+        }
+        
+        if(flag)
+        {
+            c=0.0;
+            s=1.0;
+            for(i=l; i<3; i++) 
+            {
+                f = s*rv1[i];
+                rv1[i] = c*rv1[i];
+                if(abs(f) <= eps*anorm) 
+                {
+                    break;
+                }
+                g = w(i, i);
+                h = PYTHAG(f,g);
+                w(i, i) = h;
+                h = 1.0/h;
+                c = g*h;
+                s = -f*h;
+                for(j=0; j<3; j++)
+                {
+                    y = u(j,l-1);
+                    z = u(j,i);
+                    u(j,l-1) = y*c+z*s;
+                    u(j,i) = z*c-y*s;
+                }
+            }
+        }
+        z = w(2,2);
+        if(l == 2)
+        {
+            if(z < 0.0)
+            {
+                w(2, 2) = -z;
+                for (j=0; j<3; j++) 
+                {
+                    v(j,2) = -v(j,2);
+                }
+            }
+            break;
+        }
+        if(its == max_iterations - 1) 
+        {
+            // std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+            // throw std::runtime_error("no convergence in max svdcmp iterations");
+        }
+        x = w(l,l);
+        y = w(1,1);
+        g = rv1.y;
+        h = rv1.z;
+        f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+        g = PYTHAG(f, 1.f);
+        f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+        c = s = 1.f;
+        for (j=l; j<2; j++) 
+        {
+            i = j+1;
+            g = rv1[i];
+            y = w(i, i);
+            h = s*g;
+            g = c*g;
+            z = PYTHAG(f,h);
+            rv1[j] = z;
+            c = f/z;
+            s = h/z;
+            f = x*c+g*s;
+            g = g*c-x*s;
+            h = y*s;
+            y *= c;
+            for(jj=0;jj<3;jj++)
+            {
+                x = v(jj,j);
+                z = v(jj,i);
+                v(jj,j) = x*c+z*s;
+                v(jj,i) = z*c-x*s;
+            }
+            z = PYTHAG(f,h);
+            w(j,j) = z;
+            if(z>0.0) 
+            {
+                z = 1.f/z;
+                c = f*z;
+                s = h*z;
+            }
+            f = c*g+s*y;
+            x = c*y-s*g;
+            for (jj=0;jj<3;jj++)
+            {
+                y = u(jj,j);
+                z = u(jj,i);
+                u(jj,j) = y*c+z*s;
+                u(jj,i) = z*c-y*s;
+            }
+        }
+        rv1[l] = 0.f;
+        rv1.z = f;
+        w(2,2) = x;
+    }
+
+
+    for(its=0; its<max_iterations; its++) 
+    {
+        flag=true;
+        l=1;
+        if(abs(rv1.y) <= eps*anorm) 
+        {
+            flag=false;
+        }
+        else if(abs(w(0,0)) > eps*anorm) 
+        {
+            l=0;
+            flag = false;
+        }
+
+        if(flag)
+        {
+            c=0.0;
+            s=1.0;
+            for(i=l; i<2; i++) 
+            {
+                f = s*rv1[i];
+                rv1[i] = c*rv1[i];
+                if(abs(f) <= eps*anorm) 
+                {
+                    break;
+                }
+                g = w(i,i);
+                h = PYTHAG(f,g);
+                w(i, i) = h;
+                h = 1.0/h;
+                c = g*h;
+                s = -f*h;
+                for(j=0; j<3; j++)
+                {
+                    y = u(j,l-1);
+                    z = u(j,i);
+                    u(j,l-1) = y*c+z*s;
+                    u(j,i) = z*c-y*s;
+                }
+            }
+        }
+        z = w(1,1);
+        if(l == 1)
+        {
+            if(z < 0.0)
+            {
+                w(1,1) = -z;
+                for (j=0; j<3; j++) 
+                {
+                    v(j,1) = -v(j,1);
+                }
+            }
+            break;
+        }
+        if(its == max_iterations - 1) 
+        {
+            // std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+            // throw std::runtime_error("no convergence in max svdcmp iterations");
+        }
+
+        x = w(l, l);
+        y = w(0, 0);
+        g = rv1.x;
+        h = rv1.y;
+        f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+        g = PYTHAG(f, 1.f);
+        f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+        c = s = 1.f;
+
+        if(l == 0)
+        {
+            g = rv1.y;
+            y = w(1,1);
+            h = s*g;
+            g = c*g;
+            z = PYTHAG(f,h);
+            rv1.x = z;
+            c = f/z;
+            s = h/z;
+            f = x*c+g*s;
+            g = g*c-x*s;
+            h = y*s;
+            y *= c;
+
+
+            x = v(0,0);
+            z = v(0,1);
+            v(0,0) = x*c+z*s;
+            v(0,1) = z*c-x*s;
+
+            x = v(1,0);
+            z = v(1,1);
+            v(1,0) = x*c+z*s;
+            v(1,1) = z*c-x*s;
+
+            x = v(2,0);
+            z = v(2,1);
+            v(2,0) = x*c+z*s;
+            v(2,1) = z*c-x*s;
+
+
+            z = PYTHAG(f,h);
+            w(0,0) = z;
+            if(z>0.f)
+            {
+                z = 1.f/z;
+                c = f*z;
+                s = h*z;
+            }
+            f = c*g+s*y;
+            x = c*y-s*g;
+            for(jj=0; jj<3; jj++)
+            {
+                y = u(jj,0);
+                z = u(jj,1);
+                u(jj,0) = y*c+z*s;
+                u(jj,1) = z*c-y*s;
+            }
+        }
+        
+        rv1[l] = 0.f;
+        rv1.y = f;
+        w(1,1) = x;
+    }
+
+
+    z = w(0,0);
+    if (z < 0.0)
+    {
+        w(0,0) = -z;
+        v(0,0) = -v(0,0);
+        v(1,0) = -v(1,0);
+        v(2,0) = -v(2,0);
+    }
+}
+
+RMAGINE_HOST_FUNCTION
+void svd(
+    const Matrix3x3& a, 
+    Matrix3x3& u,
+    Vector3& w,
+    Matrix3x3& v)
+{
+    // TODO: test
+    constexpr unsigned int m = 3;
+    constexpr unsigned int n = 3;
+    constexpr unsigned int max_iterations = 20;
+    
+    // additional memory required
+    bool flag;
+    int its, j, jj, k, nm;
+    float anorm, c, f, g, h, s, scale, x, y, z;
+    
+    Vector3 rv1 = Vector3::Zeros();
+    
+    g = s = scale = anorm = 0.0;
+    float eps = std::numeric_limits<float>::epsilon();
+    u = a;
+
+    // FIRST PART
+
+    // i = 0;
+    // l = 2;
+    scale = fabs(u(0,0)) + fabs(u(1,0)) + fabs(u(2,0));
+    if(scale > 0.0)
+    {
+        u(0, 0) /= scale;
+        u(1, 0) /= scale;
+        u(2, 0) /= scale;
+
+        s = u(0,0) * u(0,0) + u(1,0) * u(1,0) + u(2,0) * u(2,0);
+        f = u(0,0);
+        g = -SIGN(sqrt(s), f);
+        h = f * g - s;
+
+        u(0, 0) = f - g;
+
+        f = (u(0, 0) * u(0, 1) + u(1, 0) * u(1, 1) + u(2, 0) * u(2, 1)) / h;
+        u(0, 1) += f * u(0, 0);
+        u(1, 1) += f * u(1, 0);
+        u(2, 1) += f * u(2, 0);
+
+        f = (u(0, 0) * u(0, 2) + u(1, 0) * u(1, 2) + u(2, 0) * u(2, 2)) / h;
+        u(0, 2) += f * u(0, 0);
+        u(1, 2) += f * u(1, 0);
+        u(2, 2) += f * u(2, 0);
+
+        u(0, 0) *= scale;
+        u(1, 0) *= scale;
+        u(2, 0) *= scale;
+    }
+    
+    w.x = scale * g;
+    g = s = scale = 0.0;
+    
+    scale = abs(u(0,0)) + abs(u(0,1)) + abs(u(0,2));
+    
+    if(scale > 0.0)
+    {
+        u(0, 1) /= scale;
+        u(0, 2) /= scale;
+        s = u(0,1) * u(0,1) + u(0,2) * u(0,2);
+
+        f = u(0, 1);
+        g = -SIGN(sqrt(s),f);
+        h = f * g-s;
+        u(0, 1) = f - g;
+
+        rv1.y = u(0, 1) / h;
+        rv1.z = u(0, 2) / h;
+
+        s = u(1,1) * u(0,1) + u(1,2) * u(0,2);
+        u(1, 1) += s * rv1.y;
+        u(1, 2) += s * rv1.z;
+    
+        s = u(2,1) * u(0,1) + u(2,2) * u(0,2);
+        u(2, 1) += s * rv1.y;
+        u(2, 2) += s * rv1.z;            
+
+        u(0, 1) *= scale;
+        u(0, 2) *= scale;
+    }
+    
+    anorm = fabs(w.x);
+    // anorm = MAX(anorm, (fabs(w(0, 0)) + fabs(rv1.x))); // rv1.x is always 0 here, anorm too. fabs(X) >= 0
+    
+    // i = 1;
+    // l = 3;
+    rv1.y = scale * g;
+    g = 0.0;
+    scale = fabs(u(1, 1)) + fabs(u(2, 1));
+    
+    if(scale > 0.0)
+    {
+        u(1,1) /= scale;
+        u(2,1) /= scale;
+
+        s = u(1,1) * u(1,1) + u(2,1) * u(2,1);
+        f = u(1,1);
+        g = -SIGN(sqrt(s),f);
+        h = f * g - s;
+        u(1,1) = f-g;
+        
+        f = (u(1,1) * u(1,2) + u(2,1) * u(2,2)) / h;
+        u(1,2) += f * u(1,1);
+        u(2,2) += f * u(2,1);
+        
+        u(1,1) *= scale;
+        u(2,1) *= scale;
+    }
+    
+    w.y = scale * g;
+    g = s = scale = 0.0;
+    
+    scale = abs(u(1,2));
+    if(scale > 0.0)
+    {
+        u(1,2) /= scale;
+        s = u(1,2) * u(1,2);
+        
+        f = u(1, 2);
+        g = -SIGN(sqrt(s), f);
+        h = f * g - s;
+        u(1,2) = f - g;
+
+        rv1.z = u(1,2) / h;
+        s = u(2,2) * u(1,2);
+
+        u(2,2) += s * rv1.z;
+        u(1,2) *= scale;
+    }
+
+    anorm = MAX(anorm, (abs(w.y) + abs(rv1.y)));
+    
+    rv1.z = scale * g;
+
+    scale = abs(u(2, 2));
+    if(scale > 0.0) 
+    {
+        u(2, 2) /= scale;
+        s = u(2, 2) * u(2, 2);
+        f = u(2, 2);
+        g = -SIGN(sqrt(s),f);
+        h = f * g - s;
+
+        u(2, 2) = f - g;
+        u(2, 2) *= scale;
+    }
+    
+    w.z = scale * g;
+    g = s = scale = 0.0;
+    
+    anorm = MAX(anorm, (abs(w.z)+abs(rv1.z)));
+
+    // SECOND PART    
+    v(2, 2) = 1.0;
+    g = rv1.z;
+
+    // i = 1;
+    // l = 2;
+    if(fabs(g) > 0.0)
+    {
+        v(2,1) = (u(1,2) / u(1,2)) / g;
+        s = u(1,2) * v(2,2);
+        v(2,2) += s * v(2,1);
+    }
+    v(1,2) = 0.0;
+    v(2,1) = 0.0;
+    v(1,1) = 1.0;
+
+    g = rv1.y;
+
+    // l = 1;
+    // i = 0;
+    if(fabs(g) > 0.0)
+    {
+        v(1,0) = (u(0,1) / u(0,1)) / g;
+        v(2,0) = (u(0,2) / u(0,1)) / g;
+
+        s = u(0,1) * v(1,1) + u(0,2) * v(2,1);
+        v(1,1) += s * v(1,0);
+        v(2,1) += s * v(2,0);
+
+        s = u(0,1) * v(1,2) + u(0,2) * v(2,2);
+        v(1,2) += s * v(1,0);
+        v(2,2) += s * v(2,0);
+    }
+    v(0,1) = 0.0;
+    v(1,0) = 0.0;
+    v(0,2) = 0.0;
+    v(2,0) = 0.0;
+    v(0,0) = 1.0;
+    g = rv1.x;
+
+
+    // THIRD PART
+    
+    // i = 2;
+    // l = 3;
+    g = w.z;
+    if(fabs(g) > 0.0)
+    {
+        u(2,2) /= g;
+    } else { 
+        // TODO(amock): shouldnt this be a large number?
+        u(2,2) = 0.0;   
+    }
+    u(2,2) += 1.0;
+    
+    // i = 1;
+    // l = 2;
+
+    g = w.y;
+    u(1,2) = 0.0;
+    
+    if(fabs(g) > 0.0)
+    {
+        g = 1.0/g;
+        s = u(2,1) * u(2,2);
+        f = (s/u(1,1)) * g;
+
+        u(1,2) += f * u(1,1);
+        u(2,2) += f * u(2,1);
+    
+        u(1,1) *= g;
+        u(2,1) *= g;
+    } else {
+        u(1,1) = 0.0;
+        u(2,1) = 0.0;
+    }
+    u(1,1) += 1.0;
+    
+    // i = 0;
+    // l = 1;
+    g = w.x;
+    u(0,1) = 0.0;
+    u(0,2) = 0.0;
+    
+    if(fabs(g) > 0.0)
+    {
+        f = (u(1,0) * u(1,1) + u(2,0) * u(2,1)) / (g * u(0,0));
+        u(0,1) += f * u(0,0);
+        u(1,1) += f * u(1,0);
+        u(2,1) += f * u(2,0);
+
+        f = (u(1,0) * u(1,2) + u(2,0) * u(2,2)) / (g * u(0,0));
+        u(0,2) += f * u(0,0);
+        u(1,2) += f * u(1,0);
+        u(2,2) += f * u(2,0);
+
+        u(0,0) /= g;
+        u(1,0) /= g;
+        u(2,0) /= g;
+    } else {
+        u(0,0) = 0.0;
+        u(1,0) = 0.0;
+        u(2,0) = 0.0;
+    }
+    u(0,0) += 1.0;
+
+    int i, l;
+
+    // PART 4: Opti
+
+    // k = 2;
+    for(its=0; its<max_iterations; its++) 
+    {
+        // flag=true;
+        // l = 2;
+        // if(MIN(fabs(rv1.z), fabs(w(1,1))) > eps*anorm)
+        // {
+        //     l = 1;
+        //     if(MIN(fabs(rv1.y),abs(w(0,0))) > eps*anorm)
+        //     {
+        //         l = 0;
+        //     }
+        // }
+
+        flag=true;
+        l=2;
+        if(abs(rv1.z) <= eps*anorm)
+        {
+            flag=false;
+        }
+        else if(abs(w.y) > eps*anorm)
+        {
+            l=1;
+            if(abs(rv1.y) <= eps*anorm) 
+            {
+                flag=false;
+            }
+            else if(abs(w.x) > eps*anorm) 
+            {
+                l=0;
+                flag = false;
+            }
+        }
+        
+        if(flag)
+        {
+            c=0.0;
+            s=1.0;
+            for(i=l; i<3; i++) 
+            {
+                f = s*rv1[i];
+                rv1[i] = c*rv1[i];
+                if(abs(f) <= eps*anorm) 
+                {
+                    break;
+                }
+                g = w(i);
+                h = PYTHAG(f,g);
+                w(i) = h;
+                h = 1.0/h;
+                c = g*h;
+                s = -f*h;
+                for(j=0; j<m; j++)
+                {
+                    y = u(j,l-1);
+                    z = u(j,i);
+                    u(j,l-1) = y*c+z*s;
+                    u(j,i) = z*c-y*s;
+                }
+            }
+        }
+        z = w.z;
+        if(l == 2)
+        {
+            if(z < 0.0)
+            {
+                w.z = -z;
+                v(0,2) = -v(0,2);
+                v(1,2) = -v(1,2);
+                v(2,2) = -v(2,2);
+            }
+            break;
+        }
+        if(its == max_iterations - 1) 
+        {
+            // std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+            throw std::runtime_error("no convergence in max svdcmp iterations");
+        }
+        x = w(l);
+        y = w.y;
+        g = rv1.y;
+        h = rv1.z;
+        f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+        g = PYTHAG(f, 1.f);
+        f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+        c = s = 1.f;
+        for (j=l; j<2; j++)
+        {
+            i = j+1;
+            g = rv1[i];
+            y = w(i);
+            h = s*g;
+            g = c*g;
+            z = PYTHAG(f,h);
+            rv1[j] = z;
+            c = f/z;
+            s = h/z;
+            f = x*c+g*s;
+            g = g*c-x*s;
+            h = y*s;
+            y *= c;
+            for(jj=0;jj<3;jj++)
+            {
+                x = v(jj,j);
+                z = v(jj,i);
+                v(jj,j) = x*c+z*s;
+                v(jj,i) = z*c-x*s;
+            }
+            z = PYTHAG(f,h);
+            w(j) = z;
+            if(z>0.0) 
+            {
+                z = 1.f/z;
+                c = f*z;
+                s = h*z;
+            }
+            f = c*g+s*y;
+            x = c*y-s*g;
+            for (jj=0;jj<m;jj++)
+            {
+                y = u(jj,j);
+                z = u(jj,i);
+                u(jj,j) = y*c+z*s;
+                u(jj,i) = z*c-y*s;
+            }
+        }
+        rv1[l] = 0.f;
+        rv1.z = f;
+        w.z = x;
+    }
+
+
+    for(its=0; its<max_iterations; its++) 
+    {
+        flag=true;
+        l=1;
+        if(abs(rv1.y) <= eps*anorm) 
+        {
+            flag=false;
+        }
+        else if(abs(w.x) > eps*anorm) 
+        {
+            l=0;
+            flag = false;
+        }
+
+        if(flag)
+        {
+            c=0.0;
+            s=1.0;
+            for(i=l; i<2; i++) 
+            {
+                f = s*rv1[i];
+                rv1[i] = c*rv1[i];
+                if(abs(f) <= eps*anorm) 
+                {
+                    break;
+                }
+                g = w(i);
+                h = PYTHAG(f,g);
+                w(i) = h;
+                h = 1.0/h;
+                c = g*h;
+                s = -f*h;
+                for(j=0; j<m; j++)
+                {
+                    y = u(j,l-1);
+                    z = u(j,i);
+                    u(j,l-1) = y*c+z*s;
+                    u(j,i) = z*c-y*s;
+                }
+            }
+        }
+        z = w.y;
+        if(l == 1)
+        {
+            if(z < 0.0)
+            {
+                w.y = -z;
+                for (j=0; j<3; j++) 
+                {
+                    v(j,1) = -v(j,1);
+                }
+            }
+            break;
+        }
+        if(its == max_iterations - 1) 
+        {
+            std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+            throw std::runtime_error("no convergence in max svdcmp iterations");
+        }
+
+        x = w(l);
+        y = w.x;
+        g = rv1.x;
+        h = rv1.y;
+        f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.f*h*y);
+        g = PYTHAG(f, 1.f);
+        f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+        c = s = 1.f;
+
+        if(l == 0)
+        {
+            g = rv1.y;
+            y = w.y;
+            h = s*g;
+            g = c*g;
+            z = PYTHAG(f,h);
+            rv1.x = z;
+            c = f/z;
+            s = h/z;
+            f = x*c+g*s;
+            g = g*c-x*s;
+            h = y*s;
+            y *= c;
+
+
+            x = v(0,0);
+            z = v(0,1);
+            v(0,0) = x*c+z*s;
+            v(0,1) = z*c-x*s;
+
+            x = v(1,0);
+            z = v(1,1);
+            v(1,0) = x*c+z*s;
+            v(1,1) = z*c-x*s;
+
+            x = v(2,0);
+            z = v(2,1);
+            v(2,0) = x*c+z*s;
+            v(2,1) = z*c-x*s;
+
+
+            z = PYTHAG(f,h);
+            w.x = z;
+            if(z>0.f)
+            {
+                z = 1.f/z;
+                c = f*z;
+                s = h*z;
+            }
+            f = c*g+s*y;
+            x = c*y-s*g;
+            for(jj=0; jj<3; jj++)
+            {
+                y = u(jj,0);
+                z = u(jj,1);
+                u(jj,0) = y*c+z*s;
+                u(jj,1) = z*c-y*s;
+            }
+        }
+        
+        rv1[l] = 0.f;
+        rv1.y = f;
+        w.y = x;
+    }
+
+
+    z = w.x;
+    if (z < 0.0)
+    {
+        w.x = -z;
+        v(0,0) = -v(0,0);
+        v(1,0) = -v(1,0);
+        v(2,0) = -v(2,0);
+    }
+}
+
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_core/src/math/math.cpp b/src/rmagine_core/src/math/math.cpp
index 9323175..42de8ae 100644
--- a/src/rmagine_core/src/math/math.cpp
+++ b/src/rmagine_core/src/math/math.cpp
@@ -4,6 +4,8 @@
 
 #include <cassert>
 
+#include "rmagine/math/linalg.h"
+
 namespace rmagine {
 
 
@@ -697,5 +699,39 @@ Memory<Matrix3x3, RAM> cov(
     return C;
 }
 
+/**
+ * @brief decompose A = UWV* using singular value decomposition
+ */
+void svd(
+    const MemoryView<Matrix3x3, RAM>& As,
+    MemoryView<Matrix3x3, RAM>& Us,
+    MemoryView<Matrix3x3, RAM>& Ws,
+    MemoryView<Matrix3x3, RAM>& Vs)
+{
+    #pragma omp parallel for
+    for(size_t i=0; i<As.size(); i++)
+    {
+        svd(As[i], Us[i], Ws[i], Vs[i]);
+    }
+}
+
+/**
+ * @brief decompose A = UWV* using singular value decomposition
+ * 
+ * w is a vector which is the diagonal of matrix W
+ */
+void svd(
+    const MemoryView<Matrix3x3, RAM>& As,
+    MemoryView<Matrix3x3, RAM>& Us,
+    MemoryView<Vector3, RAM>& ws,
+    MemoryView<Matrix3x3, RAM>& Vs)
+{
+    #pragma omp parallel for
+    for(size_t i=0; i<As.size(); i++)
+    {
+        svd(As[i], Us[i], ws[i], Vs[i]);
+    }
+}
+
 
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_cuda/CMakeLists.txt b/src/rmagine_cuda/CMakeLists.txt
index 6e8beb3..115abcb 100644
--- a/src/rmagine_cuda/CMakeLists.txt
+++ b/src/rmagine_cuda/CMakeLists.txt
@@ -1,10 +1,13 @@
 message(STATUS "Building Cuda dependend code. Library: rmagine-cuda")
 
+# add_library(rmagine-cuda SHARED
+
 set(RMAGINE_CUDA_SRCS
     src/types/MemoryCuda.cu
     # math
     src/math/math.cu
     src/math/math_batched.cu
+    src/math/linalg.cu
     src/math/SVDCuda.cpp
     # util
     src/util/cuda/CudaContext.cpp
@@ -58,6 +61,7 @@ set_target_properties(rmagine-cuda
         EXPORT_NAME cuda
         SOVERSION ${rmagine_VERSION_MAJOR}
         VERSION ${rmagine_VERSION}
+        CUDA_SEPARABLE_COMPILATION ON
         # CUDA_STANDARD 17 -> only works for 
         # CXX_STANDARD 17
 )
diff --git a/src/rmagine_cuda/include/rmagine/math/math.cuh b/src/rmagine_cuda/include/rmagine/math/math.cuh
index c7987e0..645085c 100644
--- a/src/rmagine_cuda/include/rmagine/math/math.cuh
+++ b/src/rmagine_cuda/include/rmagine/math/math.cuh
@@ -544,14 +544,26 @@ Memory<Matrix3x3, VRAM_CUDA> cov(
 );
 
 /**
- * @brief decompose A = UWV* using singular velue decomposition
+ * @brief decompose A = UWV* using singular value decomposition
  */
 void svd(
-    const MemoryView<rm::Matrix3x3, VRAM_CUDA>& As,
-    const MemoryView<rm::Matrix3x3, VRAM_CUDA>& Us,
-    const MemoryView<rm::Matrix3x3, VRAM_CUDA>& Ws,
-    const MemoryView<rm::Matrix3x3, VRAM_CUDA>& Vs
-    );
+    const MemoryView<Matrix3x3, VRAM_CUDA>& As,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Us,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Ws,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Vs
+);
+
+/**
+ * @brief decompose A = UWV* using singular value decomposition
+ * 
+ * w is a vector which is the diagonal of matrix W
+ */
+void svd(
+    const MemoryView<Matrix3x3, VRAM_CUDA>& As,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Us,
+    MemoryView<Vector3, VRAM_CUDA>& ws,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Vs
+);
 
 
 } // namespace rmagine
diff --git a/src/rmagine_cuda/include/rmagine/util/cuda/CudaDebug.hpp b/src/rmagine_cuda/include/rmagine/util/cuda/CudaDebug.hpp
index 2bd29b8..faa4855 100644
--- a/src/rmagine_cuda/include/rmagine/util/cuda/CudaDebug.hpp
+++ b/src/rmagine_cuda/include/rmagine/util/cuda/CudaDebug.hpp
@@ -52,4 +52,16 @@ void cudaAssert(
    const char* func,
    int line);
 
+#ifdef NDEBUG
+    #define RM_CUDA_DEBUG() 
+#else  // NDEBUG
+    #define RM_CUDA_DEBUG() \
+    cudaError_t err = cudaGetLastError(); \
+    if (err != cudaSuccess)  \
+    { \
+        printf("Error: %s\n", cudaGetErrorString(err)); \
+        throw std::runtime_error(cudaGetErrorString(err)); \
+    }
+#endif // defined NDEBUG
+
 #endif // RMAGINE_UTIL_CUDA_DEBUG_HPP
\ No newline at end of file
diff --git a/src/rmagine_core/src/math/SVD2.cpp b/src/rmagine_cuda/src/math/linalg.cu
similarity index 66%
rename from src/rmagine_core/src/math/SVD2.cpp
rename to src/rmagine_cuda/src/math/linalg.cu
index ffc2bd0..17c846d 100644
--- a/src/rmagine_core/src/math/SVD2.cpp
+++ b/src/rmagine_cuda/src/math/linalg.cu
@@ -1,443 +1,37 @@
-#include "rmagine/math/SVD2.hpp"
-#include "rmagine/types/Memory.hpp"
-#include <assert.h>
-// #include <Eigen/Dense>
+#include <rmagine/math/types.h>
+#include <rmagine/math/linalg.h>
+#include <cuda_runtime.h>
 
 namespace rmagine
 {
 
-SVD2::SVD2()
-{
-    // eps = std::numeric_limits<float>::epsilon();
-    // decompose();
-    // reorder();
-    // tsh = 0.5 * sqrt(m + n + 1.) * w(0) * eps;
-}
-
-SVD2::~SVD2()
-{
-    
-}
-
-void SVD2::decompose(
-    const Matrix3x3& a)
-{
-    bool flag;
-    int i, its, j, jj, k, l, nm;
-    float anorm,c,f,g,h,s,scale,x,y,z;
-    Vector3 rv1;
-    g = scale = anorm = 0.0;
-
-    const int m = 3;
-    const int n = 3;
-
-    eps = std::numeric_limits<float>::epsilon();
-    u = a;
-
-    for(i=0; i < n; i++) 
-    {
-        l = i+2;
-        rv1(i) = scale*g;
-        g = s = scale = 0.0;
-        if(i < m) 
-        {
-            for(k=i; k<m; k++) 
-            {
-                scale += abs(u(k,i));
-            }
-            if(scale != 0.0) 
-            {
-                for(k=i; k<m; k++) 
-                {
-                    u(k,i) /= scale;
-                    s += u(k,i) * u(k,i);
-                }
-                f = u(i,i);
-                g = -SIGN(sqrt(s),f);
-                h = f*g-s;
-                u(i,i) = f-g;
-                for(j=l-1;j<n;j++) 
-                {
-                    for (s=0.0,k=i;k<m;k++) 
-                    {
-                        s += u(k,i) * u(k,j);
-                    }
-                    f = s/h;
-                    for (k=i;k<m;k++)
-                    {
-                        u(k,j) += f * u(k,i);
-                    }
-                }
-                for (k=i;k<m;k++)
-                {
-                    u(k,i) *= scale;
-                }
-            }
-        }
-        w(i) = scale * g;
-        g = s = scale = 0.0;
-        if(i+1 <= m && i+1 != n)
-        {
-            for(k=l-1; k<n; k++) 
-            {
-                scale += abs(u(i,k));
-            }
-            if(scale != 0.0)
-            {
-                for(k=l-1;k<n;k++) 
-                {
-                    u(i, k) /= scale;
-                    s += u(i,k) * u(i,k);
-                }
-                f = u(i, l-1);
-                g = -SIGN(sqrt(s),f);
-                h = f*g-s;
-                u(i,l-1) = f-g;
-                for(k=l-1;k<n;k++)
-                {
-                    rv1(k) = u(i,k) / h;
-                }
-                for(j=l-1; j<m; j++)
-                {
-                    for (s=0.0,k=l-1; k<n; k++)
-                    {
-                        s += u(j,k) * u(i,k);
-                    }
-                    for (k=l-1; k<n;k++)
-                    {
-                        u(j,k) += s * rv1(k);
-                    }
-                }
-                for(k=l-1; k<n; k++)
-                {
-                    u(i,k) *= scale;
-                }
-            }
-        }
-        anorm = MAX(anorm, (abs(w(i))+abs(rv1(i))));
-    }
-    for(i=n-1; i>=0; i--)
-    {
-        if(i < n-1) 
-        {
-            if(g != 0.0)
-            {
-                for(j=l; j<n; j++)
-                {
-                    v(j,i) = (u(i,j)/u(i,l)) / g;
-                }
-                for(j=l; j<n; j++)
-                {
-                    for (s=0.0,k=l;k<n;k++) 
-                    {
-                        s += u(i,k) * v(k,j);
-                    }
-                    for (k=l; k<n; k++)
-                    {
-                        v(k,j) += s * v(k,i);
-                    }
-                }
-            }
-            for (j=l; j<n; j++) 
-            {
-                v(i,j) = 0.0;
-                v(j,i) = 0.0;
-            }
-        }
-        v(i,i) = 1.0;
-        g = rv1(i);
-        l = i;
-    }
-    for(i=MIN(m,n)-1; i>=0; i--)
-    {
-        l = i+1;
-        g = w(i);
-        for(j=l;j<n;j++) 
-        {
-            u(i,j) = 0.0;
-        }
-        if(g != 0.0) 
-        {
-            g = 1.0/g;
-            for(j=l;j<n;j++)
-            {
-                for (s=0.0,k=l; k<m; k++) 
-                {
-                    s += u(k,i)*u(k,j);
-                }
-                f = (s/u(i,i)) * g;
-                for (k=i; k<m; k++) 
-                {
-                    u(k,j) += f * u(k,i);
-                }
-            }
-            for(j=i;j<m;j++) 
-            {
-                u(j,i) *= g;
-            }
-        } else {
-            for(j=i;j<m;j++) 
-            {
-                u(j,i) = 0.0;
-            } 
-        }
-        ++u(i,i);
-    }
-    for(k=n-1; k>=0; k--) 
-    {
-        for(its=0; its<30; its++) 
-        {
-            flag=true;
-            for(l=k; l>=0; l--) 
-            {
-                nm=l-1;
-                if (l == 0 || abs(rv1(l)) <= eps*anorm) {
-                    flag=false;
-                    break;
-                }
-                if (abs(w(nm)) <= eps*anorm) 
-                {
-                    break;
-                }
-            }
-            if(flag) 
-            {
-                c=0.0;
-                s=1.0;
-                for(i=l; i<k+1; i++) 
-                {
-                    f = s*rv1(i);
-                    rv1(i) = c*rv1(i);
-                    if(abs(f) <= eps*anorm) 
-                    {
-                        break;
-                    }
-                    g = w(i);
-                    h = PYTHAG(f,g);
-                    w(i) = h;
-                    h = 1.0/h;
-                    c = g*h;
-                    s = -f*h;
-                    for(j=0; j<m; j++)
-                    {
-                        y = u(j,nm);
-                        z = u(j,i);
-                        u(j,nm) = y*c+z*s;
-                        u(j,i) = z*c-y*s;
-                    }
-                }
-            }
-            z = w(k);
-            if (l == k)
-            {
-                if (z < 0.0)
-                {
-                    w(k) = -z;
-                    for (j=0;j<n;j++) 
-                    {
-                        v(j,k) = -v(j,k);
-                    }
-                }
-                break;
-            }
-            if (its == 29) 
-            {
-                throw std::runtime_error("no convergence in 30 svdcmp iterations");
-            }
-            x = w(l);
-            nm = k-1;
-            y = w(nm);
-            g = rv1(nm);
-            h = rv1(k);
-            f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.0*h*y);
-            g = PYTHAG(f, 1.0f);
-            f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
-            c = s = 1.0;
-            for (j=l;j<=nm;j++) 
-            {
-                i = j+1;
-                g = rv1(i);
-                y = w(i);
-                h = s*g;
-                g = c*g;
-                z = PYTHAG(f,h);
-                rv1(j) = z;
-                c = f/z;
-                s = h/z;
-                f = x*c+g*s;
-                g = g*c-x*s;
-                h = y*s;
-                y *= c;
-                for (jj=0;jj<n;jj++)
-                {
-                    x = v(jj,j);
-                    z = v(jj,i);
-                    v(jj,j) = x*c+z*s;
-                    v(jj,i) = z*c-x*s;
-                }
-                z = PYTHAG(f,h);
-                w(j) = z;
-                if (z) 
-                {
-                    z = 1.0/z;
-                    c = f*z;
-                    s = h*z;
-                }
-                f = c*g+s*y;
-                x = c*y-s*g;
-                for (jj=0;jj<m;jj++)
-                {
-                    y = u(jj,j);
-                    z = u(jj,i);
-                    u(jj,j) = y*c+z*s;
-                    u(jj,i) = z*c-y*s;
-                }
-            }
-            rv1(l) = 0.0;
-            rv1(k) = f;
-            w(k) = x;
-        }
-    }
-
-    reorder();
-    tsh = 0.5 * sqrt(m + n + 1.) * w(0) * eps;
-}
-
-void SVD2::reorder()
-{
-    int i,j,k,s, inc=1;
-    float sw;
-    Vector3 su, sv;
-
-    const int m = 3;
-    const int n = 3;
-
-    do 
-    { 
-        inc *= 3; 
-        inc++; 
-    } while (inc <= n);
-
-    do 
-    {
-        inc /= 3;
-        for (i=inc; i<n; i++)
-        {
-            sw = w(i);
-            for (k=0;k<m;k++) 
-            {
-                su(k) = u(k,i);
-            }
-            for (k=0;k<n;k++) 
-            {
-                sv(k) = v(k,i);
-            }
-            j = i;
-            while (w(j-inc) < sw) 
-            {
-                w(j) = w(j-inc);
-                for(k=0; k<m; k++) 
-                {
-                    u(k,j) = u(k,j-inc);
-                }
-                for (k=0;k<n;k++)
-                {
-                    v(k,j) = v(k,j-inc);
-                }
-                j -= inc;
-                if (j < inc)
-                {
-                    break;
-                }
-            }
-            w(j) = sw;
-            for (k=0; k<m; k++)
-            {
-                u(k,j) = su(k);
-            }
-            for (k=0; k<n; k++)
-            {
-                v(k,j) = sv(k);
-            }
-        }
-    } while (inc > 1);
-
-    for(k=0; k<n; k++) 
-    {
-        s=0;
-        for(i=0; i<m; i++) 
-        {
-            if (u(i,k) < 0.)
-            {
-                s++;
-            }
-        }
-        for(j=0; j<n; j++) 
-        {
-            if (v(j,k) < 0.) 
-            { 
-                s++;
-            }
-        }
-        if(s > (m+n)/2) 
-        {
-            for (i=0; i<m; i++) 
-            {
-                u(i,k) = -u(i,k);
-            }
-            for (j=0;j<n;j++) 
-            {
-                v(j,k) = -v(j,k);
-            }
-        }
-    }
-}
-
-// int SVD2::rank(float thresh)
-// {
-
-//     return 0;
-// }
-
-// int SVD2::nullity(float thresh)
-// {
-    
-//     return 0;
-// }
-
-// Matrix3x3 SVD2::range(float thresh)
-// {
-
-//     return Matrix3x3::Zeros();
-// }
-
-// Matrix3x3 SVD2::nullspace(float thresh)
-// {
-
-//     return Matrix3x3::Zeros();
-// }
-
-RMAGINE_FUNCTION
+RMAGINE_DEVICE_FUNCTION
 void svd(
     const Matrix3x3& a, 
     Matrix3x3& u,
     Matrix3x3& w,
     Matrix3x3& v)
 {
+    // printf("SVDD\n");
+
+    
     // TODO: test
-    constexpr unsigned int m = 3;
-    constexpr unsigned int n = 3;
-    constexpr unsigned int max_iterations = 20;
+    const unsigned int max_iterations = 20;
     
     // additional memory required
     bool flag;
-    int its, j, jj, k, nm;
+    int its, j, jj;
     float anorm, c, f, g, h, s, scale, x, y, z;
     
-    Vector3 rv1 = Vector3::Zeros();
+    Vector3 rv1;
+    rv1.x = 0.0;
+    rv1.y = 0.0;
+    rv1.z = 0.0;
+
+    
     
     g = s = scale = anorm = 0.0;
-    float eps = std::numeric_limits<float>::epsilon();
+    const float eps = __FLT_EPSILON__;
     u = a;
 
     // FIRST PART
@@ -555,6 +149,8 @@ void svd(
 
     anorm = MAX(anorm, (abs(w(1, 1)) + abs(rv1.y)));
     
+    
+
     rv1.z = scale * g;
 
     scale = abs(u(2, 2));
@@ -683,36 +279,30 @@ void svd(
 
     int i, l;
 
+    
+
     // PART 4: Opti
 
     // k = 2;
-    for(its=0; its<max_iterations; its++) 
+    for(int its=0; its<max_iterations; its++) 
     {
-        // flag=true;
-        // l = 2;
-        // if(MIN(fabs(rv1.z), fabs(w(1,1))) > eps*anorm)
-        // {
-        //     l = 1;
-        //     if(MIN(fabs(rv1.y),abs(w(0,0))) > eps*anorm)
-        //     {
-        //         l = 0;
-        //     }
-        // }
+        // printf("First Iter %d/%d\n", its, max_iterations);
+        // return;
 
         flag=true;
         l=2;
-        if(abs(rv1.z) <= eps*anorm)
+        if(fabs(rv1.z) <= eps*anorm)
         {
             flag=false;
         }
-        else if(abs(w(1,1)) > eps*anorm)
+        else if(fabs(w(1,1)) > eps*anorm)
         {
             l=1;
-            if(abs(rv1.y) <= eps*anorm) 
+            if(fabs(rv1.y) <= eps*anorm) 
             {
                 flag=false;
             }
-            else if(abs(w(0,0)) > eps*anorm) 
+            else if(fabs(w(0,0)) > eps*anorm) 
             {
                 l=0;
                 flag = false;
@@ -727,7 +317,7 @@ void svd(
             {
                 f = s*rv1[i];
                 rv1[i] = c*rv1[i];
-                if(abs(f) <= eps*anorm) 
+                if(fabs(f) <= eps*anorm) 
                 {
                     break;
                 }
@@ -737,7 +327,7 @@ void svd(
                 h = 1.0/h;
                 c = g*h;
                 s = -f*h;
-                for(j=0; j<m; j++)
+                for(j=0; j<3; j++)
                 {
                     y = u(j,l-1);
                     z = u(j,i);
@@ -762,7 +352,7 @@ void svd(
         if(its == max_iterations - 1) 
         {
             // std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
-            throw std::runtime_error("no convergence in max svdcmp iterations");
+            // throw std::runtime_error("no convergence in max svdcmp iterations");
         }
         x = w(l,l);
         y = w(1,1);
@@ -804,7 +394,7 @@ void svd(
             }
             f = c*g+s*y;
             x = c*y-s*g;
-            for (jj=0;jj<m;jj++)
+            for (jj=0;jj<3;jj++)
             {
                 y = u(jj,j);
                 z = u(jj,i);
@@ -817,6 +407,7 @@ void svd(
         w(2,2) = x;
     }
 
+    
 
     for(its=0; its<max_iterations; its++) 
     {
@@ -850,7 +441,7 @@ void svd(
                 h = 1.0/h;
                 c = g*h;
                 s = -f*h;
-                for(j=0; j<m; j++)
+                for(j=0; j<3; j++)
                 {
                     y = u(j,l-1);
                     z = u(j,i);
@@ -874,8 +465,8 @@ void svd(
         }
         if(its == max_iterations - 1) 
         {
-            std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
-            throw std::runtime_error("no convergence in max svdcmp iterations");
+            // std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+            // throw std::runtime_error("no convergence in max svdcmp iterations");
         }
 
         x = w(l, l);
@@ -955,7 +546,7 @@ void svd(
 }
 
 
-RMAGINE_FUNCTION
+RMAGINE_DEVICE_FUNCTION
 void svd(
     const Matrix3x3& a, 
     Matrix3x3& u,
@@ -963,19 +554,19 @@ void svd(
     Matrix3x3& v)
 {
     // TODO: test
-    constexpr unsigned int m = 3;
-    constexpr unsigned int n = 3;
+    // constexpr unsigned int m = 3;
+    // constexpr unsigned int n = 3;
     constexpr unsigned int max_iterations = 20;
     
     // additional memory required
     bool flag;
-    int its, j, jj, k, nm;
+    int its, j, jj;
     float anorm, c, f, g, h, s, scale, x, y, z;
     
     Vector3 rv1 = Vector3::Zeros();
     
     g = s = scale = anorm = 0.0;
-    float eps = std::numeric_limits<float>::epsilon();
+    float eps = __FLT_EPSILON__;
     u = a;
 
     // FIRST PART
@@ -1275,7 +866,7 @@ void svd(
                 h = 1.0/h;
                 c = g*h;
                 s = -f*h;
-                for(j=0; j<m; j++)
+                for(j=0; j<3; j++)
                 {
                     y = u(j,l-1);
                     z = u(j,i);
@@ -1299,7 +890,7 @@ void svd(
         if(its == max_iterations - 1) 
         {
             // std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
-            throw std::runtime_error("no convergence in max svdcmp iterations");
+            // throw std::runtime_error("no convergence in max svdcmp iterations");
         }
         x = w(l);
         y = w.y;
@@ -1341,7 +932,7 @@ void svd(
             }
             f = c*g+s*y;
             x = c*y-s*g;
-            for (jj=0;jj<m;jj++)
+            for (jj=0;jj<3;jj++)
             {
                 y = u(jj,j);
                 z = u(jj,i);
@@ -1387,7 +978,7 @@ void svd(
                 h = 1.0/h;
                 c = g*h;
                 s = -f*h;
-                for(j=0; j<m; j++)
+                for(j=0; j<3; j++)
                 {
                     y = u(j,l-1);
                     z = u(j,i);
@@ -1411,8 +1002,8 @@ void svd(
         }
         if(its == max_iterations - 1) 
         {
-            std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
-            throw std::runtime_error("no convergence in max svdcmp iterations");
+        //     std::cout << "no convergence in " << max_iterations << " svdcmp iterations" << std::endl;
+        //     throw std::runtime_error("no convergence in max svdcmp iterations");
         }
 
         x = w(l);
diff --git a/src/rmagine_cuda/src/math/math.cu b/src/rmagine_cuda/src/math/math.cu
index f550290..ec02282 100644
--- a/src/rmagine_cuda/src/math/math.cu
+++ b/src/rmagine_cuda/src/math/math.cu
@@ -1,10 +1,13 @@
 #include "rmagine/math/math.cuh"
 #include "rmagine/math/math.h"
 #include "rmagine/math/types.h"
+#include "rmagine/math/linalg.h"
+#include "rmagine/util/cuda/CudaDebug.hpp"
 
 namespace rmagine 
 {
 
+
 ////////
 // Generic Kernel
 ///
@@ -508,6 +511,7 @@ void multNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     multNxN_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Quaternion, VRAM_CUDA> multNxN(
@@ -528,6 +532,7 @@ void multNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     multNxN_kernel<<<gridSize, blockSize>>>(A.raw(), b.raw(), c.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> multNxN(
@@ -548,6 +553,7 @@ void multNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (T1.size() + blockSize - 1) / blockSize;
     multNxN_kernel<<<gridSize, blockSize>>>(T1.raw(), T2.raw(), Tr.raw(), T1.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Transform, VRAM_CUDA> multNxN(
@@ -567,6 +573,7 @@ void multNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (T.size() + blockSize - 1) / blockSize;
     multNxN_kernel<<<gridSize, blockSize>>>(T.raw(), x.raw(), c.raw(), T.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> multNxN(
@@ -586,6 +593,7 @@ void multNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (M1.size() + blockSize - 1) / blockSize;
     multNxN_kernel<<<gridSize, blockSize>>>(M1.raw(), M2.raw(), Mr.raw(), M1.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> multNxN(
@@ -605,6 +613,7 @@ void multNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (M1.size() + blockSize - 1) / blockSize;
     multNxN_conv_kernel<<<gridSize, blockSize>>>(M1.raw(), M2.raw(), Qres.raw(), M1.size());
+    RM_CUDA_DEBUG();
 }
 
 void multNxN(
@@ -615,6 +624,7 @@ void multNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (M.size() + blockSize - 1) / blockSize;
     multNxN_kernel<<<gridSize, blockSize>>>(M.raw(), x.raw(), c.raw(), M.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> multNxN(
@@ -637,6 +647,7 @@ void multNx1(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     multNxN_kernel<<<gridSize, blockSize>>>(A.raw(), b.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Quaternion, VRAM_CUDA> multNx1(
@@ -657,6 +668,7 @@ void multNx1(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     multNx1_kernel<<<gridSize, blockSize>>>(A.raw(), b.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> multNx1(
@@ -676,6 +688,7 @@ void multNx1(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (T1.size() + blockSize - 1) / blockSize;
     multNx1_kernel<<<gridSize, blockSize>>>(T1.raw(), t2.raw(), Tr.raw(), T1.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Transform, VRAM_CUDA> multNx1(
@@ -695,6 +708,7 @@ void multNx1(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (T.size() + blockSize - 1) / blockSize;
     multNx1_kernel<<<gridSize, blockSize>>>(T.raw(), x.raw(), c.raw(), T.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> multNx1(
@@ -714,6 +728,7 @@ void multNx1(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (M1.size() + blockSize - 1) / blockSize;
     multNx1_kernel<<<gridSize, blockSize>>>(M1.raw(), m2.raw(), Mr.raw(), M1.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> multNx1(
@@ -733,6 +748,7 @@ void multNx1(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (M.size() + blockSize - 1) / blockSize;
     multNx1_kernel<<<gridSize, blockSize>>>(M.raw(), x.raw(), C.raw(), M.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> multNx1(
@@ -752,6 +768,7 @@ void multNx1(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (M.size() + blockSize - 1) / blockSize;
     multNx1_kernel<<<gridSize, blockSize>>>(M.raw(), x.raw(), C.raw(), M.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> multNx1(
@@ -774,6 +791,7 @@ void mult1xN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (B.size() + blockSize - 1) / blockSize;
     mult1xN_kernel<<<gridSize, blockSize>>>(a.raw(), B.raw(), C.raw(), B.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Quaternion, VRAM_CUDA> mult1xN(
@@ -793,6 +811,7 @@ void mult1xN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (B.size() + blockSize - 1) / blockSize;
     mult1xN_kernel<<<gridSize, blockSize>>>(a.raw(), B.raw(), C.raw(), B.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> mult1xN(
@@ -812,6 +831,7 @@ void mult1xN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (T2.size() + blockSize - 1) / blockSize;
     mult1xN_kernel<<<gridSize, blockSize>>>(t1.raw(), T2.raw(), Tr.raw(), T2.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Transform, VRAM_CUDA> mult1xN(
@@ -831,6 +851,7 @@ void mult1xN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (X.size() + blockSize - 1) / blockSize;
     mult1xN_kernel<<<gridSize, blockSize>>>(t.raw(), X.raw(), C.raw(), X.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> mult1xN(
@@ -850,6 +871,7 @@ void mult1xN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (M2.size() + blockSize - 1) / blockSize;
     mult1xN_kernel<<<gridSize, blockSize>>>(m1.raw(), M2.raw(), Mr.raw(), M2.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> mult1xN(
@@ -869,6 +891,7 @@ void mult1xN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (X.size() + blockSize - 1) / blockSize;
     mult1xN_kernel<<<gridSize, blockSize>>>(m.raw(), X.raw(), C.raw(), X.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> mult1xN(
@@ -888,6 +911,7 @@ void mult1xN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (X.size() + blockSize - 1) / blockSize;
     mult1xN_kernel<<<gridSize, blockSize>>>(m.raw(), X.raw(), C.raw(), X.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> mult1xN(
@@ -909,6 +933,7 @@ void addNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     addNxN_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> addNxN(
@@ -928,6 +953,7 @@ void addNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     addNxN_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<float, VRAM_CUDA> addNxN(
@@ -950,6 +976,7 @@ void subNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     subNxN_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> subNxN(
@@ -969,6 +996,7 @@ void subNx1(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     subNx1_kernel<<<gridSize, blockSize>>>(A.raw(), b.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> subNx1(
@@ -989,6 +1017,7 @@ void transpose(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     transpose_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> transpose(
@@ -1006,6 +1035,7 @@ void transpose(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     transpose_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix4x4, VRAM_CUDA> transpose(
@@ -1024,6 +1054,7 @@ void transposeInplace(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     transposeInplace_kernel<<<gridSize, blockSize>>>(A.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 //////
@@ -1035,6 +1066,7 @@ void invert(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     invert_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> invert(
@@ -1052,6 +1084,7 @@ void invert(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     invert_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix4x4, VRAM_CUDA> invert(
@@ -1069,6 +1102,7 @@ void invert(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     invert_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Transform, VRAM_CUDA> invert(
@@ -1089,6 +1123,7 @@ void divNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     divNxN_conv_kernel<float><<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> divNxN(
@@ -1108,6 +1143,7 @@ void divNxN(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     divNxN_conv_kernel<float><<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> divNxN(
@@ -1129,6 +1165,7 @@ void divNxNIgnoreZeros(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     divNxNIgnoreZeros_conv_kernel<float><<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> divNxNIgnoreZeros(
@@ -1148,6 +1185,7 @@ void divNxNIgnoreZeros(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     divNxNIgnoreZeros_conv_kernel<float><<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> divNxNIgnoreZeros(
@@ -1167,6 +1205,7 @@ void divNxNIgnoreZeros(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     divNxNIgnoreZeros_conv_kernel<float><<<gridSize, blockSize>>>(A.raw(), B.raw(), C.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<float, VRAM_CUDA> divNxNIgnoreZeros(
@@ -1187,6 +1226,7 @@ void divNxNInplace(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     divNxNInplace_kernel<<<gridSize, blockSize>>>(A.raw(), B.raw(), A.size());
+    RM_CUDA_DEBUG();
 }
 
 void divNxNInplace(
@@ -1207,6 +1247,7 @@ void divNx1Inplace(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (A.size() + blockSize - 1) / blockSize;
     divNx1Inplace_kernel<<<gridSize, blockSize>>>(A.raw(), B, A.size());
+    RM_CUDA_DEBUG();
 }
 
 void divNx1Inplace(
@@ -1228,6 +1269,7 @@ void convert(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (from.size() + blockSize - 1) / blockSize;
     convert_kernel<<<gridSize, blockSize>>>(from.raw(), to.raw(), from.size());
+    RM_CUDA_DEBUG();
 }
 
 void convert(
@@ -1245,6 +1287,7 @@ void copy(const MemoryView<unsigned int, VRAM_CUDA>& from,
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (from.size() + blockSize - 1) / blockSize;
     convert_kernel<<<gridSize, blockSize>>>(from.raw(), to.raw(), from.size());
+    RM_CUDA_DEBUG();
 }
 
 ////////
@@ -1267,6 +1310,7 @@ void pack(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (R.size() + blockSize - 1) / blockSize;
     pack_kernel<<<gridSize, blockSize>>>(R.raw(), t.raw(), T.raw(), R.size());
+    RM_CUDA_DEBUG();
 }
 
 ////////
@@ -1279,6 +1323,7 @@ void multNxNTransposed(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (m1.size() + blockSize - 1) / blockSize;
     covParts_kernel<<<gridSize, blockSize>>>(m1.raw(), m2.raw(), Cs.raw(), m1.size());
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> multNxNTransposed(
@@ -1299,6 +1344,7 @@ void multNxNTransposed(
     constexpr unsigned int blockSize = 64;
     const unsigned int gridSize = (m1.size() + blockSize - 1) / blockSize;
     covParts_kernel<<<gridSize, blockSize>>>(m1.raw(), m2.raw(), mask.raw(), Cs.raw(), m1.size());
+    RM_CUDA_DEBUG();
 }
     
 Memory<Matrix3x3, VRAM_CUDA> multNxNTransposed(
@@ -1318,6 +1364,7 @@ void normalizeInplace(MemoryView<Quaternion, VRAM_CUDA>& q)
     constexpr unsigned int blockSize = 1024;
     const unsigned int gridSize = (q.size() + blockSize - 1) / blockSize;
     normalizeInplace_kernel<<<gridSize, blockSize>>>(q.raw(), q.size());
+    RM_CUDA_DEBUG();
 }
 
 ///////
@@ -1340,6 +1387,7 @@ void setIdentity(MemoryView<Quaternion, VRAM_CUDA>& qs)
     constexpr unsigned int blockSize = 1024;
     const unsigned int gridSize = (qs.size() + blockSize - 1) / blockSize;
     setIdentity_kernel<<<gridSize, blockSize>>>(qs.raw(), qs.size());
+    RM_CUDA_DEBUG();
 }
 
 void setIdentity(MemoryView<Transform, VRAM_CUDA>& Ts)
@@ -1347,6 +1395,7 @@ void setIdentity(MemoryView<Transform, VRAM_CUDA>& Ts)
     constexpr unsigned int blockSize = 1024;
     const unsigned int gridSize = (Ts.size() + blockSize - 1) / blockSize;
     setIdentity_kernel<<<gridSize, blockSize>>>(Ts.raw(), Ts.size());
+    RM_CUDA_DEBUG();
 }
 
 void setIdentity(MemoryView<Matrix3x3, VRAM_CUDA>& Ms)
@@ -1354,6 +1403,7 @@ void setIdentity(MemoryView<Matrix3x3, VRAM_CUDA>& Ms)
     constexpr unsigned int blockSize = 1024;
     const unsigned int gridSize = (Ms.size() + blockSize - 1) / blockSize;
     setIdentity_kernel<<<gridSize, blockSize>>>(Ms.raw(), Ms.size());
+    RM_CUDA_DEBUG();
 }
 
 void setIdentity(MemoryView<Matrix4x4, VRAM_CUDA>& Ms)
@@ -1361,6 +1411,7 @@ void setIdentity(MemoryView<Matrix4x4, VRAM_CUDA>& Ms)
     constexpr unsigned int blockSize = 1024;
     const unsigned int gridSize = (Ms.size() + blockSize - 1) / blockSize;
     setIdentity_kernel<<<gridSize, blockSize>>>(Ms.raw(), Ms.size());
+    RM_CUDA_DEBUG();
 }
 
 void setZeros(MemoryView<Matrix3x3, VRAM_CUDA>& Ms)
@@ -1381,6 +1432,7 @@ void sum(
     MemoryView<Vector, VRAM_CUDA>& s)
 {
     sum_kernel<1024> <<<1, 1024>>>(data.raw(), s.raw(), data.size() );
+    RM_CUDA_DEBUG();
 }
 
 Memory<Vector, VRAM_CUDA> sum(
@@ -1417,6 +1469,7 @@ void cov(
     MemoryView<Matrix3x3, VRAM_CUDA>& C)
 {
     cov_kernel<1024> <<<1, 1024>>>(v1.raw(), v2.raw(), C.raw(), v1.size() );
+    RM_CUDA_DEBUG();
 }
 
 Memory<Matrix3x3, VRAM_CUDA> cov(
@@ -1429,15 +1482,62 @@ Memory<Matrix3x3, VRAM_CUDA> cov(
     return C;
 }
 
+
+__global__ void svd_kernel(
+    const Matrix3x3* As,
+    Matrix3x3* Us,
+    Matrix3x3* Ws,
+    Matrix3x3* Vs, 
+    unsigned int N)
+{
+    const unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;
+    // printf("hello\n");
+    if(id < N)
+    {
+        svd(As[id], Us[id], Ws[id], Vs[id]);
+    }
+}
+
 void svd(
-    const MemoryView<rm::Matrix3x3, VRAM_CUDA>& As,
-    const MemoryView<rm::Matrix3x3, VRAM_CUDA>& Us,
-    const MemoryView<rm::Matrix3x3, VRAM_CUDA>& Ws,
-    const MemoryView<rm::Matrix3x3, VRAM_CUDA>& Vs
+    const MemoryView<Matrix3x3, VRAM_CUDA>& As,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Us,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Ws,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Vs
 )
 {
-    std::cout << "CUDA SVD!" << std::endl;
+    // std::cout << "SVD CUDA!" << std::endl;
+    constexpr unsigned int blockSize = 512;
+    const unsigned int gridSize = (As.size() + blockSize - 1) / blockSize;
+    svd_kernel<<<gridSize, blockSize>>>(As.raw(), Us.raw(), Ws.raw(), Vs.raw(), As.size());
+    RM_CUDA_DEBUG();
+}
+
+__global__ void svd_kernel(
+    const Matrix3x3* As,
+    Matrix3x3* Us,
+    Vector3* ws,
+    Matrix3x3* Vs,
+    unsigned int N)
+{
+    const unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;
+    // printf("hello\n");
+    if(id < N)
+    {
+        svd(As[id], Us[id], ws[id], Vs[id]);
+    }
 }
 
+void svd(
+    const MemoryView<Matrix3x3, VRAM_CUDA>& As,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Us,
+    MemoryView<Vector3, VRAM_CUDA>& ws,
+    MemoryView<Matrix3x3, VRAM_CUDA>& Vs
+)
+{
+    constexpr unsigned int blockSize = 512;
+    const unsigned int gridSize = (As.size() + blockSize - 1) / blockSize;
+    svd_kernel<<<gridSize, blockSize>>>(As.raw(), Us.raw(), ws.raw(), Vs.raw(), As.size());
+    RM_CUDA_DEBUG();
+}
 
 } // namespace rmagine
\ No newline at end of file
diff --git a/src/rmagine_ouster/CMakeLists.txt b/src/rmagine_ouster/CMakeLists.txt
index 8e3e830..d90c2ca 100644
--- a/src/rmagine_ouster/CMakeLists.txt
+++ b/src/rmagine_ouster/CMakeLists.txt
@@ -24,7 +24,7 @@ target_link_libraries(rmagine-ouster
     Eigen3::Eigen
     ${Boost_LIBRARIES}
     ${OpenMP_CXX_LIBRARIES}
-    JsonCpp::JsonCpp
+    jsoncpp
 )
 
 set_target_properties(rmagine-ouster
diff --git a/tests/core/math_svd.cpp b/tests/core/math_svd.cpp
index 07da8e9..974457c 100644
--- a/tests/core/math_svd.cpp
+++ b/tests/core/math_svd.cpp
@@ -9,7 +9,7 @@
 #include <rmagine/util/exceptions.h>
 #include <rmagine/util/prints.h>
 
-#include <rmagine/math/SVD2.hpp>
+#include <rmagine/math/linalg.h>
 
 #include <Eigen/Dense>
 
@@ -275,7 +275,7 @@ void parallelTest()
     double el_eigen, el_rmagine, el_rmagine2;
 
     sw();
-    // #pragma omp parallel for
+    #pragma omp parallel for
     for(size_t obj_id=0; obj_id<num_objects; obj_id++)
     {
         Eigen::JacobiSVD<Eigen::Matrix3f> svdeig(covs_eigen[obj_id], 
@@ -305,7 +305,7 @@ void parallelTest()
     rm::Memory<rm::Matrix3x3> res_rm(num_objects);
 
     sw();
-    // #pragma omp parallel for
+    #pragma omp parallel for
     for(size_t obj_id=0; obj_id<num_objects; obj_id++)
     {
         rm::Matrix3x3 Urm = rm::Matrix3x3::Zeros();
@@ -329,6 +329,71 @@ void parallelTest()
     std::cout << "- summed error: " << err_rmagine << std::endl;
 }
 
+void parallelTest2()
+{
+    std::cout << "parallelTest2" << std::endl;
+
+    size_t num_objects = 1000000;
+    std::cout << "parallelTest. Computing SVD of " << num_objects << " 3x3 matrices" << std::endl;
+    // correct num_objects objects in parallel
+    
+
+    std::vector<Eigen::Matrix3f> covs_eigen(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> covs_rm(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> Us(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> Ws(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> Vs(num_objects);
+
+    for(size_t obj_id=0; obj_id<num_objects; obj_id++)
+    {
+        rm::Matrix3x3 Arm;
+        Eigen::Matrix3f Aeig = Eigen::Matrix3f::Random(3, 3);
+        for(size_t i=0; i<3; i++)
+        {
+            for(size_t j=0; j<3; j++)
+            {
+                Arm(i, j) = Aeig(i, j);
+            }
+        }
+
+        covs_eigen[obj_id] = Aeig;
+        covs_rm[obj_id] = Arm;
+        Us[obj_id] = rm::Matrix3x3::Zeros();
+        Ws[obj_id] = rm::Matrix3x3::Zeros();
+        Vs[obj_id] = rm::Matrix3x3::Zeros();
+    }
+
+    std::cout << "First Mat: " << std::endl;
+    std::cout << covs_rm[0] << std::endl;
+
+
+    rm::StopWatch sw;
+    double el_rmagine;
+
+
+    sw();
+    svd(covs_rm, Us, Ws, Vs);
+    el_rmagine = sw();
+    
+
+    rm::Memory<rm::Matrix3x3> res_rm(num_objects);
+    for(size_t obj_id=0; obj_id<num_objects; obj_id++)
+    {
+        auto uvt_rm = Us[obj_id] * Ws[obj_id] * Vs[obj_id].T();
+        res_rm[obj_id] = uvt_rm;
+    }
+    
+    float err_rmagine = 0.0;
+    for(size_t obj_id = 0; obj_id < num_objects; obj_id++)
+    {
+        err_rmagine += compute_error(covs_rm[obj_id], res_rm[obj_id]);
+    }
+    
+    std::cout << "Rmagine:" << std::endl;
+    std::cout << "- run time: " << el_rmagine << " s" << std::endl;
+    std::cout << "- summed error: " << err_rmagine << std::endl;
+}
+
 int main(int argc, char** argv)
 {
     srand((unsigned int) time(0));
@@ -339,6 +404,7 @@ int main(int argc, char** argv)
 
 
     parallelTest();
+    parallelTest2();
 
 
     return 0;
diff --git a/tests/cuda/math_svd.cpp b/tests/cuda/math_svd.cpp
index 873e400..7cc7289 100644
--- a/tests/cuda/math_svd.cpp
+++ b/tests/cuda/math_svd.cpp
@@ -1,20 +1,52 @@
-#include <rmagine/math/SVD2.hpp>
-
+#include <rmagine/math/linalg.h>
+#include <rmagine/util/prints.h>
 #include <rmagine/math/math_batched.cuh>
+#include <rmagine/util/StopWatch.hpp>
 
 #include <Eigen/Dense>
 
+#include <cuda_runtime.h>
+
 namespace rm = rmagine;
 
+float compute_error(Eigen::Matrix3f gt, Eigen::Matrix3f m)
+{
+    float ret = 0.0;
+    for(size_t i=0; i<3; i++)
+    {
+        for(size_t j=0; j<3; j++)
+        {
+            ret += abs(gt(i, j) - m(i, j));
+        }
+    }
+    return ret;
+}
+
+float compute_error(rm::Matrix3x3 gt, rm::Matrix3x3 m)
+{
+    float ret = 0.0;
+    for(size_t i=0; i<3; i++)
+    {
+        for(size_t j=0; j<3; j++)
+        {
+            ret += abs(gt(i, j) - m(i, j));
+        }
+    }
+    return ret;
+}
 
 void parallelTest()
 {
-    std::cout << "parallelTest" << std::endl;
-    // correct num_objects objects in parallel
     size_t num_objects = 1000000;
+    std::cout << "parallelTest. Computing SVD of " << num_objects << " 3x3 matrices" << std::endl;
+    // correct num_objects objects in parallel
+    
 
     std::vector<Eigen::Matrix3f> covs_eigen(num_objects);
     rm::Memory<rm::Matrix3x3, rm::RAM> covs_rm(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> Us(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> Ws(num_objects);
+    rm::Memory<rm::Matrix3x3, rm::RAM> Vs(num_objects);
 
     for(size_t obj_id=0; obj_id<num_objects; obj_id++)
     {
@@ -30,17 +62,50 @@ void parallelTest()
 
         covs_eigen[obj_id] = Aeig;
         covs_rm[obj_id] = Arm;
+        Us[obj_id] = rm::Matrix3x3::Zeros();
+        Ws[obj_id] = rm::Matrix3x3::Zeros();
+        Vs[obj_id] = rm::Matrix3x3::Zeros();
     }
 
     std::cout << "First Mat: " << std::endl;
     std::cout << covs_rm[0] << std::endl;
 
-
-    rm::Memory<rm::Matrix3x3, rm::VRAM_CUDA> covs_rm_ = covs_rm;
-
-
+    rm::StopWatch sw;
+    double el_rmagine;
 
 
+    // upload
+    rm::Memory<rm::Matrix3x3, rm::VRAM_CUDA> covs_rm_ = covs_rm;
+    rm::Memory<rm::Matrix3x3, rm::VRAM_CUDA> Us_ = Us;
+    rm::Memory<rm::Matrix3x3, rm::VRAM_CUDA> Ws_ = Ws;
+    rm::Memory<rm::Matrix3x3, rm::VRAM_CUDA> Vs_ = Vs;
+
+    sw();
+    svd(covs_rm_, Us_, Ws_, Vs_);
+    cudaDeviceSynchronize();
+    el_rmagine = sw();
+    
+    // download
+    Us = Us_;
+    Ws = Ws_;
+    Vs = Vs_;
+
+    rm::Memory<rm::Matrix3x3> res_rm(num_objects);
+    for(size_t obj_id=0; obj_id<num_objects; obj_id++)
+    {
+        auto uvt_rm = Us[obj_id] * Ws[obj_id] * Vs[obj_id].T();
+        res_rm[obj_id] = uvt_rm;
+    }
+    
+    float err_rmagine = 0.0;
+    for(size_t obj_id = 0; obj_id < num_objects; obj_id++)
+    {
+        err_rmagine += compute_error(covs_rm[obj_id], res_rm[obj_id]);
+    }
+    
+    std::cout << "Rmagine:" << std::endl;
+    std::cout << "- run time: " << el_rmagine << " s" << std::endl;
+    std::cout << "- summed error: " << err_rmagine << std::endl;
 }
 
 int main(int argc, char** argv)

From a6494b9667cb522d94151403814e324ed2c36ac0 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Thu, 12 Sep 2024 01:51:12 +0200
Subject: [PATCH 42/44] upgrade version

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0de0e27..e0455c4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.16)
 project(rmagine 
     LANGUAGES CXX C
-    VERSION 2.2.5)# TODO update this version when merging into main-branch
+    VERSION 2.2.6)# TODO update this version when merging into main-branch
 
 option(BUILD_TOOLS "Build tools" ON)
 option(BUILD_TESTS "Build tests" ON)

From 9d1d577bbec7e24da6254198b18368853fc9c579 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Thu, 12 Sep 2024 02:06:55 +0200
Subject: [PATCH 43/44] changed bibtex style

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9c89873..638be3c 100644
--- a/README.md
+++ b/README.md
@@ -116,7 +116,7 @@ More detailed examples explaining each step and how to customize it to your need
 
 Please reference the following papers when using the Rmagine library in your scientific work.
 
-```latex
+```bib
 @inproceedings{mock2023rmagine,
   title={{Rmagine: 3D Range Sensor Simulation in Polygonal Maps via Ray Tracing for Embedded Hardware on Mobile Robots}}, 
   author={Mock, Alexander and Wiemann, Thomas and Hertzberg, Joachim},

From 427f134d85c927fe5d2bafbf7ad05708437234f9 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Thu, 12 Sep 2024 03:01:22 +0200
Subject: [PATCH 44/44] linked rmagine viewer in README

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 638be3c..0eb836b 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,8 @@ Rmagine
   <a href="https://github.com/uos/rmagine/issues">Issues</a>
   <span>&nbsp;&nbsp;•&nbsp;&nbsp;</span>
   <a href="https://github.com/amock/rmagine_examples">Examples</a>
+  <span>&nbsp;&nbsp;•&nbsp;&nbsp;</span>
+  <a href="https://github.com/amock/rmagine_viewer">Viewer</a>
   <br />
 </div>
 
@@ -127,6 +129,7 @@ Please reference the following papers when using the Rmagine library in your sci
 ```
 
 ## Rmagine-accelerated Applications
+- [rmagine_viewer](https://github.com/amock/rmagine_viewer)
 - [rmagine_gazebo_plugins](https://github.com/uos/rmagine_gazebo_plugins)
 - [RMCL](https://github.com/uos/rmcl)
 - [radarays_ros](https://github.com/uos/radarays_ros)