From eb788fb684300f281936c951368729f84913a462 Mon Sep 17 00:00:00 2001
From: "saru@southpole" <saurabhsawant@lbl.gov>
Date: Tue, 19 Nov 2024 18:24:27 -0800
Subject: [PATCH 1/4] added decimation technique, verified with
 all_around_metal

---
 Source/Code_Definitions.H                     |   1 +
 Source/Solver/Transport/NEGF/CNT.H            |   7 +-
 Source/Solver/Transport/NEGF/CNT.cpp          |  59 ++-
 Source/Solver/Transport/NEGF/Graphene.H       |   4 +-
 Source/Solver/Transport/NEGF/Matrix_Block.H   |  45 +-
 .../Solver/Transport/NEGF/Matrix_Block_Util.H | 490 +++++++++++-------
 Source/Solver/Transport/NEGF/NEGF_Common.H    |  33 +-
 Source/Solver/Transport/NEGF/NEGF_Common.cpp  | 166 +++++-
 Source/Solver/Transport/Transport.cpp         |   7 +-
 Source/Utils/CodeUtils/MathLib.H              |  18 +
 Source/Utils/CodeUtils/MathLib.cpp            | 195 +++++++
 Source/Utils/CodeUtils/cudaErrorCheck.H       |  18 +
 .../decimation_technique/Source/main.cpp      |  10 +-
 13 files changed, 821 insertions(+), 232 deletions(-)
 create mode 100644 Source/Utils/CodeUtils/MathLib.H
 create mode 100644 Source/Utils/CodeUtils/MathLib.cpp
 create mode 100644 Source/Utils/CodeUtils/cudaErrorCheck.H
diff --git a/Source/Code_Definitions.H b/Source/Code_Definitions.H
index d051ace..d446473 100644
--- a/Source/Code_Definitions.H
+++ b/Source/Code_Definitions.H
@@ -6,6 +6,7 @@
 #define VFRAC_THREASHOLD 1e-5
 
 #define NUM_MODES 1
+#define BLOCK_SIZE 1
 #define NUM_CONTACTS 2
 #define NUM_ENERGY_PTS_REAL 10
 
diff --git a/Source/Solver/Transport/NEGF/CNT.H b/Source/Solver/Transport/NEGF/CNT.H
index 94ed50d..f354852 100644
--- a/Source/Solver/Transport/NEGF/CNT.H
+++ b/Source/Solver/Transport/NEGF/CNT.H
@@ -10,9 +10,9 @@
 
 #include "NEGF_Common.H"
 
-class c_CNT : public c_NEGF_Common<ComplexType[NUM_MODES]>
+class c_CNT : public c_NEGF_Common<ComplexType[BLOCK_SIZE]>
 {
-    using BlkType = ComplexType[NUM_MODES];
+    using BlkType = ComplexType[BLOCK_SIZE];
 
     amrex::Real acc = 1.42e-10;
     amrex::Real gamma = 2.5;
@@ -75,7 +75,8 @@ class c_CNT : public c_NEGF_Common<ComplexType[NUM_MODES]>
     virtual int get_offDiag_repeatBlkSize() final { return 2; }
 
     virtual AMREX_GPU_HOST_DEVICE void Compute_SurfaceGreensFunction(
-        MatrixBlock<BlkType> &gr, const ComplexType EmU) final;
+        MatrixBlock<BlkType> &gr, const ComplexType E,
+        const ComplexType U) final;
 
    public:
     static int get_1D_site_id(int par_id_local_to_NS)
diff --git a/Source/Solver/Transport/NEGF/CNT.cpp b/Source/Solver/Transport/NEGF/CNT.cpp
index 5618575..5917371 100644
--- a/Source/Solver/Transport/NEGF/CNT.cpp
+++ b/Source/Solver/Transport/NEGF/CNT.cpp
@@ -66,8 +66,8 @@ void c_CNT::Set_MaterialSpecificParameters()
 
 void c_CNT::Set_BlockDegeneracyVector(amrex::Vector<int> &vec)
 {
-    vec.resize(NUM_MODES);
-    for (int m = 0; m < NUM_MODES; ++m)
+    vec.resize(BLOCK_SIZE);
+    for (int m = 0; m < BLOCK_SIZE; ++m)
     {
         vec[m] = mode_degen_vec[m];
     }
@@ -293,7 +293,8 @@ ComplexType c_CNT::get_beta(int J)
 
 void c_CNT::Define_MPI_BlkType()
 {
-    MPI_Type_vector(1, NUM_MODES, NUM_MODES, MPI_DOUBLE_COMPLEX, &MPI_BlkType);
+    MPI_Type_vector(1, BLOCK_SIZE, BLOCK_SIZE, MPI_DOUBLE_COMPLEX,
+                    &MPI_BlkType);
     MPI_Type_commit(&MPI_BlkType);
 }
 
@@ -310,7 +311,7 @@ void c_CNT::Construct_Hamiltonian()
         h_minusHa(i) = 0.;
     }
 
-    for (int j = 0; j < NUM_MODES; ++j)
+    for (int j = 0; j < BLOCK_SIZE; ++j)
     {
         int J = mode_vec[j];
         beta.block[j] = get_beta(J);
@@ -352,32 +353,42 @@ void c_CNT::Define_ContactInfo()
 
 AMREX_GPU_HOST_DEVICE
 void c_CNT::Compute_SurfaceGreensFunction(MatrixBlock<BlkType> &gr,
-                                          const ComplexType EmU)
+                                          const ComplexType E,
+                                          const ComplexType U)
 {
-    auto EmU_sq = pow(EmU, 2.);
-    auto gamma_sq = pow(gamma, 2.);
-
-    for (int i = 0; i < NUM_MODES; ++i)
+    if (use_decimation)
+    {
+        c_NEGF_Common<BlkType>::DecimationTechnique(gr, E - U);
+    }
+    else
     {
-        auto Factor = EmU_sq + gamma_sq - pow(beta.block[i], 2);
+        ComplexType EmU = E - U;
+        auto EmU_sq = pow(EmU, 2.);
+        auto gamma_sq = pow(gamma, 2.);
 
-        auto Sqrt = sqrt(pow(Factor, 2) - 4. * EmU_sq * gamma_sq);
+        for (int i = 0; i < BLOCK_SIZE; ++i)
+        {
+            auto Factor = EmU_sq + gamma_sq - pow(beta.block[i], 2);
+
+            auto Sqrt = sqrt(pow(Factor, 2) - 4. * EmU_sq * gamma_sq);
 
-        auto Denom = 2. * gamma_sq * EmU;
+            auto Denom = 2. * gamma_sq * EmU;
 
-        auto val1 = (Factor + Sqrt) / Denom;
-        auto val2 = (Factor - Sqrt) / Denom;
+            auto val1 = (Factor + Sqrt) / Denom;
+            auto val2 = (Factor - Sqrt) / Denom;
 
-        if (val1.imag() < 0.)
-            gr.block[i] = val1;
-        else if (val2.imag() < 0.)
-            gr.block[i] = val2;
+            if (val1.imag() < 0.)
+                gr.block[i] = val1;
+            else if (val2.imag() < 0.)
+                gr.block[i] = val2;
 
-        // amrex::Print() << "EmU: " << EmU << "\n";
-        // amrex::Print() << "Factor: " << Factor << "\n";
-        // amrex::Print() << "Sqrt: "  << Sqrt << "\n";
-        // amrex::Print() << "Denom: " << Denom << "\n";
-        // amrex::Print() << "Numerator: " << Factor+Sqrt << "\n";
-        // amrex::Print() << "Value: " << (Factor+Sqrt)/Denom << "\n";
+            // amrex::Print() << "EmU: " << EmU << "\n";
+            // amrex::Print() << "Factor: " << Factor << "\n";
+            // amrex::Print() << "Sqrt: "  << Sqrt << "\n";
+            // amrex::Print() << "Denom: " << Denom << "\n";
+            // amrex::Print() << "Numerator: " << Factor+Sqrt << "\n";
+            // amrex::Print() << "Value: " << (Factor+Sqrt)/Denom << "\n";
+        }
+        // amrex::Print() << "Using quadratic, gr: " << gr << "\n";
     }
 }
diff --git a/Source/Solver/Transport/NEGF/Graphene.H b/Source/Solver/Transport/NEGF/Graphene.H
index bf5e62c..7ea0a64 100644
--- a/Source/Solver/Transport/NEGF/Graphene.H
+++ b/Source/Solver/Transport/NEGF/Graphene.H
@@ -10,9 +10,9 @@
 
 #include "NEGF_Common.H"
 
-class c_Graphene : public c_NEGF_Common<ComplexType[NUM_MODES][NUM_MODES]>
+class c_Graphene : public c_NEGF_Common<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>
 {
-    using BlkType = ComplexType[NUM_MODES][NUM_MODES];
+    using BlkType = ComplexType[BLOCK_SIZE][BLOCK_SIZE];
 
     static amrex::Array<int, 2> type_id;
 
diff --git a/Source/Solver/Transport/NEGF/Matrix_Block.H b/Source/Solver/Transport/NEGF/Matrix_Block.H
index e5f8bc5..500d36b 100644
--- a/Source/Solver/Transport/NEGF/Matrix_Block.H
+++ b/Source/Solver/Transport/NEGF/Matrix_Block.H
@@ -21,6 +21,44 @@ struct MatrixBlock
    public:
     T block;
 
+    // Non-const .data() for 1D array
+    template <typename U = T>
+    auto data() ->
+        typename std::enable_if<std::rank<U>::value == 1,
+                                typename std::remove_extent<U>::type *>::type
+    {
+        return &block[0];
+    }
+
+    // Const .data() for 1D array
+    template <typename U = T>
+    auto const_data() const -> typename std::enable_if<
+        std::rank<U>::value == 1,
+        const typename std::remove_extent<U>::type *>::type
+    {
+        return &block[0];
+    }
+
+    // Non-const .data() for 2D array
+    template <typename U = T>
+    auto data() -> typename std::enable_if<
+        std::rank<U>::value == 2,
+        typename std::remove_all_extents<U>::type *>::type
+    {
+        return reinterpret_cast<typename std::remove_all_extents<U>::type *>(
+            &block[0][0]);
+    }
+
+    // Const .data() for 2D array
+    template <typename U = T>
+    auto const_data() const -> typename std::enable_if<
+        std::rank<U>::value == 2,
+        const typename std::remove_all_extents<U>::type *>::type
+    {
+        return reinterpret_cast<
+            const typename std::remove_all_extents<U>::type *>(&block[0][0]);
+    }
+
     AMREX_GPU_HOST_DEVICE MatrixBlock<T> operator=(const amrex::Real c);
     AMREX_GPU_HOST_DEVICE MatrixBlock<T> operator=(const ComplexType c);
 
@@ -63,7 +101,11 @@ struct MatrixBlock
     AMREX_GPU_HOST_DEVICE MatrixBlock<T> Conj() const;   /*Conjugate*/
     AMREX_GPU_HOST_DEVICE MatrixBlock<T> Tran() const;   /*Transpose*/
     AMREX_GPU_HOST_DEVICE MatrixBlock<T> Dagger() const; /*Conjugate-Transpose*/
-    AMREX_GPU_HOST_DEVICE ComplexType DiagSum() const;   /*Trace*/
+    AMREX_GPU_HOST_DEVICE MatrixBlock<T> Inverse() const; /*Inverse*/
+    AMREX_GPU_HOST_DEVICE ComplexType DiagSum() const;    /*Trace*/
+    AMREX_GPU_HOST_DEVICE ComplexType FrobeniusNorm() const;
+    AMREX_GPU_HOST_DEVICE void SetDiag(
+        const ComplexType c); /*set diagonal element*/
 
     template <typename U>
     AMREX_GPU_HOST_DEVICE ComplexType
@@ -75,6 +117,7 @@ struct MatrixBlock
     // MatrixBlock<T> AtomicAdd(MatrixBlock<T>& B) const; /*Atomic add for the
     // block*/ template<typename U, std::size_t N> MatrixBlock<amrex::Real>
     // Real() const;
+    //
 };
 
 #endif
diff --git a/Source/Solver/Transport/NEGF/Matrix_Block_Util.H b/Source/Solver/Transport/NEGF/Matrix_Block_Util.H
index 951921d..cd4499e 100644
--- a/Source/Solver/Transport/NEGF/Matrix_Block_Util.H
+++ b/Source/Solver/Transport/NEGF/Matrix_Block_Util.H
@@ -1,3 +1,4 @@
+#include "MathLib.H"
 #include "Matrix_Block.H"
 
 /* Definitions */
@@ -9,10 +10,10 @@ MatrixBlock<T> MatrixBlock<T>::operator=(const ComplexType c_comp)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES]>::operator=(const ComplexType c_comp)
+MatrixBlock<ComplexType[BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE]>::operator=(const ComplexType c_comp)
 {
-    for (int i = 0; i < NUM_MODES; ++i)
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         this->block[i] = c_comp;
     }
@@ -20,13 +21,13 @@ MatrixBlock<ComplexType[NUM_MODES]>::operator=(const ComplexType c_comp)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::operator=(
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>::operator=(
     const ComplexType c_comp)
 {
-    for (int i = 0; i < NUM_MODES; ++i)
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
-        for (int j = 0; j < NUM_MODES; ++j)
+        for (int j = 0; j < BLOCK_SIZE; ++j)
         {
             this->block[i][j] = c_comp;
         }
@@ -43,8 +44,8 @@ MatrixBlock<T> MatrixBlock<T>::operator=(const amrex::Real c)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES]>::operator=(const amrex::Real c)
+MatrixBlock<ComplexType[BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE]>::operator=(const amrex::Real c)
 {
     ComplexType c_complex(c, 0.);
     *this = c_complex;
@@ -52,8 +53,8 @@ MatrixBlock<ComplexType[NUM_MODES]>::operator=(const amrex::Real c)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::operator=(const amrex::Real c)
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>::operator=(const amrex::Real c)
 {
     ComplexType c_complex(c, 0.);
     *this = c_complex;
@@ -69,11 +70,11 @@ MatrixBlock<T> MatrixBlock<T>::operator*(const ComplexType c)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES]>::operator*(const ComplexType c)
+MatrixBlock<ComplexType[BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE]>::operator*(const ComplexType c)
 {
-    MatrixBlock<ComplexType[NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         result.block[i] = this->block[i] * c;
     }
@@ -82,13 +83,14 @@ MatrixBlock<ComplexType[NUM_MODES]>::operator*(const ComplexType c)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::operator*(const ComplexType c)
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>::operator*(const ComplexType c)
 {
-    MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> result;
+
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
-        for (int j = 0; j < NUM_MODES; ++j)
+        for (int j = 0; j < BLOCK_SIZE; ++j)
         {
             result.block[i][j] = this->block[i][j] * c;
         }
@@ -106,12 +108,12 @@ MatrixBlock<T> MatrixBlock<T>::operator*(const MatrixBlock<T> &rhs)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES]>::operator*(
-    const MatrixBlock<ComplexType[NUM_MODES]> &rhs)
+MatrixBlock<ComplexType[BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE]>::operator*(
+    const MatrixBlock<ComplexType[BLOCK_SIZE]> &rhs)
 {
-    MatrixBlock<ComplexType[NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         result.block[i] = this->block[i] * rhs.block[i];
     }
@@ -119,18 +121,16 @@ MatrixBlock<ComplexType[NUM_MODES]>::operator*(
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::operator*(
-    const MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> &rhs)
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>::operator*(
+    const MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> &rhs)
 {
-    MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
-    {
-        for (int j = 0; j < NUM_MODES; ++j)
-        {
-            result.block[i][j] = this->block[i][j] * rhs.block[i][j];
-        }
-    }
+    MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> result;
+
+    // MathLib::MatrixMatrixMultiply(result.data(), this->const_data(),
+    // rhs.const_data(),
+    //                               BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE);
+
     return result;
 }
 
@@ -143,11 +143,11 @@ MatrixBlock<T> operator*(const ComplexType c_complex, const MatrixBlock<T> &B)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]> operator*(
-    const ComplexType c_complex, const MatrixBlock<ComplexType[NUM_MODES]> &B)
+MatrixBlock<ComplexType[BLOCK_SIZE]> operator*(
+    const ComplexType c_complex, const MatrixBlock<ComplexType[BLOCK_SIZE]> &B)
 {
-    MatrixBlock<ComplexType[NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         result.block[i] = c_complex * B.block[i];
     }
@@ -155,14 +155,14 @@ MatrixBlock<ComplexType[NUM_MODES]> operator*(
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> operator*(
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> operator*(
     const ComplexType c_complex,
-    const MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> &B)
+    const MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> &B)
 {
-    MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
-        for (int j = 0; j < NUM_MODES; ++j)
+        for (int j = 0; j < BLOCK_SIZE; ++j)
         {
             result.block[i][j] = c_complex * B.block[i][j];
         }
@@ -179,17 +179,17 @@ MatrixBlock<T> operator*(const amrex::Real c_real, const MatrixBlock<T> &B)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]> operator*(
-    const amrex::Real c_real, const MatrixBlock<ComplexType[NUM_MODES]> &B)
+MatrixBlock<ComplexType[BLOCK_SIZE]> operator*(
+    const amrex::Real c_real, const MatrixBlock<ComplexType[BLOCK_SIZE]> &B)
 {
     ComplexType c_complex(c_real, 0.);
     return operator*(c_complex, B);
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> operator*(
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> operator*(
     const amrex::Real c_real,
-    const MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> &B)
+    const MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> &B)
 {
     ComplexType c_complex(c_real, 0.);
     return operator*(c_complex, B);
@@ -204,11 +204,11 @@ MatrixBlock<T> MatrixBlock<T>::operator+(const ComplexType c)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES]>::operator+(const ComplexType c)
+MatrixBlock<ComplexType[BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE]>::operator+(const ComplexType c)
 {
-    MatrixBlock<ComplexType[NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         result.block[i] = this->block[i] + c;
     }
@@ -225,12 +225,12 @@ MatrixBlock<T> MatrixBlock<T>::operator+(const MatrixBlock<T> &rhs)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES]>::operator+(
-    const MatrixBlock<ComplexType[NUM_MODES]> &rhs)
+MatrixBlock<ComplexType[BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE]>::operator+(
+    const MatrixBlock<ComplexType[BLOCK_SIZE]> &rhs)
 {
-    MatrixBlock<ComplexType[NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         result.block[i] = this->block[i] + rhs.block[i];
     }
@@ -238,14 +238,14 @@ MatrixBlock<ComplexType[NUM_MODES]>::operator+(
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::operator+(
-    const MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> &rhs)
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>::operator+(
+    const MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> &rhs)
 {
-    MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
-        for (int j = 0; j < NUM_MODES; ++j)
+        for (int j = 0; j < BLOCK_SIZE; ++j)
         {
             result.block[i][j] = this->block[i][j] + rhs.block[i][j];
         }
@@ -262,11 +262,11 @@ MatrixBlock<T> operator+(const ComplexType c, const MatrixBlock<T> &rhs)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]> operator+(
-    const ComplexType c, const MatrixBlock<ComplexType[NUM_MODES]> &rhs)
+MatrixBlock<ComplexType[BLOCK_SIZE]> operator+(
+    const ComplexType c, const MatrixBlock<ComplexType[BLOCK_SIZE]> &rhs)
 {
-    MatrixBlock<ComplexType[NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         result.block[i] = c + rhs.block[i];
     }
@@ -274,14 +274,14 @@ MatrixBlock<ComplexType[NUM_MODES]> operator+(
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> operator+(
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> operator+(
     const ComplexType c,
-    const MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> &rhs)
+    const MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> &rhs)
 {
-    MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
-        for (int j = 0; j < NUM_MODES; ++j)
+        for (int j = 0; j < BLOCK_SIZE; ++j)
         {
             result.block[i][j] = c + rhs.block[i][j];
         }
@@ -298,11 +298,11 @@ MatrixBlock<T> MatrixBlock<T>::operator-(const amrex::Real c)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES]>::operator-(const amrex::Real c)
+MatrixBlock<ComplexType[BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE]>::operator-(const amrex::Real c)
 {
-    MatrixBlock<ComplexType[NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         result.block[i] = this->block[i] - c;
     }
@@ -319,11 +319,11 @@ MatrixBlock<T> MatrixBlock<T>::operator-(const ComplexType c)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES]>::operator-(const ComplexType c)
+MatrixBlock<ComplexType[BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE]>::operator-(const ComplexType c)
 {
-    MatrixBlock<ComplexType[NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         result.block[i] = this->block[i] - c;
     }
@@ -340,12 +340,12 @@ MatrixBlock<T> MatrixBlock<T>::operator-(const MatrixBlock<T> &rhs)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES]>::operator-(
-    const MatrixBlock<ComplexType[NUM_MODES]> &rhs)
+MatrixBlock<ComplexType[BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE]>::operator-(
+    const MatrixBlock<ComplexType[BLOCK_SIZE]> &rhs)
 {
-    MatrixBlock<ComplexType[NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         result.block[i] = this->block[i] - rhs.block[i];
     }
@@ -361,11 +361,11 @@ MatrixBlock<T> operator-(const ComplexType c, const MatrixBlock<T> &rhs)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]> operator-(
-    const ComplexType c, const MatrixBlock<ComplexType[NUM_MODES]> &rhs)
+MatrixBlock<ComplexType[BLOCK_SIZE]> operator-(
+    const ComplexType c, const MatrixBlock<ComplexType[BLOCK_SIZE]> &rhs)
 {
-    MatrixBlock<ComplexType[NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         result.block[i] = c - rhs.block[i];
     }
@@ -381,12 +381,12 @@ MatrixBlock<T> operator-(const MatrixBlock<T> &B, const MatrixBlock<T> &C)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]> operator-(
-    const MatrixBlock<ComplexType[NUM_MODES]> &B,
-    const MatrixBlock<ComplexType[NUM_MODES]> &C)
+MatrixBlock<ComplexType[BLOCK_SIZE]> operator-(
+    const MatrixBlock<ComplexType[BLOCK_SIZE]> &B,
+    const MatrixBlock<ComplexType[BLOCK_SIZE]> &C)
 {
-    MatrixBlock<ComplexType[NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         result.block[i] = B.block[i] - C.block[i];
     }
@@ -394,14 +394,14 @@ MatrixBlock<ComplexType[NUM_MODES]> operator-(
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> operator-(
-    const MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> &B,
-    const MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> &C)
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> operator-(
+    const MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> &B,
+    const MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> &C)
 {
-    MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
-        for (int j = 0; j < NUM_MODES; ++j)
+        for (int j = 0; j < BLOCK_SIZE; ++j)
         {
             result.block[i][j] = B.block[i][j] - C.block[i][j];
         }
@@ -409,6 +409,34 @@ MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> operator-(
     return result;
 }
 
+/*Inverse*/
+template <typename T>
+MatrixBlock<T> MatrixBlock<T>::Inverse() const
+{
+    return 1. / (*this);
+}
+
+template <>
+MatrixBlock<ComplexType[BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE]>::Inverse() const
+{
+    MatrixBlock<ComplexType[BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
+    {
+        result.block[i] = 1. / this->block[i];
+    }
+    return result;
+}
+
+template <>
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>::Inverse() const
+{
+    MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> result;
+    // MathLib::DenseMatrixInversion(d_result.p, d_A.p, A_rows, A_cols);
+    return result;
+}
+
 /* Operation [R] = [B] / c_complex */
 template <typename T>
 MatrixBlock<T> MatrixBlock<T>::operator/(ComplexType c)
@@ -418,11 +446,11 @@ MatrixBlock<T> MatrixBlock<T>::operator/(ComplexType c)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES]>::operator/(ComplexType c)
+MatrixBlock<ComplexType[BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE]>::operator/(ComplexType c)
 {
-    MatrixBlock<ComplexType[NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         result.block[i] = this->block[i] / c;
     }
@@ -439,18 +467,30 @@ MatrixBlock<T> MatrixBlock<T>::operator/(const MatrixBlock<T> &rhs)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES]>::operator/(
-    const MatrixBlock<ComplexType[NUM_MODES]> &rhs)
+MatrixBlock<ComplexType[BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE]>::operator/(
+    const MatrixBlock<ComplexType[BLOCK_SIZE]> &rhs)
 {
-    MatrixBlock<ComplexType[NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         result.block[i] = this->block[i] / rhs.block[i];
     }
     return result;
 }
 
+template <>
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>::operator/(
+    const MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> &rhs)
+{
+    MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> result =
+        (*this) * rhs.Inverse();
+    // MathLib::DenseMatrixInversion(rhs_inverse.p, rhs.p, A_rows, A_cols);
+
+    return result;
+}
+
 /* Operation [R] = c_complex / [C] */
 template <typename T>
 MatrixBlock<T> operator/(const ComplexType c_complex, const MatrixBlock<T> &C)
@@ -460,11 +500,11 @@ MatrixBlock<T> operator/(const ComplexType c_complex, const MatrixBlock<T> &C)
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]> operator/(
-    const ComplexType c_complex, const MatrixBlock<ComplexType[NUM_MODES]> &C)
+MatrixBlock<ComplexType[BLOCK_SIZE]> operator/(
+    const ComplexType c_complex, const MatrixBlock<ComplexType[BLOCK_SIZE]> &C)
 {
-    MatrixBlock<ComplexType[NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         result.block[i] = c_complex / C.block[i];
     }
@@ -472,14 +512,14 @@ MatrixBlock<ComplexType[NUM_MODES]> operator/(
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> operator/(
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> operator/(
     const ComplexType c_complex,
-    const MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> &C)
+    const MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> &C)
 {
-    MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
-        for (int j = 0; j < NUM_MODES; ++j)
+        for (int j = 0; j < BLOCK_SIZE; ++j)
         {
             result.block[i][j] = c_complex / C.block[i][j];
         }
@@ -496,12 +536,12 @@ std::ostream &operator<<(std::ostream &stream, const MatrixBlock<T> &rhs)
 
 template <>
 std::ostream &operator<<(std::ostream &stream,
-                         const MatrixBlock<amrex::Real[NUM_MODES]> &rhs)
+                         const MatrixBlock<amrex::Real[BLOCK_SIZE]> &rhs)
 {
     stream << "{";
-    for (int i = 0; i < NUM_MODES; ++i)
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
-        if (i != NUM_MODES - 1)
+        if (i != BLOCK_SIZE - 1)
         {
             stream << rhs.block[i] << ", ";
         }
@@ -516,12 +556,12 @@ std::ostream &operator<<(std::ostream &stream,
 
 template <>
 std::ostream &operator<<(std::ostream &stream,
-                         const MatrixBlock<ComplexType[NUM_MODES]> &rhs)
+                         const MatrixBlock<ComplexType[BLOCK_SIZE]> &rhs)
 {
     stream << "[";
-    for (int i = 0; i < NUM_MODES; ++i)
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
-        if (i != NUM_MODES - 1)
+        if (i != BLOCK_SIZE - 1)
         {
             stream << rhs.block[i] << ", ";
         }
@@ -537,21 +577,22 @@ std::ostream &operator<<(std::ostream &stream,
 template <>
 std::ostream &operator<<(
     std::ostream &stream,
-    const MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> &rhs)
+    const MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> &rhs)
 {
     stream << "[";
-    for (int i = 0; i < NUM_MODES; ++i)
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
-        for (int j = 0; j < NUM_MODES; ++j)
+        for (int j = 0; j < BLOCK_SIZE; ++j)
         {
             stream << std::setw(10) << rhs.block[i][j];
         }
-        if (i != NUM_MODES - 1) stream << "\n ";
+        if (i != BLOCK_SIZE - 1) stream << "\n ";
     }
     stream << "]";
     return stream;
 }
 
+/*Conjugate*/
 template <typename T>
 MatrixBlock<T> MatrixBlock<T>::Conj() const
 {
@@ -560,11 +601,11 @@ MatrixBlock<T> MatrixBlock<T>::Conj() const
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]> MatrixBlock<ComplexType[NUM_MODES]>::Conj()
-    const
+MatrixBlock<ComplexType[BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE]>::Conj() const
 {
-    MatrixBlock<ComplexType[NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         amrex::GpuComplex X_conj(this->block[i].real(),
                                  -1. * this->block[i].imag());
@@ -574,13 +615,13 @@ MatrixBlock<ComplexType[NUM_MODES]> MatrixBlock<ComplexType[NUM_MODES]>::Conj()
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::Conj() const
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>::Conj() const
 {
-    MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
-        for (int j = 0; j < NUM_MODES; ++j)
+        for (int j = 0; j < BLOCK_SIZE; ++j)
         {
             amrex::GpuComplex X_conj(this->block[i][j].real(),
                                      -1. * this->block[i][j].imag());
@@ -590,6 +631,7 @@ MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::Conj() const
     return result;
 }
 
+/*Transpose*/
 template <typename T>
 MatrixBlock<T> MatrixBlock<T>::Tran() const
 {
@@ -597,20 +639,20 @@ MatrixBlock<T> MatrixBlock<T>::Tran() const
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]> MatrixBlock<ComplexType[NUM_MODES]>::Tran()
-    const
+MatrixBlock<ComplexType[BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE]>::Tran() const
 {
     return *this;
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::Tran() const
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>::Tran() const
 {
-    MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
-        for (int j = 0; j < NUM_MODES; ++j)
+        for (int j = 0; j < BLOCK_SIZE; ++j)
         {
             if (i == j)
             {
@@ -625,6 +667,7 @@ MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::Tran() const
     return result;
 }
 
+/*Conjugate-transpose*/
 template <typename T>
 MatrixBlock<T> MatrixBlock<T>::Dagger() const
 {
@@ -632,17 +675,20 @@ MatrixBlock<T> MatrixBlock<T>::Dagger() const
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES]>::Dagger() const
+MatrixBlock<ComplexType[BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE]>::Dagger() const
 {
     return this->Conj().Tran();
 }
 
 template <>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::Dagger() const
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>::Dagger() const
 {
-    return this->Conj().Tran();
+    MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> result;
+    // MathLib::ConjugateTranspose(result.p, this->block.p, BLOCK_SIZE,
+    // BLOCK_SIZE);
+    return result;
 }
 
 ///* Operation AtomicAdd */
@@ -654,11 +700,11 @@ MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::Dagger() const
 // }
 //
 // template <>
-// MatrixBlock<ComplexType[NUM_MODES]>
-// MatrixBlock<ComplexType[NUM_MODES]>::AtomicAdd(MatrixBlock<T>& B) const
+// MatrixBlock<ComplexType[BLOCK_SIZE]>
+// MatrixBlock<ComplexType[BLOCK_SIZE]>::AtomicAdd(MatrixBlock<T>& B) const
 //{
 //
-//     for (int i = 0; i < NUM_MODES; ++i)
+//     for (int i = 0; i < BLOCK_SIZE; ++i)
 //     {
 //         amrex::HostDevice::Atomic::Add(&(this->trace_r.block[i]),
 //         B.block[i].real());
@@ -669,14 +715,14 @@ MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::Dagger() const
 // }
 //
 // template <>
-// MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>
-// MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::AtomicAdd(MatrixBlock<T>& B)
-// const
+// MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>
+// MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>::AtomicAdd(MatrixBlock<T>&
+// B) const
 //{
-//     MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> result;
-//     for (int i = 0; i < NUM_MODES; ++i)
+//     MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> result;
+//     for (int i = 0; i < BLOCK_SIZE; ++i)
 //     {
-//         for (int j = 0; j < NUM_MODES; ++j)
+//         for (int j = 0; j < BLOCK_SIZE; ++j)
 //         {
 //             if(i==j)
 //             {
@@ -701,12 +747,12 @@ MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::Dagger() const
 // }
 //
 // template <>
-// MatrixBlock<amrex::Real[NUM_MODES]>
-// MatrixBlock<ComplexType[NUM_MODES]>::Real() const
+// MatrixBlock<amrex::Real[BLOCK_SIZE]>
+// MatrixBlock<ComplexType[BLOCK_SIZE]>::Real() const
 //{
-//     MatrixBlock<amrex::Real[NUM_MODES]> result;
+//     MatrixBlock<amrex::Real[BLOCK_SIZE]> result;
 //
-//     for (int i = 0; i < NUM_MODES; ++i)
+//     for (int i = 0; i < BLOCK_SIZE; ++i)
 //     {
 //         result.block[i] = this->block[i].real();
 //     }
@@ -714,14 +760,14 @@ MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::Dagger() const
 // }
 //
 // template <>
-// MatrixBlock<amrex::Real[NUM_MODES][NUM_MODES]>
-// MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::Real() const
+// MatrixBlock<amrex::Real[BLOCK_SIZE][BLOCK_SIZE]>
+// MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>::Real() const
 //{
-//     MatrixBlock<amrex::Real[NUM_MODES][NUM_MODES]> result;
+//     MatrixBlock<amrex::Real[BLOCK_SIZE][BLOCK_SIZE]> result;
 //
-//     for (int i = 0; i < NUM_MODES; ++i)
+//     for (int i = 0; i < BLOCK_SIZE; ++i)
 //     {
-//         for (int j = 0; j < NUM_MODES; ++j)
+//         for (int j = 0; j < BLOCK_SIZE; ++j)
 //         {
 //             result.block[i][j] = this->block[i][j].real();
 //         }
@@ -738,10 +784,10 @@ ComplexType MatrixBlock<T>::DiagSum() const
 }
 
 template <>
-ComplexType MatrixBlock<ComplexType[NUM_MODES]>::DiagSum() const
+ComplexType MatrixBlock<ComplexType[BLOCK_SIZE]>::DiagSum() const
 {
     ComplexType result(0., 0.);
-    for (int i = 0; i < NUM_MODES; ++i)
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         result += this->block[i];
     }
@@ -749,12 +795,12 @@ ComplexType MatrixBlock<ComplexType[NUM_MODES]>::DiagSum() const
 }
 
 template <>
-ComplexType MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::DiagSum() const
+ComplexType MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>::DiagSum() const
 {
     ComplexType result(0., 0.);
-    for (int i = 0; i < NUM_MODES; ++i)
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
-        for (int j = 0; j < NUM_MODES; ++j)
+        for (int j = 0; j < BLOCK_SIZE; ++j)
         {
             if (i == j)
             {
@@ -765,6 +811,40 @@ ComplexType MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::DiagSum() const
     return result;
 }
 
+/* Operation Norm */
+template <typename T>
+ComplexType MatrixBlock<T>::FrobeniusNorm() const
+{
+    ComplexType result(0., 0.);
+    return *this;
+}
+
+template <>
+ComplexType MatrixBlock<ComplexType[BLOCK_SIZE]>::FrobeniusNorm() const
+{
+    ComplexType result(0., 0.);
+    for (int i = 0; i < BLOCK_SIZE; ++i)
+    {
+        result += this->block[i] * this->block[i];
+    }
+    return amrex::sqrt(result);
+}
+
+template <>
+ComplexType MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>::FrobeniusNorm()
+    const
+{
+    ComplexType result(0., 0.);
+    for (int i = 0; i < BLOCK_SIZE; ++i)
+    {
+        for (int j = 0; j < BLOCK_SIZE; ++j)
+        {
+            result += this->block[i][j] * this->block[i][j];
+        }
+    }
+    return amrex::sqrt(result);
+}
+
 ///* Operation DiagDotSum */
 template <typename T>
 template <typename U>
@@ -776,10 +856,10 @@ ComplexType MatrixBlock<T>::DiagDotSum(U &vec) const
 
 template <>
 template <typename U>
-ComplexType MatrixBlock<ComplexType[NUM_MODES]>::DiagDotSum(U &vec) const
+ComplexType MatrixBlock<ComplexType[BLOCK_SIZE]>::DiagDotSum(U &vec) const
 {
     ComplexType result(0., 0.);
-    for (int i = 0; i < NUM_MODES; ++i)
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         ComplexType factor(vec[i], 0.);
         result += factor * this->block[i];
@@ -789,13 +869,13 @@ ComplexType MatrixBlock<ComplexType[NUM_MODES]>::DiagDotSum(U &vec) const
 
 template <>
 template <typename U>
-ComplexType MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::DiagDotSum(
+ComplexType MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>::DiagDotSum(
     U &vec) const
 {
     ComplexType result(0., 0.);
-    for (int i = 0; i < NUM_MODES; ++i)
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
-        for (int j = 0; j < NUM_MODES; ++j)
+        for (int j = 0; j < BLOCK_SIZE; ++j)
         {
             if (i == j)
             {
@@ -817,11 +897,11 @@ MatrixBlock<T> MatrixBlock<T>::DiagMult(U &vec) const
 
 template <>
 template <typename U>
-MatrixBlock<ComplexType[NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES]>::DiagMult(U &vec) const
+MatrixBlock<ComplexType[BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE]>::DiagMult(U &vec) const
 {
-    MatrixBlock<ComplexType[NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
         ComplexType factor(vec[i], 0.);
         result.block[i] = factor * this->block[i];
@@ -831,13 +911,13 @@ MatrixBlock<ComplexType[NUM_MODES]>::DiagMult(U &vec) const
 
 template <>
 template <typename U>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>
-MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::DiagMult(U &vec) const
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>
+MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>::DiagMult(U &vec) const
 {
-    MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]> result;
-    for (int i = 0; i < NUM_MODES; ++i)
+    MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]> result;
+    for (int i = 0; i < BLOCK_SIZE; ++i)
     {
-        for (int j = 0; j < NUM_MODES; ++j)
+        for (int j = 0; j < BLOCK_SIZE; ++j)
         {
             if (i == j)
             {
@@ -848,3 +928,39 @@ MatrixBlock<ComplexType[NUM_MODES][NUM_MODES]>::DiagMult(U &vec) const
     }
     return result;
 }
+
+///* Operation DiagSet */
+template <typename T>
+void MatrixBlock<T>::SetDiag(const ComplexType c_comp)
+{
+}
+
+template <>
+void MatrixBlock<ComplexType[BLOCK_SIZE]>::SetDiag(const ComplexType c_comp)
+{
+    for (int i = 0; i < BLOCK_SIZE; ++i)
+    {
+        this->block[i] = c_comp;
+    }
+}
+
+template <>
+void MatrixBlock<ComplexType[BLOCK_SIZE][BLOCK_SIZE]>::SetDiag(
+    const ComplexType c_comp)
+{
+    ComplexType zero(0., 0.);
+    for (int i = 0; i < BLOCK_SIZE; ++i)
+    {
+        for (int j = 0; j < BLOCK_SIZE; ++j)
+        {
+            if (i == j)
+            {
+                this->block[i][i] = c_comp;
+            }
+            else
+            {
+                this->block[i][i] = zero;
+            }
+        }
+    }
+}
diff --git a/Source/Solver/Transport/NEGF/NEGF_Common.H b/Source/Solver/Transport/NEGF/NEGF_Common.H
index fa009f7..059c2a4 100644
--- a/Source/Solver/Transport/NEGF/NEGF_Common.H
+++ b/Source/Solver/Transport/NEGF/NEGF_Common.H
@@ -158,6 +158,18 @@ class c_NEGF_Common : protected MatrixBlock<T>, private c_IntegrationPath
     amrex::Gpu::DeviceVector<amrex::Real> d_Trace_i;
 #endif
 
+    /*Condensed Hamiltonian*/
+    struct CondensedHamiltonian
+    {
+        MatrixBlock<T> Xi_s;
+        MatrixBlock<T> Xi;
+        MatrixBlock<T> Pi;
+
+        CondensedHamiltonian()
+            : Xi_s(), Xi(), Pi() {}  // default initialized
+                                     // calling constructor of MatrixBlock<T>
+    };
+
     void Set_KeyParams(const std::string &NS_name_str, const int &NS_id_counter,
                        const amrex::Real &NS_initial_deposit_value);
     void Define_FoldersAndFiles(const std::string &negf_foldername_str);
@@ -181,6 +193,7 @@ class c_NEGF_Common : protected MatrixBlock<T>, private c_IntegrationPath
     void Read_IntegrandWritingParams(amrex::ParmParse &);
     void Read_AtomLocationAndChargeDistributionFilename(amrex::ParmParse &);
     void Read_RecursiveOptimizationParams(amrex::ParmParse &);
+    void Read_DecimationTechniqueParams(amrex::ParmParse &);
     void Assert_Reads();
 
     void Assert_KeyParameters();
@@ -312,7 +325,14 @@ class c_NEGF_Common : protected MatrixBlock<T>, private c_IntegrationPath
     virtual void Define_IntegrationPaths();
 
     virtual AMREX_GPU_HOST_DEVICE void Compute_SurfaceGreensFunction(
-        MatrixBlock<T> &gr, const ComplexType EmU) = 0;
+        MatrixBlock<T> &gr, const ComplexType E, const ComplexType U);
+
+    /*Decimation*/
+    virtual AMREX_GPU_HOST_DEVICE void Compute_CondensedHamiltonian(
+        CondensedHamiltonian &CondH, const ComplexType EmU);
+
+    AMREX_GPU_HOST_DEVICE void DecimationTechnique(MatrixBlock<T> &gr,
+                                                   const ComplexType EmU);
 
     virtual void Write_Eql_Characteristics(
         const amrex::Vector<ComplexType> E_vec, const RealTable1D &DOS_data,
@@ -447,6 +467,17 @@ class c_NEGF_Common : protected MatrixBlock<T>, private c_IntegrationPath
     RealTable1D d_NonEq_Integrand_Drain_data;
 #endif
 
+    /*Decimation Technique*/
+    int use_decimation =
+        false;  // whether to use decimation technique in derived class.
+                // if Compute_SurfaceGreensFunction method is not overriden
+                // then the default is decimation regardless of what we set
+                // use_decimation as.
+
+    int decimation_max_iter = 30;      // maximum iterations allowed
+    int decimation_rel_error = 1.e-5;  // relative error
+    int decimation_layers = 10;        // P
+
    public:
     /*MPI Params*/
     amrex::Vector<int> MPI_recv_count;
diff --git a/Source/Solver/Transport/NEGF/NEGF_Common.cpp b/Source/Solver/Transport/NEGF/NEGF_Common.cpp
index c8f3225..2dcfa9f 100644
--- a/Source/Solver/Transport/NEGF/NEGF_Common.cpp
+++ b/Source/Solver/Transport/NEGF/NEGF_Common.cpp
@@ -7,8 +7,9 @@
 #include "Matrix_Block_Util.H"
 
 /*Explicit specializations*/
-template class c_NEGF_Common<ComplexType[NUM_MODES]>;             // of c_CNT
-template class c_NEGF_Common<ComplexType[NUM_MODES][NUM_MODES]>;  // c_Graphene
+template class c_NEGF_Common<ComplexType[BLOCK_SIZE]>;  // of c_CNT
+template class c_NEGF_Common<
+    ComplexType[BLOCK_SIZE][BLOCK_SIZE]>;  // c_Graphene
 
 const std::map<std::string, AngleType> map_strToAngleType = {
     {"d", AngleType::Degrees},
@@ -594,6 +595,15 @@ void c_NEGF_Common<T>::Read_RecursiveOptimizationParams(amrex::ParmParse &pp_ns)
     queryWithParser(pp_ns, "num_recursive_parts", num_recursive_parts);
 }
 
+template <typename T>
+void c_NEGF_Common<T>::Read_DecimationTechniqueParams(amrex::ParmParse &pp_ns)
+{
+    queryWithParser(pp_ns, "use_decimation", use_decimation);
+    queryWithParser(pp_ns, "decimation_max_iter", decimation_max_iter);
+    queryWithParser(pp_ns, "decimation_rel_error", decimation_rel_error);
+    queryWithParser(pp_ns, "decimation_layers", decimation_layers);
+}
+
 template <typename T>
 void c_NEGF_Common<T>::Assert_Reads()
 {
@@ -643,6 +653,7 @@ void c_NEGF_Common<T>::Read_CommonData(amrex::ParmParse &pp)
     Read_WritingRelatedFlags(pp);
     Read_AtomLocationAndChargeDistributionFilename(pp);
     Read_RecursiveOptimizationParams(pp);
+    Read_DecimationTechniqueParams(pp);
 }
 
 template <typename T>
@@ -1307,6 +1318,81 @@ void c_NEGF_Common<T>::Allocate_ArraysForHamiltonian()
     SetVal_Table1D(h_Hc_loc_data, zero);
 }
 
+template <typename T>
+void c_NEGF_Common<T>::Compute_CondensedHamiltonian(CondensedHamiltonian &CondH,
+                                                    const ComplexType EmU)
+{
+    auto &Xi_s = CondH.Xi_s;
+    auto &Xi = CondH.Xi;
+    auto &Pi = CondH.Pi;
+
+    int P = decimation_layers;
+
+    BlkTable1D H_tilde_data({0}, {P - 1});
+    auto const &H_tilde = H_tilde_data.table();
+    auto const &Hb = h_Hb_loc_data.table();
+
+    MatrixBlock<T> EmUI;
+    EmUI.SetDiag(EmU);  // This is (E-U)I
+
+    H_tilde(P - 1).SetDiag(1. / EmU);  // This is (P-1) element of [(E-U)I]^-1
+
+    auto H_tilde_kP = H_tilde(P - 1);  // This is H_tilde(P-1)(P-1)
+
+    for (int k = P - 2; k >= 1; --k)
+    {
+        int j = k % offDiag_repeatBlkSize;
+        auto temp1 = EmUI - 1. * Hb(j) * H_tilde(k + 1) * Hb(j).Dagger();
+
+        H_tilde(k) = temp1.Inverse();
+
+        H_tilde_kP = H_tilde(k) * Hb(j) * H_tilde_kP;
+    }
+    // here H_tilde_kP is H_tilde_1P
+
+    MatrixBlock<T> C_tilde_kk = H_tilde(1);  //==H_tilde_11 = C_tilde_11
+
+    for (int k = 2; k < P; ++k)
+    {
+        int j = (k - 1) % offDiag_repeatBlkSize;
+        C_tilde_kk = H_tilde(k) + H_tilde(k) * Hb(j).Dagger() * C_tilde_kk *
+                                      Hb(j) * H_tilde(k);
+    }
+    /* Here, C_tilde_kk = C_tilde_(P-1)(P-1)
+     * and,  C_tilde_1P = H_tilde_1P = H_tilde_kP
+     *       C_tilde_11 = H_tilde(1)
+     */
+
+    int id = (P - 1) % offDiag_repeatBlkSize;
+    Xi_s = Hb(0) * H_tilde(1) * Hb(0).Dagger();
+    Xi = Xi_s + Hb(id) * C_tilde_kk * Hb(id).Dagger();
+    Pi = Hb(0) * H_tilde_kP * Hb(id);
+
+    /* For P=2, as an example,
+     *
+     * Xi_s = Hb(0) * H_tilde(1) * Hb(0).Dagger();
+     * Xi   = Xi_s + Hb(1) * C_tilde_kk * Hb(1).Dagger();
+     * Pi   = Hb(0) * H_tilde_kP * Hb(1);
+     *
+     * H_tilde(1) = 1/(E-U)
+     * Hb(0)      = beta
+     * Hb(1)      = gamma
+     * C_tilde_kk = H_tilde(1)
+     * H_tilde_kP = H_tilde(1)
+     *
+     */
+
+    /*
+      amrex::Print() << " EmU: " << EmU  << "\n";
+      amrex::Print() << "Xi_s: " << Xi_s << "\n";
+      amrex::Print() << "  Xi: " << Xi   << "\n";
+      amrex::Print() << "  Pi: " << Pi   << "\n";
+      amrex::Print() << " H_tilde(1): " << H_tilde(1) << "\n";
+      amrex::Print() << " C_tilde_kk: " << C_tilde_kk << "\n";
+      amrex::Print() << " H_tilde_kP: " << H_tilde_kP << "\n";
+    */
+}
+
 template <typename T>
 void c_NEGF_Common<T>::Allocate_ArraysForLeadSpecificQuantities()
 {
@@ -3881,11 +3967,74 @@ void c_NEGF_Common<T>::Compute_Rho0()
     Deallocate_TemporaryArraysForGFComputation();
 }
 
-// template<typename T>
-// AMREX_GPU_HOST_DEVICE
-// void
-// c_NEGF_Common<T>:: Compute_SurfaceGreensFunction (MatrixBlock<T>& gr, const
-// ComplexType EmU) {}
+template <typename T>
+AMREX_GPU_HOST_DEVICE void c_NEGF_Common<T>::DecimationTechnique(
+    MatrixBlock<T> &gr, const ComplexType EmU)
+{
+    CondensedHamiltonian CondH;
+    Compute_CondensedHamiltonian(CondH, EmU);
+
+    MatrixBlock<T> mu, nu, gamma, zeta;
+
+    // initialize these blocks
+    {
+        MatrixBlock<T> EmUI;
+        EmUI.SetDiag(EmU);
+
+        auto mu0 = EmUI - CondH.Xi_s;
+        auto nu0 = EmUI - CondH.Xi;
+        auto gamma0 = CondH.Pi;
+
+        auto nu0_inverse = nu0.Inverse();
+        auto gamma0_dagger = gamma0.Dagger();
+
+        MatrixBlock<T> temp1 = gamma0 * nu0_inverse;
+        MatrixBlock<T> temp2 = temp1 * gamma0_dagger;
+        MatrixBlock<T> temp3 = gamma0_dagger * nu0_inverse;
+
+        mu = mu0 - temp2;
+        nu = nu0 - temp2 - temp3 * gamma0;
+        gamma = temp1 * gamma0;
+        zeta = temp3 * gamma0_dagger;
+    }
+
+    amrex::Real error = 1;
+    int i = 0;
+
+    while (error > decimation_rel_error)
+    {
+        auto mu_old = mu;
+
+        auto nu_inverse = nu.Inverse();
+        auto temp1 = zeta * nu_inverse * gamma;
+        auto temp2 = gamma * nu_inverse * zeta;
+
+        mu = mu - temp1;
+        nu = nu - temp1 - temp2;
+        gamma = gamma * nu_inverse * gamma;
+        zeta = zeta * nu_inverse * zeta;
+
+        auto sum = mu + mu_old;
+        auto error_mat = (mu - mu_old) * sum.Inverse();
+
+        error = sqrt(amrex::norm(error_mat.FrobeniusNorm()));
+        if (i > decimation_max_iter) break;
+
+        i++;
+    }
+    // amrex::Print() << "decimation iterations: " << i << " error: " << error
+    // << "\n";
+
+    gr = mu.Inverse();
+}
+
+template <typename T>
+AMREX_GPU_HOST_DEVICE void c_NEGF_Common<T>::Compute_SurfaceGreensFunction(
+    MatrixBlock<T> &gr, const ComplexType E, ComplexType U)
+{
+    DecimationTechnique(gr, E - U);
+    // amrex::Print() << "Using decimation, gr: " << gr << "\n";
+}
 
 template <typename T>
 void c_NEGF_Common<T>::get_Sigma_at_contacts(BlkTable1D &h_Sigma_contact_data,
@@ -3897,8 +4046,7 @@ void c_NEGF_Common<T>::get_Sigma_at_contacts(BlkTable1D &h_Sigma_contact_data,
     for (std::size_t c = 0; c < NUM_CONTACTS; ++c)
     {
         MatrixBlock<T> gr;
-        Compute_SurfaceGreensFunction(gr, E - U_contact[c]);
-        // amrex::Print() << "c, E, gr: " << c << " " << E << " " << gr << "\n";
+        Compute_SurfaceGreensFunction(gr, E, U_contact[c]);
         h_Sigma(c) = h_tau(c) * gr * h_tau(c).Dagger();
     }
 }
diff --git a/Source/Solver/Transport/Transport.cpp b/Source/Solver/Transport/Transport.cpp
index 3787f98..3a5c384 100644
--- a/Source/Solver/Transport/Transport.cpp
+++ b/Source/Solver/Transport/Transport.cpp
@@ -79,7 +79,12 @@ void c_TransportSolver::ReadData()
 
     Read_NSTypes(pp_transport);
 
-    Read_GatherAndDepositFields(pp_transport);
+    // ComplexType EmU = E - U;
+    auto &rCode = c_Code::GetInstance();
+    if (rCode.use_electrostatic)
+    {
+        Read_GatherAndDepositFields(pp_transport);
+    }
 
     Read_SelfConsistencyInput(pp_transport);
 
diff --git a/Source/Utils/CodeUtils/MathLib.H b/Source/Utils/CodeUtils/MathLib.H
new file mode 100644
index 0000000..087093c
--- /dev/null
+++ b/Source/Utils/CodeUtils/MathLib.H
@@ -0,0 +1,18 @@
+#include "../../Code_Definitions.H"
+
+namespace MathLib
+{
+void MatrixMatrixMultiply(ComplexType* d_C, const ComplexType* d_A,
+                          const ComplexType* d_B, unsigned int A_rows,
+                          unsigned int A_cols, unsigned int B_cols);
+
+void DenseMatrixInversion(ComplexType* d_Anv, ComplexType* d_A,
+                          unsigned int A_rows, unsigned int A_cols);
+
+void ConjugateTranspose(ComplexType* d_AconjT, const ComplexType* d_A,
+                        unsigned int A_rows, unsigned int A_cols);
+
+void DenseMatrixEigendecomposition(ComplexType* d_U, ComplexType* d_Lambda,
+                                   unsigned int A_rows, unsigned int A_cols);
+
+}  // namespace MathLib
diff --git a/Source/Utils/CodeUtils/MathLib.cpp b/Source/Utils/CodeUtils/MathLib.cpp
new file mode 100644
index 0000000..0692400
--- /dev/null
+++ b/Source/Utils/CodeUtils/MathLib.cpp
@@ -0,0 +1,195 @@
+#include <AMReX_Print.H>
+
+#ifdef AMREX_USE_CUDA
+#include <cublas_v2.h>     //CUBLAS
+#include <cuda_runtime.h>  //CUDA runtime
+#include <cusolverDn.h>    //CUSOLVER
+#endif
+
+#include "MathLib.H"
+#include "cudaErrorCheck.H"
+
+void MathLib::MatrixMatrixMultiply(ComplexType* d_C, const ComplexType* d_A,
+                                   const ComplexType* d_B, unsigned int A_rows,
+                                   unsigned int A_cols, unsigned int B_cols)
+{
+#ifdef AMREX_USE_GPU
+#ifdef AMREX_USE_CUDA
+    cublasHandle_t handle;
+    const cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0);
+    const cuDoubleComplex beta = make_cuDoubleComplex(0.0, 0.0);
+    checkCudaErrors(cublasCreate(&handle));
+
+    // Perform matrix multiplication: C = alpha * A * B + beta * C
+    // see: https://docs.nvidia.com/cuda/archive/10.0/cublas/index.html
+    cublasStatus_t status =
+        cublasZgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, A_rows, B_cols, A_cols,
+                    &alpha, reinterpret_cast<const cuDoubleComplex*>(d_A),
+                    A_rows, reinterpret_cast<const cuDoubleComplex*>(d_B),
+                    A_cols, &beta, reinterpret_cast<cuDoubleComplex*>(d_C),
+                    A_rows);
+    checkCudaErrors(status);
+
+    checkCudaErrors(cublasDestroy(handle));
+#elif AMREX_USE_HIP
+    /*HIP version*/
+#elif AMREX_USE_SYCL
+    /*SYCL version*/
+#endif
+#else
+    /*LAPACK version*/
+#endif
+}
+
+void MathLib::DenseMatrixInversion(ComplexType* d_Ainv, ComplexType* d_A,
+                                   unsigned int A_rows, unsigned int A_cols)
+{
+#ifdef AMREX_USE_GPU
+#ifdef AMREX_USE_CUDA
+    cublasHandle_t cublasH = nullptr;
+    cusolverDnHandle_t cusolverH = nullptr;
+
+    checkCudaErrors(cusolverDnCreate(&cusolverH));
+    checkCudaErrors(cublasCreate(&cublasH));
+
+    int* d_info = nullptr;
+    checkCudaErrors(cudaMalloc(&d_info, sizeof(int)));
+
+    int* d_pivot = nullptr;
+    checkCudaErrors(cudaMalloc(&d_pivot, A_rows * sizeof(int)));
+
+    int lwork = 0;
+    cuDoubleComplex* d_work = nullptr;
+
+    // Compute A_inverse from A using LU decomposition.
+    checkCudaErrors(
+        cusolverDnZgetrf_bufferSize(cusolverH, A_rows, A_cols,
+                                    reinterpret_cast<cuDoubleComplex*>(d_A),
+                                    A_rows, &lwork));
+
+    checkCudaErrors(cudaMalloc(&d_work, lwork * sizeof(cuDoubleComplex)));
+
+    // Perform LU decomposition using cuSolver.
+    // P A = L U (P is permutation matrix)
+    //  see: https://docs.nvidia.com/cuda/cusolver/index.html
+    checkCudaErrors(cusolverDnZgetrf(cusolverH, A_rows, A_cols,
+                                     reinterpret_cast<cuDoubleComplex*>(d_A),
+                                     A_rows, d_work, d_pivot, d_info));
+
+    // Check if LU decomposition was successful
+    int info = 0;
+    checkCudaErrors(
+        cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (info != 0)
+    {
+        std::cerr << "LU decomposition failed with info = " << info << "\n";
+        cudaDeviceReset();
+        exit(1);
+    }
+
+    // Solve for Ainv using eq: A * A_inv = I, where A is factorized.
+    // op(A) * X = B
+    checkCudaErrors(
+        cusolverDnZgetrs(cusolverH, CUBLAS_OP_N, A_rows, A_rows,
+                         reinterpret_cast<const cuDoubleComplex*>(d_A), A_rows,
+                         d_pivot, reinterpret_cast<cuDoubleComplex*>(d_Ainv),
+                         A_rows, d_info));
+
+    checkCudaErrors(cudaFree(d_info));
+    checkCudaErrors(cudaFree(d_pivot));
+    checkCudaErrors(cudaFree(d_work));
+
+    checkCudaErrors(cusolverDnDestroy(cusolverH));
+    checkCudaErrors(cublasDestroy(cublasH));
+
+#elif AMREX_USE_HIP
+    /*HIP version*/
+#elif AMREX_USE_SYCL
+    /*SYCL version*/
+#endif
+#else
+    /*LAPACK version*/
+#endif
+}
+
+void MathLib::ConjugateTranspose(ComplexType* d_AconjT, const ComplexType* d_A,
+                                 unsigned int A_rows, unsigned int A_cols)
+{
+#ifdef AMREX_USE_GPU
+#ifdef AMREX_USE_CUDA
+    cublasHandle_t handle;
+    const cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0);
+    const cuDoubleComplex beta = make_cuDoubleComplex(0.0, 0.0);
+    checkCudaErrors(cublasCreate(&handle));
+
+    // Perform matrix conjugate transpose: d_AconjT = conj(d_A)
+    // https://docs.nvidia.com/cuda/cublas/
+    cublasStatus_t status =
+        cublasZgeam(handle, CUBLAS_OP_C, CUBLAS_OP_N, A_cols, A_rows, &alpha,
+                    reinterpret_cast<const cuDoubleComplex*>(d_A), A_rows,
+                    &beta, reinterpret_cast<const cuDoubleComplex*>(d_A),
+                    A_rows, reinterpret_cast<cuDoubleComplex*>(d_AconjT),
+                    A_cols);
+    checkCudaErrors(status);
+
+    checkCudaErrors(cublasDestroy(handle));
+#elif AMREX_USE_HIP
+    /*HIP version*/
+#elif AMREX_USE_SYCL
+    /*SYCL version*/
+#endif
+#else
+    /*LAPACK version*/
+#endif
+}
+
+void MathLib::DenseMatrixEigendecomposition(ComplexType* d_U,
+                                            ComplexType* d_Lambda,
+                                            unsigned int A_rows,
+                                            unsigned int A_cols)
+{
+#ifdef AMREX_USE_GPU
+#ifdef AMREX_USE_CUDA
+    cusolverDnHandle_t cusolverH;
+    checkCudaErrors(cusolverDnCreate(&cusolverH));
+
+    // Copy the matrix to be decomposed, say A, into d_U before calling this.
+    // so d_U will hold the eigenvectors upon completion
+
+    // Workspace and info setup for cuSolver
+    int work_size = 0;
+    int* devInfo;
+    checkCudaErrors(cudaMalloc((void**)&devInfo, sizeof(int)));
+
+    //// Query workspace size for eigendecomposition of a complex Hermitian
+    /// matrix
+    // checkCudaErrors(cusolverDnZheevd_bufferSize(
+    //     cusolverH, CUBLAS_FILL_MODE_UPPER, A_rows,
+    //     reinterpret_cast<cuDoubleComplex*>(d_U), A_rows,
+    //     reinterpret_cast<cuDoubleComplex*>(d_Lambda), &work_size));
+
+    //// Allocate workspace
+    // ComplexType* d_work;
+    // checkCudaErrors(cudaMalloc((void**)&d_work, work_size *
+    // sizeof(ComplexType)));
+
+    //// Perform the eigendecomposition
+    // checkCudaErrors(cusolverDnZheevd(
+    //     cusolverH, CUBLAS_FILL_MODE_UPPER, A_rows,
+    //     reinterpret_cast<cuDoubleComplex*>(d_U), A_rows,
+    //     reinterpret_cast<cuDoubleComplex*>(d_Lambda),
+    //     reinterpret_cast<cuDoubleComplex*>(d_work), work_size, devInfo));
+
+    // Clean up
+    // checkCudaErrors(cudaFree(d_work));
+    // checkCudaErrors(cudaFree(devInfo));
+    checkCudaErrors(cusolverDnDestroy(cusolverH));
+#elif AMREX_USE_HIP
+    /*HIP version*/
+#elif AMREX_USE_SYCL
+    /*SYCL version*/
+#endif
+#else
+    /*LAPACK version*/
+#endif
+}
diff --git a/Source/Utils/CodeUtils/cudaErrorCheck.H b/Source/Utils/CodeUtils/cudaErrorCheck.H
new file mode 100644
index 0000000..872bac3
--- /dev/null
+++ b/Source/Utils/CodeUtils/cudaErrorCheck.H
@@ -0,0 +1,18 @@
+////////////////////////////////////////////////////////////////////////////////
+// CUDA error checking
+////////////////////////////////////////////////////////////////////////////////
+#define checkCudaErrors(val) \
+    __checkCudaErrors__((val), #val, __FILE__, __LINE__)
+
+template <typename T>
+inline void __checkCudaErrors__(T code, const char *func, const char *file,
+                                int line)
+{
+    if (code)
+    {
+        fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line,
+                (unsigned int)code, func);
+        cudaDeviceReset();
+        exit(EXIT_FAILURE);
+    }
+}
diff --git a/scripts/negf_cpp/decimation_technique/Source/main.cpp b/scripts/negf_cpp/decimation_technique/Source/main.cpp
index 616e835..7c4ab34 100644
--- a/scripts/negf_cpp/decimation_technique/Source/main.cpp
+++ b/scripts/negf_cpp/decimation_technique/Source/main.cpp
@@ -102,10 +102,12 @@ void CondenseHamiltonian(CondensedHamiltonianElem& cond, MatrixDType E,
 
     cond.Pi = gamma * beta * H_tilde_11;
 
-    // amrex::Print() << "E: " << E << "\n";
-    // amrex::Print() << "Xi_s: " << Xi_s << "\n";
-    // amrex::Print() << "Xi: " << Xi << "\n";
-    // amrex::Print() << "Pi: " << Pi << "\n";
+    amrex::Print() << "E: "    << E << "\n";
+    amrex::Print() << "U: "    << U << "\n";
+    amrex::Print() << "EmU: "  << E-U << "\n";
+    amrex::Print() << "Xi_s: " << cond.Xi_s << "\n";
+    amrex::Print() << "Xi: "   << cond.Xi << "\n";
+    amrex::Print() << "Pi: "   << cond.Pi << "\n";
 }
 
 AMREX_GPU_HOST_DEVICE

From e376692d36be20addfd4bcf953e0e8268c2dca3a Mon Sep 17 00:00:00 2001
From: "saru@southpole" <saurabhsawant@lbl.gov>
Date: Tue, 19 Nov 2024 21:07:48 -0800
Subject: [PATCH 2/4] minor formatting modification

---
 Source/Solver/Transport/NEGF/NEGF_Common.H | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/Source/Solver/Transport/NEGF/NEGF_Common.H b/Source/Solver/Transport/NEGF/NEGF_Common.H
index 059c2a4..48a0b7c 100644
--- a/Source/Solver/Transport/NEGF/NEGF_Common.H
+++ b/Source/Solver/Transport/NEGF/NEGF_Common.H
@@ -468,11 +468,10 @@ class c_NEGF_Common : protected MatrixBlock<T>, private c_IntegrationPath
 #endif
 
     /*Decimation Technique*/
-    int use_decimation =
-        false;  // whether to use decimation technique in derived class.
-                // if Compute_SurfaceGreensFunction method is not overriden
-                // then the default is decimation regardless of what we set
-                // use_decimation as.
+    int use_decimation = false;
+    // whether to use decimation technique in the derived class.
+    // derived class default is false.
+    // base class default is true.
 
     int decimation_max_iter = 30;      // maximum iterations allowed
     int decimation_rel_error = 1.e-5;  // relative error

From 2b2cdc90c4f4ba94e3bceec3ce93c795ad4e4a36 Mon Sep 17 00:00:00 2001
From: "saru@southpole" <saurabhsawant@lbl.gov>
Date: Fri, 22 Nov 2024 17:12:45 -0800
Subject: [PATCH 3/4] tested decimation

---
 Source/Solver/Transport/NEGF/NEGF_Common.cpp    | 7 +++++++
 input/negf/StewartFrancois2006/all_around_metal | 6 ++++++
 2 files changed, 13 insertions(+)

diff --git a/Source/Solver/Transport/NEGF/NEGF_Common.cpp b/Source/Solver/Transport/NEGF/NEGF_Common.cpp
index 2dcfa9f..6c6e6e9 100644
--- a/Source/Solver/Transport/NEGF/NEGF_Common.cpp
+++ b/Source/Solver/Transport/NEGF/NEGF_Common.cpp
@@ -602,6 +602,13 @@ void c_NEGF_Common<T>::Read_DecimationTechniqueParams(amrex::ParmParse &pp_ns)
     queryWithParser(pp_ns, "decimation_max_iter", decimation_max_iter);
     queryWithParser(pp_ns, "decimation_rel_error", decimation_rel_error);
     queryWithParser(pp_ns, "decimation_layers", decimation_layers);
+
+    amrex::Print() << "##### use_decimation: " << use_decimation << "\n";
+    amrex::Print() << "##### decimation_max_iter: " << decimation_max_iter
+                   << "\n";
+    amrex::Print() << "##### decimation_rel_error: " << decimation_rel_error
+                   << "\n";
+    amrex::Print() << "##### decimation_layers: " << decimation_layers << "\n";
 }
 
 template <typename T>
diff --git a/input/negf/StewartFrancois2006/all_around_metal b/input/negf/StewartFrancois2006/all_around_metal
index 605c8bb..bb4ebd9 100644
--- a/input/negf/StewartFrancois2006/all_around_metal
+++ b/input/negf/StewartFrancois2006/all_around_metal
@@ -210,4 +210,10 @@ cnt.eq_integration_pts = 30 30 30
 cnt.noneq_integration_pts = 100
 
 cnt.write_at_iter = 1
+
+### decimation technique
+cnt.use_decimation=1 
+cnt.decimation_max_iter=50 
+cnt.decimation_rel_error=1e-5 
+cnt.decimation_layers=20
 #####################################

From 65a925ee1125eb395406a9aa38201f8b1c2b3663 Mon Sep 17 00:00:00 2001
From: "saru@southpole" <saurabhsawant@lbl.gov>
Date: Sun, 1 Dec 2024 16:56:07 -0800
Subject: [PATCH 4/4] verified decimation technique implementation

---
 input/negf/StewartFrancois2006/all_around_metal | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/input/negf/StewartFrancois2006/all_around_metal b/input/negf/StewartFrancois2006/all_around_metal
index bb4ebd9..2bc5985 100644
--- a/input/negf/StewartFrancois2006/all_around_metal
+++ b/input/negf/StewartFrancois2006/all_around_metal
@@ -212,8 +212,8 @@ cnt.noneq_integration_pts = 100
 cnt.write_at_iter = 1
 
 ### decimation technique
-cnt.use_decimation=1 
-cnt.decimation_max_iter=50 
-cnt.decimation_rel_error=1e-5 
-cnt.decimation_layers=20
+#cnt.use_decimation=1 
+#cnt.decimation_max_iter=50 
+#cnt.decimation_rel_error=1e-5 
+#cnt.decimation_layers=100
 #####################################