diff --git a/docs/src/manifolds/stiefel.md b/docs/src/manifolds/stiefel.md
index 0f950ad898..604426aa0b 100644
--- a/docs/src/manifolds/stiefel.md
+++ b/docs/src/manifolds/stiefel.md
@@ -54,6 +54,14 @@ Public = true
 Private = false
 ```
 
+```@autodocs
+Modules = [Manifolds]
+Pages = ["manifolds/StiefelFrechet.jl"]
+Order = [:type, :function]
+Public = true
+Private = false
+```
+
 ## Internal types and functions
 
 ```@autodocs
diff --git a/docs/src/misc/internals.md b/docs/src/misc/internals.md
index 3b0673f9de..f08a7146cd 100644
--- a/docs/src/misc/internals.md
+++ b/docs/src/misc/internals.md
@@ -7,8 +7,10 @@ This page documents the internal types and methods of `Manifolds.jl`'s that migh
 ```@docs
 Manifolds.eigen_safe
+Manifolds.expm_frechet
 Manifolds.isnormal
 Manifolds.log_safe
 Manifolds.log_safe!
+Manifolds.minimize
 Manifolds.mul!_safe
 Manifolds.nzsign
 Manifolds.realify
diff --git a/src/Manifolds.jl b/src/Manifolds.jl
index 03d63c780d..2939f58bd9 100644
--- a/src/Manifolds.jl
+++ b/src/Manifolds.jl
@@ -278,6 +278,8 @@ using StatsBase
 using StatsBase: AbstractWeights
 
 include("utils.jl")
+include("expm_frechet.jl")
+include("minimize_lbfgs.jl")
 
 include("product_representations.jl")
 include("differentiation/differentiation.jl")
@@ -345,6 +347,7 @@ include("manifolds/Stiefel.jl")
 include("manifolds/StiefelEuclideanMetric.jl")
 include("manifolds/StiefelCanonicalMetric.jl")
 include("manifolds/StiefelSubmersionMetric.jl")
+include("manifolds/StiefelFrechet.jl")
 include("manifolds/Sphere.jl")
 include("manifolds/SphereSymmetricMatrices.jl")
 include("manifolds/Symmetric.jl")
@@ -659,6 +662,8 @@ export ×,
     differential_canonical_project,
     differential_canonical_project!,
     distance,
+    dot_exp,
+    dot_exp!,
     dual_basis,
     einstein_tensor,
     embed,
@@ -666,6 +671,8 @@ export ×,
     equiv,
     exp,
     exp!,
     flat,
     flat!,
     gaussian_curvature,
@@ -702,6 +709,7 @@ export ×,
     local_metric_jacobian,
     log,
     log!,
+    log_lbfgs,
     log_local_metric_density,
     manifold_dimension,
     metric,
diff --git a/src/expm_frechet.jl b/src/expm_frechet.jl
new file mode 100644
index 0000000000..f3c9cedd6d
--- /dev/null
+++ b/src/expm_frechet.jl
@@ -0,0 +1,518 @@
+"""
+based on algorithm 6.3 of [^AlMohyHigham2009].
+"""
+
+# From table 6.1 of [^AlMohyHigham2009]. A list of cut off values ``l_m`` used to determine how many terms of Padé approximant is used to compute expm_frechet(A, E), look for the largest index ``m\in [3, 5, 7, 9, 13]`` just exceed \lVert A\rVert_1.
+const ell_table_61 = (
+    nothing,
+    # 1
+    2.11e-8,
+    3.56e-4,
+    1.08e-2,
+    6.49e-2,
+    2.00e-1,
+    4.37e-1,
+    7.83e-1,
+    1.23e0,
+    1.78e0,
+    2.42e0,
+    # 11
+    3.13e0,
+    3.90e0,
+    4.74e0,
+    5.63e0,
+    6.56e0,
+    7.52e0,
+    8.53e0,
+    9.56e0,
+    1.06e1,
+    1.17e1,
+)
+
+@doc raw"""
+    _diff_pade3(A, E)
+
+    Compute U, V, LU, LV from the first 6 lines of the for loop in
+    algorithm 6.4 of [^AlMohyHigham2009] using 3-term Padé approximant
+    the tuple b contains the Padé coefficients of the numerator
+"""
+@inline function _diff_pade3!(buff, A, E)
+    b = (120.0, 60.0, 12.0, 1.0)
+    k = size(A)[1]
+
+    @views begin
+        U = buff[1:k, 1:end]
+        V = buff[(k + 1):(2 * k), 1:end]
+        Lu = buff[(2 * k + 1):(3 * k), 1:end]
+        Lv = buff[(3 * k + 1):(4 * k), 1:end]
+
+        A2 = buff[(4 * k + 1):(5 * k), 1:end]
+        M2 = buff[(5 * k + 1):(6 * k), 1:end]
+    end
+
+    mul!(A2, A, A)
+    mul!(M2, A, E)
+    mul!(M2, E, A, 1, 1)
+    mul!(U, A, b[2])
+    mul!(U, A, A2, b[4], 1)
+    V .= b[3] * A2 + UniformScaling(b[1])
+    mul!(Lu, E, V)
+    mul!(Lu, A, M2, b[3], 1)
+    return mul!(Lv, b[3], M2)
+end
+@inline function _diff_pade3(A, E)
+    b = (120.0, 60.0, 12.0, 1.0)
+    A2 = A * A
+    M2 = A * E + E * A
+    U = A * (b[4] * A2 + UniformScaling(b[2]))
+    V = b[3] * A2 + UniformScaling(b[1])
+    Lu = A * (b[3] * M2) + E * (b[3] * A2 + UniformScaling(b[1]))
+    Lv = b[3] .* M2
+    return U, V, Lu, Lv
+end
+
+@doc raw"""
+    _diff_pade5(A, E)
+
+    Compute U, V, LU, LV from the first 6 lines of the for loop in
+    algorithm 6.4 of [^AlMohyHigham2009] using 5-term Padé approximant
+    the tuple b contains the Padé coefficients of the numerator
+"""
+@inline function _diff_pade5!(buff, A, E)
+    b = (30240.0, 15120.0, 3360.0, 420.0, 30.0, 1.0)
+    k = size(A)[1]
+    @views begin
+        U = buff[1:k, 1:end]
+        V = buff[(k + 1):(2 * k), 1:end]
+        Lu = buff[(2 * k + 1):(3 * k), 1:end]
+        Lv = buff[(3 * k + 1):(4 * k), 1:end]
+
+        A2 = buff[(4 * k + 1):(5 * k), 1:end]
+        M2 = buff[(5 * k + 1):(6 * k), 1:end]
+
+        A4 = buff[(6 * k + 1):(7 * k), 1:end]
+        M4 = buff[(7 * k + 1):(8 * k), 1:end]
+    end
+
+    mul!(A2, A, A)
+    mul!(M2, A, E)
+    mul!(M2, E, A, 1, 1)
+
+    mul!(A4, A2, A2)
+    mul!(M4, A2, M2)
+    mul!(M4, M2, A2, 1, 1)
+    Z = b[6] * A4 + b[4] * A2 + UniformScaling(b[2])
+    mul!(U, A, Z)
+    V .= b[5] * A4 + b[3] * A2 + UniformScaling(b[1])
+    mul!(Lu, E, Z)
+    mul!(Lu, A, M4, b[6], 1)
+    mul!(Lu, A, M2, b[4], 1)
+
+    return Lv .= b[5] * M4 + b[3] * M2
+end
+@inline function _diff_pade5(A, E)
+    b = (30240.0, 15120.0, 3360.0, 420.0, 30.0, 1.0)
+    A2 = A * A
+    M2 = A * E + E * A
+    A4 = A2 * A2
+    M4 = A2 * M2 + M2 * A2
+    U = A * (b[6] * A4 + b[4] * A2 + UniformScaling(b[2]))
+    V = b[5] * A4 + b[3] * A2 + UniformScaling(b[1])
+    Lu = (A * (b[6] * M4 + b[4] * M2) + E * (b[6] * A4 + b[4] * A2 + UniformScaling(b[2])))
+    Lv = b[5] * M4 + b[3] * M2
+    return U, V, Lu, Lv
+end
+
+@doc raw"""
+    _diff_pade7(A, E)
+
+    Compute U, V, LU, LV from the first 6 lines of the for loop in
+    algorithm 6.4 of [^AlMohyHigham2009] using 7-term Padé approximant
+    the tuple b contains the Padé coefficients of the numerator
+"""
+@inline function _diff_pade7!(buff, A, E)
+    b = (17297280.0, 8648640.0, 1995840.0, 277200.0, 25200.0, 1512.0, 56.0, 1.0)
+    k = size(A)[1]
+    @views begin
+        U = buff[1:k, 1:end]
+        V = buff[(k + 1):(2 * k), 1:end]
+        Lu = buff[(2 * k + 1):(3 * k), 1:end]
+        Lv = buff[(3 * k + 1):(4 * k), 1:end]
+
+        A2 = buff[(4 * k + 1):(5 * k), 1:end]
+        M2 = buff[(5 * k + 1):(6 * k), 1:end]
+
+        A4 = buff[(6 * k + 1):(7 * k), 1:end]
+        M4 = buff[(7 * k + 1):(8 * k), 1:end]
+
+        A6 = buff[(8 * k + 1):(9 * k), 1:end]
+        M6 = buff[(9 * k + 1):(10 * k), 1:end]
+    end
+
+    mul!(A2, A, A)
+    mulsym!(M2, A, E)
+    mul!(A4, A2, A2)
+    mulsym!(M4, A2, M2)
+
+    mul!(A6, A2, A4)
+    mul!(M6, A4, M2)
+    mul!(M6, M4, A2, 1, 1)
+
+    Z = b[8] * A6 + b[6] * A4 + b[4] * A2 + UniformScaling(b[2])
+    mul!(U, A, Z)
+    V .= b[7] * A6 + b[5] * A4 + b[3] * A2 + UniformScaling(b[1])
+    mul!(Lu, E, Z)
+    mul!(Lu, A, b[8] * M6 + b[6] * M4 + b[4] * M2, 1, 1)
+    return Lv .= b[7] * M6 + b[5] * M4 + b[3] * M2
+end
+@inline function _diff_pade7(A, E)
+    b = (17297280.0, 8648640.0, 1995840.0, 277200.0, 25200.0, 1512.0, 56.0, 1.0)
+    A2 = A * A
+    M2 = A * E + E * A
+    A4 = A2 * A2
+    M4 = A2 * M2 + M2 * A2
+    A6 = A2 * A4
+    M6 = A4 * M2 + M4 * A2
+    U = A * (b[8] * A6 + b[6] * A4 + b[4] * A2 + UniformScaling(b[2]))
+    V = b[7] * A6 + b[5] * A4 + b[3] * A2 + UniformScaling(b[1])
+    Lu = (
+        A * (b[8] * M6 + b[6] * M4 + b[4] * M2) +
+        E * (b[8] * A6 + b[6] * A4 + b[4] * A2 + UniformScaling(b[2]))
+    )
+    Lv = b[7] * M6 + b[5] * M4 + b[3] * M2
+    return U, V, Lu, Lv
+end
+
+@doc raw"""
+    _diff_pade9(A, E)
+
+    Compute U, V, LU, LV from the first 6 lines of the for loop in
+    algorithm 6.4 of [^AlMohyHigham2009] using 9-term Padé approximant
+    the tuple b contains the Padé coefficients of the numerator
+"""
+@inline function _diff_pade9!(buff, A, E)
+    b = (
+        17643225600.0,
+        8821612800.0,
+        2075673600.0,
+        302702400.0,
+        30270240.0,
+        2162160.0,
+        110880.0,
+        3960.0,
+        90.0,
+        1.0,
+    )
+
+    k = size(A)[1]
+    @views begin
+        U = buff[1:k, 1:end]
+        V = buff[(k + 1):(2 * k), 1:end]
+        Lu = buff[(2 * k + 1):(3 * k), 1:end]
+        Lv = buff[(3 * k + 1):(4 * k), 1:end]
+
+        A2 = buff[(4 * k + 1):(5 * k), 1:end]
+        M2 = buff[(5 * k + 1):(6 * k), 1:end]
+
+        A4 = buff[(6 * k + 1):(7 * k), 1:end]
+        M4 = buff[(7 * k + 1):(8 * k), 1:end]
+
+        A6 = buff[(8 * k + 1):(9 * k), 1:end]
+        M6 = buff[(9 * k + 1):(10 * k), 1:end]
+
+        A8 = buff[(10 * k + 1):(11 * k), 1:end]
+        M8 = buff[(11 * k + 1):(12 * k), 1:end]
+    end
+
+    mul!(A2, A, A)
+    mulsym!(M2, A, E)
+    mul!(A4, A2, A2)
+    mulsym!(M4, A2, M2)
+    mul!(A6, A2, A4)
+    mul!(M6, A4, M2)
+    mul!(M6, M4, A2, 1, 1)
+    mul!(A8, A4, A4)
+    mulsym!(M8, A4, M4)
+    Z = b[10] * A8 + b[8] * A6 + b[6] * A4 + b[4] * A2 + UniformScaling(b[2])
+    mul!(U, A, Z)
+    V .= b[9] * A8 + b[7] * A6 + b[5] * A4 + b[3] * A2 + UniformScaling(b[1])
+
+    mul!(Lu, E, Z)
+    mul!(Lu, A, b[10] * M8 + b[8] * M6 + b[6] * M4 + b[4] * M2, 1, 1)
+    return Lv .= b[9] * M8 + b[7] * M6 + b[5] * M4 + b[3] * M2
+end
+@inline function _diff_pade9(A, E)
+    b = (
+        17643225600.0,
+        8821612800.0,
+        2075673600.0,
+        302702400.0,
+        30270240.0,
+        2162160.0,
+        110880.0,
+        3960.0,
+        90.0,
+        1.0,
+    )
+    A2 = A * A
+    M2 = A * E + E * A
+    A4 = A2 * A2
+    M4 = A2 * M2 + M2 * A2
+    A6 = A2 * A4
+    M6 = A4 * M2 + M4 * A2
+    A8 = A4 * A4
+    M8 = A4 * M4 + M4 * A4
+    U = A * (b[10] * A8 + b[8] * A6 + b[6] * A4 + b[4] * A2 + UniformScaling(b[2]))
+    V = b[9] * A8 + b[7] * A6 + b[5] * A4 + b[3] * A2 + UniformScaling(b[1])
+    Lu = (
+        A * (b[10] * M8 + b[8] * M6 + b[6] * M4 + b[4] * M2) +
+        E * (b[10] * A8 + b[8] * A6 + b[6] * A4 + b[4] * A2 + UniformScaling(b[2]))
+    )
+    Lv = b[9] * M8 + b[7] * M6 + b[5] * M4 + b[3] * M2
+    return U, V, Lu, Lv
+end
+
+@doc raw"""
+    _diff_pade13(A, E)
+
+    Compute U, V, LU, LV from the lines 10-25 of the for loop in
+    algorithm 6.4 of [^AlMohyHigham2009] using 9-term Padé approximant
+    The returns U, V, Lu, Lv are stored in the first four blocks of buff
+    the tuple b contains the Padé coefficients of the numerator
+"""
+@inline function _diff_pade13!(buff, A, E)
+    b = (
+        64764752532480000.0,
+        32382376266240000.0,
+        7771770303897600.0,
+        1187353796428800.0,
+        129060195264000.0,
+        10559470521600.0,
+        670442572800.0,
+        33522128640.0,
+        1323241920.0,
+        40840800.0,
+        960960.0,
+        16380.0,
+        182.0,
+        1.0,
+    )
+    k = size(A)[1]
+
+    @views begin
+        U = buff[1:k, 1:end]
+        V = buff[(k + 1):(2 * k), 1:end]
+        Lu = buff[(2 * k + 1):(3 * k), 1:end]
+        Lv = buff[(3 * k + 1):(4 * k), 1:end]
+
+        A2 = buff[(4 * k + 1):(5 * k), 1:end]
+        M2 = buff[(5 * k + 1):(6 * k), 1:end]
+
+        A4 = buff[(6 * k + 1):(7 * k), 1:end]
+        M4 = buff[(7 * k + 1):(8 * k), 1:end]
+
+        A6 = buff[(8 * k + 1):(9 * k), 1:end]
+        M6 = buff[(9 * k + 1):(10 * k), 1:end]
+
+        W1 = buff[(10 * k + 1):(11 * k), 1:end]
+        Z1 = buff[(11 * k + 1):(12 * k), 1:end]
+        W = buff[(12 * k + 1):(13 * k), 1:end]
+        Lw1 = buff[(13 * k + 1):(14 * k), 1:end]
+        Lz1 = buff[(14 * k + 1):(15 * k), 1:end]
+        Lw = buff[(15 * k + 1):(16 * k), 1:end]
+    end
+
+    mul!(A2, A, A)
+    mul!(M2, A, E)
+    mul!(M2, E, A, 1, 1)
+    mul!(A4, A2, A2)
+    mul!(M4, A2, M2)
+    mul!(M4, M2, A2, 1, 1)
+    mul!(A6, A2, A4)
+    mul!(M6, A4, M2)
+    mul!(M6, M4, A2, 1, 1)
+    mul!(W1, b[14], A6)
+    W1 .+= b[12] .* A4
+    W1 .+= b[10] .* A2
+
+    mul!(Z1, b[13], A6)
+    Z1 .+= b[11] .* A4
+    Z1 .+= b[9] .* A2
+
+    mul!(W, A6, W1)
+    W .+= b[8] * A6 + b[6] * A4 + b[4] * A2 + UniformScaling(b[2])
+
+    mul!(U, A, W)
+    mul!(V, A6, Z1)
+    V .+= b[7] * A6 + b[5] * A4 + b[3] * A2 + UniformScaling(b[1])
+
+    Lw1 .= b[14] * M6 + b[12] * M4 + b[10] * M2
+    mul!(Lz1, b[13], M6)
+    Lz1 .+= b[11] * M4 + b[9] * M2
+
+    mul!(Lw, A6, Lw1)
+    mul!(Lw, M6, W1, 1, 1)
+    Lw .+= b[8] * M6 + b[6] * M4 + b[4] * M2
+
+    mul!(Lu, A, Lw)
+    mul!(Lu, E, W, 1, 1)
+    mul!(Lv, A6, Lz1)
+    mul!(Lv, M6, Z1, 1, 1)
+    return Lv .+= b[7] * M6 + b[5] * M4 + b[3] * M2
+end
+@inline function _diff_pade13(A, E)
+    b = (
+        64764752532480000.0,
+        32382376266240000.0,
+        7771770303897600.0,
+        1187353796428800.0,
+        129060195264000.0,
+        10559470521600.0,
+        670442572800.0,
+        33522128640.0,
+        1323241920.0,
+        40840800.0,
+        960960.0,
+        16380.0,
+        182.0,
+        1.0,
+    )
+
+    A2 = A * A
+
+    M2 = A * E + E * A
+    A4 = A2 * A2
+    M4 = A2 * M2 + M2 * A2
+    A6 = A2 * A4
+    M6 = A4 * M2 + M4 * A2
+    W1 = b[14] * A6 + b[12] * A4 + b[10] * A2
+    W2 = b[8] * A6 + b[6] * A4 + b[4] * A2 + UniformScaling(b[2])
+    Z1 = b[13] * A6 + b[11] * A4 + b[9] * A2
+    Z2 = b[7] * A6 + b[5] * A4 + b[3] * A2 + UniformScaling(b[1])
+    W = A6 * W1 + W2
+    U = A * W
+    V = A6 * Z1 + Z2
+    Lw1 = b[14] * M6 + b[12] * M4 + b[10] * M2
+    Lw2 = b[8] * M6 + b[6] * M4 + b[4] * M2
+    Lz1 = b[13] * M6 + b[11] * M4 + b[9] * M2
+    Lz2 = b[7] * M6 + b[5] * M4 + b[3] * M2
+    Lw = A6 * Lw1 + M6 * W1 + Lw2
+    Lu = A * Lw + E * W
+    Lv = A6 * Lz1 + M6 * Z1 + Lz2
+    return U, V, Lu, Lv
+end
+
+@doc raw"""
+    expm_frechet(A, E)
+    expm_frechet!(buff, A, E)
+
+Compute Frechet derivative of expm(A) in direction E using algorithm 6.4 of [^AlMohyHigham2009]
+````math
+\mathrm{expm\_frechet}(A, E) = \lim_{t\to 0}\frac{1}{t}(\exp(A + tE) - \exp(A))
+````
+For sufficiently small ``\lVert A\rVert_1`` norm, we use Padé with an appropriate number of terms
+( one of (3, 5, 7, 9, 13)), with 13 terms is the default. Otherwise, we use the formula
+``\exp(A) = (\exp(A/2^s))^{2^s}``, where s is used to scale so ``2^{-s}\lVert A\rVert_1`` is smaller than element ``\mathrm{ell\_table\_61}``[14] of the lookup table ``\mathrm{ell\_table\_61}`` in the code, which comes from table 6.1 in op. cit.
+
+For ``\mathrm{expm\_frechet}!``, buff is a matrix of size 16*k times k, where ``k`` is the size of ``A``. The returns, (exp(A), dexp(A, E)) are stored in the first two blocks of buff.
+The remaining blocks are used as temporary storage. This is for algorithms calling expm_frechet repeatedly, where allocation becomes the biggest bottleneck.
+
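+A quick usage sketch (`expm_frechet` is internal, hence the module prefix); the finite-difference quotient is only an illustrative check:
+
+```julia
+using Manifolds, LinearAlgebra
+A, E = randn(4, 4), randn(4, 4)
+eA, deA = Manifolds.expm_frechet(A, E)
+eA ≈ exp(A)                    # true
+t = 1e-7
+norm((exp(A + t * E) - exp(A)) / t - deA) / norm(deA)  # small, O(t)
+```
+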
+[^AlMohyHigham2009]:
+    > Al-Mohy, A. H., Higham, N. J. (2009),
+    > "Computing the Fréchet Derivative of the Matrix Exponential, with an Application to Condition Number Estimation",
+    > SIAM Journal on Matrix Analysis and Applications, 30(4), pp. 1639–1657.
+    > doi: [10.1137/080716426](https://doi.org/10.1137/080716426)
+"""
+function expm_frechet(A, E)
+    s = nothing
+    A_norm_1 = maximum(sum(abs.(A), dims=1))
+    m_pade_pairs = ((3, _diff_pade3), (5, _diff_pade5), (7, _diff_pade7), (9, _diff_pade9))
+
+    for m_pade in m_pade_pairs
+        m, pade = m_pade
+        if A_norm_1 <= ell_table_61[m]
+            U, V, Lu, Lv = pade(A, E)
+            s = 0
+            break
+        end
+    end
+    if isnothing(s)
+        # scaling
+        s = max(0, Int(ceil(log2(A_norm_1 / ell_table_61[14]))))
+        # pade order 13
+        U, V, Lu, Lv = _diff_pade13((2.0^-s) * A, (2.0^-s) * E)
+    end
+    # factor once and solve twice
+    lu_piv = lu(-U + V)
+    eA = lu_piv \ (U + V)
+    eAf = lu_piv \ (Lu + Lv + (Lu - Lv) * eA)
+
+    # squaring
+    for _ in 1:s
+        eAf = eA * eAf + eAf * eA
+        eA = eA * eA
+    end
+
+    return eA, eAf
+end
+function expm_frechet!(buff, A, E)
+    n = size(A, 1)
+    s = nothing
+    A_norm_1 = maximum(sum(abs.(A), dims=1))
+    k = size(A)[1]
+    m_pade_pairs =
+        ((3, _diff_pade3!), (5, _diff_pade5!), (7, _diff_pade7!), (9, _diff_pade9!))
+
+    for m_pade in m_pade_pairs
+        m, pade = m_pade
+        if A_norm_1 <= ell_table_61[m]
+            pade(buff, A, E)
+            s = 0
+            break
+        end
+    end
+    if isnothing(s)
+        # scaling
+        s = max(0, Int(ceil(log2(A_norm_1 / ell_table_61[14]))))
+        # pade order 13
+        _diff_pade13!(buff, (2.0^-s) * A, (2.0^-s) * E)
+    end
+    buff[(4 * k + 1):(8 * k), :] .= buff[1:(4 * k), :]
+    @views begin
+        eA = buff[1:k, :]
+        eAf = buff[(k + 1):(2 * k), :]
+        tmp = buff[(2 * k + 1):(3 * k), :]
+
+        U = buff[(4 * k + 1):(5 * k), :]
+        V = buff[(5 * k + 1):(6 * k), :]
+        Lu = buff[(6 * k + 1):(7 * k), :]
+        Lv = buff[(7 * k + 1):(8 * k), :]
+    end
+
+    # factor once and solve twice
+    lu_piv = lu(-U + V)
+    broadcast!(+, eA, U, V)
+    ldiv!(lu_piv, eA)
+    broadcast!(-, tmp, Lu, Lv)
+    mul!(eAf, tmp, eA)
+    eAf .+= Lu .+ Lv
+    ldiv!(lu_piv, eAf)
+
+    # squaring
+    for _ in 1:s
+        mulsym!(tmp, eA, eAf)
+        eAf .= tmp
+        mul!(tmp, eA, eA)
+        eA .= tmp
+    end
+    return buff
+end
+
+@doc raw"""
+    mulsym!(C, A, E)
+    Compute C = A*E + E*A by mul    
+"""
+@inline function mulsym!(C, A, B)
+    mul!(C, A, B)
+    return mul!(C, B, A, 1, 1)
+end
diff --git a/src/manifolds/StiefelFrechet.jl b/src/manifolds/StiefelFrechet.jl
new file mode 100644
index 0000000000..de4a35b4d4
--- /dev/null
+++ b/src/manifolds/StiefelFrechet.jl
@@ -0,0 +1,314 @@
+@doc raw"""
+    q, dot_q = dot_exp(M::MetricManifold{ℝ, Stiefel{n,k,ℝ}, <:StiefelSubmersionMetric}, p, X)
+    dot_exp!(M::MetricManifold{ℝ, Stiefel{n,k,ℝ}, <:StiefelSubmersionMetric}, q, dot_q, p, X)
+
+Compute the geodesic end point ``q = q(t)`` and its time derivative ``dot\_q = \dot{q}(t)`` on [`Stiefel(n,k)`](@ref) with the [`StiefelSubmersionMetric`](@ref). The geodesic is given by
+````math
+\operatorname{Exp} tX = \begin{bmatrix} p & X\end{bmatrix}
+\{\exp t\begin{bmatrix} (2\bar{\alpha}-1)A & 2(\bar{\alpha}-1)A^2-X^{\mathrm{T}}X \\ I_k & A\end{bmatrix}\}
+\begin{bmatrix}\ \exp((1-2\bar{\alpha})tA) \\ 0\end{bmatrix}.
+````
+with ``\bar{\alpha} = \frac{1}{2(1+\alpha)}, A = p^{\mathrm{T}}X``. This implementation is based on [^Nguyen2022], equation (3).
+
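+A minimal usage sketch (the point `p` and tangent vector `X` below are assumed to be valid; `project` is used to produce them):
+
+```julia
+using Manifolds
+M = MetricManifold(Stiefel(4, 2), StiefelSubmersionMetric(0.5))
+p = project(M, randn(4, 2))
+X = project(M, p, randn(4, 2))
+q, dq = dot_exp(M, p, X, 0.7)  # geodesic point and its velocity at t = 0.7
+```
+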
+[^Nguyen2022]:
+    > Nguyen, D. (2022),
+    > "Closed-form Geodesics and Optimization for Riemannian Logarithms of Stiefel and Flag Manifolds",
+    > Journal of Optimization Theory and Applications 194, pp. 142–166.
+    > doi: [10.1007/s10957-022-02012-3](https://doi.org/10.1007/s10957-022-02012-3),
+    > url: [https://rdcu.be/c0V2r](https://rdcu.be/c0V2r)
+"""
+function dot_exp(
+    M::MetricManifold{ℝ,Stiefel{n,k,ℝ},<:StiefelSubmersionMetric},
+    p,
+    X,
+    t,
+) where {n,k}
+    q = similar(p)
+    dot_q = similar(p)
+    dot_exp!(M, q, dot_q, p, X, t)
+    return q, dot_q
+end
+
+function dot_exp!(
+    M::MetricManifold{ℝ,Stiefel{n,k,ℝ},<:StiefelSubmersionMetric},
+    q,
+    dot_q,
+    p,
+    X,
+    t,
+) where {n,k}
+    # compute the geodesic end point q(t) and its time derivative,
+    # following equation (3) of [^Nguyen2022]
+    α = metric(M).α
+    T = Base.promote_eltype(p, X)
+
+    alf = 0.5 / (1 + α)
+    A = p' * X
+    e_mat = Array{T,2}(undef, 2 * k, 2 * k)
+    e_mat[1:k, 1:k] = (2 * alf - 1) * A
+    e_mat[1:k, (k + 1):end] = -X' * X - 2 * (1 - alf) * A * A
+    e_mat[(k + 1):end, 1:k] = I(k)
+    e_mat[(k + 1):end, (k + 1):end] = A
+    eE = exp(t * e_mat)
+    eA = exp((1 - 2 * alf) * t * A)
+    q .= ([p X] * eE)[:, 1:k] * eA
+    dot_q .=
+        ([p X] * e_mat * eE)[:, 1:k] * eA +
+        ([p X] * eE)[:, 1:k] * ((1 - 2 * alf) * A * eA)
+    return q, dot_q
+end
+
+@doc raw"""
+    log_lbfgs(Stf::MetricManifold{ℝ,Stiefel{n,k,ℝ},<:StiefelSubmersionMetric}, Y, W;
+                       tolerance=sqrt(eps(float(real(Base.promote_eltype(Y, W))))),
+                       max_itr=1_000, pretol=1e-3, lbfgs_options=nothing)
+
+Compute the Riemannian logarithm on [`Stiefel(n,k)`](@ref) with the [`StiefelSubmersionMetric`](@ref). Returning ``\eta`` such that ``\operatorname{Exp}_Y\eta = W``.
+
+The logarithmic map is computed by finding the initial velocity ``\eta`` such that if ``\gamma`` is the geodesic with initial conditions ``\gamma(0) = Y, \dot{\gamma}(0) = \eta``, 
+the cost function ``|\gamma(1) - W|^2`` (the square Euclidean distance) is minimized. The exact gradient of the cost function is computed using Fréchet derivatives. The function is optimized using an L-BFGS optimizer [`minimize`](@ref).
+
+The vector space spanned by ``Y`` and ``W`` intersects with the original Stiefel manifold along a smaller Stiefel manifold in ``ℝ^{\dim(span(Y, W))\times k}``. Let ``Q`` be the complementary basis of ``Y`` in ``span(Y, W)``, so ``[Y|Q]`` is an orthogonal basis for ``span(Y, W)``, we solve for two matrices ``A, R`` representing the tangent vector in this smaller Stiefel manifold (following [^Zimmermann2017]):
+
+```math
+ \eta = YA + QR
+```
+
+The exact gradient calculation with Fréchet derivatives is relatively expensive, to speed up we only run the exact gradient until the iteration error is less than pretol, then run a simple update on ``A, R`` for the last steps. A pretol=1e-5 is used if the distance is expected to be over ``\frac{1}{2}\pi``. Otherwise, use pretol=1e-3.
+
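+A minimal usage sketch:
+
+```julia
+using Manifolds
+M = MetricManifold(Stiefel(5, 2), StiefelSubmersionMetric(1.0))
+p = project(M, randn(5, 2))
+X = project(M, p, randn(5, 2))
+q = exp(M, p, X)
+η = log_lbfgs(M, p, q)  # ≈ X when the connecting geodesic is minimizing
+```
+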
+`lbfgs_options` is a dictionary of technical parameters, isolated in a separate group since we do not expect users to modify them. The relevant keys are:
+
+* `complementary_rank_cutoff` (default `1e-14`): singular-value cutoff used to determine the rank of the complementary basis ``Q``. We use an SVD to determine the size of the complementary basis ``Q`` of ``Y`` in ``\operatorname{span}(Y, W)`` and keep the columns of ``Q`` corresponding to singular values above this cutoff.
+* The remaining parameters are passed to the L-BFGS minimizer [`minimize`](@ref):
+    * `corrections` (default `10`): number of stored history pairs in L-BFGS.
+    * `c1` (default `1e-4`) and `c2` (default `0.9`): constants of the Wolfe line search.
+    * `max_ls` (default `25`): maximum number of line-search iterations.
+    * `max_fun_evals` (default `Int(ceil(max_itr * 1.3))`): maximum number of function evaluations.
+
+[^Zimmermann2017]:
+    > Zimmermann, R. (2017),
+    > "A Matrix-Algebraic Algorithm for the Riemannian Logarithm on the Stiefel Manifold under the Canonical Metric",
+    > SIAM Journal on Matrix Analysis and Applications, 38(2), pp. 322–342.
+    > doi: [10.1137/16M1074485](https://doi.org/10.1137/16M1074485)
+"""
+function log_lbfgs(
+    Stf,
+    Y,
+    W;
+    tolerance=sqrt(eps(float(real(Base.promote_eltype(Y, W))))),
+    max_itr=1_000,
+    pretol=1e-3,
+    lbfgs_options=nothing,
+)
+    # default values below; override via lbfgs_options
+    options = isnothing(lbfgs_options) ? Dict{String,Any}() : lbfgs_options
+    complementary_rank_cutoff = get(options, "complementary_rank_cutoff", 1e-14)
+    corrections = get(options, "corrections", 10)
+    c1 = get(options, "c1", 1e-4)
+    c2 = get(options, "c2", 0.9)
+    max_ls = get(options, "max_ls", 25)
+    max_fun_evals = get(options, "max_fun_evals", Int(ceil(max_itr * 1.3)))
+
+    α = metric(Stf).α
+    alf = 0.5 / (1 + α)
+    T = Base.promote_eltype(Y, W, complementary_rank_cutoff)
+    n, p = size(Y)
+
+    Q = _get_complementary_basis(Y, W, complementary_rank_cutoff)
+    k = size(Q, 2)
+
+    if k == 0
+        # W and Y have the same linear span
+        A = log(Y' * W)
+        return Y * A
+    end
+
+    eta0 = project(Stf, Y, W - Y)
+    A = asym(Y' * eta0)
+    R = Q' * eta0 - (Q' * Y) * (Y' * eta0)
+    Adim = div(p * (p - 1), 2)
+
+    # W is Z in the paper.
+    WTY = W' * Y
+    WTQ = W' * Q
+
+    mat = zeros(T, p + k, p + k)
+    E = zeros(T, p + k, p + k)
+    v0 = Array{T,1}(undef, Adim + k * p)
+    buff = Array{T,2}(undef, 16 * (p + k), p + k)
+    buffA = Array{T,2}(undef, 16 * p, p)
+
+    @views begin
+        mat11 = mat[1:p, 1:p]
+        mat12 = mat[1:p, (p + 1):end]
+        mat21 = mat[(p + 1):end, 1:p]
+
+        E11 = E[1:p, 1:p]
+        E12 = E[1:p, (p + 1):end]
+
+        ex2 = buff[1:(k + p), :]
+        fe2 = buff[(k + p + 1):(2 * (k + p)), :]
+        M = buff[1:p, 1:p]
+        N = buff[(p + 1):(p + k), 1:p]
+    end
+
+    WYMQN = Array{T,2}(undef, p, p)
+    dA = similar(A)
+    dR = similar(R)
+    ex1 = similar(A)
+
+    # unravel v to A and R
+    @inline function vec2AR!(A, R, v)
+        A .= vec2asym(v[1:Adim])
+        return R .= reshape(v[(Adim + 1):end], k, p)
+    end
+
+    # populate the (p+k) × (p+k) matrix `mat` of the Exp formula;
+    # the callers compute ex1 = exp((1-2*alf)*A) and ex2 = exp(mat)
+    @inline function setMatForExp!(mat, ex1, ex2, A, R)
+        mul!(mat11, 2 * alf, A)
+        mul!(mat12, -1, R')
+        return copyto!(mat21, R)
+    end
+
+    # cost function and gradient in the format expected by minimize
+    # F is storage for function value.
+    # G is storage for gradient.
+    # v is vectorization of matrices A and R. A is a skew matrix.
+    function cost_and_grad!(F, G, v)
+        vec2AR!(A, R, v)
+        ex1 .= exp((1 - 2 * alf) * A)
+        setMatForExp!(mat, ex1, ex2, A, R)
+        mul!(E11, ex1, WTY)
+        mul!(E12, ex1, WTQ)
+
+        # expm_frechet! writes into buff, thereby updating the views ex2 and fe2
+        expm_frechet!(buff, mat, E)
+
+        mul!(WYMQN, WTY, M)
+        mul!(WYMQN, WTQ, N, 1, 1)
+
+        expm_frechet!(buffA, (1 - 2 * alf) * A, WYMQN)
+        dA .=
+            asym((1 - 2 * alf) * buffA[(p + 1):(2 * p), :]) .+ 2 * alf * asym(fe2[1:p, 1:p])
+
+        dR .= fe2[(p + 1):end, 1:p] - fe2[1:p, (p + 1):end]'
+
+        G[1:Adim] .= asym2vec(dA)
+        G[(1 + Adim):end] .= vec(dR)
+        return -sum(WYMQN' .* ex1)
+    end
+    v0[1:Adim] = asym2vec(A)
+    v0[(Adim + 1):end] = vec(R)
+
+    # run full gradient until pretol
+    xret, f, exitflag, output = minimize(
+        cost_and_grad!,
+        v0,
+        max_fun_evals=max_fun_evals,
+        max_itr=max_itr,
+        grad_tol=pretol,
+        func_tol=pretol,
+        corrections=corrections,
+        c1=c1,
+        c2=c2,
+        max_ls=max_ls,
+    )
+
+    vec2AR!(A, R, xret)
+
+    # run simple descent
+    function Fnorm(dA, dR)
+        return norm(dA, 2) + norm(dR, 2)
+    end
+    disc = Fnorm(dA, dR)
+    for i in 1:max_itr
+        ex1 .= exp((1 - 2 * alf) * A)
+        setMatForExp!(mat, ex1, ex2, A, R)
+        # this writes into ex2 and hence updates the views M and N
+        ex2 .= exp(mat)
+        dA .= asym(WTY' - M * ex1)
+        broadcast!(-, dR, WTQ', N * ex1)
+        new_d = Fnorm(dA, dR)
+        if new_d < disc
+            A .+= dA
+            R .+= dR
+            disc = new_d
+        else
+            A .+= dA * disc / new_d
+            R .+= dR * disc / new_d
+        end
+        if maximum(abs, dA) < tolerance && maximum(abs, dR) < tolerance
+            break
+        end
+    end
+
+    eta = similar(Y)
+    mul!(eta, Y, A)
+    mul!(eta, Q, R, 1, 1)
+
+    return eta
+end
+
+# Note: the following are for internal use
+@doc raw"""
+    _get_complementary_basis(Y, Y1)
+Find a complementary basis Q in the linear span of Y Y1
+orthogonal to Y
+"""
+function _get_complementary_basis(Y, Y1, complementary_rank_cutoff)
+    n, p = size(Y)
+    F = svd([Y Y1])
+    k = sum(F.S .> complementary_rank_cutoff)
+    good = F.U[:, 1:k] * F.Vt[1:k, 1:k]
+    qs = nullspace(Y' * good)
+    QR = qr(good * qs)
+    return QR.Q * vcat(I, zeros((n - k + p, k - p)))
+end
+
+@doc raw"""
+     asym(mat)
+anti-symmetrize mat
+"""
+@inline asym(mat) = 0.5 * (mat - mat')
+
+@doc raw"""
+    asym2vec(mat)
+vectorize antisymmetric matrice mat
+"""
+function asym2vec(mat)
+    sz = size(mat)[1]
+    ret = zeros(div((sz * (sz - 1)), 2))
+    start = 1
+    for i in 1:(sz - 1)
+        ret[start:(start + sz - i - 1)] .= mat[(i + 1):end, i]
+        start += sz - i
+    end
+    return ret
+end
+
+@doc raw"""
+    vec2asym(v)
+unravel v to an antisymmetric matrices
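+
+For example (a quick roundtrip sketch):
+
+```julia
+A = Manifolds.asym(randn(4, 4))  # antisymmetric 4×4
+v = Manifolds.asym2vec(A)        # length 4*3/2 = 6
+Manifolds.vec2asym(v) == A       # true
+```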
+"""
+function vec2asym(v)
+    sz = 0.5 * (1 + sqrt(1 + 8 * size(v)[1]))
+    sz = Int(round(sz))
+    mat = zeros(sz, sz)
+    start = 1
+    for i in 1:(sz - 1)
+        mat[(i + 1):end, i] .= v[start:(start + sz - i - 1)]
+        mat[i, (i + 1):end] .= -v[start:(start + sz - i - 1)]
+        start += sz - i
+    end
+    return mat
+end
diff --git a/src/minimize_lbfgs.jl b/src/minimize_lbfgs.jl
new file mode 100644
index 0000000000..6aa207deba
--- /dev/null
+++ b/src/minimize_lbfgs.jl
@@ -0,0 +1,535 @@
+# A simple implementation of L-BFGS based on minFunc.m
+# https://www.cs.ubc.ca/~schmidtm/Software/minFunc.html
+
+@doc raw"""
+    _lbfgs_calc!(z, g, S, Y, rho, lbfgs_start, lbfgs_end, Hdiag, al_buff)
+Compute the L-BFGS Search Direction.
+This function returns the (L-BFGS) approximate inverse Hessian, multiplied by the negative gradient.
+
+z is the storage for the output, also returned.
+g is the gradient
+S: storage for change in x (steps) (also called q_i)
+Y: storage for change in gradient
+rho: storage for 1/dot(Y, S)
+lbfgs_start, lbfgs_end: starting and ending position of the storage, similar to a linked-list implementation. We keep only a short history, and reuse storage when full. Data is saved between lbfgs_start and lbfgs_end, inclusively both ends.
+
+Hdiag is the diagonal Hessian approximation.
+al_buff is a buffer for the alpha coefficients.
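+
+For reference, this is the standard two-loop recursion: with ``q = -g``, loop over the stored pairs from newest to oldest,
+
+````math
+\alpha_i = \rho_i s_i^{\mathrm{T}} q, \qquad q \leftarrow q - \alpha_i y_i,
+````
+
+then set ``z = H^0 q`` with the diagonal ``H^0 = \mathrm{Hdiag}``, and loop from oldest to newest,
+
+````math
+\beta_i = \rho_i y_i^{\mathrm{T}} z, \qquad z \leftarrow z + (\alpha_i - \beta_i) s_i.
+````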
+"""
+@inline function _lbfgs_calc!(z, g, S, Y, rho, lbfgs_start, lbfgs_end, Hdiag, al_buff)
+    # Set up indexing
+    nVars, maxCorrections = size(S)
+    if lbfgs_start == 1
+        ind = Array(1:lbfgs_end)
+        nCor = lbfgs_end - lbfgs_start + 1
+    else
+        ind = vcat(Array(lbfgs_start:maxCorrections), Array(1:(lbfgs_end)))
+        nCor = maxCorrections
+    end
+    @views begin
+        al = al_buff[1:nCor]
+    end
+    al .= 0
+    # we use q and z as different variables in the algorithm
+    # description, but here we use one name to save storage
+    z .= -g
+    nid = size(ind)[1]
+    for j in 1:nid
+        i = ind[nid - j + 1]
+        al[i] = dot(S[:, i], z) * rho[i]
+        mul!(z, UniformScaling(-al[i]), Y[:, i], 1, 1)
+    end
+    # Multiply by Initial Hessian.
+    z .= Hdiag .* z
+    for i in ind
+        be_i = dot(Y[:, i], z) * rho[i]
+        mul!(z, UniformScaling(al[i] - be_i), S[:, i], 1, 1)
+    end
+    return z
+end
+
+@doc raw"""
+    _save!(y, s, S, Y, rho, lbfgs_start, lbfgs_end, Hdiag, precondx)
+y: current change in gradient
+s: current change in x (step)
+rho: 1/(y.s) - set to 1 if (y.s) = 0
+lbfgs_start, lbfgs_end: running indices for storage.
+Hdiag: Hessian diagonal
+precondx: preconditioner function. Apply no preconditioner if empty.
+
+Helper function to append data to S, Y, rho.
+To avoid repeated allocation of memory use fixed storage but keep tract of indices, and recycle if the storage is full. We isolate the code to save current data to storage in this function.
+Data to save:
+S: storage for change in x (steps) (also called q_i)
+Y: storage for Change in gradient
+rho: storage for 1/(Y.S), set to 1 when (Y.S) == 0
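+
+For example, with `corrections = 3` the slots fill in the order 1, 2, 3 and are then recycled: the fourth save writes slot 1 and `lbfgs_start` moves to 2, so the logical history always runs from `lbfgs_start` around to `lbfgs_end`.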
+"""
+@inline function _save!(y, s, S, Y, rho, lbfgs_start, lbfgs_end, Hdiag, precondx)
+    ys = dot(y, s)
+    corrections = size(S)[2]
+    if lbfgs_end < corrections
+        lbfgs_end = lbfgs_end + 1
+        if lbfgs_start != 1
+            if lbfgs_start == corrections
+                lbfgs_start = 1
+            else
+                lbfgs_start = lbfgs_start + 1
+            end
+        end
+    else
+        lbfgs_start = min(2, corrections)
+        lbfgs_end = 1
+    end
+
+    S[:, lbfgs_end] .= s
+    Y[:, lbfgs_end] .= y
+    if abs(ys) > 1e-10
+        rho[lbfgs_end] = 1.0 / ys
+    else
+        rho[lbfgs_end] = 1.0
+    end
+    # Update scale of initial Hessian approximation
+    if isnothing(precondx)
+        Hdiag .= ys / dot(y, y)
+    else
+        mul!(Hdiag, ys / dot3(y, precondx, y), precondx)
+    end
+    return lbfgs_start, lbfgs_end
+end
+
+@doc raw"""
+    _cubic_interpolate(x1, f1, g1, x2, f2, g2; bounds=nothing)
+Cubic interpolation of 2 points w/ function and derivative values for both
+Used in Wolfe line search.
+Fit a cubic c(x) going through x1, x2, with specified values and derivative.
+Find the minimum of c(x) in the interval. The first derivative of the cubic
+is a quadratic function, with discriminant d2_square.
+We address the negative discriminant case, taking the half point.
+Compare with function interpolate in
+https://github.com/JuliaNLSolvers/LineSearches.jl/blob/master/src/strongwolfe.jl
+"""
+@inline function _cubic_interpolate(x1, f1, g1, x2, f2, g2; bounds=nothing)
+    # Compute bounds of interpolation area
+    if !isnothing(bounds)
+        xmin_bound, xmax_bound = bounds
+    else
+        xmin_bound, xmax_bound = x1 <= x2 ? (x1, x2) : (x2, x1)
+    end
+    # Code for most common case: cubic interpolation of 2 points
+    #   w/ function and derivative values for both
+    # Solution in this case (where x2 is the farthest point):
+    #   d1 = g1 + g2 - 3*(f1-f2)/(x1-x2);
+    #   d2 = sqrt(d1^2 - g1*g2);
+    #   min_pos = x2 - (x2 - x1)*((g2 + d2 - d1)/(g2 - g1 + 2*d2));
+    #   t_new = min(max(min_pos,xmin_bound),xmax_bound);
+    d1 = g1 + g2 - 3 * (f1 - f2) / (x1 - x2)
+    d2_square = d1 * d1 - g1 * g2
+    if d2_square >= 0
+        d2 = sqrt(d2_square)
+        if x1 <= x2
+            min_pos = x2 - (x2 - x1) * ((g2 + d2 - d1) / (g2 - g1 + 2 * d2))
+        else
+            min_pos = x1 - (x1 - x2) * ((g1 + d2 - d1) / (g1 - g2 + 2 * d2))
+        end
+        return min(max(min_pos, xmin_bound), xmax_bound)
+    else
+        return (xmin_bound + xmax_bound) / 2.0
+    end
+end
+
+@doc raw"""
+     _strong_wolfe(
+        fun_obj, x, t, d, f, g, gx_buff, gtd, c1=1e-4, c2=0.9, tolerance_change=1e-9, max_ls=25)
+Algorithm 3.5 in [^WrightNocedal2006]. The magic number values c_1 and c_2 are suggested from the notes following the algorithm. Converting from the matlab code in https://www.cs.ubc.ca/~schmidtm/Software/minFunc.html
+
+fun_obj returns the objective function value
+
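+For reference, the strong Wolfe conditions for the step size ``t`` along a descent direction ``d`` (so ``\nabla f(x)^{\mathrm{T}}d < 0``) are
+
+````math
+f(x + td) \le f(x) + c_1 t \nabla f(x)^{\mathrm{T}}d, \qquad
+\lvert\nabla f(x + td)^{\mathrm{T}}d\rvert \le c_2 \lvert\nabla f(x)^{\mathrm{T}}d\rvert.
+````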
+"""
+@inline function _strong_wolfe(
+    fun_obj,
+    x,
+    t,
+    d,
+    f,
+    g,
+    gx_buff,
+    gtd,
+    c1=1e-4,
+    c2=0.9,
+    tolerance_change=1e-9,
+    max_ls=25,
+)
+    d_norm = max_abs(d)
+    # evaluate objective and gradient using initial step
+    p = size(x)[1]
+    @views begin
+        g_new = gx_buff[1:p]
+        g_prev = gx_buff[(p + 1):(2 * p)]
+        x_buff = gx_buff[(2 * p + 1):(3 * p)]
+    end
+    multd!(x_buff, x, t, d)
+
+    f_new = fun_obj(0.0, g_new, x_buff)
+    ls_func_evals = 1
+    gtd_new = dot(g_new, d)
+
+    # bracket an interval containing a point satisfying the Wolfe criteria
+    t_prev, f_prev, gtd_prev = 0, f, gtd
+    g_prev .= g  # copy into the buffer view rather than aliasing g
+    done = false
+    ls_iter = 0
+    bracket_gtd = Array([NaN, NaN])
+    idx_g = 0
+    idx_g_new = 1
+    idx_g_prev = 2
+    while ls_iter < max_ls
+        # check conditions
+        if f_new > (f + c1 * t * gtd) || ((ls_iter > 1) && (f_new >= f_prev))
+            bracket = [t_prev, t]
+            bracket_f = [f_prev, f_new]
+            bracket_g = [idx_g_prev, idx_g_new]
+            bracket_gtd .= [gtd_prev, gtd_new]
+            break
+        end
+
+        if abs(gtd_new) <= -c2 * gtd
+            bracket = [t]
+            bracket_f = [f_new]
+            bracket_g = [idx_g_new]
+            done = true
+            break
+        end
+        if gtd_new >= 0
+            bracket = [t_prev, t]
+            bracket_f = [f_prev, f_new]
+            bracket_g = [idx_g_prev, idx_g_new]
+            bracket_gtd .= [gtd_prev, gtd_new]
+            break
+        end
+        # interpolate
+        min_step = t + 0.01 * (t - t_prev)
+        max_step = t * 10
+        tmp = t
+        t = _cubic_interpolate(
+            t_prev,
+            f_prev,
+            gtd_prev,
+            t,
+            f_new,
+            gtd_new,
+            bounds=(min_step, max_step),
+        )
+
+        # next step
+        t_prev = tmp
+        f_prev = f_new
+        g_prev .= g_new
+        gtd_prev = gtd_new
+        multd!(x_buff, x, t, d)
+        f_new = fun_obj(f_new, g_new, x_buff)
+        ls_func_evals += 1
+        gtd_new = dot(g_new, d)
+        ls_iter += 1
+    end
+    # reached max number of iterations?
+    if ls_iter >= max_ls
+        bracket = [0.0, t]
+        bracket_f = [f, f_new]
+        bracket_g = [idx_g, idx_g_new]
+    end
+    # zoom phase: we now have a point satisfying the criteria, or
+    # a bracket around it. We refine the bracket until we find the
+    # exact point satisfying the criteria
+    # implementing algorithm 3.6 of [^WrightNocedal2006]
+    insuf_progress = false
+    # find high and low points in bracket
+    low_pos, high_pos = bracket_f[1] <= bracket_f[end] ? (1, 2) : (2, 1)
+
+    while !done && (ls_iter < max_ls)
+        if abs(bracket[2] - bracket[1]) * d_norm < tolerance_change
+            break
+        end
+
+        # compute new trial value
+        t = _cubic_interpolate(
+            bracket[1],
+            bracket_f[1],
+            bracket_gtd[1],
+            bracket[2],
+            bracket_f[2],
+            bracket_gtd[2],
+        )
+
+        # test that we are making sufficient progress
+        # in case `t` is so close to boundary, we mark that we are making
+        # insufficient progress, and if
+        #   + we have made insufficient progress in the last step, or
+        #   + `t` is at one of the boundary,
+        # we will move `t` to a position which is `0.1 * len(bracket)`
+        # away from the nearest boundary point.
+        bklo, bkhi =
+            bracket[1] <= bracket[2] ? (bracket[1], bracket[2]) : (bracket[2], bracket[1])
+        eps = 0.1 * (bkhi - bklo)
+        if min(bkhi - t, t - bklo) < eps
+            # interpolation close to boundary
+            if insuf_progress || (t >= bkhi) || (t <= bklo)
+                # evaluate at 0.1 away from boundary
+                if abs(t - bkhi) < abs(t - bklo)
+                    t = bkhi - eps
+                else
+                    t = bklo + eps
+                end
+                insuf_progress = false
+            else
+                insuf_progress = true
+            end
+        else
+            insuf_progress = false
+        end
+        # Evaluate new point
+        multd!(x_buff, x, t, d)
+        f_new = fun_obj(f_new, g_new, x_buff)
+        ls_func_evals += 1
+        gtd_new = dot(g_new, d)
+        ls_iter += 1
+        if (f_new > (f + c1 * t * gtd)) || (f_new >= bracket_f[low_pos])
+            # Armijo condition not satisfied or not lower than lowest point
+            bracket[high_pos] = t
+            bracket_f[high_pos] = f_new
+            bracket_g[high_pos] = idx_g_new
+            bracket_gtd[high_pos] = gtd_new
+            low_pos, high_pos = bracket_f[1] <= bracket_f[2] ? (1, 2) : (2, 1)
+        else
+            if abs(gtd_new) <= -c2 * gtd
+                # Wolfe conditions satisfied
+                done = true
+
+            elseif gtd_new * (bracket[high_pos] - bracket[low_pos]) >= 0
+                # old high becomes new low
+                bracket[high_pos] = bracket[low_pos]
+                bracket_f[high_pos] = bracket_f[low_pos]
+                bracket_g[high_pos] = bracket_g[low_pos]
+                bracket_gtd[high_pos] = bracket_gtd[low_pos]
+            end
+            # new point becomes new low
+            bracket[low_pos] = t
+            bracket_f[low_pos] = f_new
+            bracket_g[low_pos] = idx_g_new
+            bracket_gtd[low_pos] = gtd_new
+        end
+    end
+    # return stuff
+    t = bracket[low_pos]
+    f_new = bracket_f[low_pos]
+    function set_g!(g, idx)
+        if idx == 1
+            g .= g_new
+        elseif idx == 2
+            g .= g_prev
+        end
+    end
+    set_g!(g, bracket_g[low_pos])
+    return f_new, t, ls_func_evals
+end
+
+@doc raw"""
+    minimize(
+        fun_obj,
+        x0;
+        max_fun_evals=120,
+        max_itr=100,
+        grad_tol=1e-15,
+        func_tol=1e-15,
+        corrections=10,
+        c1=1e-4,
+        c2=0.9,
+        max_ls=25,
+        precond=nothing)
+
+Minimize the objective `fun_obj` using the two-loop L-BFGS method of [^WrightNocedal2006], chapter 6.
+
+The algorithm calls the internal `_strong_wolfe` function for the line search; the line-search constants `c1` and `c2` are as suggested in op. cit.
+`corrections` is the number of saved historical steps/gradients, `max_ls` is the maximum number of line-search iterations, and `precond` is a preconditioner: a function returning a vector of the same shape as `x0`. A sketch of the expected objective format is given below.
+
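+A minimal sketch of the objective format: `fun_obj(F, G, x)` writes the gradient into `G` and returns the function value. The `quad!` function below is hypothetical, for illustration only:
+
+```julia
+function quad!(F, G, x)
+    G .= 2 .* x          # gradient of ‖x‖²
+    return sum(abs2, x)  # objective value
+end
+xmin, fmin, exitflag, output = Manifolds.minimize(quad!, randn(5))
+```
+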
+[^WrightNocedal2006]:
+    > Wright, S., Nocedal, J. (2006),
+    > "Numerical Optimization",
+    > Springer New York. ISBN: 9780387227429.
+"""
+function minimize(
+    fun_obj,
+    x0;
+    max_fun_evals=120,
+    max_itr=100,
+    grad_tol=1e-15,
+    func_tol=1e-15,
+    corrections=10,
+    c1=1e-4,
+    c2=0.9,
+    max_ls=25,
+    precond=nothing,
+)
+
+    # Initialize
+    p = size(x0)[1]
+    d = zeros(p)
+    x = copy(x0)
+    t = 1.0
+    if isnothing(precond)
+        precondx = nothing
+    else
+        precondx = precond(x)
+    end
+
+    funEvalMultiplier = 1.0
+
+    # Evaluate Initial Point
+    g = similar(x)
+    gx_buff = Array{eltype(x),1}(undef, 3 * p)
+
+    f = fun_obj(0.0, g, x)
+    funEvals = 1
+    g_old = similar(g)
+
+    # Compute optimality of initial point
+    optCond = max_abs(g)
+
+    # Exit if initial point is optimal
+    if optCond <= grad_tol
+        exitflag = 1
+        msg = "Optimality Condition below grad_tol"
+        output = Dict(
+            "iterations" => 0,
+            "funcCount" => 1,
+            "firstorderopt" => max_abs(g),
+            "message" => msg,
+        )
+        return x, f, exitflag, output
+    end
+    d = -g # Initially use steepest descent direction
+
+    S = zeros(p, corrections)
+    Y = zeros(p, corrections)
+    rho = zeros(corrections)
+    lbfgs_start = 0
+    lbfgs_end = 0
+    Hdiag = ones(p)
+    al_buff = zeros(corrections)
+    # Perform up to a maximum of 'max_itr' descent steps:
+    exitflag = 0
+    msg = "Reached Maximum Number of Iterations"
+    i = 1
+    while i <= max_itr
+        # COMPUTE DESCENT DIRECTION 
+        # Update the direction and step sizes
+        if i > 1
+            lbfgs_start, lbfgs_end, =
+                _save!(g - g_old, t * d, S, Y, rho, lbfgs_start, lbfgs_end, Hdiag, precondx)
+            _lbfgs_calc!(d, g, S, Y, rho, lbfgs_start, lbfgs_end, Hdiag, al_buff)
+        end
+        g_old .= g
+
+        if !_is_legal(d)
+            println("Step direction is illegal!")
+            output = Dict(
+                "iterations" => i,
+                "funcCount" => funEvals * funEvalMultiplier,
+                "firstorderopt" => NaN,
+                "message" => "Step direction is illegal!\n",
+            )
+
+            return x, f, -1, output
+        end
+
+        # COMPUTE STEP LENGTH
+        # Directional Derivative
+        gtd = dot(g, d)
+
+        # Check that progress can be made along direction
+        if gtd > -func_tol
+            exitflag = 2
+            msg = "Directional Derivative below func_tol"
+            break
+        end
+
+        # Select Initial Guess
+        if i == 1
+            t = min(1.0, 1.0 / sum(abs.(g)))
+        else
+            t = 1.0
+        end
+        f_old = f
+        gtd_old = gtd
+
+        f, t, LSfunEvals =
+            _strong_wolfe(fun_obj, x, t, d, f, g, gx_buff, gtd, c1, c2, func_tol, max_ls)
+        funEvals = funEvals + LSfunEvals
+        x .+= t * d
+        optCond = max_abs(g)
+        # Check Optimality Condition
+        if optCond <= grad_tol
+            exitflag = 1
+            msg = "Optimality Condition below grad_tol"
+            break
+        end
+        # Check for lack of progress
+
+        if max_abs(t * d) <= func_tol
+            exitflag = 2
+            msg = "Step Size below func_tol"
+            break
+        end
+        if abs(f - f_old) < func_tol
+            exitflag = 2
+            msg = "Function Value changing by less than func_tol"
+            break
+        end
+        # Check for going over iteration/evaluation limit
+        if funEvals * funEvalMultiplier >= max_fun_evals
+            exitflag = 0
+            msg = "Reached Maximum Number of Function Evaluations"
+            break
+        end
+
+        if i == max_itr
+            exitflag = 0
+            msg = "Reached Maximum Number of Iterations"
+            break
+        end
+        i += 1
+    end
+    output = Dict(
+        "iterations" => i,
+        "funcCount" => funEvals * funEvalMultiplier,
+        "firstorderopt" => max_abs(g),
+        "message" => msg,
+    )
+    return x, f, exitflag, output
+end
+
+# internal helper functions for vectors
+@inline function multd!(x_buff, x, t, d)
+    mul!(x_buff, t, d)
+    return x_buff .+= x
+end
+
+@inline function _is_legal(v)
+    # real, with no NaN or Inf entries
+    return isreal(v) && all(isfinite, v)
+end
+
+@inline function dot3(a, b, c)
+    # sum(a .* b .* c) without allocating an intermediate array
+    s = zero(Base.promote_eltype(a, b, c))
+    for i in eachindex(a, b, c)
+        s += a[i] * b[i] * c[i]
+    end
+    return s
+end
+
+@inline function max_abs(a)
+    # maximum(abs.(a)) without allocating the intermediate array
+    return maximum(abs, a)
+end
diff --git a/test/manifolds/stiefel.jl b/test/manifolds/stiefel.jl
index b1646ea612..a4f7be2854 100644
--- a/test/manifolds/stiefel.jl
+++ b/test/manifolds/stiefel.jl
@@ -468,9 +468,239 @@ include("../utils.jl")
             end
         end
 
+        @testset "expm_frechet" begin
+            n = 50
+            for ft in (1e-2, 0.19, 0.78, 1.7, 4.5, 9.0)
+                A = rand(n, n)
+
+                A = A / maximum(sum(abs.(A), dims=1)) * ft
+                E = rand(n, n)
+                E = E / norm(E, 2) * ft
+                buff = Array{Float64,2}(undef, 16 * n, n)
+                ret = Manifolds._diff_pade3(A, E)
+                Manifolds._diff_pade3!(buff, A, E)
+                @test maximum(abs.(ret[1] - buff[1:n, 1:end])) < 1e-6
+                @views begin
+                    expA = buff[1:n, :]
+                    expAE = buff[(n + 1):(2 * n), :]
+                end
+                Manifolds.expm_frechet!(buff, A, E)
+                expA1, expAE1 = Manifolds.expm_frechet(A, E)
+                @test maximum(abs.((expA - expA1))) < 1e-9
+                @test maximum(abs.((expAE - expAE1))) < 1e-9
+                dlt = 1e-7
+                @test maximum(abs.((exp(A + dlt * E) .- exp(A)) / dlt .- expAE)) /
+                      norm(expA1, 2) < 1e-3
+            end
+        end
+
+        @testset "lbfgs" begin
+            p = 100
+            function rosenbrock!(f, df, x)
+                D = size(x)[1]
+                f = sum(
+                    100 * (x[2:end] .- x[1:(end - 1)] .^ 2) .^ 2 .+
+                    (1 .- x[1:(end - 1)]) .^ 2,
+                )
+
+                @views begin
+                    xm = x[2:(end - 1)]
+                    xm_m1 = x[1:(end - 2)]
+                    xm_p1 = x[3:end]
+                end
+                df[2:(end - 1)] .= (
+                    200 * (xm - xm_m1 .^ 2) - 400 * (xm_p1 - xm .^ 2) .* xm - 2 * (1 .- xm)
+                )
+                df[1] = -400 * x[1] * (x[2] - x[1]^2) - 2 * (1 - x[1])
+                df[end] = 200 * (x[end] - x[end - 1]^2)
+                return f
+            end
+            x0 = randn(p)
+
+            function pcondR(y)
+                x = fill!(similar(y), 1)
+                ret = similar(x)
+                ret[1] = (1200 * x[1]^2 - 400 * x[2] + 2)
+                ret[2:(end - 1)] .= 202 .+ 1200 * x[2:(end - 1)] .^ 2 .- 400 * x[3:end]
+                ret[end] = 200
+                return 1.0 ./ ret
+            end
+
+            # test to show different scenarios of the optimizer are reached.
+            x0 .= 0.0
+            x, f, exitflag, output = Manifolds.minimize(
+                rosenbrock!,
+                x0,
+                precond=pcondR,
+                max_itr=2,
+                max_fun_evals=3,
+            )
+            @test exitflag == 0 &&
+                  output["message"] == "Reached Maximum Number of Function Evaluations"
+
+            x0 .= 0.0
+            x, f, exitflag, output = Manifolds.minimize(
+                rosenbrock!,
+                x0,
+                precond=pcondR,
+                max_itr=2,
+                max_fun_evals=30,
+            )
+            @test exitflag == 0 &&
+                  output["message"] == "Reached Maximum Number of Iterations"
+
+            x0 .= 0.0
+            x, f, exitflag, output = Manifolds.minimize(
+                rosenbrock!,
+                x0,
+                precond=pcondR,
+                func_tol=1e-3,
+                max_itr=1000,
+                max_fun_evals=1300,
+            )
+            @test exitflag == 2
+
+            x0 .= 0.0
+            x, f, exitflag, output = Manifolds.minimize(
+                rosenbrock!,
+                x0,
+                precond=pcondR,
+                grad_tol=1e-3,
+                max_itr=1000,
+                max_fun_evals=1300,
+            )
+            @test exitflag == 1
+
+            x, f, exitflag, output = Manifolds.minimize(
+                rosenbrock!,
+                x0,
+                precond=pcondR,
+                max_itr=1000,
+                max_fun_evals=1300,
+            )
+            @test exitflag > 0
+
+            # if initial point is optimal
+            x1, f1, exitflag1, output1 = Manifolds.minimize(
+                rosenbrock!,
+                x,
+                max_itr=1000,
+                max_fun_evals=1300,
+                grad_tol=1e-3,
+                max_ls=2,
+            )
+
+            x1, f1, exitflag1, output1 =
+                Manifolds.minimize(rosenbrock!, x0, max_itr=1000, max_fun_evals=1300)
+            @test exitflag > 0
+
+            x1, f1, exitflag1, output1 = Manifolds.minimize(
+                rosenbrock!,
+                x0,
+                max_itr=1000,
+                max_fun_evals=1300,
+                max_ls=3,
+            )
+
+            x1, f1, exitflag1, output1 = Manifolds.minimize(
+                rosenbrock!,
+                x0,
+                max_itr=1000,
+                max_fun_evals=1300,
+                func_tol=1e-2,
+                max_ls=2,
+            )
+
+            function bad_func!(f, df, x)
+                f = (sum(x .* x))^(1 / 3)
+                df[:] = 2 / 3 * x ./ (f * f)
+                return f
+            end
+            x0[:] .= 0.0
+            x1, f1, exitflag1, output1 = Manifolds.minimize(bad_func!, x0)
+
+            function randpoint(M)
+                return project(M, randn(representation_size(M)))
+            end
+
+            # generate a unit tangent vector
+            function randvec(M, p)
+                X = project(M, p, randn(representation_size(M)))
+                X ./= sqrt(inner(M, p, X, X))
+                return X
+            end
+
+            # run a lot of scenarios - this will hit most
+            # conditions in the optimizer.
+            Random.seed!(0)
+
+            max_ft = 0.5
+            NN = 50
+            pretol = 1e-3
+
+            for i in 1:NN
+                n = Int(ceil(2^(7 / 1.04 * (0.04 + rand()))))
+                if n < 4
+                    n = 4
+                end
+
+                k = rand(2:(n - 1))
+                α = 3 * rand() - 0.9
+                M = MetricManifold(Stiefel(n, k), StiefelSubmersionMetric(α))
+                p = randpoint(M)
+                X = randvec(M, p)
+
+                ft = (rand() + 0.1) * max_ft / 0.2
+                q = exp(M, p, ft * pi * X)
+                println("i=\t", i, " n=\t", n, " k=\t", k)
+
+                try
+                    XF = Manifolds.log_lbfgs(
+                        M,
+                        p,
+                        q,
+                        tolerance=1e-8,
+                        max_itr=1000,
+                        pretol=pretol,
+                    )
+                catch
+                    println("bad at i=\t", i)
+                end
+            end
+        end
+
         g = StiefelSubmersionMetric(1)
         @test g isa StiefelSubmersionMetric{Int}
 
+        @testset "dot_exp" begin
+            for M in [Stiefel(3, 3), Stiefel(4, 3), Stiefel(4, 2)]
+                for α in [-0.75, -0.25, 0.5]
+                    MM = MetricManifold(M, StiefelSubmersionMetric(α))
+                    p = project(MM, randn(representation_size(M)))
+                    X = project(MM, p, randn(representation_size(M)))
+                    X ./= norm(MM, p, X)
+
+                    t = 1.3
+                    q = exp(MM, p, t * X)
+                    qx = similar(q)
+                    dqx = similar(q)
+
+                    dot_exp!(MM, qx, dqx, p, X, t)
+                    qx1, dqx1 = dot_exp(MM, p, X, t)
+
+                    @test isapprox(MM, qx, q)
+                    @test isapprox(MM, qx, dqx, dqx1)
+                    dlt = 1e-7
+                    qxp = exp(MM, p, (t + dlt) * X)
+                    qxm = exp(MM, p, (t - dlt) * X)
+                    qdt = (qxp - qxm) / dlt / 2
+                    @test maximum(abs.(qdt .- dqx)) < 1e-4
+                end
+            end
+        end
+
         @testset for M in [Stiefel(3, 3), Stiefel(4, 3), Stiefel(4, 2)]
             Mcan = MetricManifold(M, CanonicalMetric())
             Meu = MetricManifold(M, EuclideanMetric())
@@ -484,8 +714,28 @@ include("../utils.jl")
                 q = exp(Mcomp, p, X)
                 @test isapprox(MM, q, exp(Mcomp, p, X))
                 Mcomp === Mcan && isapprox(MM, p, log(MM, p, q), log(Mcomp, p, q))
+                Mcomp === Mcan &&
+                    isapprox(MM, p, log_lbfgs(MM, p, q), log(Mcomp, p, q), atol=1e-6)
+                lbfgs_options = Dict([
+                    ("complementary_rank_cutoff", 1.5e-14),
+                    ("corrections", 5),
+                    ("c1", 1.1e-4),
+                    ("c2", 0.9),
+                    ("max_ls", 20),
+                    ("max_fun_evals", 1500),
+                ])
+
+                Mcomp === Mcan && isapprox(
+                    MM,
+                    p,
+                    log_lbfgs(MM, p, q, lbfgs_options=lbfgs_options),
+                    log(Mcomp, p, q),
+                    atol=1e-6,
+                )
+
                 @test isapprox(MM, exp(MM, p, 0 * X), p)
                 @test isapprox(MM, p, log(MM, p, p), zero_vector(MM, p); atol=1e-6)
+                @test isapprox(MM, p, log_lbfgs(MM, p, p), zero_vector(MM, p); atol=1e-6)
             end
             @testset "α=$α" for α in [-0.75, -0.25, 0.5]
                 MM = MetricManifold(M, StiefelSubmersionMetric(α))
@@ -495,8 +745,11 @@ include("../utils.jl")
                 q = exp(MM, p, X)
                 @test is_point(MM, q)
                 @test isapprox(MM, p, log(MM, p, q), X)
+                @test isapprox(MM, p, Manifolds.log_lbfgs(MM, p, q), X, atol=1e-6)
+
                 @test isapprox(MM, exp(MM, p, 0 * X), p)
                 @test isapprox(MM, p, log(MM, p, p), zero_vector(MM, p); atol=1e-6)
+                @test isapprox(MM, p, log_lbfgs(MM, p, p), zero_vector(MM, p); atol=1e-6)
             end
         end
     end