diff --git a/CHANGES b/CHANGES
new file mode 100644
index 0000000..28b5f6b
--- /dev/null
+++ b/CHANGES
@@ -0,0 +1,41 @@
+***********************************************************
+*              Changes in 2.0                             *
+***********************************************************
+ -Subroutine gsync() has changed to nekgsync() to avoid
+	possible conflict on certain architectures
+
+ -Executable is renamed 'nekbone' to replace 'nekproxy' and
+	other naming changes.
+
+ -iel0 and ielN  set in data.rea file are now used to control the
+ 	range of tests ran.  Test range in size from iel0
+	to ielN elements per process.  (prevoiusly tests 
+	were ran from 1 to lelt elements per process)  The
+	maximum value of ielN is lelt.
+
+ -nx0 and nxN set in data.rea file are now used to control the
+	range of polynomial orders.   Ranging from nx0 to 
+	nxN, where nxN<=lx1 (which is set in SIZE).  Previously
+	tests only ran with nx1=lx1.  The default is set to
+	reflect this, but nekbone now supports a range of
+	polynomial orders without recompiling the code.
+***********************************************************
+*              Changes in 2.1                             *
+***********************************************************
+ -Fixed nx0 and nxN control of polynomial order.  Default is
+ 	now to use lx1 until further notice.  Variable 
+	nx1 caused memory unstabilities and needs further 
+	development.
+ -Fixed a memory copy bug in the jl/ array transfer code.  
+	sarray_trasfer, used for the tuple transfer, should
+	be fixed now.
+
+***********************************************************
+*              Changes in 2.3                             *
+***********************************************************
+ - added OpenMP parallelism, MPITHREADS preprocessor macro
+   controls if MPI is called from one or multiple threads
+ - added timers controlled by TIMERS preprocessor macro
+ - fixed gather-scatter operation gsop() to always use pairwise
+   method
+ - switched to using system_clock routine in dummy mpi_wtime()
diff --git a/COPYRIGHT b/COPYRIGHT
new file mode 100644
index 0000000..9f84af5
--- /dev/null
+++ b/COPYRIGHT
@@ -0,0 +1,34 @@
+				  COPYRIGHT
+
+The following is a notice of limited availability of the code, and disclaimer
+which must be included in the prologue of the code and in all source listings
+of the code.
+
+Copyright Notice
+ + 2012 University of Chicago
+
+Permission is hereby granted to use, reproduce, prepare derivative works, and
+to redistribute to others.  This software was authored by:
+
+P. Fischer: (630) 252-6018; FAX: (630) 252-5986; email: fischer@mcs.anl.gov
+Mathematics and Computer Science Division
+Argonne National Laboratory, Argonne IL 60439
+
+			      GOVERNMENT LICENSE
+
+Portions of this material resulted from work developed under a U.S.
+Government Contract and are subject to the following license: the Government
+is granted for itself and others acting on its behalf a paid-up, nonexclusive,
+irrevocable worldwide license in this computer software to reproduce, prepare
+derivative works, and perform publicly and display publicly.
+
+				  DISCLAIMER
+
+This computer code material was prepared, in part, as an account of work
+sponsored by an agency of the United States Government.  Neither the United
+States, nor the University of Chicago, nor any of their employees, makes any
+warranty express or implied, or assumes any legal liability or responsibility
+for the accuracy, completeness, or usefulness of any information, apparatus,
+product, or process disclosed, or represents that its use would not infringe
+privately owned rights.
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..fab3aad
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+# nekbone_2_3_5
diff --git a/readme.pdf b/readme.pdf
new file mode 100644
index 0000000..8c8aa70
Binary files /dev/null and b/readme.pdf differ
diff --git a/src/DXYZ b/src/DXYZ
new file mode 100644
index 0000000..e5eaca0
--- /dev/null
+++ b/src/DXYZ
@@ -0,0 +1,5 @@
+C
+C     Elemental derivative operators
+C
+      common /dxyz/ dxm1(lx1,lx1),  dxtm1(lx1,lx1)
+
diff --git a/src/INPUT b/src/INPUT
new file mode 100644
index 0000000..124d572
--- /dev/null
+++ b/src/INPUT
@@ -0,0 +1,19 @@
+C
+C     Input parameters from preprocessors.
+C
+C     Note that in parallel implementations, we distinguish between
+C     distributed data (LELT) and uniformly distributed data.
+C
+
+      common /input5/ xc(8,lelt),yc(8,lelt),zc(8,lelt)
+     $               ,bc(5,6,lelt,0:ldimt1)
+
+
+      common /input8/ cbc(6,lelt,0:ldimt1),ccurve(12,lelt)
+      character*1     ccurve
+      character*3     cbc
+
+      real mflops
+      integer*8 flop_a, flop_cg
+      common /cflops/ flop_a,flop_cg,mflops
+
diff --git a/src/MASS b/src/MASS
new file mode 100644
index 0000000..16cde4d
--- /dev/null
+++ b/src/MASS
@@ -0,0 +1,4 @@
+      common /mass/
+     $       bm1   (lx1,ly1,lz1,lelt)
+     $      ,binvm1(lx1,ly1,lz1,lelt)
+     $      ,volvm1
diff --git a/src/PARALLEL b/src/PARALLEL
new file mode 100644
index 0000000..67372b4
--- /dev/null
+++ b/src/PARALLEL
@@ -0,0 +1,31 @@
+C
+C     Communication information
+C     NOTE: NID is stored in 'SIZE' for greater accessibility
+      common /cube1/ node,pid,np,nullpid,node0
+      integer        node,pid,np,nullpid,node0
+
+
+c     Maximum number of elements (limited to 2**31/12, at least for now)
+      parameter(nelgt_max = 178956970)
+
+      common /hcglb/ nvtot,nelgf(0:ldimt1)
+     $              ,lglel(lelt)
+c    $              ,gllel(lelg)
+c    $              ,gllnid(lelg)
+     $              ,nelgv,nelgt
+
+      integer        lglel
+c     integer        gllel,gllnid
+      integer*8      nvtot
+
+      common /diagl/  ifgprnt
+      logical ifgprnt
+      common/precsn/ wdsize,isize,lsize,csize
+      common/precsl/ ifdblas
+      integer wdsize,isize,lsize,csize
+      logical ifdblas
+C
+C     crystal-router, gather-scatter, and xxt handles (xxt=csr grid solve)
+C
+      common /comm_handles/ cr_h, gsh, gsh_fld(0:ldimt1), xxth(ldimt1)
+      integer               cr_h, gsh, gsh_fld          , xxth
diff --git a/src/README b/src/README
new file mode 100644
index 0000000..8996aa9
--- /dev/null
+++ b/src/README
@@ -0,0 +1,66 @@
+  Nek_comm-1.0
+
+This is the communication testing kernel for the MPI 
+all reduce and point to point communication used within
+the nekbone mini-app and standard NEK5000.  This kernel
+runs a battery of platform timers using MPI standard.
+
+To Run:
+
+  NOTE - Unlike the other nek codes, a data.rea file
+	is not needed, since there is no geometry
+	being set up.
+
+  After untarring nek_comm-1.0.tgz, change working 
+  directory to the nek_comm/test/ directory:
+     cd  ~/nek_comm/test/
+
+  Edit the makenek script to specify the compiler and
+  appropriate compiler flags.
+
+  Compile and link the code using the makenek script:
+      ./makenek n
+  where n is the chosen name of test run.
+
+  A successful compilation will result with:
+
+  #############################################################
+  #                  Compilation successful!                  #
+  #############################################################
+  
+  And a nekcomm executable.
+
+  Run the code in parallel using the provided nekpmpi script
+  by specifying the test name (for logfile naming purposes) 
+  and the number of processors.  For example, to run a test
+  called 'n' on 4 processes:
+     ./nekpmpi n 4
+
+  This will produce a logfile, n.log.4.
+
+
+Interpreting results:
+
+  The logfile will have a header describing the parameters the
+  test was ran with and the output of the timing tests.
+
+  All reduce tests are output with the a 'gop' tag:
+		np  nwds time1 time2
+  where,
+	 np - number of processors
+	 nwds - number of words
+	 tmsg - time per message
+ 	 tpwd - time per word
+
+
+  Point to point tests are output with the 'pg' tag:
+		nodeb np nloop nwds tmsg tpwd
+  where,
+	 nodeb - the second processor node 0 is 
+	 testing with
+	 np - number of processors
+	 nloop - number of tests ran with these nodes
+	 nwds - number of words per message
+	 tmsg - time per message
+	 tpwd - time per word  
+        
diff --git a/src/TIMER b/src/TIMER
new file mode 100644
index 0000000..60a761c
--- /dev/null
+++ b/src/TIMER
@@ -0,0 +1,19 @@
+
+      integer tmax
+      parameter (tmax = 1024)
+    
+      integer gopi(tmax)
+
+      real  ttemp1, ttemp2, ttemp3, ttemp4
+
+      real trzero(tmax), tcopy(tmax), tsolvem(tmax)
+      real tglsc3a(tmax), tglsc3b(tmax), tglsc3c(tmax), tglsc3d(tmax)
+      real tadd2s1(tmax), tadd2s2a(tmax), tadd2s2b(tmax), tadd2s2c(tmax)
+      real tlocalgrad3(tmax), twrwswt(tmax), tlocalgrad3t(tmax)
+      real tgsop(tmax), tgop(4,tmax)
+
+      real*8 dnekclock
+
+      common /timer/ trzero, tcopy, tsolvem, tglsc3a, tglsc3b, tglsc3c,
+     +tglsc3d, tadd2s1, tadd2s2a, tadd2s2b, tadd2s2c, tlocalgrad3,
+     +twrwswt, tlocalgrad3t, tgsop, tgop, gopi
diff --git a/src/TOTAL b/src/TOTAL
new file mode 100644
index 0000000..814bc18
--- /dev/null
+++ b/src/TOTAL
@@ -0,0 +1,5 @@
+      include 'DXYZ'
+      include 'INPUT'
+      include 'MASS'
+      include 'PARALLEL'
+      include 'WZ'
diff --git a/src/WZ b/src/WZ
new file mode 100644
index 0000000..502e87c
--- /dev/null
+++ b/src/WZ
@@ -0,0 +1,7 @@
+
+
+c     Gauss-Labotto and Gauss points
+      common /gauss/ zgm1(lx1,3)
+
+c     Weights
+      common /wxyz/ wxm1(lx1), wym1(ly1), wzm1(lz1), w3m1(lx1,ly1,lz1)
diff --git a/src/bg_aligned3.s b/src/bg_aligned3.s
new file mode 100644
index 0000000..8a5ab9a
--- /dev/null
+++ b/src/bg_aligned3.s
@@ -0,0 +1,41 @@
+.set r0,0; .set r1,1; .set r2,2; .set r3,3; .set r4,4
+.set r5,5; .set r6,6; .set r7,7; .set r8,8; .set r9,9
+.set r10,10; .set r11,11; .set r12,12; .set r13,13; .set r14,14
+.set r15,15; .set r16,16; .set r17,17; .set r18,18; .set r19,19
+.set r20,20; .set r21,21; .set r22,22; .set r23,23; .set r24,24
+.set r25,25; .set r26,26; .set r27,27; .set r28,28; .set r29,29
+.set r30,30; .set r31,31
+.set f0,0; .set f1,1; .set f2,2; .set f3,3; .set f4,4
+.set f5,5; .set f6,6; .set f7,7; .set f8,8; .set f9,9
+.set f10,10; .set f11,11; .set f12,12; .set f13,13; .set f14,14
+.set f15,15; .set f16,16; .set f17,17; .set f18,18; .set f19,19
+.set f20,20; .set f21,21; .set f22,22; .set f23,23; .set f24,24
+.set f25,25; .set f26,26; .set f27,27; .set f28,28; .set f29,29
+.set f30,30; .set f31,31
+
+.file "bg_aligned3.s"
+
+.globl bg_aligned3
+.type  bg_aligned3, @function
+.size  bg_aligned3, 48
+
+.section ".text"
+.align 2
+
+bg_aligned3:
+  andi.    r0,r3,15
+  clrlwi   r9,r4,28
+  cmpwi    cr7,r9,0
+  li       r3,0
+  li       r0,0
+  bne-     .L.das_label.58
+  andi.    r9,r5,15
+  bne-     cr7,.L.das_label.58
+  bne-     .L.das_label.58
+  li       r0,1
+ .L.das_label.58:
+  stw      r0,0(r6)
+  blr     
+
+
+.ident "GCC: (GNU) 3.2"
diff --git a/src/bg_mxm3.s b/src/bg_mxm3.s
new file mode 100644
index 0000000..ac8ffe5
--- /dev/null
+++ b/src/bg_mxm3.s
@@ -0,0 +1,406 @@
+.set r0,0; .set r1,1; .set r2,2; .set r3,3; .set r4,4
+.set r5,5; .set r6,6; .set r7,7; .set r8,8; .set r9,9
+.set r10,10; .set r11,11; .set r12,12; .set r13,13; .set r14,14
+.set r15,15; .set r16,16; .set r17,17; .set r18,18; .set r19,19
+.set r20,20; .set r21,21; .set r22,22; .set r23,23; .set r24,24
+.set r25,25; .set r26,26; .set r27,27; .set r28,28; .set r29,29
+.set r30,30; .set r31,31
+.set f0,0; .set f1,1; .set f2,2; .set f3,3; .set f4,4
+.set f5,5; .set f6,6; .set f7,7; .set f8,8; .set f9,9
+.set f10,10; .set f11,11; .set f12,12; .set f13,13; .set f14,14
+.set f15,15; .set f16,16; .set f17,17; .set f18,18; .set f19,19
+.set f20,20; .set f21,21; .set f22,22; .set f23,23; .set f24,24
+.set f25,25; .set f26,26; .set f27,27; .set f28,28; .set f29,29
+.set f30,30; .set f31,31
+
+.file "bg_mxm3.s"
+
+.globl bg_mxm3
+.type  bg_mxm3, @function
+.size  bg_mxm3, 1412
+
+.section ".text"
+.align 2
+
+bg_mxm3:
+  stwu     r1,-96(r1)
+  mflr     r0
+  stw      r0,100(r1)
+  andi.    r0,r7,15
+  stw      r15,28(r1)
+  mr       r15,r8
+  stw      r16,32(r1)
+  mr       r16,r6
+  stw      r25,68(r1)
+  mr       r25,r5
+  stw      r28,80(r1)
+  mr       r28,r4
+  stw      r30,88(r1)
+  mr       r30,r7
+  stw      r31,92(r1)
+  mr       r31,r3
+  stw      r14,24(r1)
+  stw      r17,36(r1)
+  stw      r18,40(r1)
+  stw      r19,44(r1)
+  stw      r20,48(r1)
+  stw      r21,52(r1)
+  stw      r22,56(r1)
+  stw      r23,60(r1)
+  stw      r26,72(r1)
+  stw      r27,76(r1)
+  stw      r29,84(r1)
+  bne-     .L.das_label.15
+ .L.das_label.3:
+  lis      r23,dummy@ha
+  li       r12,0
+  addi     r23,r23,dummy@l
+  li       r13,0
+  li       r27,16
+  addi     r23,r23,-16
+  stfpdux  f13,r23,r27
+  stfpdux  f14,r23,r27
+  stfpdux  f15,r23,r27
+  stfpdux  f16,r23,r27
+  stfpdux  f17,r23,r27
+  stfpdux  f18,r23,r27
+  stfpdux  f19,r23,r27
+  stfpdux  f20,r23,r27
+  stfpdux  f21,r23,r27
+  stfpdux  f22,r23,r27
+  stfpdux  f23,r23,r27
+  stfpdux  f24,r23,r27
+  stfpdux  f25,r23,r27
+  stfpdux  f26,r23,r27
+  stfpdux  f27,r23,r27
+  stfpdux  f28,r23,r27
+  stfpdux  f29,r23,r27
+  stfpdux  f30,r23,r27
+  stfpdux  f31,r23,r27
+  lis      r9,10922
+  lwz      r4,0(r28)
+  ori      r9,r9,43691
+  lwz      r15,0(r15)
+  lwz      r6,0(r16)
+  mulhw    r14,r4,r9
+  srawi    r11,r15,31
+  srawi    r16,r6,1
+  addze    r16,r16
+  srawi    r0,r4,31
+  mulhw    r15,r15,r9
+  subf     r14,r0,r14
+  rlwinm   r26,r16,4,0,27
+  mulli    r22,r16,80
+  subf     r15,r11,r15
+  subfic   r22,r22,16
+  mfctr    r29
+  li       r18,0
+  rlwinm   r28,r4,3,0,28
+  cmpw     r18,r15
+  addi     r28,r28,-32
+  addi     r19,r16,-2
+  cmpwi    cr7,r12,0
+  bge-     .L.das_label.10
+ .L.das_label.4:
+  mullw    r23,r4,r18
+  li       r17,0
+  cmpw     r17,r14
+  mulli    r23,r23,48
+  add      r23,r30,r23
+  addi     r23,r23,-16
+  bge-     .L.das_label.9
+  mullw    r9,r16,r18
+  cmpwi    cr6,r19,0
+  mulli    r9,r9,96
+  mulli    r0,r6,40
+  add      r9,r9,r0
+  add      r9,r9,r25
+  addi     r11,r9,-16
+ .L.das_label.5:
+  mulli    r20,r17,48
+  rlwinm   r0,r4,3,0,28
+  add      r20,r31,r20
+  subf     r20,r0,r20
+  addi     r20,r20,32
+  lfpdux   f1,r20,r28
+  mr       r21,r11
+  lfpdux   f7,r21,r22
+  lfpdux   f2,r20,r27
+  lfpdux   f3,r20,r27
+  lfpdux   f8,r21,r26
+  lfpdux   f9,r21,r26
+  lfpdux   f4,r20,r28
+  lfpdux   f10,r21,r26
+  lfpdux   f5,r20,r27
+  lfpdux   f11,r21,r26
+  lfpdux   f12,r21,r26
+  fxpmul   f13,f7,f1
+  fxpmul   f16,f8,f1
+  lfpdux   f6,r20,r27
+  fxpmul   f19,f9,f1
+  fxpmul   f22,f10,f1
+  fxpmul   f25,f11,f1
+  fxpmul   f28,f12,f1
+  fxpmul   f14,f7,f2
+  lfpdux   f1,r20,r28
+  fxpmul   f17,f8,f2
+  fxpmul   f20,f9,f2
+  fxpmul   f23,f10,f2
+  fxpmul   f26,f11,f2
+  fxpmul   f29,f12,f2
+  fxpmul   f15,f7,f3
+  lfpdux   f2,r20,r27
+  fxpmul   f18,f8,f3
+  fxpmul   f21,f9,f3
+  fxpmul   f24,f10,f3
+  fxpmul   f27,f11,f3
+  fxpmul   f30,f12,f3
+  fxcsmadd f13,f7,f4,f13
+  lfpdux   f3,r20,r27
+  fxcsmadd f16,f8,f4,f16
+  fxcsmadd f14,f7,f5,f14
+  fxcsmadd f17,f8,f5,f17
+  fxcsmadd f19,f9,f4,f19
+  fxcsmadd f22,f10,f4,f22
+  fxcsmadd f25,f11,f4,f25
+  fxcsmadd f28,f12,f4,f28
+  fxcsmadd f20,f9,f5,f20
+  fxcsmadd f15,f7,f6,f15
+  lfpdux   f4,r20,r28
+  fxcsmadd f18,f8,f6,f18
+  lfpdux   f7,r21,r22
+  fxcsmadd f23,f10,f5,f23
+  fxcsmadd f26,f11,f5,f26
+  fxcsmadd f29,f12,f5,f29
+  lfpdux   f8,r21,r26
+  fxcsmadd f21,f9,f6,f21
+  fxcsmadd f24,f10,f6,f24
+  lfpdux   f5,r20,r27
+  lfpdux   f9,r21,r26
+  lfpdux   f10,r21,r26
+  fxcsmadd f27,f11,f6,f27
+  fxcsmadd f30,f12,f6,f30
+  lfpdux   f11,r21,r26
+  beq-     cr6,.L.das_label.6
+  mtctr    r19
+
+.__loopk:
+  lfpdux   f12,r21,r26
+  fxcpmadd f13,f7,f1,f13
+  fxcpmadd f16,f8,f1,f16
+  lfpdux   f6,r20,r27
+  fxcpmadd f19,f9,f1,f19
+  fxcpmadd f22,f10,f1,f22
+  fxcpmadd f25,f11,f1,f25
+  fxcpmadd f28,f12,f1,f28
+  fxcpmadd f14,f7,f2,f14
+  lfpdux   f1,r20,r28
+  fxcpmadd f17,f8,f2,f17
+  fxcpmadd f20,f9,f2,f20
+  fxcpmadd f23,f10,f2,f23
+  fxcpmadd f26,f11,f2,f26
+  fxcpmadd f29,f12,f2,f29
+  fxcpmadd f15,f7,f3,f15
+  lfpdux   f2,r20,r27
+  fxcpmadd f18,f8,f3,f18
+  fxcpmadd f21,f9,f3,f21
+  fxcpmadd f24,f10,f3,f24
+  fxcpmadd f27,f11,f3,f27
+  fxcpmadd f30,f12,f3,f30
+  fxcsmadd f13,f7,f4,f13
+  lfpdux   f3,r20,r27
+  fxcsmadd f16,f8,f4,f16
+  fxcsmadd f14,f7,f5,f14
+  fxcsmadd f17,f8,f5,f17
+  fxcsmadd f19,f9,f4,f19
+  fxcsmadd f22,f10,f4,f22
+  fxcsmadd f25,f11,f4,f25
+  fxcsmadd f28,f12,f4,f28
+  fxcsmadd f20,f9,f5,f20
+  fxcsmadd f15,f7,f6,f15
+  lfpdux   f4,r20,r28
+  fxcsmadd f18,f8,f6,f18
+  lfpdux   f7,r21,r22
+  fxcsmadd f23,f10,f5,f23
+  fxcsmadd f26,f11,f5,f26
+  fxcsmadd f29,f12,f5,f29
+  lfpdux   f8,r21,r26
+  fxcsmadd f21,f9,f6,f21
+  fxcsmadd f24,f10,f6,f24
+  lfpdux   f5,r20,r27
+  lfpdux   f9,r21,r26
+  lfpdux   f10,r21,r26
+  fxcsmadd f27,f11,f6,f27
+  fxcsmadd f30,f12,f6,f30
+  lfpdux   f11,r21,r26
+  bdnz+    .__loopk
+ .L.das_label.6:
+  lfpdux   f12,r21,r26
+  fxcpmadd f13,f7,f1,f13
+  fxcpmadd f14,f7,f2,f14
+  fxcpmadd f15,f7,f3,f15
+  lfpdux   f6,r20,r27
+  fxcpmadd f16,f8,f1,f16
+  fxcpmadd f17,f8,f2,f17
+  fxcpmadd f18,f8,f3,f18
+  fxcsmadd f13,f7,f4,f13
+  fxcsmadd f14,f7,f5,f14
+  fxcsmadd f15,f7,f6,f15
+  fxcsmadd f16,f8,f4,f16
+  fxcsmadd f17,f8,f5,f17
+  fxcsmadd f18,f8,f6,f18
+  stfpdux  f13,r23,r27
+  fxcpmadd f19,f9,f1,f19
+  fxcpmadd f20,f9,f2,f20
+  stfpdux  f14,r23,r27
+  fxcpmadd f21,f9,f3,f21
+  fxcpmadd f22,f10,f1,f22
+  stfpdux  f15,r23,r27
+  fxcpmadd f23,f10,f2,f23
+  fxcpmadd f24,f10,f3,f24
+  stfpdux  f16,r23,r28
+  fxcsmadd f19,f9,f4,f19
+  fxcsmadd f20,f9,f5,f20
+  stfpdux  f17,r23,r27
+  fxcsmadd f21,f9,f6,f21
+  fxcsmadd f22,f10,f4,f22
+  stfpdux  f18,r23,r27
+  fxcsmadd f23,f10,f5,f23
+  fxcsmadd f24,f10,f6,f24
+  stfpdux  f19,r23,r28
+  fxcpmadd f25,f11,f1,f25
+  fxcpmadd f26,f11,f2,f26
+  stfpdux  f20,r23,r27
+  fxcpmadd f27,f11,f3,f27
+  fxcpmadd f28,f12,f1,f28
+  stfpdux  f21,r23,r27
+  fxcpmadd f29,f12,f2,f29
+  fxcpmadd f30,f12,f3,f30
+  stfpdux  f22,r23,r28
+  clrlwi   r0,r23,28
+  subfic   r10,r13,0
+  adde     r9,r10,r13
+  neg      r0,r0
+  rlwinm   r0,r0,1,31,31
+  and.     r10,r0,r9
+  beq-     .L.das_label.7
+  mr       r13,r23
+ .L.das_label.7:
+  fxcsmadd f25,f11,f4,f25
+  fxcsmadd f26,f11,f5,f26
+  stfpdux  f23,r23,r27
+  clrlwi   r0,r23,28
+  mfcr     r9
+  rlwinm   r9,r9,31,31,31
+  neg      r0,r0
+  rlwinm   r0,r0,1,31,31
+  and.     r10,r0,r9
+  beq-     .L.das_label.8
+  mr       r12,r23
+  cmpwi    cr7,r23,0
+ .L.das_label.8:
+  fxcsmadd f27,f11,f6,f27
+  fxcsmadd f28,f12,f4,f28
+  stfpdux  f24,r23,r27
+  fxcsmadd f29,f12,f5,f29
+  fxcsmadd f30,f12,f6,f30
+  stfpdux  f25,r23,r28
+  stfpdux  f26,r23,r27
+  stfpdux  f27,r23,r27
+  stfpdux  f28,r23,r28
+  stfpdux  f29,r23,r27
+  stfpdux  f30,r23,r27
+  addi     r17,r17,1
+  mulli    r0,r4,40
+  cmpw     r17,r14
+  subf     r23,r0,r23
+  blt+     .L.das_label.5
+ .L.das_label.9:
+  addi     r18,r18,1
+  cmpw     r18,r15
+  blt+     .L.das_label.4
+ .L.das_label.10:
+  mtctr    r29
+  lis      r23,dummy@ha
+  addi     r23,r23,dummy@l
+  addi     r23,r23,-16
+  lfpdux   f13,r23,r27
+  lfpdux   f14,r23,r27
+  lfpdux   f15,r23,r27
+  lfpdux   f16,r23,r27
+  lfpdux   f17,r23,r27
+  lfpdux   f18,r23,r27
+  lfpdux   f19,r23,r27
+  lfpdux   f20,r23,r27
+  lfpdux   f21,r23,r27
+  lfpdux   f22,r23,r27
+  lfpdux   f23,r23,r27
+  lfpdux   f24,r23,r27
+  lfpdux   f25,r23,r27
+  lfpdux   f26,r23,r27
+  lfpdux   f27,r23,r27
+  lfpdux   f28,r23,r27
+  lfpdux   f29,r23,r27
+  lfpdux   f30,r23,r27
+  lfpdux   f31,r23,r27
+  bne-     cr7,.L.das_label.14
+ .L.das_label.11:
+  cmpwi    r13,0
+  bne-     .L.das_label.13
+ .L.das_label.12:
+  lwz      r0,100(r1)
+  li       r3,3
+  lwz      r14,24(r1)
+  lwz      r15,28(r1)
+  mtlr     r0
+  lwz      r16,32(r1)
+  lwz      r17,36(r1)
+  lwz      r18,40(r1)
+  lwz      r19,44(r1)
+  lwz      r20,48(r1)
+  lwz      r21,52(r1)
+  lwz      r22,56(r1)
+  lwz      r23,60(r1)
+  lwz      r25,68(r1)
+  lwz      r26,72(r1)
+  lwz      r27,76(r1)
+  lwz      r28,80(r1)
+  lwz      r29,84(r1)
+  lwz      r30,88(r1)
+  lwz      r31,92(r1)
+  addi     r1,r1,96
+  blr     
+ .L.das_label.13:
+  lis      r3,.rodata@ha
+  mr       r4,r13
+  addi     r3,r3,.rodata@l
+  crclr    4*cr1+eq
+  bl       printf
+  b        .L.das_label.12
+ .L.das_label.14:
+  lis      r3,.rodata+0x00000018@ha
+  mr       r4,r12
+  addi     r3,r3,.rodata+0x00000018@l
+  crclr    4*cr1+eq
+  bl       printf
+  b        .L.das_label.11
+ .L.das_label.15:
+  lis      r3,.rodata+0x00000030@ha
+  mr       r4,r7
+  addi     r3,r3,.rodata+0x00000030@l
+  crclr    4*cr1+eq
+  bl       printf
+  b        .L.das_label.3
+
+.comm dummy,512,16
+
+
+.section ".rodata","a"
+.align 2
+  .ascii "ALIGNMENT PROBS1: %x\n"
+.align 3
+  .ascii "ALIGNMENT PROBS: %x\n"
+.align 3
+  .ascii "C PROB: %x\n"
+
+.ident "GCC: (GNU) 3.2"
diff --git a/src/bg_mxm44.s b/src/bg_mxm44.s
new file mode 100644
index 0000000..60ab4e3
--- /dev/null
+++ b/src/bg_mxm44.s
@@ -0,0 +1,497 @@
+.set r0,0; .set r1,1; .set r2,2; .set r3,3; .set r4,4
+.set r5,5; .set r6,6; .set r7,7; .set r8,8; .set r9,9
+.set r10,10; .set r11,11; .set r12,12; .set r13,13; .set r14,14
+.set r15,15; .set r16,16; .set r17,17; .set r18,18; .set r19,19
+.set r20,20; .set r21,21; .set r22,22; .set r23,23; .set r24,24
+.set r25,25; .set r26,26; .set r27,27; .set r28,28; .set r29,29
+.set r30,30; .set r31,31
+.set f0,0; .set f1,1; .set f2,2; .set f3,3; .set f4,4
+.set f5,5; .set f6,6; .set f7,7; .set f8,8; .set f9,9
+.set f10,10; .set f11,11; .set f12,12; .set f13,13; .set f14,14
+.set f15,15; .set f16,16; .set f17,17; .set f18,18; .set f19,19
+.set f20,20; .set f21,21; .set f22,22; .set f23,23; .set f24,24
+.set f25,25; .set f26,26; .set f27,27; .set f28,28; .set f29,29
+.set f30,30; .set f31,31
+
+.file "bg_mxm44.s"
+
+.globl bg_mxm44
+.type  bg_mxm44, @function
+.size  bg_mxm44, 1756
+
+
+
+.section ".text"
+.align 2
+
+
+bg_mxm44:
+  stwu     r1,-576(r1)
+  stw      r14,360(r1)
+  mr       r12,r1
+  stfd     f14,432(r1)
+  li       r14,16
+  stfd     f15,440(r1)
+  stfd     f16,448(r1)
+  stfd     f17,456(r1)
+  stfd     f18,464(r1)
+  stfd     f19,472(r1)
+  stfd     f20,480(r1)
+  stfd     f21,488(r1)
+  stfd     f22,496(r1)
+  stfd     f23,504(r1)
+  stfd     f24,512(r1)
+  stfd     f25,520(r1)
+  stfd     f26,528(r1)
+  stfd     f27,536(r1)
+  stfd     f28,544(r1)
+  stfd     f29,552(r1)
+  stfd     f30,560(r1)
+  stfd     f31,568(r1)
+  stw      r15,364(r1)
+  stw      r16,368(r1)
+  stw      r17,372(r1)
+  stw      r18,376(r1)
+  stw      r19,380(r1)
+  stw      r20,384(r1)
+  stw      r21,388(r1)
+  stw      r22,392(r1)
+  stw      r23,396(r1)
+  stw      r24,400(r1)
+  stw      r25,404(r1)
+  stw      r26,408(r1)
+  stw      r28,416(r1)
+  stw      r29,420(r1)
+  stw      r30,424(r1)
+  stfpdux  f14,r12,r14
+  stfpdux  f15,r12,r14
+  stfpdux  f16,r12,r14
+  stfpdux  f17,r12,r14
+  stfpdux  f18,r12,r14
+  stfpdux  f19,r12,r14
+  stfpdux  f20,r12,r14
+  stfpdux  f21,r12,r14
+  stfpdux  f22,r12,r14
+  stfpdux  f23,r12,r14
+  stfpdux  f24,r12,r14
+  stfpdux  f25,r12,r14
+  stfpdux  f26,r12,r14
+  stfpdux  f27,r12,r14
+  stfpdux  f28,r12,r14
+  stfpdux  f29,r12,r14
+  stfpdux  f30,r12,r14
+  stfpdux  f31,r12,r14
+  lis      r11,.rodata@ha
+  addi     r10,r11,.rodata@l
+  lwz      r4,0(r4)
+  lfd      f23,0(r10)
+  lwz      r6,0(r6)
+  fmr      f0,f23
+  lwz      r8,0(r8)
+  fmr      f1,f23
+  fmr      f2,f23
+  fmr      f3,f23
+  fmr      f8,f23
+  fmr      f9,f23
+  fmr      f10,f23
+  fmr      f11,f23
+  fmr      f16,f23
+  fmr      f17,f23
+  fmr      f18,f23
+  fmr      f19,f23
+  fmr      f20,f23
+  fmr      f21,f23
+  fmr      f22,f23
+  li       r13,16
+  rlwinm   r9,r4,5,0,26
+  rlwinm   r19,r6,5,0,26
+  rlwinm   r14,r4,3,0,28
+  mr       r25,r14
+  rlwinm   r12,r6,3,0,28
+  mulli    r11,r6,-3
+  srawi    r26,r4,2
+  srawi    r28,r8,2
+  rlwinm   r29,r26,2,0,29
+  cmpw     cr6,r29,r4
+  mulli    r18,r6,-4
+  addi     r18,r18,2
+  addi     r11,r11,2
+  li       r10,0
+  rlwinm   r11,r11,3,0,28
+  rlwinm   r18,r18,3,0,28
+  mullw    r23,r4,r6
+  addi     r23,r23,-4
+  rlwinm   r23,r23,3,0,28
+  neg      r23,r23
+  subf     r14,r13,r14
+  add      r23,r23,r14
+  srawi    r16,r6,2
+  mr       r22,r7
+  cmpw     r6,r6
+  rlwinm   r29,r16,2,0,29
+  cmpw     cr7,r29,r6
+  mr       r24,r5
+  addi     r16,r16,-1
+  mr       r15,r7
+  li       r29,0
+
+.grabNgo_jloop:
+  subf     r20,r23,r3
+  subf     r21,r18,r24
+  addi     r30,r15,-32
+  li       r17,0
+
+.grabNgo_iloop:
+  fxcpmadd f16,f8,f0,f16
+  lfpdux   f4,r20,r23
+  fxcpmadd f17,f8,f1,f17
+  lfpdux   f5,r20,r13
+  fxcpmadd f18,f9,f0,f18
+  lfpdux   f12,r21,r18
+  fxcpmadd f19,f9,f1,f19
+  lfpdux   f13,r21,r12
+  fxcpmadd f20,f10,f0,f20
+  lfpdux   f14,r21,r12
+  fxcpmadd f21,f10,f1,f21
+  lfpdux   f15,r21,r12
+  fxcpmadd f22,f11,f0,f22
+  lfpdux   f6,r20,r14
+  fxcpmadd f23,f11,f1,f23
+  lfpdux   f7,r20,r13
+  fxcsmadd f24,f8,f2,f16
+  addi     r17,r17,1
+  fxcsmadd f25,f8,f3,f17
+  mtctr    r16
+  fxcsmadd f26,f9,f2,f18
+  addi     r30,r30,32
+  fxcsmadd f27,f9,f3,f19
+  cmpw     cr1,r17,r26
+  fxcsmadd f28,f10,f2,f20
+  lfpdux   f0,r20,r14
+  fxcsmadd f29,f10,f3,f21
+  lfpdux   f1,r20,r13
+  fxcsmadd f30,f11,f2,f22
+  fxcsmadd f31,f11,f3,f23
+  fxpmul   f16,f12,f4
+  stfpdux  f24,r22,r10
+  fxpmul   f17,f12,f5
+  stfpdux  f25,r22,r13
+  fxpmul   f18,f13,f4
+  stfpdux  f26,r22,r14
+  fxpmul   f19,f13,f5
+  stfpdux  f27,r22,r13
+  fxpmul   f20,f14,f4
+  stfpdux  f28,r22,r14
+  fxpmul   f21,f14,f5
+  stfpdux  f29,r22,r13
+  fxpmul   f22,f15,f4
+  stfpdux  f30,r22,r14
+  fxpmul   f23,f15,f5
+  stfpdux  f31,r22,r13
+  fxcsmadd f16,f12,f6,f16
+  lfpdux   f2,r20,r14
+  fxcsmadd f17,f12,f7,f17
+  lfpdux   f3,r20,r13
+  fxcsmadd f18,f13,f6,f18
+  lfpdux   f8,r21,r11
+  fxcsmadd f19,f13,f7,f19
+  lfpdux   f9,r21,r12
+  fxcsmadd f20,f14,f6,f20
+  lfpdux   f10,r21,r12
+  fxcsmadd f21,f14,f7,f21
+  lfpdux   f11,r21,r12
+  fxcsmadd f22,f15,f6,f22
+  fxcsmadd f23,f15,f7,f23
+  beq-     cr7,.grabNgo_k_even4
+  fxcpmadd f16,f8,f0,f16
+  lfpdux   f24,r20,r14
+  fxcpmadd f17,f8,f1,f17
+  lfpdux   f25,r20,r13
+  fxcpmadd f18,f9,f0,f18
+  lfpdux   f26,r21,r11
+  fxcpmadd f19,f9,f1,f19
+  lfpdux   f27,r21,r12
+  fxcpmadd f20,f10,f0,f20
+  lfpdux   f28,r21,r12
+  fxcpmadd f21,f10,f1,f21
+  lfpdux   f29,r21,r12
+  fxcpmadd f22,f11,f0,f22
+  lfpdux   f30,r20,r14
+  fxcpmadd f23,f11,f1,f23
+  lfpdux   f31,r20,r13
+  fxcsmadd f16,f8,f2,f16
+  fxcsmadd f17,f8,f3,f17
+  fxcsmadd f18,f9,f2,f18
+  fxcsmadd f19,f9,f3,f19
+  fxcsmadd f20,f10,f2,f20
+  fxcsmadd f21,f10,f3,f21
+  fxcsmadd f22,f11,f2,f22
+  fxcsmadd f23,f11,f3,f23
+  fxcpmadd f16,f26,f24,f16
+  lfpdux   f4,r20,r14
+  fxcpmadd f17,f26,f25,f17
+  lfpdux   f5,r20,r13
+  fxcpmadd f18,f27,f24,f18
+  lfpdux   f6,r20,r14
+  fxcpmadd f19,f27,f25,f19
+  lfpdux   f7,r20,r13
+  fxcpmadd f20,f28,f24,f20
+  lfpdux   f12,r21,r11
+  fxcpmadd f21,f28,f25,f21
+  lfpdux   f13,r21,r12
+  fxcpmadd f22,f29,f24,f22
+  lfpdux   f14,r21,r12
+  fxcpmadd f23,f29,f25,f23
+  lfpdux   f15,r21,r12
+  fxcsmadd f16,f26,f30,f16
+  mr       r22,r30
+  fxcsmadd f17,f26,f31,f17
+  fxcsmadd f18,f27,f30,f18
+  fxcsmadd f19,f27,f31,f19
+  fxcsmadd f20,f28,f30,f20
+  fxcsmadd f21,f28,f31,f21
+  fxcsmadd f22,f29,f30,f22
+  fxcsmadd f23,f29,f31,f23
+  b        .grabNgo_kloop_k4
+
+.grabNgo_k_even4:
+.grabNgo_kloop:
+  fxcpmadd f16,f8,f0,f16
+  lfpdux   f4,r20,r14
+  fxcpmadd f17,f8,f1,f17
+  lfpdux   f5,r20,r13
+  fxcpmadd f18,f9,f0,f18
+  lfpdux   f6,r20,r14
+  fxcpmadd f19,f9,f1,f19
+  lfpdux   f7,r20,r13
+  fxcpmadd f20,f10,f0,f20
+  lfpdux   f12,r21,r11
+  fxcpmadd f21,f10,f1,f21
+  lfpdux   f13,r21,r12
+  fxcpmadd f22,f11,f0,f22
+  lfpdux   f14,r21,r12
+  fxcpmadd f23,f11,f1,f23
+  lfpdux   f15,r21,r12
+  fxcsmadd f16,f8,f2,f16
+  mr       r22,r30
+  fxcsmadd f17,f8,f3,f17
+  fxcsmadd f18,f9,f2,f18
+  fxcsmadd f19,f9,f3,f19
+  fxcsmadd f20,f10,f2,f20
+  fxcsmadd f21,f10,f3,f21
+  fxcsmadd f22,f11,f2,f22
+  fxcsmadd f23,f11,f3,f23
+
+.grabNgo_kloop_k4:
+  fxcpmadd f16,f12,f4,f16
+  lfpdux   f0,r20,r14
+  fxcpmadd f17,f12,f5,f17
+  lfpdux   f1,r20,r13
+  fxcpmadd f18,f13,f4,f18
+  lfpdux   f2,r20,r14
+  fxcpmadd f19,f13,f5,f19
+  lfpdux   f3,r20,r13
+  fxcpmadd f20,f14,f4,f20
+  lfpdux   f8,r21,r11
+  fxcpmadd f21,f14,f5,f21
+  lfpdux   f9,r21,r12
+  fxcpmadd f22,f15,f4,f22
+  lfpdux   f10,r21,r12
+  fxcpmadd f23,f15,f5,f23
+  lfpdux   f11,r21,r12
+  fxcsmadd f16,f12,f6,f16
+  fxcsmadd f17,f12,f7,f17
+  fxcsmadd f18,f13,f6,f18
+  fxcsmadd f19,f13,f7,f19
+  fxcsmadd f20,f14,f6,f20
+  fxcsmadd f21,f14,f7,f21
+  fxcsmadd f22,f15,f6,f22
+  fxcsmadd f23,f15,f7,f23
+  bdnz+    .grabNgo_k_even4
+  blt+     cr1,.grabNgo_iloop
+  add      r24,r24,r19
+  add      r15,r15,r9
+  beq-     cr6,.grabNgo_n1even4
+  lfpdux   f4,r20,r23
+  lfpdux   f5,r20,r25
+  lfpdux   f12,r21,r18
+  lfpdux   f13,r21,r12
+  lfpdux   f14,r21,r12
+  lfpdux   f15,r21,r12
+  mtctr    r16
+  fxpmul   f24,f12,f4
+  fxpmul   f25,f13,f4
+  fxpmul   f26,f14,f4
+  fxpmul   f27,f15,f4
+  fxcsmadd f24,f12,f5,f24
+  fxcsmadd f25,f13,f5,f25
+  fxcsmadd f26,f14,f5,f26
+  fxcsmadd f27,f15,f5,f27
+  lfpdux   f6,r20,r25
+  lfpdux   f7,r20,r25
+  lfpdux   f28,r21,r11
+  lfpdux   f29,r21,r12
+  lfpdux   f30,r21,r12
+  lfpdux   f31,r21,r12
+  beq-     cr7,.grabNgo_k_even4_2
+  fxcpmadd f24,f28,f6,f24
+  fxcpmadd f25,f29,f6,f25
+  fxcpmadd f26,f30,f6,f26
+  fxcpmadd f27,f31,f6,f27
+  fxcsmadd f24,f28,f7,f24
+  fxcsmadd f25,f29,f7,f25
+  fxcsmadd f26,f30,f7,f26
+  fxcsmadd f27,f31,f7,f27
+  lfpdux   f6,r20,r25
+  lfpdux   f7,r20,r25
+  lfpdux   f28,r21,r11
+  lfpdux   f29,r21,r12
+  lfpdux   f30,r21,r12
+  lfpdux   f31,r21,r12
+
+.grabNgo_k_even4_2:
+  fxcpmadd f24,f28,f6,f24
+  lfpdux   f4,r20,r25
+  fxcpmadd f25,f29,f6,f25
+  lfpdux   f5,r20,r25
+  fxcpmadd f26,f30,f6,f26
+  lfpdux   f12,r21,r11
+  fxcpmadd f27,f31,f6,f27
+  lfpdux   f13,r21,r12
+  fxcsmadd f24,f28,f7,f24
+  lfpdux   f14,r21,r12
+  fxcsmadd f25,f29,f7,f25
+  lfpdux   f15,r21,r12
+  fxcsmadd f26,f30,f7,f26
+  fxcsmadd f27,f31,f7,f27
+  fxcpmadd f24,f12,f4,f24
+  lfpdux   f6,r20,r25
+  fxcpmadd f25,f13,f4,f25
+  lfpdux   f7,r20,r25
+  fxcpmadd f26,f14,f4,f26
+  lfpdux   f28,r21,r11
+  fxcpmadd f27,f15,f4,f27
+  lfpdux   f29,r21,r12
+  fxcsmadd f24,f12,f5,f24
+  lfpdux   f30,r21,r12
+  fxcsmadd f25,f13,f5,f25
+  lfpdux   f31,r21,r12
+  fxcsmadd f26,f14,f5,f26
+  fxcsmadd f27,f15,f5,f27
+  bdnz+    .grabNgo_k_even4_2
+  fxcpmadd f24,f28,f6,f24
+  fxcpmadd f25,f29,f6,f25
+  fxcpmadd f26,f30,f6,f26
+  fxcpmadd f27,f31,f6,f27
+  fxcsmadd f24,f28,f7,f24
+  fxcsmadd f25,f29,f7,f25
+  fxcsmadd f26,f30,f7,f26
+  fxcsmadd f27,f31,f7,f27
+  addi     r30,r30,32
+  stfpdux  f24,r30,r10
+  stfpdux  f25,r30,r25
+  stfpdux  f26,r30,r25
+  stfpdux  f27,r30,r25
+
+.grabNgo_n1even4:
+  addi     r29,r29,1
+  cmpw     cr5,r29,r28
+  blt+     cr5,.grabNgo_jloop
+  fxcpmadd f16,f8,f0,f16
+  fxcpmadd f17,f8,f1,f17
+  fxcpmadd f18,f9,f0,f18
+  fxcpmadd f19,f9,f1,f19
+  fxcpmadd f20,f10,f0,f20
+  fxcpmadd f21,f10,f1,f21
+  fxcpmadd f22,f11,f0,f22
+  fxcpmadd f23,f11,f1,f23
+  fxcsmadd f16,f8,f2,f16
+  fxcsmadd f17,f8,f3,f17
+  fxcsmadd f18,f9,f2,f18
+  fxcsmadd f19,f9,f3,f19
+  fxcsmadd f20,f10,f2,f20
+  fxcsmadd f21,f10,f3,f21
+  fxcsmadd f22,f11,f2,f22
+  fxcsmadd f23,f11,f3,f23
+  stfpdux  f16,r22,r10
+  stfpdux  f17,r22,r13
+  stfpdux  f18,r22,r14
+  stfpdux  f19,r22,r13
+  stfpdux  f20,r22,r14
+  stfpdux  f21,r22,r13
+  stfpdux  f22,r22,r14
+  stfpdux  f23,r22,r13
+  mr       r3,r1
+  li       r0,16
+  lfpdux   f14,r3,r0
+  lfpdux   f15,r3,r0
+  lfpdux   f16,r3,r0
+  lfpdux   f17,r3,r0
+  lfpdux   f18,r3,r0
+  lfpdux   f19,r3,r0
+  lfpdux   f20,r3,r0
+  lfpdux   f21,r3,r0
+  lfpdux   f22,r3,r0
+  lfpdux   f23,r3,r0
+  lfpdux   f24,r3,r0
+  lfpdux   f25,r3,r0
+  lfpdux   f26,r3,r0
+  lfpdux   f27,r3,r0
+  lfpdux   f28,r3,r0
+  lfpdux   f29,r3,r0
+  lfpdux   f30,r3,r0
+  lfpdux   f31,r3,r0
+  lwz      r14,360(r1)
+  li       r3,0
+  lwz      r15,364(r1)
+  lwz      r16,368(r1)
+  lwz      r17,372(r1)
+  lwz      r18,376(r1)
+  lwz      r19,380(r1)
+  lwz      r20,384(r1)
+  lwz      r21,388(r1)
+  lwz      r22,392(r1)
+  lwz      r23,396(r1)
+  lwz      r24,400(r1)
+  lwz      r25,404(r1)
+  lwz      r26,408(r1)
+  lwz      r28,416(r1)
+  lwz      r29,420(r1)
+  lwz      r30,424(r1)
+  lfd      f14,432(r1)
+  lfd      f15,440(r1)
+  lfd      f16,448(r1)
+  lfd      f17,456(r1)
+  lfd      f18,464(r1)
+  lfd      f19,472(r1)
+  lfd      f20,480(r1)
+  lfd      f21,488(r1)
+  lfd      f22,496(r1)
+  lfd      f23,504(r1)
+  lfd      f24,512(r1)
+  lfd      f25,520(r1)
+  lfd      f26,528(r1)
+  lfd      f27,536(r1)
+  lfd      f28,544(r1)
+  lfd      f29,552(r1)
+  lfd      f30,560(r1)
+  lfd      f31,568(r1)
+  addi     r1,r1,576
+  blr     
+
+.section ".rodata","a"
+.align 3
+  .long 0x00000000
+  .long 0x00000000
+
+
+.section ".data","wa"
+.align 3
+.type  seconds_per_cycle, @object
+.size  seconds_per_cycle, 8
+seconds_per_cycle:
+  .long 0x3e188aec
+  .long 0x70377bb0
+
+
+.ident "GCC: (GNU) 3.2"
diff --git a/src/bg_mxm44_uneven.s b/src/bg_mxm44_uneven.s
new file mode 100644
index 0000000..15bf7b9
--- /dev/null
+++ b/src/bg_mxm44_uneven.s
@@ -0,0 +1,82 @@
+.set r0,0; .set r1,1; .set r2,2; .set r3,3; .set r4,4
+.set r5,5; .set r6,6; .set r7,7; .set r8,8; .set r9,9
+.set r10,10; .set r11,11; .set r12,12; .set r13,13; .set r14,14
+.set r15,15; .set r16,16; .set r17,17; .set r18,18; .set r19,19
+.set r20,20; .set r21,21; .set r22,22; .set r23,23; .set r24,24
+.set r25,25; .set r26,26; .set r27,27; .set r28,28; .set r29,29
+.set r30,30; .set r31,31
+.set f0,0; .set f1,1; .set f2,2; .set f3,3; .set f4,4
+.set f5,5; .set f6,6; .set f7,7; .set f8,8; .set f9,9
+.set f10,10; .set f11,11; .set f12,12; .set f13,13; .set f14,14
+.set f15,15; .set f16,16; .set f17,17; .set f18,18; .set f19,19
+.set f20,20; .set f21,21; .set f22,22; .set f23,23; .set f24,24
+.set f25,25; .set f26,26; .set f27,27; .set f28,28; .set f29,29
+.set f30,30; .set f31,31
+
+.file "bg_mxm44_uneven.s"
+
+.globl bg_mxm44_uneven
+.type  bg_mxm44_uneven, @function
+.size  bg_mxm44_uneven, 220
+
+.section ".text"
+.align 2
+
+bg_mxm44_uneven:
+  stwu     r1,-64(r1)
+  mflr     r0
+  stw      r0,68(r1)
+  stw      r28,48(r1)
+  lwz      r9,0(r8)
+  addi     r8,r1,8
+  lwz      r28,0(r4)
+  srawi    r0,r9,2
+  addze    r0,r0
+  stw      r23,28(r1)
+  rlwinm   r0,r0,2,0,29
+  stw      r24,32(r1)
+  subf     r0,r0,r9
+  stw      r25,36(r1)
+  subf     r9,r0,r9
+  stw      r0,12(r1)
+  stw      r9,8(r1)
+  mr       r23,r4
+  stw      r26,40(r1)
+  mr       r24,r6
+  stw      r27,44(r1)
+  mr       r26,r5
+  stw      r29,52(r1)
+  mr       r27,r7
+  mr       r25,r3
+  lwz      r29,0(r6)
+  crclr    4*cr1+eq
+  bl       bg_mxm44
+  addi     r8,r1,12
+  lwz      r0,8(r1)
+  mr       r3,r25
+  mr       r4,r23
+  mr       r6,r24
+  mullw    r28,r28,r0
+  mullw    r29,r29,r0
+  rlwinm   r28,r28,3,0,28
+  add      r27,r27,r28
+  mr       r7,r27
+  rlwinm   r29,r29,3,0,28
+  add      r26,r26,r29
+  mr       r5,r26
+  crclr    4*cr1+eq
+  bl       mxm44_0
+  lwz      r29,52(r1)
+  lwz      r23,28(r1)
+  mr       r3,r0
+  lwz      r24,32(r1)
+  lwz      r0,68(r1)
+  lwz      r25,36(r1)
+  lwz      r26,40(r1)
+  mtlr     r0
+  lwz      r27,44(r1)
+  lwz      r28,48(r1)
+  addi     r1,r1,64
+  blr     
+
+.ident "GCC: (GNU) 3.2"
diff --git a/src/blas.f b/src/blas.f
new file mode 100644
index 0000000..e0129cc
--- /dev/null
+++ b/src/blas.f
@@ -0,0 +1,30886 @@
+      subroutine caxpy(n,ca,cx,incx,cy,incy)
+c
+c     constant times a vector plus a vector.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      complex cx(*),cy(*),ca
+      integer i,incx,incy,ix,iy,n
+c
+      if(n.le.0)return
+      if (abs(real(ca)) + abs(aimag(ca)) .eq. 0.0 ) return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c        code for unequal increments or equal increments
+c          not equal to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        cy(iy) = cy(iy) + ca*cx(ix)
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      return
+c
+c        code for both increments equal to 1
+c
+   20 do 30 i = 1,n
+        cy(i) = cy(i) + ca*cx(i)
+   30 continue
+      return
+      end
+      subroutine  ccopy(n,cx,incx,cy,incy)
+c
+c     copies a vector, x, to a vector, y.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      complex cx(*),cy(*)
+      integer i,incx,incy,ix,iy,n
+c
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c        code for unequal increments or equal increments
+c          not equal to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        cy(iy) = cx(ix)
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      return
+c
+c        code for both increments equal to 1
+c
+   20 do 30 i = 1,n
+        cy(i) = cx(i)
+   30 continue
+      return
+      end
+      complex function cdotc(n,cx,incx,cy,incy)
+c
+c     forms the dot product of two vectors, conjugating the first
+c     vector.
+c     jack dongarra, linpack,  3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      complex cx(*),cy(*),ctemp
+      integer i,incx,incy,ix,iy,n
+c
+      ctemp = (0.0,0.0)
+      cdotc = (0.0,0.0)
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c        code for unequal increments or equal increments
+c          not equal to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        ctemp = ctemp + conjg(cx(ix))*cy(iy)
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      cdotc = ctemp
+      return
+c
+c        code for both increments equal to 1
+c
+   20 do 30 i = 1,n
+        ctemp = ctemp + conjg(cx(i))*cy(i)
+   30 continue
+      cdotc = ctemp
+      return
+      end
+      complex function cdotu(n,cx,incx,cy,incy)
+c
+c     forms the dot product of two vectors.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      complex cx(*),cy(*),ctemp
+      integer i,incx,incy,ix,iy,n
+c
+      ctemp = (0.0,0.0)
+      cdotu = (0.0,0.0)
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c        code for unequal increments or equal increments
+c          not equal to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        ctemp = ctemp + cx(ix)*cy(iy)
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      cdotu = ctemp
+      return
+c
+c        code for both increments equal to 1
+c
+   20 do 30 i = 1,n
+        ctemp = ctemp + cx(i)*cy(i)
+   30 continue
+      cdotu = ctemp
+      return
+      end
+      SUBROUTINE CGBMV ( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX,
+     $                   BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      COMPLEX            ALPHA, BETA
+      INTEGER            INCX, INCY, KL, KU, LDA, M, N
+      CHARACTER*1        TRANS
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CGBMV  performs one of the matrix-vector operations
+*
+*     y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   or
+*
+*     y := alpha*conjg( A' )*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are vectors and A is an
+*  m by n band matrix, with kl sub-diagonals and ku super-diagonals.
+*
+*  Parameters
+*  ==========
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.
+*
+*              TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.
+*
+*              TRANS = 'C' or 'c'   y := alpha*conjg( A' )*x + beta*y.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of the matrix A.
+*           M must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  KL     - INTEGER.
+*           On entry, KL specifies the number of sub-diagonals of the
+*           matrix A. KL must satisfy  0 .le. KL.
+*           Unchanged on exit.
+*
+*  KU     - INTEGER.
+*           On entry, KU specifies the number of super-diagonals of the
+*           matrix A. KU must satisfy  0 .le. KU.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX         .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, n ).
+*           Before entry, the leading ( kl + ku + 1 ) by n part of the
+*           array A must contain the matrix of coefficients, supplied
+*           column by column, with the leading diagonal of the matrix in
+*           row ( ku + 1 ) of the array, the first super-diagonal
+*           starting at position 2 in row ku, the first sub-diagonal
+*           starting at position 1 in row ( ku + 2 ), and so on.
+*           Elements in the array A that do not correspond to elements
+*           in the band matrix (such as the top left ku by ku triangle)
+*           are not referenced.
+*           The following program segment will transfer a band matrix
+*           from conventional full matrix storage to band storage:
+*
+*                 DO 20, J = 1, N
+*                    K = KU + 1 - J
+*                    DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL )
+*                       A( K + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           ( kl + ku + 1 ).
+*           Unchanged on exit.
+*
+*  X      - COMPLEX          array of DIMENSION at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
+*           and at least
+*           ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
+*           Before entry, the incremented array X must contain the
+*           vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX         .
+*           On entry, BETA specifies the scalar beta. When BETA is
+*           supplied as zero then Y need not be set on input.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX          array of DIMENSION at least
+*           ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
+*           and at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
+*           Before entry, the incremented array Y must contain the
+*           vector y. On exit, Y is overwritten by the updated vector y.
+*
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX            ONE
+      PARAMETER        ( ONE  = ( 1.0E+0, 0.0E+0 ) )
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     .. Local Scalars ..
+      COMPLEX            TEMP
+      INTEGER            I, INFO, IX, IY, J, JX, JY, K, KUP1, KX, KY,
+     $                   LENX, LENY
+      LOGICAL            NOCONJ
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 1
+      ELSE IF( M.LT.0 )THEN
+         INFO = 2
+      ELSE IF( N.LT.0 )THEN
+         INFO = 3
+      ELSE IF( KL.LT.0 )THEN
+         INFO = 4
+      ELSE IF( KU.LT.0 )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.( KL + KU + 1 ) )THEN
+         INFO = 8
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 10
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 13
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CGBMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+      NOCONJ = LSAME( TRANS, 'T' )
+*
+*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set
+*     up the start points in  X  and  Y.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         LENX = N
+         LENY = M
+      ELSE
+         LENX = M
+         LENY = N
+      END IF
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( LENX - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( LENY - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through the band part of A.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, LENY
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, LENY
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, LENY
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, LENY
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      KUP1 = KU + 1
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  y := alpha*A*x + y.
+*
+         JX = KX
+         IF( INCY.EQ.1 )THEN
+            DO 60, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  K    = KUP1 - J
+                  DO 50, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                     Y( I ) = Y( I ) + TEMP*A( K + I, J )
+   50             CONTINUE
+               END IF
+               JX = JX + INCX
+   60       CONTINUE
+         ELSE
+            DO 80, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  IY   = KY
+                  K    = KUP1 - J
+                  DO 70, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                     Y( IY ) = Y( IY ) + TEMP*A( K + I, J )
+                     IY      = IY      + INCY
+   70             CONTINUE
+               END IF
+               JX = JX + INCX
+               IF( J.GT.KU )
+     $            KY = KY + INCY
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y := alpha*A'*x + y  or  y := alpha*conjg( A' )*x + y.
+*
+         JY = KY
+         IF( INCX.EQ.1 )THEN
+            DO 110, J = 1, N
+               TEMP = ZERO
+               K    = KUP1 - J
+               IF( NOCONJ )THEN
+                  DO 90, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                     TEMP = TEMP + A( K + I, J )*X( I )
+   90             CONTINUE
+               ELSE
+                  DO 100, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                     TEMP = TEMP + CONJG( A( K + I, J ) )*X( I )
+  100             CONTINUE
+               END IF
+               Y( JY ) = Y( JY ) + ALPHA*TEMP
+               JY      = JY      + INCY
+  110       CONTINUE
+         ELSE
+            DO 140, J = 1, N
+               TEMP = ZERO
+               IX   = KX
+               K    = KUP1 - J
+               IF( NOCONJ )THEN
+                  DO 120, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                     TEMP = TEMP + A( K + I, J )*X( IX )
+                     IX   = IX   + INCX
+  120             CONTINUE
+               ELSE
+                  DO 130, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                     TEMP = TEMP + CONJG( A( K + I, J ) )*X( IX )
+                     IX   = IX   + INCX
+  130             CONTINUE
+               END IF
+               Y( JY ) = Y( JY ) + ALPHA*TEMP
+               JY      = JY      + INCY
+               IF( J.GT.KU )
+     $            KX = KX + INCX
+  140       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CGBMV .
+*
+      END
+      SUBROUTINE CGEMM ( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        TRANSA, TRANSB
+      INTEGER            M, N, K, LDA, LDB, LDC
+      COMPLEX            ALPHA, BETA
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), B( LDB, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CGEMM  performs one of the matrix-matrix operations
+*
+*     C := alpha*op( A )*op( B ) + beta*C,
+*
+*  where  op( X ) is one of
+*
+*     op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),
+*
+*  alpha and beta are scalars, and A, B and C are matrices, with op( A )
+*  an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+*
+*  Parameters
+*  ==========
+*
+*  TRANSA - CHARACTER*1.
+*           On entry, TRANSA specifies the form of op( A ) to be used in
+*           the matrix multiplication as follows:
+*
+*              TRANSA = 'N' or 'n',  op( A ) = A.
+*
+*              TRANSA = 'T' or 't',  op( A ) = A'.
+*
+*              TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).
+*
+*           Unchanged on exit.
+*
+*  TRANSB - CHARACTER*1.
+*           On entry, TRANSB specifies the form of op( B ) to be used in
+*           the matrix multiplication as follows:
+*
+*              TRANSB = 'N' or 'n',  op( B ) = B.
+*
+*              TRANSB = 'T' or 't',  op( B ) = B'.
+*
+*              TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry,  M  specifies  the number  of rows  of the  matrix
+*           op( A )  and of the  matrix  C.  M  must  be at least  zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry,  N  specifies the number  of columns of the matrix
+*           op( B ) and the number of columns of the matrix C. N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry,  K  specifies  the number of columns of the matrix
+*           op( A ) and the number of rows of the matrix op( B ). K must
+*           be at least  zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX         .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, ka ), where ka is
+*           k  when  TRANSA = 'N' or 'n',  and is  m  otherwise.
+*           Before entry with  TRANSA = 'N' or 'n',  the leading  m by k
+*           part of the array  A  must contain the matrix  A,  otherwise
+*           the leading  k by m  part of the array  A  must contain  the
+*           matrix A.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. When  TRANSA = 'N' or 'n' then
+*           LDA must be at least  max( 1, m ), otherwise  LDA must be at
+*           least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  B      - COMPLEX          array of DIMENSION ( LDB, kb ), where kb is
+*           n  when  TRANSB = 'N' or 'n',  and is  k  otherwise.
+*           Before entry with  TRANSB = 'N' or 'n',  the leading  k by n
+*           part of the array  B  must contain the matrix  B,  otherwise
+*           the leading  n by k  part of the array  B  must contain  the
+*           matrix B.
+*           Unchanged on exit.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in the calling (sub) program. When  TRANSB = 'N' or 'n' then
+*           LDB must be at least  max( 1, k ), otherwise  LDB must be at
+*           least  max( 1, n ).
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX         .
+*           On entry,  BETA  specifies the scalar  beta.  When  BETA  is
+*           supplied as zero then C need not be set on input.
+*           Unchanged on exit.
+*
+*  C      - COMPLEX          array of DIMENSION ( LDC, n ).
+*           Before entry, the leading  m by n  part of the array  C must
+*           contain the matrix  C,  except when  beta  is zero, in which
+*           case C need not be set on entry.
+*           On exit, the array  C  is overwritten by the  m by n  matrix
+*           ( alpha*op( A )*op( B ) + beta*C ).
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, MAX
+*     .. Local Scalars ..
+      LOGICAL            CONJA, CONJB, NOTA, NOTB
+      INTEGER            I, INFO, J, L, NCOLA, NROWA, NROWB
+      COMPLEX            TEMP
+*     .. Parameters ..
+      COMPLEX            ONE
+      PARAMETER        ( ONE  = ( 1.0E+0, 0.0E+0 ) )
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
+*     conjugated or transposed, set  CONJA and CONJB  as true if  A  and
+*     B  respectively are to be  transposed but  not conjugated  and set
+*     NROWA, NCOLA and  NROWB  as the number of rows and  columns  of  A
+*     and the number of rows of  B  respectively.
+*
+      NOTA  = LSAME( TRANSA, 'N' )
+      NOTB  = LSAME( TRANSB, 'N' )
+      CONJA = LSAME( TRANSA, 'C' )
+      CONJB = LSAME( TRANSB, 'C' )
+      IF( NOTA )THEN
+         NROWA = M
+         NCOLA = K
+      ELSE
+         NROWA = K
+         NCOLA = M
+      END IF
+      IF( NOTB )THEN
+         NROWB = K
+      ELSE
+         NROWB = N
+      END IF
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF(      ( .NOT.NOTA                 ).AND.
+     $         ( .NOT.CONJA                ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'T' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.NOTB                 ).AND.
+     $         ( .NOT.CONJB                ).AND.
+     $         ( .NOT.LSAME( TRANSB, 'T' ) )      )THEN
+         INFO = 2
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( K  .LT.0               )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 8
+      ELSE IF( LDB.LT.MAX( 1, NROWB ) )THEN
+         INFO = 10
+      ELSE IF( LDC.LT.MAX( 1, M     ) )THEN
+         INFO = 13
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CGEMM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( BETA.EQ.ZERO )THEN
+            DO 20, J = 1, N
+               DO 10, I = 1, M
+                  C( I, J ) = ZERO
+   10          CONTINUE
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               DO 30, I = 1, M
+                  C( I, J ) = BETA*C( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( NOTB )THEN
+         IF( NOTA )THEN
+*
+*           Form  C := alpha*A*B + beta*C.
+*
+            DO 90, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 50, I = 1, M
+                     C( I, J ) = ZERO
+   50             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 60, I = 1, M
+                     C( I, J ) = BETA*C( I, J )
+   60             CONTINUE
+               END IF
+               DO 80, L = 1, K
+                  IF( B( L, J ).NE.ZERO )THEN
+                     TEMP = ALPHA*B( L, J )
+                     DO 70, I = 1, M
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+   70                CONTINUE
+                  END IF
+   80          CONTINUE
+   90       CONTINUE
+         ELSE IF( CONJA )THEN
+*
+*           Form  C := alpha*conjg( A' )*B + beta*C.
+*
+            DO 120, J = 1, N
+               DO 110, I = 1, M
+                  TEMP = ZERO
+                  DO 100, L = 1, K
+                     TEMP = TEMP + CONJG( A( L, I ) )*B( L, J )
+  100             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  110          CONTINUE
+  120       CONTINUE
+         ELSE
+*
+*           Form  C := alpha*A'*B + beta*C
+*
+            DO 150, J = 1, N
+               DO 140, I = 1, M
+                  TEMP = ZERO
+                  DO 130, L = 1, K
+                     TEMP = TEMP + A( L, I )*B( L, J )
+  130             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  140          CONTINUE
+  150       CONTINUE
+         END IF
+      ELSE IF( NOTA )THEN
+         IF( CONJB )THEN
+*
+*           Form  C := alpha*A*conjg( B' ) + beta*C.
+*
+            DO 200, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 160, I = 1, M
+                     C( I, J ) = ZERO
+  160             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 170, I = 1, M
+                     C( I, J ) = BETA*C( I, J )
+  170             CONTINUE
+               END IF
+               DO 190, L = 1, K
+                  IF( B( J, L ).NE.ZERO )THEN
+                     TEMP = ALPHA*CONJG( B( J, L ) )
+                     DO 180, I = 1, M
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  180                CONTINUE
+                  END IF
+  190          CONTINUE
+  200       CONTINUE
+         ELSE
+*
+*           Form  C := alpha*A*B'          + beta*C
+*
+            DO 250, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 210, I = 1, M
+                     C( I, J ) = ZERO
+  210             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 220, I = 1, M
+                     C( I, J ) = BETA*C( I, J )
+  220             CONTINUE
+               END IF
+               DO 240, L = 1, K
+                  IF( B( J, L ).NE.ZERO )THEN
+                     TEMP = ALPHA*B( J, L )
+                     DO 230, I = 1, M
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  230                CONTINUE
+                  END IF
+  240          CONTINUE
+  250       CONTINUE
+         END IF
+      ELSE IF( CONJA )THEN
+         IF( CONJB )THEN
+*
+*           Form  C := alpha*conjg( A' )*conjg( B' ) + beta*C.
+*
+            DO 280, J = 1, N
+               DO 270, I = 1, M
+                  TEMP = ZERO
+                  DO 260, L = 1, K
+                     TEMP = TEMP + CONJG( A( L, I ) )*CONJG( B( J, L ) )
+  260             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  270          CONTINUE
+  280       CONTINUE
+         ELSE
+*
+*           Form  C := alpha*conjg( A' )*B' + beta*C
+*
+            DO 310, J = 1, N
+               DO 300, I = 1, M
+                  TEMP = ZERO
+                  DO 290, L = 1, K
+                     TEMP = TEMP + CONJG( A( L, I ) )*B( J, L )
+  290             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  300          CONTINUE
+  310       CONTINUE
+         END IF
+      ELSE
+         IF( CONJB )THEN
+*
+*           Form  C := alpha*A'*conjg( B' ) + beta*C
+*
+            DO 340, J = 1, N
+               DO 330, I = 1, M
+                  TEMP = ZERO
+                  DO 320, L = 1, K
+                     TEMP = TEMP + A( L, I )*CONJG( B( J, L ) )
+  320             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  330          CONTINUE
+  340       CONTINUE
+         ELSE
+*
+*           Form  C := alpha*A'*B' + beta*C
+*
+            DO 370, J = 1, N
+               DO 360, I = 1, M
+                  TEMP = ZERO
+                  DO 350, L = 1, K
+                     TEMP = TEMP + A( L, I )*B( J, L )
+  350             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  360          CONTINUE
+  370       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CGEMM .
+*
+      END
+      SUBROUTINE CGEMV ( TRANS, M, N, ALPHA, A, LDA, X, INCX,
+     $                   BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      COMPLEX            ALPHA, BETA
+      INTEGER            INCX, INCY, LDA, M, N
+      CHARACTER*1        TRANS
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CGEMV  performs one of the matrix-vector operations
+*
+*     y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   or
+*
+*     y := alpha*conjg( A' )*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are vectors and A is an
+*  m by n matrix.
+*
+*  Parameters
+*  ==========
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.
+*
+*              TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.
+*
+*              TRANS = 'C' or 'c'   y := alpha*conjg( A' )*x + beta*y.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of the matrix A.
+*           M must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX         .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, n ).
+*           Before entry, the leading m by n part of the array A must
+*           contain the matrix of coefficients.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*  X      - COMPLEX          array of DIMENSION at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
+*           and at least
+*           ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
+*           Before entry, the incremented array X must contain the
+*           vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX         .
+*           On entry, BETA specifies the scalar beta. When BETA is
+*           supplied as zero then Y need not be set on input.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX          array of DIMENSION at least
+*           ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
+*           and at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
+*           Before entry with BETA non-zero, the incremented array Y
+*           must contain the vector y. On exit, Y is overwritten by the
+*           updated vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX            ONE
+      PARAMETER        ( ONE  = ( 1.0E+0, 0.0E+0 ) )
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     .. Local Scalars ..
+      COMPLEX            TEMP
+      INTEGER            I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY
+      LOGICAL            NOCONJ
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 1
+      ELSE IF( M.LT.0 )THEN
+         INFO = 2
+      ELSE IF( N.LT.0 )THEN
+         INFO = 3
+      ELSE IF( LDA.LT.MAX( 1, M ) )THEN
+         INFO = 6
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 8
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 11
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CGEMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+      NOCONJ = LSAME( TRANS, 'T' )
+*
+*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set
+*     up the start points in  X  and  Y.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         LENX = N
+         LENY = M
+      ELSE
+         LENX = M
+         LENY = N
+      END IF
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( LENX - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( LENY - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, LENY
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, LENY
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, LENY
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, LENY
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  y := alpha*A*x + y.
+*
+         JX = KX
+         IF( INCY.EQ.1 )THEN
+            DO 60, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  DO 50, I = 1, M
+                     Y( I ) = Y( I ) + TEMP*A( I, J )
+   50             CONTINUE
+               END IF
+               JX = JX + INCX
+   60       CONTINUE
+         ELSE
+            DO 80, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  IY   = KY
+                  DO 70, I = 1, M
+                     Y( IY ) = Y( IY ) + TEMP*A( I, J )
+                     IY      = IY      + INCY
+   70             CONTINUE
+               END IF
+               JX = JX + INCX
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y := alpha*A'*x + y  or  y := alpha*conjg( A' )*x + y.
+*
+         JY = KY
+         IF( INCX.EQ.1 )THEN
+            DO 110, J = 1, N
+               TEMP = ZERO
+               IF( NOCONJ )THEN
+                  DO 90, I = 1, M
+                     TEMP = TEMP + A( I, J )*X( I )
+   90             CONTINUE
+               ELSE
+                  DO 100, I = 1, M
+                     TEMP = TEMP + CONJG( A( I, J ) )*X( I )
+  100             CONTINUE
+               END IF
+               Y( JY ) = Y( JY ) + ALPHA*TEMP
+               JY      = JY      + INCY
+  110       CONTINUE
+         ELSE
+            DO 140, J = 1, N
+               TEMP = ZERO
+               IX   = KX
+               IF( NOCONJ )THEN
+                  DO 120, I = 1, M
+                     TEMP = TEMP + A( I, J )*X( IX )
+                     IX   = IX   + INCX
+  120             CONTINUE
+               ELSE
+                  DO 130, I = 1, M
+                     TEMP = TEMP + CONJG( A( I, J ) )*X( IX )
+                     IX   = IX   + INCX
+  130             CONTINUE
+               END IF
+               Y( JY ) = Y( JY ) + ALPHA*TEMP
+               JY      = JY      + INCY
+  140       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CGEMV .
+*
+      END
+      SUBROUTINE CGERC ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
+*     .. Scalar Arguments ..
+      COMPLEX            ALPHA
+      INTEGER            INCX, INCY, LDA, M, N
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CGERC  performs the rank 1 operation
+*
+*     A := alpha*x*conjg( y' ) + A,
+*
+*  where alpha is a scalar, x is an m element vector, y is an n element
+*  vector and A is an m by n matrix.
+*
+*  Parameters
+*  ==========
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of the matrix A.
+*           M must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX         .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX          array of dimension at least
+*           ( 1 + ( m - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the m
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y.
+*           Unchanged on exit.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, n ).
+*           Before entry, the leading m by n part of the array A must
+*           contain the matrix of coefficients. On exit, A is
+*           overwritten by the updated matrix.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     .. Local Scalars ..
+      COMPLEX            TEMP
+      INTEGER            I, INFO, IX, J, JY, KX
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( M.LT.0 )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 7
+      ELSE IF( LDA.LT.MAX( 1, M ) )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CGERC ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( INCY.GT.0 )THEN
+         JY = 1
+      ELSE
+         JY = 1 - ( N - 1 )*INCY
+      END IF
+      IF( INCX.EQ.1 )THEN
+         DO 20, J = 1, N
+            IF( Y( JY ).NE.ZERO )THEN
+               TEMP = ALPHA*CONJG( Y( JY ) )
+               DO 10, I = 1, M
+                  A( I, J ) = A( I, J ) + X( I )*TEMP
+   10          CONTINUE
+            END IF
+            JY = JY + INCY
+   20    CONTINUE
+      ELSE
+         IF( INCX.GT.0 )THEN
+            KX = 1
+         ELSE
+            KX = 1 - ( M - 1 )*INCX
+         END IF
+         DO 40, J = 1, N
+            IF( Y( JY ).NE.ZERO )THEN
+               TEMP = ALPHA*CONJG( Y( JY ) )
+               IX   = KX
+               DO 30, I = 1, M
+                  A( I, J ) = A( I, J ) + X( IX )*TEMP
+                  IX        = IX        + INCX
+   30          CONTINUE
+            END IF
+            JY = JY + INCY
+   40    CONTINUE
+      END IF
+*
+      RETURN
+*
+*     End of CGERC .
+*
+      END
+      SUBROUTINE CGERU ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
+*     .. Scalar Arguments ..
+      COMPLEX            ALPHA
+      INTEGER            INCX, INCY, LDA, M, N
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CGERU  performs the rank 1 operation
+*
+*     A := alpha*x*y' + A,
+*
+*  where alpha is a scalar, x is an m element vector, y is an n element
+*  vector and A is an m by n matrix.
+*
+*  Parameters
+*  ==========
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of the matrix A.
+*           M must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX         .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX          array of dimension at least
+*           ( 1 + ( m - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the m
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y.
+*           Unchanged on exit.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, n ).
+*           Before entry, the leading m by n part of the array A must
+*           contain the matrix of coefficients. On exit, A is
+*           overwritten by the updated matrix.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     .. Local Scalars ..
+      COMPLEX            TEMP
+      INTEGER            I, INFO, IX, J, JY, KX
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( M.LT.0 )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 7
+      ELSE IF( LDA.LT.MAX( 1, M ) )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CGERU ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( INCY.GT.0 )THEN
+         JY = 1
+      ELSE
+         JY = 1 - ( N - 1 )*INCY
+      END IF
+      IF( INCX.EQ.1 )THEN
+         DO 20, J = 1, N
+            IF( Y( JY ).NE.ZERO )THEN
+               TEMP = ALPHA*Y( JY )
+               DO 10, I = 1, M
+                  A( I, J ) = A( I, J ) + X( I )*TEMP
+   10          CONTINUE
+            END IF
+            JY = JY + INCY
+   20    CONTINUE
+      ELSE
+         IF( INCX.GT.0 )THEN
+            KX = 1
+         ELSE
+            KX = 1 - ( M - 1 )*INCX
+         END IF
+         DO 40, J = 1, N
+            IF( Y( JY ).NE.ZERO )THEN
+               TEMP = ALPHA*Y( JY )
+               IX   = KX
+               DO 30, I = 1, M
+                  A( I, J ) = A( I, J ) + X( IX )*TEMP
+                  IX        = IX        + INCX
+   30          CONTINUE
+            END IF
+            JY = JY + INCY
+   40    CONTINUE
+      END IF
+*
+      RETURN
+*
+*     End of CGERU .
+*
+      END
+      SUBROUTINE CHBMV ( UPLO, N, K, ALPHA, A, LDA, X, INCX,
+     $                   BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      COMPLEX            ALPHA, BETA
+      INTEGER            INCX, INCY, K, LDA, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CHBMV  performs the matrix-vector  operation
+*
+*     y := alpha*A*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are n element vectors and
+*  A is an n by n hermitian band matrix, with k super-diagonals.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the band matrix A is being supplied as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   The upper triangular part of A is
+*                                  being supplied.
+*
+*              UPLO = 'L' or 'l'   The lower triangular part of A is
+*                                  being supplied.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry, K specifies the number of super-diagonals of the
+*           matrix A. K must satisfy  0 .le. K.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX         .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, n ).
+*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
+*           by n part of the array A must contain the upper triangular
+*           band part of the hermitian matrix, supplied column by
+*           column, with the leading diagonal of the matrix in row
+*           ( k + 1 ) of the array, the first super-diagonal starting at
+*           position 2 in row k, and so on. The top left k by k triangle
+*           of the array A is not referenced.
+*           The following program segment will transfer the upper
+*           triangular part of a hermitian band matrix from conventional
+*           full matrix storage to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = K + 1 - J
+*                    DO 10, I = MAX( 1, J - K ), J
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
+*           by n part of the array A must contain the lower triangular
+*           band part of the hermitian matrix, supplied column by
+*           column, with the leading diagonal of the matrix in row 1 of
+*           the array, the first sub-diagonal starting at position 1 in
+*           row 2, and so on. The bottom right k by k triangle of the
+*           array A is not referenced.
+*           The following program segment will transfer the lower
+*           triangular part of a hermitian band matrix from conventional
+*           full matrix storage to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = 1 - J
+*                    DO 10, I = J, MIN( N, J + K )
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set and are assumed to be zero.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           ( k + 1 ).
+*           Unchanged on exit.
+*
+*  X      - COMPLEX          array of DIMENSION at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the
+*           vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX         .
+*           On entry, BETA specifies the scalar beta.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX          array of DIMENSION at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the
+*           vector y. On exit, Y is overwritten by the updated vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX            ONE
+      PARAMETER        ( ONE  = ( 1.0E+0, 0.0E+0 ) )
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     .. Local Scalars ..
+      COMPLEX            TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, MAX, MIN, REAL
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( K.LT.0 )THEN
+         INFO = 3
+      ELSE IF( LDA.LT.( K + 1 ) )THEN
+         INFO = 6
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 8
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 11
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CHBMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     Set up the start points in  X  and  Y.
+*
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( N - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( N - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of the array A
+*     are accessed sequentially with one pass through A.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, N
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, N
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, N
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, N
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  y  when upper triangle of A is stored.
+*
+         KPLUS1 = K + 1
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               TEMP1 = ALPHA*X( J )
+               TEMP2 = ZERO
+               L     = KPLUS1 - J
+               DO 50, I = MAX( 1, J - K ), J - 1
+                  Y( I ) = Y( I ) + TEMP1*A( L + I, J )
+                  TEMP2  = TEMP2  + CONJG( A( L + I, J ) )*X( I )
+   50          CONTINUE
+               Y( J ) = Y( J ) + TEMP1*REAL( A( KPLUS1, J ) )
+     $                         + ALPHA*TEMP2
+   60       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 80, J = 1, N
+               TEMP1 = ALPHA*X( JX )
+               TEMP2 = ZERO
+               IX    = KX
+               IY    = KY
+               L     = KPLUS1 - J
+               DO 70, I = MAX( 1, J - K ), J - 1
+                  Y( IY ) = Y( IY ) + TEMP1*A( L + I, J )
+                  TEMP2   = TEMP2   + CONJG( A( L + I, J ) )*X( IX )
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+   70          CONTINUE
+               Y( JY ) = Y( JY ) + TEMP1*REAL( A( KPLUS1, J ) )
+     $                           + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+               IF( J.GT.K )THEN
+                  KX = KX + INCX
+                  KY = KY + INCY
+               END IF
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y  when lower triangle of A is stored.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 100, J = 1, N
+               TEMP1  = ALPHA*X( J )
+               TEMP2  = ZERO
+               Y( J ) = Y( J ) + TEMP1*REAL( A( 1, J ) )
+               L      = 1      - J
+               DO 90, I = J + 1, MIN( N, J + K )
+                  Y( I ) = Y( I ) + TEMP1*A( L + I, J )
+                  TEMP2  = TEMP2  + CONJG( A( L + I, J ) )*X( I )
+   90          CONTINUE
+               Y( J ) = Y( J ) + ALPHA*TEMP2
+  100       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 120, J = 1, N
+               TEMP1   = ALPHA*X( JX )
+               TEMP2   = ZERO
+               Y( JY ) = Y( JY ) + TEMP1*REAL( A( 1, J ) )
+               L       = 1       - J
+               IX      = JX
+               IY      = JY
+               DO 110, I = J + 1, MIN( N, J + K )
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+                  Y( IY ) = Y( IY ) + TEMP1*A( L + I, J )
+                  TEMP2   = TEMP2   + CONJG( A( L + I, J ) )*X( IX )
+  110          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+  120       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CHBMV .
+*
+      END
+      SUBROUTINE CHEMM ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        SIDE, UPLO
+      INTEGER            M, N, LDA, LDB, LDC
+      COMPLEX            ALPHA, BETA
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), B( LDB, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CHEMM  performs one of the matrix-matrix operations
+*
+*     C := alpha*A*B + beta*C,
+*
+*  or
+*
+*     C := alpha*B*A + beta*C,
+*
+*  where alpha and beta are scalars, A is an hermitian matrix and  B and
+*  C are m by n matrices.
+*
+*  Parameters
+*  ==========
+*
+*  SIDE   - CHARACTER*1.
+*           On entry,  SIDE  specifies whether  the  hermitian matrix  A
+*           appears on the  left or right  in the  operation as follows:
+*
+*              SIDE = 'L' or 'l'   C := alpha*A*B + beta*C,
+*
+*              SIDE = 'R' or 'r'   C := alpha*B*A + beta*C,
+*
+*           Unchanged on exit.
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of  the  hermitian  matrix   A  is  to  be
+*           referenced as follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of the
+*                                  hermitian matrix is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of the
+*                                  hermitian matrix is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry,  M  specifies the number of rows of the matrix  C.
+*           M  must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix C.
+*           N  must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX         .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, ka ), where ka is
+*           m  when  SIDE = 'L' or 'l'  and is n  otherwise.
+*           Before entry  with  SIDE = 'L' or 'l',  the  m by m  part of
+*           the array  A  must contain the  hermitian matrix,  such that
+*           when  UPLO = 'U' or 'u', the leading m by m upper triangular
+*           part of the array  A  must contain the upper triangular part
+*           of the  hermitian matrix and the  strictly  lower triangular
+*           part of  A  is not referenced,  and when  UPLO = 'L' or 'l',
+*           the leading  m by m  lower triangular part  of the  array  A
+*           must  contain  the  lower triangular part  of the  hermitian
+*           matrix and the  strictly upper triangular part of  A  is not
+*           referenced.
+*           Before entry  with  SIDE = 'R' or 'r',  the  n by n  part of
+*           the array  A  must contain the  hermitian matrix,  such that
+*           when  UPLO = 'U' or 'u', the leading n by n upper triangular
+*           part of the array  A  must contain the upper triangular part
+*           of the  hermitian matrix and the  strictly  lower triangular
+*           part of  A  is not referenced,  and when  UPLO = 'L' or 'l',
+*           the leading  n by n  lower triangular part  of the  array  A
+*           must  contain  the  lower triangular part  of the  hermitian
+*           matrix and the  strictly upper triangular part of  A  is not
+*           referenced.
+*           Note that the imaginary parts  of the diagonal elements need
+*           not be set, they are assumed to be zero.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the  calling (sub) program. When  SIDE = 'L' or 'l'  then
+*           LDA must be at least  max( 1, m ), otherwise  LDA must be at
+*           least max( 1, n ).
+*           Unchanged on exit.
+*
+*  B      - COMPLEX          array of DIMENSION ( LDB, n ).
+*           Before entry, the leading  m by n part of the array  B  must
+*           contain the matrix B.
+*           Unchanged on exit.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   LDB  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX         .
+*           On entry,  BETA  specifies the scalar  beta.  When  BETA  is
+*           supplied as zero then C need not be set on input.
+*           Unchanged on exit.
+*
+*  C      - COMPLEX          array of DIMENSION ( LDC, n ).
+*           Before entry, the leading  m by n  part of the array  C must
+*           contain the matrix  C,  except when  beta  is zero, in which
+*           case C need not be set on entry.
+*           On exit, the array  C  is overwritten by the  m by n updated
+*           matrix.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, MAX, REAL
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, K, NROWA
+      COMPLEX            TEMP1, TEMP2
+*     .. Parameters ..
+      COMPLEX            ONE
+      PARAMETER        ( ONE  = ( 1.0E+0, 0.0E+0 ) )
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Set NROWA as the number of rows of A.
+*
+      IF( LSAME( SIDE, 'L' ) )THEN
+         NROWA = M
+      ELSE
+         NROWA = N
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF(      ( .NOT.LSAME( SIDE, 'L' ) ).AND.
+     $         ( .NOT.LSAME( SIDE, 'R' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.UPPER              ).AND.
+     $         ( .NOT.LSAME( UPLO, 'L' ) )      )THEN
+         INFO = 2
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 7
+      ELSE IF( LDB.LT.MAX( 1, M     ) )THEN
+         INFO = 9
+      ELSE IF( LDC.LT.MAX( 1, M     ) )THEN
+         INFO = 12
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CHEMM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( BETA.EQ.ZERO )THEN
+            DO 20, J = 1, N
+               DO 10, I = 1, M
+                  C( I, J ) = ZERO
+   10          CONTINUE
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               DO 30, I = 1, M
+                  C( I, J ) = BETA*C( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( SIDE, 'L' ) )THEN
+*
+*        Form  C := alpha*A*B + beta*C.
+*
+         IF( UPPER )THEN
+            DO 70, J = 1, N
+               DO 60, I = 1, M
+                  TEMP1 = ALPHA*B( I, J )
+                  TEMP2 = ZERO
+                  DO 50, K = 1, I - 1
+                     C( K, J ) = C( K, J ) + TEMP1*A( K, I )
+                     TEMP2     = TEMP2     +
+     $                           B( K, J )*CONJG(  A( K, I ) )
+   50             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = TEMP1*REAL( A( I, I ) ) +
+     $                           ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J )         +
+     $                           TEMP1*REAL( A( I, I ) ) +
+     $                           ALPHA*TEMP2
+                  END IF
+   60          CONTINUE
+   70       CONTINUE
+         ELSE
+            DO 100, J = 1, N
+               DO 90, I = M, 1, -1
+                  TEMP1 = ALPHA*B( I, J )
+                  TEMP2 = ZERO
+                  DO 80, K = I + 1, M
+                     C( K, J ) = C( K, J ) + TEMP1*A( K, I )
+                     TEMP2     = TEMP2     +
+     $                           B( K, J )*CONJG(  A( K, I ) )
+   80             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = TEMP1*REAL( A( I, I ) ) +
+     $                           ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J )         +
+     $                           TEMP1*REAL( A( I, I ) ) +
+     $                           ALPHA*TEMP2
+                  END IF
+   90          CONTINUE
+  100       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*B*A + beta*C.
+*
+         DO 170, J = 1, N
+            TEMP1 = ALPHA*REAL( A( J, J ) )
+            IF( BETA.EQ.ZERO )THEN
+               DO 110, I = 1, M
+                  C( I, J ) = TEMP1*B( I, J )
+  110          CONTINUE
+            ELSE
+               DO 120, I = 1, M
+                  C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J )
+  120          CONTINUE
+            END IF
+            DO 140, K = 1, J - 1
+               IF( UPPER )THEN
+                  TEMP1 = ALPHA*A( K, J )
+               ELSE
+                  TEMP1 = ALPHA*CONJG( A( J, K ) )
+               END IF
+               DO 130, I = 1, M
+                  C( I, J ) = C( I, J ) + TEMP1*B( I, K )
+  130          CONTINUE
+  140       CONTINUE
+            DO 160, K = J + 1, N
+               IF( UPPER )THEN
+                  TEMP1 = ALPHA*CONJG( A( J, K ) )
+               ELSE
+                  TEMP1 = ALPHA*A( K, J )
+               END IF
+               DO 150, I = 1, M
+                  C( I, J ) = C( I, J ) + TEMP1*B( I, K )
+  150          CONTINUE
+  160       CONTINUE
+  170    CONTINUE
+      END IF
+*
+      RETURN
+*
+*     End of CHEMM .
+*
+      END
+      SUBROUTINE CHEMV ( UPLO, N, ALPHA, A, LDA, X, INCX,
+     $                   BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      COMPLEX            ALPHA, BETA
+      INTEGER            INCX, INCY, LDA, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CHEMV  performs the matrix-vector  operation
+*
+*     y := alpha*A*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are n element vectors and
+*  A is an n by n hermitian matrix.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the array A is to be referenced as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of A
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of A
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX         .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular part of the hermitian matrix and the strictly
+*           lower triangular part of A is not referenced.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular part of the hermitian matrix and the strictly
+*           upper triangular part of A is not referenced.
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set and are assumed to be zero.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*  X      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX         .
+*           On entry, BETA specifies the scalar beta. When BETA is
+*           supplied as zero then Y need not be set on input.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y. On exit, Y is overwritten by the updated
+*           vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX            ONE
+      PARAMETER        ( ONE  = ( 1.0E+0, 0.0E+0 ) )
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     .. Local Scalars ..
+      COMPLEX            TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, KX, KY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, MAX, REAL
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 5
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 7
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 10
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CHEMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     Set up the start points in  X  and  Y.
+*
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( N - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( N - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through the triangular part
+*     of A.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, N
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, N
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, N
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, N
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  y  when A is stored in upper triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               TEMP1 = ALPHA*X( J )
+               TEMP2 = ZERO
+               DO 50, I = 1, J - 1
+                  Y( I ) = Y( I ) + TEMP1*A( I, J )
+                  TEMP2  = TEMP2  + CONJG( A( I, J ) )*X( I )
+   50          CONTINUE
+               Y( J ) = Y( J ) + TEMP1*REAL( A( J, J ) ) + ALPHA*TEMP2
+   60       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 80, J = 1, N
+               TEMP1 = ALPHA*X( JX )
+               TEMP2 = ZERO
+               IX    = KX
+               IY    = KY
+               DO 70, I = 1, J - 1
+                  Y( IY ) = Y( IY ) + TEMP1*A( I, J )
+                  TEMP2   = TEMP2   + CONJG( A( I, J ) )*X( IX )
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+   70          CONTINUE
+               Y( JY ) = Y( JY ) + TEMP1*REAL( A( J, J ) ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y  when A is stored in lower triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 100, J = 1, N
+               TEMP1  = ALPHA*X( J )
+               TEMP2  = ZERO
+               Y( J ) = Y( J ) + TEMP1*REAL( A( J, J ) )
+               DO 90, I = J + 1, N
+                  Y( I ) = Y( I ) + TEMP1*A( I, J )
+                  TEMP2  = TEMP2  + CONJG( A( I, J ) )*X( I )
+   90          CONTINUE
+               Y( J ) = Y( J ) + ALPHA*TEMP2
+  100       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 120, J = 1, N
+               TEMP1   = ALPHA*X( JX )
+               TEMP2   = ZERO
+               Y( JY ) = Y( JY ) + TEMP1*REAL( A( J, J ) )
+               IX      = JX
+               IY      = JY
+               DO 110, I = J + 1, N
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+                  Y( IY ) = Y( IY ) + TEMP1*A( I, J )
+                  TEMP2   = TEMP2   + CONJG( A( I, J ) )*X( IX )
+  110          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+  120       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CHEMV .
+*
+      END
+      SUBROUTINE CHER2 ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA )
+*     .. Scalar Arguments ..
+      COMPLEX            ALPHA
+      INTEGER            INCX, INCY, LDA, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CHER2  performs the hermitian rank 2 operation
+*
+*     A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A,
+*
+*  where alpha is a scalar, x and y are n element vectors and A is an n
+*  by n hermitian matrix.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the array A is to be referenced as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of A
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of A
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX         .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y.
+*           Unchanged on exit.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular part of the hermitian matrix and the strictly
+*           lower triangular part of A is not referenced. On exit, the
+*           upper triangular part of the array A is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular part of the hermitian matrix and the strictly
+*           upper triangular part of A is not referenced. On exit, the
+*           lower triangular part of the array A is overwritten by the
+*           lower triangular part of the updated matrix.
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set, they are assumed to be zero, and on exit they
+*           are set to zero.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     .. Local Scalars ..
+      COMPLEX            TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, KX, KY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, MAX, REAL
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 7
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CHER2 ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Set up the start points in X and Y if the increments are not both
+*     unity.
+*
+      IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN
+         IF( INCX.GT.0 )THEN
+            KX = 1
+         ELSE
+            KX = 1 - ( N - 1 )*INCX
+         END IF
+         IF( INCY.GT.0 )THEN
+            KY = 1
+         ELSE
+            KY = 1 - ( N - 1 )*INCY
+         END IF
+         JX = KX
+         JY = KY
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through the triangular part
+*     of A.
+*
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  A  when A is stored in the upper triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 20, J = 1, N
+               IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*CONJG( Y( J ) )
+                  TEMP2 = CONJG( ALPHA*X( J ) )
+                  DO 10, I = 1, J - 1
+                     A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2
+   10             CONTINUE
+                  A( J, J ) = REAL( A( J, J ) ) +
+     $                        REAL( X( J )*TEMP1 + Y( J )*TEMP2 )
+               ELSE
+                  A( J, J ) = REAL( A( J, J ) )
+               END IF
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*CONJG( Y( JY ) )
+                  TEMP2 = CONJG( ALPHA*X( JX ) )
+                  IX    = KX
+                  IY    = KY
+                  DO 30, I = 1, J - 1
+                     A( I, J ) = A( I, J ) + X( IX )*TEMP1
+     $                                     + Y( IY )*TEMP2
+                     IX        = IX        + INCX
+                     IY        = IY        + INCY
+   30             CONTINUE
+                  A( J, J ) = REAL( A( J, J ) ) +
+     $                        REAL( X( JX )*TEMP1 + Y( JY )*TEMP2 )
+               ELSE
+                  A( J, J ) = REAL( A( J, J ) )
+               END IF
+               JX = JX + INCX
+               JY = JY + INCY
+   40       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  A  when A is stored in the lower triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN
+                  TEMP1     = ALPHA*CONJG( Y( J ) )
+                  TEMP2     = CONJG( ALPHA*X( J ) )
+                  A( J, J ) = REAL( A( J, J ) ) +
+     $                        REAL( X( J )*TEMP1 + Y( J )*TEMP2 )
+                  DO 50, I = J + 1, N
+                     A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2
+   50             CONTINUE
+               ELSE
+                  A( J, J ) = REAL( A( J, J ) )
+               END IF
+   60       CONTINUE
+         ELSE
+            DO 80, J = 1, N
+               IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN
+                  TEMP1     = ALPHA*CONJG( Y( JY ) )
+                  TEMP2     = CONJG( ALPHA*X( JX ) )
+                  A( J, J ) = REAL( A( J, J ) ) +
+     $                        REAL( X( JX )*TEMP1 + Y( JY )*TEMP2 )
+                  IX        = JX
+                  IY        = JY
+                  DO 70, I = J + 1, N
+                     IX        = IX        + INCX
+                     IY        = IY        + INCY
+                     A( I, J ) = A( I, J ) + X( IX )*TEMP1
+     $                                     + Y( IY )*TEMP2
+   70             CONTINUE
+               ELSE
+                  A( J, J ) = REAL( A( J, J ) )
+               END IF
+               JX = JX + INCX
+               JY = JY + INCY
+   80       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CHER2 .
+*
+      END
+      SUBROUTINE CHER2K( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        UPLO, TRANS
+      INTEGER            N, K, LDA, LDB, LDC
+      REAL               BETA
+      COMPLEX            ALPHA
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), B( LDB, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CHER2K  performs one of the hermitian rank 2k operations
+*
+*     C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + beta*C,
+*
+*  or
+*
+*     C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A + beta*C,
+*
+*  where  alpha and beta  are scalars with  beta  real,  C is an  n by n
+*  hermitian matrix and  A and B  are  n by k matrices in the first case
+*  and  k by n  matrices in the second case.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of the  array  C  is to be  referenced  as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the  upper triangular part of  C
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the  lower triangular part of  C
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry,  TRANS  specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'    C := alpha*A*conjg( B' )          +
+*                                         conjg( alpha )*B*conjg( A' ) +
+*                                         beta*C.
+*
+*              TRANS = 'C' or 'c'    C := alpha*conjg( A' )*B          +
+*                                         conjg( alpha )*conjg( B' )*A +
+*                                         beta*C.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry,  N specifies the order of the matrix C.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with  TRANS = 'N' or 'n',  K  specifies  the number
+*           of  columns  of the  matrices  A and B,  and on  entry  with
+*           TRANS = 'C' or 'c',  K  specifies  the number of rows of the
+*           matrices  A and B.  K must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX         .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, ka ), where ka is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  A  must contain the matrix  A,  otherwise
+*           the leading  k by n  part of the array  A  must contain  the
+*           matrix A.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDA must be at least  max( 1, n ), otherwise  LDA must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  B      - COMPLEX          array of DIMENSION ( LDB, kb ), where kb is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  B  must contain the matrix  B,  otherwise
+*           the leading  k by n  part of the array  B  must contain  the
+*           matrix B.
+*           Unchanged on exit.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDB must be at least  max( 1, n ), otherwise  LDB must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  BETA   - REAL            .
+*           On entry, BETA specifies the scalar beta.
+*           Unchanged on exit.
+*
+*  C      - COMPLEX          array of DIMENSION ( LDC, n ).
+*           Before entry  with  UPLO = 'U' or 'u',  the leading  n by n
+*           upper triangular part of the array C must contain the upper
+*           triangular part  of the  hermitian matrix  and the strictly
+*           lower triangular part of C is not referenced.  On exit, the
+*           upper triangular part of the array  C is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry  with  UPLO = 'L' or 'l',  the leading  n by n
+*           lower triangular part of the array C must contain the lower
+*           triangular part  of the  hermitian matrix  and the strictly
+*           upper triangular part of C is not referenced.  On exit, the
+*           lower triangular part of the array  C is overwritten by the
+*           lower triangular part of the updated matrix.
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set,  they are assumed to be zero,  and on exit they
+*           are set to zero.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*  -- Modified 8-Nov-93 to set C(J,J) to REAL( C(J,J) ) when BETA = 1.
+*     Ed Anderson, Cray Research Inc.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, MAX, REAL
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, L, NROWA
+      COMPLEX            TEMP1, TEMP2
+*     .. Parameters ..
+      REAL               ONE
+      PARAMETER        ( ONE  = 1.0E+0 )
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         NROWA = N
+      ELSE
+         NROWA = K
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+      INFO = 0
+      IF(      ( .NOT.UPPER               ).AND.
+     $         ( .NOT.LSAME( UPLO , 'L' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANS, 'C' ) )      )THEN
+         INFO = 2
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( K  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 7
+      ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN
+         INFO = 9
+      ELSE IF( LDC.LT.MAX( 1, N     ) )THEN
+         INFO = 12
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CHER2K', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.
+     $    ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( UPPER )THEN
+            IF( BETA.EQ.REAL( ZERO ) )THEN
+               DO 20, J = 1, N
+                  DO 10, I = 1, J
+                     C( I, J ) = ZERO
+   10             CONTINUE
+   20          CONTINUE
+            ELSE
+               DO 40, J = 1, N
+                  DO 30, I = 1, J - 1
+                     C( I, J ) = BETA*C( I, J )
+   30             CONTINUE
+                  C( J, J ) = BETA*REAL( C( J, J ) )
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( BETA.EQ.REAL( ZERO ) )THEN
+               DO 60, J = 1, N
+                  DO 50, I = J, N
+                     C( I, J ) = ZERO
+   50             CONTINUE
+   60          CONTINUE
+            ELSE
+               DO 80, J = 1, N
+                  C( J, J ) = BETA*REAL( C( J, J ) )
+                  DO 70, I = J + 1, N
+                     C( I, J ) = BETA*C( I, J )
+   70             CONTINUE
+   80          CONTINUE
+            END IF
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) +
+*                   C.
+*
+         IF( UPPER )THEN
+            DO 130, J = 1, N
+               IF( BETA.EQ.REAL( ZERO ) )THEN
+                  DO 90, I = 1, J
+                     C( I, J ) = ZERO
+   90             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 100, I = 1, J - 1
+                     C( I, J ) = BETA*C( I, J )
+  100             CONTINUE
+                  C( J, J ) = BETA*REAL( C( J, J ) )
+               ELSE
+                  C( J, J ) = REAL( C( J, J ) )
+               END IF
+               DO 120, L = 1, K
+                  IF( ( A( J, L ).NE.ZERO ).OR.
+     $                ( B( J, L ).NE.ZERO )     )THEN
+                     TEMP1 = ALPHA*CONJG( B( J, L ) )
+                     TEMP2 = CONJG( ALPHA*A( J, L ) )
+                     DO 110, I = 1, J - 1
+                        C( I, J ) = C( I, J ) + A( I, L )*TEMP1 +
+     $                                          B( I, L )*TEMP2
+  110                CONTINUE
+                     C( J, J ) = REAL( C( J, J ) )         +
+     $                           REAL( A( J, L )*TEMP1 +
+     $                                 B( J, L )*TEMP2   )
+                  END IF
+  120          CONTINUE
+  130       CONTINUE
+         ELSE
+            DO 180, J = 1, N
+               IF( BETA.EQ.REAL( ZERO ) )THEN
+                  DO 140, I = J, N
+                     C( I, J ) = ZERO
+  140             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 150, I = J + 1, N
+                     C( I, J ) = BETA*C( I, J )
+  150             CONTINUE
+                  C( J, J ) = BETA*REAL( C( J, J ) )
+               ELSE
+                  C( J, J ) = REAL( C( J, J ) )
+               END IF
+               DO 170, L = 1, K
+                  IF( ( A( J, L ).NE.ZERO ).OR.
+     $                ( B( J, L ).NE.ZERO )     )THEN
+                     TEMP1 = ALPHA*CONJG( B( J, L ) )
+                     TEMP2 = CONJG( ALPHA*A( J, L ) )
+                     DO 160, I = J + 1, N
+                        C( I, J ) = C( I, J ) + A( I, L )*TEMP1 +
+     $                                          B( I, L )*TEMP2
+  160                CONTINUE
+                     C( J, J ) = REAL( C( J, J ) )         +
+     $                           REAL( A( J, L )*TEMP1 +
+     $                                 B( J, L )*TEMP2   )
+                  END IF
+  170          CONTINUE
+  180       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A +
+*                   C.
+*
+         IF( UPPER )THEN
+            DO 210, J = 1, N
+               DO 200, I = 1, J
+                  TEMP1 = ZERO
+                  TEMP2 = ZERO
+                  DO 190, L = 1, K
+                     TEMP1 = TEMP1 + CONJG( A( L, I ) )*B( L, J )
+                     TEMP2 = TEMP2 + CONJG( B( L, I ) )*A( L, J )
+  190             CONTINUE
+                  IF( I.EQ.J )THEN
+                     IF( BETA.EQ.REAL( ZERO ) )THEN
+                        C( J, J ) = REAL(        ALPHA  *TEMP1 +
+     $                                    CONJG( ALPHA )*TEMP2   )
+                     ELSE
+                        C( J, J ) = BETA*REAL( C( J, J ) )         +
+     $                              REAL(        ALPHA  *TEMP1 +
+     $                                    CONJG( ALPHA )*TEMP2   )
+                     END IF
+                  ELSE
+                     IF( BETA.EQ.REAL( ZERO ) )THEN
+                        C( I, J ) = ALPHA*TEMP1 + CONJG( ALPHA )*TEMP2
+                     ELSE
+                        C( I, J ) = BETA *C( I, J ) +
+     $                              ALPHA*TEMP1 + CONJG( ALPHA )*TEMP2
+                     END IF
+                  END IF
+  200          CONTINUE
+  210       CONTINUE
+         ELSE
+            DO 240, J = 1, N
+               DO 230, I = J, N
+                  TEMP1 = ZERO
+                  TEMP2 = ZERO
+                  DO 220, L = 1, K
+                     TEMP1 = TEMP1 + CONJG( A( L, I ) )*B( L, J )
+                     TEMP2 = TEMP2 + CONJG( B( L, I ) )*A( L, J )
+  220             CONTINUE
+                  IF( I.EQ.J )THEN
+                     IF( BETA.EQ.REAL( ZERO ) )THEN
+                        C( J, J ) = REAL(        ALPHA  *TEMP1 +
+     $                                    CONJG( ALPHA )*TEMP2   )
+                     ELSE
+                        C( J, J ) = BETA*REAL( C( J, J ) )         +
+     $                              REAL(        ALPHA  *TEMP1 +
+     $                                    CONJG( ALPHA )*TEMP2   )
+                     END IF
+                  ELSE
+                     IF( BETA.EQ.REAL( ZERO ) )THEN
+                        C( I, J ) = ALPHA*TEMP1 + CONJG( ALPHA )*TEMP2
+                     ELSE
+                        C( I, J ) = BETA *C( I, J ) +
+     $                              ALPHA*TEMP1 + CONJG( ALPHA )*TEMP2
+                     END IF
+                  END IF
+  230          CONTINUE
+  240       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CHER2K.
+*
+      END
+      SUBROUTINE CHER  ( UPLO, N, ALPHA, X, INCX, A, LDA )
+*     .. Scalar Arguments ..
+      REAL               ALPHA
+      INTEGER            INCX, LDA, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CHER   performs the hermitian rank 1 operation
+*
+*     A := alpha*x*conjg( x' ) + A,
+*
+*  where alpha is a real scalar, x is an n element vector and A is an
+*  n by n hermitian matrix.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the array A is to be referenced as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of A
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of A
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular part of the hermitian matrix and the strictly
+*           lower triangular part of A is not referenced. On exit, the
+*           upper triangular part of the array A is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular part of the hermitian matrix and the strictly
+*           upper triangular part of A is not referenced. On exit, the
+*           lower triangular part of the array A is overwritten by the
+*           lower triangular part of the updated matrix.
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set, they are assumed to be zero, and on exit they
+*           are set to zero.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     .. Local Scalars ..
+      COMPLEX            TEMP
+      INTEGER            I, INFO, IX, J, JX, KX
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, MAX, REAL
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 7
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CHER  ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ALPHA.EQ.REAL( ZERO ) ) )
+     $   RETURN
+*
+*     Set the start point in X if the increment is not unity.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through the triangular part
+*     of A.
+*
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  A  when A is stored in upper triangle.
+*
+         IF( INCX.EQ.1 )THEN
+            DO 20, J = 1, N
+               IF( X( J ).NE.ZERO )THEN
+                  TEMP = ALPHA*CONJG( X( J ) )
+                  DO 10, I = 1, J - 1
+                     A( I, J ) = A( I, J ) + X( I )*TEMP
+   10             CONTINUE
+                  A( J, J ) = REAL( A( J, J ) ) + REAL( X( J )*TEMP )
+               ELSE
+                  A( J, J ) = REAL( A( J, J ) )
+               END IF
+   20       CONTINUE
+         ELSE
+            JX = KX
+            DO 40, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*CONJG( X( JX ) )
+                  IX   = KX
+                  DO 30, I = 1, J - 1
+                     A( I, J ) = A( I, J ) + X( IX )*TEMP
+                     IX        = IX        + INCX
+   30             CONTINUE
+                  A( J, J ) = REAL( A( J, J ) ) + REAL( X( JX )*TEMP )
+               ELSE
+                  A( J, J ) = REAL( A( J, J ) )
+               END IF
+               JX = JX + INCX
+   40       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  A  when A is stored in lower triangle.
+*
+         IF( INCX.EQ.1 )THEN
+            DO 60, J = 1, N
+               IF( X( J ).NE.ZERO )THEN
+                  TEMP      = ALPHA*CONJG( X( J ) )
+                  A( J, J ) = REAL( A( J, J ) ) + REAL( TEMP*X( J ) )
+                  DO 50, I = J + 1, N
+                     A( I, J ) = A( I, J ) + X( I )*TEMP
+   50             CONTINUE
+               ELSE
+                  A( J, J ) = REAL( A( J, J ) )
+               END IF
+   60       CONTINUE
+         ELSE
+            JX = KX
+            DO 80, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP      = ALPHA*CONJG( X( JX ) )
+                  A( J, J ) = REAL( A( J, J ) ) + REAL( TEMP*X( JX ) )
+                  IX        = JX
+                  DO 70, I = J + 1, N
+                     IX        = IX        + INCX
+                     A( I, J ) = A( I, J ) + X( IX )*TEMP
+   70             CONTINUE
+               ELSE
+                  A( J, J ) = REAL( A( J, J ) )
+               END IF
+               JX = JX + INCX
+   80       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CHER  .
+*
+      END
+      SUBROUTINE CHERK ( UPLO, TRANS, N, K, ALPHA, A, LDA,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        UPLO, TRANS
+      INTEGER            N, K, LDA, LDC
+      REAL               ALPHA, BETA
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CHERK  performs one of the hermitian rank k operations
+*
+*     C := alpha*A*conjg( A' ) + beta*C,
+*
+*  or
+*
+*     C := alpha*conjg( A' )*A + beta*C,
+*
+*  where  alpha and beta  are  real scalars,  C is an  n by n  hermitian
+*  matrix and  A  is an  n by k  matrix in the  first case and a  k by n
+*  matrix in the second case.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of the  array  C  is to be  referenced  as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the  upper triangular part of  C
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the  lower triangular part of  C
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry,  TRANS  specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   C := alpha*A*conjg( A' ) + beta*C.
+*
+*              TRANS = 'C' or 'c'   C := alpha*conjg( A' )*A + beta*C.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry,  N specifies the order of the matrix C.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with  TRANS = 'N' or 'n',  K  specifies  the number
+*           of  columns   of  the   matrix   A,   and  on   entry   with
+*           TRANS = 'C' or 'c',  K  specifies  the number of rows of the
+*           matrix A.  K must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, ka ), where ka is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  A  must contain the matrix  A,  otherwise
+*           the leading  k by n  part of the array  A  must contain  the
+*           matrix A.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDA must be at least  max( 1, n ), otherwise  LDA must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  BETA   - REAL            .
+*           On entry, BETA specifies the scalar beta.
+*           Unchanged on exit.
+*
+*  C      - COMPLEX          array of DIMENSION ( LDC, n ).
+*           Before entry  with  UPLO = 'U' or 'u',  the leading  n by n
+*           upper triangular part of the array C must contain the upper
+*           triangular part  of the  hermitian matrix  and the strictly
+*           lower triangular part of C is not referenced.  On exit, the
+*           upper triangular part of the array  C is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry  with  UPLO = 'L' or 'l',  the leading  n by n
+*           lower triangular part of the array C must contain the lower
+*           triangular part  of the  hermitian matrix  and the strictly
+*           upper triangular part of C is not referenced.  On exit, the
+*           lower triangular part of the array  C is overwritten by the
+*           lower triangular part of the updated matrix.
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set,  they are assumed to be zero,  and on exit they
+*           are set to zero.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*  -- Modified 8-Nov-93 to set C(J,J) to REAL( C(J,J) ) when BETA = 1.
+*     Ed Anderson, Cray Research Inc.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CMPLX, CONJG, MAX, REAL
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, L, NROWA
+      REAL               RTEMP
+      COMPLEX            TEMP
+*     .. Parameters ..
+      REAL               ONE ,         ZERO
+      PARAMETER        ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         NROWA = N
+      ELSE
+         NROWA = K
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+      INFO = 0
+      IF(      ( .NOT.UPPER               ).AND.
+     $         ( .NOT.LSAME( UPLO , 'L' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANS, 'C' ) )      )THEN
+         INFO = 2
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( K  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 7
+      ELSE IF( LDC.LT.MAX( 1, N     ) )THEN
+         INFO = 10
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CHERK ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.
+     $    ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( UPPER )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 20, J = 1, N
+                  DO 10, I = 1, J
+                     C( I, J ) = ZERO
+   10             CONTINUE
+   20          CONTINUE
+            ELSE
+               DO 40, J = 1, N
+                  DO 30, I = 1, J - 1
+                     C( I, J ) = BETA*C( I, J )
+   30             CONTINUE
+                  C( J, J ) = BETA*REAL( C( J, J ) )
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( BETA.EQ.ZERO )THEN
+               DO 60, J = 1, N
+                  DO 50, I = J, N
+                     C( I, J ) = ZERO
+   50             CONTINUE
+   60          CONTINUE
+            ELSE
+               DO 80, J = 1, N
+                  C( J, J ) = BETA*REAL( C( J, J ) )
+                  DO 70, I = J + 1, N
+                     C( I, J ) = BETA*C( I, J )
+   70             CONTINUE
+   80          CONTINUE
+            END IF
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  C := alpha*A*conjg( A' ) + beta*C.
+*
+         IF( UPPER )THEN
+            DO 130, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 90, I = 1, J
+                     C( I, J ) = ZERO
+   90             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 100, I = 1, J - 1
+                     C( I, J ) = BETA*C( I, J )
+  100             CONTINUE
+                  C( J, J ) = BETA*REAL( C( J, J ) )
+               ELSE
+                  C( J, J ) = REAL( C( J, J ) )
+               END IF
+               DO 120, L = 1, K
+                  IF( A( J, L ).NE.CMPLX( ZERO ) )THEN
+                     TEMP = ALPHA*CONJG( A( J, L ) )
+                     DO 110, I = 1, J - 1
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  110                CONTINUE
+                     C( J, J ) = REAL( C( J, J )      ) +
+     $                           REAL( TEMP*A( I, L ) )
+                  END IF
+  120          CONTINUE
+  130       CONTINUE
+         ELSE
+            DO 180, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 140, I = J, N
+                     C( I, J ) = ZERO
+  140             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  C( J, J ) = BETA*REAL( C( J, J ) )
+                  DO 150, I = J + 1, N
+                     C( I, J ) = BETA*C( I, J )
+  150             CONTINUE
+               ELSE
+                  C( J, J ) = REAL( C( J, J ) )
+               END IF
+               DO 170, L = 1, K
+                  IF( A( J, L ).NE.CMPLX( ZERO ) )THEN
+                     TEMP      = ALPHA*CONJG( A( J, L ) )
+                     C( J, J ) = REAL( C( J, J )      )   +
+     $                           REAL( TEMP*A( J, L ) )
+                     DO 160, I = J + 1, N
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  160                CONTINUE
+                  END IF
+  170          CONTINUE
+  180       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*conjg( A' )*A + beta*C.
+*
+         IF( UPPER )THEN
+            DO 220, J = 1, N
+               DO 200, I = 1, J - 1
+                  TEMP = ZERO
+                  DO 190, L = 1, K
+                     TEMP = TEMP + CONJG( A( L, I ) )*A( L, J )
+  190             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  200          CONTINUE
+               RTEMP = ZERO
+               DO 210, L = 1, K
+                  RTEMP = RTEMP + CONJG( A( L, J ) )*A( L, J )
+  210          CONTINUE
+               IF( BETA.EQ.ZERO )THEN
+                  C( J, J ) = ALPHA*RTEMP
+               ELSE
+                  C( J, J ) = ALPHA*RTEMP + BETA*REAL( C( J, J ) )
+               END IF
+  220       CONTINUE
+         ELSE
+            DO 260, J = 1, N
+               RTEMP = ZERO
+               DO 230, L = 1, K
+                  RTEMP = RTEMP + CONJG( A( L, J ) )*A( L, J )
+  230          CONTINUE
+               IF( BETA.EQ.ZERO )THEN
+                  C( J, J ) = ALPHA*RTEMP
+               ELSE
+                  C( J, J ) = ALPHA*RTEMP + BETA*REAL( C( J, J ) )
+               END IF
+               DO 250, I = J + 1, N
+                  TEMP = ZERO
+                  DO 240, L = 1, K
+                     TEMP = TEMP + CONJG( A( L, I ) )*A( L, J )
+  240             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  250          CONTINUE
+  260       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CHERK .
+*
+      END
+      SUBROUTINE CHPMV ( UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      COMPLEX            ALPHA, BETA
+      INTEGER            INCX, INCY, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      COMPLEX            AP( * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CHPMV  performs the matrix-vector operation
+*
+*     y := alpha*A*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are n element vectors and
+*  A is an n by n hermitian matrix, supplied in packed form.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the matrix A is supplied in the packed
+*           array AP as follows:
+*
+*              UPLO = 'U' or 'u'   The upper triangular part of A is
+*                                  supplied in AP.
+*
+*              UPLO = 'L' or 'l'   The lower triangular part of A is
+*                                  supplied in AP.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX         .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  AP     - COMPLEX          array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular part of the hermitian matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
+*           and a( 2, 2 ) respectively, and so on.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular part of the hermitian matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
+*           and a( 3, 1 ) respectively, and so on.
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set and are assumed to be zero.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX         .
+*           On entry, BETA specifies the scalar beta. When BETA is
+*           supplied as zero then Y need not be set on input.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y. On exit, Y is overwritten by the updated
+*           vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX            ONE
+      PARAMETER        ( ONE  = ( 1.0E+0, 0.0E+0 ) )
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     .. Local Scalars ..
+      COMPLEX            TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, REAL
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 6
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CHPMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     Set up the start points in  X  and  Y.
+*
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( N - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( N - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of the array AP
+*     are accessed sequentially with one pass through AP.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, N
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, N
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, N
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, N
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      KK = 1
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  y  when AP contains the upper triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               TEMP1 = ALPHA*X( J )
+               TEMP2 = ZERO
+               K     = KK
+               DO 50, I = 1, J - 1
+                  Y( I ) = Y( I ) + TEMP1*AP( K )
+                  TEMP2  = TEMP2  + CONJG( AP( K ) )*X( I )
+                  K      = K      + 1
+   50          CONTINUE
+               Y( J ) = Y( J ) + TEMP1*REAL( AP( KK + J - 1 ) )
+     $                         + ALPHA*TEMP2
+               KK     = KK     + J
+   60       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 80, J = 1, N
+               TEMP1 = ALPHA*X( JX )
+               TEMP2 = ZERO
+               IX    = KX
+               IY    = KY
+               DO 70, K = KK, KK + J - 2
+                  Y( IY ) = Y( IY ) + TEMP1*AP( K )
+                  TEMP2   = TEMP2   + CONJG( AP( K ) )*X( IX )
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+   70          CONTINUE
+               Y( JY ) = Y( JY ) + TEMP1*REAL( AP( KK + J - 1 ) )
+     $                           + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+               KK      = KK      + J
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y  when AP contains the lower triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 100, J = 1, N
+               TEMP1  = ALPHA*X( J )
+               TEMP2  = ZERO
+               Y( J ) = Y( J ) + TEMP1*REAL( AP( KK ) )
+               K      = KK     + 1
+               DO 90, I = J + 1, N
+                  Y( I ) = Y( I ) + TEMP1*AP( K )
+                  TEMP2  = TEMP2  + CONJG( AP( K ) )*X( I )
+                  K      = K      + 1
+   90          CONTINUE
+               Y( J ) = Y( J ) + ALPHA*TEMP2
+               KK     = KK     + ( N - J + 1 )
+  100       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 120, J = 1, N
+               TEMP1   = ALPHA*X( JX )
+               TEMP2   = ZERO
+               Y( JY ) = Y( JY ) + TEMP1*REAL( AP( KK ) )
+               IX      = JX
+               IY      = JY
+               DO 110, K = KK + 1, KK + N - J
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+                  Y( IY ) = Y( IY ) + TEMP1*AP( K )
+                  TEMP2   = TEMP2   + CONJG( AP( K ) )*X( IX )
+  110          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+               KK      = KK      + ( N - J + 1 )
+  120       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CHPMV .
+*
+      END
+      SUBROUTINE CHPR2 ( UPLO, N, ALPHA, X, INCX, Y, INCY, AP )
+*     .. Scalar Arguments ..
+      COMPLEX            ALPHA
+      INTEGER            INCX, INCY, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      COMPLEX            AP( * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CHPR2  performs the hermitian rank 2 operation
+*
+*     A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A,
+*
+*  where alpha is a scalar, x and y are n element vectors and A is an
+*  n by n hermitian matrix, supplied in packed form.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the matrix A is supplied in the packed
+*           array AP as follows:
+*
+*              UPLO = 'U' or 'u'   The upper triangular part of A is
+*                                  supplied in AP.
+*
+*              UPLO = 'L' or 'l'   The lower triangular part of A is
+*                                  supplied in AP.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX         .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y.
+*           Unchanged on exit.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*  AP     - COMPLEX          array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with  UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular part of the hermitian matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
+*           and a( 2, 2 ) respectively, and so on. On exit, the array
+*           AP is overwritten by the upper triangular part of the
+*           updated matrix.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular part of the hermitian matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
+*           and a( 3, 1 ) respectively, and so on. On exit, the array
+*           AP is overwritten by the lower triangular part of the
+*           updated matrix.
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set, they are assumed to be zero, and on exit they
+*           are set to zero.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     .. Local Scalars ..
+      COMPLEX            TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, REAL
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 7
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CHPR2 ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Set up the start points in X and Y if the increments are not both
+*     unity.
+*
+      IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN
+         IF( INCX.GT.0 )THEN
+            KX = 1
+         ELSE
+            KX = 1 - ( N - 1 )*INCX
+         END IF
+         IF( INCY.GT.0 )THEN
+            KY = 1
+         ELSE
+            KY = 1 - ( N - 1 )*INCY
+         END IF
+         JX = KX
+         JY = KY
+      END IF
+*
+*     Start the operations. In this version the elements of the array AP
+*     are accessed sequentially with one pass through AP.
+*
+      KK = 1
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  A  when upper triangle is stored in AP.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 20, J = 1, N
+               IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*CONJG( Y( J ) )
+                  TEMP2 = CONJG( ALPHA*X( J ) )
+                  K     = KK
+                  DO 10, I = 1, J - 1
+                     AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2
+                     K       = K       + 1
+   10             CONTINUE
+                  AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) +
+     $                               REAL( X( J )*TEMP1 + Y( J )*TEMP2 )
+               ELSE
+                  AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) )
+               END IF
+               KK = KK + J
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*CONJG( Y( JY ) )
+                  TEMP2 = CONJG( ALPHA*X( JX ) )
+                  IX    = KX
+                  IY    = KY
+                  DO 30, K = KK, KK + J - 2
+                     AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2
+                     IX      = IX      + INCX
+                     IY      = IY      + INCY
+   30             CONTINUE
+                  AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) +
+     $                               REAL( X( JX )*TEMP1 +
+     $                                     Y( JY )*TEMP2 )
+               ELSE
+                  AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) )
+               END IF
+               JX = JX + INCX
+               JY = JY + INCY
+               KK = KK + J
+   40       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  A  when lower triangle is stored in AP.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN
+                  TEMP1   = ALPHA*CONJG( Y( J ) )
+                  TEMP2   = CONJG( ALPHA*X( J ) )
+                  AP( KK ) = REAL( AP( KK ) ) +
+     $                       REAL( X( J )*TEMP1 + Y( J )*TEMP2 )
+                  K        = KK               + 1
+                  DO 50, I = J + 1, N
+                     AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2
+                     K       = K       + 1
+   50             CONTINUE
+               ELSE
+                  AP( KK ) = REAL( AP( KK ) )
+               END IF
+               KK = KK + N - J + 1
+   60       CONTINUE
+         ELSE
+            DO 80, J = 1, N
+               IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN
+                  TEMP1    = ALPHA*CONJG( Y( JY ) )
+                  TEMP2    = CONJG( ALPHA*X( JX ) )
+                  AP( KK ) = REAL( AP( KK ) ) +
+     $                       REAL( X( JX )*TEMP1 + Y( JY )*TEMP2 )
+                  IX       = JX
+                  IY       = JY
+                  DO 70, K = KK + 1, KK + N - J
+                     IX      = IX      + INCX
+                     IY      = IY      + INCY
+                     AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2
+   70             CONTINUE
+               ELSE
+                  AP( KK ) = REAL( AP( KK ) )
+               END IF
+               JX = JX + INCX
+               JY = JY + INCY
+               KK = KK + N - J + 1
+   80       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CHPR2 .
+*
+      END
+      SUBROUTINE CHPR  ( UPLO, N, ALPHA, X, INCX, AP )
+*     .. Scalar Arguments ..
+      REAL               ALPHA
+      INTEGER            INCX, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      COMPLEX            AP( * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CHPR    performs the hermitian rank 1 operation
+*
+*     A := alpha*x*conjg( x' ) + A,
+*
+*  where alpha is a real scalar, x is an n element vector and A is an
+*  n by n hermitian matrix, supplied in packed form.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the matrix A is supplied in the packed
+*           array AP as follows:
+*
+*              UPLO = 'U' or 'u'   The upper triangular part of A is
+*                                  supplied in AP.
+*
+*              UPLO = 'L' or 'l'   The lower triangular part of A is
+*                                  supplied in AP.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  AP     - COMPLEX          array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with  UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular part of the hermitian matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
+*           and a( 2, 2 ) respectively, and so on. On exit, the array
+*           AP is overwritten by the upper triangular part of the
+*           updated matrix.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular part of the hermitian matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
+*           and a( 3, 1 ) respectively, and so on. On exit, the array
+*           AP is overwritten by the lower triangular part of the
+*           updated matrix.
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set, they are assumed to be zero, and on exit they
+*           are set to zero.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     .. Local Scalars ..
+      COMPLEX            TEMP
+      INTEGER            I, INFO, IX, J, JX, K, KK, KX
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, REAL
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CHPR  ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ALPHA.EQ.REAL( ZERO ) ) )
+     $   RETURN
+*
+*     Set the start point in X if the increment is not unity.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of the array AP
+*     are accessed sequentially with one pass through AP.
+*
+      KK = 1
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  A  when upper triangle is stored in AP.
+*
+         IF( INCX.EQ.1 )THEN
+            DO 20, J = 1, N
+               IF( X( J ).NE.ZERO )THEN
+                  TEMP = ALPHA*CONJG( X( J ) )
+                  K    = KK
+                  DO 10, I = 1, J - 1
+                     AP( K ) = AP( K ) + X( I )*TEMP
+                     K       = K       + 1
+   10             CONTINUE
+                  AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) )
+     $                               + REAL( X( J )*TEMP )
+               ELSE
+                  AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) )
+               END IF
+               KK = KK + J
+   20       CONTINUE
+         ELSE
+            JX = KX
+            DO 40, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*CONJG( X( JX ) )
+                  IX   = KX
+                  DO 30, K = KK, KK + J - 2
+                     AP( K ) = AP( K ) + X( IX )*TEMP
+                     IX      = IX      + INCX
+   30             CONTINUE
+                  AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) )
+     $                               + REAL( X( JX )*TEMP )
+               ELSE
+                  AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) )
+               END IF
+               JX = JX + INCX
+               KK = KK + J
+   40       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  A  when lower triangle is stored in AP.
+*
+         IF( INCX.EQ.1 )THEN
+            DO 60, J = 1, N
+               IF( X( J ).NE.ZERO )THEN
+                  TEMP     = ALPHA*CONJG( X( J ) )
+                  AP( KK ) = REAL( AP( KK ) ) + REAL( TEMP*X( J ) )
+                  K        = KK               + 1
+                  DO 50, I = J + 1, N
+                     AP( K ) = AP( K ) + X( I )*TEMP
+                     K       = K       + 1
+   50             CONTINUE
+               ELSE
+                  AP( KK ) = REAL( AP( KK ) )
+               END IF
+               KK = KK + N - J + 1
+   60       CONTINUE
+         ELSE
+            JX = KX
+            DO 80, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP    = ALPHA*CONJG( X( JX ) )
+                  AP( KK ) = REAL( AP( KK ) ) + REAL( TEMP*X( JX ) )
+                  IX      = JX
+                  DO 70, K = KK + 1, KK + N - J
+                     IX      = IX      + INCX
+                     AP( K ) = AP( K ) + X( IX )*TEMP
+   70             CONTINUE
+               ELSE
+                  AP( KK ) = REAL( AP( KK ) )
+               END IF
+               JX = JX + INCX
+               KK = KK + N - J + 1
+   80       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CHPR  .
+*
+      END
+      subroutine crotg(ca,cb,c,s)
+      complex ca,cb,s
+      real c
+      real norm,scale
+      complex alpha
+      if (cabs(ca) .ne. 0.) go to 10
+         c = 0.
+         s = (1.,0.)
+         ca = cb
+         go to 20
+   10 continue
+         scale = cabs(ca) + cabs(cb)
+         norm = scale * sqrt((cabs(ca/scale))**2 + (cabs(cb/scale))**2)
+         alpha = ca /cabs(ca)
+         c = cabs(ca) / norm
+         s = alpha * conjg(cb) / norm
+         ca = alpha * norm
+   20 continue
+      return
+      end
+      subroutine  cscal(n,ca,cx,incx)
+c
+c     scales a vector by a constant.
+c     jack dongarra, linpack,  3/11/78.
+c     modified 3/93 to return if incx .le. 0.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      complex ca,cx(*)
+      integer i,incx,n,nincx
+c
+      if( n.le.0 .or. incx.le.0 )return
+      if(incx.eq.1)go to 20
+c
+c        code for increment not equal to 1
+c
+      nincx = n*incx
+      do 10 i = 1,nincx,incx
+        cx(i) = ca*cx(i)
+   10 continue
+      return
+c
+c        code for increment equal to 1
+c
+   20 do 30 i = 1,n
+        cx(i) = ca*cx(i)
+   30 continue
+      return
+      end
+      subroutine  csrot (n,cx,incx,cy,incy,c,s)
+c
+c     applies a plane rotation, where the cos and sin (c and s) are real
+c     and the vectors cx and cy are complex.
+c     jack dongarra, linpack, 3/11/78.
+c
+      complex cx(1),cy(1),ctemp
+      real c,s
+      integer i,incx,incy,ix,iy,n
+c
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c       code for unequal increments or equal increments not equal
+c         to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        ctemp = c*cx(ix) + s*cy(iy)
+        cy(iy) = c*cy(iy) - s*cx(ix)
+        cx(ix) = ctemp
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      return
+c
+c       code for both increments equal to 1
+c
+   20 do 30 i = 1,n
+        ctemp = c*cx(i) + s*cy(i)
+        cy(i) = c*cy(i) - s*cx(i)
+        cx(i) = ctemp
+   30 continue
+      return
+      end
+      subroutine  csscal(n,sa,cx,incx)
+c
+c     scales a complex vector by a real constant.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 3/93 to return if incx .le. 0.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      complex cx(*)
+      real sa
+      integer i,incx,n,nincx
+c
+      if( n.le.0 .or. incx.le.0 )return
+      if(incx.eq.1)go to 20
+c
+c        code for increment not equal to 1
+c
+      nincx = n*incx
+      do 10 i = 1,nincx,incx
+        cx(i) = cmplx(sa*real(cx(i)),sa*aimag(cx(i)))
+   10 continue
+      return
+c
+c        code for increment equal to 1
+c
+   20 do 30 i = 1,n
+        cx(i) = cmplx(sa*real(cx(i)),sa*aimag(cx(i)))
+   30 continue
+      return
+      end
+      subroutine  cswap (n,cx,incx,cy,incy)
+c
+c     interchanges two vectors.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      complex cx(*),cy(*),ctemp
+      integer i,incx,incy,ix,iy,n
+c
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c       code for unequal increments or equal increments not equal
+c         to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        ctemp = cx(ix)
+        cx(ix) = cy(iy)
+        cy(iy) = ctemp
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      return
+c
+c       code for both increments equal to 1
+   20 do 30 i = 1,n
+        ctemp = cx(i)
+        cx(i) = cy(i)
+        cy(i) = ctemp
+   30 continue
+      return
+      end
+      SUBROUTINE CSYMM ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        SIDE, UPLO
+      INTEGER            M, N, LDA, LDB, LDC
+      COMPLEX            ALPHA, BETA
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), B( LDB, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CSYMM  performs one of the matrix-matrix operations
+*
+*     C := alpha*A*B + beta*C,
+*
+*  or
+*
+*     C := alpha*B*A + beta*C,
+*
+*  where  alpha and beta are scalars, A is a symmetric matrix and  B and
+*  C are m by n matrices.
+*
+*  Parameters
+*  ==========
+*
+*  SIDE   - CHARACTER*1.
+*           On entry,  SIDE  specifies whether  the  symmetric matrix  A
+*           appears on the  left or right  in the  operation as follows:
+*
+*              SIDE = 'L' or 'l'   C := alpha*A*B + beta*C,
+*
+*              SIDE = 'R' or 'r'   C := alpha*B*A + beta*C,
+*
+*           Unchanged on exit.
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of  the  symmetric  matrix   A  is  to  be
+*           referenced as follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of the
+*                                  symmetric matrix is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of the
+*                                  symmetric matrix is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry,  M  specifies the number of rows of the matrix  C.
+*           M  must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix C.
+*           N  must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX         .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, ka ), where ka is
+*           m  when  SIDE = 'L' or 'l'  and is n  otherwise.
+*           Before entry  with  SIDE = 'L' or 'l',  the  m by m  part of
+*           the array  A  must contain the  symmetric matrix,  such that
+*           when  UPLO = 'U' or 'u', the leading m by m upper triangular
+*           part of the array  A  must contain the upper triangular part
+*           of the  symmetric matrix and the  strictly  lower triangular
+*           part of  A  is not referenced,  and when  UPLO = 'L' or 'l',
+*           the leading  m by m  lower triangular part  of the  array  A
+*           must  contain  the  lower triangular part  of the  symmetric
+*           matrix and the  strictly upper triangular part of  A  is not
+*           referenced.
+*           Before entry  with  SIDE = 'R' or 'r',  the  n by n  part of
+*           the array  A  must contain the  symmetric matrix,  such that
+*           when  UPLO = 'U' or 'u', the leading n by n upper triangular
+*           part of the array  A  must contain the upper triangular part
+*           of the  symmetric matrix and the  strictly  lower triangular
+*           part of  A  is not referenced,  and when  UPLO = 'L' or 'l',
+*           the leading  n by n  lower triangular part  of the  array  A
+*           must  contain  the  lower triangular part  of the  symmetric
+*           matrix and the  strictly upper triangular part of  A  is not
+*           referenced.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the  calling (sub) program. When  SIDE = 'L' or 'l'  then
+*           LDA must be at least  max( 1, m ), otherwise  LDA must be at
+*           least max( 1, n ).
+*           Unchanged on exit.
+*
+*  B      - COMPLEX          array of DIMENSION ( LDB, n ).
+*           Before entry, the leading  m by n part of the array  B  must
+*           contain the matrix B.
+*           Unchanged on exit.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   LDB  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX         .
+*           On entry,  BETA  specifies the scalar  beta.  When  BETA  is
+*           supplied as zero then C need not be set on input.
+*           Unchanged on exit.
+*
+*  C      - COMPLEX          array of DIMENSION ( LDC, n ).
+*           Before entry, the leading  m by n  part of the array  C must
+*           contain the matrix  C,  except when  beta  is zero, in which
+*           case C need not be set on entry.
+*           On exit, the array  C  is overwritten by the  m by n updated
+*           matrix.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, K, NROWA
+      COMPLEX            TEMP1, TEMP2
+*     .. Parameters ..
+      COMPLEX            ONE
+      PARAMETER        ( ONE  = ( 1.0E+0, 0.0E+0 ) )
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Set NROWA as the number of rows of A.
+*
+      IF( LSAME( SIDE, 'L' ) )THEN
+         NROWA = M
+      ELSE
+         NROWA = N
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF(      ( .NOT.LSAME( SIDE, 'L' ) ).AND.
+     $         ( .NOT.LSAME( SIDE, 'R' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.UPPER              ).AND.
+     $         ( .NOT.LSAME( UPLO, 'L' ) )      )THEN
+         INFO = 2
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 7
+      ELSE IF( LDB.LT.MAX( 1, M     ) )THEN
+         INFO = 9
+      ELSE IF( LDC.LT.MAX( 1, M     ) )THEN
+         INFO = 12
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CSYMM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( BETA.EQ.ZERO )THEN
+            DO 20, J = 1, N
+               DO 10, I = 1, M
+                  C( I, J ) = ZERO
+   10          CONTINUE
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               DO 30, I = 1, M
+                  C( I, J ) = BETA*C( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( SIDE, 'L' ) )THEN
+*
+*        Form  C := alpha*A*B + beta*C.
+*
+         IF( UPPER )THEN
+            DO 70, J = 1, N
+               DO 60, I = 1, M
+                  TEMP1 = ALPHA*B( I, J )
+                  TEMP2 = ZERO
+                  DO 50, K = 1, I - 1
+                     C( K, J ) = C( K, J ) + TEMP1    *A( K, I )
+                     TEMP2     = TEMP2     + B( K, J )*A( K, I )
+   50             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J ) +
+     $                           TEMP1*A( I, I ) + ALPHA*TEMP2
+                  END IF
+   60          CONTINUE
+   70       CONTINUE
+         ELSE
+            DO 100, J = 1, N
+               DO 90, I = M, 1, -1
+                  TEMP1 = ALPHA*B( I, J )
+                  TEMP2 = ZERO
+                  DO 80, K = I + 1, M
+                     C( K, J ) = C( K, J ) + TEMP1    *A( K, I )
+                     TEMP2     = TEMP2     + B( K, J )*A( K, I )
+   80             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J ) +
+     $                           TEMP1*A( I, I ) + ALPHA*TEMP2
+                  END IF
+   90          CONTINUE
+  100       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*B*A + beta*C.
+*
+         DO 170, J = 1, N
+            TEMP1 = ALPHA*A( J, J )
+            IF( BETA.EQ.ZERO )THEN
+               DO 110, I = 1, M
+                  C( I, J ) = TEMP1*B( I, J )
+  110          CONTINUE
+            ELSE
+               DO 120, I = 1, M
+                  C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J )
+  120          CONTINUE
+            END IF
+            DO 140, K = 1, J - 1
+               IF( UPPER )THEN
+                  TEMP1 = ALPHA*A( K, J )
+               ELSE
+                  TEMP1 = ALPHA*A( J, K )
+               END IF
+               DO 130, I = 1, M
+                  C( I, J ) = C( I, J ) + TEMP1*B( I, K )
+  130          CONTINUE
+  140       CONTINUE
+            DO 160, K = J + 1, N
+               IF( UPPER )THEN
+                  TEMP1 = ALPHA*A( J, K )
+               ELSE
+                  TEMP1 = ALPHA*A( K, J )
+               END IF
+               DO 150, I = 1, M
+                  C( I, J ) = C( I, J ) + TEMP1*B( I, K )
+  150          CONTINUE
+  160       CONTINUE
+  170    CONTINUE
+      END IF
+*
+      RETURN
+*
+*     End of CSYMM .
+*
+      END
+      SUBROUTINE CSYR2K( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        UPLO, TRANS
+      INTEGER            N, K, LDA, LDB, LDC
+      COMPLEX            ALPHA, BETA
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), B( LDB, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CSYR2K  performs one of the symmetric rank 2k operations
+*
+*     C := alpha*A*B' + alpha*B*A' + beta*C,
+*
+*  or
+*
+*     C := alpha*A'*B + alpha*B'*A + beta*C,
+*
+*  where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
+*  and  A and B  are  n by k  matrices  in the  first  case  and  k by n
+*  matrices in the second case.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of the  array  C  is to be  referenced  as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the  upper triangular part of  C
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the  lower triangular part of  C
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry,  TRANS  specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'    C := alpha*A*B' + alpha*B*A' +
+*                                         beta*C.
+*
+*              TRANS = 'T' or 't'    C := alpha*A'*B + alpha*B'*A +
+*                                         beta*C.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry,  N specifies the order of the matrix C.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with  TRANS = 'N' or 'n',  K  specifies  the number
+*           of  columns  of the  matrices  A and B,  and on  entry  with
+*           TRANS = 'T' or 't',  K  specifies  the number of rows of the
+*           matrices  A and B.  K must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX         .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, ka ), where ka is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  A  must contain the matrix  A,  otherwise
+*           the leading  k by n  part of the array  A  must contain  the
+*           matrix A.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDA must be at least  max( 1, n ), otherwise  LDA must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  B      - COMPLEX          array of DIMENSION ( LDB, kb ), where kb is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  B  must contain the matrix  B,  otherwise
+*           the leading  k by n  part of the array  B  must contain  the
+*           matrix B.
+*           Unchanged on exit.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDB must be at least  max( 1, n ), otherwise  LDB must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX         .
+*           On entry, BETA specifies the scalar beta.
+*           Unchanged on exit.
+*
+*  C      - COMPLEX          array of DIMENSION ( LDC, n ).
+*           Before entry  with  UPLO = 'U' or 'u',  the leading  n by n
+*           upper triangular part of the array C must contain the upper
+*           triangular part  of the  symmetric matrix  and the strictly
+*           lower triangular part of C is not referenced.  On exit, the
+*           upper triangular part of the array  C is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry  with  UPLO = 'L' or 'l',  the leading  n by n
+*           lower triangular part of the array C must contain the lower
+*           triangular part  of the  symmetric matrix  and the strictly
+*           upper triangular part of C is not referenced.  On exit, the
+*           lower triangular part of the array  C is overwritten by the
+*           lower triangular part of the updated matrix.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, L, NROWA
+      COMPLEX            TEMP1, TEMP2
+*     .. Parameters ..
+      COMPLEX            ONE
+      PARAMETER        ( ONE  = ( 1.0E+0, 0.0E+0 ) )
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         NROWA = N
+      ELSE
+         NROWA = K
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+      INFO = 0
+      IF(      ( .NOT.UPPER               ).AND.
+     $         ( .NOT.LSAME( UPLO , 'L' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANS, 'T' ) )      )THEN
+         INFO = 2
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( K  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 7
+      ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN
+         INFO = 9
+      ELSE IF( LDC.LT.MAX( 1, N     ) )THEN
+         INFO = 12
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CSYR2K', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.
+     $    ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( UPPER )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 20, J = 1, N
+                  DO 10, I = 1, J
+                     C( I, J ) = ZERO
+   10             CONTINUE
+   20          CONTINUE
+            ELSE
+               DO 40, J = 1, N
+                  DO 30, I = 1, J
+                     C( I, J ) = BETA*C( I, J )
+   30             CONTINUE
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( BETA.EQ.ZERO )THEN
+               DO 60, J = 1, N
+                  DO 50, I = J, N
+                     C( I, J ) = ZERO
+   50             CONTINUE
+   60          CONTINUE
+            ELSE
+               DO 80, J = 1, N
+                  DO 70, I = J, N
+                     C( I, J ) = BETA*C( I, J )
+   70             CONTINUE
+   80          CONTINUE
+            END IF
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  C := alpha*A*B' + alpha*B*A' + C.
+*
+         IF( UPPER )THEN
+            DO 130, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 90, I = 1, J
+                     C( I, J ) = ZERO
+   90             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 100, I = 1, J
+                     C( I, J ) = BETA*C( I, J )
+  100             CONTINUE
+               END IF
+               DO 120, L = 1, K
+                  IF( ( A( J, L ).NE.ZERO ).OR.
+     $                ( B( J, L ).NE.ZERO )     )THEN
+                     TEMP1 = ALPHA*B( J, L )
+                     TEMP2 = ALPHA*A( J, L )
+                     DO 110, I = 1, J
+                        C( I, J ) = C( I, J ) + A( I, L )*TEMP1 +
+     $                                          B( I, L )*TEMP2
+  110                CONTINUE
+                  END IF
+  120          CONTINUE
+  130       CONTINUE
+         ELSE
+            DO 180, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 140, I = J, N
+                     C( I, J ) = ZERO
+  140             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 150, I = J, N
+                     C( I, J ) = BETA*C( I, J )
+  150             CONTINUE
+               END IF
+               DO 170, L = 1, K
+                  IF( ( A( J, L ).NE.ZERO ).OR.
+     $                ( B( J, L ).NE.ZERO )     )THEN
+                     TEMP1 = ALPHA*B( J, L )
+                     TEMP2 = ALPHA*A( J, L )
+                     DO 160, I = J, N
+                        C( I, J ) = C( I, J ) + A( I, L )*TEMP1 +
+     $                                          B( I, L )*TEMP2
+  160                CONTINUE
+                  END IF
+  170          CONTINUE
+  180       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*A'*B + alpha*B'*A + C.
+*
+         IF( UPPER )THEN
+            DO 210, J = 1, N
+               DO 200, I = 1, J
+                  TEMP1 = ZERO
+                  TEMP2 = ZERO
+                  DO 190, L = 1, K
+                     TEMP1 = TEMP1 + A( L, I )*B( L, J )
+                     TEMP2 = TEMP2 + B( L, I )*A( L, J )
+  190             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J ) +
+     $                           ALPHA*TEMP1 + ALPHA*TEMP2
+                  END IF
+  200          CONTINUE
+  210       CONTINUE
+         ELSE
+            DO 240, J = 1, N
+               DO 230, I = J, N
+                  TEMP1 = ZERO
+                  TEMP2 = ZERO
+                  DO 220, L = 1, K
+                     TEMP1 = TEMP1 + A( L, I )*B( L, J )
+                     TEMP2 = TEMP2 + B( L, I )*A( L, J )
+  220             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J ) +
+     $                           ALPHA*TEMP1 + ALPHA*TEMP2
+                  END IF
+  230          CONTINUE
+  240       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CSYR2K.
+*
+      END
+      SUBROUTINE CSYRK ( UPLO, TRANS, N, K, ALPHA, A, LDA,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        UPLO, TRANS
+      INTEGER            N, K, LDA, LDC
+      COMPLEX            ALPHA, BETA
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CSYRK  performs one of the symmetric rank k operations
+*
+*     C := alpha*A*A' + beta*C,
+*
+*  or
+*
+*     C := alpha*A'*A + beta*C,
+*
+*  where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
+*  and  A  is an  n by k  matrix in the first case and a  k by n  matrix
+*  in the second case.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of the  array  C  is to be  referenced  as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the  upper triangular part of  C
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the  lower triangular part of  C
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry,  TRANS  specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   C := alpha*A*A' + beta*C.
+*
+*              TRANS = 'T' or 't'   C := alpha*A'*A + beta*C.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry,  N specifies the order of the matrix C.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with  TRANS = 'N' or 'n',  K  specifies  the number
+*           of  columns   of  the   matrix   A,   and  on   entry   with
+*           TRANS = 'T' or 't',  K  specifies  the number of rows of the
+*           matrix A.  K must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX         .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, ka ), where ka is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  A  must contain the matrix  A,  otherwise
+*           the leading  k by n  part of the array  A  must contain  the
+*           matrix A.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDA must be at least  max( 1, n ), otherwise  LDA must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX         .
+*           On entry, BETA specifies the scalar beta.
+*           Unchanged on exit.
+*
+*  C      - COMPLEX          array of DIMENSION ( LDC, n ).
+*           Before entry  with  UPLO = 'U' or 'u',  the leading  n by n
+*           upper triangular part of the array C must contain the upper
+*           triangular part  of the  symmetric matrix  and the strictly
+*           lower triangular part of C is not referenced.  On exit, the
+*           upper triangular part of the array  C is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry  with  UPLO = 'L' or 'l',  the leading  n by n
+*           lower triangular part of the array C must contain the lower
+*           triangular part  of the  symmetric matrix  and the strictly
+*           upper triangular part of C is not referenced.  On exit, the
+*           lower triangular part of the array  C is overwritten by the
+*           lower triangular part of the updated matrix.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, L, NROWA
+      COMPLEX            TEMP
+*     .. Parameters ..
+      COMPLEX            ONE
+      PARAMETER        ( ONE  = ( 1.0E+0, 0.0E+0 ) )
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         NROWA = N
+      ELSE
+         NROWA = K
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+      INFO = 0
+      IF(      ( .NOT.UPPER               ).AND.
+     $         ( .NOT.LSAME( UPLO , 'L' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANS, 'T' ) )      )THEN
+         INFO = 2
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( K  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 7
+      ELSE IF( LDC.LT.MAX( 1, N     ) )THEN
+         INFO = 10
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CSYRK ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.
+     $    ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( UPPER )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 20, J = 1, N
+                  DO 10, I = 1, J
+                     C( I, J ) = ZERO
+   10             CONTINUE
+   20          CONTINUE
+            ELSE
+               DO 40, J = 1, N
+                  DO 30, I = 1, J
+                     C( I, J ) = BETA*C( I, J )
+   30             CONTINUE
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( BETA.EQ.ZERO )THEN
+               DO 60, J = 1, N
+                  DO 50, I = J, N
+                     C( I, J ) = ZERO
+   50             CONTINUE
+   60          CONTINUE
+            ELSE
+               DO 80, J = 1, N
+                  DO 70, I = J, N
+                     C( I, J ) = BETA*C( I, J )
+   70             CONTINUE
+   80          CONTINUE
+            END IF
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  C := alpha*A*A' + beta*C.
+*
+         IF( UPPER )THEN
+            DO 130, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 90, I = 1, J
+                     C( I, J ) = ZERO
+   90             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 100, I = 1, J
+                     C( I, J ) = BETA*C( I, J )
+  100             CONTINUE
+               END IF
+               DO 120, L = 1, K
+                  IF( A( J, L ).NE.ZERO )THEN
+                     TEMP = ALPHA*A( J, L )
+                     DO 110, I = 1, J
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  110                CONTINUE
+                  END IF
+  120          CONTINUE
+  130       CONTINUE
+         ELSE
+            DO 180, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 140, I = J, N
+                     C( I, J ) = ZERO
+  140             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 150, I = J, N
+                     C( I, J ) = BETA*C( I, J )
+  150             CONTINUE
+               END IF
+               DO 170, L = 1, K
+                  IF( A( J, L ).NE.ZERO )THEN
+                     TEMP      = ALPHA*A( J, L )
+                     DO 160, I = J, N
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  160                CONTINUE
+                  END IF
+  170          CONTINUE
+  180       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*A'*A + beta*C.
+*
+         IF( UPPER )THEN
+            DO 210, J = 1, N
+               DO 200, I = 1, J
+                  TEMP = ZERO
+                  DO 190, L = 1, K
+                     TEMP = TEMP + A( L, I )*A( L, J )
+  190             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  200          CONTINUE
+  210       CONTINUE
+         ELSE
+            DO 240, J = 1, N
+               DO 230, I = J, N
+                  TEMP = ZERO
+                  DO 220, L = 1, K
+                     TEMP = TEMP + A( L, I )*A( L, J )
+  220             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  230          CONTINUE
+  240       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CSYRK .
+*
+      END
+      SUBROUTINE CTBMV ( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, K, LDA, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CTBMV  performs one of the matrix-vector operations
+*
+*     x := A*x,   or   x := A'*x,   or   x := conjg( A' )*x,
+*
+*  where x is an n element vector and  A is an n by n unit, or non-unit,
+*  upper or lower triangular band matrix, with ( k + 1 ) diagonals.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   x := A*x.
+*
+*              TRANS = 'T' or 't'   x := A'*x.
+*
+*              TRANS = 'C' or 'c'   x := conjg( A' )*x.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with UPLO = 'U' or 'u', K specifies the number of
+*           super-diagonals of the matrix A.
+*           On entry with UPLO = 'L' or 'l', K specifies the number of
+*           sub-diagonals of the matrix A.
+*           K must satisfy  0 .le. K.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, n ).
+*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
+*           by n part of the array A must contain the upper triangular
+*           band part of the matrix of coefficients, supplied column by
+*           column, with the leading diagonal of the matrix in row
+*           ( k + 1 ) of the array, the first super-diagonal starting at
+*           position 2 in row k, and so on. The top left k by k triangle
+*           of the array A is not referenced.
+*           The following program segment will transfer an upper
+*           triangular band matrix from conventional full matrix storage
+*           to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = K + 1 - J
+*                    DO 10, I = MAX( 1, J - K ), J
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
+*           by n part of the array A must contain the lower triangular
+*           band part of the matrix of coefficients, supplied column by
+*           column, with the leading diagonal of the matrix in row 1 of
+*           the array, the first sub-diagonal starting at position 1 in
+*           row 2, and so on. The bottom right k by k triangle of the
+*           array A is not referenced.
+*           The following program segment will transfer a lower
+*           triangular band matrix from conventional full matrix storage
+*           to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = 1 - J
+*                    DO 10, I = J, MIN( N, J + K )
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Note that when DIAG = 'U' or 'u' the elements of the array A
+*           corresponding to the diagonal elements of the matrix are not
+*           referenced, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           ( k + 1 ).
+*           Unchanged on exit.
+*
+*  X      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x. On exit, X is overwritten with the
+*           tranformed vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     .. Local Scalars ..
+      COMPLEX            TEMP
+      INTEGER            I, INFO, IX, J, JX, KPLUS1, KX, L
+      LOGICAL            NOCONJ, NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( K.LT.0 )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.( K + 1 ) )THEN
+         INFO = 7
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CTBMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOCONJ = LSAME( TRANS, 'T' )
+      NOUNIT = LSAME( DIAG , 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX   too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*         Form  x := A*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KPLUS1 = K + 1
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     L    = KPLUS1 - J
+                     DO 10, I = MAX( 1, J - K ), J - 1
+                        X( I ) = X( I ) + TEMP*A( L + I, J )
+   10                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*A( KPLUS1, J )
+                  END IF
+   20          CONTINUE
+            ELSE
+               JX = KX
+               DO 40, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     L    = KPLUS1  - J
+                     DO 30, I = MAX( 1, J - K ), J - 1
+                        X( IX ) = X( IX ) + TEMP*A( L + I, J )
+                        IX      = IX      + INCX
+   30                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*A( KPLUS1, J )
+                  END IF
+                  JX = JX + INCX
+                  IF( J.GT.K )
+     $               KX = KX + INCX
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     L    = 1      - J
+                     DO 50, I = MIN( N, J + K ), J + 1, -1
+                        X( I ) = X( I ) + TEMP*A( L + I, J )
+   50                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*A( 1, J )
+                  END IF
+   60          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 80, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     L    = 1       - J
+                     DO 70, I = MIN( N, J + K ), J + 1, -1
+                        X( IX ) = X( IX ) + TEMP*A( L + I, J )
+                        IX      = IX      - INCX
+   70                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*A( 1, J )
+                  END IF
+                  JX = JX - INCX
+                  IF( ( N - J ).GE.K )
+     $               KX = KX - INCX
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := A'*x  or  x := conjg( A' )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KPLUS1 = K + 1
+            IF( INCX.EQ.1 )THEN
+               DO 110, J = N, 1, -1
+                  TEMP = X( J )
+                  L    = KPLUS1 - J
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( KPLUS1, J )
+                     DO 90, I = J - 1, MAX( 1, J - K ), -1
+                        TEMP = TEMP + A( L + I, J )*X( I )
+   90                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*CONJG( A( KPLUS1, J ) )
+                     DO 100, I = J - 1, MAX( 1, J - K ), -1
+                        TEMP = TEMP + CONJG( A( L + I, J ) )*X( I )
+  100                CONTINUE
+                  END IF
+                  X( J ) = TEMP
+  110          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 140, J = N, 1, -1
+                  TEMP = X( JX )
+                  KX   = KX      - INCX
+                  IX   = KX
+                  L    = KPLUS1  - J
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( KPLUS1, J )
+                     DO 120, I = J - 1, MAX( 1, J - K ), -1
+                        TEMP = TEMP + A( L + I, J )*X( IX )
+                        IX   = IX   - INCX
+  120                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*CONJG( A( KPLUS1, J ) )
+                     DO 130, I = J - 1, MAX( 1, J - K ), -1
+                        TEMP = TEMP + CONJG( A( L + I, J ) )*X( IX )
+                        IX   = IX   - INCX
+  130                CONTINUE
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+  140          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 170, J = 1, N
+                  TEMP = X( J )
+                  L    = 1      - J
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( 1, J )
+                     DO 150, I = J + 1, MIN( N, J + K )
+                        TEMP = TEMP + A( L + I, J )*X( I )
+  150                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*CONJG( A( 1, J ) )
+                     DO 160, I = J + 1, MIN( N, J + K )
+                        TEMP = TEMP + CONJG( A( L + I, J ) )*X( I )
+  160                CONTINUE
+                  END IF
+                  X( J ) = TEMP
+  170          CONTINUE
+            ELSE
+               JX = KX
+               DO 200, J = 1, N
+                  TEMP = X( JX )
+                  KX   = KX      + INCX
+                  IX   = KX
+                  L    = 1       - J
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( 1, J )
+                     DO 180, I = J + 1, MIN( N, J + K )
+                        TEMP = TEMP + A( L + I, J )*X( IX )
+                        IX   = IX   + INCX
+  180                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*CONJG( A( 1, J ) )
+                     DO 190, I = J + 1, MIN( N, J + K )
+                        TEMP = TEMP + CONJG( A( L + I, J ) )*X( IX )
+                        IX   = IX   + INCX
+  190                CONTINUE
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+  200          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CTBMV .
+*
+      END
+      SUBROUTINE CTBSV ( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, K, LDA, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CTBSV  solves one of the systems of equations
+*
+*     A*x = b,   or   A'*x = b,   or   conjg( A' )*x = b,
+*
+*  where b and x are n element vectors and A is an n by n unit, or
+*  non-unit, upper or lower triangular band matrix, with ( k + 1 )
+*  diagonals.
+*
+*  No test for singularity or near-singularity is included in this
+*  routine. Such tests must be performed before calling this routine.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the equations to be solved as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   A*x = b.
+*
+*              TRANS = 'T' or 't'   A'*x = b.
+*
+*              TRANS = 'C' or 'c'   conjg( A' )*x = b.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with UPLO = 'U' or 'u', K specifies the number of
+*           super-diagonals of the matrix A.
+*           On entry with UPLO = 'L' or 'l', K specifies the number of
+*           sub-diagonals of the matrix A.
+*           K must satisfy  0 .le. K.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, n ).
+*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
+*           by n part of the array A must contain the upper triangular
+*           band part of the matrix of coefficients, supplied column by
+*           column, with the leading diagonal of the matrix in row
+*           ( k + 1 ) of the array, the first super-diagonal starting at
+*           position 2 in row k, and so on. The top left k by k triangle
+*           of the array A is not referenced.
+*           The following program segment will transfer an upper
+*           triangular band matrix from conventional full matrix storage
+*           to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = K + 1 - J
+*                    DO 10, I = MAX( 1, J - K ), J
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
+*           by n part of the array A must contain the lower triangular
+*           band part of the matrix of coefficients, supplied column by
+*           column, with the leading diagonal of the matrix in row 1 of
+*           the array, the first sub-diagonal starting at position 1 in
+*           row 2, and so on. The bottom right k by k triangle of the
+*           array A is not referenced.
+*           The following program segment will transfer a lower
+*           triangular band matrix from conventional full matrix storage
+*           to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = 1 - J
+*                    DO 10, I = J, MIN( N, J + K )
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Note that when DIAG = 'U' or 'u' the elements of the array A
+*           corresponding to the diagonal elements of the matrix are not
+*           referenced, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           ( k + 1 ).
+*           Unchanged on exit.
+*
+*  X      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element right-hand side vector b. On exit, X is overwritten
+*           with the solution vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     .. Local Scalars ..
+      COMPLEX            TEMP
+      INTEGER            I, INFO, IX, J, JX, KPLUS1, KX, L
+      LOGICAL            NOCONJ, NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( K.LT.0 )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.( K + 1 ) )THEN
+         INFO = 7
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CTBSV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOCONJ = LSAME( TRANS, 'T' )
+      NOUNIT = LSAME( DIAG , 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed by sequentially with one pass through A.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x := inv( A )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KPLUS1 = K + 1
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     L = KPLUS1 - J
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/A( KPLUS1, J )
+                     TEMP = X( J )
+                     DO 10, I = J - 1, MAX( 1, J - K ), -1
+                        X( I ) = X( I ) - TEMP*A( L + I, J )
+   10                CONTINUE
+                  END IF
+   20          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 40, J = N, 1, -1
+                  KX = KX - INCX
+                  IF( X( JX ).NE.ZERO )THEN
+                     IX = KX
+                     L  = KPLUS1 - J
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/A( KPLUS1, J )
+                     TEMP = X( JX )
+                     DO 30, I = J - 1, MAX( 1, J - K ), -1
+                        X( IX ) = X( IX ) - TEMP*A( L + I, J )
+                        IX      = IX      - INCX
+   30                CONTINUE
+                  END IF
+                  JX = JX - INCX
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     L = 1 - J
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/A( 1, J )
+                     TEMP = X( J )
+                     DO 50, I = J + 1, MIN( N, J + K )
+                        X( I ) = X( I ) - TEMP*A( L + I, J )
+   50                CONTINUE
+                  END IF
+   60          CONTINUE
+            ELSE
+               JX = KX
+               DO 80, J = 1, N
+                  KX = KX + INCX
+                  IF( X( JX ).NE.ZERO )THEN
+                     IX = KX
+                     L  = 1  - J
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/A( 1, J )
+                     TEMP = X( JX )
+                     DO 70, I = J + 1, MIN( N, J + K )
+                        X( IX ) = X( IX ) - TEMP*A( L + I, J )
+                        IX      = IX      + INCX
+   70                CONTINUE
+                  END IF
+                  JX = JX + INCX
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := inv( A' )*x  or  x := inv( conjg( A') )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KPLUS1 = K + 1
+            IF( INCX.EQ.1 )THEN
+               DO 110, J = 1, N
+                  TEMP = X( J )
+                  L    = KPLUS1 - J
+                  IF( NOCONJ )THEN
+                     DO 90, I = MAX( 1, J - K ), J - 1
+                        TEMP = TEMP - A( L + I, J )*X( I )
+   90                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( KPLUS1, J )
+                  ELSE
+                     DO 100, I = MAX( 1, J - K ), J - 1
+                        TEMP = TEMP - CONJG( A( L + I, J ) )*X( I )
+  100                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/CONJG( A( KPLUS1, J ) )
+                  END IF
+                  X( J ) = TEMP
+  110          CONTINUE
+            ELSE
+               JX = KX
+               DO 140, J = 1, N
+                  TEMP = X( JX )
+                  IX   = KX
+                  L    = KPLUS1  - J
+                  IF( NOCONJ )THEN
+                     DO 120, I = MAX( 1, J - K ), J - 1
+                        TEMP = TEMP - A( L + I, J )*X( IX )
+                        IX   = IX   + INCX
+  120                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( KPLUS1, J )
+                  ELSE
+                     DO 130, I = MAX( 1, J - K ), J - 1
+                        TEMP = TEMP - CONJG( A( L + I, J ) )*X( IX )
+                        IX   = IX   + INCX
+  130                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/CONJG( A( KPLUS1, J ) )
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+                  IF( J.GT.K )
+     $               KX = KX + INCX
+  140          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 170, J = N, 1, -1
+                  TEMP = X( J )
+                  L    = 1      - J
+                  IF( NOCONJ )THEN
+                     DO 150, I = MIN( N, J + K ), J + 1, -1
+                        TEMP = TEMP - A( L + I, J )*X( I )
+  150                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( 1, J )
+                  ELSE
+                     DO 160, I = MIN( N, J + K ), J + 1, -1
+                        TEMP = TEMP - CONJG( A( L + I, J ) )*X( I )
+  160                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/CONJG( A( 1, J ) )
+                  END IF
+                  X( J ) = TEMP
+  170          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 200, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = KX
+                  L    = 1       - J
+                  IF( NOCONJ )THEN
+                     DO 180, I = MIN( N, J + K ), J + 1, -1
+                        TEMP = TEMP - A( L + I, J )*X( IX )
+                        IX   = IX   - INCX
+  180                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( 1, J )
+                  ELSE
+                     DO 190, I = MIN( N, J + K ), J + 1, -1
+                        TEMP = TEMP - CONJG( A( L + I, J ) )*X( IX )
+                        IX   = IX   - INCX
+  190                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/CONJG( A( 1, J ) )
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+                  IF( ( N - J ).GE.K )
+     $               KX = KX - INCX
+  200          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CTBSV .
+*
+      END
+      SUBROUTINE CTPMV ( UPLO, TRANS, DIAG, N, AP, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      COMPLEX            AP( * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CTPMV  performs one of the matrix-vector operations
+*
+*     x := A*x,   or   x := A'*x,   or   x := conjg( A' )*x,
+*
+*  where x is an n element vector and  A is an n by n unit, or non-unit,
+*  upper or lower triangular matrix, supplied in packed form.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   x := A*x.
+*
+*              TRANS = 'T' or 't'   x := A'*x.
+*
+*              TRANS = 'C' or 'c'   x := conjg( A' )*x.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  AP     - COMPLEX          array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with  UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular matrix packed sequentially,
+*           column by column, so that AP( 1 ) contains a( 1, 1 ),
+*           AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 )
+*           respectively, and so on.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular matrix packed sequentially,
+*           column by column, so that AP( 1 ) contains a( 1, 1 ),
+*           AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 )
+*           respectively, and so on.
+*           Note that when  DIAG = 'U' or 'u', the diagonal elements of
+*           A are not referenced, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x. On exit, X is overwritten with the
+*           tranformed vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     .. Local Scalars ..
+      COMPLEX            TEMP
+      INTEGER            I, INFO, IX, J, JX, K, KK, KX
+      LOGICAL            NOCONJ, NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 7
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CTPMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOCONJ = LSAME( TRANS, 'T' )
+      NOUNIT = LSAME( DIAG , 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of AP are
+*     accessed sequentially with one pass through AP.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x:= A*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KK = 1
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     K    = KK
+                     DO 10, I = 1, J - 1
+                        X( I ) = X( I ) + TEMP*AP( K )
+                        K      = K      + 1
+   10                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*AP( KK + J - 1 )
+                  END IF
+                  KK = KK + J
+   20          CONTINUE
+            ELSE
+               JX = KX
+               DO 40, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     DO 30, K = KK, KK + J - 2
+                        X( IX ) = X( IX ) + TEMP*AP( K )
+                        IX      = IX      + INCX
+   30                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*AP( KK + J - 1 )
+                  END IF
+                  JX = JX + INCX
+                  KK = KK + J
+   40          CONTINUE
+            END IF
+         ELSE
+            KK = ( N*( N + 1 ) )/2
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     K    = KK
+                     DO 50, I = N, J + 1, -1
+                        X( I ) = X( I ) + TEMP*AP( K )
+                        K      = K      - 1
+   50                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*AP( KK - N + J )
+                  END IF
+                  KK = KK - ( N - J + 1 )
+   60          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 80, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     DO 70, K = KK, KK - ( N - ( J + 1 ) ), -1
+                        X( IX ) = X( IX ) + TEMP*AP( K )
+                        IX      = IX      - INCX
+   70                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*AP( KK - N + J )
+                  END IF
+                  JX = JX - INCX
+                  KK = KK - ( N - J + 1 )
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := A'*x  or  x := conjg( A' )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KK = ( N*( N + 1 ) )/2
+            IF( INCX.EQ.1 )THEN
+               DO 110, J = N, 1, -1
+                  TEMP = X( J )
+                  K    = KK     - 1
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*AP( KK )
+                     DO 90, I = J - 1, 1, -1
+                        TEMP = TEMP + AP( K )*X( I )
+                        K    = K    - 1
+   90                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*CONJG( AP( KK ) )
+                     DO 100, I = J - 1, 1, -1
+                        TEMP = TEMP + CONJG( AP( K ) )*X( I )
+                        K    = K    - 1
+  100                CONTINUE
+                  END IF
+                  X( J ) = TEMP
+                  KK     = KK   - J
+  110          CONTINUE
+            ELSE
+               JX = KX + ( N - 1 )*INCX
+               DO 140, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = JX
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*AP( KK )
+                     DO 120, K = KK - 1, KK - J + 1, -1
+                        IX   = IX   - INCX
+                        TEMP = TEMP + AP( K )*X( IX )
+  120                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*CONJG( AP( KK ) )
+                     DO 130, K = KK - 1, KK - J + 1, -1
+                        IX   = IX   - INCX
+                        TEMP = TEMP + CONJG( AP( K ) )*X( IX )
+  130                CONTINUE
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+                  KK      = KK   - J
+  140          CONTINUE
+            END IF
+         ELSE
+            KK = 1
+            IF( INCX.EQ.1 )THEN
+               DO 170, J = 1, N
+                  TEMP = X( J )
+                  K    = KK     + 1
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*AP( KK )
+                     DO 150, I = J + 1, N
+                        TEMP = TEMP + AP( K )*X( I )
+                        K    = K    + 1
+  150                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*CONJG( AP( KK ) )
+                     DO 160, I = J + 1, N
+                        TEMP = TEMP + CONJG( AP( K ) )*X( I )
+                        K    = K    + 1
+  160                CONTINUE
+                  END IF
+                  X( J ) = TEMP
+                  KK     = KK   + ( N - J + 1 )
+  170          CONTINUE
+            ELSE
+               JX = KX
+               DO 200, J = 1, N
+                  TEMP = X( JX )
+                  IX   = JX
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*AP( KK )
+                     DO 180, K = KK + 1, KK + N - J
+                        IX   = IX   + INCX
+                        TEMP = TEMP + AP( K )*X( IX )
+  180                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*CONJG( AP( KK ) )
+                     DO 190, K = KK + 1, KK + N - J
+                        IX   = IX   + INCX
+                        TEMP = TEMP + CONJG( AP( K ) )*X( IX )
+  190                CONTINUE
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+                  KK      = KK   + ( N - J + 1 )
+  200          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CTPMV .
+*
+      END
+      SUBROUTINE CTPSV ( UPLO, TRANS, DIAG, N, AP, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      COMPLEX            AP( * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CTPSV  solves one of the systems of equations
+*
+*     A*x = b,   or   A'*x = b,   or   conjg( A' )*x = b,
+*
+*  where b and x are n element vectors and A is an n by n unit, or
+*  non-unit, upper or lower triangular matrix, supplied in packed form.
+*
+*  No test for singularity or near-singularity is included in this
+*  routine. Such tests must be performed before calling this routine.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the equations to be solved as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   A*x = b.
+*
+*              TRANS = 'T' or 't'   A'*x = b.
+*
+*              TRANS = 'C' or 'c'   conjg( A' )*x = b.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  AP     - COMPLEX          array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with  UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular matrix packed sequentially,
+*           column by column, so that AP( 1 ) contains a( 1, 1 ),
+*           AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 )
+*           respectively, and so on.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular matrix packed sequentially,
+*           column by column, so that AP( 1 ) contains a( 1, 1 ),
+*           AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 )
+*           respectively, and so on.
+*           Note that when  DIAG = 'U' or 'u', the diagonal elements of
+*           A are not referenced, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element right-hand side vector b. On exit, X is overwritten
+*           with the solution vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     .. Local Scalars ..
+      COMPLEX            TEMP
+      INTEGER            I, INFO, IX, J, JX, K, KK, KX
+      LOGICAL            NOCONJ, NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 7
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CTPSV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOCONJ = LSAME( TRANS, 'T' )
+      NOUNIT = LSAME( DIAG , 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of AP are
+*     accessed sequentially with one pass through AP.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x := inv( A )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KK = ( N*( N + 1 ) )/2
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/AP( KK )
+                     TEMP = X( J )
+                     K    = KK     - 1
+                     DO 10, I = J - 1, 1, -1
+                        X( I ) = X( I ) - TEMP*AP( K )
+                        K      = K      - 1
+   10                CONTINUE
+                  END IF
+                  KK = KK - J
+   20          CONTINUE
+            ELSE
+               JX = KX + ( N - 1 )*INCX
+               DO 40, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/AP( KK )
+                     TEMP = X( JX )
+                     IX   = JX
+                     DO 30, K = KK - 1, KK - J + 1, -1
+                        IX      = IX      - INCX
+                        X( IX ) = X( IX ) - TEMP*AP( K )
+   30                CONTINUE
+                  END IF
+                  JX = JX - INCX
+                  KK = KK - J
+   40          CONTINUE
+            END IF
+         ELSE
+            KK = 1
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/AP( KK )
+                     TEMP = X( J )
+                     K    = KK     + 1
+                     DO 50, I = J + 1, N
+                        X( I ) = X( I ) - TEMP*AP( K )
+                        K      = K      + 1
+   50                CONTINUE
+                  END IF
+                  KK = KK + ( N - J + 1 )
+   60          CONTINUE
+            ELSE
+               JX = KX
+               DO 80, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/AP( KK )
+                     TEMP = X( JX )
+                     IX   = JX
+                     DO 70, K = KK + 1, KK + N - J
+                        IX      = IX      + INCX
+                        X( IX ) = X( IX ) - TEMP*AP( K )
+   70                CONTINUE
+                  END IF
+                  JX = JX + INCX
+                  KK = KK + ( N - J + 1 )
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := inv( A' )*x  or  x := inv( conjg( A' ) )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KK = 1
+            IF( INCX.EQ.1 )THEN
+               DO 110, J = 1, N
+                  TEMP = X( J )
+                  K    = KK
+                  IF( NOCONJ )THEN
+                     DO 90, I = 1, J - 1
+                        TEMP = TEMP - AP( K )*X( I )
+                        K    = K    + 1
+   90                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/AP( KK + J - 1 )
+                  ELSE
+                     DO 100, I = 1, J - 1
+                        TEMP = TEMP - CONJG( AP( K ) )*X( I )
+                        K    = K    + 1
+  100                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/CONJG( AP( KK + J - 1 ) )
+                  END IF
+                  X( J ) = TEMP
+                  KK     = KK   + J
+  110          CONTINUE
+            ELSE
+               JX = KX
+               DO 140, J = 1, N
+                  TEMP = X( JX )
+                  IX   = KX
+                  IF( NOCONJ )THEN
+                     DO 120, K = KK, KK + J - 2
+                        TEMP = TEMP - AP( K )*X( IX )
+                        IX   = IX   + INCX
+  120                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/AP( KK + J - 1 )
+                  ELSE
+                     DO 130, K = KK, KK + J - 2
+                        TEMP = TEMP - CONJG( AP( K ) )*X( IX )
+                        IX   = IX   + INCX
+  130                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/CONJG( AP( KK + J - 1 ) )
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+                  KK      = KK   + J
+  140          CONTINUE
+            END IF
+         ELSE
+            KK = ( N*( N + 1 ) )/2
+            IF( INCX.EQ.1 )THEN
+               DO 170, J = N, 1, -1
+                  TEMP = X( J )
+                  K    = KK
+                  IF( NOCONJ )THEN
+                     DO 150, I = N, J + 1, -1
+                        TEMP = TEMP - AP( K )*X( I )
+                        K    = K    - 1
+  150                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/AP( KK - N + J )
+                  ELSE
+                     DO 160, I = N, J + 1, -1
+                        TEMP = TEMP - CONJG( AP( K ) )*X( I )
+                        K    = K    - 1
+  160                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/CONJG( AP( KK - N + J ) )
+                  END IF
+                  X( J ) = TEMP
+                  KK     = KK   - ( N - J + 1 )
+  170          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 200, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = KX
+                  IF( NOCONJ )THEN
+                     DO 180, K = KK, KK - ( N - ( J + 1 ) ), -1
+                        TEMP = TEMP - AP( K )*X( IX )
+                        IX   = IX   - INCX
+  180                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/AP( KK - N + J )
+                  ELSE
+                     DO 190, K = KK, KK - ( N - ( J + 1 ) ), -1
+                        TEMP = TEMP - CONJG( AP( K ) )*X( IX )
+                        IX   = IX   - INCX
+  190                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/CONJG( AP( KK - N + J ) )
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+                  KK      = KK   - ( N - J + 1 )
+  200          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CTPSV .
+*
+      END
+      SUBROUTINE CTRMM ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA,
+     $                   B, LDB )
+*     .. Scalar Arguments ..
+      CHARACTER*1        SIDE, UPLO, TRANSA, DIAG
+      INTEGER            M, N, LDA, LDB
+      COMPLEX            ALPHA
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), B( LDB, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CTRMM  performs one of the matrix-matrix operations
+*
+*     B := alpha*op( A )*B,   or   B := alpha*B*op( A )
+*
+*  where  alpha  is a scalar,  B  is an m by n matrix,  A  is a unit, or
+*  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+*
+*     op( A ) = A   or   op( A ) = A'   or   op( A ) = conjg( A' ).
+*
+*  Parameters
+*  ==========
+*
+*  SIDE   - CHARACTER*1.
+*           On entry,  SIDE specifies whether  op( A ) multiplies B from
+*           the left or right as follows:
+*
+*              SIDE = 'L' or 'l'   B := alpha*op( A )*B.
+*
+*              SIDE = 'R' or 'r'   B := alpha*B*op( A ).
+*
+*           Unchanged on exit.
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix A is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANSA - CHARACTER*1.
+*           On entry, TRANSA specifies the form of op( A ) to be used in
+*           the matrix multiplication as follows:
+*
+*              TRANSA = 'N' or 'n'   op( A ) = A.
+*
+*              TRANSA = 'T' or 't'   op( A ) = A'.
+*
+*              TRANSA = 'C' or 'c'   op( A ) = conjg( A' ).
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit triangular
+*           as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of B. M must be at
+*           least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of B.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX         .
+*           On entry,  ALPHA specifies the scalar  alpha. When  alpha is
+*           zero then  A is not referenced and  B need not be set before
+*           entry.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, k ), where k is m
+*           when  SIDE = 'L' or 'l'  and is  n  when  SIDE = 'R' or 'r'.
+*           Before entry  with  UPLO = 'U' or 'u',  the  leading  k by k
+*           upper triangular part of the array  A must contain the upper
+*           triangular matrix  and the strictly lower triangular part of
+*           A is not referenced.
+*           Before entry  with  UPLO = 'L' or 'l',  the  leading  k by k
+*           lower triangular part of the array  A must contain the lower
+*           triangular matrix  and the strictly upper triangular part of
+*           A is not referenced.
+*           Note that when  DIAG = 'U' or 'u',  the diagonal elements of
+*           A  are not referenced either,  but are assumed to be  unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program.  When  SIDE = 'L' or 'l'  then
+*           LDA  must be at least  max( 1, m ),  when  SIDE = 'R' or 'r'
+*           then LDA must be at least max( 1, n ).
+*           Unchanged on exit.
+*
+*  B      - COMPLEX          array of DIMENSION ( LDB, n ).
+*           Before entry,  the leading  m by n part of the array  B must
+*           contain the matrix  B,  and  on exit  is overwritten  by the
+*           transformed matrix.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   LDB  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, MAX
+*     .. Local Scalars ..
+      LOGICAL            LSIDE, NOCONJ, NOUNIT, UPPER
+      INTEGER            I, INFO, J, K, NROWA
+      COMPLEX            TEMP
+*     .. Parameters ..
+      COMPLEX            ONE
+      PARAMETER        ( ONE  = ( 1.0E+0, 0.0E+0 ) )
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      LSIDE  = LSAME( SIDE  , 'L' )
+      IF( LSIDE )THEN
+         NROWA = M
+      ELSE
+         NROWA = N
+      END IF
+      NOCONJ = LSAME( TRANSA, 'T' )
+      NOUNIT = LSAME( DIAG  , 'N' )
+      UPPER  = LSAME( UPLO  , 'U' )
+*
+      INFO   = 0
+      IF(      ( .NOT.LSIDE                ).AND.
+     $         ( .NOT.LSAME( SIDE  , 'R' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.UPPER                ).AND.
+     $         ( .NOT.LSAME( UPLO  , 'L' ) )      )THEN
+         INFO = 2
+      ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'T' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'C' ) )      )THEN
+         INFO = 3
+      ELSE IF( ( .NOT.LSAME( DIAG  , 'U' ) ).AND.
+     $         ( .NOT.LSAME( DIAG  , 'N' ) )      )THEN
+         INFO = 4
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 5
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 6
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 9
+      ELSE IF( LDB.LT.MAX( 1, M     ) )THEN
+         INFO = 11
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CTRMM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         DO 20, J = 1, N
+            DO 10, I = 1, M
+               B( I, J ) = ZERO
+   10       CONTINUE
+   20    CONTINUE
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSIDE )THEN
+         IF( LSAME( TRANSA, 'N' ) )THEN
+*
+*           Form  B := alpha*A*B.
+*
+            IF( UPPER )THEN
+               DO 50, J = 1, N
+                  DO 40, K = 1, M
+                     IF( B( K, J ).NE.ZERO )THEN
+                        TEMP = ALPHA*B( K, J )
+                        DO 30, I = 1, K - 1
+                           B( I, J ) = B( I, J ) + TEMP*A( I, K )
+   30                   CONTINUE
+                        IF( NOUNIT )
+     $                     TEMP = TEMP*A( K, K )
+                        B( K, J ) = TEMP
+                     END IF
+   40             CONTINUE
+   50          CONTINUE
+            ELSE
+               DO 80, J = 1, N
+                  DO 70 K = M, 1, -1
+                     IF( B( K, J ).NE.ZERO )THEN
+                        TEMP      = ALPHA*B( K, J )
+                        B( K, J ) = TEMP
+                        IF( NOUNIT )
+     $                     B( K, J ) = B( K, J )*A( K, K )
+                        DO 60, I = K + 1, M
+                           B( I, J ) = B( I, J ) + TEMP*A( I, K )
+   60                   CONTINUE
+                     END IF
+   70             CONTINUE
+   80          CONTINUE
+            END IF
+         ELSE
+*
+*           Form  B := alpha*A'*B   or   B := alpha*conjg( A' )*B.
+*
+            IF( UPPER )THEN
+               DO 120, J = 1, N
+                  DO 110, I = M, 1, -1
+                     TEMP = B( I, J )
+                     IF( NOCONJ )THEN
+                        IF( NOUNIT )
+     $                     TEMP = TEMP*A( I, I )
+                        DO 90, K = 1, I - 1
+                           TEMP = TEMP + A( K, I )*B( K, J )
+   90                   CONTINUE
+                     ELSE
+                        IF( NOUNIT )
+     $                     TEMP = TEMP*CONJG( A( I, I ) )
+                        DO 100, K = 1, I - 1
+                           TEMP = TEMP + CONJG( A( K, I ) )*B( K, J )
+  100                   CONTINUE
+                     END IF
+                     B( I, J ) = ALPHA*TEMP
+  110             CONTINUE
+  120          CONTINUE
+            ELSE
+               DO 160, J = 1, N
+                  DO 150, I = 1, M
+                     TEMP = B( I, J )
+                     IF( NOCONJ )THEN
+                        IF( NOUNIT )
+     $                     TEMP = TEMP*A( I, I )
+                        DO 130, K = I + 1, M
+                           TEMP = TEMP + A( K, I )*B( K, J )
+  130                   CONTINUE
+                     ELSE
+                        IF( NOUNIT )
+     $                     TEMP = TEMP*CONJG( A( I, I ) )
+                        DO 140, K = I + 1, M
+                           TEMP = TEMP + CONJG( A( K, I ) )*B( K, J )
+  140                   CONTINUE
+                     END IF
+                     B( I, J ) = ALPHA*TEMP
+  150             CONTINUE
+  160          CONTINUE
+            END IF
+         END IF
+      ELSE
+         IF( LSAME( TRANSA, 'N' ) )THEN
+*
+*           Form  B := alpha*B*A.
+*
+            IF( UPPER )THEN
+               DO 200, J = N, 1, -1
+                  TEMP = ALPHA
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( J, J )
+                  DO 170, I = 1, M
+                     B( I, J ) = TEMP*B( I, J )
+  170             CONTINUE
+                  DO 190, K = 1, J - 1
+                     IF( A( K, J ).NE.ZERO )THEN
+                        TEMP = ALPHA*A( K, J )
+                        DO 180, I = 1, M
+                           B( I, J ) = B( I, J ) + TEMP*B( I, K )
+  180                   CONTINUE
+                     END IF
+  190             CONTINUE
+  200          CONTINUE
+            ELSE
+               DO 240, J = 1, N
+                  TEMP = ALPHA
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( J, J )
+                  DO 210, I = 1, M
+                     B( I, J ) = TEMP*B( I, J )
+  210             CONTINUE
+                  DO 230, K = J + 1, N
+                     IF( A( K, J ).NE.ZERO )THEN
+                        TEMP = ALPHA*A( K, J )
+                        DO 220, I = 1, M
+                           B( I, J ) = B( I, J ) + TEMP*B( I, K )
+  220                   CONTINUE
+                     END IF
+  230             CONTINUE
+  240          CONTINUE
+            END IF
+         ELSE
+*
+*           Form  B := alpha*B*A'   or   B := alpha*B*conjg( A' ).
+*
+            IF( UPPER )THEN
+               DO 280, K = 1, N
+                  DO 260, J = 1, K - 1
+                     IF( A( J, K ).NE.ZERO )THEN
+                        IF( NOCONJ )THEN
+                           TEMP = ALPHA*A( J, K )
+                        ELSE
+                           TEMP = ALPHA*CONJG( A( J, K ) )
+                        END IF
+                        DO 250, I = 1, M
+                           B( I, J ) = B( I, J ) + TEMP*B( I, K )
+  250                   CONTINUE
+                     END IF
+  260             CONTINUE
+                  TEMP = ALPHA
+                  IF( NOUNIT )THEN
+                     IF( NOCONJ )THEN
+                        TEMP = TEMP*A( K, K )
+                     ELSE
+                        TEMP = TEMP*CONJG( A( K, K ) )
+                     END IF
+                  END IF
+                  IF( TEMP.NE.ONE )THEN
+                     DO 270, I = 1, M
+                        B( I, K ) = TEMP*B( I, K )
+  270                CONTINUE
+                  END IF
+  280          CONTINUE
+            ELSE
+               DO 320, K = N, 1, -1
+                  DO 300, J = K + 1, N
+                     IF( A( J, K ).NE.ZERO )THEN
+                        IF( NOCONJ )THEN
+                           TEMP = ALPHA*A( J, K )
+                        ELSE
+                           TEMP = ALPHA*CONJG( A( J, K ) )
+                        END IF
+                        DO 290, I = 1, M
+                           B( I, J ) = B( I, J ) + TEMP*B( I, K )
+  290                   CONTINUE
+                     END IF
+  300             CONTINUE
+                  TEMP = ALPHA
+                  IF( NOUNIT )THEN
+                     IF( NOCONJ )THEN
+                        TEMP = TEMP*A( K, K )
+                     ELSE
+                        TEMP = TEMP*CONJG( A( K, K ) )
+                     END IF
+                  END IF
+                  IF( TEMP.NE.ONE )THEN
+                     DO 310, I = 1, M
+                        B( I, K ) = TEMP*B( I, K )
+  310                CONTINUE
+                  END IF
+  320          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CTRMM .
+*
+      END
+      SUBROUTINE CTRMV ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, LDA, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CTRMV  performs one of the matrix-vector operations
+*
+*     x := A*x,   or   x := A'*x,   or   x := conjg( A' )*x,
+*
+*  where x is an n element vector and  A is an n by n unit, or non-unit,
+*  upper or lower triangular matrix.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   x := A*x.
+*
+*              TRANS = 'T' or 't'   x := A'*x.
+*
+*              TRANS = 'C' or 'c'   x := conjg( A' )*x.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular matrix and the strictly lower triangular part of
+*           A is not referenced.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular matrix and the strictly upper triangular part of
+*           A is not referenced.
+*           Note that when  DIAG = 'U' or 'u', the diagonal elements of
+*           A are not referenced either, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*  X      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x. On exit, X is overwritten with the
+*           tranformed vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     .. Local Scalars ..
+      COMPLEX            TEMP
+      INTEGER            I, INFO, IX, J, JX, KX
+      LOGICAL            NOCONJ, NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 6
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 8
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CTRMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOCONJ = LSAME( TRANS, 'T' )
+      NOUNIT = LSAME( DIAG , 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x := A*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     DO 10, I = 1, J - 1
+                        X( I ) = X( I ) + TEMP*A( I, J )
+   10                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*A( J, J )
+                  END IF
+   20          CONTINUE
+            ELSE
+               JX = KX
+               DO 40, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     DO 30, I = 1, J - 1
+                        X( IX ) = X( IX ) + TEMP*A( I, J )
+                        IX      = IX      + INCX
+   30                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*A( J, J )
+                  END IF
+                  JX = JX + INCX
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     DO 50, I = N, J + 1, -1
+                        X( I ) = X( I ) + TEMP*A( I, J )
+   50                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*A( J, J )
+                  END IF
+   60          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 80, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     DO 70, I = N, J + 1, -1
+                        X( IX ) = X( IX ) + TEMP*A( I, J )
+                        IX      = IX      - INCX
+   70                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*A( J, J )
+                  END IF
+                  JX = JX - INCX
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := A'*x  or  x := conjg( A' )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            IF( INCX.EQ.1 )THEN
+               DO 110, J = N, 1, -1
+                  TEMP = X( J )
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( J, J )
+                     DO 90, I = J - 1, 1, -1
+                        TEMP = TEMP + A( I, J )*X( I )
+   90                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*CONJG( A( J, J ) )
+                     DO 100, I = J - 1, 1, -1
+                        TEMP = TEMP + CONJG( A( I, J ) )*X( I )
+  100                CONTINUE
+                  END IF
+                  X( J ) = TEMP
+  110          CONTINUE
+            ELSE
+               JX = KX + ( N - 1 )*INCX
+               DO 140, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = JX
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( J, J )
+                     DO 120, I = J - 1, 1, -1
+                        IX   = IX   - INCX
+                        TEMP = TEMP + A( I, J )*X( IX )
+  120                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*CONJG( A( J, J ) )
+                     DO 130, I = J - 1, 1, -1
+                        IX   = IX   - INCX
+                        TEMP = TEMP + CONJG( A( I, J ) )*X( IX )
+  130                CONTINUE
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+  140          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 170, J = 1, N
+                  TEMP = X( J )
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( J, J )
+                     DO 150, I = J + 1, N
+                        TEMP = TEMP + A( I, J )*X( I )
+  150                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*CONJG( A( J, J ) )
+                     DO 160, I = J + 1, N
+                        TEMP = TEMP + CONJG( A( I, J ) )*X( I )
+  160                CONTINUE
+                  END IF
+                  X( J ) = TEMP
+  170          CONTINUE
+            ELSE
+               JX = KX
+               DO 200, J = 1, N
+                  TEMP = X( JX )
+                  IX   = JX
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( J, J )
+                     DO 180, I = J + 1, N
+                        IX   = IX   + INCX
+                        TEMP = TEMP + A( I, J )*X( IX )
+  180                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*CONJG( A( J, J ) )
+                     DO 190, I = J + 1, N
+                        IX   = IX   + INCX
+                        TEMP = TEMP + CONJG( A( I, J ) )*X( IX )
+  190                CONTINUE
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+  200          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CTRMV .
+*
+      END
+      SUBROUTINE CTRSM ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA,
+     $                   B, LDB )
+*     .. Scalar Arguments ..
+      CHARACTER*1        SIDE, UPLO, TRANSA, DIAG
+      INTEGER            M, N, LDA, LDB
+      COMPLEX            ALPHA
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), B( LDB, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CTRSM  solves one of the matrix equations
+*
+*     op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
+*
+*  where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+*  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+*
+*     op( A ) = A   or   op( A ) = A'   or   op( A ) = conjg( A' ).
+*
+*  The matrix X is overwritten on B.
+*
+*  Parameters
+*  ==========
+*
+*  SIDE   - CHARACTER*1.
+*           On entry, SIDE specifies whether op( A ) appears on the left
+*           or right of X as follows:
+*
+*              SIDE = 'L' or 'l'   op( A )*X = alpha*B.
+*
+*              SIDE = 'R' or 'r'   X*op( A ) = alpha*B.
+*
+*           Unchanged on exit.
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix A is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANSA - CHARACTER*1.
+*           On entry, TRANSA specifies the form of op( A ) to be used in
+*           the matrix multiplication as follows:
+*
+*              TRANSA = 'N' or 'n'   op( A ) = A.
+*
+*              TRANSA = 'T' or 't'   op( A ) = A'.
+*
+*              TRANSA = 'C' or 'c'   op( A ) = conjg( A' ).
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit triangular
+*           as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of B. M must be at
+*           least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of B.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX         .
+*           On entry,  ALPHA specifies the scalar  alpha. When  alpha is
+*           zero then  A is not referenced and  B need not be set before
+*           entry.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, k ), where k is m
+*           when  SIDE = 'L' or 'l'  and is  n  when  SIDE = 'R' or 'r'.
+*           Before entry  with  UPLO = 'U' or 'u',  the  leading  k by k
+*           upper triangular part of the array  A must contain the upper
+*           triangular matrix  and the strictly lower triangular part of
+*           A is not referenced.
+*           Before entry  with  UPLO = 'L' or 'l',  the  leading  k by k
+*           lower triangular part of the array  A must contain the lower
+*           triangular matrix  and the strictly upper triangular part of
+*           A is not referenced.
+*           Note that when  DIAG = 'U' or 'u',  the diagonal elements of
+*           A  are not referenced either,  but are assumed to be  unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program.  When  SIDE = 'L' or 'l'  then
+*           LDA  must be at least  max( 1, m ),  when  SIDE = 'R' or 'r'
+*           then LDA must be at least max( 1, n ).
+*           Unchanged on exit.
+*
+*  B      - COMPLEX          array of DIMENSION ( LDB, n ).
+*           Before entry,  the leading  m by n part of the array  B must
+*           contain  the  right-hand  side  matrix  B,  and  on exit  is
+*           overwritten by the solution matrix  X.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   LDB  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, MAX
+*     .. Local Scalars ..
+      LOGICAL            LSIDE, NOCONJ, NOUNIT, UPPER
+      INTEGER            I, INFO, J, K, NROWA
+      COMPLEX            TEMP
+*     .. Parameters ..
+      COMPLEX            ONE
+      PARAMETER        ( ONE  = ( 1.0E+0, 0.0E+0 ) )
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      LSIDE  = LSAME( SIDE  , 'L' )
+      IF( LSIDE )THEN
+         NROWA = M
+      ELSE
+         NROWA = N
+      END IF
+      NOCONJ = LSAME( TRANSA, 'T' )
+      NOUNIT = LSAME( DIAG  , 'N' )
+      UPPER  = LSAME( UPLO  , 'U' )
+*
+      INFO   = 0
+      IF(      ( .NOT.LSIDE                ).AND.
+     $         ( .NOT.LSAME( SIDE  , 'R' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.UPPER                ).AND.
+     $         ( .NOT.LSAME( UPLO  , 'L' ) )      )THEN
+         INFO = 2
+      ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'T' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'C' ) )      )THEN
+         INFO = 3
+      ELSE IF( ( .NOT.LSAME( DIAG  , 'U' ) ).AND.
+     $         ( .NOT.LSAME( DIAG  , 'N' ) )      )THEN
+         INFO = 4
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 5
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 6
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 9
+      ELSE IF( LDB.LT.MAX( 1, M     ) )THEN
+         INFO = 11
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CTRSM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         DO 20, J = 1, N
+            DO 10, I = 1, M
+               B( I, J ) = ZERO
+   10       CONTINUE
+   20    CONTINUE
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSIDE )THEN
+         IF( LSAME( TRANSA, 'N' ) )THEN
+*
+*           Form  B := alpha*inv( A )*B.
+*
+            IF( UPPER )THEN
+               DO 60, J = 1, N
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 30, I = 1, M
+                        B( I, J ) = ALPHA*B( I, J )
+   30                CONTINUE
+                  END IF
+                  DO 50, K = M, 1, -1
+                     IF( B( K, J ).NE.ZERO )THEN
+                        IF( NOUNIT )
+     $                     B( K, J ) = B( K, J )/A( K, K )
+                        DO 40, I = 1, K - 1
+                           B( I, J ) = B( I, J ) - B( K, J )*A( I, K )
+   40                   CONTINUE
+                     END IF
+   50             CONTINUE
+   60          CONTINUE
+            ELSE
+               DO 100, J = 1, N
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 70, I = 1, M
+                        B( I, J ) = ALPHA*B( I, J )
+   70                CONTINUE
+                  END IF
+                  DO 90 K = 1, M
+                     IF( B( K, J ).NE.ZERO )THEN
+                        IF( NOUNIT )
+     $                     B( K, J ) = B( K, J )/A( K, K )
+                        DO 80, I = K + 1, M
+                           B( I, J ) = B( I, J ) - B( K, J )*A( I, K )
+   80                   CONTINUE
+                     END IF
+   90             CONTINUE
+  100          CONTINUE
+            END IF
+         ELSE
+*
+*           Form  B := alpha*inv( A' )*B
+*           or    B := alpha*inv( conjg( A' ) )*B.
+*
+            IF( UPPER )THEN
+               DO 140, J = 1, N
+                  DO 130, I = 1, M
+                     TEMP = ALPHA*B( I, J )
+                     IF( NOCONJ )THEN
+                        DO 110, K = 1, I - 1
+                           TEMP = TEMP - A( K, I )*B( K, J )
+  110                   CONTINUE
+                        IF( NOUNIT )
+     $                     TEMP = TEMP/A( I, I )
+                     ELSE
+                        DO 120, K = 1, I - 1
+                           TEMP = TEMP - CONJG( A( K, I ) )*B( K, J )
+  120                   CONTINUE
+                        IF( NOUNIT )
+     $                     TEMP = TEMP/CONJG( A( I, I ) )
+                     END IF
+                     B( I, J ) = TEMP
+  130             CONTINUE
+  140          CONTINUE
+            ELSE
+               DO 180, J = 1, N
+                  DO 170, I = M, 1, -1
+                     TEMP = ALPHA*B( I, J )
+                     IF( NOCONJ )THEN
+                        DO 150, K = I + 1, M
+                           TEMP = TEMP - A( K, I )*B( K, J )
+  150                   CONTINUE
+                        IF( NOUNIT )
+     $                     TEMP = TEMP/A( I, I )
+                     ELSE
+                        DO 160, K = I + 1, M
+                           TEMP = TEMP - CONJG( A( K, I ) )*B( K, J )
+  160                   CONTINUE
+                        IF( NOUNIT )
+     $                     TEMP = TEMP/CONJG( A( I, I ) )
+                     END IF
+                     B( I, J ) = TEMP
+  170             CONTINUE
+  180          CONTINUE
+            END IF
+         END IF
+      ELSE
+         IF( LSAME( TRANSA, 'N' ) )THEN
+*
+*           Form  B := alpha*B*inv( A ).
+*
+            IF( UPPER )THEN
+               DO 230, J = 1, N
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 190, I = 1, M
+                        B( I, J ) = ALPHA*B( I, J )
+  190                CONTINUE
+                  END IF
+                  DO 210, K = 1, J - 1
+                     IF( A( K, J ).NE.ZERO )THEN
+                        DO 200, I = 1, M
+                           B( I, J ) = B( I, J ) - A( K, J )*B( I, K )
+  200                   CONTINUE
+                     END IF
+  210             CONTINUE
+                  IF( NOUNIT )THEN
+                     TEMP = ONE/A( J, J )
+                     DO 220, I = 1, M
+                        B( I, J ) = TEMP*B( I, J )
+  220                CONTINUE
+                  END IF
+  230          CONTINUE
+            ELSE
+               DO 280, J = N, 1, -1
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 240, I = 1, M
+                        B( I, J ) = ALPHA*B( I, J )
+  240                CONTINUE
+                  END IF
+                  DO 260, K = J + 1, N
+                     IF( A( K, J ).NE.ZERO )THEN
+                        DO 250, I = 1, M
+                           B( I, J ) = B( I, J ) - A( K, J )*B( I, K )
+  250                   CONTINUE
+                     END IF
+  260             CONTINUE
+                  IF( NOUNIT )THEN
+                     TEMP = ONE/A( J, J )
+                     DO 270, I = 1, M
+                       B( I, J ) = TEMP*B( I, J )
+  270                CONTINUE
+                  END IF
+  280          CONTINUE
+            END IF
+         ELSE
+*
+*           Form  B := alpha*B*inv( A' )
+*           or    B := alpha*B*inv( conjg( A' ) ).
+*
+            IF( UPPER )THEN
+               DO 330, K = N, 1, -1
+                  IF( NOUNIT )THEN
+                     IF( NOCONJ )THEN
+                        TEMP = ONE/A( K, K )
+                     ELSE
+                        TEMP = ONE/CONJG( A( K, K ) )
+                     END IF
+                     DO 290, I = 1, M
+                        B( I, K ) = TEMP*B( I, K )
+  290                CONTINUE
+                  END IF
+                  DO 310, J = 1, K - 1
+                     IF( A( J, K ).NE.ZERO )THEN
+                        IF( NOCONJ )THEN
+                           TEMP = A( J, K )
+                        ELSE
+                           TEMP = CONJG( A( J, K ) )
+                        END IF
+                        DO 300, I = 1, M
+                           B( I, J ) = B( I, J ) - TEMP*B( I, K )
+  300                   CONTINUE
+                     END IF
+  310             CONTINUE
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 320, I = 1, M
+                        B( I, K ) = ALPHA*B( I, K )
+  320                CONTINUE
+                  END IF
+  330          CONTINUE
+            ELSE
+               DO 380, K = 1, N
+                  IF( NOUNIT )THEN
+                     IF( NOCONJ )THEN
+                        TEMP = ONE/A( K, K )
+                     ELSE
+                        TEMP = ONE/CONJG( A( K, K ) )
+                     END IF
+                     DO 340, I = 1, M
+                        B( I, K ) = TEMP*B( I, K )
+  340                CONTINUE
+                  END IF
+                  DO 360, J = K + 1, N
+                     IF( A( J, K ).NE.ZERO )THEN
+                        IF( NOCONJ )THEN
+                           TEMP = A( J, K )
+                        ELSE
+                           TEMP = CONJG( A( J, K ) )
+                        END IF
+                        DO 350, I = 1, M
+                           B( I, J ) = B( I, J ) - TEMP*B( I, K )
+  350                   CONTINUE
+                     END IF
+  360             CONTINUE
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 370, I = 1, M
+                        B( I, K ) = ALPHA*B( I, K )
+  370                CONTINUE
+                  END IF
+  380          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CTRSM .
+*
+      END
+      SUBROUTINE CTRSV ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, LDA, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  CTRSV  solves one of the systems of equations
+*
+*     A*x = b,   or   A'*x = b,   or   conjg( A' )*x = b,
+*
+*  where b and x are n element vectors and A is an n by n unit, or
+*  non-unit, upper or lower triangular matrix.
+*
+*  No test for singularity or near-singularity is included in this
+*  routine. Such tests must be performed before calling this routine.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the equations to be solved as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   A*x = b.
+*
+*              TRANS = 'T' or 't'   A'*x = b.
+*
+*              TRANS = 'C' or 'c'   conjg( A' )*x = b.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX          array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular matrix and the strictly lower triangular part of
+*           A is not referenced.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular matrix and the strictly upper triangular part of
+*           A is not referenced.
+*           Note that when  DIAG = 'U' or 'u', the diagonal elements of
+*           A are not referenced either, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*  X      - COMPLEX          array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element right-hand side vector b. On exit, X is overwritten
+*           with the solution vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX            ZERO
+      PARAMETER        ( ZERO = ( 0.0E+0, 0.0E+0 ) )
+*     .. Local Scalars ..
+      COMPLEX            TEMP
+      INTEGER            I, INFO, IX, J, JX, KX
+      LOGICAL            NOCONJ, NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          CONJG, MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 6
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 8
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'CTRSV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOCONJ = LSAME( TRANS, 'T' )
+      NOUNIT = LSAME( DIAG , 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x := inv( A )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/A( J, J )
+                     TEMP = X( J )
+                     DO 10, I = J - 1, 1, -1
+                        X( I ) = X( I ) - TEMP*A( I, J )
+   10                CONTINUE
+                  END IF
+   20          CONTINUE
+            ELSE
+               JX = KX + ( N - 1 )*INCX
+               DO 40, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/A( J, J )
+                     TEMP = X( JX )
+                     IX   = JX
+                     DO 30, I = J - 1, 1, -1
+                        IX      = IX      - INCX
+                        X( IX ) = X( IX ) - TEMP*A( I, J )
+   30                CONTINUE
+                  END IF
+                  JX = JX - INCX
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/A( J, J )
+                     TEMP = X( J )
+                     DO 50, I = J + 1, N
+                        X( I ) = X( I ) - TEMP*A( I, J )
+   50                CONTINUE
+                  END IF
+   60          CONTINUE
+            ELSE
+               JX = KX
+               DO 80, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/A( J, J )
+                     TEMP = X( JX )
+                     IX   = JX
+                     DO 70, I = J + 1, N
+                        IX      = IX      + INCX
+                        X( IX ) = X( IX ) - TEMP*A( I, J )
+   70                CONTINUE
+                  END IF
+                  JX = JX + INCX
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := inv( A' )*x  or  x := inv( conjg( A' ) )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            IF( INCX.EQ.1 )THEN
+               DO 110, J = 1, N
+                  TEMP = X( J )
+                  IF( NOCONJ )THEN
+                     DO 90, I = 1, J - 1
+                        TEMP = TEMP - A( I, J )*X( I )
+   90                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( J, J )
+                  ELSE
+                     DO 100, I = 1, J - 1
+                        TEMP = TEMP - CONJG( A( I, J ) )*X( I )
+  100                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/CONJG( A( J, J ) )
+                  END IF
+                  X( J ) = TEMP
+  110          CONTINUE
+            ELSE
+               JX = KX
+               DO 140, J = 1, N
+                  IX   = KX
+                  TEMP = X( JX )
+                  IF( NOCONJ )THEN
+                     DO 120, I = 1, J - 1
+                        TEMP = TEMP - A( I, J )*X( IX )
+                        IX   = IX   + INCX
+  120                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( J, J )
+                  ELSE
+                     DO 130, I = 1, J - 1
+                        TEMP = TEMP - CONJG( A( I, J ) )*X( IX )
+                        IX   = IX   + INCX
+  130                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/CONJG( A( J, J ) )
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+  140          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 170, J = N, 1, -1
+                  TEMP = X( J )
+                  IF( NOCONJ )THEN
+                     DO 150, I = N, J + 1, -1
+                        TEMP = TEMP - A( I, J )*X( I )
+  150                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( J, J )
+                  ELSE
+                     DO 160, I = N, J + 1, -1
+                        TEMP = TEMP - CONJG( A( I, J ) )*X( I )
+  160                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/CONJG( A( J, J ) )
+                  END IF
+                  X( J ) = TEMP
+  170          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 200, J = N, 1, -1
+                  IX   = KX
+                  TEMP = X( JX )
+                  IF( NOCONJ )THEN
+                     DO 180, I = N, J + 1, -1
+                        TEMP = TEMP - A( I, J )*X( IX )
+                        IX   = IX   - INCX
+  180                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( J, J )
+                  ELSE
+                     DO 190, I = N, J + 1, -1
+                        TEMP = TEMP - CONJG( A( I, J ) )*X( IX )
+                        IX   = IX   - INCX
+  190                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/CONJG( A( J, J ) )
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+  200          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CTRSV .
+*
+      END
+      double precision function dasum(n,dx,incx)
+c
+c     takes the sum of the absolute values.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 3/93 to return if incx .le. 0.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      double precision dx(*),dtemp
+      integer i,incx,m,mp1,n,nincx
+c
+      dasum = 0.0d0
+      dtemp = 0.0d0
+      if( n.le.0 .or. incx.le.0 )return
+      if(incx.eq.1)go to 20
+c
+c        code for increment not equal to 1
+c
+      nincx = n*incx
+      do 10 i = 1,nincx,incx
+        dtemp = dtemp + dabs(dx(i))
+   10 continue
+      dasum = dtemp
+      return
+c
+c        code for increment equal to 1
+c
+c
+c        clean-up loop
+c
+   20 m = mod(n,6)
+      if( m .eq. 0 ) go to 40
+      do 30 i = 1,m
+        dtemp = dtemp + dabs(dx(i))
+   30 continue
+      if( n .lt. 6 ) go to 60
+   40 mp1 = m + 1
+      do 50 i = mp1,n,6
+        dtemp = dtemp + dabs(dx(i)) + dabs(dx(i + 1)) + dabs(dx(i + 2))
+     *  + dabs(dx(i + 3)) + dabs(dx(i + 4)) + dabs(dx(i + 5))
+   50 continue
+   60 dasum = dtemp
+      return
+      end
+      subroutine daxpy(n,da,dx,incx,dy,incy)
+c
+c     constant times a vector plus a vector.
+c     uses unrolled loops for increments equal to one.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      double precision dx(*),dy(*),da
+      integer i,incx,incy,ix,iy,m,mp1,n
+c
+      if(n.le.0)return
+      if (da .eq. 0.0d0) return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c        code for unequal increments or equal increments
+c          not equal to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        dy(iy) = dy(iy) + da*dx(ix)
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      return
+c
+c        code for both increments equal to 1
+c
+c
+c        clean-up loop
+c
+   20 m = mod(n,4)
+      if( m .eq. 0 ) go to 40
+      do 30 i = 1,m
+        dy(i) = dy(i) + da*dx(i)
+   30 continue
+      if( n .lt. 4 ) return
+   40 mp1 = m + 1
+      do 50 i = mp1,n,4
+        dy(i) = dy(i) + da*dx(i)
+        dy(i + 1) = dy(i + 1) + da*dx(i + 1)
+        dy(i + 2) = dy(i + 2) + da*dx(i + 2)
+        dy(i + 3) = dy(i + 3) + da*dx(i + 3)
+   50 continue
+      return
+      end
+      double precision function dcabs1(z)
+      double complex z,zz
+      double precision t(2)
+      equivalence (zz,t(1))
+      zz = z
+      dcabs1 = dabs(t(1)) + dabs(t(2))
+      return
+      end
+      subroutine  dcopy(n,dx,incx,dy,incy)
+c
+c     copies a vector, x, to a vector, y.
+c     uses unrolled loops for increments equal to one.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      double precision dx(*),dy(*)
+      integer i,incx,incy,ix,iy,m,mp1,n
+c
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c        code for unequal increments or equal increments
+c          not equal to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        dy(iy) = dx(ix)
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      return
+c
+c        code for both increments equal to 1
+c
+c
+c        clean-up loop
+c
+   20 m = mod(n,7)
+      if( m .eq. 0 ) go to 40
+      do 30 i = 1,m
+        dy(i) = dx(i)
+   30 continue
+      if( n .lt. 7 ) return
+   40 mp1 = m + 1
+      do 50 i = mp1,n,7
+        dy(i) = dx(i)
+        dy(i + 1) = dx(i + 1)
+        dy(i + 2) = dx(i + 2)
+        dy(i + 3) = dx(i + 3)
+        dy(i + 4) = dx(i + 4)
+        dy(i + 5) = dx(i + 5)
+        dy(i + 6) = dx(i + 6)
+   50 continue
+      return
+      end
+      double precision function ddot(n,dx,incx,dy,incy)
+c
+c     forms the dot product of two vectors.
+c     uses unrolled loops for increments equal to one.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      double precision dx(*),dy(*),dtemp
+      integer i,incx,incy,ix,iy,m,mp1,n
+c
+      ddot = 0.0d0
+      dtemp = 0.0d0
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c        code for unequal increments or equal increments
+c          not equal to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        dtemp = dtemp + dx(ix)*dy(iy)
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      ddot = dtemp
+      return
+c
+c        code for both increments equal to 1
+c
+c
+c        clean-up loop
+c
+   20 m = mod(n,5)
+      if( m .eq. 0 ) go to 40
+      do 30 i = 1,m
+        dtemp = dtemp + dx(i)*dy(i)
+   30 continue
+      if( n .lt. 5 ) go to 60
+   40 mp1 = m + 1
+      do 50 i = mp1,n,5
+        dtemp = dtemp + dx(i)*dy(i) + dx(i + 1)*dy(i + 1) +
+     *   dx(i + 2)*dy(i + 2) + dx(i + 3)*dy(i + 3) + dx(i + 4)*dy(i + 4)
+   50 continue
+   60 ddot = dtemp
+      return
+      end
+      SUBROUTINE DGBMV ( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX,
+     $                   BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   ALPHA, BETA
+      INTEGER            INCX, INCY, KL, KU, LDA, M, N
+      CHARACTER*1        TRANS
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DGBMV  performs one of the matrix-vector operations
+*
+*     y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are vectors and A is an
+*  m by n band matrix, with kl sub-diagonals and ku super-diagonals.
+*
+*  Parameters
+*  ==========
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.
+*
+*              TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.
+*
+*              TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of the matrix A.
+*           M must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  KL     - INTEGER.
+*           On entry, KL specifies the number of sub-diagonals of the
+*           matrix A. KL must satisfy  0 .le. KL.
+*           Unchanged on exit.
+*
+*  KU     - INTEGER.
+*           On entry, KU specifies the number of super-diagonals of the
+*           matrix A. KU must satisfy  0 .le. KU.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+*           Before entry, the leading ( kl + ku + 1 ) by n part of the
+*           array A must contain the matrix of coefficients, supplied
+*           column by column, with the leading diagonal of the matrix in
+*           row ( ku + 1 ) of the array, the first super-diagonal
+*           starting at position 2 in row ku, the first sub-diagonal
+*           starting at position 1 in row ( ku + 2 ), and so on.
+*           Elements in the array A that do not correspond to elements
+*           in the band matrix (such as the top left ku by ku triangle)
+*           are not referenced.
+*           The following program segment will transfer a band matrix
+*           from conventional full matrix storage to band storage:
+*
+*                 DO 20, J = 1, N
+*                    K = KU + 1 - J
+*                    DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL )
+*                       A( K + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           ( kl + ku + 1 ).
+*           Unchanged on exit.
+*
+*  X      - DOUBLE PRECISION array of DIMENSION at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
+*           and at least
+*           ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
+*           Before entry, the incremented array X must contain the
+*           vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - DOUBLE PRECISION.
+*           On entry, BETA specifies the scalar beta. When BETA is
+*           supplied as zero then Y need not be set on input.
+*           Unchanged on exit.
+*
+*  Y      - DOUBLE PRECISION array of DIMENSION at least
+*           ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
+*           and at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
+*           Before entry, the incremented array Y must contain the
+*           vector y. On exit, Y is overwritten by the updated vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE         , ZERO
+      PARAMETER        ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   TEMP
+      INTEGER            I, INFO, IX, IY, J, JX, JY, K, KUP1, KX, KY,
+     $                   LENX, LENY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 1
+      ELSE IF( M.LT.0 )THEN
+         INFO = 2
+      ELSE IF( N.LT.0 )THEN
+         INFO = 3
+      ELSE IF( KL.LT.0 )THEN
+         INFO = 4
+      ELSE IF( KU.LT.0 )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.( KL + KU + 1 ) )THEN
+         INFO = 8
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 10
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 13
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DGBMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set
+*     up the start points in  X  and  Y.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         LENX = N
+         LENY = M
+      ELSE
+         LENX = M
+         LENY = N
+      END IF
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( LENX - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( LENY - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through the band part of A.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, LENY
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, LENY
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, LENY
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, LENY
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      KUP1 = KU + 1
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  y := alpha*A*x + y.
+*
+         JX = KX
+         IF( INCY.EQ.1 )THEN
+            DO 60, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  K    = KUP1 - J
+                  DO 50, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                     Y( I ) = Y( I ) + TEMP*A( K + I, J )
+   50             CONTINUE
+               END IF
+               JX = JX + INCX
+   60       CONTINUE
+         ELSE
+            DO 80, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  IY   = KY
+                  K    = KUP1 - J
+                  DO 70, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                     Y( IY ) = Y( IY ) + TEMP*A( K + I, J )
+                     IY      = IY      + INCY
+   70             CONTINUE
+               END IF
+               JX = JX + INCX
+               IF( J.GT.KU )
+     $            KY = KY + INCY
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y := alpha*A'*x + y.
+*
+         JY = KY
+         IF( INCX.EQ.1 )THEN
+            DO 100, J = 1, N
+               TEMP = ZERO
+               K    = KUP1 - J
+               DO 90, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                  TEMP = TEMP + A( K + I, J )*X( I )
+   90          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP
+               JY      = JY      + INCY
+  100       CONTINUE
+         ELSE
+            DO 120, J = 1, N
+               TEMP = ZERO
+               IX   = KX
+               K    = KUP1 - J
+               DO 110, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                  TEMP = TEMP + A( K + I, J )*X( IX )
+                  IX   = IX   + INCX
+  110          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP
+               JY      = JY      + INCY
+               IF( J.GT.KU )
+     $            KX = KX + INCX
+  120       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DGBMV .
+*
+      END
+      SUBROUTINE DGEMM ( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        TRANSA, TRANSB
+      INTEGER            M, N, K, LDA, LDB, LDC
+      DOUBLE PRECISION   ALPHA, BETA
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DGEMM  performs one of the matrix-matrix operations
+*
+*     C := alpha*op( A )*op( B ) + beta*C,
+*
+*  where  op( X ) is one of
+*
+*     op( X ) = X   or   op( X ) = X',
+*
+*  alpha and beta are scalars, and A, B and C are matrices, with op( A )
+*  an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+*
+*  Parameters
+*  ==========
+*
+*  TRANSA - CHARACTER*1.
+*           On entry, TRANSA specifies the form of op( A ) to be used in
+*           the matrix multiplication as follows:
+*
+*              TRANSA = 'N' or 'n',  op( A ) = A.
+*
+*              TRANSA = 'T' or 't',  op( A ) = A'.
+*
+*              TRANSA = 'C' or 'c',  op( A ) = A'.
+*
+*           Unchanged on exit.
+*
+*  TRANSB - CHARACTER*1.
+*           On entry, TRANSB specifies the form of op( B ) to be used in
+*           the matrix multiplication as follows:
+*
+*              TRANSB = 'N' or 'n',  op( B ) = B.
+*
+*              TRANSB = 'T' or 't',  op( B ) = B'.
+*
+*              TRANSB = 'C' or 'c',  op( B ) = B'.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry,  M  specifies  the number  of rows  of the  matrix
+*           op( A )  and of the  matrix  C.  M  must  be at least  zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry,  N  specifies the number  of columns of the matrix
+*           op( B ) and the number of columns of the matrix C. N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry,  K  specifies  the number of columns of the matrix
+*           op( A ) and the number of rows of the matrix op( B ). K must
+*           be at least  zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is
+*           k  when  TRANSA = 'N' or 'n',  and is  m  otherwise.
+*           Before entry with  TRANSA = 'N' or 'n',  the leading  m by k
+*           part of the array  A  must contain the matrix  A,  otherwise
+*           the leading  k by m  part of the array  A  must contain  the
+*           matrix A.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. When  TRANSA = 'N' or 'n' then
+*           LDA must be at least  max( 1, m ), otherwise  LDA must be at
+*           least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  B      - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is
+*           n  when  TRANSB = 'N' or 'n',  and is  k  otherwise.
+*           Before entry with  TRANSB = 'N' or 'n',  the leading  k by n
+*           part of the array  B  must contain the matrix  B,  otherwise
+*           the leading  n by k  part of the array  B  must contain  the
+*           matrix B.
+*           Unchanged on exit.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in the calling (sub) program. When  TRANSB = 'N' or 'n' then
+*           LDB must be at least  max( 1, k ), otherwise  LDB must be at
+*           least  max( 1, n ).
+*           Unchanged on exit.
+*
+*  BETA   - DOUBLE PRECISION.
+*           On entry,  BETA  specifies the scalar  beta.  When  BETA  is
+*           supplied as zero then C need not be set on input.
+*           Unchanged on exit.
+*
+*  C      - DOUBLE PRECISION array of DIMENSION ( LDC, n ).
+*           Before entry, the leading  m by n  part of the array  C must
+*           contain the matrix  C,  except when  beta  is zero, in which
+*           case C need not be set on entry.
+*           On exit, the array  C  is overwritten by the  m by n  matrix
+*           ( alpha*op( A )*op( B ) + beta*C ).
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            NOTA, NOTB
+      INTEGER            I, INFO, J, L, NCOLA, NROWA, NROWB
+      DOUBLE PRECISION   TEMP
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE         , ZERO
+      PARAMETER        ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     ..
+*     .. Executable Statements ..
+*
+*     Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
+*     transposed and set  NROWA, NCOLA and  NROWB  as the number of rows
+*     and  columns of  A  and the  number of  rows  of  B  respectively.
+*
+      NOTA  = LSAME( TRANSA, 'N' )
+      NOTB  = LSAME( TRANSB, 'N' )
+      IF( NOTA )THEN
+         NROWA = M
+         NCOLA = K
+      ELSE
+         NROWA = K
+         NCOLA = M
+      END IF
+      IF( NOTB )THEN
+         NROWB = K
+      ELSE
+         NROWB = N
+      END IF
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF(      ( .NOT.NOTA                 ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'C' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'T' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.NOTB                 ).AND.
+     $         ( .NOT.LSAME( TRANSB, 'C' ) ).AND.
+     $         ( .NOT.LSAME( TRANSB, 'T' ) )      )THEN
+         INFO = 2
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( K  .LT.0               )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 8
+      ELSE IF( LDB.LT.MAX( 1, NROWB ) )THEN
+         INFO = 10
+      ELSE IF( LDC.LT.MAX( 1, M     ) )THEN
+         INFO = 13
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DGEMM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And if  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( BETA.EQ.ZERO )THEN
+            DO 20, J = 1, N
+               DO 10, I = 1, M
+                  C( I, J ) = ZERO
+   10          CONTINUE
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               DO 30, I = 1, M
+                  C( I, J ) = BETA*C( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( NOTB )THEN
+         IF( NOTA )THEN
+*
+*           Form  C := alpha*A*B + beta*C.
+*
+            DO 90, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 50, I = 1, M
+                     C( I, J ) = ZERO
+   50             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 60, I = 1, M
+                     C( I, J ) = BETA*C( I, J )
+   60             CONTINUE
+               END IF
+               DO 80, L = 1, K
+                  IF( B( L, J ).NE.ZERO )THEN
+                     TEMP = ALPHA*B( L, J )
+                     DO 70, I = 1, M
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+   70                CONTINUE
+                  END IF
+   80          CONTINUE
+   90       CONTINUE
+         ELSE
+*
+*           Form  C := alpha*A'*B + beta*C
+*
+            DO 120, J = 1, N
+               DO 110, I = 1, M
+                  TEMP = ZERO
+                  DO 100, L = 1, K
+                     TEMP = TEMP + A( L, I )*B( L, J )
+  100             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  110          CONTINUE
+  120       CONTINUE
+         END IF
+      ELSE
+         IF( NOTA )THEN
+*
+*           Form  C := alpha*A*B' + beta*C
+*
+            DO 170, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 130, I = 1, M
+                     C( I, J ) = ZERO
+  130             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 140, I = 1, M
+                     C( I, J ) = BETA*C( I, J )
+  140             CONTINUE
+               END IF
+               DO 160, L = 1, K
+                  IF( B( J, L ).NE.ZERO )THEN
+                     TEMP = ALPHA*B( J, L )
+                     DO 150, I = 1, M
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  150                CONTINUE
+                  END IF
+  160          CONTINUE
+  170       CONTINUE
+         ELSE
+*
+*           Form  C := alpha*A'*B' + beta*C
+*
+            DO 200, J = 1, N
+               DO 190, I = 1, M
+                  TEMP = ZERO
+                  DO 180, L = 1, K
+                     TEMP = TEMP + A( L, I )*B( J, L )
+  180             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  190          CONTINUE
+  200       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DGEMM .
+*
+      END
+      SUBROUTINE DGEMV ( TRANS, M, N, ALPHA, A, LDA, X, INCX,
+     $                   BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   ALPHA, BETA
+      INTEGER            INCX, INCY, LDA, M, N
+      CHARACTER*1        TRANS
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DGEMV  performs one of the matrix-vector operations
+*
+*     y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are vectors and A is an
+*  m by n matrix.
+*
+*  Parameters
+*  ==========
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.
+*
+*              TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.
+*
+*              TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of the matrix A.
+*           M must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+*           Before entry, the leading m by n part of the array A must
+*           contain the matrix of coefficients.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*  X      - DOUBLE PRECISION array of DIMENSION at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
+*           and at least
+*           ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
+*           Before entry, the incremented array X must contain the
+*           vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - DOUBLE PRECISION.
+*           On entry, BETA specifies the scalar beta. When BETA is
+*           supplied as zero then Y need not be set on input.
+*           Unchanged on exit.
+*
+*  Y      - DOUBLE PRECISION array of DIMENSION at least
+*           ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
+*           and at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
+*           Before entry with BETA non-zero, the incremented array Y
+*           must contain the vector y. On exit, Y is overwritten by the
+*           updated vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE         , ZERO
+      PARAMETER        ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   TEMP
+      INTEGER            I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 1
+      ELSE IF( M.LT.0 )THEN
+         INFO = 2
+      ELSE IF( N.LT.0 )THEN
+         INFO = 3
+      ELSE IF( LDA.LT.MAX( 1, M ) )THEN
+         INFO = 6
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 8
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 11
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DGEMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set
+*     up the start points in  X  and  Y.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         LENX = N
+         LENY = M
+      ELSE
+         LENX = M
+         LENY = N
+      END IF
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( LENX - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( LENY - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, LENY
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, LENY
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, LENY
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, LENY
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  y := alpha*A*x + y.
+*
+         JX = KX
+         IF( INCY.EQ.1 )THEN
+            DO 60, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  DO 50, I = 1, M
+                     Y( I ) = Y( I ) + TEMP*A( I, J )
+   50             CONTINUE
+               END IF
+               JX = JX + INCX
+   60       CONTINUE
+         ELSE
+            DO 80, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  IY   = KY
+                  DO 70, I = 1, M
+                     Y( IY ) = Y( IY ) + TEMP*A( I, J )
+                     IY      = IY      + INCY
+   70             CONTINUE
+               END IF
+               JX = JX + INCX
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y := alpha*A'*x + y.
+*
+         JY = KY
+         IF( INCX.EQ.1 )THEN
+            DO 100, J = 1, N
+               TEMP = ZERO
+               DO 90, I = 1, M
+                  TEMP = TEMP + A( I, J )*X( I )
+   90          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP
+               JY      = JY      + INCY
+  100       CONTINUE
+         ELSE
+            DO 120, J = 1, N
+               TEMP = ZERO
+               IX   = KX
+               DO 110, I = 1, M
+                  TEMP = TEMP + A( I, J )*X( IX )
+                  IX   = IX   + INCX
+  110          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP
+               JY      = JY      + INCY
+  120       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DGEMV .
+*
+      END
+      SUBROUTINE DGER  ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   ALPHA
+      INTEGER            INCX, INCY, LDA, M, N
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DGER   performs the rank 1 operation
+*
+*     A := alpha*x*y' + A,
+*
+*  where alpha is a scalar, x is an m element vector, y is an n element
+*  vector and A is an m by n matrix.
+*
+*  Parameters
+*  ==========
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of the matrix A.
+*           M must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( m - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the m
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  Y      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y.
+*           Unchanged on exit.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+*           Before entry, the leading m by n part of the array A must
+*           contain the matrix of coefficients. On exit, A is
+*           overwritten by the updated matrix.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER        ( ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   TEMP
+      INTEGER            I, INFO, IX, J, JY, KX
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( M.LT.0 )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 7
+      ELSE IF( LDA.LT.MAX( 1, M ) )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DGER  ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( INCY.GT.0 )THEN
+         JY = 1
+      ELSE
+         JY = 1 - ( N - 1 )*INCY
+      END IF
+      IF( INCX.EQ.1 )THEN
+         DO 20, J = 1, N
+            IF( Y( JY ).NE.ZERO )THEN
+               TEMP = ALPHA*Y( JY )
+               DO 10, I = 1, M
+                  A( I, J ) = A( I, J ) + X( I )*TEMP
+   10          CONTINUE
+            END IF
+            JY = JY + INCY
+   20    CONTINUE
+      ELSE
+         IF( INCX.GT.0 )THEN
+            KX = 1
+         ELSE
+            KX = 1 - ( M - 1 )*INCX
+         END IF
+         DO 40, J = 1, N
+            IF( Y( JY ).NE.ZERO )THEN
+               TEMP = ALPHA*Y( JY )
+               IX   = KX
+               DO 30, I = 1, M
+                  A( I, J ) = A( I, J ) + X( IX )*TEMP
+                  IX        = IX        + INCX
+   30          CONTINUE
+            END IF
+            JY = JY + INCY
+   40    CONTINUE
+      END IF
+*
+      RETURN
+*
+*     End of DGER  .
+*
+      END
+      DOUBLE PRECISION FUNCTION DNRM2 ( N, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER                           INCX, N
+*     .. Array Arguments ..
+      DOUBLE PRECISION                  X( * )
+*     ..
+*
+*  DNRM2 returns the euclidean norm of a vector via the function
+*  name, so that
+*
+*     DNRM2 := sqrt( x'*x )
+*
+*
+*
+*  -- This version written on 25-October-1982.
+*     Modified on 14-October-1993 to inline the call to DLASSQ.
+*     Sven Hammarling, Nag Ltd.
+*
+*
+*     .. Parameters ..
+      DOUBLE PRECISION      ONE         , ZERO
+      PARAMETER           ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      INTEGER               IX
+      DOUBLE PRECISION      ABSXI, NORM, SCALE, SSQ
+*     .. Intrinsic Functions ..
+      INTRINSIC             ABS, SQRT
+*     ..
+*     .. Executable Statements ..
+      IF( N.LT.1 .OR. INCX.LT.1 )THEN
+         NORM  = ZERO
+      ELSE IF( N.EQ.1 )THEN
+         NORM  = ABS( X( 1 ) )
+      ELSE
+         SCALE = ZERO
+         SSQ   = ONE
+*        The following loop is equivalent to this call to the LAPACK
+*        auxiliary routine:
+*        CALL DLASSQ( N, X, INCX, SCALE, SSQ )
+*
+         DO 10, IX = 1, 1 + ( N - 1 )*INCX, INCX
+            IF( X( IX ).NE.ZERO )THEN
+               ABSXI = ABS( X( IX ) )
+               IF( SCALE.LT.ABSXI )THEN
+                  SSQ   = ONE   + SSQ*( SCALE/ABSXI )**2
+                  SCALE = ABSXI
+               ELSE
+                  SSQ   = SSQ   +     ( ABSXI/SCALE )**2
+               END IF
+            END IF
+   10    CONTINUE
+         NORM  = SCALE * SQRT( SSQ )
+      END IF
+*
+      DNRM2 = NORM
+      RETURN
+*
+*     End of DNRM2.
+*
+      END
+      subroutine  drot (n,dx,incx,dy,incy,c,s)
+c
+c     applies a plane rotation.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      double precision dx(*),dy(*),dtemp,c,s
+      integer i,incx,incy,ix,iy,n
+c
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c       code for unequal increments or equal increments not equal
+c         to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        dtemp = c*dx(ix) + s*dy(iy)
+        dy(iy) = c*dy(iy) - s*dx(ix)
+        dx(ix) = dtemp
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      return
+c
+c       code for both increments equal to 1
+c
+   20 do 30 i = 1,n
+        dtemp = c*dx(i) + s*dy(i)
+        dy(i) = c*dy(i) - s*dx(i)
+        dx(i) = dtemp
+   30 continue
+      return
+      end
+      subroutine drotg(da,db,c,s)
+c
+c     construct givens plane rotation.
+c     jack dongarra, linpack, 3/11/78.
+c
+      double precision da,db,c,s,roe,scale,r,z
+c
+      roe = db
+      if( dabs(da) .gt. dabs(db) ) roe = da
+      scale = dabs(da) + dabs(db)
+      if( scale .ne. 0.0d0 ) go to 10
+         c = 1.0d0
+         s = 0.0d0
+         r = 0.0d0
+         z = 0.0d0
+         go to 20
+   10 r = scale*dsqrt((da/scale)**2 + (db/scale)**2)
+      r = dsign(1.0d0,roe)*r
+      c = da/r
+      s = db/r
+      z = 1.0d0
+      if( dabs(da) .gt. dabs(db) ) z = s
+      if( dabs(db) .ge. dabs(da) .and. c .ne. 0.0d0 ) z = 1.0d0/c
+   20 da = r
+      db = z
+      return
+      end
+      SUBROUTINE DROTM (N,DX,INCX,DY,INCY,DPARAM)
+C
+C     APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX
+C
+C     (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN
+C     (DY**T)
+C
+C     DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE
+C     LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY.
+C     WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS..
+C
+C     DFLAG=-1.D0     DFLAG=0.D0        DFLAG=1.D0     DFLAG=-2.D0
+C
+C       (DH11  DH12)    (1.D0  DH12)    (DH11  1.D0)    (1.D0  0.D0)
+C     H=(          )    (          )    (          )    (          )
+C       (DH21  DH22),   (DH21  1.D0),   (-1.D0 DH22),   (0.D0  1.D0).
+C     SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM.
+C
+      DOUBLE PRECISION DFLAG,DH12,DH22,DX,TWO,Z,DH11,DH21,
+     1 DPARAM,DY,W,ZERO
+      DIMENSION DX(1),DY(1),DPARAM(5)
+      DATA ZERO,TWO/0.D0,2.D0/
+C
+      DFLAG=DPARAM(1)
+      IF(N .LE. 0 .OR.(DFLAG+TWO.EQ.ZERO)) GO TO 140
+          IF(.NOT.(INCX.EQ.INCY.AND. INCX .GT.0)) GO TO 70
+C
+               NSTEPS=N*INCX
+               IF(DFLAG) 50,10,30
+   10          CONTINUE
+               DH12=DPARAM(4)
+               DH21=DPARAM(3)
+                    DO 20 I=1,NSTEPS,INCX
+                    W=DX(I)
+                    Z=DY(I)
+                    DX(I)=W+Z*DH12
+                    DY(I)=W*DH21+Z
+   20               CONTINUE
+               GO TO 140
+   30          CONTINUE
+               DH11=DPARAM(2)
+               DH22=DPARAM(5)
+                    DO 40 I=1,NSTEPS,INCX
+                    W=DX(I)
+                    Z=DY(I)
+                    DX(I)=W*DH11+Z
+                    DY(I)=-W+DH22*Z
+   40               CONTINUE
+               GO TO 140
+   50          CONTINUE
+               DH11=DPARAM(2)
+               DH12=DPARAM(4)
+               DH21=DPARAM(3)
+               DH22=DPARAM(5)
+                    DO 60 I=1,NSTEPS,INCX
+                    W=DX(I)
+                    Z=DY(I)
+                    DX(I)=W*DH11+Z*DH12
+                    DY(I)=W*DH21+Z*DH22
+   60               CONTINUE
+               GO TO 140
+   70     CONTINUE
+          KX=1
+          KY=1
+          IF(INCX .LT. 0) KX=1+(1-N)*INCX
+          IF(INCY .LT. 0) KY=1+(1-N)*INCY
+C
+          IF(DFLAG)120,80,100
+   80     CONTINUE
+          DH12=DPARAM(4)
+          DH21=DPARAM(3)
+               DO 90 I=1,N
+               W=DX(KX)
+               Z=DY(KY)
+               DX(KX)=W+Z*DH12
+               DY(KY)=W*DH21+Z
+               KX=KX+INCX
+               KY=KY+INCY
+   90          CONTINUE
+          GO TO 140
+  100     CONTINUE
+          DH11=DPARAM(2)
+          DH22=DPARAM(5)
+               DO 110 I=1,N
+               W=DX(KX)
+               Z=DY(KY)
+               DX(KX)=W*DH11+Z
+               DY(KY)=-W+DH22*Z
+               KX=KX+INCX
+               KY=KY+INCY
+  110          CONTINUE
+          GO TO 140
+  120     CONTINUE
+          DH11=DPARAM(2)
+          DH12=DPARAM(4)
+          DH21=DPARAM(3)
+          DH22=DPARAM(5)
+               DO 130 I=1,N
+               W=DX(KX)
+               Z=DY(KY)
+               DX(KX)=W*DH11+Z*DH12
+               DY(KY)=W*DH21+Z*DH22
+               KX=KX+INCX
+               KY=KY+INCY
+  130          CONTINUE
+  140     CONTINUE
+          RETURN
+          END
+      SUBROUTINE DROTMG (DD1,DD2,DX1,DY1,DPARAM)
+C
+C     CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS
+C     THE SECOND COMPONENT OF THE 2-VECTOR  (DSQRT(DD1)*DX1,DSQRT(DD2)*
+C     DY2)**T.
+C     WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS..
+C
+C     DFLAG=-1.D0     DFLAG=0.D0        DFLAG=1.D0     DFLAG=-2.D0
+C
+C       (DH11  DH12)    (1.D0  DH12)    (DH11  1.D0)    (1.D0  0.D0)
+C     H=(          )    (          )    (          )    (          )
+C       (DH21  DH22),   (DH21  1.D0),   (-1.D0 DH22),   (0.D0  1.D0).
+C     LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22
+C     RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE
+C     VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.)
+C
+C     THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE
+C     INEXACT.  THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE
+C     OF DD1 AND DD2.  ALL ACTUAL SCALING OF DATA IS DONE USING GAM.
+C
+      DOUBLE PRECISION GAM,ONE,RGAMSQ,DD2,DH11,DH21,DPARAM,DP2,
+     1 DQ2,DU,DY1,ZERO,GAMSQ,DD1,DFLAG,DH12,DH22,DP1,DQ1,
+     2 DTEMP,DX1,TWO
+      DIMENSION DPARAM(5)
+C
+      DATA ZERO,ONE,TWO /0.D0,1.D0,2.D0/
+      DATA GAM,GAMSQ,RGAMSQ/4096.D0,16777216.D0,5.9604645D-8/
+      IF(.NOT. DD1 .LT. ZERO) GO TO 10
+C       GO ZERO-H-D-AND-DX1..
+          GO TO 60
+   10 CONTINUE
+C     CASE-DD1-NONNEGATIVE
+      DP2=DD2*DY1
+      IF(.NOT. DP2 .EQ. ZERO) GO TO 20
+          DFLAG=-TWO
+          GO TO 260
+C     REGULAR-CASE..
+   20 CONTINUE
+      DP1=DD1*DX1
+      DQ2=DP2*DY1
+      DQ1=DP1*DX1
+C
+      IF(.NOT. DABS(DQ1) .GT. DABS(DQ2)) GO TO 40
+          DH21=-DY1/DX1
+          DH12=DP2/DP1
+C
+          DU=ONE-DH12*DH21
+C
+          IF(.NOT. DU .LE. ZERO) GO TO 30
+C         GO ZERO-H-D-AND-DX1..
+               GO TO 60
+   30     CONTINUE
+               DFLAG=ZERO
+               DD1=DD1/DU
+               DD2=DD2/DU
+               DX1=DX1*DU
+C         GO SCALE-CHECK..
+               GO TO 100
+   40 CONTINUE
+          IF(.NOT. DQ2 .LT. ZERO) GO TO 50
+C         GO ZERO-H-D-AND-DX1..
+               GO TO 60
+   50     CONTINUE
+               DFLAG=ONE
+               DH11=DP1/DP2
+               DH22=DX1/DY1
+               DU=ONE+DH11*DH22
+               DTEMP=DD2/DU
+               DD2=DD1/DU
+               DD1=DTEMP
+               DX1=DY1*DU
+C         GO SCALE-CHECK
+               GO TO 100
+C     PROCEDURE..ZERO-H-D-AND-DX1..
+   60 CONTINUE
+          DFLAG=-ONE
+          DH11=ZERO
+          DH12=ZERO
+          DH21=ZERO
+          DH22=ZERO
+C
+          DD1=ZERO
+          DD2=ZERO
+          DX1=ZERO
+C         RETURN..
+          GO TO 220
+C     PROCEDURE..FIX-H..
+   70 CONTINUE
+      IF(.NOT. DFLAG .GE. ZERO) GO TO 90
+C
+          IF(.NOT. DFLAG .EQ. ZERO) GO TO 80
+          DH11=ONE
+          DH22=ONE
+          DFLAG=-ONE
+          GO TO 90
+   80     CONTINUE
+          DH21=-ONE
+          DH12=ONE
+          DFLAG=-ONE
+   90 CONTINUE
+      GO TO IGO,(120,150,180,210)
+C     PROCEDURE..SCALE-CHECK
+  100 CONTINUE
+  110     CONTINUE
+          IF(.NOT. DD1 .LE. RGAMSQ) GO TO 130
+               IF(DD1 .EQ. ZERO) GO TO 160
+               ASSIGN 120 TO IGO
+C              FIX-H..
+               GO TO 70
+  120          CONTINUE
+               DD1=DD1*GAM**2
+               DX1=DX1/GAM
+               DH11=DH11/GAM
+               DH12=DH12/GAM
+          GO TO 110
+  130 CONTINUE
+  140     CONTINUE
+          IF(.NOT. DD1 .GE. GAMSQ) GO TO 160
+               ASSIGN 150 TO IGO
+C              FIX-H..
+               GO TO 70
+  150          CONTINUE
+               DD1=DD1/GAM**2
+               DX1=DX1*GAM
+               DH11=DH11*GAM
+               DH12=DH12*GAM
+          GO TO 140
+  160 CONTINUE
+  170     CONTINUE
+          IF(.NOT. DABS(DD2) .LE. RGAMSQ) GO TO 190
+               IF(DD2 .EQ. ZERO) GO TO 220
+               ASSIGN 180 TO IGO
+C              FIX-H..
+               GO TO 70
+  180          CONTINUE
+               DD2=DD2*GAM**2
+               DH21=DH21/GAM
+               DH22=DH22/GAM
+          GO TO 170
+  190 CONTINUE
+  200     CONTINUE
+          IF(.NOT. DABS(DD2) .GE. GAMSQ) GO TO 220
+               ASSIGN 210 TO IGO
+C              FIX-H..
+               GO TO 70
+  210          CONTINUE
+               DD2=DD2/GAM**2
+               DH21=DH21*GAM
+               DH22=DH22*GAM
+          GO TO 200
+  220 CONTINUE
+          IF(DFLAG)250,230,240
+  230     CONTINUE
+               DPARAM(3)=DH21
+               DPARAM(4)=DH12
+               GO TO 260
+  240     CONTINUE
+               DPARAM(2)=DH11
+               DPARAM(5)=DH22
+               GO TO 260
+  250     CONTINUE
+               DPARAM(2)=DH11
+               DPARAM(3)=DH21
+               DPARAM(4)=DH12
+               DPARAM(5)=DH22
+  260 CONTINUE
+          DPARAM(1)=DFLAG
+          RETURN
+      END
+      SUBROUTINE DSBMV ( UPLO, N, K, ALPHA, A, LDA, X, INCX,
+     $                   BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   ALPHA, BETA
+      INTEGER            INCX, INCY, K, LDA, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DSBMV  performs the matrix-vector  operation
+*
+*     y := alpha*A*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are n element vectors and
+*  A is an n by n symmetric band matrix, with k super-diagonals.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the band matrix A is being supplied as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   The upper triangular part of A is
+*                                  being supplied.
+*
+*              UPLO = 'L' or 'l'   The lower triangular part of A is
+*                                  being supplied.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry, K specifies the number of super-diagonals of the
+*           matrix A. K must satisfy  0 .le. K.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
+*           by n part of the array A must contain the upper triangular
+*           band part of the symmetric matrix, supplied column by
+*           column, with the leading diagonal of the matrix in row
+*           ( k + 1 ) of the array, the first super-diagonal starting at
+*           position 2 in row k, and so on. The top left k by k triangle
+*           of the array A is not referenced.
+*           The following program segment will transfer the upper
+*           triangular part of a symmetric band matrix from conventional
+*           full matrix storage to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = K + 1 - J
+*                    DO 10, I = MAX( 1, J - K ), J
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
+*           by n part of the array A must contain the lower triangular
+*           band part of the symmetric matrix, supplied column by
+*           column, with the leading diagonal of the matrix in row 1 of
+*           the array, the first sub-diagonal starting at position 1 in
+*           row 2, and so on. The bottom right k by k triangle of the
+*           array A is not referenced.
+*           The following program segment will transfer the lower
+*           triangular part of a symmetric band matrix from conventional
+*           full matrix storage to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = 1 - J
+*                    DO 10, I = J, MIN( N, J + K )
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           ( k + 1 ).
+*           Unchanged on exit.
+*
+*  X      - DOUBLE PRECISION array of DIMENSION at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the
+*           vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - DOUBLE PRECISION.
+*           On entry, BETA specifies the scalar beta.
+*           Unchanged on exit.
+*
+*  Y      - DOUBLE PRECISION array of DIMENSION at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the
+*           vector y. On exit, Y is overwritten by the updated vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE         , ZERO
+      PARAMETER        ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( K.LT.0 )THEN
+         INFO = 3
+      ELSE IF( LDA.LT.( K + 1 ) )THEN
+         INFO = 6
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 8
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 11
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DSBMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     Set up the start points in  X  and  Y.
+*
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( N - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( N - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of the array A
+*     are accessed sequentially with one pass through A.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, N
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, N
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, N
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, N
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  y  when upper triangle of A is stored.
+*
+         KPLUS1 = K + 1
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               TEMP1 = ALPHA*X( J )
+               TEMP2 = ZERO
+               L     = KPLUS1 - J
+               DO 50, I = MAX( 1, J - K ), J - 1
+                  Y( I ) = Y( I ) + TEMP1*A( L + I, J )
+                  TEMP2  = TEMP2  + A( L + I, J )*X( I )
+   50          CONTINUE
+               Y( J ) = Y( J ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2
+   60       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 80, J = 1, N
+               TEMP1 = ALPHA*X( JX )
+               TEMP2 = ZERO
+               IX    = KX
+               IY    = KY
+               L     = KPLUS1 - J
+               DO 70, I = MAX( 1, J - K ), J - 1
+                  Y( IY ) = Y( IY ) + TEMP1*A( L + I, J )
+                  TEMP2   = TEMP2   + A( L + I, J )*X( IX )
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+   70          CONTINUE
+               Y( JY ) = Y( JY ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+               IF( J.GT.K )THEN
+                  KX = KX + INCX
+                  KY = KY + INCY
+               END IF
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y  when lower triangle of A is stored.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 100, J = 1, N
+               TEMP1  = ALPHA*X( J )
+               TEMP2  = ZERO
+               Y( J ) = Y( J )       + TEMP1*A( 1, J )
+               L      = 1            - J
+               DO 90, I = J + 1, MIN( N, J + K )
+                  Y( I ) = Y( I ) + TEMP1*A( L + I, J )
+                  TEMP2  = TEMP2  + A( L + I, J )*X( I )
+   90          CONTINUE
+               Y( J ) = Y( J ) + ALPHA*TEMP2
+  100       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 120, J = 1, N
+               TEMP1   = ALPHA*X( JX )
+               TEMP2   = ZERO
+               Y( JY ) = Y( JY )       + TEMP1*A( 1, J )
+               L       = 1             - J
+               IX      = JX
+               IY      = JY
+               DO 110, I = J + 1, MIN( N, J + K )
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+                  Y( IY ) = Y( IY ) + TEMP1*A( L + I, J )
+                  TEMP2   = TEMP2   + A( L + I, J )*X( IX )
+  110          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+  120       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DSBMV .
+*
+      END
+      subroutine  dscal(n,da,dx,incx)
+c
+c     scales a vector by a constant.
+c     uses unrolled loops for increment equal to one.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 3/93 to return if incx .le. 0.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      double precision da,dx(*)
+      integer i,incx,m,mp1,n,nincx
+c
+      if( n.le.0 .or. incx.le.0 )return
+      if(incx.eq.1)go to 20
+c
+c        code for increment not equal to 1
+c
+      nincx = n*incx
+      do 10 i = 1,nincx,incx
+        dx(i) = da*dx(i)
+   10 continue
+      return
+c
+c        code for increment equal to 1
+c
+c
+c        clean-up loop
+c
+   20 m = mod(n,5)
+      if( m .eq. 0 ) go to 40
+      do 30 i = 1,m
+        dx(i) = da*dx(i)
+   30 continue
+      if( n .lt. 5 ) return
+   40 mp1 = m + 1
+      do 50 i = mp1,n,5
+        dx(i) = da*dx(i)
+        dx(i + 1) = da*dx(i + 1)
+        dx(i + 2) = da*dx(i + 2)
+        dx(i + 3) = da*dx(i + 3)
+        dx(i + 4) = da*dx(i + 4)
+   50 continue
+      return
+      end
+*DECK DSDOT
+      DOUBLE PRECISION FUNCTION DSDOT (N, SX, INCX, SY, INCY)
+C***BEGIN PROLOGUE  DSDOT
+C***PURPOSE  Compute the inner product of two vectors with extended
+C            precision accumulation and result.
+C***LIBRARY   SLATEC (BLAS)
+C***CATEGORY  D1A4
+C***TYPE      DOUBLE PRECISION (DSDOT-D, DCDOT-C)
+C***KEYWORDS  BLAS, COMPLEX VECTORS, DOT PRODUCT, INNER PRODUCT,
+C             LINEAR ALGEBRA, VECTOR
+C***AUTHOR  Lawson, C. L., (JPL)
+C           Hanson, R. J., (SNLA)
+C           Kincaid, D. R., (U. of Texas)
+C           Krogh, F. T., (JPL)
+C***DESCRIPTION
+C
+C                B L A S  Subprogram
+C    Description of Parameters
+C
+C     --Input--
+C        N  number of elements in input vector(s)
+C       SX  single precision vector with N elements
+C     INCX  storage spacing between elements of SX
+C       SY  single precision vector with N elements
+C     INCY  storage spacing between elements of SY
+C
+C     --Output--
+C    DSDOT  double precision dot product (zero if N.LE.0)
+C
+C     Returns D.P. dot product accumulated in D.P., for S.P. SX and SY
+C     DSDOT = sum for I = 0 to N-1 of  SX(LX+I*INCX) * SY(LY+I*INCY),
+C     where LX = 1 if INCX .GE. 0, else LX = 1+(1-N)*INCX, and LY is
+C     defined in a similar way using INCY.
+C
+C***REFERENCES  C. L. Lawson, R. J. Hanson, D. R. Kincaid and F. T.
+C                 Krogh, Basic linear algebra subprograms for Fortran
+C                 usage, Algorithm No. 539, Transactions on Mathematical
+C                 Software 5, 3 (September 1979), pp. 308-323.
+C***ROUTINES CALLED  (NONE)
+C***REVISION HISTORY  (YYMMDD)
+C   791001  DATE WRITTEN
+C   890831  Modified array declarations.  (WRB)
+C   890831  REVISION DATE from Version 3.2
+C   891214  Prologue converted to Version 4.0 format.  (BAB)
+C   920310  Corrected definition of LX in DESCRIPTION.  (WRB)
+C   920501  Reformatted the REFERENCES section.  (WRB)
+C***END PROLOGUE  DSDOT
+      REAL SX(*),SY(*)
+C***FIRST EXECUTABLE STATEMENT  DSDOT
+      DSDOT = 0.0D0
+      IF (N .LE. 0) RETURN
+      IF (INCX.EQ.INCY .AND. INCX.GT.0) GO TO 20
+C
+C     Code for unequal or nonpositive increments.
+C
+      KX = 1
+      KY = 1
+      IF (INCX .LT. 0) KX = 1+(1-N)*INCX
+      IF (INCY .LT. 0) KY = 1+(1-N)*INCY
+      DO 10 I = 1,N
+        DSDOT = DSDOT + DBLE(SX(KX))*DBLE(SY(KY))
+        KX = KX + INCX
+        KY = KY + INCY
+   10 CONTINUE
+      RETURN
+C
+C     Code for equal, positive, non-unit increments.
+C
+   20 NS = N*INCX
+      DO 30 I = 1,NS,INCX
+        DSDOT = DSDOT + DBLE(SX(I))*DBLE(SY(I))
+   30 CONTINUE
+      RETURN
+      END
+      SUBROUTINE DSPMV ( UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   ALPHA, BETA
+      INTEGER            INCX, INCY, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      DOUBLE PRECISION   AP( * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DSPMV  performs the matrix-vector operation
+*
+*     y := alpha*A*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are n element vectors and
+*  A is an n by n symmetric matrix, supplied in packed form.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the matrix A is supplied in the packed
+*           array AP as follows:
+*
+*              UPLO = 'U' or 'u'   The upper triangular part of A is
+*                                  supplied in AP.
+*
+*              UPLO = 'L' or 'l'   The lower triangular part of A is
+*                                  supplied in AP.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  AP     - DOUBLE PRECISION array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular part of the symmetric matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
+*           and a( 2, 2 ) respectively, and so on.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular part of the symmetric matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
+*           and a( 3, 1 ) respectively, and so on.
+*           Unchanged on exit.
+*
+*  X      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - DOUBLE PRECISION.
+*           On entry, BETA specifies the scalar beta. When BETA is
+*           supplied as zero then Y need not be set on input.
+*           Unchanged on exit.
+*
+*  Y      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y. On exit, Y is overwritten by the updated
+*           vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE         , ZERO
+      PARAMETER        ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 6
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DSPMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     Set up the start points in  X  and  Y.
+*
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( N - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( N - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of the array AP
+*     are accessed sequentially with one pass through AP.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, N
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, N
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, N
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, N
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      KK = 1
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  y  when AP contains the upper triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               TEMP1 = ALPHA*X( J )
+               TEMP2 = ZERO
+               K     = KK
+               DO 50, I = 1, J - 1
+                  Y( I ) = Y( I ) + TEMP1*AP( K )
+                  TEMP2  = TEMP2  + AP( K )*X( I )
+                  K      = K      + 1
+   50          CONTINUE
+               Y( J ) = Y( J ) + TEMP1*AP( KK + J - 1 ) + ALPHA*TEMP2
+               KK     = KK     + J
+   60       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 80, J = 1, N
+               TEMP1 = ALPHA*X( JX )
+               TEMP2 = ZERO
+               IX    = KX
+               IY    = KY
+               DO 70, K = KK, KK + J - 2
+                  Y( IY ) = Y( IY ) + TEMP1*AP( K )
+                  TEMP2   = TEMP2   + AP( K )*X( IX )
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+   70          CONTINUE
+               Y( JY ) = Y( JY ) + TEMP1*AP( KK + J - 1 ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+               KK      = KK      + J
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y  when AP contains the lower triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 100, J = 1, N
+               TEMP1  = ALPHA*X( J )
+               TEMP2  = ZERO
+               Y( J ) = Y( J )       + TEMP1*AP( KK )
+               K      = KK           + 1
+               DO 90, I = J + 1, N
+                  Y( I ) = Y( I ) + TEMP1*AP( K )
+                  TEMP2  = TEMP2  + AP( K )*X( I )
+                  K      = K      + 1
+   90          CONTINUE
+               Y( J ) = Y( J ) + ALPHA*TEMP2
+               KK     = KK     + ( N - J + 1 )
+  100       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 120, J = 1, N
+               TEMP1   = ALPHA*X( JX )
+               TEMP2   = ZERO
+               Y( JY ) = Y( JY )       + TEMP1*AP( KK )
+               IX      = JX
+               IY      = JY
+               DO 110, K = KK + 1, KK + N - J
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+                  Y( IY ) = Y( IY ) + TEMP1*AP( K )
+                  TEMP2   = TEMP2   + AP( K )*X( IX )
+  110          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+               KK      = KK      + ( N - J + 1 )
+  120       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DSPMV .
+*
+      END
+      SUBROUTINE DSPR2 ( UPLO, N, ALPHA, X, INCX, Y, INCY, AP )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   ALPHA
+      INTEGER            INCX, INCY, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      DOUBLE PRECISION   AP( * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DSPR2  performs the symmetric rank 2 operation
+*
+*     A := alpha*x*y' + alpha*y*x' + A,
+*
+*  where alpha is a scalar, x and y are n element vectors and A is an
+*  n by n symmetric matrix, supplied in packed form.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the matrix A is supplied in the packed
+*           array AP as follows:
+*
+*              UPLO = 'U' or 'u'   The upper triangular part of A is
+*                                  supplied in AP.
+*
+*              UPLO = 'L' or 'l'   The lower triangular part of A is
+*                                  supplied in AP.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  Y      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y.
+*           Unchanged on exit.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*  AP     - DOUBLE PRECISION array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with  UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular part of the symmetric matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
+*           and a( 2, 2 ) respectively, and so on. On exit, the array
+*           AP is overwritten by the upper triangular part of the
+*           updated matrix.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular part of the symmetric matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
+*           and a( 3, 1 ) respectively, and so on. On exit, the array
+*           AP is overwritten by the lower triangular part of the
+*           updated matrix.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER        ( ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 7
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DSPR2 ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Set up the start points in X and Y if the increments are not both
+*     unity.
+*
+      IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN
+         IF( INCX.GT.0 )THEN
+            KX = 1
+         ELSE
+            KX = 1 - ( N - 1 )*INCX
+         END IF
+         IF( INCY.GT.0 )THEN
+            KY = 1
+         ELSE
+            KY = 1 - ( N - 1 )*INCY
+         END IF
+         JX = KX
+         JY = KY
+      END IF
+*
+*     Start the operations. In this version the elements of the array AP
+*     are accessed sequentially with one pass through AP.
+*
+      KK = 1
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  A  when upper triangle is stored in AP.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 20, J = 1, N
+               IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*Y( J )
+                  TEMP2 = ALPHA*X( J )
+                  K     = KK
+                  DO 10, I = 1, J
+                     AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2
+                     K       = K       + 1
+   10             CONTINUE
+               END IF
+               KK = KK + J
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*Y( JY )
+                  TEMP2 = ALPHA*X( JX )
+                  IX    = KX
+                  IY    = KY
+                  DO 30, K = KK, KK + J - 1
+                     AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2
+                     IX      = IX      + INCX
+                     IY      = IY      + INCY
+   30             CONTINUE
+               END IF
+               JX = JX + INCX
+               JY = JY + INCY
+               KK = KK + J
+   40       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  A  when lower triangle is stored in AP.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*Y( J )
+                  TEMP2 = ALPHA*X( J )
+                  K     = KK
+                  DO 50, I = J, N
+                     AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2
+                     K       = K       + 1
+   50             CONTINUE
+               END IF
+               KK = KK + N - J + 1
+   60       CONTINUE
+         ELSE
+            DO 80, J = 1, N
+               IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*Y( JY )
+                  TEMP2 = ALPHA*X( JX )
+                  IX    = JX
+                  IY    = JY
+                  DO 70, K = KK, KK + N - J
+                     AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2
+                     IX      = IX      + INCX
+                     IY      = IY      + INCY
+   70             CONTINUE
+               END IF
+               JX = JX + INCX
+               JY = JY + INCY
+               KK = KK + N - J + 1
+   80       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DSPR2 .
+*
+      END
+      SUBROUTINE DSPR  ( UPLO, N, ALPHA, X, INCX, AP )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   ALPHA
+      INTEGER            INCX, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      DOUBLE PRECISION   AP( * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DSPR    performs the symmetric rank 1 operation
+*
+*     A := alpha*x*x' + A,
+*
+*  where alpha is a real scalar, x is an n element vector and A is an
+*  n by n symmetric matrix, supplied in packed form.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the matrix A is supplied in the packed
+*           array AP as follows:
+*
+*              UPLO = 'U' or 'u'   The upper triangular part of A is
+*                                  supplied in AP.
+*
+*              UPLO = 'L' or 'l'   The lower triangular part of A is
+*                                  supplied in AP.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  AP     - DOUBLE PRECISION array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with  UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular part of the symmetric matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
+*           and a( 2, 2 ) respectively, and so on. On exit, the array
+*           AP is overwritten by the upper triangular part of the
+*           updated matrix.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular part of the symmetric matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
+*           and a( 3, 1 ) respectively, and so on. On exit, the array
+*           AP is overwritten by the lower triangular part of the
+*           updated matrix.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER        ( ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   TEMP
+      INTEGER            I, INFO, IX, J, JX, K, KK, KX
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DSPR  ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Set the start point in X if the increment is not unity.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of the array AP
+*     are accessed sequentially with one pass through AP.
+*
+      KK = 1
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  A  when upper triangle is stored in AP.
+*
+         IF( INCX.EQ.1 )THEN
+            DO 20, J = 1, N
+               IF( X( J ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( J )
+                  K    = KK
+                  DO 10, I = 1, J
+                     AP( K ) = AP( K ) + X( I )*TEMP
+                     K       = K       + 1
+   10             CONTINUE
+               END IF
+               KK = KK + J
+   20       CONTINUE
+         ELSE
+            JX = KX
+            DO 40, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  IX   = KX
+                  DO 30, K = KK, KK + J - 1
+                     AP( K ) = AP( K ) + X( IX )*TEMP
+                     IX      = IX      + INCX
+   30             CONTINUE
+               END IF
+               JX = JX + INCX
+               KK = KK + J
+   40       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  A  when lower triangle is stored in AP.
+*
+         IF( INCX.EQ.1 )THEN
+            DO 60, J = 1, N
+               IF( X( J ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( J )
+                  K    = KK
+                  DO 50, I = J, N
+                     AP( K ) = AP( K ) + X( I )*TEMP
+                     K       = K       + 1
+   50             CONTINUE
+               END IF
+               KK = KK + N - J + 1
+   60       CONTINUE
+         ELSE
+            JX = KX
+            DO 80, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  IX   = JX
+                  DO 70, K = KK, KK + N - J
+                     AP( K ) = AP( K ) + X( IX )*TEMP
+                     IX      = IX      + INCX
+   70             CONTINUE
+               END IF
+               JX = JX + INCX
+               KK = KK + N - J + 1
+   80       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DSPR  .
+*
+      END
+      subroutine  dswap (n,dx,incx,dy,incy)
+c
+c     interchanges two vectors.
+c     uses unrolled loops for increments equal one.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      double precision dx(*),dy(*),dtemp
+      integer i,incx,incy,ix,iy,m,mp1,n
+c
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c       code for unequal increments or equal increments not equal
+c         to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        dtemp = dx(ix)
+        dx(ix) = dy(iy)
+        dy(iy) = dtemp
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      return
+c
+c       code for both increments equal to 1
+c
+c
+c       clean-up loop
+c
+   20 m = mod(n,3)
+      if( m .eq. 0 ) go to 40
+      do 30 i = 1,m
+        dtemp = dx(i)
+        dx(i) = dy(i)
+        dy(i) = dtemp
+   30 continue
+      if( n .lt. 3 ) return
+   40 mp1 = m + 1
+      do 50 i = mp1,n,3
+        dtemp = dx(i)
+        dx(i) = dy(i)
+        dy(i) = dtemp
+        dtemp = dx(i + 1)
+        dx(i + 1) = dy(i + 1)
+        dy(i + 1) = dtemp
+        dtemp = dx(i + 2)
+        dx(i + 2) = dy(i + 2)
+        dy(i + 2) = dtemp
+   50 continue
+      return
+      end
+      SUBROUTINE DSYMM ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        SIDE, UPLO
+      INTEGER            M, N, LDA, LDB, LDC
+      DOUBLE PRECISION   ALPHA, BETA
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DSYMM  performs one of the matrix-matrix operations
+*
+*     C := alpha*A*B + beta*C,
+*
+*  or
+*
+*     C := alpha*B*A + beta*C,
+*
+*  where alpha and beta are scalars,  A is a symmetric matrix and  B and
+*  C are  m by n matrices.
+*
+*  Parameters
+*  ==========
+*
+*  SIDE   - CHARACTER*1.
+*           On entry,  SIDE  specifies whether  the  symmetric matrix  A
+*           appears on the  left or right  in the  operation as follows:
+*
+*              SIDE = 'L' or 'l'   C := alpha*A*B + beta*C,
+*
+*              SIDE = 'R' or 'r'   C := alpha*B*A + beta*C,
+*
+*           Unchanged on exit.
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of  the  symmetric  matrix   A  is  to  be
+*           referenced as follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of the
+*                                  symmetric matrix is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of the
+*                                  symmetric matrix is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry,  M  specifies the number of rows of the matrix  C.
+*           M  must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix C.
+*           N  must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is
+*           m  when  SIDE = 'L' or 'l'  and is  n otherwise.
+*           Before entry  with  SIDE = 'L' or 'l',  the  m by m  part of
+*           the array  A  must contain the  symmetric matrix,  such that
+*           when  UPLO = 'U' or 'u', the leading m by m upper triangular
+*           part of the array  A  must contain the upper triangular part
+*           of the  symmetric matrix and the  strictly  lower triangular
+*           part of  A  is not referenced,  and when  UPLO = 'L' or 'l',
+*           the leading  m by m  lower triangular part  of the  array  A
+*           must  contain  the  lower triangular part  of the  symmetric
+*           matrix and the  strictly upper triangular part of  A  is not
+*           referenced.
+*           Before entry  with  SIDE = 'R' or 'r',  the  n by n  part of
+*           the array  A  must contain the  symmetric matrix,  such that
+*           when  UPLO = 'U' or 'u', the leading n by n upper triangular
+*           part of the array  A  must contain the upper triangular part
+*           of the  symmetric matrix and the  strictly  lower triangular
+*           part of  A  is not referenced,  and when  UPLO = 'L' or 'l',
+*           the leading  n by n  lower triangular part  of the  array  A
+*           must  contain  the  lower triangular part  of the  symmetric
+*           matrix and the  strictly upper triangular part of  A  is not
+*           referenced.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program.  When  SIDE = 'L' or 'l'  then
+*           LDA must be at least  max( 1, m ), otherwise  LDA must be at
+*           least  max( 1, n ).
+*           Unchanged on exit.
+*
+*  B      - DOUBLE PRECISION array of DIMENSION ( LDB, n ).
+*           Before entry, the leading  m by n part of the array  B  must
+*           contain the matrix B.
+*           Unchanged on exit.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   LDB  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*  BETA   - DOUBLE PRECISION.
+*           On entry,  BETA  specifies the scalar  beta.  When  BETA  is
+*           supplied as zero then C need not be set on input.
+*           Unchanged on exit.
+*
+*  C      - DOUBLE PRECISION array of DIMENSION ( LDC, n ).
+*           Before entry, the leading  m by n  part of the array  C must
+*           contain the matrix  C,  except when  beta  is zero, in which
+*           case C need not be set on entry.
+*           On exit, the array  C  is overwritten by the  m by n updated
+*           matrix.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, K, NROWA
+      DOUBLE PRECISION   TEMP1, TEMP2
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE         , ZERO
+      PARAMETER        ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     ..
+*     .. Executable Statements ..
+*
+*     Set NROWA as the number of rows of A.
+*
+      IF( LSAME( SIDE, 'L' ) )THEN
+         NROWA = M
+      ELSE
+         NROWA = N
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF(      ( .NOT.LSAME( SIDE, 'L' ) ).AND.
+     $         ( .NOT.LSAME( SIDE, 'R' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.UPPER              ).AND.
+     $         ( .NOT.LSAME( UPLO, 'L' ) )      )THEN
+         INFO = 2
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 7
+      ELSE IF( LDB.LT.MAX( 1, M     ) )THEN
+         INFO = 9
+      ELSE IF( LDC.LT.MAX( 1, M     ) )THEN
+         INFO = 12
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DSYMM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( BETA.EQ.ZERO )THEN
+            DO 20, J = 1, N
+               DO 10, I = 1, M
+                  C( I, J ) = ZERO
+   10          CONTINUE
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               DO 30, I = 1, M
+                  C( I, J ) = BETA*C( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( SIDE, 'L' ) )THEN
+*
+*        Form  C := alpha*A*B + beta*C.
+*
+         IF( UPPER )THEN
+            DO 70, J = 1, N
+               DO 60, I = 1, M
+                  TEMP1 = ALPHA*B( I, J )
+                  TEMP2 = ZERO
+                  DO 50, K = 1, I - 1
+                     C( K, J ) = C( K, J ) + TEMP1    *A( K, I )
+                     TEMP2     = TEMP2     + B( K, J )*A( K, I )
+   50             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J ) +
+     $                           TEMP1*A( I, I ) + ALPHA*TEMP2
+                  END IF
+   60          CONTINUE
+   70       CONTINUE
+         ELSE
+            DO 100, J = 1, N
+               DO 90, I = M, 1, -1
+                  TEMP1 = ALPHA*B( I, J )
+                  TEMP2 = ZERO
+                  DO 80, K = I + 1, M
+                     C( K, J ) = C( K, J ) + TEMP1    *A( K, I )
+                     TEMP2     = TEMP2     + B( K, J )*A( K, I )
+   80             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J ) +
+     $                           TEMP1*A( I, I ) + ALPHA*TEMP2
+                  END IF
+   90          CONTINUE
+  100       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*B*A + beta*C.
+*
+         DO 170, J = 1, N
+            TEMP1 = ALPHA*A( J, J )
+            IF( BETA.EQ.ZERO )THEN
+               DO 110, I = 1, M
+                  C( I, J ) = TEMP1*B( I, J )
+  110          CONTINUE
+            ELSE
+               DO 120, I = 1, M
+                  C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J )
+  120          CONTINUE
+            END IF
+            DO 140, K = 1, J - 1
+               IF( UPPER )THEN
+                  TEMP1 = ALPHA*A( K, J )
+               ELSE
+                  TEMP1 = ALPHA*A( J, K )
+               END IF
+               DO 130, I = 1, M
+                  C( I, J ) = C( I, J ) + TEMP1*B( I, K )
+  130          CONTINUE
+  140       CONTINUE
+            DO 160, K = J + 1, N
+               IF( UPPER )THEN
+                  TEMP1 = ALPHA*A( J, K )
+               ELSE
+                  TEMP1 = ALPHA*A( K, J )
+               END IF
+               DO 150, I = 1, M
+                  C( I, J ) = C( I, J ) + TEMP1*B( I, K )
+  150          CONTINUE
+  160       CONTINUE
+  170    CONTINUE
+      END IF
+*
+      RETURN
+*
+*     End of DSYMM .
+*
+      END
+      SUBROUTINE DSYMV ( UPLO, N, ALPHA, A, LDA, X, INCX,
+     $                   BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   ALPHA, BETA
+      INTEGER            INCX, INCY, LDA, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DSYMV  performs the matrix-vector  operation
+*
+*     y := alpha*A*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are n element vectors and
+*  A is an n by n symmetric matrix.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the array A is to be referenced as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of A
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of A
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular part of the symmetric matrix and the strictly
+*           lower triangular part of A is not referenced.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular part of the symmetric matrix and the strictly
+*           upper triangular part of A is not referenced.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*  X      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - DOUBLE PRECISION.
+*           On entry, BETA specifies the scalar beta. When BETA is
+*           supplied as zero then Y need not be set on input.
+*           Unchanged on exit.
+*
+*  Y      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y. On exit, Y is overwritten by the updated
+*           vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE         , ZERO
+      PARAMETER        ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, KX, KY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 5
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 7
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 10
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DSYMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     Set up the start points in  X  and  Y.
+*
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( N - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( N - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through the triangular part
+*     of A.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, N
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, N
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, N
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, N
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  y  when A is stored in upper triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               TEMP1 = ALPHA*X( J )
+               TEMP2 = ZERO
+               DO 50, I = 1, J - 1
+                  Y( I ) = Y( I ) + TEMP1*A( I, J )
+                  TEMP2  = TEMP2  + A( I, J )*X( I )
+   50          CONTINUE
+               Y( J ) = Y( J ) + TEMP1*A( J, J ) + ALPHA*TEMP2
+   60       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 80, J = 1, N
+               TEMP1 = ALPHA*X( JX )
+               TEMP2 = ZERO
+               IX    = KX
+               IY    = KY
+               DO 70, I = 1, J - 1
+                  Y( IY ) = Y( IY ) + TEMP1*A( I, J )
+                  TEMP2   = TEMP2   + A( I, J )*X( IX )
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+   70          CONTINUE
+               Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y  when A is stored in lower triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 100, J = 1, N
+               TEMP1  = ALPHA*X( J )
+               TEMP2  = ZERO
+               Y( J ) = Y( J )       + TEMP1*A( J, J )
+               DO 90, I = J + 1, N
+                  Y( I ) = Y( I ) + TEMP1*A( I, J )
+                  TEMP2  = TEMP2  + A( I, J )*X( I )
+   90          CONTINUE
+               Y( J ) = Y( J ) + ALPHA*TEMP2
+  100       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 120, J = 1, N
+               TEMP1   = ALPHA*X( JX )
+               TEMP2   = ZERO
+               Y( JY ) = Y( JY )       + TEMP1*A( J, J )
+               IX      = JX
+               IY      = JY
+               DO 110, I = J + 1, N
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+                  Y( IY ) = Y( IY ) + TEMP1*A( I, J )
+                  TEMP2   = TEMP2   + A( I, J )*X( IX )
+  110          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+  120       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DSYMV .
+*
+      END
+      SUBROUTINE DSYR2 ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   ALPHA
+      INTEGER            INCX, INCY, LDA, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DSYR2  performs the symmetric rank 2 operation
+*
+*     A := alpha*x*y' + alpha*y*x' + A,
+*
+*  where alpha is a scalar, x and y are n element vectors and A is an n
+*  by n symmetric matrix.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the array A is to be referenced as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of A
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of A
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  Y      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y.
+*           Unchanged on exit.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular part of the symmetric matrix and the strictly
+*           lower triangular part of A is not referenced. On exit, the
+*           upper triangular part of the array A is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular part of the symmetric matrix and the strictly
+*           upper triangular part of A is not referenced. On exit, the
+*           lower triangular part of the array A is overwritten by the
+*           lower triangular part of the updated matrix.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER        ( ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, KX, KY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 7
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DSYR2 ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Set up the start points in X and Y if the increments are not both
+*     unity.
+*
+      IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN
+         IF( INCX.GT.0 )THEN
+            KX = 1
+         ELSE
+            KX = 1 - ( N - 1 )*INCX
+         END IF
+         IF( INCY.GT.0 )THEN
+            KY = 1
+         ELSE
+            KY = 1 - ( N - 1 )*INCY
+         END IF
+         JX = KX
+         JY = KY
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through the triangular part
+*     of A.
+*
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  A  when A is stored in the upper triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 20, J = 1, N
+               IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*Y( J )
+                  TEMP2 = ALPHA*X( J )
+                  DO 10, I = 1, J
+                     A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2
+   10             CONTINUE
+               END IF
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*Y( JY )
+                  TEMP2 = ALPHA*X( JX )
+                  IX    = KX
+                  IY    = KY
+                  DO 30, I = 1, J
+                     A( I, J ) = A( I, J ) + X( IX )*TEMP1
+     $                                     + Y( IY )*TEMP2
+                     IX        = IX        + INCX
+                     IY        = IY        + INCY
+   30             CONTINUE
+               END IF
+               JX = JX + INCX
+               JY = JY + INCY
+   40       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  A  when A is stored in the lower triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*Y( J )
+                  TEMP2 = ALPHA*X( J )
+                  DO 50, I = J, N
+                     A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2
+   50             CONTINUE
+               END IF
+   60       CONTINUE
+         ELSE
+            DO 80, J = 1, N
+               IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*Y( JY )
+                  TEMP2 = ALPHA*X( JX )
+                  IX    = JX
+                  IY    = JY
+                  DO 70, I = J, N
+                     A( I, J ) = A( I, J ) + X( IX )*TEMP1
+     $                                     + Y( IY )*TEMP2
+                     IX        = IX        + INCX
+                     IY        = IY        + INCY
+   70             CONTINUE
+               END IF
+               JX = JX + INCX
+               JY = JY + INCY
+   80       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DSYR2 .
+*
+      END
+      SUBROUTINE DSYR2K( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        UPLO, TRANS
+      INTEGER            N, K, LDA, LDB, LDC
+      DOUBLE PRECISION   ALPHA, BETA
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DSYR2K  performs one of the symmetric rank 2k operations
+*
+*     C := alpha*A*B' + alpha*B*A' + beta*C,
+*
+*  or
+*
+*     C := alpha*A'*B + alpha*B'*A + beta*C,
+*
+*  where  alpha and beta  are scalars, C is an  n by n  symmetric matrix
+*  and  A and B  are  n by k  matrices  in the  first  case  and  k by n
+*  matrices in the second case.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of the  array  C  is to be  referenced  as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the  upper triangular part of  C
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the  lower triangular part of  C
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry,  TRANS  specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   C := alpha*A*B' + alpha*B*A' +
+*                                        beta*C.
+*
+*              TRANS = 'T' or 't'   C := alpha*A'*B + alpha*B'*A +
+*                                        beta*C.
+*
+*              TRANS = 'C' or 'c'   C := alpha*A'*B + alpha*B'*A +
+*                                        beta*C.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry,  N specifies the order of the matrix C.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with  TRANS = 'N' or 'n',  K  specifies  the number
+*           of  columns  of the  matrices  A and B,  and on  entry  with
+*           TRANS = 'T' or 't' or 'C' or 'c',  K  specifies  the  number
+*           of rows of the matrices  A and B.  K must be at least  zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  A  must contain the matrix  A,  otherwise
+*           the leading  k by n  part of the array  A  must contain  the
+*           matrix A.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDA must be at least  max( 1, n ), otherwise  LDA must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  B      - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  B  must contain the matrix  B,  otherwise
+*           the leading  k by n  part of the array  B  must contain  the
+*           matrix B.
+*           Unchanged on exit.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDB must be at least  max( 1, n ), otherwise  LDB must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  BETA   - DOUBLE PRECISION.
+*           On entry, BETA specifies the scalar beta.
+*           Unchanged on exit.
+*
+*  C      - DOUBLE PRECISION array of DIMENSION ( LDC, n ).
+*           Before entry  with  UPLO = 'U' or 'u',  the leading  n by n
+*           upper triangular part of the array C must contain the upper
+*           triangular part  of the  symmetric matrix  and the strictly
+*           lower triangular part of C is not referenced.  On exit, the
+*           upper triangular part of the array  C is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry  with  UPLO = 'L' or 'l',  the leading  n by n
+*           lower triangular part of the array C must contain the lower
+*           triangular part  of the  symmetric matrix  and the strictly
+*           upper triangular part of C is not referenced.  On exit, the
+*           lower triangular part of the array  C is overwritten by the
+*           lower triangular part of the updated matrix.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, L, NROWA
+      DOUBLE PRECISION   TEMP1, TEMP2
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE         , ZERO
+      PARAMETER        ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         NROWA = N
+      ELSE
+         NROWA = K
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+      INFO = 0
+      IF(      ( .NOT.UPPER               ).AND.
+     $         ( .NOT.LSAME( UPLO , 'L' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANS, 'T' ) ).AND.
+     $         ( .NOT.LSAME( TRANS, 'C' ) )      )THEN
+         INFO = 2
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( K  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 7
+      ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN
+         INFO = 9
+      ELSE IF( LDC.LT.MAX( 1, N     ) )THEN
+         INFO = 12
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DSYR2K', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.
+     $    ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( UPPER )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 20, J = 1, N
+                  DO 10, I = 1, J
+                     C( I, J ) = ZERO
+   10             CONTINUE
+   20          CONTINUE
+            ELSE
+               DO 40, J = 1, N
+                  DO 30, I = 1, J
+                     C( I, J ) = BETA*C( I, J )
+   30             CONTINUE
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( BETA.EQ.ZERO )THEN
+               DO 60, J = 1, N
+                  DO 50, I = J, N
+                     C( I, J ) = ZERO
+   50             CONTINUE
+   60          CONTINUE
+            ELSE
+               DO 80, J = 1, N
+                  DO 70, I = J, N
+                     C( I, J ) = BETA*C( I, J )
+   70             CONTINUE
+   80          CONTINUE
+            END IF
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  C := alpha*A*B' + alpha*B*A' + C.
+*
+         IF( UPPER )THEN
+            DO 130, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 90, I = 1, J
+                     C( I, J ) = ZERO
+   90             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 100, I = 1, J
+                     C( I, J ) = BETA*C( I, J )
+  100             CONTINUE
+               END IF
+               DO 120, L = 1, K
+                  IF( ( A( J, L ).NE.ZERO ).OR.
+     $                ( B( J, L ).NE.ZERO )     )THEN
+                     TEMP1 = ALPHA*B( J, L )
+                     TEMP2 = ALPHA*A( J, L )
+                     DO 110, I = 1, J
+                        C( I, J ) = C( I, J ) +
+     $                              A( I, L )*TEMP1 + B( I, L )*TEMP2
+  110                CONTINUE
+                  END IF
+  120          CONTINUE
+  130       CONTINUE
+         ELSE
+            DO 180, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 140, I = J, N
+                     C( I, J ) = ZERO
+  140             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 150, I = J, N
+                     C( I, J ) = BETA*C( I, J )
+  150             CONTINUE
+               END IF
+               DO 170, L = 1, K
+                  IF( ( A( J, L ).NE.ZERO ).OR.
+     $                ( B( J, L ).NE.ZERO )     )THEN
+                     TEMP1 = ALPHA*B( J, L )
+                     TEMP2 = ALPHA*A( J, L )
+                     DO 160, I = J, N
+                        C( I, J ) = C( I, J ) +
+     $                              A( I, L )*TEMP1 + B( I, L )*TEMP2
+  160                CONTINUE
+                  END IF
+  170          CONTINUE
+  180       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*A'*B + alpha*B'*A + C.
+*
+         IF( UPPER )THEN
+            DO 210, J = 1, N
+               DO 200, I = 1, J
+                  TEMP1 = ZERO
+                  TEMP2 = ZERO
+                  DO 190, L = 1, K
+                     TEMP1 = TEMP1 + A( L, I )*B( L, J )
+                     TEMP2 = TEMP2 + B( L, I )*A( L, J )
+  190             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J ) +
+     $                           ALPHA*TEMP1 + ALPHA*TEMP2
+                  END IF
+  200          CONTINUE
+  210       CONTINUE
+         ELSE
+            DO 240, J = 1, N
+               DO 230, I = J, N
+                  TEMP1 = ZERO
+                  TEMP2 = ZERO
+                  DO 220, L = 1, K
+                     TEMP1 = TEMP1 + A( L, I )*B( L, J )
+                     TEMP2 = TEMP2 + B( L, I )*A( L, J )
+  220             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J ) +
+     $                           ALPHA*TEMP1 + ALPHA*TEMP2
+                  END IF
+  230          CONTINUE
+  240       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DSYR2K.
+*
+      END
+      SUBROUTINE DSYR  ( UPLO, N, ALPHA, X, INCX, A, LDA )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   ALPHA
+      INTEGER            INCX, LDA, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DSYR   performs the symmetric rank 1 operation
+*
+*     A := alpha*x*x' + A,
+*
+*  where alpha is a real scalar, x is an n element vector and A is an
+*  n by n symmetric matrix.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the array A is to be referenced as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of A
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of A
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular part of the symmetric matrix and the strictly
+*           lower triangular part of A is not referenced. On exit, the
+*           upper triangular part of the array A is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular part of the symmetric matrix and the strictly
+*           upper triangular part of A is not referenced. On exit, the
+*           lower triangular part of the array A is overwritten by the
+*           lower triangular part of the updated matrix.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER        ( ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   TEMP
+      INTEGER            I, INFO, IX, J, JX, KX
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 7
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DSYR  ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Set the start point in X if the increment is not unity.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through the triangular part
+*     of A.
+*
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  A  when A is stored in upper triangle.
+*
+         IF( INCX.EQ.1 )THEN
+            DO 20, J = 1, N
+               IF( X( J ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( J )
+                  DO 10, I = 1, J
+                     A( I, J ) = A( I, J ) + X( I )*TEMP
+   10             CONTINUE
+               END IF
+   20       CONTINUE
+         ELSE
+            JX = KX
+            DO 40, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  IX   = KX
+                  DO 30, I = 1, J
+                     A( I, J ) = A( I, J ) + X( IX )*TEMP
+                     IX        = IX        + INCX
+   30             CONTINUE
+               END IF
+               JX = JX + INCX
+   40       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  A  when A is stored in lower triangle.
+*
+         IF( INCX.EQ.1 )THEN
+            DO 60, J = 1, N
+               IF( X( J ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( J )
+                  DO 50, I = J, N
+                     A( I, J ) = A( I, J ) + X( I )*TEMP
+   50             CONTINUE
+               END IF
+   60       CONTINUE
+         ELSE
+            JX = KX
+            DO 80, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  IX   = JX
+                  DO 70, I = J, N
+                     A( I, J ) = A( I, J ) + X( IX )*TEMP
+                     IX        = IX        + INCX
+   70             CONTINUE
+               END IF
+               JX = JX + INCX
+   80       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DSYR  .
+*
+      END
+      SUBROUTINE DSYRK ( UPLO, TRANS, N, K, ALPHA, A, LDA,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        UPLO, TRANS
+      INTEGER            N, K, LDA, LDC
+      DOUBLE PRECISION   ALPHA, BETA
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DSYRK  performs one of the symmetric rank k operations
+*
+*     C := alpha*A*A' + beta*C,
+*
+*  or
+*
+*     C := alpha*A'*A + beta*C,
+*
+*  where  alpha and beta  are scalars, C is an  n by n  symmetric matrix
+*  and  A  is an  n by k  matrix in the first case and a  k by n  matrix
+*  in the second case.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of the  array  C  is to be  referenced  as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the  upper triangular part of  C
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the  lower triangular part of  C
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry,  TRANS  specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   C := alpha*A*A' + beta*C.
+*
+*              TRANS = 'T' or 't'   C := alpha*A'*A + beta*C.
+*
+*              TRANS = 'C' or 'c'   C := alpha*A'*A + beta*C.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry,  N specifies the order of the matrix C.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with  TRANS = 'N' or 'n',  K  specifies  the number
+*           of  columns   of  the   matrix   A,   and  on   entry   with
+*           TRANS = 'T' or 't' or 'C' or 'c',  K  specifies  the  number
+*           of rows of the matrix  A.  K must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  A  must contain the matrix  A,  otherwise
+*           the leading  k by n  part of the array  A  must contain  the
+*           matrix A.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDA must be at least  max( 1, n ), otherwise  LDA must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  BETA   - DOUBLE PRECISION.
+*           On entry, BETA specifies the scalar beta.
+*           Unchanged on exit.
+*
+*  C      - DOUBLE PRECISION array of DIMENSION ( LDC, n ).
+*           Before entry  with  UPLO = 'U' or 'u',  the leading  n by n
+*           upper triangular part of the array C must contain the upper
+*           triangular part  of the  symmetric matrix  and the strictly
+*           lower triangular part of C is not referenced.  On exit, the
+*           upper triangular part of the array  C is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry  with  UPLO = 'L' or 'l',  the leading  n by n
+*           lower triangular part of the array C must contain the lower
+*           triangular part  of the  symmetric matrix  and the strictly
+*           upper triangular part of C is not referenced.  On exit, the
+*           lower triangular part of the array  C is overwritten by the
+*           lower triangular part of the updated matrix.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, L, NROWA
+      DOUBLE PRECISION   TEMP
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE ,         ZERO
+      PARAMETER        ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         NROWA = N
+      ELSE
+         NROWA = K
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+      INFO = 0
+      IF(      ( .NOT.UPPER               ).AND.
+     $         ( .NOT.LSAME( UPLO , 'L' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANS, 'T' ) ).AND.
+     $         ( .NOT.LSAME( TRANS, 'C' ) )      )THEN
+         INFO = 2
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( K  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 7
+      ELSE IF( LDC.LT.MAX( 1, N     ) )THEN
+         INFO = 10
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DSYRK ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.
+     $    ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( UPPER )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 20, J = 1, N
+                  DO 10, I = 1, J
+                     C( I, J ) = ZERO
+   10             CONTINUE
+   20          CONTINUE
+            ELSE
+               DO 40, J = 1, N
+                  DO 30, I = 1, J
+                     C( I, J ) = BETA*C( I, J )
+   30             CONTINUE
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( BETA.EQ.ZERO )THEN
+               DO 60, J = 1, N
+                  DO 50, I = J, N
+                     C( I, J ) = ZERO
+   50             CONTINUE
+   60          CONTINUE
+            ELSE
+               DO 80, J = 1, N
+                  DO 70, I = J, N
+                     C( I, J ) = BETA*C( I, J )
+   70             CONTINUE
+   80          CONTINUE
+            END IF
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  C := alpha*A*A' + beta*C.
+*
+         IF( UPPER )THEN
+            DO 130, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 90, I = 1, J
+                     C( I, J ) = ZERO
+   90             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 100, I = 1, J
+                     C( I, J ) = BETA*C( I, J )
+  100             CONTINUE
+               END IF
+               DO 120, L = 1, K
+                  IF( A( J, L ).NE.ZERO )THEN
+                     TEMP = ALPHA*A( J, L )
+                     DO 110, I = 1, J
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  110                CONTINUE
+                  END IF
+  120          CONTINUE
+  130       CONTINUE
+         ELSE
+            DO 180, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 140, I = J, N
+                     C( I, J ) = ZERO
+  140             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 150, I = J, N
+                     C( I, J ) = BETA*C( I, J )
+  150             CONTINUE
+               END IF
+               DO 170, L = 1, K
+                  IF( A( J, L ).NE.ZERO )THEN
+                     TEMP      = ALPHA*A( J, L )
+                     DO 160, I = J, N
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  160                CONTINUE
+                  END IF
+  170          CONTINUE
+  180       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*A'*A + beta*C.
+*
+         IF( UPPER )THEN
+            DO 210, J = 1, N
+               DO 200, I = 1, J
+                  TEMP = ZERO
+                  DO 190, L = 1, K
+                     TEMP = TEMP + A( L, I )*A( L, J )
+  190             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  200          CONTINUE
+  210       CONTINUE
+         ELSE
+            DO 240, J = 1, N
+               DO 230, I = J, N
+                  TEMP = ZERO
+                  DO 220, L = 1, K
+                     TEMP = TEMP + A( L, I )*A( L, J )
+  220             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  230          CONTINUE
+  240       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DSYRK .
+*
+      END
+      SUBROUTINE DTBMV ( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, K, LDA, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DTBMV  performs one of the matrix-vector operations
+*
+*     x := A*x,   or   x := A'*x,
+*
+*  where x is an n element vector and  A is an n by n unit, or non-unit,
+*  upper or lower triangular band matrix, with ( k + 1 ) diagonals.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   x := A*x.
+*
+*              TRANS = 'T' or 't'   x := A'*x.
+*
+*              TRANS = 'C' or 'c'   x := A'*x.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with UPLO = 'U' or 'u', K specifies the number of
+*           super-diagonals of the matrix A.
+*           On entry with UPLO = 'L' or 'l', K specifies the number of
+*           sub-diagonals of the matrix A.
+*           K must satisfy  0 .le. K.
+*           Unchanged on exit.
+*
+*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
+*           by n part of the array A must contain the upper triangular
+*           band part of the matrix of coefficients, supplied column by
+*           column, with the leading diagonal of the matrix in row
+*           ( k + 1 ) of the array, the first super-diagonal starting at
+*           position 2 in row k, and so on. The top left k by k triangle
+*           of the array A is not referenced.
+*           The following program segment will transfer an upper
+*           triangular band matrix from conventional full matrix storage
+*           to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = K + 1 - J
+*                    DO 10, I = MAX( 1, J - K ), J
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
+*           by n part of the array A must contain the lower triangular
+*           band part of the matrix of coefficients, supplied column by
+*           column, with the leading diagonal of the matrix in row 1 of
+*           the array, the first sub-diagonal starting at position 1 in
+*           row 2, and so on. The bottom right k by k triangle of the
+*           array A is not referenced.
+*           The following program segment will transfer a lower
+*           triangular band matrix from conventional full matrix storage
+*           to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = 1 - J
+*                    DO 10, I = J, MIN( N, J + K )
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Note that when DIAG = 'U' or 'u' the elements of the array A
+*           corresponding to the diagonal elements of the matrix are not
+*           referenced, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           ( k + 1 ).
+*           Unchanged on exit.
+*
+*  X      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x. On exit, X is overwritten with the
+*           tranformed vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER        ( ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   TEMP
+      INTEGER            I, INFO, IX, J, JX, KPLUS1, KX, L
+      LOGICAL            NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( K.LT.0 )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.( K + 1 ) )THEN
+         INFO = 7
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DTBMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOUNIT = LSAME( DIAG, 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX   too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*         Form  x := A*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KPLUS1 = K + 1
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     L    = KPLUS1 - J
+                     DO 10, I = MAX( 1, J - K ), J - 1
+                        X( I ) = X( I ) + TEMP*A( L + I, J )
+   10                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*A( KPLUS1, J )
+                  END IF
+   20          CONTINUE
+            ELSE
+               JX = KX
+               DO 40, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     L    = KPLUS1  - J
+                     DO 30, I = MAX( 1, J - K ), J - 1
+                        X( IX ) = X( IX ) + TEMP*A( L + I, J )
+                        IX      = IX      + INCX
+   30                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*A( KPLUS1, J )
+                  END IF
+                  JX = JX + INCX
+                  IF( J.GT.K )
+     $               KX = KX + INCX
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     L    = 1      - J
+                     DO 50, I = MIN( N, J + K ), J + 1, -1
+                        X( I ) = X( I ) + TEMP*A( L + I, J )
+   50                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*A( 1, J )
+                  END IF
+   60          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 80, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     L    = 1       - J
+                     DO 70, I = MIN( N, J + K ), J + 1, -1
+                        X( IX ) = X( IX ) + TEMP*A( L + I, J )
+                        IX      = IX      - INCX
+   70                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*A( 1, J )
+                  END IF
+                  JX = JX - INCX
+                  IF( ( N - J ).GE.K )
+     $               KX = KX - INCX
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := A'*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KPLUS1 = K + 1
+            IF( INCX.EQ.1 )THEN
+               DO 100, J = N, 1, -1
+                  TEMP = X( J )
+                  L    = KPLUS1 - J
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( KPLUS1, J )
+                  DO 90, I = J - 1, MAX( 1, J - K ), -1
+                     TEMP = TEMP + A( L + I, J )*X( I )
+   90             CONTINUE
+                  X( J ) = TEMP
+  100          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 120, J = N, 1, -1
+                  TEMP = X( JX )
+                  KX   = KX      - INCX
+                  IX   = KX
+                  L    = KPLUS1  - J
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( KPLUS1, J )
+                  DO 110, I = J - 1, MAX( 1, J - K ), -1
+                     TEMP = TEMP + A( L + I, J )*X( IX )
+                     IX   = IX   - INCX
+  110             CONTINUE
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+  120          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 140, J = 1, N
+                  TEMP = X( J )
+                  L    = 1      - J
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( 1, J )
+                  DO 130, I = J + 1, MIN( N, J + K )
+                     TEMP = TEMP + A( L + I, J )*X( I )
+  130             CONTINUE
+                  X( J ) = TEMP
+  140          CONTINUE
+            ELSE
+               JX = KX
+               DO 160, J = 1, N
+                  TEMP = X( JX )
+                  KX   = KX      + INCX
+                  IX   = KX
+                  L    = 1       - J
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( 1, J )
+                  DO 150, I = J + 1, MIN( N, J + K )
+                     TEMP = TEMP + A( L + I, J )*X( IX )
+                     IX   = IX   + INCX
+  150             CONTINUE
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+  160          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DTBMV .
+*
+      END
+      SUBROUTINE DTBSV ( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, K, LDA, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DTBSV  solves one of the systems of equations
+*
+*     A*x = b,   or   A'*x = b,
+*
+*  where b and x are n element vectors and A is an n by n unit, or
+*  non-unit, upper or lower triangular band matrix, with ( k + 1 )
+*  diagonals.
+*
+*  No test for singularity or near-singularity is included in this
+*  routine. Such tests must be performed before calling this routine.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the equations to be solved as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   A*x = b.
+*
+*              TRANS = 'T' or 't'   A'*x = b.
+*
+*              TRANS = 'C' or 'c'   A'*x = b.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with UPLO = 'U' or 'u', K specifies the number of
+*           super-diagonals of the matrix A.
+*           On entry with UPLO = 'L' or 'l', K specifies the number of
+*           sub-diagonals of the matrix A.
+*           K must satisfy  0 .le. K.
+*           Unchanged on exit.
+*
+*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
+*           by n part of the array A must contain the upper triangular
+*           band part of the matrix of coefficients, supplied column by
+*           column, with the leading diagonal of the matrix in row
+*           ( k + 1 ) of the array, the first super-diagonal starting at
+*           position 2 in row k, and so on. The top left k by k triangle
+*           of the array A is not referenced.
+*           The following program segment will transfer an upper
+*           triangular band matrix from conventional full matrix storage
+*           to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = K + 1 - J
+*                    DO 10, I = MAX( 1, J - K ), J
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
+*           by n part of the array A must contain the lower triangular
+*           band part of the matrix of coefficients, supplied column by
+*           column, with the leading diagonal of the matrix in row 1 of
+*           the array, the first sub-diagonal starting at position 1 in
+*           row 2, and so on. The bottom right k by k triangle of the
+*           array A is not referenced.
+*           The following program segment will transfer a lower
+*           triangular band matrix from conventional full matrix storage
+*           to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = 1 - J
+*                    DO 10, I = J, MIN( N, J + K )
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Note that when DIAG = 'U' or 'u' the elements of the array A
+*           corresponding to the diagonal elements of the matrix are not
+*           referenced, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           ( k + 1 ).
+*           Unchanged on exit.
+*
+*  X      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element right-hand side vector b. On exit, X is overwritten
+*           with the solution vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER        ( ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   TEMP
+      INTEGER            I, INFO, IX, J, JX, KPLUS1, KX, L
+      LOGICAL            NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( K.LT.0 )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.( K + 1 ) )THEN
+         INFO = 7
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DTBSV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOUNIT = LSAME( DIAG, 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed by sequentially with one pass through A.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x := inv( A )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KPLUS1 = K + 1
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     L = KPLUS1 - J
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/A( KPLUS1, J )
+                     TEMP = X( J )
+                     DO 10, I = J - 1, MAX( 1, J - K ), -1
+                        X( I ) = X( I ) - TEMP*A( L + I, J )
+   10                CONTINUE
+                  END IF
+   20          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 40, J = N, 1, -1
+                  KX = KX - INCX
+                  IF( X( JX ).NE.ZERO )THEN
+                     IX = KX
+                     L  = KPLUS1 - J
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/A( KPLUS1, J )
+                     TEMP = X( JX )
+                     DO 30, I = J - 1, MAX( 1, J - K ), -1
+                        X( IX ) = X( IX ) - TEMP*A( L + I, J )
+                        IX      = IX      - INCX
+   30                CONTINUE
+                  END IF
+                  JX = JX - INCX
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     L = 1 - J
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/A( 1, J )
+                     TEMP = X( J )
+                     DO 50, I = J + 1, MIN( N, J + K )
+                        X( I ) = X( I ) - TEMP*A( L + I, J )
+   50                CONTINUE
+                  END IF
+   60          CONTINUE
+            ELSE
+               JX = KX
+               DO 80, J = 1, N
+                  KX = KX + INCX
+                  IF( X( JX ).NE.ZERO )THEN
+                     IX = KX
+                     L  = 1  - J
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/A( 1, J )
+                     TEMP = X( JX )
+                     DO 70, I = J + 1, MIN( N, J + K )
+                        X( IX ) = X( IX ) - TEMP*A( L + I, J )
+                        IX      = IX      + INCX
+   70                CONTINUE
+                  END IF
+                  JX = JX + INCX
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := inv( A')*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KPLUS1 = K + 1
+            IF( INCX.EQ.1 )THEN
+               DO 100, J = 1, N
+                  TEMP = X( J )
+                  L    = KPLUS1 - J
+                  DO 90, I = MAX( 1, J - K ), J - 1
+                     TEMP = TEMP - A( L + I, J )*X( I )
+   90             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/A( KPLUS1, J )
+                  X( J ) = TEMP
+  100          CONTINUE
+            ELSE
+               JX = KX
+               DO 120, J = 1, N
+                  TEMP = X( JX )
+                  IX   = KX
+                  L    = KPLUS1  - J
+                  DO 110, I = MAX( 1, J - K ), J - 1
+                     TEMP = TEMP - A( L + I, J )*X( IX )
+                     IX   = IX   + INCX
+  110             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/A( KPLUS1, J )
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+                  IF( J.GT.K )
+     $               KX = KX + INCX
+  120          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 140, J = N, 1, -1
+                  TEMP = X( J )
+                  L    = 1      - J
+                  DO 130, I = MIN( N, J + K ), J + 1, -1
+                     TEMP = TEMP - A( L + I, J )*X( I )
+  130             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/A( 1, J )
+                  X( J ) = TEMP
+  140          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 160, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = KX
+                  L    = 1       - J
+                  DO 150, I = MIN( N, J + K ), J + 1, -1
+                     TEMP = TEMP - A( L + I, J )*X( IX )
+                     IX   = IX   - INCX
+  150             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/A( 1, J )
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+                  IF( ( N - J ).GE.K )
+     $               KX = KX - INCX
+  160          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DTBSV .
+*
+      END
+      SUBROUTINE DTPMV ( UPLO, TRANS, DIAG, N, AP, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      DOUBLE PRECISION   AP( * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DTPMV  performs one of the matrix-vector operations
+*
+*     x := A*x,   or   x := A'*x,
+*
+*  where x is an n element vector and  A is an n by n unit, or non-unit,
+*  upper or lower triangular matrix, supplied in packed form.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   x := A*x.
+*
+*              TRANS = 'T' or 't'   x := A'*x.
+*
+*              TRANS = 'C' or 'c'   x := A'*x.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  AP     - DOUBLE PRECISION array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with  UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular matrix packed sequentially,
+*           column by column, so that AP( 1 ) contains a( 1, 1 ),
+*           AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 )
+*           respectively, and so on.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular matrix packed sequentially,
+*           column by column, so that AP( 1 ) contains a( 1, 1 ),
+*           AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 )
+*           respectively, and so on.
+*           Note that when  DIAG = 'U' or 'u', the diagonal elements of
+*           A are not referenced, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  X      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x. On exit, X is overwritten with the
+*           tranformed vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER        ( ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   TEMP
+      INTEGER            I, INFO, IX, J, JX, K, KK, KX
+      LOGICAL            NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 7
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DTPMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOUNIT = LSAME( DIAG, 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of AP are
+*     accessed sequentially with one pass through AP.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x:= A*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KK =1
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     K    = KK
+                     DO 10, I = 1, J - 1
+                        X( I ) = X( I ) + TEMP*AP( K )
+                        K      = K      + 1
+   10                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*AP( KK + J - 1 )
+                  END IF
+                  KK = KK + J
+   20          CONTINUE
+            ELSE
+               JX = KX
+               DO 40, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     DO 30, K = KK, KK + J - 2
+                        X( IX ) = X( IX ) + TEMP*AP( K )
+                        IX      = IX      + INCX
+   30                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*AP( KK + J - 1 )
+                  END IF
+                  JX = JX + INCX
+                  KK = KK + J
+   40          CONTINUE
+            END IF
+         ELSE
+            KK = ( N*( N + 1 ) )/2
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     K    = KK
+                     DO 50, I = N, J + 1, -1
+                        X( I ) = X( I ) + TEMP*AP( K )
+                        K      = K      - 1
+   50                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*AP( KK - N + J )
+                  END IF
+                  KK = KK - ( N - J + 1 )
+   60          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 80, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     DO 70, K = KK, KK - ( N - ( J + 1 ) ), -1
+                        X( IX ) = X( IX ) + TEMP*AP( K )
+                        IX      = IX      - INCX
+   70                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*AP( KK - N + J )
+                  END IF
+                  JX = JX - INCX
+                  KK = KK - ( N - J + 1 )
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := A'*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KK = ( N*( N + 1 ) )/2
+            IF( INCX.EQ.1 )THEN
+               DO 100, J = N, 1, -1
+                  TEMP = X( J )
+                  IF( NOUNIT )
+     $               TEMP = TEMP*AP( KK )
+                  K = KK - 1
+                  DO 90, I = J - 1, 1, -1
+                     TEMP = TEMP + AP( K )*X( I )
+                     K    = K    - 1
+   90             CONTINUE
+                  X( J ) = TEMP
+                  KK     = KK   - J
+  100          CONTINUE
+            ELSE
+               JX = KX + ( N - 1 )*INCX
+               DO 120, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = JX
+                  IF( NOUNIT )
+     $               TEMP = TEMP*AP( KK )
+                  DO 110, K = KK - 1, KK - J + 1, -1
+                     IX   = IX   - INCX
+                     TEMP = TEMP + AP( K )*X( IX )
+  110             CONTINUE
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+                  KK      = KK   - J
+  120          CONTINUE
+            END IF
+         ELSE
+            KK = 1
+            IF( INCX.EQ.1 )THEN
+               DO 140, J = 1, N
+                  TEMP = X( J )
+                  IF( NOUNIT )
+     $               TEMP = TEMP*AP( KK )
+                  K = KK + 1
+                  DO 130, I = J + 1, N
+                     TEMP = TEMP + AP( K )*X( I )
+                     K    = K    + 1
+  130             CONTINUE
+                  X( J ) = TEMP
+                  KK     = KK   + ( N - J + 1 )
+  140          CONTINUE
+            ELSE
+               JX = KX
+               DO 160, J = 1, N
+                  TEMP = X( JX )
+                  IX   = JX
+                  IF( NOUNIT )
+     $               TEMP = TEMP*AP( KK )
+                  DO 150, K = KK + 1, KK + N - J
+                     IX   = IX   + INCX
+                     TEMP = TEMP + AP( K )*X( IX )
+  150             CONTINUE
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+                  KK      = KK   + ( N - J + 1 )
+  160          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DTPMV .
+*
+      END
+      SUBROUTINE DTPSV ( UPLO, TRANS, DIAG, N, AP, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      DOUBLE PRECISION   AP( * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DTPSV  solves one of the systems of equations
+*
+*     A*x = b,   or   A'*x = b,
+*
+*  where b and x are n element vectors and A is an n by n unit, or
+*  non-unit, upper or lower triangular matrix, supplied in packed form.
+*
+*  No test for singularity or near-singularity is included in this
+*  routine. Such tests must be performed before calling this routine.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the equations to be solved as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   A*x = b.
+*
+*              TRANS = 'T' or 't'   A'*x = b.
+*
+*              TRANS = 'C' or 'c'   A'*x = b.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  AP     - DOUBLE PRECISION array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with  UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular matrix packed sequentially,
+*           column by column, so that AP( 1 ) contains a( 1, 1 ),
+*           AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 )
+*           respectively, and so on.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular matrix packed sequentially,
+*           column by column, so that AP( 1 ) contains a( 1, 1 ),
+*           AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 )
+*           respectively, and so on.
+*           Note that when  DIAG = 'U' or 'u', the diagonal elements of
+*           A are not referenced, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  X      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element right-hand side vector b. On exit, X is overwritten
+*           with the solution vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER        ( ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   TEMP
+      INTEGER            I, INFO, IX, J, JX, K, KK, KX
+      LOGICAL            NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 7
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DTPSV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOUNIT = LSAME( DIAG, 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of AP are
+*     accessed sequentially with one pass through AP.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x := inv( A )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KK = ( N*( N + 1 ) )/2
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/AP( KK )
+                     TEMP = X( J )
+                     K    = KK     - 1
+                     DO 10, I = J - 1, 1, -1
+                        X( I ) = X( I ) - TEMP*AP( K )
+                        K      = K      - 1
+   10                CONTINUE
+                  END IF
+                  KK = KK - J
+   20          CONTINUE
+            ELSE
+               JX = KX + ( N - 1 )*INCX
+               DO 40, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/AP( KK )
+                     TEMP = X( JX )
+                     IX   = JX
+                     DO 30, K = KK - 1, KK - J + 1, -1
+                        IX      = IX      - INCX
+                        X( IX ) = X( IX ) - TEMP*AP( K )
+   30                CONTINUE
+                  END IF
+                  JX = JX - INCX
+                  KK = KK - J
+   40          CONTINUE
+            END IF
+         ELSE
+            KK = 1
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/AP( KK )
+                     TEMP = X( J )
+                     K    = KK     + 1
+                     DO 50, I = J + 1, N
+                        X( I ) = X( I ) - TEMP*AP( K )
+                        K      = K      + 1
+   50                CONTINUE
+                  END IF
+                  KK = KK + ( N - J + 1 )
+   60          CONTINUE
+            ELSE
+               JX = KX
+               DO 80, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/AP( KK )
+                     TEMP = X( JX )
+                     IX   = JX
+                     DO 70, K = KK + 1, KK + N - J
+                        IX      = IX      + INCX
+                        X( IX ) = X( IX ) - TEMP*AP( K )
+   70                CONTINUE
+                  END IF
+                  JX = JX + INCX
+                  KK = KK + ( N - J + 1 )
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := inv( A' )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KK = 1
+            IF( INCX.EQ.1 )THEN
+               DO 100, J = 1, N
+                  TEMP = X( J )
+                  K    = KK
+                  DO 90, I = 1, J - 1
+                     TEMP = TEMP - AP( K )*X( I )
+                     K    = K    + 1
+   90             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/AP( KK + J - 1 )
+                  X( J ) = TEMP
+                  KK     = KK   + J
+  100          CONTINUE
+            ELSE
+               JX = KX
+               DO 120, J = 1, N
+                  TEMP = X( JX )
+                  IX   = KX
+                  DO 110, K = KK, KK + J - 2
+                     TEMP = TEMP - AP( K )*X( IX )
+                     IX   = IX   + INCX
+  110             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/AP( KK + J - 1 )
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+                  KK      = KK   + J
+  120          CONTINUE
+            END IF
+         ELSE
+            KK = ( N*( N + 1 ) )/2
+            IF( INCX.EQ.1 )THEN
+               DO 140, J = N, 1, -1
+                  TEMP = X( J )
+                  K = KK
+                  DO 130, I = N, J + 1, -1
+                     TEMP = TEMP - AP( K )*X( I )
+                     K    = K    - 1
+  130             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/AP( KK - N + J )
+                  X( J ) = TEMP
+                  KK     = KK   - ( N - J + 1 )
+  140          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 160, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = KX
+                  DO 150, K = KK, KK - ( N - ( J + 1 ) ), -1
+                     TEMP = TEMP - AP( K )*X( IX )
+                     IX   = IX   - INCX
+  150             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/AP( KK - N + J )
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+                  KK      = KK   - (N - J + 1 )
+  160          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DTPSV .
+*
+      END
+      SUBROUTINE DTRMM ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA,
+     $                   B, LDB )
+*     .. Scalar Arguments ..
+      CHARACTER*1        SIDE, UPLO, TRANSA, DIAG
+      INTEGER            M, N, LDA, LDB
+      DOUBLE PRECISION   ALPHA
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), B( LDB, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DTRMM  performs one of the matrix-matrix operations
+*
+*     B := alpha*op( A )*B,   or   B := alpha*B*op( A ),
+*
+*  where  alpha  is a scalar,  B  is an m by n matrix,  A  is a unit, or
+*  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+*
+*     op( A ) = A   or   op( A ) = A'.
+*
+*  Parameters
+*  ==========
+*
+*  SIDE   - CHARACTER*1.
+*           On entry,  SIDE specifies whether  op( A ) multiplies B from
+*           the left or right as follows:
+*
+*              SIDE = 'L' or 'l'   B := alpha*op( A )*B.
+*
+*              SIDE = 'R' or 'r'   B := alpha*B*op( A ).
+*
+*           Unchanged on exit.
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix A is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANSA - CHARACTER*1.
+*           On entry, TRANSA specifies the form of op( A ) to be used in
+*           the matrix multiplication as follows:
+*
+*              TRANSA = 'N' or 'n'   op( A ) = A.
+*
+*              TRANSA = 'T' or 't'   op( A ) = A'.
+*
+*              TRANSA = 'C' or 'c'   op( A ) = A'.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit triangular
+*           as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of B. M must be at
+*           least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of B.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry,  ALPHA specifies the scalar  alpha. When  alpha is
+*           zero then  A is not referenced and  B need not be set before
+*           entry.
+*           Unchanged on exit.
+*
+*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m
+*           when  SIDE = 'L' or 'l'  and is  n  when  SIDE = 'R' or 'r'.
+*           Before entry  with  UPLO = 'U' or 'u',  the  leading  k by k
+*           upper triangular part of the array  A must contain the upper
+*           triangular matrix  and the strictly lower triangular part of
+*           A is not referenced.
+*           Before entry  with  UPLO = 'L' or 'l',  the  leading  k by k
+*           lower triangular part of the array  A must contain the lower
+*           triangular matrix  and the strictly upper triangular part of
+*           A is not referenced.
+*           Note that when  DIAG = 'U' or 'u',  the diagonal elements of
+*           A  are not referenced either,  but are assumed to be  unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program.  When  SIDE = 'L' or 'l'  then
+*           LDA  must be at least  max( 1, m ),  when  SIDE = 'R' or 'r'
+*           then LDA must be at least max( 1, n ).
+*           Unchanged on exit.
+*
+*  B      - DOUBLE PRECISION array of DIMENSION ( LDB, n ).
+*           Before entry,  the leading  m by n part of the array  B must
+*           contain the matrix  B,  and  on exit  is overwritten  by the
+*           transformed matrix.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   LDB  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            LSIDE, NOUNIT, UPPER
+      INTEGER            I, INFO, J, K, NROWA
+      DOUBLE PRECISION   TEMP
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE         , ZERO
+      PARAMETER        ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      LSIDE  = LSAME( SIDE  , 'L' )
+      IF( LSIDE )THEN
+         NROWA = M
+      ELSE
+         NROWA = N
+      END IF
+      NOUNIT = LSAME( DIAG  , 'N' )
+      UPPER  = LSAME( UPLO  , 'U' )
+*
+      INFO   = 0
+      IF(      ( .NOT.LSIDE                ).AND.
+     $         ( .NOT.LSAME( SIDE  , 'R' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.UPPER                ).AND.
+     $         ( .NOT.LSAME( UPLO  , 'L' ) )      )THEN
+         INFO = 2
+      ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'T' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'C' ) )      )THEN
+         INFO = 3
+      ELSE IF( ( .NOT.LSAME( DIAG  , 'U' ) ).AND.
+     $         ( .NOT.LSAME( DIAG  , 'N' ) )      )THEN
+         INFO = 4
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 5
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 6
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 9
+      ELSE IF( LDB.LT.MAX( 1, M     ) )THEN
+         INFO = 11
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DTRMM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         DO 20, J = 1, N
+            DO 10, I = 1, M
+               B( I, J ) = ZERO
+   10       CONTINUE
+   20    CONTINUE
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSIDE )THEN
+         IF( LSAME( TRANSA, 'N' ) )THEN
+*
+*           Form  B := alpha*A*B.
+*
+            IF( UPPER )THEN
+               DO 50, J = 1, N
+                  DO 40, K = 1, M
+                     IF( B( K, J ).NE.ZERO )THEN
+                        TEMP = ALPHA*B( K, J )
+                        DO 30, I = 1, K - 1
+                           B( I, J ) = B( I, J ) + TEMP*A( I, K )
+   30                   CONTINUE
+                        IF( NOUNIT )
+     $                     TEMP = TEMP*A( K, K )
+                        B( K, J ) = TEMP
+                     END IF
+   40             CONTINUE
+   50          CONTINUE
+            ELSE
+               DO 80, J = 1, N
+                  DO 70 K = M, 1, -1
+                     IF( B( K, J ).NE.ZERO )THEN
+                        TEMP      = ALPHA*B( K, J )
+                        B( K, J ) = TEMP
+                        IF( NOUNIT )
+     $                     B( K, J ) = B( K, J )*A( K, K )
+                        DO 60, I = K + 1, M
+                           B( I, J ) = B( I, J ) + TEMP*A( I, K )
+   60                   CONTINUE
+                     END IF
+   70             CONTINUE
+   80          CONTINUE
+            END IF
+         ELSE
+*
+*           Form  B := alpha*A'*B.
+*
+            IF( UPPER )THEN
+               DO 110, J = 1, N
+                  DO 100, I = M, 1, -1
+                     TEMP = B( I, J )
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( I, I )
+                     DO 90, K = 1, I - 1
+                        TEMP = TEMP + A( K, I )*B( K, J )
+   90                CONTINUE
+                     B( I, J ) = ALPHA*TEMP
+  100             CONTINUE
+  110          CONTINUE
+            ELSE
+               DO 140, J = 1, N
+                  DO 130, I = 1, M
+                     TEMP = B( I, J )
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( I, I )
+                     DO 120, K = I + 1, M
+                        TEMP = TEMP + A( K, I )*B( K, J )
+  120                CONTINUE
+                     B( I, J ) = ALPHA*TEMP
+  130             CONTINUE
+  140          CONTINUE
+            END IF
+         END IF
+      ELSE
+         IF( LSAME( TRANSA, 'N' ) )THEN
+*
+*           Form  B := alpha*B*A.
+*
+            IF( UPPER )THEN
+               DO 180, J = N, 1, -1
+                  TEMP = ALPHA
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( J, J )
+                  DO 150, I = 1, M
+                     B( I, J ) = TEMP*B( I, J )
+  150             CONTINUE
+                  DO 170, K = 1, J - 1
+                     IF( A( K, J ).NE.ZERO )THEN
+                        TEMP = ALPHA*A( K, J )
+                        DO 160, I = 1, M
+                           B( I, J ) = B( I, J ) + TEMP*B( I, K )
+  160                   CONTINUE
+                     END IF
+  170             CONTINUE
+  180          CONTINUE
+            ELSE
+               DO 220, J = 1, N
+                  TEMP = ALPHA
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( J, J )
+                  DO 190, I = 1, M
+                     B( I, J ) = TEMP*B( I, J )
+  190             CONTINUE
+                  DO 210, K = J + 1, N
+                     IF( A( K, J ).NE.ZERO )THEN
+                        TEMP = ALPHA*A( K, J )
+                        DO 200, I = 1, M
+                           B( I, J ) = B( I, J ) + TEMP*B( I, K )
+  200                   CONTINUE
+                     END IF
+  210             CONTINUE
+  220          CONTINUE
+            END IF
+         ELSE
+*
+*           Form  B := alpha*B*A'.
+*
+            IF( UPPER )THEN
+               DO 260, K = 1, N
+                  DO 240, J = 1, K - 1
+                     IF( A( J, K ).NE.ZERO )THEN
+                        TEMP = ALPHA*A( J, K )
+                        DO 230, I = 1, M
+                           B( I, J ) = B( I, J ) + TEMP*B( I, K )
+  230                   CONTINUE
+                     END IF
+  240             CONTINUE
+                  TEMP = ALPHA
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( K, K )
+                  IF( TEMP.NE.ONE )THEN
+                     DO 250, I = 1, M
+                        B( I, K ) = TEMP*B( I, K )
+  250                CONTINUE
+                  END IF
+  260          CONTINUE
+            ELSE
+               DO 300, K = N, 1, -1
+                  DO 280, J = K + 1, N
+                     IF( A( J, K ).NE.ZERO )THEN
+                        TEMP = ALPHA*A( J, K )
+                        DO 270, I = 1, M
+                           B( I, J ) = B( I, J ) + TEMP*B( I, K )
+  270                   CONTINUE
+                     END IF
+  280             CONTINUE
+                  TEMP = ALPHA
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( K, K )
+                  IF( TEMP.NE.ONE )THEN
+                     DO 290, I = 1, M
+                        B( I, K ) = TEMP*B( I, K )
+  290                CONTINUE
+                  END IF
+  300          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DTRMM .
+*
+      END
+      SUBROUTINE DTRMV ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, LDA, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DTRMV  performs one of the matrix-vector operations
+*
+*     x := A*x,   or   x := A'*x,
+*
+*  where x is an n element vector and  A is an n by n unit, or non-unit,
+*  upper or lower triangular matrix.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   x := A*x.
+*
+*              TRANS = 'T' or 't'   x := A'*x.
+*
+*              TRANS = 'C' or 'c'   x := A'*x.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular matrix and the strictly lower triangular part of
+*           A is not referenced.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular matrix and the strictly upper triangular part of
+*           A is not referenced.
+*           Note that when  DIAG = 'U' or 'u', the diagonal elements of
+*           A are not referenced either, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*  X      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x. On exit, X is overwritten with the
+*           tranformed vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER        ( ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   TEMP
+      INTEGER            I, INFO, IX, J, JX, KX
+      LOGICAL            NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 6
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 8
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DTRMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOUNIT = LSAME( DIAG, 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x := A*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     DO 10, I = 1, J - 1
+                        X( I ) = X( I ) + TEMP*A( I, J )
+   10                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*A( J, J )
+                  END IF
+   20          CONTINUE
+            ELSE
+               JX = KX
+               DO 40, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     DO 30, I = 1, J - 1
+                        X( IX ) = X( IX ) + TEMP*A( I, J )
+                        IX      = IX      + INCX
+   30                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*A( J, J )
+                  END IF
+                  JX = JX + INCX
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     DO 50, I = N, J + 1, -1
+                        X( I ) = X( I ) + TEMP*A( I, J )
+   50                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*A( J, J )
+                  END IF
+   60          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 80, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     DO 70, I = N, J + 1, -1
+                        X( IX ) = X( IX ) + TEMP*A( I, J )
+                        IX      = IX      - INCX
+   70                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*A( J, J )
+                  END IF
+                  JX = JX - INCX
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := A'*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            IF( INCX.EQ.1 )THEN
+               DO 100, J = N, 1, -1
+                  TEMP = X( J )
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( J, J )
+                  DO 90, I = J - 1, 1, -1
+                     TEMP = TEMP + A( I, J )*X( I )
+   90             CONTINUE
+                  X( J ) = TEMP
+  100          CONTINUE
+            ELSE
+               JX = KX + ( N - 1 )*INCX
+               DO 120, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = JX
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( J, J )
+                  DO 110, I = J - 1, 1, -1
+                     IX   = IX   - INCX
+                     TEMP = TEMP + A( I, J )*X( IX )
+  110             CONTINUE
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+  120          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 140, J = 1, N
+                  TEMP = X( J )
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( J, J )
+                  DO 130, I = J + 1, N
+                     TEMP = TEMP + A( I, J )*X( I )
+  130             CONTINUE
+                  X( J ) = TEMP
+  140          CONTINUE
+            ELSE
+               JX = KX
+               DO 160, J = 1, N
+                  TEMP = X( JX )
+                  IX   = JX
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( J, J )
+                  DO 150, I = J + 1, N
+                     IX   = IX   + INCX
+                     TEMP = TEMP + A( I, J )*X( IX )
+  150             CONTINUE
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+  160          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DTRMV .
+*
+      END
+      SUBROUTINE DTRSM ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA,
+     $                   B, LDB )
+*     .. Scalar Arguments ..
+      CHARACTER*1        SIDE, UPLO, TRANSA, DIAG
+      INTEGER            M, N, LDA, LDB
+      DOUBLE PRECISION   ALPHA
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), B( LDB, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DTRSM  solves one of the matrix equations
+*
+*     op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
+*
+*  where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+*  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+*
+*     op( A ) = A   or   op( A ) = A'.
+*
+*  The matrix X is overwritten on B.
+*
+*  Parameters
+*  ==========
+*
+*  SIDE   - CHARACTER*1.
+*           On entry, SIDE specifies whether op( A ) appears on the left
+*           or right of X as follows:
+*
+*              SIDE = 'L' or 'l'   op( A )*X = alpha*B.
+*
+*              SIDE = 'R' or 'r'   X*op( A ) = alpha*B.
+*
+*           Unchanged on exit.
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix A is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANSA - CHARACTER*1.
+*           On entry, TRANSA specifies the form of op( A ) to be used in
+*           the matrix multiplication as follows:
+*
+*              TRANSA = 'N' or 'n'   op( A ) = A.
+*
+*              TRANSA = 'T' or 't'   op( A ) = A'.
+*
+*              TRANSA = 'C' or 'c'   op( A ) = A'.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit triangular
+*           as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of B. M must be at
+*           least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of B.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry,  ALPHA specifies the scalar  alpha. When  alpha is
+*           zero then  A is not referenced and  B need not be set before
+*           entry.
+*           Unchanged on exit.
+*
+*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m
+*           when  SIDE = 'L' or 'l'  and is  n  when  SIDE = 'R' or 'r'.
+*           Before entry  with  UPLO = 'U' or 'u',  the  leading  k by k
+*           upper triangular part of the array  A must contain the upper
+*           triangular matrix  and the strictly lower triangular part of
+*           A is not referenced.
+*           Before entry  with  UPLO = 'L' or 'l',  the  leading  k by k
+*           lower triangular part of the array  A must contain the lower
+*           triangular matrix  and the strictly upper triangular part of
+*           A is not referenced.
+*           Note that when  DIAG = 'U' or 'u',  the diagonal elements of
+*           A  are not referenced either,  but are assumed to be  unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program.  When  SIDE = 'L' or 'l'  then
+*           LDA  must be at least  max( 1, m ),  when  SIDE = 'R' or 'r'
+*           then LDA must be at least max( 1, n ).
+*           Unchanged on exit.
+*
+*  B      - DOUBLE PRECISION array of DIMENSION ( LDB, n ).
+*           Before entry,  the leading  m by n part of the array  B must
+*           contain  the  right-hand  side  matrix  B,  and  on exit  is
+*           overwritten by the solution matrix  X.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   LDB  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            LSIDE, NOUNIT, UPPER
+      INTEGER            I, INFO, J, K, NROWA
+      DOUBLE PRECISION   TEMP
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE         , ZERO
+      PARAMETER        ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      LSIDE  = LSAME( SIDE  , 'L' )
+      IF( LSIDE )THEN
+         NROWA = M
+      ELSE
+         NROWA = N
+      END IF
+      NOUNIT = LSAME( DIAG  , 'N' )
+      UPPER  = LSAME( UPLO  , 'U' )
+*
+      INFO   = 0
+      IF(      ( .NOT.LSIDE                ).AND.
+     $         ( .NOT.LSAME( SIDE  , 'R' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.UPPER                ).AND.
+     $         ( .NOT.LSAME( UPLO  , 'L' ) )      )THEN
+         INFO = 2
+      ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'T' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'C' ) )      )THEN
+         INFO = 3
+      ELSE IF( ( .NOT.LSAME( DIAG  , 'U' ) ).AND.
+     $         ( .NOT.LSAME( DIAG  , 'N' ) )      )THEN
+         INFO = 4
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 5
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 6
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 9
+      ELSE IF( LDB.LT.MAX( 1, M     ) )THEN
+         INFO = 11
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DTRSM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         DO 20, J = 1, N
+            DO 10, I = 1, M
+               B( I, J ) = ZERO
+   10       CONTINUE
+   20    CONTINUE
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSIDE )THEN
+         IF( LSAME( TRANSA, 'N' ) )THEN
+*
+*           Form  B := alpha*inv( A )*B.
+*
+            IF( UPPER )THEN
+               DO 60, J = 1, N
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 30, I = 1, M
+                        B( I, J ) = ALPHA*B( I, J )
+   30                CONTINUE
+                  END IF
+                  DO 50, K = M, 1, -1
+                     IF( B( K, J ).NE.ZERO )THEN
+                        IF( NOUNIT )
+     $                     B( K, J ) = B( K, J )/A( K, K )
+                        DO 40, I = 1, K - 1
+                           B( I, J ) = B( I, J ) - B( K, J )*A( I, K )
+   40                   CONTINUE
+                     END IF
+   50             CONTINUE
+   60          CONTINUE
+            ELSE
+               DO 100, J = 1, N
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 70, I = 1, M
+                        B( I, J ) = ALPHA*B( I, J )
+   70                CONTINUE
+                  END IF
+                  DO 90 K = 1, M
+                     IF( B( K, J ).NE.ZERO )THEN
+                        IF( NOUNIT )
+     $                     B( K, J ) = B( K, J )/A( K, K )
+                        DO 80, I = K + 1, M
+                           B( I, J ) = B( I, J ) - B( K, J )*A( I, K )
+   80                   CONTINUE
+                     END IF
+   90             CONTINUE
+  100          CONTINUE
+            END IF
+         ELSE
+*
+*           Form  B := alpha*inv( A' )*B.
+*
+            IF( UPPER )THEN
+               DO 130, J = 1, N
+                  DO 120, I = 1, M
+                     TEMP = ALPHA*B( I, J )
+                     DO 110, K = 1, I - 1
+                        TEMP = TEMP - A( K, I )*B( K, J )
+  110                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( I, I )
+                     B( I, J ) = TEMP
+  120             CONTINUE
+  130          CONTINUE
+            ELSE
+               DO 160, J = 1, N
+                  DO 150, I = M, 1, -1
+                     TEMP = ALPHA*B( I, J )
+                     DO 140, K = I + 1, M
+                        TEMP = TEMP - A( K, I )*B( K, J )
+  140                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( I, I )
+                     B( I, J ) = TEMP
+  150             CONTINUE
+  160          CONTINUE
+            END IF
+         END IF
+      ELSE
+         IF( LSAME( TRANSA, 'N' ) )THEN
+*
+*           Form  B := alpha*B*inv( A ).
+*
+            IF( UPPER )THEN
+               DO 210, J = 1, N
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 170, I = 1, M
+                        B( I, J ) = ALPHA*B( I, J )
+  170                CONTINUE
+                  END IF
+                  DO 190, K = 1, J - 1
+                     IF( A( K, J ).NE.ZERO )THEN
+                        DO 180, I = 1, M
+                           B( I, J ) = B( I, J ) - A( K, J )*B( I, K )
+  180                   CONTINUE
+                     END IF
+  190             CONTINUE
+                  IF( NOUNIT )THEN
+                     TEMP = ONE/A( J, J )
+                     DO 200, I = 1, M
+                        B( I, J ) = TEMP*B( I, J )
+  200                CONTINUE
+                  END IF
+  210          CONTINUE
+            ELSE
+               DO 260, J = N, 1, -1
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 220, I = 1, M
+                        B( I, J ) = ALPHA*B( I, J )
+  220                CONTINUE
+                  END IF
+                  DO 240, K = J + 1, N
+                     IF( A( K, J ).NE.ZERO )THEN
+                        DO 230, I = 1, M
+                           B( I, J ) = B( I, J ) - A( K, J )*B( I, K )
+  230                   CONTINUE
+                     END IF
+  240             CONTINUE
+                  IF( NOUNIT )THEN
+                     TEMP = ONE/A( J, J )
+                     DO 250, I = 1, M
+                       B( I, J ) = TEMP*B( I, J )
+  250                CONTINUE
+                  END IF
+  260          CONTINUE
+            END IF
+         ELSE
+*
+*           Form  B := alpha*B*inv( A' ).
+*
+            IF( UPPER )THEN
+               DO 310, K = N, 1, -1
+                  IF( NOUNIT )THEN
+                     TEMP = ONE/A( K, K )
+                     DO 270, I = 1, M
+                        B( I, K ) = TEMP*B( I, K )
+  270                CONTINUE
+                  END IF
+                  DO 290, J = 1, K - 1
+                     IF( A( J, K ).NE.ZERO )THEN
+                        TEMP = A( J, K )
+                        DO 280, I = 1, M
+                           B( I, J ) = B( I, J ) - TEMP*B( I, K )
+  280                   CONTINUE
+                     END IF
+  290             CONTINUE
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 300, I = 1, M
+                        B( I, K ) = ALPHA*B( I, K )
+  300                CONTINUE
+                  END IF
+  310          CONTINUE
+            ELSE
+               DO 360, K = 1, N
+                  IF( NOUNIT )THEN
+                     TEMP = ONE/A( K, K )
+                     DO 320, I = 1, M
+                        B( I, K ) = TEMP*B( I, K )
+  320                CONTINUE
+                  END IF
+                  DO 340, J = K + 1, N
+                     IF( A( J, K ).NE.ZERO )THEN
+                        TEMP = A( J, K )
+                        DO 330, I = 1, M
+                           B( I, J ) = B( I, J ) - TEMP*B( I, K )
+  330                   CONTINUE
+                     END IF
+  340             CONTINUE
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 350, I = 1, M
+                        B( I, K ) = ALPHA*B( I, K )
+  350                CONTINUE
+                  END IF
+  360          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DTRSM .
+*
+      END
+      SUBROUTINE DTRSV ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, LDA, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  DTRSV  solves one of the systems of equations
+*
+*     A*x = b,   or   A'*x = b,
+*
+*  where b and x are n element vectors and A is an n by n unit, or
+*  non-unit, upper or lower triangular matrix.
+*
+*  No test for singularity or near-singularity is included in this
+*  routine. Such tests must be performed before calling this routine.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the equations to be solved as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   A*x = b.
+*
+*              TRANS = 'T' or 't'   A'*x = b.
+*
+*              TRANS = 'C' or 'c'   A'*x = b.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular matrix and the strictly lower triangular part of
+*           A is not referenced.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular matrix and the strictly upper triangular part of
+*           A is not referenced.
+*           Note that when  DIAG = 'U' or 'u', the diagonal elements of
+*           A are not referenced either, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*  X      - DOUBLE PRECISION array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element right-hand side vector b. On exit, X is overwritten
+*           with the solution vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER        ( ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   TEMP
+      INTEGER            I, INFO, IX, J, JX, KX
+      LOGICAL            NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 6
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 8
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'DTRSV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOUNIT = LSAME( DIAG, 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x := inv( A )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/A( J, J )
+                     TEMP = X( J )
+                     DO 10, I = J - 1, 1, -1
+                        X( I ) = X( I ) - TEMP*A( I, J )
+   10                CONTINUE
+                  END IF
+   20          CONTINUE
+            ELSE
+               JX = KX + ( N - 1 )*INCX
+               DO 40, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/A( J, J )
+                     TEMP = X( JX )
+                     IX   = JX
+                     DO 30, I = J - 1, 1, -1
+                        IX      = IX      - INCX
+                        X( IX ) = X( IX ) - TEMP*A( I, J )
+   30                CONTINUE
+                  END IF
+                  JX = JX - INCX
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/A( J, J )
+                     TEMP = X( J )
+                     DO 50, I = J + 1, N
+                        X( I ) = X( I ) - TEMP*A( I, J )
+   50                CONTINUE
+                  END IF
+   60          CONTINUE
+            ELSE
+               JX = KX
+               DO 80, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/A( J, J )
+                     TEMP = X( JX )
+                     IX   = JX
+                     DO 70, I = J + 1, N
+                        IX      = IX      + INCX
+                        X( IX ) = X( IX ) - TEMP*A( I, J )
+   70                CONTINUE
+                  END IF
+                  JX = JX + INCX
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := inv( A' )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            IF( INCX.EQ.1 )THEN
+               DO 100, J = 1, N
+                  TEMP = X( J )
+                  DO 90, I = 1, J - 1
+                     TEMP = TEMP - A( I, J )*X( I )
+   90             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/A( J, J )
+                  X( J ) = TEMP
+  100          CONTINUE
+            ELSE
+               JX = KX
+               DO 120, J = 1, N
+                  TEMP = X( JX )
+                  IX   = KX
+                  DO 110, I = 1, J - 1
+                     TEMP = TEMP - A( I, J )*X( IX )
+                     IX   = IX   + INCX
+  110             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/A( J, J )
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+  120          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 140, J = N, 1, -1
+                  TEMP = X( J )
+                  DO 130, I = N, J + 1, -1
+                     TEMP = TEMP - A( I, J )*X( I )
+  130             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/A( J, J )
+                  X( J ) = TEMP
+  140          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 160, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = KX
+                  DO 150, I = N, J + 1, -1
+                     TEMP = TEMP - A( I, J )*X( IX )
+                     IX   = IX   - INCX
+  150             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/A( J, J )
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+  160          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of DTRSV .
+*
+      END
+      double precision function dzasum(n,zx,incx)
+c
+c     takes the sum of the absolute values.
+c     jack dongarra, 3/11/78.
+c     modified 3/93 to return if incx .le. 0.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      double complex zx(*)
+      double precision stemp,dcabs1
+      integer i,incx,ix,n
+c
+      dzasum = 0.0d0
+      stemp = 0.0d0
+      if( n.le.0 .or. incx.le.0 )return
+      if(incx.eq.1)go to 20
+c
+c        code for increment not equal to 1
+c
+      ix = 1
+      do 10 i = 1,n
+        stemp = stemp + dcabs1(zx(ix))
+        ix = ix + incx
+   10 continue
+      dzasum = stemp
+      return
+c
+c        code for increment equal to 1
+c
+   20 do 30 i = 1,n
+        stemp = stemp + dcabs1(zx(i))
+   30 continue
+      dzasum = stemp
+      return
+      end
+      DOUBLE PRECISION FUNCTION DZNRM2( N, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER                           INCX, N
+*     .. Array Arguments ..
+      COMPLEX*16                        X( * )
+*     ..
+*
+*  DZNRM2 returns the euclidean norm of a vector via the function
+*  name, so that
+*
+*     DZNRM2 := sqrt( conjg( x' )*x )
+*
+*
+*
+*  -- This version written on 25-October-1982.
+*     Modified on 14-October-1993 to inline the call to ZLASSQ.
+*     Sven Hammarling, Nag Ltd.
+*
+*
+*     .. Parameters ..
+      DOUBLE PRECISION      ONE         , ZERO
+      PARAMETER           ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     .. Local Scalars ..
+      INTEGER               IX
+      DOUBLE PRECISION      NORM, SCALE, SSQ, TEMP
+*     .. Intrinsic Functions ..
+      INTRINSIC             ABS, DIMAG, DBLE, SQRT
+*     ..
+*     .. Executable Statements ..
+      IF( N.LT.1 .OR. INCX.LT.1 )THEN
+         NORM  = ZERO
+      ELSE
+         SCALE = ZERO
+         SSQ   = ONE
+*        The following loop is equivalent to this call to the LAPACK
+*        auxiliary routine:
+*        CALL ZLASSQ( N, X, INCX, SCALE, SSQ )
+*
+         DO 10, IX = 1, 1 + ( N - 1 )*INCX, INCX
+            IF( DBLE( X( IX ) ).NE.ZERO )THEN
+               TEMP = ABS( DBLE( X( IX ) ) )
+               IF( SCALE.LT.TEMP )THEN
+                  SSQ   = ONE   + SSQ*( SCALE/TEMP )**2
+                  SCALE = TEMP
+               ELSE
+                  SSQ   = SSQ   +     ( TEMP/SCALE )**2
+               END IF
+            END IF
+            IF( DIMAG( X( IX ) ).NE.ZERO )THEN
+               TEMP = ABS( DIMAG( X( IX ) ) )
+               IF( SCALE.LT.TEMP )THEN
+                  SSQ   = ONE   + SSQ*( SCALE/TEMP )**2
+                  SCALE = TEMP
+               ELSE
+                  SSQ   = SSQ   +     ( TEMP/SCALE )**2
+               END IF
+            END IF
+   10    CONTINUE
+         NORM  = SCALE * SQRT( SSQ )
+      END IF
+*
+      DZNRM2 = NORM
+      RETURN
+*
+*     End of DZNRM2.
+*
+      END
+      integer function icamax(n,cx,incx)
+c
+c     finds the index of element having max. absolute value.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 3/93 to return if incx .le. 0.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      complex cx(*)
+      real smax
+      integer i,incx,ix,n
+      complex zdum
+      real cabs1
+      cabs1(zdum) = abs(real(zdum)) + abs(aimag(zdum))
+c
+      icamax = 0
+      if( n.lt.1 .or. incx.le.0 ) return
+      icamax = 1
+      if(n.eq.1)return
+      if(incx.eq.1)go to 20
+c
+c        code for increment not equal to 1
+c
+      ix = 1
+      smax = cabs1(cx(1))
+      ix = ix + incx
+      do 10 i = 2,n
+         if(cabs1(cx(ix)).le.smax) go to 5
+         icamax = i
+         smax = cabs1(cx(ix))
+    5    ix = ix + incx
+   10 continue
+      return
+c
+c        code for increment equal to 1
+c
+   20 smax = cabs1(cx(1))
+      do 30 i = 2,n
+         if(cabs1(cx(i)).le.smax) go to 30
+         icamax = i
+         smax = cabs1(cx(i))
+   30 continue
+      return
+      end
+      integer function idamax(n,dx,incx)
+c
+c     finds the index of element having max. absolute value.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 3/93 to return if incx .le. 0.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      double precision dx(*),dmax
+      integer i,incx,ix,n
+c
+      idamax = 0
+      if( n.lt.1 .or. incx.le.0 ) return
+      idamax = 1
+      if(n.eq.1)return
+      if(incx.eq.1)go to 20
+c
+c        code for increment not equal to 1
+c
+      ix = 1
+      dmax = dabs(dx(1))
+      ix = ix + incx
+      do 10 i = 2,n
+         if(dabs(dx(ix)).le.dmax) go to 5
+         idamax = i
+         dmax = dabs(dx(ix))
+    5    ix = ix + incx
+   10 continue
+      return
+c
+c        code for increment equal to 1
+c
+   20 dmax = dabs(dx(1))
+      do 30 i = 2,n
+         if(dabs(dx(i)).le.dmax) go to 30
+         idamax = i
+         dmax = dabs(dx(i))
+   30 continue
+      return
+      end
+      integer function isamax(n,sx,incx)
+c
+c     finds the index of element having max. absolute value.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 3/93 to return if incx .le. 0.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      real sx(*),smax
+      integer i,incx,ix,n
+c
+      isamax = 0
+      if( n.lt.1 .or. incx.le.0 ) return
+      isamax = 1
+      if(n.eq.1)return
+      if(incx.eq.1)go to 20
+c
+c        code for increment not equal to 1
+c
+      ix = 1
+      smax = abs(sx(1))
+      ix = ix + incx
+      do 10 i = 2,n
+         if(abs(sx(ix)).le.smax) go to 5
+         isamax = i
+         smax = abs(sx(ix))
+    5    ix = ix + incx
+   10 continue
+      return
+c
+c        code for increment equal to 1
+c
+   20 smax = abs(sx(1))
+      do 30 i = 2,n
+         if(abs(sx(i)).le.smax) go to 30
+         isamax = i
+         smax = abs(sx(i))
+   30 continue
+      return
+      end
+      integer function izamax(n,zx,incx)
+c
+c     finds the index of element having max. absolute value.
+c     jack dongarra, 1/15/85.
+c     modified 3/93 to return if incx .le. 0.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      double complex zx(*)
+      double precision smax
+      integer i,incx,ix,n
+      double precision dcabs1
+c
+      izamax = 0
+      if( n.lt.1 .or. incx.le.0 )return
+      izamax = 1
+      if(n.eq.1)return
+      if(incx.eq.1)go to 20
+c
+c        code for increment not equal to 1
+c
+      ix = 1
+      smax = dcabs1(zx(1))
+      ix = ix + incx
+      do 10 i = 2,n
+         if(dcabs1(zx(ix)).le.smax) go to 5
+         izamax = i
+         smax = dcabs1(zx(ix))
+    5    ix = ix + incx
+   10 continue
+      return
+c
+c        code for increment equal to 1
+c
+   20 smax = dcabs1(zx(1))
+      do 30 i = 2,n
+         if(dcabs1(zx(i)).le.smax) go to 30
+         izamax = i
+         smax = dcabs1(zx(i))
+   30 continue
+      return
+      end
+      LOGICAL          FUNCTION LSAME( CA, CB )
+*
+*  -- LAPACK auxiliary routine (version 2.0) --
+*     Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
+*     Courant Institute, Argonne National Lab, and Rice University
+*     January 31, 1994
+*
+*     .. Scalar Arguments ..
+      CHARACTER          CA, CB
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  LSAME returns .TRUE. if CA is the same letter as CB regardless of
+*  case.
+*
+*  Arguments
+*  =========
+*
+*  CA      (input) CHARACTER*1
+*  CB      (input) CHARACTER*1
+*          CA and CB specify the single characters to be compared.
+*
+* =====================================================================
+*
+*     .. Intrinsic Functions ..
+      INTRINSIC          ICHAR
+*     ..
+*     .. Local Scalars ..
+      INTEGER            INTA, INTB, ZCODE
+*     ..
+*     .. Executable Statements ..
+*
+*     Test if the characters are equal
+*
+      LSAME = CA.EQ.CB
+      IF( LSAME )
+     $   RETURN
+*
+*     Now test for equivalence if both characters are alphabetic.
+*
+      ZCODE = ICHAR( 'Z' )
+*
+*     Use 'Z' rather than 'A' so that ASCII can be detected on Prime
+*     machines, on which ICHAR returns a value with bit 8 set.
+*     ICHAR('A') on Prime machines returns 193 which is the same as
+*     ICHAR('A') on an EBCDIC machine.
+*
+      INTA = ICHAR( CA )
+      INTB = ICHAR( CB )
+*
+      IF( ZCODE.EQ.90 .OR. ZCODE.EQ.122 ) THEN
+*
+*        ASCII is assumed - ZCODE is the ASCII code of either lower or
+*        upper case 'Z'.
+*
+         IF( INTA.GE.97 .AND. INTA.LE.122 ) INTA = INTA - 32
+         IF( INTB.GE.97 .AND. INTB.LE.122 ) INTB = INTB - 32
+*
+      ELSE IF( ZCODE.EQ.233 .OR. ZCODE.EQ.169 ) THEN
+*
+*        EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or
+*        upper case 'Z'.
+*
+         IF( INTA.GE.129 .AND. INTA.LE.137 .OR.
+     $       INTA.GE.145 .AND. INTA.LE.153 .OR.
+     $       INTA.GE.162 .AND. INTA.LE.169 ) INTA = INTA + 64
+         IF( INTB.GE.129 .AND. INTB.LE.137 .OR.
+     $       INTB.GE.145 .AND. INTB.LE.153 .OR.
+     $       INTB.GE.162 .AND. INTB.LE.169 ) INTB = INTB + 64
+*
+      ELSE IF( ZCODE.EQ.218 .OR. ZCODE.EQ.250 ) THEN
+*
+*        ASCII is assumed, on Prime machines - ZCODE is the ASCII code
+*        plus 128 of either lower or upper case 'Z'.
+*
+         IF( INTA.GE.225 .AND. INTA.LE.250 ) INTA = INTA - 32
+         IF( INTB.GE.225 .AND. INTB.LE.250 ) INTB = INTB - 32
+      END IF
+      LSAME = INTA.EQ.INTB
+*
+*     RETURN
+*
+*     End of LSAME
+*
+      END
+      real function sasum(n,sx,incx)
+c
+c     takes the sum of the absolute values.
+c     uses unrolled loops for increment equal to one.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 3/93 to return if incx .le. 0.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      real sx(*),stemp
+      integer i,incx,m,mp1,n,nincx
+c
+      sasum = 0.0e0
+      stemp = 0.0e0
+      if( n.le.0 .or. incx.le.0 )return
+      if(incx.eq.1)go to 20
+c
+c        code for increment not equal to 1
+c
+      nincx = n*incx
+      do 10 i = 1,nincx,incx
+        stemp = stemp + abs(sx(i))
+   10 continue
+      sasum = stemp
+      return
+c
+c        code for increment equal to 1
+c
+c
+c        clean-up loop
+c
+   20 m = mod(n,6)
+      if( m .eq. 0 ) go to 40
+      do 30 i = 1,m
+        stemp = stemp + abs(sx(i))
+   30 continue
+      if( n .lt. 6 ) go to 60
+   40 mp1 = m + 1
+      do 50 i = mp1,n,6
+        stemp = stemp + abs(sx(i)) + abs(sx(i + 1)) + abs(sx(i + 2))
+     *  + abs(sx(i + 3)) + abs(sx(i + 4)) + abs(sx(i + 5))
+   50 continue
+   60 sasum = stemp
+      return
+      end
+      subroutine saxpy(n,sa,sx,incx,sy,incy)
+c
+c     constant times a vector plus a vector.
+c     uses unrolled loop for increments equal to one.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      real sx(*),sy(*),sa
+      integer i,incx,incy,ix,iy,m,mp1,n
+c
+      if(n.le.0)return
+      if (sa .eq. 0.0) return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c        code for unequal increments or equal increments
+c          not equal to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        sy(iy) = sy(iy) + sa*sx(ix)
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      return
+c
+c        code for both increments equal to 1
+c
+c
+c        clean-up loop
+c
+   20 m = mod(n,4)
+      if( m .eq. 0 ) go to 40
+      do 30 i = 1,m
+        sy(i) = sy(i) + sa*sx(i)
+   30 continue
+      if( n .lt. 4 ) return
+   40 mp1 = m + 1
+      do 50 i = mp1,n,4
+        sy(i) = sy(i) + sa*sx(i)
+        sy(i + 1) = sy(i + 1) + sa*sx(i + 1)
+        sy(i + 2) = sy(i + 2) + sa*sx(i + 2)
+        sy(i + 3) = sy(i + 3) + sa*sx(i + 3)
+   50 continue
+      return
+      end
+      real function scasum(n,cx,incx)
+c
+c     takes the sum of the absolute values of a complex vector and
+c     returns a single precision result.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 3/93 to return if incx .le. 0.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      complex cx(*)
+      real stemp
+      integer i,incx,n,nincx
+c
+      scasum = 0.0e0
+      stemp = 0.0e0
+      if( n.le.0 .or. incx.le.0 )return
+      if(incx.eq.1)go to 20
+c
+c        code for increment not equal to 1
+c
+      nincx = n*incx
+      do 10 i = 1,nincx,incx
+        stemp = stemp + abs(real(cx(i))) + abs(aimag(cx(i)))
+   10 continue
+      scasum = stemp
+      return
+c
+c        code for increment equal to 1
+c
+   20 do 30 i = 1,n
+        stemp = stemp + abs(real(cx(i))) + abs(aimag(cx(i)))
+   30 continue
+      scasum = stemp
+      return
+      end
+      REAL             FUNCTION SCNRM2( N, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER                           INCX, N
+*     .. Array Arguments ..
+      COMPLEX                           X( * )
+*     ..
+*
+*  SCNRM2 returns the euclidean norm of a vector via the function
+*  name, so that
+*
+*     SCNRM2 := sqrt( conjg( x' )*x )
+*
+*
+*
+*  -- This version written on 25-October-1982.
+*     Modified on 14-October-1993 to inline the call to CLASSQ.
+*     Sven Hammarling, Nag Ltd.
+*
+*
+*     .. Parameters ..
+      REAL                  ONE         , ZERO
+      PARAMETER           ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      INTEGER               IX
+      REAL                  NORM, SCALE, SSQ, TEMP
+*     .. Intrinsic Functions ..
+      INTRINSIC             ABS, AIMAG, REAL, SQRT
+*     ..
+*     .. Executable Statements ..
+      IF( N.LT.1 .OR. INCX.LT.1 )THEN
+         NORM  = ZERO
+      ELSE
+         SCALE = ZERO
+         SSQ   = ONE
+*        The following loop is equivalent to this call to the LAPACK
+*        auxiliary routine:
+*        CALL CLASSQ( N, X, INCX, SCALE, SSQ )
+*
+         DO 10, IX = 1, 1 + ( N - 1 )*INCX, INCX
+            IF( REAL( X( IX ) ).NE.ZERO )THEN
+               TEMP = ABS( REAL( X( IX ) ) )
+               IF( SCALE.LT.TEMP )THEN
+                  SSQ   = ONE   + SSQ*( SCALE/TEMP )**2
+                  SCALE = TEMP
+               ELSE
+                  SSQ   = SSQ   +     ( TEMP/SCALE )**2
+               END IF
+            END IF
+            IF( AIMAG( X( IX ) ).NE.ZERO )THEN
+               TEMP = ABS( AIMAG( X( IX ) ) )
+               IF( SCALE.LT.TEMP )THEN
+                  SSQ   = ONE   + SSQ*( SCALE/TEMP )**2
+                  SCALE = TEMP
+               ELSE
+                  SSQ   = SSQ   +     ( TEMP/SCALE )**2
+               END IF
+            END IF
+   10    CONTINUE
+         NORM  = SCALE * SQRT( SSQ )
+      END IF
+*
+      SCNRM2 = NORM
+      RETURN
+*
+*     End of SCNRM2.
+*
+      END
+      subroutine scopy(n,sx,incx,sy,incy)
+c
+c     copies a vector, x, to a vector, y.
+c     uses unrolled loops for increments equal to 1.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      real sx(*),sy(*)
+      integer i,incx,incy,ix,iy,m,mp1,n
+c
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c        code for unequal increments or equal increments
+c          not equal to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        sy(iy) = sx(ix)
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      return
+c
+c        code for both increments equal to 1
+c
+c
+c        clean-up loop
+c
+   20 m = mod(n,7)
+      if( m .eq. 0 ) go to 40
+      do 30 i = 1,m
+        sy(i) = sx(i)
+   30 continue
+      if( n .lt. 7 ) return
+   40 mp1 = m + 1
+      do 50 i = mp1,n,7
+        sy(i) = sx(i)
+        sy(i + 1) = sx(i + 1)
+        sy(i + 2) = sx(i + 2)
+        sy(i + 3) = sx(i + 3)
+        sy(i + 4) = sx(i + 4)
+        sy(i + 5) = sx(i + 5)
+        sy(i + 6) = sx(i + 6)
+   50 continue
+      return
+      end
+      real function sdot(n,sx,incx,sy,incy)
+c
+c     forms the dot product of two vectors.
+c     uses unrolled loops for increments equal to one.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      real sx(*),sy(*),stemp
+      integer i,incx,incy,ix,iy,m,mp1,n
+c
+      stemp = 0.0e0
+      sdot = 0.0e0
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c        code for unequal increments or equal increments
+c          not equal to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        stemp = stemp + sx(ix)*sy(iy)
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      sdot = stemp
+      return
+c
+c        code for both increments equal to 1
+c
+c
+c        clean-up loop
+c
+   20 m = mod(n,5)
+      if( m .eq. 0 ) go to 40
+      do 30 i = 1,m
+        stemp = stemp + sx(i)*sy(i)
+   30 continue
+      if( n .lt. 5 ) go to 60
+   40 mp1 = m + 1
+      do 50 i = mp1,n,5
+        stemp = stemp + sx(i)*sy(i) + sx(i + 1)*sy(i + 1) +
+     *   sx(i + 2)*sy(i + 2) + sx(i + 3)*sy(i + 3) + sx(i + 4)*sy(i + 4)
+   50 continue
+   60 sdot = stemp
+      return
+      end
+*DECK SDSDOT
+      REAL FUNCTION SDSDOT (N, SB, SX, INCX, SY, INCY)
+C***BEGIN PROLOGUE  SDSDOT
+C***PURPOSE  Compute the inner product of two vectors with extended
+C            precision accumulation.
+C***LIBRARY   SLATEC (BLAS)
+C***CATEGORY  D1A4
+C***TYPE      SINGLE PRECISION (SDSDOT-S, CDCDOT-C)
+C***KEYWORDS  BLAS, DOT PRODUCT, INNER PRODUCT, LINEAR ALGEBRA, VECTOR
+C***AUTHOR  Lawson, C. L., (JPL)
+C           Hanson, R. J., (SNLA)
+C           Kincaid, D. R., (U. of Texas)
+C           Krogh, F. T., (JPL)
+C***DESCRIPTION
+C
+C                B L A S  Subprogram
+C    Description of Parameters
+C
+C     --Input--
+C        N  number of elements in input vector(s)
+C       SB  single precision scalar to be added to inner product
+C       SX  single precision vector with N elements
+C     INCX  storage spacing between elements of SX
+C       SY  single precision vector with N elements
+C     INCY  storage spacing between elements of SY
+C
+C     --Output--
+C   SDSDOT  single precision dot product (SB if N .LE. 0)
+C
+C     Returns S.P. result with dot product accumulated in D.P.
+C     SDSDOT = SB + sum for I = 0 to N-1 of SX(LX+I*INCX)*SY(LY+I*INCY),
+C     where LX = 1 if INCX .GE. 0, else LX = 1+(1-N)*INCX, and LY is
+C     defined in a similar way using INCY.
+C
+C***REFERENCES  C. L. Lawson, R. J. Hanson, D. R. Kincaid and F. T.
+C                 Krogh, Basic linear algebra subprograms for Fortran
+C                 usage, Algorithm No. 539, Transactions on Mathematical
+C                 Software 5, 3 (September 1979), pp. 308-323.
+C***ROUTINES CALLED  (NONE)
+C***REVISION HISTORY  (YYMMDD)
+C   791001  DATE WRITTEN
+C   890531  Changed all specific intrinsics to generic.  (WRB)
+C   890831  Modified array declarations.  (WRB)
+C   890831  REVISION DATE from Version 3.2
+C   891214  Prologue converted to Version 4.0 format.  (BAB)
+C   920310  Corrected definition of LX in DESCRIPTION.  (WRB)
+C   920501  Reformatted the REFERENCES section.  (WRB)
+C***END PROLOGUE  SDSDOT
+      REAL SX(*), SY(*), SB
+      DOUBLE PRECISION DSDOT
+C***FIRST EXECUTABLE STATEMENT  SDSDOT
+      DSDOT = SB
+      IF (N .LE. 0) GO TO 30
+      IF (INCX.EQ.INCY .AND. INCX.GT.0) GO TO 40
+C
+C     Code for unequal or nonpositive increments.
+C
+      KX = 1
+      KY = 1
+      IF (INCX .LT. 0) KX = 1+(1-N)*INCX
+      IF (INCY .LT. 0) KY = 1+(1-N)*INCY
+      DO 10 I = 1,N
+        DSDOT = DSDOT + DBLE(SX(KX))*DBLE(SY(KY))
+        KX = KX + INCX
+        KY = KY + INCY
+   10 CONTINUE
+   30 SDSDOT = DSDOT
+      RETURN
+C
+C     Code for equal and positive increments.
+C
+   40 NS = N*INCX
+      DO 50 I = 1,NS,INCX
+        DSDOT = DSDOT + DBLE(SX(I))*DBLE(SY(I))
+   50 CONTINUE
+      SDSDOT = DSDOT
+      RETURN
+      END
+      SUBROUTINE SGBMV ( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX,
+     $                   BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      REAL               ALPHA, BETA
+      INTEGER            INCX, INCY, KL, KU, LDA, M, N
+      CHARACTER*1        TRANS
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  SGBMV  performs one of the matrix-vector operations
+*
+*     y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are vectors and A is an
+*  m by n band matrix, with kl sub-diagonals and ku super-diagonals.
+*
+*  Parameters
+*  ==========
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.
+*
+*              TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.
+*
+*              TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of the matrix A.
+*           M must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  KL     - INTEGER.
+*           On entry, KL specifies the number of sub-diagonals of the
+*           matrix A. KL must satisfy  0 .le. KL.
+*           Unchanged on exit.
+*
+*  KU     - INTEGER.
+*           On entry, KU specifies the number of super-diagonals of the
+*           matrix A. KU must satisfy  0 .le. KU.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - REAL             array of DIMENSION ( LDA, n ).
+*           Before entry, the leading ( kl + ku + 1 ) by n part of the
+*           array A must contain the matrix of coefficients, supplied
+*           column by column, with the leading diagonal of the matrix in
+*           row ( ku + 1 ) of the array, the first super-diagonal
+*           starting at position 2 in row ku, the first sub-diagonal
+*           starting at position 1 in row ( ku + 2 ), and so on.
+*           Elements in the array A that do not correspond to elements
+*           in the band matrix (such as the top left ku by ku triangle)
+*           are not referenced.
+*           The following program segment will transfer a band matrix
+*           from conventional full matrix storage to band storage:
+*
+*                 DO 20, J = 1, N
+*                    K = KU + 1 - J
+*                    DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL )
+*                       A( K + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           ( kl + ku + 1 ).
+*           Unchanged on exit.
+*
+*  X      - REAL             array of DIMENSION at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
+*           and at least
+*           ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
+*           Before entry, the incremented array X must contain the
+*           vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - REAL            .
+*           On entry, BETA specifies the scalar beta. When BETA is
+*           supplied as zero then Y need not be set on input.
+*           Unchanged on exit.
+*
+*  Y      - REAL             array of DIMENSION at least
+*           ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
+*           and at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
+*           Before entry, the incremented array Y must contain the
+*           vector y. On exit, Y is overwritten by the updated vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*     .. Parameters ..
+      REAL               ONE         , ZERO
+      PARAMETER        ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      REAL               TEMP
+      INTEGER            I, INFO, IX, IY, J, JX, JY, K, KUP1, KX, KY,
+     $                   LENX, LENY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 1
+      ELSE IF( M.LT.0 )THEN
+         INFO = 2
+      ELSE IF( N.LT.0 )THEN
+         INFO = 3
+      ELSE IF( KL.LT.0 )THEN
+         INFO = 4
+      ELSE IF( KU.LT.0 )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.( KL + KU + 1 ) )THEN
+         INFO = 8
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 10
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 13
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'SGBMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set
+*     up the start points in  X  and  Y.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         LENX = N
+         LENY = M
+      ELSE
+         LENX = M
+         LENY = N
+      END IF
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( LENX - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( LENY - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through the band part of A.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, LENY
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, LENY
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, LENY
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, LENY
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      KUP1 = KU + 1
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  y := alpha*A*x + y.
+*
+         JX = KX
+         IF( INCY.EQ.1 )THEN
+            DO 60, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  K    = KUP1 - J
+                  DO 50, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                     Y( I ) = Y( I ) + TEMP*A( K + I, J )
+   50             CONTINUE
+               END IF
+               JX = JX + INCX
+   60       CONTINUE
+         ELSE
+            DO 80, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  IY   = KY
+                  K    = KUP1 - J
+                  DO 70, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                     Y( IY ) = Y( IY ) + TEMP*A( K + I, J )
+                     IY      = IY      + INCY
+   70             CONTINUE
+               END IF
+               JX = JX + INCX
+               IF( J.GT.KU )
+     $            KY = KY + INCY
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y := alpha*A'*x + y.
+*
+         JY = KY
+         IF( INCX.EQ.1 )THEN
+            DO 100, J = 1, N
+               TEMP = ZERO
+               K    = KUP1 - J
+               DO 90, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                  TEMP = TEMP + A( K + I, J )*X( I )
+   90          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP
+               JY      = JY      + INCY
+  100       CONTINUE
+         ELSE
+            DO 120, J = 1, N
+               TEMP = ZERO
+               IX   = KX
+               K    = KUP1 - J
+               DO 110, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                  TEMP = TEMP + A( K + I, J )*X( IX )
+                  IX   = IX   + INCX
+  110          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP
+               JY      = JY      + INCY
+               IF( J.GT.KU )
+     $            KX = KX + INCX
+  120       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of SGBMV .
+*
+      END
+      SUBROUTINE SGEMM ( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        TRANSA, TRANSB
+      INTEGER            M, N, K, LDA, LDB, LDC
+      REAL               ALPHA, BETA
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), B( LDB, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  SGEMM  performs one of the matrix-matrix operations
+*
+*     C := alpha*op( A )*op( B ) + beta*C,
+*
+*  where  op( X ) is one of
+*
+*     op( X ) = X   or   op( X ) = X',
+*
+*  alpha and beta are scalars, and A, B and C are matrices, with op( A )
+*  an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+*
+*  Parameters
+*  ==========
+*
+*  TRANSA - CHARACTER*1.
+*           On entry, TRANSA specifies the form of op( A ) to be used in
+*           the matrix multiplication as follows:
+*
+*              TRANSA = 'N' or 'n',  op( A ) = A.
+*
+*              TRANSA = 'T' or 't',  op( A ) = A'.
+*
+*              TRANSA = 'C' or 'c',  op( A ) = A'.
+*
+*           Unchanged on exit.
+*
+*  TRANSB - CHARACTER*1.
+*           On entry, TRANSB specifies the form of op( B ) to be used in
+*           the matrix multiplication as follows:
+*
+*              TRANSB = 'N' or 'n',  op( B ) = B.
+*
+*              TRANSB = 'T' or 't',  op( B ) = B'.
+*
+*              TRANSB = 'C' or 'c',  op( B ) = B'.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry,  M  specifies  the number  of rows  of the  matrix
+*           op( A )  and of the  matrix  C.  M  must  be at least  zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry,  N  specifies the number  of columns of the matrix
+*           op( B ) and the number of columns of the matrix C. N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry,  K  specifies  the number of columns of the matrix
+*           op( A ) and the number of rows of the matrix op( B ). K must
+*           be at least  zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - REAL             array of DIMENSION ( LDA, ka ), where ka is
+*           k  when  TRANSA = 'N' or 'n',  and is  m  otherwise.
+*           Before entry with  TRANSA = 'N' or 'n',  the leading  m by k
+*           part of the array  A  must contain the matrix  A,  otherwise
+*           the leading  k by m  part of the array  A  must contain  the
+*           matrix A.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. When  TRANSA = 'N' or 'n' then
+*           LDA must be at least  max( 1, m ), otherwise  LDA must be at
+*           least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  B      - REAL             array of DIMENSION ( LDB, kb ), where kb is
+*           n  when  TRANSB = 'N' or 'n',  and is  k  otherwise.
+*           Before entry with  TRANSB = 'N' or 'n',  the leading  k by n
+*           part of the array  B  must contain the matrix  B,  otherwise
+*           the leading  n by k  part of the array  B  must contain  the
+*           matrix B.
+*           Unchanged on exit.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in the calling (sub) program. When  TRANSB = 'N' or 'n' then
+*           LDB must be at least  max( 1, k ), otherwise  LDB must be at
+*           least  max( 1, n ).
+*           Unchanged on exit.
+*
+*  BETA   - REAL            .
+*           On entry,  BETA  specifies the scalar  beta.  When  BETA  is
+*           supplied as zero then C need not be set on input.
+*           Unchanged on exit.
+*
+*  C      - REAL             array of DIMENSION ( LDC, n ).
+*           Before entry, the leading  m by n  part of the array  C must
+*           contain the matrix  C,  except when  beta  is zero, in which
+*           case C need not be set on entry.
+*           On exit, the array  C  is overwritten by the  m by n  matrix
+*           ( alpha*op( A )*op( B ) + beta*C ).
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            NOTA, NOTB
+      INTEGER            I, INFO, J, L, NCOLA, NROWA, NROWB
+      REAL               TEMP
+*     .. Parameters ..
+      REAL               ONE         , ZERO
+      PARAMETER        ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     ..
+*     .. Executable Statements ..
+*
+*     Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
+*     transposed and set  NROWA, NCOLA and  NROWB  as the number of rows
+*     and  columns of  A  and the  number of  rows  of  B  respectively.
+*
+      NOTA  = LSAME( TRANSA, 'N' )
+      NOTB  = LSAME( TRANSB, 'N' )
+      IF( NOTA )THEN
+         NROWA = M
+         NCOLA = K
+      ELSE
+         NROWA = K
+         NCOLA = M
+      END IF
+      IF( NOTB )THEN
+         NROWB = K
+      ELSE
+         NROWB = N
+      END IF
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF(      ( .NOT.NOTA                 ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'C' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'T' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.NOTB                 ).AND.
+     $         ( .NOT.LSAME( TRANSB, 'C' ) ).AND.
+     $         ( .NOT.LSAME( TRANSB, 'T' ) )      )THEN
+         INFO = 2
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( K  .LT.0               )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 8
+      ELSE IF( LDB.LT.MAX( 1, NROWB ) )THEN
+         INFO = 10
+      ELSE IF( LDC.LT.MAX( 1, M     ) )THEN
+         INFO = 13
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'SGEMM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And if  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( BETA.EQ.ZERO )THEN
+            DO 20, J = 1, N
+               DO 10, I = 1, M
+                  C( I, J ) = ZERO
+   10          CONTINUE
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               DO 30, I = 1, M
+                  C( I, J ) = BETA*C( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( NOTB )THEN
+         IF( NOTA )THEN
+*
+*           Form  C := alpha*A*B + beta*C.
+*
+            DO 90, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 50, I = 1, M
+                     C( I, J ) = ZERO
+   50             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 60, I = 1, M
+                     C( I, J ) = BETA*C( I, J )
+   60             CONTINUE
+               END IF
+               DO 80, L = 1, K
+                  IF( B( L, J ).NE.ZERO )THEN
+                     TEMP = ALPHA*B( L, J )
+                     DO 70, I = 1, M
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+   70                CONTINUE
+                  END IF
+   80          CONTINUE
+   90       CONTINUE
+         ELSE
+*
+*           Form  C := alpha*A'*B + beta*C
+*
+            DO 120, J = 1, N
+               DO 110, I = 1, M
+                  TEMP = ZERO
+                  DO 100, L = 1, K
+                     TEMP = TEMP + A( L, I )*B( L, J )
+  100             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  110          CONTINUE
+  120       CONTINUE
+         END IF
+      ELSE
+         IF( NOTA )THEN
+*
+*           Form  C := alpha*A*B' + beta*C
+*
+            DO 170, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 130, I = 1, M
+                     C( I, J ) = ZERO
+  130             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 140, I = 1, M
+                     C( I, J ) = BETA*C( I, J )
+  140             CONTINUE
+               END IF
+               DO 160, L = 1, K
+                  IF( B( J, L ).NE.ZERO )THEN
+                     TEMP = ALPHA*B( J, L )
+                     DO 150, I = 1, M
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  150                CONTINUE
+                  END IF
+  160          CONTINUE
+  170       CONTINUE
+         ELSE
+*
+*           Form  C := alpha*A'*B' + beta*C
+*
+            DO 200, J = 1, N
+               DO 190, I = 1, M
+                  TEMP = ZERO
+                  DO 180, L = 1, K
+                     TEMP = TEMP + A( L, I )*B( J, L )
+  180             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  190          CONTINUE
+  200       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of SGEMM .
+*
+      END
+      SUBROUTINE SGEMV ( TRANS, M, N, ALPHA, A, LDA, X, INCX,
+     $                   BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      REAL               ALPHA, BETA
+      INTEGER            INCX, INCY, LDA, M, N
+      CHARACTER*1        TRANS
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  SGEMV  performs one of the matrix-vector operations
+*
+*     y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are vectors and A is an
+*  m by n matrix.
+*
+*  Parameters
+*  ==========
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.
+*
+*              TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.
+*
+*              TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of the matrix A.
+*           M must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - REAL             array of DIMENSION ( LDA, n ).
+*           Before entry, the leading m by n part of the array A must
+*           contain the matrix of coefficients.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*  X      - REAL             array of DIMENSION at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
+*           and at least
+*           ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
+*           Before entry, the incremented array X must contain the
+*           vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - REAL            .
+*           On entry, BETA specifies the scalar beta. When BETA is
+*           supplied as zero then Y need not be set on input.
+*           Unchanged on exit.
+*
+*  Y      - REAL             array of DIMENSION at least
+*           ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
+*           and at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
+*           Before entry with BETA non-zero, the incremented array Y
+*           must contain the vector y. On exit, Y is overwritten by the
+*           updated vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      REAL               ONE         , ZERO
+      PARAMETER        ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      REAL               TEMP
+      INTEGER            I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 1
+      ELSE IF( M.LT.0 )THEN
+         INFO = 2
+      ELSE IF( N.LT.0 )THEN
+         INFO = 3
+      ELSE IF( LDA.LT.MAX( 1, M ) )THEN
+         INFO = 6
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 8
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 11
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'SGEMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set
+*     up the start points in  X  and  Y.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         LENX = N
+         LENY = M
+      ELSE
+         LENX = M
+         LENY = N
+      END IF
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( LENX - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( LENY - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, LENY
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, LENY
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, LENY
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, LENY
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  y := alpha*A*x + y.
+*
+         JX = KX
+         IF( INCY.EQ.1 )THEN
+            DO 60, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  DO 50, I = 1, M
+                     Y( I ) = Y( I ) + TEMP*A( I, J )
+   50             CONTINUE
+               END IF
+               JX = JX + INCX
+   60       CONTINUE
+         ELSE
+            DO 80, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  IY   = KY
+                  DO 70, I = 1, M
+                     Y( IY ) = Y( IY ) + TEMP*A( I, J )
+                     IY      = IY      + INCY
+   70             CONTINUE
+               END IF
+               JX = JX + INCX
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y := alpha*A'*x + y.
+*
+         JY = KY
+         IF( INCX.EQ.1 )THEN
+            DO 100, J = 1, N
+               TEMP = ZERO
+               DO 90, I = 1, M
+                  TEMP = TEMP + A( I, J )*X( I )
+   90          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP
+               JY      = JY      + INCY
+  100       CONTINUE
+         ELSE
+            DO 120, J = 1, N
+               TEMP = ZERO
+               IX   = KX
+               DO 110, I = 1, M
+                  TEMP = TEMP + A( I, J )*X( IX )
+                  IX   = IX   + INCX
+  110          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP
+               JY      = JY      + INCY
+  120       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of SGEMV .
+*
+      END
+      SUBROUTINE SGER  ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
+*     .. Scalar Arguments ..
+      REAL               ALPHA
+      INTEGER            INCX, INCY, LDA, M, N
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  SGER   performs the rank 1 operation
+*
+*     A := alpha*x*y' + A,
+*
+*  where alpha is a scalar, x is an m element vector, y is an n element
+*  vector and A is an m by n matrix.
+*
+*  Parameters
+*  ==========
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of the matrix A.
+*           M must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - REAL             array of dimension at least
+*           ( 1 + ( m - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the m
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  Y      - REAL             array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y.
+*           Unchanged on exit.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*  A      - REAL             array of DIMENSION ( LDA, n ).
+*           Before entry, the leading m by n part of the array A must
+*           contain the matrix of coefficients. On exit, A is
+*           overwritten by the updated matrix.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER        ( ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      REAL               TEMP
+      INTEGER            I, INFO, IX, J, JY, KX
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( M.LT.0 )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 7
+      ELSE IF( LDA.LT.MAX( 1, M ) )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'SGER  ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( INCY.GT.0 )THEN
+         JY = 1
+      ELSE
+         JY = 1 - ( N - 1 )*INCY
+      END IF
+      IF( INCX.EQ.1 )THEN
+         DO 20, J = 1, N
+            IF( Y( JY ).NE.ZERO )THEN
+               TEMP = ALPHA*Y( JY )
+               DO 10, I = 1, M
+                  A( I, J ) = A( I, J ) + X( I )*TEMP
+   10          CONTINUE
+            END IF
+            JY = JY + INCY
+   20    CONTINUE
+      ELSE
+         IF( INCX.GT.0 )THEN
+            KX = 1
+         ELSE
+            KX = 1 - ( M - 1 )*INCX
+         END IF
+         DO 40, J = 1, N
+            IF( Y( JY ).NE.ZERO )THEN
+               TEMP = ALPHA*Y( JY )
+               IX   = KX
+               DO 30, I = 1, M
+                  A( I, J ) = A( I, J ) + X( IX )*TEMP
+                  IX        = IX        + INCX
+   30          CONTINUE
+            END IF
+            JY = JY + INCY
+   40    CONTINUE
+      END IF
+*
+      RETURN
+*
+*     End of SGER  .
+*
+      END
+      REAL             FUNCTION SNRM2 ( N, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER                           INCX, N
+*     .. Array Arguments ..
+      REAL                              X( * )
+*     ..
+*
+*  SNRM2 returns the euclidean norm of a vector via the function
+*  name, so that
+*
+*     SNRM2 := sqrt( x'*x )
+*
+*
+*
+*  -- This version written on 25-October-1982.
+*     Modified on 14-October-1993 to inline the call to SLASSQ.
+*     Sven Hammarling, Nag Ltd.
+*
+*
+*     .. Parameters ..
+      REAL                  ONE         , ZERO
+      PARAMETER           ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      INTEGER               IX
+      REAL                  ABSXI, NORM, SCALE, SSQ
+*     .. Intrinsic Functions ..
+      INTRINSIC             ABS, SQRT
+*     ..
+*     .. Executable Statements ..
+      IF( N.LT.1 .OR. INCX.LT.1 )THEN
+         NORM  = ZERO
+      ELSE IF( N.EQ.1 )THEN
+         NORM  = ABS( X( 1 ) )
+      ELSE
+         SCALE = ZERO
+         SSQ   = ONE
+*        The following loop is equivalent to this call to the LAPACK
+*        auxiliary routine:
+*        CALL SLASSQ( N, X, INCX, SCALE, SSQ )
+*
+         DO 10, IX = 1, 1 + ( N - 1 )*INCX, INCX
+            IF( X( IX ).NE.ZERO )THEN
+               ABSXI = ABS( X( IX ) )
+               IF( SCALE.LT.ABSXI )THEN
+                  SSQ   = ONE   + SSQ*( SCALE/ABSXI )**2
+                  SCALE = ABSXI
+               ELSE
+                  SSQ   = SSQ   +     ( ABSXI/SCALE )**2
+               END IF
+            END IF
+   10    CONTINUE
+         NORM  = SCALE * SQRT( SSQ )
+      END IF
+*
+      SNRM2 = NORM
+      RETURN
+*
+*     End of SNRM2.
+*
+      END
+      subroutine srot (n,sx,incx,sy,incy,c,s)
+c
+c     applies a plane rotation.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      real sx(*),sy(*),stemp,c,s
+      integer i,incx,incy,ix,iy,n
+c
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c       code for unequal increments or equal increments not equal
+c         to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        stemp = c*sx(ix) + s*sy(iy)
+        sy(iy) = c*sy(iy) - s*sx(ix)
+        sx(ix) = stemp
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      return
+c
+c       code for both increments equal to 1
+c
+   20 do 30 i = 1,n
+        stemp = c*sx(i) + s*sy(i)
+        sy(i) = c*sy(i) - s*sx(i)
+        sx(i) = stemp
+   30 continue
+      return
+      end
+      subroutine srotg(sa,sb,c,s)
+c
+c     construct givens plane rotation.
+c     jack dongarra, linpack, 3/11/78.
+c
+      real sa,sb,c,s,roe,scale,r,z
+c
+      roe = sb
+      if( abs(sa) .gt. abs(sb) ) roe = sa
+      scale = abs(sa) + abs(sb)
+      if( scale .ne. 0.0 ) go to 10
+         c = 1.0
+         s = 0.0
+         r = 0.0
+         z = 0.0
+         go to 20
+   10 r = scale*sqrt((sa/scale)**2 + (sb/scale)**2)
+      r = sign(1.0,roe)*r
+      c = sa/r
+      s = sb/r
+      z = 1.0
+      if( abs(sa) .gt. abs(sb) ) z = s
+      if( abs(sb) .ge. abs(sa) .and. c .ne. 0.0 ) z = 1.0/c
+   20 sa = r
+      sb = z
+      return
+      end
+      SUBROUTINE SROTM (N,SX,INCX,SY,INCY,SPARAM)
+C
+C     APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX
+C
+C     (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN
+C     (DX**T)
+C
+C     SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE
+C     LX = (-INCX)*N, AND SIMILARLY FOR SY USING USING LY AND INCY.
+C     WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS..
+C
+C     SFLAG=-1.E0     SFLAG=0.E0        SFLAG=1.E0     SFLAG=-2.E0
+C
+C       (SH11  SH12)    (1.E0  SH12)    (SH11  1.E0)    (1.E0  0.E0)
+C     H=(          )    (          )    (          )    (          )
+C       (SH21  SH22),   (SH21  1.E0),   (-1.E0 SH22),   (0.E0  1.E0).
+C     SEE  SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM.
+C
+      DIMENSION SX(1),SY(1),SPARAM(5)
+      DATA ZERO,TWO/0.E0,2.E0/
+C
+      SFLAG=SPARAM(1)
+      IF(N .LE. 0 .OR.(SFLAG+TWO.EQ.ZERO)) GO TO 140
+          IF(.NOT.(INCX.EQ.INCY.AND. INCX .GT.0)) GO TO 70
+C
+               NSTEPS=N*INCX
+               IF(SFLAG) 50,10,30
+   10          CONTINUE
+               SH12=SPARAM(4)
+               SH21=SPARAM(3)
+                    DO 20 I=1,NSTEPS,INCX
+                    W=SX(I)
+                    Z=SY(I)
+                    SX(I)=W+Z*SH12
+                    SY(I)=W*SH21+Z
+   20               CONTINUE
+               GO TO 140
+   30          CONTINUE
+               SH11=SPARAM(2)
+               SH22=SPARAM(5)
+                    DO 40 I=1,NSTEPS,INCX
+                    W=SX(I)
+                    Z=SY(I)
+                    SX(I)=W*SH11+Z
+                    SY(I)=-W+SH22*Z
+   40               CONTINUE
+               GO TO 140
+   50          CONTINUE
+               SH11=SPARAM(2)
+               SH12=SPARAM(4)
+               SH21=SPARAM(3)
+               SH22=SPARAM(5)
+                    DO 60 I=1,NSTEPS,INCX
+                    W=SX(I)
+                    Z=SY(I)
+                    SX(I)=W*SH11+Z*SH12
+                    SY(I)=W*SH21+Z*SH22
+   60               CONTINUE
+               GO TO 140
+   70     CONTINUE
+          KX=1
+          KY=1
+          IF(INCX .LT. 0) KX=1+(1-N)*INCX
+          IF(INCY .LT. 0) KY=1+(1-N)*INCY
+C
+          IF(SFLAG)120,80,100
+   80     CONTINUE
+          SH12=SPARAM(4)
+          SH21=SPARAM(3)
+               DO 90 I=1,N
+               W=SX(KX)
+               Z=SY(KY)
+               SX(KX)=W+Z*SH12
+               SY(KY)=W*SH21+Z
+               KX=KX+INCX
+               KY=KY+INCY
+   90          CONTINUE
+          GO TO 140
+  100     CONTINUE
+          SH11=SPARAM(2)
+          SH22=SPARAM(5)
+               DO 110 I=1,N
+               W=SX(KX)
+               Z=SY(KY)
+               SX(KX)=W*SH11+Z
+               SY(KY)=-W+SH22*Z
+               KX=KX+INCX
+               KY=KY+INCY
+  110          CONTINUE
+          GO TO 140
+  120     CONTINUE
+          SH11=SPARAM(2)
+          SH12=SPARAM(4)
+          SH21=SPARAM(3)
+          SH22=SPARAM(5)
+               DO 130 I=1,N
+               W=SX(KX)
+               Z=SY(KY)
+               SX(KX)=W*SH11+Z*SH12
+               SY(KY)=W*SH21+Z*SH22
+               KX=KX+INCX
+               KY=KY+INCY
+  130          CONTINUE
+  140     CONTINUE
+          RETURN
+          END
+      SUBROUTINE SROTMG (SD1,SD2,SX1,SY1,SPARAM)
+C
+C     CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS
+C     THE SECOND COMPONENT OF THE 2-VECTOR  (SQRT(SD1)*SX1,SQRT(SD2)*
+C     SY2)**T.
+C     WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS..
+C
+C     SFLAG=-1.E0     SFLAG=0.E0        SFLAG=1.E0     SFLAG=-2.E0
+C
+C       (SH11  SH12)    (1.E0  SH12)    (SH11  1.E0)    (1.E0  0.E0)
+C     H=(          )    (          )    (          )    (          )
+C       (SH21  SH22),   (SH21  1.E0),   (-1.E0 SH22),   (0.E0  1.E0).
+C     LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22
+C     RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE
+C     VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.)
+C
+C     THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE
+C     INEXACT.  THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE
+C     OF SD1 AND SD2.  ALL ACTUAL SCALING OF DATA IS DONE USING GAM.
+C
+      DIMENSION SPARAM(5)
+C
+      DATA ZERO,ONE,TWO /0.E0,1.E0,2.E0/
+      DATA GAM,GAMSQ,RGAMSQ/4096.E0,1.67772E7,5.96046E-8/
+      IF(.NOT. SD1 .LT. ZERO) GO TO 10
+C       GO ZERO-H-D-AND-SX1..
+          GO TO 60
+   10 CONTINUE
+C     CASE-SD1-NONNEGATIVE
+      SP2=SD2*SY1
+      IF(.NOT. SP2 .EQ. ZERO) GO TO 20
+          SFLAG=-TWO
+          GO TO 260
+C     REGULAR-CASE..
+   20 CONTINUE
+      SP1=SD1*SX1
+      SQ2=SP2*SY1
+      SQ1=SP1*SX1
+C
+      IF(.NOT. ABS(SQ1) .GT. ABS(SQ2)) GO TO 40
+          SH21=-SY1/SX1
+          SH12=SP2/SP1
+C
+          SU=ONE-SH12*SH21
+C
+          IF(.NOT. SU .LE. ZERO) GO TO 30
+C         GO ZERO-H-D-AND-SX1..
+               GO TO 60
+   30     CONTINUE
+               SFLAG=ZERO
+               SD1=SD1/SU
+               SD2=SD2/SU
+               SX1=SX1*SU
+C         GO SCALE-CHECK..
+               GO TO 100
+   40 CONTINUE
+          IF(.NOT. SQ2 .LT. ZERO) GO TO 50
+C         GO ZERO-H-D-AND-SX1..
+               GO TO 60
+   50     CONTINUE
+               SFLAG=ONE
+               SH11=SP1/SP2
+               SH22=SX1/SY1
+               SU=ONE+SH11*SH22
+               STEMP=SD2/SU
+               SD2=SD1/SU
+               SD1=STEMP
+               SX1=SY1*SU
+C         GO SCALE-CHECK
+               GO TO 100
+C     PROCEDURE..ZERO-H-D-AND-SX1..
+   60 CONTINUE
+          SFLAG=-ONE
+          SH11=ZERO
+          SH12=ZERO
+          SH21=ZERO
+          SH22=ZERO
+C
+          SD1=ZERO
+          SD2=ZERO
+          SX1=ZERO
+C         RETURN..
+          GO TO 220
+C     PROCEDURE..FIX-H..
+   70 CONTINUE
+      IF(.NOT. SFLAG .GE. ZERO) GO TO 90
+C
+          IF(.NOT. SFLAG .EQ. ZERO) GO TO 80
+          SH11=ONE
+          SH22=ONE
+          SFLAG=-ONE
+          GO TO 90
+   80     CONTINUE
+          SH21=-ONE
+          SH12=ONE
+          SFLAG=-ONE
+   90 CONTINUE
+      GO TO IGO,(120,150,180,210)
+C     PROCEDURE..SCALE-CHECK
+  100 CONTINUE
+  110     CONTINUE
+          IF(.NOT. SD1 .LE. RGAMSQ) GO TO 130
+               IF(SD1 .EQ. ZERO) GO TO 160
+               ASSIGN 120 TO IGO
+C              FIX-H..
+               GO TO 70
+  120          CONTINUE
+               SD1=SD1*GAM**2
+               SX1=SX1/GAM
+               SH11=SH11/GAM
+               SH12=SH12/GAM
+          GO TO 110
+  130 CONTINUE
+  140     CONTINUE
+          IF(.NOT. SD1 .GE. GAMSQ) GO TO 160
+               ASSIGN 150 TO IGO
+C              FIX-H..
+               GO TO 70
+  150          CONTINUE
+               SD1=SD1/GAM**2
+               SX1=SX1*GAM
+               SH11=SH11*GAM
+               SH12=SH12*GAM
+          GO TO 140
+  160 CONTINUE
+  170     CONTINUE
+          IF(.NOT. ABS(SD2) .LE. RGAMSQ) GO TO 190
+               IF(SD2 .EQ. ZERO) GO TO 220
+               ASSIGN 180 TO IGO
+C              FIX-H..
+               GO TO 70
+  180          CONTINUE
+               SD2=SD2*GAM**2
+               SH21=SH21/GAM
+               SH22=SH22/GAM
+          GO TO 170
+  190 CONTINUE
+  200     CONTINUE
+          IF(.NOT. ABS(SD2) .GE. GAMSQ) GO TO 220
+               ASSIGN 210 TO IGO
+C              FIX-H..
+               GO TO 70
+  210          CONTINUE
+               SD2=SD2/GAM**2
+               SH21=SH21*GAM
+               SH22=SH22*GAM
+          GO TO 200
+  220 CONTINUE
+          IF(SFLAG)250,230,240
+  230     CONTINUE
+               SPARAM(3)=SH21
+               SPARAM(4)=SH12
+               GO TO 260
+  240     CONTINUE
+               SPARAM(2)=SH11
+               SPARAM(5)=SH22
+               GO TO 260
+  250     CONTINUE
+               SPARAM(2)=SH11
+               SPARAM(3)=SH21
+               SPARAM(4)=SH12
+               SPARAM(5)=SH22
+  260 CONTINUE
+          SPARAM(1)=SFLAG
+          RETURN
+      END
+      SUBROUTINE SSBMV ( UPLO, N, K, ALPHA, A, LDA, X, INCX,
+     $                   BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      REAL               ALPHA, BETA
+      INTEGER            INCX, INCY, K, LDA, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  SSBMV  performs the matrix-vector  operation
+*
+*     y := alpha*A*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are n element vectors and
+*  A is an n by n symmetric band matrix, with k super-diagonals.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the band matrix A is being supplied as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   The upper triangular part of A is
+*                                  being supplied.
+*
+*              UPLO = 'L' or 'l'   The lower triangular part of A is
+*                                  being supplied.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry, K specifies the number of super-diagonals of the
+*           matrix A. K must satisfy  0 .le. K.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - REAL             array of DIMENSION ( LDA, n ).
+*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
+*           by n part of the array A must contain the upper triangular
+*           band part of the symmetric matrix, supplied column by
+*           column, with the leading diagonal of the matrix in row
+*           ( k + 1 ) of the array, the first super-diagonal starting at
+*           position 2 in row k, and so on. The top left k by k triangle
+*           of the array A is not referenced.
+*           The following program segment will transfer the upper
+*           triangular part of a symmetric band matrix from conventional
+*           full matrix storage to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = K + 1 - J
+*                    DO 10, I = MAX( 1, J - K ), J
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
+*           by n part of the array A must contain the lower triangular
+*           band part of the symmetric matrix, supplied column by
+*           column, with the leading diagonal of the matrix in row 1 of
+*           the array, the first sub-diagonal starting at position 1 in
+*           row 2, and so on. The bottom right k by k triangle of the
+*           array A is not referenced.
+*           The following program segment will transfer the lower
+*           triangular part of a symmetric band matrix from conventional
+*           full matrix storage to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = 1 - J
+*                    DO 10, I = J, MIN( N, J + K )
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           ( k + 1 ).
+*           Unchanged on exit.
+*
+*  X      - REAL             array of DIMENSION at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the
+*           vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - REAL            .
+*           On entry, BETA specifies the scalar beta.
+*           Unchanged on exit.
+*
+*  Y      - REAL             array of DIMENSION at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the
+*           vector y. On exit, Y is overwritten by the updated vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      REAL               ONE         , ZERO
+      PARAMETER        ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      REAL               TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( K.LT.0 )THEN
+         INFO = 3
+      ELSE IF( LDA.LT.( K + 1 ) )THEN
+         INFO = 6
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 8
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 11
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'SSBMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     Set up the start points in  X  and  Y.
+*
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( N - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( N - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of the array A
+*     are accessed sequentially with one pass through A.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, N
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, N
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, N
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, N
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  y  when upper triangle of A is stored.
+*
+         KPLUS1 = K + 1
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               TEMP1 = ALPHA*X( J )
+               TEMP2 = ZERO
+               L     = KPLUS1 - J
+               DO 50, I = MAX( 1, J - K ), J - 1
+                  Y( I ) = Y( I ) + TEMP1*A( L + I, J )
+                  TEMP2  = TEMP2  + A( L + I, J )*X( I )
+   50          CONTINUE
+               Y( J ) = Y( J ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2
+   60       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 80, J = 1, N
+               TEMP1 = ALPHA*X( JX )
+               TEMP2 = ZERO
+               IX    = KX
+               IY    = KY
+               L     = KPLUS1 - J
+               DO 70, I = MAX( 1, J - K ), J - 1
+                  Y( IY ) = Y( IY ) + TEMP1*A( L + I, J )
+                  TEMP2   = TEMP2   + A( L + I, J )*X( IX )
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+   70          CONTINUE
+               Y( JY ) = Y( JY ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+               IF( J.GT.K )THEN
+                  KX = KX + INCX
+                  KY = KY + INCY
+               END IF
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y  when lower triangle of A is stored.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 100, J = 1, N
+               TEMP1  = ALPHA*X( J )
+               TEMP2  = ZERO
+               Y( J ) = Y( J )       + TEMP1*A( 1, J )
+               L      = 1            - J
+               DO 90, I = J + 1, MIN( N, J + K )
+                  Y( I ) = Y( I ) + TEMP1*A( L + I, J )
+                  TEMP2  = TEMP2  + A( L + I, J )*X( I )
+   90          CONTINUE
+               Y( J ) = Y( J ) + ALPHA*TEMP2
+  100       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 120, J = 1, N
+               TEMP1   = ALPHA*X( JX )
+               TEMP2   = ZERO
+               Y( JY ) = Y( JY )       + TEMP1*A( 1, J )
+               L       = 1             - J
+               IX      = JX
+               IY      = JY
+               DO 110, I = J + 1, MIN( N, J + K )
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+                  Y( IY ) = Y( IY ) + TEMP1*A( L + I, J )
+                  TEMP2   = TEMP2   + A( L + I, J )*X( IX )
+  110          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+  120       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of SSBMV .
+*
+      END
+      subroutine sscal(n,sa,sx,incx)
+c
+c     scales a vector by a constant.
+c     uses unrolled loops for increment equal to 1.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 3/93 to return if incx .le. 0.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      real sa,sx(*)
+      integer i,incx,m,mp1,n,nincx
+c
+      if( n.le.0 .or. incx.le.0 )return
+      if(incx.eq.1)go to 20
+c
+c        code for increment not equal to 1
+c
+      nincx = n*incx
+      do 10 i = 1,nincx,incx
+        sx(i) = sa*sx(i)
+   10 continue
+      return
+c
+c        code for increment equal to 1
+c
+c
+c        clean-up loop
+c
+   20 m = mod(n,5)
+      if( m .eq. 0 ) go to 40
+      do 30 i = 1,m
+        sx(i) = sa*sx(i)
+   30 continue
+      if( n .lt. 5 ) return
+   40 mp1 = m + 1
+      do 50 i = mp1,n,5
+        sx(i) = sa*sx(i)
+        sx(i + 1) = sa*sx(i + 1)
+        sx(i + 2) = sa*sx(i + 2)
+        sx(i + 3) = sa*sx(i + 3)
+        sx(i + 4) = sa*sx(i + 4)
+   50 continue
+      return
+      end
+      SUBROUTINE SSPMV ( UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      REAL               ALPHA, BETA
+      INTEGER            INCX, INCY, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      REAL               AP( * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  SSPMV  performs the matrix-vector operation
+*
+*     y := alpha*A*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are n element vectors and
+*  A is an n by n symmetric matrix, supplied in packed form.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the matrix A is supplied in the packed
+*           array AP as follows:
+*
+*              UPLO = 'U' or 'u'   The upper triangular part of A is
+*                                  supplied in AP.
+*
+*              UPLO = 'L' or 'l'   The lower triangular part of A is
+*                                  supplied in AP.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  AP     - REAL             array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular part of the symmetric matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
+*           and a( 2, 2 ) respectively, and so on.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular part of the symmetric matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
+*           and a( 3, 1 ) respectively, and so on.
+*           Unchanged on exit.
+*
+*  X      - REAL             array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - REAL            .
+*           On entry, BETA specifies the scalar beta. When BETA is
+*           supplied as zero then Y need not be set on input.
+*           Unchanged on exit.
+*
+*  Y      - REAL             array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y. On exit, Y is overwritten by the updated
+*           vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      REAL               ONE         , ZERO
+      PARAMETER        ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      REAL               TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 6
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'SSPMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     Set up the start points in  X  and  Y.
+*
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( N - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( N - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of the array AP
+*     are accessed sequentially with one pass through AP.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, N
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, N
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, N
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, N
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      KK = 1
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  y  when AP contains the upper triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               TEMP1 = ALPHA*X( J )
+               TEMP2 = ZERO
+               K     = KK
+               DO 50, I = 1, J - 1
+                  Y( I ) = Y( I ) + TEMP1*AP( K )
+                  TEMP2  = TEMP2  + AP( K )*X( I )
+                  K      = K      + 1
+   50          CONTINUE
+               Y( J ) = Y( J ) + TEMP1*AP( KK + J - 1 ) + ALPHA*TEMP2
+               KK     = KK     + J
+   60       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 80, J = 1, N
+               TEMP1 = ALPHA*X( JX )
+               TEMP2 = ZERO
+               IX    = KX
+               IY    = KY
+               DO 70, K = KK, KK + J - 2
+                  Y( IY ) = Y( IY ) + TEMP1*AP( K )
+                  TEMP2   = TEMP2   + AP( K )*X( IX )
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+   70          CONTINUE
+               Y( JY ) = Y( JY ) + TEMP1*AP( KK + J - 1 ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+               KK      = KK      + J
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y  when AP contains the lower triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 100, J = 1, N
+               TEMP1  = ALPHA*X( J )
+               TEMP2  = ZERO
+               Y( J ) = Y( J )       + TEMP1*AP( KK )
+               K      = KK           + 1
+               DO 90, I = J + 1, N
+                  Y( I ) = Y( I ) + TEMP1*AP( K )
+                  TEMP2  = TEMP2  + AP( K )*X( I )
+                  K      = K      + 1
+   90          CONTINUE
+               Y( J ) = Y( J ) + ALPHA*TEMP2
+               KK     = KK     + ( N - J + 1 )
+  100       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 120, J = 1, N
+               TEMP1   = ALPHA*X( JX )
+               TEMP2   = ZERO
+               Y( JY ) = Y( JY )       + TEMP1*AP( KK )
+               IX      = JX
+               IY      = JY
+               DO 110, K = KK + 1, KK + N - J
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+                  Y( IY ) = Y( IY ) + TEMP1*AP( K )
+                  TEMP2   = TEMP2   + AP( K )*X( IX )
+  110          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+               KK      = KK      + ( N - J + 1 )
+  120       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of SSPMV .
+*
+      END
+      SUBROUTINE SSPR2 ( UPLO, N, ALPHA, X, INCX, Y, INCY, AP )
+*     .. Scalar Arguments ..
+      REAL               ALPHA
+      INTEGER            INCX, INCY, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      REAL               AP( * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  SSPR2  performs the symmetric rank 2 operation
+*
+*     A := alpha*x*y' + alpha*y*x' + A,
+*
+*  where alpha is a scalar, x and y are n element vectors and A is an
+*  n by n symmetric matrix, supplied in packed form.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the matrix A is supplied in the packed
+*           array AP as follows:
+*
+*              UPLO = 'U' or 'u'   The upper triangular part of A is
+*                                  supplied in AP.
+*
+*              UPLO = 'L' or 'l'   The lower triangular part of A is
+*                                  supplied in AP.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - REAL             array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  Y      - REAL             array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y.
+*           Unchanged on exit.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*  AP     - REAL             array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with  UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular part of the symmetric matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
+*           and a( 2, 2 ) respectively, and so on. On exit, the array
+*           AP is overwritten by the upper triangular part of the
+*           updated matrix.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular part of the symmetric matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
+*           and a( 3, 1 ) respectively, and so on. On exit, the array
+*           AP is overwritten by the lower triangular part of the
+*           updated matrix.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER        ( ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      REAL               TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 7
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'SSPR2 ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Set up the start points in X and Y if the increments are not both
+*     unity.
+*
+      IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN
+         IF( INCX.GT.0 )THEN
+            KX = 1
+         ELSE
+            KX = 1 - ( N - 1 )*INCX
+         END IF
+         IF( INCY.GT.0 )THEN
+            KY = 1
+         ELSE
+            KY = 1 - ( N - 1 )*INCY
+         END IF
+         JX = KX
+         JY = KY
+      END IF
+*
+*     Start the operations. In this version the elements of the array AP
+*     are accessed sequentially with one pass through AP.
+*
+      KK = 1
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  A  when upper triangle is stored in AP.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 20, J = 1, N
+               IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*Y( J )
+                  TEMP2 = ALPHA*X( J )
+                  K     = KK
+                  DO 10, I = 1, J
+                     AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2
+                     K       = K       + 1
+   10             CONTINUE
+               END IF
+               KK = KK + J
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*Y( JY )
+                  TEMP2 = ALPHA*X( JX )
+                  IX    = KX
+                  IY    = KY
+                  DO 30, K = KK, KK + J - 1
+                     AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2
+                     IX      = IX      + INCX
+                     IY      = IY      + INCY
+   30             CONTINUE
+               END IF
+               JX = JX + INCX
+               JY = JY + INCY
+               KK = KK + J
+   40       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  A  when lower triangle is stored in AP.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*Y( J )
+                  TEMP2 = ALPHA*X( J )
+                  K     = KK
+                  DO 50, I = J, N
+                     AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2
+                     K       = K       + 1
+   50             CONTINUE
+               END IF
+               KK = KK + N - J + 1
+   60       CONTINUE
+         ELSE
+            DO 80, J = 1, N
+               IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*Y( JY )
+                  TEMP2 = ALPHA*X( JX )
+                  IX    = JX
+                  IY    = JY
+                  DO 70, K = KK, KK + N - J
+                     AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2
+                     IX      = IX      + INCX
+                     IY      = IY      + INCY
+   70             CONTINUE
+               END IF
+               JX = JX + INCX
+               JY = JY + INCY
+               KK = KK + N - J + 1
+   80       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of SSPR2 .
+*
+      END
+      SUBROUTINE SSPR  ( UPLO, N, ALPHA, X, INCX, AP )
+*     .. Scalar Arguments ..
+      REAL               ALPHA
+      INTEGER            INCX, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      REAL               AP( * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  SSPR    performs the symmetric rank 1 operation
+*
+*     A := alpha*x*x' + A,
+*
+*  where alpha is a real scalar, x is an n element vector and A is an
+*  n by n symmetric matrix, supplied in packed form.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the matrix A is supplied in the packed
+*           array AP as follows:
+*
+*              UPLO = 'U' or 'u'   The upper triangular part of A is
+*                                  supplied in AP.
+*
+*              UPLO = 'L' or 'l'   The lower triangular part of A is
+*                                  supplied in AP.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - REAL             array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  AP     - REAL             array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with  UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular part of the symmetric matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
+*           and a( 2, 2 ) respectively, and so on. On exit, the array
+*           AP is overwritten by the upper triangular part of the
+*           updated matrix.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular part of the symmetric matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
+*           and a( 3, 1 ) respectively, and so on. On exit, the array
+*           AP is overwritten by the lower triangular part of the
+*           updated matrix.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER        ( ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      REAL               TEMP
+      INTEGER            I, INFO, IX, J, JX, K, KK, KX
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'SSPR  ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Set the start point in X if the increment is not unity.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of the array AP
+*     are accessed sequentially with one pass through AP.
+*
+      KK = 1
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  A  when upper triangle is stored in AP.
+*
+         IF( INCX.EQ.1 )THEN
+            DO 20, J = 1, N
+               IF( X( J ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( J )
+                  K    = KK
+                  DO 10, I = 1, J
+                     AP( K ) = AP( K ) + X( I )*TEMP
+                     K       = K       + 1
+   10             CONTINUE
+               END IF
+               KK = KK + J
+   20       CONTINUE
+         ELSE
+            JX = KX
+            DO 40, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  IX   = KX
+                  DO 30, K = KK, KK + J - 1
+                     AP( K ) = AP( K ) + X( IX )*TEMP
+                     IX      = IX      + INCX
+   30             CONTINUE
+               END IF
+               JX = JX + INCX
+               KK = KK + J
+   40       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  A  when lower triangle is stored in AP.
+*
+         IF( INCX.EQ.1 )THEN
+            DO 60, J = 1, N
+               IF( X( J ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( J )
+                  K    = KK
+                  DO 50, I = J, N
+                     AP( K ) = AP( K ) + X( I )*TEMP
+                     K       = K       + 1
+   50             CONTINUE
+               END IF
+               KK = KK + N - J + 1
+   60       CONTINUE
+         ELSE
+            JX = KX
+            DO 80, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  IX   = JX
+                  DO 70, K = KK, KK + N - J
+                     AP( K ) = AP( K ) + X( IX )*TEMP
+                     IX      = IX      + INCX
+   70             CONTINUE
+               END IF
+               JX = JX + INCX
+               KK = KK + N - J + 1
+   80       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of SSPR  .
+*
+      END
+      subroutine sswap (n,sx,incx,sy,incy)
+c
+c     interchanges two vectors.
+c     uses unrolled loops for increments equal to 1.
+c     jack dongarra, linpack, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      real sx(*),sy(*),stemp
+      integer i,incx,incy,ix,iy,m,mp1,n
+c
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c       code for unequal increments or equal increments not equal
+c         to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        stemp = sx(ix)
+        sx(ix) = sy(iy)
+        sy(iy) = stemp
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      return
+c
+c       code for both increments equal to 1
+c
+c
+c       clean-up loop
+c
+   20 m = mod(n,3)
+      if( m .eq. 0 ) go to 40
+      do 30 i = 1,m
+        stemp = sx(i)
+        sx(i) = sy(i)
+        sy(i) = stemp
+   30 continue
+      if( n .lt. 3 ) return
+   40 mp1 = m + 1
+      do 50 i = mp1,n,3
+        stemp = sx(i)
+        sx(i) = sy(i)
+        sy(i) = stemp
+        stemp = sx(i + 1)
+        sx(i + 1) = sy(i + 1)
+        sy(i + 1) = stemp
+        stemp = sx(i + 2)
+        sx(i + 2) = sy(i + 2)
+        sy(i + 2) = stemp
+   50 continue
+      return
+      end
+      SUBROUTINE SSYMM ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        SIDE, UPLO
+      INTEGER            M, N, LDA, LDB, LDC
+      REAL               ALPHA, BETA
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), B( LDB, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  SSYMM  performs one of the matrix-matrix operations
+*
+*     C := alpha*A*B + beta*C,
+*
+*  or
+*
+*     C := alpha*B*A + beta*C,
+*
+*  where alpha and beta are scalars,  A is a symmetric matrix and  B and
+*  C are  m by n matrices.
+*
+*  Parameters
+*  ==========
+*
+*  SIDE   - CHARACTER*1.
+*           On entry,  SIDE  specifies whether  the  symmetric matrix  A
+*           appears on the  left or right  in the  operation as follows:
+*
+*              SIDE = 'L' or 'l'   C := alpha*A*B + beta*C,
+*
+*              SIDE = 'R' or 'r'   C := alpha*B*A + beta*C,
+*
+*           Unchanged on exit.
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of  the  symmetric  matrix   A  is  to  be
+*           referenced as follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of the
+*                                  symmetric matrix is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of the
+*                                  symmetric matrix is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry,  M  specifies the number of rows of the matrix  C.
+*           M  must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix C.
+*           N  must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - REAL             array of DIMENSION ( LDA, ka ), where ka is
+*           m  when  SIDE = 'L' or 'l'  and is  n otherwise.
+*           Before entry  with  SIDE = 'L' or 'l',  the  m by m  part of
+*           the array  A  must contain the  symmetric matrix,  such that
+*           when  UPLO = 'U' or 'u', the leading m by m upper triangular
+*           part of the array  A  must contain the upper triangular part
+*           of the  symmetric matrix and the  strictly  lower triangular
+*           part of  A  is not referenced,  and when  UPLO = 'L' or 'l',
+*           the leading  m by m  lower triangular part  of the  array  A
+*           must  contain  the  lower triangular part  of the  symmetric
+*           matrix and the  strictly upper triangular part of  A  is not
+*           referenced.
+*           Before entry  with  SIDE = 'R' or 'r',  the  n by n  part of
+*           the array  A  must contain the  symmetric matrix,  such that
+*           when  UPLO = 'U' or 'u', the leading n by n upper triangular
+*           part of the array  A  must contain the upper triangular part
+*           of the  symmetric matrix and the  strictly  lower triangular
+*           part of  A  is not referenced,  and when  UPLO = 'L' or 'l',
+*           the leading  n by n  lower triangular part  of the  array  A
+*           must  contain  the  lower triangular part  of the  symmetric
+*           matrix and the  strictly upper triangular part of  A  is not
+*           referenced.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program.  When  SIDE = 'L' or 'l'  then
+*           LDA must be at least  max( 1, m ), otherwise  LDA must be at
+*           least  max( 1, n ).
+*           Unchanged on exit.
+*
+*  B      - REAL             array of DIMENSION ( LDB, n ).
+*           Before entry, the leading  m by n part of the array  B  must
+*           contain the matrix B.
+*           Unchanged on exit.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   LDB  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*  BETA   - REAL            .
+*           On entry,  BETA  specifies the scalar  beta.  When  BETA  is
+*           supplied as zero then C need not be set on input.
+*           Unchanged on exit.
+*
+*  C      - REAL             array of DIMENSION ( LDC, n ).
+*           Before entry, the leading  m by n  part of the array  C must
+*           contain the matrix  C,  except when  beta  is zero, in which
+*           case C need not be set on entry.
+*           On exit, the array  C  is overwritten by the  m by n updated
+*           matrix.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, K, NROWA
+      REAL               TEMP1, TEMP2
+*     .. Parameters ..
+      REAL               ONE         , ZERO
+      PARAMETER        ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     ..
+*     .. Executable Statements ..
+*
+*     Set NROWA as the number of rows of A.
+*
+      IF( LSAME( SIDE, 'L' ) )THEN
+         NROWA = M
+      ELSE
+         NROWA = N
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF(      ( .NOT.LSAME( SIDE, 'L' ) ).AND.
+     $         ( .NOT.LSAME( SIDE, 'R' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.UPPER              ).AND.
+     $         ( .NOT.LSAME( UPLO, 'L' ) )      )THEN
+         INFO = 2
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 7
+      ELSE IF( LDB.LT.MAX( 1, M     ) )THEN
+         INFO = 9
+      ELSE IF( LDC.LT.MAX( 1, M     ) )THEN
+         INFO = 12
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'SSYMM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( BETA.EQ.ZERO )THEN
+            DO 20, J = 1, N
+               DO 10, I = 1, M
+                  C( I, J ) = ZERO
+   10          CONTINUE
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               DO 30, I = 1, M
+                  C( I, J ) = BETA*C( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( SIDE, 'L' ) )THEN
+*
+*        Form  C := alpha*A*B + beta*C.
+*
+         IF( UPPER )THEN
+            DO 70, J = 1, N
+               DO 60, I = 1, M
+                  TEMP1 = ALPHA*B( I, J )
+                  TEMP2 = ZERO
+                  DO 50, K = 1, I - 1
+                     C( K, J ) = C( K, J ) + TEMP1    *A( K, I )
+                     TEMP2     = TEMP2     + B( K, J )*A( K, I )
+   50             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J ) +
+     $                           TEMP1*A( I, I ) + ALPHA*TEMP2
+                  END IF
+   60          CONTINUE
+   70       CONTINUE
+         ELSE
+            DO 100, J = 1, N
+               DO 90, I = M, 1, -1
+                  TEMP1 = ALPHA*B( I, J )
+                  TEMP2 = ZERO
+                  DO 80, K = I + 1, M
+                     C( K, J ) = C( K, J ) + TEMP1    *A( K, I )
+                     TEMP2     = TEMP2     + B( K, J )*A( K, I )
+   80             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J ) +
+     $                           TEMP1*A( I, I ) + ALPHA*TEMP2
+                  END IF
+   90          CONTINUE
+  100       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*B*A + beta*C.
+*
+         DO 170, J = 1, N
+            TEMP1 = ALPHA*A( J, J )
+            IF( BETA.EQ.ZERO )THEN
+               DO 110, I = 1, M
+                  C( I, J ) = TEMP1*B( I, J )
+  110          CONTINUE
+            ELSE
+               DO 120, I = 1, M
+                  C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J )
+  120          CONTINUE
+            END IF
+            DO 140, K = 1, J - 1
+               IF( UPPER )THEN
+                  TEMP1 = ALPHA*A( K, J )
+               ELSE
+                  TEMP1 = ALPHA*A( J, K )
+               END IF
+               DO 130, I = 1, M
+                  C( I, J ) = C( I, J ) + TEMP1*B( I, K )
+  130          CONTINUE
+  140       CONTINUE
+            DO 160, K = J + 1, N
+               IF( UPPER )THEN
+                  TEMP1 = ALPHA*A( J, K )
+               ELSE
+                  TEMP1 = ALPHA*A( K, J )
+               END IF
+               DO 150, I = 1, M
+                  C( I, J ) = C( I, J ) + TEMP1*B( I, K )
+  150          CONTINUE
+  160       CONTINUE
+  170    CONTINUE
+      END IF
+*
+      RETURN
+*
+*     End of SSYMM .
+*
+      END
+      SUBROUTINE SSYMV ( UPLO, N, ALPHA, A, LDA, X, INCX,
+     $                   BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      REAL               ALPHA, BETA
+      INTEGER            INCX, INCY, LDA, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  SSYMV  performs the matrix-vector  operation
+*
+*     y := alpha*A*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are n element vectors and
+*  A is an n by n symmetric matrix.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the array A is to be referenced as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of A
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of A
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - REAL             array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular part of the symmetric matrix and the strictly
+*           lower triangular part of A is not referenced.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular part of the symmetric matrix and the strictly
+*           upper triangular part of A is not referenced.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*  X      - REAL             array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - REAL            .
+*           On entry, BETA specifies the scalar beta. When BETA is
+*           supplied as zero then Y need not be set on input.
+*           Unchanged on exit.
+*
+*  Y      - REAL             array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y. On exit, Y is overwritten by the updated
+*           vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      REAL               ONE         , ZERO
+      PARAMETER        ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      REAL               TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, KX, KY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 5
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 7
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 10
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'SSYMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     Set up the start points in  X  and  Y.
+*
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( N - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( N - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through the triangular part
+*     of A.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, N
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, N
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, N
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, N
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  y  when A is stored in upper triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               TEMP1 = ALPHA*X( J )
+               TEMP2 = ZERO
+               DO 50, I = 1, J - 1
+                  Y( I ) = Y( I ) + TEMP1*A( I, J )
+                  TEMP2  = TEMP2  + A( I, J )*X( I )
+   50          CONTINUE
+               Y( J ) = Y( J ) + TEMP1*A( J, J ) + ALPHA*TEMP2
+   60       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 80, J = 1, N
+               TEMP1 = ALPHA*X( JX )
+               TEMP2 = ZERO
+               IX    = KX
+               IY    = KY
+               DO 70, I = 1, J - 1
+                  Y( IY ) = Y( IY ) + TEMP1*A( I, J )
+                  TEMP2   = TEMP2   + A( I, J )*X( IX )
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+   70          CONTINUE
+               Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y  when A is stored in lower triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 100, J = 1, N
+               TEMP1  = ALPHA*X( J )
+               TEMP2  = ZERO
+               Y( J ) = Y( J )       + TEMP1*A( J, J )
+               DO 90, I = J + 1, N
+                  Y( I ) = Y( I ) + TEMP1*A( I, J )
+                  TEMP2  = TEMP2  + A( I, J )*X( I )
+   90          CONTINUE
+               Y( J ) = Y( J ) + ALPHA*TEMP2
+  100       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 120, J = 1, N
+               TEMP1   = ALPHA*X( JX )
+               TEMP2   = ZERO
+               Y( JY ) = Y( JY )       + TEMP1*A( J, J )
+               IX      = JX
+               IY      = JY
+               DO 110, I = J + 1, N
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+                  Y( IY ) = Y( IY ) + TEMP1*A( I, J )
+                  TEMP2   = TEMP2   + A( I, J )*X( IX )
+  110          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+  120       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of SSYMV .
+*
+      END
+      SUBROUTINE SSYR2 ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA )
+*     .. Scalar Arguments ..
+      REAL               ALPHA
+      INTEGER            INCX, INCY, LDA, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  SSYR2  performs the symmetric rank 2 operation
+*
+*     A := alpha*x*y' + alpha*y*x' + A,
+*
+*  where alpha is a scalar, x and y are n element vectors and A is an n
+*  by n symmetric matrix.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the array A is to be referenced as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of A
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of A
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - REAL             array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  Y      - REAL             array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y.
+*           Unchanged on exit.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*  A      - REAL             array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular part of the symmetric matrix and the strictly
+*           lower triangular part of A is not referenced. On exit, the
+*           upper triangular part of the array A is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular part of the symmetric matrix and the strictly
+*           upper triangular part of A is not referenced. On exit, the
+*           lower triangular part of the array A is overwritten by the
+*           lower triangular part of the updated matrix.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER        ( ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      REAL               TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, KX, KY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 7
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'SSYR2 ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Set up the start points in X and Y if the increments are not both
+*     unity.
+*
+      IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN
+         IF( INCX.GT.0 )THEN
+            KX = 1
+         ELSE
+            KX = 1 - ( N - 1 )*INCX
+         END IF
+         IF( INCY.GT.0 )THEN
+            KY = 1
+         ELSE
+            KY = 1 - ( N - 1 )*INCY
+         END IF
+         JX = KX
+         JY = KY
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through the triangular part
+*     of A.
+*
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  A  when A is stored in the upper triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 20, J = 1, N
+               IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*Y( J )
+                  TEMP2 = ALPHA*X( J )
+                  DO 10, I = 1, J
+                     A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2
+   10             CONTINUE
+               END IF
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*Y( JY )
+                  TEMP2 = ALPHA*X( JX )
+                  IX    = KX
+                  IY    = KY
+                  DO 30, I = 1, J
+                     A( I, J ) = A( I, J ) + X( IX )*TEMP1
+     $                                     + Y( IY )*TEMP2
+                     IX        = IX        + INCX
+                     IY        = IY        + INCY
+   30             CONTINUE
+               END IF
+               JX = JX + INCX
+               JY = JY + INCY
+   40       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  A  when A is stored in the lower triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*Y( J )
+                  TEMP2 = ALPHA*X( J )
+                  DO 50, I = J, N
+                     A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2
+   50             CONTINUE
+               END IF
+   60       CONTINUE
+         ELSE
+            DO 80, J = 1, N
+               IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*Y( JY )
+                  TEMP2 = ALPHA*X( JX )
+                  IX    = JX
+                  IY    = JY
+                  DO 70, I = J, N
+                     A( I, J ) = A( I, J ) + X( IX )*TEMP1
+     $                                     + Y( IY )*TEMP2
+                     IX        = IX        + INCX
+                     IY        = IY        + INCY
+   70             CONTINUE
+               END IF
+               JX = JX + INCX
+               JY = JY + INCY
+   80       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of SSYR2 .
+*
+      END
+      SUBROUTINE SSYR2K( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        UPLO, TRANS
+      INTEGER            N, K, LDA, LDB, LDC
+      REAL               ALPHA, BETA
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), B( LDB, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  SSYR2K  performs one of the symmetric rank 2k operations
+*
+*     C := alpha*A*B' + alpha*B*A' + beta*C,
+*
+*  or
+*
+*     C := alpha*A'*B + alpha*B'*A + beta*C,
+*
+*  where  alpha and beta  are scalars, C is an  n by n  symmetric matrix
+*  and  A and B  are  n by k  matrices  in the  first  case  and  k by n
+*  matrices in the second case.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of the  array  C  is to be  referenced  as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the  upper triangular part of  C
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the  lower triangular part of  C
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry,  TRANS  specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   C := alpha*A*B' + alpha*B*A' +
+*                                        beta*C.
+*
+*              TRANS = 'T' or 't'   C := alpha*A'*B + alpha*B'*A +
+*                                        beta*C.
+*
+*              TRANS = 'C' or 'c'   C := alpha*A'*B + alpha*B'*A +
+*                                        beta*C.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry,  N specifies the order of the matrix C.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with  TRANS = 'N' or 'n',  K  specifies  the number
+*           of  columns  of the  matrices  A and B,  and on  entry  with
+*           TRANS = 'T' or 't' or 'C' or 'c',  K  specifies  the  number
+*           of rows of the matrices  A and B.  K must be at least  zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - REAL             array of DIMENSION ( LDA, ka ), where ka is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  A  must contain the matrix  A,  otherwise
+*           the leading  k by n  part of the array  A  must contain  the
+*           matrix A.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDA must be at least  max( 1, n ), otherwise  LDA must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  B      - REAL             array of DIMENSION ( LDB, kb ), where kb is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  B  must contain the matrix  B,  otherwise
+*           the leading  k by n  part of the array  B  must contain  the
+*           matrix B.
+*           Unchanged on exit.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDB must be at least  max( 1, n ), otherwise  LDB must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  BETA   - REAL            .
+*           On entry, BETA specifies the scalar beta.
+*           Unchanged on exit.
+*
+*  C      - REAL             array of DIMENSION ( LDC, n ).
+*           Before entry  with  UPLO = 'U' or 'u',  the leading  n by n
+*           upper triangular part of the array C must contain the upper
+*           triangular part  of the  symmetric matrix  and the strictly
+*           lower triangular part of C is not referenced.  On exit, the
+*           upper triangular part of the array  C is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry  with  UPLO = 'L' or 'l',  the leading  n by n
+*           lower triangular part of the array C must contain the lower
+*           triangular part  of the  symmetric matrix  and the strictly
+*           upper triangular part of C is not referenced.  On exit, the
+*           lower triangular part of the array  C is overwritten by the
+*           lower triangular part of the updated matrix.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, L, NROWA
+      REAL               TEMP1, TEMP2
+*     .. Parameters ..
+      REAL               ONE         , ZERO
+      PARAMETER        ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         NROWA = N
+      ELSE
+         NROWA = K
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+      INFO = 0
+      IF(      ( .NOT.UPPER               ).AND.
+     $         ( .NOT.LSAME( UPLO , 'L' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANS, 'T' ) ).AND.
+     $         ( .NOT.LSAME( TRANS, 'C' ) )      )THEN
+         INFO = 2
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( K  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 7
+      ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN
+         INFO = 9
+      ELSE IF( LDC.LT.MAX( 1, N     ) )THEN
+         INFO = 12
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'SSYR2K', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.
+     $    ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( UPPER )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 20, J = 1, N
+                  DO 10, I = 1, J
+                     C( I, J ) = ZERO
+   10             CONTINUE
+   20          CONTINUE
+            ELSE
+               DO 40, J = 1, N
+                  DO 30, I = 1, J
+                     C( I, J ) = BETA*C( I, J )
+   30             CONTINUE
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( BETA.EQ.ZERO )THEN
+               DO 60, J = 1, N
+                  DO 50, I = J, N
+                     C( I, J ) = ZERO
+   50             CONTINUE
+   60          CONTINUE
+            ELSE
+               DO 80, J = 1, N
+                  DO 70, I = J, N
+                     C( I, J ) = BETA*C( I, J )
+   70             CONTINUE
+   80          CONTINUE
+            END IF
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  C := alpha*A*B' + alpha*B*A' + C.
+*
+         IF( UPPER )THEN
+            DO 130, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 90, I = 1, J
+                     C( I, J ) = ZERO
+   90             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 100, I = 1, J
+                     C( I, J ) = BETA*C( I, J )
+  100             CONTINUE
+               END IF
+               DO 120, L = 1, K
+                  IF( ( A( J, L ).NE.ZERO ).OR.
+     $                ( B( J, L ).NE.ZERO )     )THEN
+                     TEMP1 = ALPHA*B( J, L )
+                     TEMP2 = ALPHA*A( J, L )
+                     DO 110, I = 1, J
+                        C( I, J ) = C( I, J ) +
+     $                              A( I, L )*TEMP1 + B( I, L )*TEMP2
+  110                CONTINUE
+                  END IF
+  120          CONTINUE
+  130       CONTINUE
+         ELSE
+            DO 180, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 140, I = J, N
+                     C( I, J ) = ZERO
+  140             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 150, I = J, N
+                     C( I, J ) = BETA*C( I, J )
+  150             CONTINUE
+               END IF
+               DO 170, L = 1, K
+                  IF( ( A( J, L ).NE.ZERO ).OR.
+     $                ( B( J, L ).NE.ZERO )     )THEN
+                     TEMP1 = ALPHA*B( J, L )
+                     TEMP2 = ALPHA*A( J, L )
+                     DO 160, I = J, N
+                        C( I, J ) = C( I, J ) +
+     $                              A( I, L )*TEMP1 + B( I, L )*TEMP2
+  160                CONTINUE
+                  END IF
+  170          CONTINUE
+  180       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*A'*B + alpha*B'*A + C.
+*
+         IF( UPPER )THEN
+            DO 210, J = 1, N
+               DO 200, I = 1, J
+                  TEMP1 = ZERO
+                  TEMP2 = ZERO
+                  DO 190, L = 1, K
+                     TEMP1 = TEMP1 + A( L, I )*B( L, J )
+                     TEMP2 = TEMP2 + B( L, I )*A( L, J )
+  190             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J ) +
+     $                           ALPHA*TEMP1 + ALPHA*TEMP2
+                  END IF
+  200          CONTINUE
+  210       CONTINUE
+         ELSE
+            DO 240, J = 1, N
+               DO 230, I = J, N
+                  TEMP1 = ZERO
+                  TEMP2 = ZERO
+                  DO 220, L = 1, K
+                     TEMP1 = TEMP1 + A( L, I )*B( L, J )
+                     TEMP2 = TEMP2 + B( L, I )*A( L, J )
+  220             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J ) +
+     $                           ALPHA*TEMP1 + ALPHA*TEMP2
+                  END IF
+  230          CONTINUE
+  240       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of SSYR2K.
+*
+      END
+      SUBROUTINE SSYR  ( UPLO, N, ALPHA, X, INCX, A, LDA )
+*     .. Scalar Arguments ..
+      REAL               ALPHA
+      INTEGER            INCX, LDA, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  SSYR   performs the symmetric rank 1 operation
+*
+*     A := alpha*x*x' + A,
+*
+*  where alpha is a real scalar, x is an n element vector and A is an
+*  n by n symmetric matrix.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the array A is to be referenced as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of A
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of A
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - REAL             array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  A      - REAL             array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular part of the symmetric matrix and the strictly
+*           lower triangular part of A is not referenced. On exit, the
+*           upper triangular part of the array A is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular part of the symmetric matrix and the strictly
+*           upper triangular part of A is not referenced. On exit, the
+*           lower triangular part of the array A is overwritten by the
+*           lower triangular part of the updated matrix.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER        ( ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      REAL               TEMP
+      INTEGER            I, INFO, IX, J, JX, KX
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 7
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'SSYR  ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Set the start point in X if the increment is not unity.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through the triangular part
+*     of A.
+*
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  A  when A is stored in upper triangle.
+*
+         IF( INCX.EQ.1 )THEN
+            DO 20, J = 1, N
+               IF( X( J ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( J )
+                  DO 10, I = 1, J
+                     A( I, J ) = A( I, J ) + X( I )*TEMP
+   10             CONTINUE
+               END IF
+   20       CONTINUE
+         ELSE
+            JX = KX
+            DO 40, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  IX   = KX
+                  DO 30, I = 1, J
+                     A( I, J ) = A( I, J ) + X( IX )*TEMP
+                     IX        = IX        + INCX
+   30             CONTINUE
+               END IF
+               JX = JX + INCX
+   40       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  A  when A is stored in lower triangle.
+*
+         IF( INCX.EQ.1 )THEN
+            DO 60, J = 1, N
+               IF( X( J ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( J )
+                  DO 50, I = J, N
+                     A( I, J ) = A( I, J ) + X( I )*TEMP
+   50             CONTINUE
+               END IF
+   60       CONTINUE
+         ELSE
+            JX = KX
+            DO 80, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  IX   = JX
+                  DO 70, I = J, N
+                     A( I, J ) = A( I, J ) + X( IX )*TEMP
+                     IX        = IX        + INCX
+   70             CONTINUE
+               END IF
+               JX = JX + INCX
+   80       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of SSYR  .
+*
+      END
+      SUBROUTINE SSYRK ( UPLO, TRANS, N, K, ALPHA, A, LDA,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        UPLO, TRANS
+      INTEGER            N, K, LDA, LDC
+      REAL               ALPHA, BETA
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  SSYRK  performs one of the symmetric rank k operations
+*
+*     C := alpha*A*A' + beta*C,
+*
+*  or
+*
+*     C := alpha*A'*A + beta*C,
+*
+*  where  alpha and beta  are scalars, C is an  n by n  symmetric matrix
+*  and  A  is an  n by k  matrix in the first case and a  k by n  matrix
+*  in the second case.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of the  array  C  is to be  referenced  as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the  upper triangular part of  C
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the  lower triangular part of  C
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry,  TRANS  specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   C := alpha*A*A' + beta*C.
+*
+*              TRANS = 'T' or 't'   C := alpha*A'*A + beta*C.
+*
+*              TRANS = 'C' or 'c'   C := alpha*A'*A + beta*C.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry,  N specifies the order of the matrix C.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with  TRANS = 'N' or 'n',  K  specifies  the number
+*           of  columns   of  the   matrix   A,   and  on   entry   with
+*           TRANS = 'T' or 't' or 'C' or 'c',  K  specifies  the  number
+*           of rows of the matrix  A.  K must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - REAL             array of DIMENSION ( LDA, ka ), where ka is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  A  must contain the matrix  A,  otherwise
+*           the leading  k by n  part of the array  A  must contain  the
+*           matrix A.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDA must be at least  max( 1, n ), otherwise  LDA must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  BETA   - REAL            .
+*           On entry, BETA specifies the scalar beta.
+*           Unchanged on exit.
+*
+*  C      - REAL             array of DIMENSION ( LDC, n ).
+*           Before entry  with  UPLO = 'U' or 'u',  the leading  n by n
+*           upper triangular part of the array C must contain the upper
+*           triangular part  of the  symmetric matrix  and the strictly
+*           lower triangular part of C is not referenced.  On exit, the
+*           upper triangular part of the array  C is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry  with  UPLO = 'L' or 'l',  the leading  n by n
+*           lower triangular part of the array C must contain the lower
+*           triangular part  of the  symmetric matrix  and the strictly
+*           upper triangular part of C is not referenced.  On exit, the
+*           lower triangular part of the array  C is overwritten by the
+*           lower triangular part of the updated matrix.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, L, NROWA
+      REAL               TEMP
+*     .. Parameters ..
+      REAL               ONE ,         ZERO
+      PARAMETER        ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         NROWA = N
+      ELSE
+         NROWA = K
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+      INFO = 0
+      IF(      ( .NOT.UPPER               ).AND.
+     $         ( .NOT.LSAME( UPLO , 'L' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANS, 'T' ) ).AND.
+     $         ( .NOT.LSAME( TRANS, 'C' ) )      )THEN
+         INFO = 2
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( K  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 7
+      ELSE IF( LDC.LT.MAX( 1, N     ) )THEN
+         INFO = 10
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'SSYRK ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.
+     $    ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( UPPER )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 20, J = 1, N
+                  DO 10, I = 1, J
+                     C( I, J ) = ZERO
+   10             CONTINUE
+   20          CONTINUE
+            ELSE
+               DO 40, J = 1, N
+                  DO 30, I = 1, J
+                     C( I, J ) = BETA*C( I, J )
+   30             CONTINUE
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( BETA.EQ.ZERO )THEN
+               DO 60, J = 1, N
+                  DO 50, I = J, N
+                     C( I, J ) = ZERO
+   50             CONTINUE
+   60          CONTINUE
+            ELSE
+               DO 80, J = 1, N
+                  DO 70, I = J, N
+                     C( I, J ) = BETA*C( I, J )
+   70             CONTINUE
+   80          CONTINUE
+            END IF
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  C := alpha*A*A' + beta*C.
+*
+         IF( UPPER )THEN
+            DO 130, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 90, I = 1, J
+                     C( I, J ) = ZERO
+   90             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 100, I = 1, J
+                     C( I, J ) = BETA*C( I, J )
+  100             CONTINUE
+               END IF
+               DO 120, L = 1, K
+                  IF( A( J, L ).NE.ZERO )THEN
+                     TEMP = ALPHA*A( J, L )
+                     DO 110, I = 1, J
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  110                CONTINUE
+                  END IF
+  120          CONTINUE
+  130       CONTINUE
+         ELSE
+            DO 180, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 140, I = J, N
+                     C( I, J ) = ZERO
+  140             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 150, I = J, N
+                     C( I, J ) = BETA*C( I, J )
+  150             CONTINUE
+               END IF
+               DO 170, L = 1, K
+                  IF( A( J, L ).NE.ZERO )THEN
+                     TEMP      = ALPHA*A( J, L )
+                     DO 160, I = J, N
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  160                CONTINUE
+                  END IF
+  170          CONTINUE
+  180       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*A'*A + beta*C.
+*
+         IF( UPPER )THEN
+            DO 210, J = 1, N
+               DO 200, I = 1, J
+                  TEMP = ZERO
+                  DO 190, L = 1, K
+                     TEMP = TEMP + A( L, I )*A( L, J )
+  190             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  200          CONTINUE
+  210       CONTINUE
+         ELSE
+            DO 240, J = 1, N
+               DO 230, I = J, N
+                  TEMP = ZERO
+                  DO 220, L = 1, K
+                     TEMP = TEMP + A( L, I )*A( L, J )
+  220             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  230          CONTINUE
+  240       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of SSYRK .
+*
+      END
+      SUBROUTINE STBMV ( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, K, LDA, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  STBMV  performs one of the matrix-vector operations
+*
+*     x := A*x,   or   x := A'*x,
+*
+*  where x is an n element vector and  A is an n by n unit, or non-unit,
+*  upper or lower triangular band matrix, with ( k + 1 ) diagonals.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   x := A*x.
+*
+*              TRANS = 'T' or 't'   x := A'*x.
+*
+*              TRANS = 'C' or 'c'   x := A'*x.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with UPLO = 'U' or 'u', K specifies the number of
+*           super-diagonals of the matrix A.
+*           On entry with UPLO = 'L' or 'l', K specifies the number of
+*           sub-diagonals of the matrix A.
+*           K must satisfy  0 .le. K.
+*           Unchanged on exit.
+*
+*  A      - REAL             array of DIMENSION ( LDA, n ).
+*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
+*           by n part of the array A must contain the upper triangular
+*           band part of the matrix of coefficients, supplied column by
+*           column, with the leading diagonal of the matrix in row
+*           ( k + 1 ) of the array, the first super-diagonal starting at
+*           position 2 in row k, and so on. The top left k by k triangle
+*           of the array A is not referenced.
+*           The following program segment will transfer an upper
+*           triangular band matrix from conventional full matrix storage
+*           to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = K + 1 - J
+*                    DO 10, I = MAX( 1, J - K ), J
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
+*           by n part of the array A must contain the lower triangular
+*           band part of the matrix of coefficients, supplied column by
+*           column, with the leading diagonal of the matrix in row 1 of
+*           the array, the first sub-diagonal starting at position 1 in
+*           row 2, and so on. The bottom right k by k triangle of the
+*           array A is not referenced.
+*           The following program segment will transfer a lower
+*           triangular band matrix from conventional full matrix storage
+*           to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = 1 - J
+*                    DO 10, I = J, MIN( N, J + K )
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Note that when DIAG = 'U' or 'u' the elements of the array A
+*           corresponding to the diagonal elements of the matrix are not
+*           referenced, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           ( k + 1 ).
+*           Unchanged on exit.
+*
+*  X      - REAL             array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x. On exit, X is overwritten with the
+*           tranformed vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER        ( ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      REAL               TEMP
+      INTEGER            I, INFO, IX, J, JX, KPLUS1, KX, L
+      LOGICAL            NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( K.LT.0 )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.( K + 1 ) )THEN
+         INFO = 7
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'STBMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOUNIT = LSAME( DIAG, 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX   too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*         Form  x := A*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KPLUS1 = K + 1
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     L    = KPLUS1 - J
+                     DO 10, I = MAX( 1, J - K ), J - 1
+                        X( I ) = X( I ) + TEMP*A( L + I, J )
+   10                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*A( KPLUS1, J )
+                  END IF
+   20          CONTINUE
+            ELSE
+               JX = KX
+               DO 40, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     L    = KPLUS1  - J
+                     DO 30, I = MAX( 1, J - K ), J - 1
+                        X( IX ) = X( IX ) + TEMP*A( L + I, J )
+                        IX      = IX      + INCX
+   30                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*A( KPLUS1, J )
+                  END IF
+                  JX = JX + INCX
+                  IF( J.GT.K )
+     $               KX = KX + INCX
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     L    = 1      - J
+                     DO 50, I = MIN( N, J + K ), J + 1, -1
+                        X( I ) = X( I ) + TEMP*A( L + I, J )
+   50                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*A( 1, J )
+                  END IF
+   60          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 80, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     L    = 1       - J
+                     DO 70, I = MIN( N, J + K ), J + 1, -1
+                        X( IX ) = X( IX ) + TEMP*A( L + I, J )
+                        IX      = IX      - INCX
+   70                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*A( 1, J )
+                  END IF
+                  JX = JX - INCX
+                  IF( ( N - J ).GE.K )
+     $               KX = KX - INCX
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := A'*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KPLUS1 = K + 1
+            IF( INCX.EQ.1 )THEN
+               DO 100, J = N, 1, -1
+                  TEMP = X( J )
+                  L    = KPLUS1 - J
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( KPLUS1, J )
+                  DO 90, I = J - 1, MAX( 1, J - K ), -1
+                     TEMP = TEMP + A( L + I, J )*X( I )
+   90             CONTINUE
+                  X( J ) = TEMP
+  100          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 120, J = N, 1, -1
+                  TEMP = X( JX )
+                  KX   = KX      - INCX
+                  IX   = KX
+                  L    = KPLUS1  - J
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( KPLUS1, J )
+                  DO 110, I = J - 1, MAX( 1, J - K ), -1
+                     TEMP = TEMP + A( L + I, J )*X( IX )
+                     IX   = IX   - INCX
+  110             CONTINUE
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+  120          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 140, J = 1, N
+                  TEMP = X( J )
+                  L    = 1      - J
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( 1, J )
+                  DO 130, I = J + 1, MIN( N, J + K )
+                     TEMP = TEMP + A( L + I, J )*X( I )
+  130             CONTINUE
+                  X( J ) = TEMP
+  140          CONTINUE
+            ELSE
+               JX = KX
+               DO 160, J = 1, N
+                  TEMP = X( JX )
+                  KX   = KX      + INCX
+                  IX   = KX
+                  L    = 1       - J
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( 1, J )
+                  DO 150, I = J + 1, MIN( N, J + K )
+                     TEMP = TEMP + A( L + I, J )*X( IX )
+                     IX   = IX   + INCX
+  150             CONTINUE
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+  160          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of STBMV .
+*
+      END
+      SUBROUTINE STBSV ( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, K, LDA, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  STBSV  solves one of the systems of equations
+*
+*     A*x = b,   or   A'*x = b,
+*
+*  where b and x are n element vectors and A is an n by n unit, or
+*  non-unit, upper or lower triangular band matrix, with ( k + 1 )
+*  diagonals.
+*
+*  No test for singularity or near-singularity is included in this
+*  routine. Such tests must be performed before calling this routine.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the equations to be solved as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   A*x = b.
+*
+*              TRANS = 'T' or 't'   A'*x = b.
+*
+*              TRANS = 'C' or 'c'   A'*x = b.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with UPLO = 'U' or 'u', K specifies the number of
+*           super-diagonals of the matrix A.
+*           On entry with UPLO = 'L' or 'l', K specifies the number of
+*           sub-diagonals of the matrix A.
+*           K must satisfy  0 .le. K.
+*           Unchanged on exit.
+*
+*  A      - REAL             array of DIMENSION ( LDA, n ).
+*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
+*           by n part of the array A must contain the upper triangular
+*           band part of the matrix of coefficients, supplied column by
+*           column, with the leading diagonal of the matrix in row
+*           ( k + 1 ) of the array, the first super-diagonal starting at
+*           position 2 in row k, and so on. The top left k by k triangle
+*           of the array A is not referenced.
+*           The following program segment will transfer an upper
+*           triangular band matrix from conventional full matrix storage
+*           to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = K + 1 - J
+*                    DO 10, I = MAX( 1, J - K ), J
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
+*           by n part of the array A must contain the lower triangular
+*           band part of the matrix of coefficients, supplied column by
+*           column, with the leading diagonal of the matrix in row 1 of
+*           the array, the first sub-diagonal starting at position 1 in
+*           row 2, and so on. The bottom right k by k triangle of the
+*           array A is not referenced.
+*           The following program segment will transfer a lower
+*           triangular band matrix from conventional full matrix storage
+*           to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = 1 - J
+*                    DO 10, I = J, MIN( N, J + K )
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Note that when DIAG = 'U' or 'u' the elements of the array A
+*           corresponding to the diagonal elements of the matrix are not
+*           referenced, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           ( k + 1 ).
+*           Unchanged on exit.
+*
+*  X      - REAL             array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element right-hand side vector b. On exit, X is overwritten
+*           with the solution vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER        ( ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      REAL               TEMP
+      INTEGER            I, INFO, IX, J, JX, KPLUS1, KX, L
+      LOGICAL            NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( K.LT.0 )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.( K + 1 ) )THEN
+         INFO = 7
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'STBSV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOUNIT = LSAME( DIAG, 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed by sequentially with one pass through A.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x := inv( A )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KPLUS1 = K + 1
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     L = KPLUS1 - J
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/A( KPLUS1, J )
+                     TEMP = X( J )
+                     DO 10, I = J - 1, MAX( 1, J - K ), -1
+                        X( I ) = X( I ) - TEMP*A( L + I, J )
+   10                CONTINUE
+                  END IF
+   20          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 40, J = N, 1, -1
+                  KX = KX - INCX
+                  IF( X( JX ).NE.ZERO )THEN
+                     IX = KX
+                     L  = KPLUS1 - J
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/A( KPLUS1, J )
+                     TEMP = X( JX )
+                     DO 30, I = J - 1, MAX( 1, J - K ), -1
+                        X( IX ) = X( IX ) - TEMP*A( L + I, J )
+                        IX      = IX      - INCX
+   30                CONTINUE
+                  END IF
+                  JX = JX - INCX
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     L = 1 - J
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/A( 1, J )
+                     TEMP = X( J )
+                     DO 50, I = J + 1, MIN( N, J + K )
+                        X( I ) = X( I ) - TEMP*A( L + I, J )
+   50                CONTINUE
+                  END IF
+   60          CONTINUE
+            ELSE
+               JX = KX
+               DO 80, J = 1, N
+                  KX = KX + INCX
+                  IF( X( JX ).NE.ZERO )THEN
+                     IX = KX
+                     L  = 1  - J
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/A( 1, J )
+                     TEMP = X( JX )
+                     DO 70, I = J + 1, MIN( N, J + K )
+                        X( IX ) = X( IX ) - TEMP*A( L + I, J )
+                        IX      = IX      + INCX
+   70                CONTINUE
+                  END IF
+                  JX = JX + INCX
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := inv( A')*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KPLUS1 = K + 1
+            IF( INCX.EQ.1 )THEN
+               DO 100, J = 1, N
+                  TEMP = X( J )
+                  L    = KPLUS1 - J
+                  DO 90, I = MAX( 1, J - K ), J - 1
+                     TEMP = TEMP - A( L + I, J )*X( I )
+   90             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/A( KPLUS1, J )
+                  X( J ) = TEMP
+  100          CONTINUE
+            ELSE
+               JX = KX
+               DO 120, J = 1, N
+                  TEMP = X( JX )
+                  IX   = KX
+                  L    = KPLUS1  - J
+                  DO 110, I = MAX( 1, J - K ), J - 1
+                     TEMP = TEMP - A( L + I, J )*X( IX )
+                     IX   = IX   + INCX
+  110             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/A( KPLUS1, J )
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+                  IF( J.GT.K )
+     $               KX = KX + INCX
+  120          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 140, J = N, 1, -1
+                  TEMP = X( J )
+                  L    = 1      - J
+                  DO 130, I = MIN( N, J + K ), J + 1, -1
+                     TEMP = TEMP - A( L + I, J )*X( I )
+  130             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/A( 1, J )
+                  X( J ) = TEMP
+  140          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 160, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = KX
+                  L    = 1       - J
+                  DO 150, I = MIN( N, J + K ), J + 1, -1
+                     TEMP = TEMP - A( L + I, J )*X( IX )
+                     IX   = IX   - INCX
+  150             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/A( 1, J )
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+                  IF( ( N - J ).GE.K )
+     $               KX = KX - INCX
+  160          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of STBSV .
+*
+      END
+      SUBROUTINE STPMV ( UPLO, TRANS, DIAG, N, AP, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      REAL               AP( * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  STPMV  performs one of the matrix-vector operations
+*
+*     x := A*x,   or   x := A'*x,
+*
+*  where x is an n element vector and  A is an n by n unit, or non-unit,
+*  upper or lower triangular matrix, supplied in packed form.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   x := A*x.
+*
+*              TRANS = 'T' or 't'   x := A'*x.
+*
+*              TRANS = 'C' or 'c'   x := A'*x.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  AP     - REAL             array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with  UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular matrix packed sequentially,
+*           column by column, so that AP( 1 ) contains a( 1, 1 ),
+*           AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 )
+*           respectively, and so on.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular matrix packed sequentially,
+*           column by column, so that AP( 1 ) contains a( 1, 1 ),
+*           AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 )
+*           respectively, and so on.
+*           Note that when  DIAG = 'U' or 'u', the diagonal elements of
+*           A are not referenced, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  X      - REAL             array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x. On exit, X is overwritten with the
+*           tranformed vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER        ( ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      REAL               TEMP
+      INTEGER            I, INFO, IX, J, JX, K, KK, KX
+      LOGICAL            NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 7
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'STPMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOUNIT = LSAME( DIAG, 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of AP are
+*     accessed sequentially with one pass through AP.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x:= A*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KK =1
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     K    = KK
+                     DO 10, I = 1, J - 1
+                        X( I ) = X( I ) + TEMP*AP( K )
+                        K      = K      + 1
+   10                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*AP( KK + J - 1 )
+                  END IF
+                  KK = KK + J
+   20          CONTINUE
+            ELSE
+               JX = KX
+               DO 40, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     DO 30, K = KK, KK + J - 2
+                        X( IX ) = X( IX ) + TEMP*AP( K )
+                        IX      = IX      + INCX
+   30                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*AP( KK + J - 1 )
+                  END IF
+                  JX = JX + INCX
+                  KK = KK + J
+   40          CONTINUE
+            END IF
+         ELSE
+            KK = ( N*( N + 1 ) )/2
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     K    = KK
+                     DO 50, I = N, J + 1, -1
+                        X( I ) = X( I ) + TEMP*AP( K )
+                        K      = K      - 1
+   50                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*AP( KK - N + J )
+                  END IF
+                  KK = KK - ( N - J + 1 )
+   60          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 80, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     DO 70, K = KK, KK - ( N - ( J + 1 ) ), -1
+                        X( IX ) = X( IX ) + TEMP*AP( K )
+                        IX      = IX      - INCX
+   70                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*AP( KK - N + J )
+                  END IF
+                  JX = JX - INCX
+                  KK = KK - ( N - J + 1 )
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := A'*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KK = ( N*( N + 1 ) )/2
+            IF( INCX.EQ.1 )THEN
+               DO 100, J = N, 1, -1
+                  TEMP = X( J )
+                  IF( NOUNIT )
+     $               TEMP = TEMP*AP( KK )
+                  K = KK - 1
+                  DO 90, I = J - 1, 1, -1
+                     TEMP = TEMP + AP( K )*X( I )
+                     K    = K    - 1
+   90             CONTINUE
+                  X( J ) = TEMP
+                  KK     = KK   - J
+  100          CONTINUE
+            ELSE
+               JX = KX + ( N - 1 )*INCX
+               DO 120, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = JX
+                  IF( NOUNIT )
+     $               TEMP = TEMP*AP( KK )
+                  DO 110, K = KK - 1, KK - J + 1, -1
+                     IX   = IX   - INCX
+                     TEMP = TEMP + AP( K )*X( IX )
+  110             CONTINUE
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+                  KK      = KK   - J
+  120          CONTINUE
+            END IF
+         ELSE
+            KK = 1
+            IF( INCX.EQ.1 )THEN
+               DO 140, J = 1, N
+                  TEMP = X( J )
+                  IF( NOUNIT )
+     $               TEMP = TEMP*AP( KK )
+                  K = KK + 1
+                  DO 130, I = J + 1, N
+                     TEMP = TEMP + AP( K )*X( I )
+                     K    = K    + 1
+  130             CONTINUE
+                  X( J ) = TEMP
+                  KK     = KK   + ( N - J + 1 )
+  140          CONTINUE
+            ELSE
+               JX = KX
+               DO 160, J = 1, N
+                  TEMP = X( JX )
+                  IX   = JX
+                  IF( NOUNIT )
+     $               TEMP = TEMP*AP( KK )
+                  DO 150, K = KK + 1, KK + N - J
+                     IX   = IX   + INCX
+                     TEMP = TEMP + AP( K )*X( IX )
+  150             CONTINUE
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+                  KK      = KK   + ( N - J + 1 )
+  160          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of STPMV .
+*
+      END
+      SUBROUTINE STPSV ( UPLO, TRANS, DIAG, N, AP, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      REAL               AP( * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  STPSV  solves one of the systems of equations
+*
+*     A*x = b,   or   A'*x = b,
+*
+*  where b and x are n element vectors and A is an n by n unit, or
+*  non-unit, upper or lower triangular matrix, supplied in packed form.
+*
+*  No test for singularity or near-singularity is included in this
+*  routine. Such tests must be performed before calling this routine.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the equations to be solved as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   A*x = b.
+*
+*              TRANS = 'T' or 't'   A'*x = b.
+*
+*              TRANS = 'C' or 'c'   A'*x = b.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  AP     - REAL             array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with  UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular matrix packed sequentially,
+*           column by column, so that AP( 1 ) contains a( 1, 1 ),
+*           AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 )
+*           respectively, and so on.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular matrix packed sequentially,
+*           column by column, so that AP( 1 ) contains a( 1, 1 ),
+*           AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 )
+*           respectively, and so on.
+*           Note that when  DIAG = 'U' or 'u', the diagonal elements of
+*           A are not referenced, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  X      - REAL             array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element right-hand side vector b. On exit, X is overwritten
+*           with the solution vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER        ( ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      REAL               TEMP
+      INTEGER            I, INFO, IX, J, JX, K, KK, KX
+      LOGICAL            NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 7
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'STPSV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOUNIT = LSAME( DIAG, 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of AP are
+*     accessed sequentially with one pass through AP.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x := inv( A )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KK = ( N*( N + 1 ) )/2
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/AP( KK )
+                     TEMP = X( J )
+                     K    = KK     - 1
+                     DO 10, I = J - 1, 1, -1
+                        X( I ) = X( I ) - TEMP*AP( K )
+                        K      = K      - 1
+   10                CONTINUE
+                  END IF
+                  KK = KK - J
+   20          CONTINUE
+            ELSE
+               JX = KX + ( N - 1 )*INCX
+               DO 40, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/AP( KK )
+                     TEMP = X( JX )
+                     IX   = JX
+                     DO 30, K = KK - 1, KK - J + 1, -1
+                        IX      = IX      - INCX
+                        X( IX ) = X( IX ) - TEMP*AP( K )
+   30                CONTINUE
+                  END IF
+                  JX = JX - INCX
+                  KK = KK - J
+   40          CONTINUE
+            END IF
+         ELSE
+            KK = 1
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/AP( KK )
+                     TEMP = X( J )
+                     K    = KK     + 1
+                     DO 50, I = J + 1, N
+                        X( I ) = X( I ) - TEMP*AP( K )
+                        K      = K      + 1
+   50                CONTINUE
+                  END IF
+                  KK = KK + ( N - J + 1 )
+   60          CONTINUE
+            ELSE
+               JX = KX
+               DO 80, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/AP( KK )
+                     TEMP = X( JX )
+                     IX   = JX
+                     DO 70, K = KK + 1, KK + N - J
+                        IX      = IX      + INCX
+                        X( IX ) = X( IX ) - TEMP*AP( K )
+   70                CONTINUE
+                  END IF
+                  JX = JX + INCX
+                  KK = KK + ( N - J + 1 )
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := inv( A' )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KK = 1
+            IF( INCX.EQ.1 )THEN
+               DO 100, J = 1, N
+                  TEMP = X( J )
+                  K    = KK
+                  DO 90, I = 1, J - 1
+                     TEMP = TEMP - AP( K )*X( I )
+                     K    = K    + 1
+   90             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/AP( KK + J - 1 )
+                  X( J ) = TEMP
+                  KK     = KK   + J
+  100          CONTINUE
+            ELSE
+               JX = KX
+               DO 120, J = 1, N
+                  TEMP = X( JX )
+                  IX   = KX
+                  DO 110, K = KK, KK + J - 2
+                     TEMP = TEMP - AP( K )*X( IX )
+                     IX   = IX   + INCX
+  110             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/AP( KK + J - 1 )
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+                  KK      = KK   + J
+  120          CONTINUE
+            END IF
+         ELSE
+            KK = ( N*( N + 1 ) )/2
+            IF( INCX.EQ.1 )THEN
+               DO 140, J = N, 1, -1
+                  TEMP = X( J )
+                  K = KK
+                  DO 130, I = N, J + 1, -1
+                     TEMP = TEMP - AP( K )*X( I )
+                     K    = K    - 1
+  130             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/AP( KK - N + J )
+                  X( J ) = TEMP
+                  KK     = KK   - ( N - J + 1 )
+  140          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 160, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = KX
+                  DO 150, K = KK, KK - ( N - ( J + 1 ) ), -1
+                     TEMP = TEMP - AP( K )*X( IX )
+                     IX   = IX   - INCX
+  150             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/AP( KK - N + J )
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+                  KK      = KK   - (N - J + 1 )
+  160          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of STPSV .
+*
+      END
+      SUBROUTINE STRMM ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA,
+     $                   B, LDB )
+*     .. Scalar Arguments ..
+      CHARACTER*1        SIDE, UPLO, TRANSA, DIAG
+      INTEGER            M, N, LDA, LDB
+      REAL               ALPHA
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), B( LDB, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  STRMM  performs one of the matrix-matrix operations
+*
+*     B := alpha*op( A )*B,   or   B := alpha*B*op( A ),
+*
+*  where  alpha  is a scalar,  B  is an m by n matrix,  A  is a unit, or
+*  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+*
+*     op( A ) = A   or   op( A ) = A'.
+*
+*  Parameters
+*  ==========
+*
+*  SIDE   - CHARACTER*1.
+*           On entry,  SIDE specifies whether  op( A ) multiplies B from
+*           the left or right as follows:
+*
+*              SIDE = 'L' or 'l'   B := alpha*op( A )*B.
+*
+*              SIDE = 'R' or 'r'   B := alpha*B*op( A ).
+*
+*           Unchanged on exit.
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix A is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANSA - CHARACTER*1.
+*           On entry, TRANSA specifies the form of op( A ) to be used in
+*           the matrix multiplication as follows:
+*
+*              TRANSA = 'N' or 'n'   op( A ) = A.
+*
+*              TRANSA = 'T' or 't'   op( A ) = A'.
+*
+*              TRANSA = 'C' or 'c'   op( A ) = A'.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit triangular
+*           as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of B. M must be at
+*           least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of B.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry,  ALPHA specifies the scalar  alpha. When  alpha is
+*           zero then  A is not referenced and  B need not be set before
+*           entry.
+*           Unchanged on exit.
+*
+*  A      - REAL             array of DIMENSION ( LDA, k ), where k is m
+*           when  SIDE = 'L' or 'l'  and is  n  when  SIDE = 'R' or 'r'.
+*           Before entry  with  UPLO = 'U' or 'u',  the  leading  k by k
+*           upper triangular part of the array  A must contain the upper
+*           triangular matrix  and the strictly lower triangular part of
+*           A is not referenced.
+*           Before entry  with  UPLO = 'L' or 'l',  the  leading  k by k
+*           lower triangular part of the array  A must contain the lower
+*           triangular matrix  and the strictly upper triangular part of
+*           A is not referenced.
+*           Note that when  DIAG = 'U' or 'u',  the diagonal elements of
+*           A  are not referenced either,  but are assumed to be  unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program.  When  SIDE = 'L' or 'l'  then
+*           LDA  must be at least  max( 1, m ),  when  SIDE = 'R' or 'r'
+*           then LDA must be at least max( 1, n ).
+*           Unchanged on exit.
+*
+*  B      - REAL             array of DIMENSION ( LDB, n ).
+*           Before entry,  the leading  m by n part of the array  B must
+*           contain the matrix  B,  and  on exit  is overwritten  by the
+*           transformed matrix.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   LDB  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            LSIDE, NOUNIT, UPPER
+      INTEGER            I, INFO, J, K, NROWA
+      REAL               TEMP
+*     .. Parameters ..
+      REAL               ONE         , ZERO
+      PARAMETER        ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      LSIDE  = LSAME( SIDE  , 'L' )
+      IF( LSIDE )THEN
+         NROWA = M
+      ELSE
+         NROWA = N
+      END IF
+      NOUNIT = LSAME( DIAG  , 'N' )
+      UPPER  = LSAME( UPLO  , 'U' )
+*
+      INFO   = 0
+      IF(      ( .NOT.LSIDE                ).AND.
+     $         ( .NOT.LSAME( SIDE  , 'R' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.UPPER                ).AND.
+     $         ( .NOT.LSAME( UPLO  , 'L' ) )      )THEN
+         INFO = 2
+      ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'T' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'C' ) )      )THEN
+         INFO = 3
+      ELSE IF( ( .NOT.LSAME( DIAG  , 'U' ) ).AND.
+     $         ( .NOT.LSAME( DIAG  , 'N' ) )      )THEN
+         INFO = 4
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 5
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 6
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 9
+      ELSE IF( LDB.LT.MAX( 1, M     ) )THEN
+         INFO = 11
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'STRMM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         DO 20, J = 1, N
+            DO 10, I = 1, M
+               B( I, J ) = ZERO
+   10       CONTINUE
+   20    CONTINUE
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSIDE )THEN
+         IF( LSAME( TRANSA, 'N' ) )THEN
+*
+*           Form  B := alpha*A*B.
+*
+            IF( UPPER )THEN
+               DO 50, J = 1, N
+                  DO 40, K = 1, M
+                     IF( B( K, J ).NE.ZERO )THEN
+                        TEMP = ALPHA*B( K, J )
+                        DO 30, I = 1, K - 1
+                           B( I, J ) = B( I, J ) + TEMP*A( I, K )
+   30                   CONTINUE
+                        IF( NOUNIT )
+     $                     TEMP = TEMP*A( K, K )
+                        B( K, J ) = TEMP
+                     END IF
+   40             CONTINUE
+   50          CONTINUE
+            ELSE
+               DO 80, J = 1, N
+                  DO 70 K = M, 1, -1
+                     IF( B( K, J ).NE.ZERO )THEN
+                        TEMP      = ALPHA*B( K, J )
+                        B( K, J ) = TEMP
+                        IF( NOUNIT )
+     $                     B( K, J ) = B( K, J )*A( K, K )
+                        DO 60, I = K + 1, M
+                           B( I, J ) = B( I, J ) + TEMP*A( I, K )
+   60                   CONTINUE
+                     END IF
+   70             CONTINUE
+   80          CONTINUE
+            END IF
+         ELSE
+*
+*           Form  B := alpha*A'*B.
+*
+            IF( UPPER )THEN
+               DO 110, J = 1, N
+                  DO 100, I = M, 1, -1
+                     TEMP = B( I, J )
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( I, I )
+                     DO 90, K = 1, I - 1
+                        TEMP = TEMP + A( K, I )*B( K, J )
+   90                CONTINUE
+                     B( I, J ) = ALPHA*TEMP
+  100             CONTINUE
+  110          CONTINUE
+            ELSE
+               DO 140, J = 1, N
+                  DO 130, I = 1, M
+                     TEMP = B( I, J )
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( I, I )
+                     DO 120, K = I + 1, M
+                        TEMP = TEMP + A( K, I )*B( K, J )
+  120                CONTINUE
+                     B( I, J ) = ALPHA*TEMP
+  130             CONTINUE
+  140          CONTINUE
+            END IF
+         END IF
+      ELSE
+         IF( LSAME( TRANSA, 'N' ) )THEN
+*
+*           Form  B := alpha*B*A.
+*
+            IF( UPPER )THEN
+               DO 180, J = N, 1, -1
+                  TEMP = ALPHA
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( J, J )
+                  DO 150, I = 1, M
+                     B( I, J ) = TEMP*B( I, J )
+  150             CONTINUE
+                  DO 170, K = 1, J - 1
+                     IF( A( K, J ).NE.ZERO )THEN
+                        TEMP = ALPHA*A( K, J )
+                        DO 160, I = 1, M
+                           B( I, J ) = B( I, J ) + TEMP*B( I, K )
+  160                   CONTINUE
+                     END IF
+  170             CONTINUE
+  180          CONTINUE
+            ELSE
+               DO 220, J = 1, N
+                  TEMP = ALPHA
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( J, J )
+                  DO 190, I = 1, M
+                     B( I, J ) = TEMP*B( I, J )
+  190             CONTINUE
+                  DO 210, K = J + 1, N
+                     IF( A( K, J ).NE.ZERO )THEN
+                        TEMP = ALPHA*A( K, J )
+                        DO 200, I = 1, M
+                           B( I, J ) = B( I, J ) + TEMP*B( I, K )
+  200                   CONTINUE
+                     END IF
+  210             CONTINUE
+  220          CONTINUE
+            END IF
+         ELSE
+*
+*           Form  B := alpha*B*A'.
+*
+            IF( UPPER )THEN
+               DO 260, K = 1, N
+                  DO 240, J = 1, K - 1
+                     IF( A( J, K ).NE.ZERO )THEN
+                        TEMP = ALPHA*A( J, K )
+                        DO 230, I = 1, M
+                           B( I, J ) = B( I, J ) + TEMP*B( I, K )
+  230                   CONTINUE
+                     END IF
+  240             CONTINUE
+                  TEMP = ALPHA
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( K, K )
+                  IF( TEMP.NE.ONE )THEN
+                     DO 250, I = 1, M
+                        B( I, K ) = TEMP*B( I, K )
+  250                CONTINUE
+                  END IF
+  260          CONTINUE
+            ELSE
+               DO 300, K = N, 1, -1
+                  DO 280, J = K + 1, N
+                     IF( A( J, K ).NE.ZERO )THEN
+                        TEMP = ALPHA*A( J, K )
+                        DO 270, I = 1, M
+                           B( I, J ) = B( I, J ) + TEMP*B( I, K )
+  270                   CONTINUE
+                     END IF
+  280             CONTINUE
+                  TEMP = ALPHA
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( K, K )
+                  IF( TEMP.NE.ONE )THEN
+                     DO 290, I = 1, M
+                        B( I, K ) = TEMP*B( I, K )
+  290                CONTINUE
+                  END IF
+  300          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of STRMM .
+*
+      END
+      SUBROUTINE STRMV ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, LDA, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  STRMV  performs one of the matrix-vector operations
+*
+*     x := A*x,   or   x := A'*x,
+*
+*  where x is an n element vector and  A is an n by n unit, or non-unit,
+*  upper or lower triangular matrix.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   x := A*x.
+*
+*              TRANS = 'T' or 't'   x := A'*x.
+*
+*              TRANS = 'C' or 'c'   x := A'*x.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  A      - REAL             array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular matrix and the strictly lower triangular part of
+*           A is not referenced.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular matrix and the strictly upper triangular part of
+*           A is not referenced.
+*           Note that when  DIAG = 'U' or 'u', the diagonal elements of
+*           A are not referenced either, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*  X      - REAL             array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x. On exit, X is overwritten with the
+*           tranformed vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER        ( ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      REAL               TEMP
+      INTEGER            I, INFO, IX, J, JX, KX
+      LOGICAL            NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 6
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 8
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'STRMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOUNIT = LSAME( DIAG, 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x := A*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     DO 10, I = 1, J - 1
+                        X( I ) = X( I ) + TEMP*A( I, J )
+   10                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*A( J, J )
+                  END IF
+   20          CONTINUE
+            ELSE
+               JX = KX
+               DO 40, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     DO 30, I = 1, J - 1
+                        X( IX ) = X( IX ) + TEMP*A( I, J )
+                        IX      = IX      + INCX
+   30                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*A( J, J )
+                  END IF
+                  JX = JX + INCX
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     DO 50, I = N, J + 1, -1
+                        X( I ) = X( I ) + TEMP*A( I, J )
+   50                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*A( J, J )
+                  END IF
+   60          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 80, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     DO 70, I = N, J + 1, -1
+                        X( IX ) = X( IX ) + TEMP*A( I, J )
+                        IX      = IX      - INCX
+   70                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*A( J, J )
+                  END IF
+                  JX = JX - INCX
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := A'*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            IF( INCX.EQ.1 )THEN
+               DO 100, J = N, 1, -1
+                  TEMP = X( J )
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( J, J )
+                  DO 90, I = J - 1, 1, -1
+                     TEMP = TEMP + A( I, J )*X( I )
+   90             CONTINUE
+                  X( J ) = TEMP
+  100          CONTINUE
+            ELSE
+               JX = KX + ( N - 1 )*INCX
+               DO 120, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = JX
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( J, J )
+                  DO 110, I = J - 1, 1, -1
+                     IX   = IX   - INCX
+                     TEMP = TEMP + A( I, J )*X( IX )
+  110             CONTINUE
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+  120          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 140, J = 1, N
+                  TEMP = X( J )
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( J, J )
+                  DO 130, I = J + 1, N
+                     TEMP = TEMP + A( I, J )*X( I )
+  130             CONTINUE
+                  X( J ) = TEMP
+  140          CONTINUE
+            ELSE
+               JX = KX
+               DO 160, J = 1, N
+                  TEMP = X( JX )
+                  IX   = JX
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( J, J )
+                  DO 150, I = J + 1, N
+                     IX   = IX   + INCX
+                     TEMP = TEMP + A( I, J )*X( IX )
+  150             CONTINUE
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+  160          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of STRMV .
+*
+      END
+      SUBROUTINE STRSM ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA,
+     $                   B, LDB )
+*     .. Scalar Arguments ..
+      CHARACTER*1        SIDE, UPLO, TRANSA, DIAG
+      INTEGER            M, N, LDA, LDB
+      REAL               ALPHA
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), B( LDB, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  STRSM  solves one of the matrix equations
+*
+*     op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
+*
+*  where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+*  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+*
+*     op( A ) = A   or   op( A ) = A'.
+*
+*  The matrix X is overwritten on B.
+*
+*  Parameters
+*  ==========
+*
+*  SIDE   - CHARACTER*1.
+*           On entry, SIDE specifies whether op( A ) appears on the left
+*           or right of X as follows:
+*
+*              SIDE = 'L' or 'l'   op( A )*X = alpha*B.
+*
+*              SIDE = 'R' or 'r'   X*op( A ) = alpha*B.
+*
+*           Unchanged on exit.
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix A is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANSA - CHARACTER*1.
+*           On entry, TRANSA specifies the form of op( A ) to be used in
+*           the matrix multiplication as follows:
+*
+*              TRANSA = 'N' or 'n'   op( A ) = A.
+*
+*              TRANSA = 'T' or 't'   op( A ) = A'.
+*
+*              TRANSA = 'C' or 'c'   op( A ) = A'.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit triangular
+*           as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of B. M must be at
+*           least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of B.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - REAL            .
+*           On entry,  ALPHA specifies the scalar  alpha. When  alpha is
+*           zero then  A is not referenced and  B need not be set before
+*           entry.
+*           Unchanged on exit.
+*
+*  A      - REAL             array of DIMENSION ( LDA, k ), where k is m
+*           when  SIDE = 'L' or 'l'  and is  n  when  SIDE = 'R' or 'r'.
+*           Before entry  with  UPLO = 'U' or 'u',  the  leading  k by k
+*           upper triangular part of the array  A must contain the upper
+*           triangular matrix  and the strictly lower triangular part of
+*           A is not referenced.
+*           Before entry  with  UPLO = 'L' or 'l',  the  leading  k by k
+*           lower triangular part of the array  A must contain the lower
+*           triangular matrix  and the strictly upper triangular part of
+*           A is not referenced.
+*           Note that when  DIAG = 'U' or 'u',  the diagonal elements of
+*           A  are not referenced either,  but are assumed to be  unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program.  When  SIDE = 'L' or 'l'  then
+*           LDA  must be at least  max( 1, m ),  when  SIDE = 'R' or 'r'
+*           then LDA must be at least max( 1, n ).
+*           Unchanged on exit.
+*
+*  B      - REAL             array of DIMENSION ( LDB, n ).
+*           Before entry,  the leading  m by n part of the array  B must
+*           contain  the  right-hand  side  matrix  B,  and  on exit  is
+*           overwritten by the solution matrix  X.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   LDB  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            LSIDE, NOUNIT, UPPER
+      INTEGER            I, INFO, J, K, NROWA
+      REAL               TEMP
+*     .. Parameters ..
+      REAL               ONE         , ZERO
+      PARAMETER        ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      LSIDE  = LSAME( SIDE  , 'L' )
+      IF( LSIDE )THEN
+         NROWA = M
+      ELSE
+         NROWA = N
+      END IF
+      NOUNIT = LSAME( DIAG  , 'N' )
+      UPPER  = LSAME( UPLO  , 'U' )
+*
+      INFO   = 0
+      IF(      ( .NOT.LSIDE                ).AND.
+     $         ( .NOT.LSAME( SIDE  , 'R' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.UPPER                ).AND.
+     $         ( .NOT.LSAME( UPLO  , 'L' ) )      )THEN
+         INFO = 2
+      ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'T' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'C' ) )      )THEN
+         INFO = 3
+      ELSE IF( ( .NOT.LSAME( DIAG  , 'U' ) ).AND.
+     $         ( .NOT.LSAME( DIAG  , 'N' ) )      )THEN
+         INFO = 4
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 5
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 6
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 9
+      ELSE IF( LDB.LT.MAX( 1, M     ) )THEN
+         INFO = 11
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'STRSM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         DO 20, J = 1, N
+            DO 10, I = 1, M
+               B( I, J ) = ZERO
+   10       CONTINUE
+   20    CONTINUE
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSIDE )THEN
+         IF( LSAME( TRANSA, 'N' ) )THEN
+*
+*           Form  B := alpha*inv( A )*B.
+*
+            IF( UPPER )THEN
+               DO 60, J = 1, N
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 30, I = 1, M
+                        B( I, J ) = ALPHA*B( I, J )
+   30                CONTINUE
+                  END IF
+                  DO 50, K = M, 1, -1
+                     IF( B( K, J ).NE.ZERO )THEN
+                        IF( NOUNIT )
+     $                     B( K, J ) = B( K, J )/A( K, K )
+                        DO 40, I = 1, K - 1
+                           B( I, J ) = B( I, J ) - B( K, J )*A( I, K )
+   40                   CONTINUE
+                     END IF
+   50             CONTINUE
+   60          CONTINUE
+            ELSE
+               DO 100, J = 1, N
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 70, I = 1, M
+                        B( I, J ) = ALPHA*B( I, J )
+   70                CONTINUE
+                  END IF
+                  DO 90 K = 1, M
+                     IF( B( K, J ).NE.ZERO )THEN
+                        IF( NOUNIT )
+     $                     B( K, J ) = B( K, J )/A( K, K )
+                        DO 80, I = K + 1, M
+                           B( I, J ) = B( I, J ) - B( K, J )*A( I, K )
+   80                   CONTINUE
+                     END IF
+   90             CONTINUE
+  100          CONTINUE
+            END IF
+         ELSE
+*
+*           Form  B := alpha*inv( A' )*B.
+*
+            IF( UPPER )THEN
+               DO 130, J = 1, N
+                  DO 120, I = 1, M
+                     TEMP = ALPHA*B( I, J )
+                     DO 110, K = 1, I - 1
+                        TEMP = TEMP - A( K, I )*B( K, J )
+  110                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( I, I )
+                     B( I, J ) = TEMP
+  120             CONTINUE
+  130          CONTINUE
+            ELSE
+               DO 160, J = 1, N
+                  DO 150, I = M, 1, -1
+                     TEMP = ALPHA*B( I, J )
+                     DO 140, K = I + 1, M
+                        TEMP = TEMP - A( K, I )*B( K, J )
+  140                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( I, I )
+                     B( I, J ) = TEMP
+  150             CONTINUE
+  160          CONTINUE
+            END IF
+         END IF
+      ELSE
+         IF( LSAME( TRANSA, 'N' ) )THEN
+*
+*           Form  B := alpha*B*inv( A ).
+*
+            IF( UPPER )THEN
+               DO 210, J = 1, N
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 170, I = 1, M
+                        B( I, J ) = ALPHA*B( I, J )
+  170                CONTINUE
+                  END IF
+                  DO 190, K = 1, J - 1
+                     IF( A( K, J ).NE.ZERO )THEN
+                        DO 180, I = 1, M
+                           B( I, J ) = B( I, J ) - A( K, J )*B( I, K )
+  180                   CONTINUE
+                     END IF
+  190             CONTINUE
+                  IF( NOUNIT )THEN
+                     TEMP = ONE/A( J, J )
+                     DO 200, I = 1, M
+                        B( I, J ) = TEMP*B( I, J )
+  200                CONTINUE
+                  END IF
+  210          CONTINUE
+            ELSE
+               DO 260, J = N, 1, -1
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 220, I = 1, M
+                        B( I, J ) = ALPHA*B( I, J )
+  220                CONTINUE
+                  END IF
+                  DO 240, K = J + 1, N
+                     IF( A( K, J ).NE.ZERO )THEN
+                        DO 230, I = 1, M
+                           B( I, J ) = B( I, J ) - A( K, J )*B( I, K )
+  230                   CONTINUE
+                     END IF
+  240             CONTINUE
+                  IF( NOUNIT )THEN
+                     TEMP = ONE/A( J, J )
+                     DO 250, I = 1, M
+                       B( I, J ) = TEMP*B( I, J )
+  250                CONTINUE
+                  END IF
+  260          CONTINUE
+            END IF
+         ELSE
+*
+*           Form  B := alpha*B*inv( A' ).
+*
+            IF( UPPER )THEN
+               DO 310, K = N, 1, -1
+                  IF( NOUNIT )THEN
+                     TEMP = ONE/A( K, K )
+                     DO 270, I = 1, M
+                        B( I, K ) = TEMP*B( I, K )
+  270                CONTINUE
+                  END IF
+                  DO 290, J = 1, K - 1
+                     IF( A( J, K ).NE.ZERO )THEN
+                        TEMP = A( J, K )
+                        DO 280, I = 1, M
+                           B( I, J ) = B( I, J ) - TEMP*B( I, K )
+  280                   CONTINUE
+                     END IF
+  290             CONTINUE
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 300, I = 1, M
+                        B( I, K ) = ALPHA*B( I, K )
+  300                CONTINUE
+                  END IF
+  310          CONTINUE
+            ELSE
+               DO 360, K = 1, N
+                  IF( NOUNIT )THEN
+                     TEMP = ONE/A( K, K )
+                     DO 320, I = 1, M
+                        B( I, K ) = TEMP*B( I, K )
+  320                CONTINUE
+                  END IF
+                  DO 340, J = K + 1, N
+                     IF( A( J, K ).NE.ZERO )THEN
+                        TEMP = A( J, K )
+                        DO 330, I = 1, M
+                           B( I, J ) = B( I, J ) - TEMP*B( I, K )
+  330                   CONTINUE
+                     END IF
+  340             CONTINUE
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 350, I = 1, M
+                        B( I, K ) = ALPHA*B( I, K )
+  350                CONTINUE
+                  END IF
+  360          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of STRSM .
+*
+      END
+      SUBROUTINE STRSV ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, LDA, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  STRSV  solves one of the systems of equations
+*
+*     A*x = b,   or   A'*x = b,
+*
+*  where b and x are n element vectors and A is an n by n unit, or
+*  non-unit, upper or lower triangular matrix.
+*
+*  No test for singularity or near-singularity is included in this
+*  routine. Such tests must be performed before calling this routine.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the equations to be solved as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   A*x = b.
+*
+*              TRANS = 'T' or 't'   A'*x = b.
+*
+*              TRANS = 'C' or 'c'   A'*x = b.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  A      - REAL             array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular matrix and the strictly lower triangular part of
+*           A is not referenced.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular matrix and the strictly upper triangular part of
+*           A is not referenced.
+*           Note that when  DIAG = 'U' or 'u', the diagonal elements of
+*           A are not referenced either, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*  X      - REAL             array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element right-hand side vector b. On exit, X is overwritten
+*           with the solution vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER        ( ZERO = 0.0E+0 )
+*     .. Local Scalars ..
+      REAL               TEMP
+      INTEGER            I, INFO, IX, J, JX, KX
+      LOGICAL            NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 6
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 8
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'STRSV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOUNIT = LSAME( DIAG, 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x := inv( A )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/A( J, J )
+                     TEMP = X( J )
+                     DO 10, I = J - 1, 1, -1
+                        X( I ) = X( I ) - TEMP*A( I, J )
+   10                CONTINUE
+                  END IF
+   20          CONTINUE
+            ELSE
+               JX = KX + ( N - 1 )*INCX
+               DO 40, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/A( J, J )
+                     TEMP = X( JX )
+                     IX   = JX
+                     DO 30, I = J - 1, 1, -1
+                        IX      = IX      - INCX
+                        X( IX ) = X( IX ) - TEMP*A( I, J )
+   30                CONTINUE
+                  END IF
+                  JX = JX - INCX
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/A( J, J )
+                     TEMP = X( J )
+                     DO 50, I = J + 1, N
+                        X( I ) = X( I ) - TEMP*A( I, J )
+   50                CONTINUE
+                  END IF
+   60          CONTINUE
+            ELSE
+               JX = KX
+               DO 80, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/A( J, J )
+                     TEMP = X( JX )
+                     IX   = JX
+                     DO 70, I = J + 1, N
+                        IX      = IX      + INCX
+                        X( IX ) = X( IX ) - TEMP*A( I, J )
+   70                CONTINUE
+                  END IF
+                  JX = JX + INCX
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := inv( A' )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            IF( INCX.EQ.1 )THEN
+               DO 100, J = 1, N
+                  TEMP = X( J )
+                  DO 90, I = 1, J - 1
+                     TEMP = TEMP - A( I, J )*X( I )
+   90             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/A( J, J )
+                  X( J ) = TEMP
+  100          CONTINUE
+            ELSE
+               JX = KX
+               DO 120, J = 1, N
+                  TEMP = X( JX )
+                  IX   = KX
+                  DO 110, I = 1, J - 1
+                     TEMP = TEMP - A( I, J )*X( IX )
+                     IX   = IX   + INCX
+  110             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/A( J, J )
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+  120          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 140, J = N, 1, -1
+                  TEMP = X( J )
+                  DO 130, I = N, J + 1, -1
+                     TEMP = TEMP - A( I, J )*X( I )
+  130             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/A( J, J )
+                  X( J ) = TEMP
+  140          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 160, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = KX
+                  DO 150, I = N, J + 1, -1
+                     TEMP = TEMP - A( I, J )*X( IX )
+                     IX   = IX   - INCX
+  150             CONTINUE
+                  IF( NOUNIT )
+     $               TEMP = TEMP/A( J, J )
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+  160          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of STRSV .
+*
+      END
+      SUBROUTINE XERBLA( SRNAME, INFO )
+*
+*  -- LAPACK auxiliary routine (preliminary version) --
+*     Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
+*     Courant Institute, Argonne National Lab, and Rice University
+*     February 29, 1992
+*
+*     .. Scalar Arguments ..
+      CHARACTER*6        SRNAME
+      INTEGER            INFO
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  XERBLA  is an error handler for the LAPACK routines.
+*  It is called by an LAPACK routine if an input parameter has an
+*  invalid value.  A message is printed and execution stops.
+*
+*  Installers may consider modifying the STOP statement in order to
+*  call system-specific exception-handling facilities.
+*
+*  Arguments
+*  =========
+*
+*  SRNAME  (input) CHARACTER*6
+*          The name of the routine which called XERBLA.
+*
+*  INFO    (input) INTEGER
+*          The position of the invalid parameter in the parameter list
+*          of the calling routine.
+*
+*
+      WRITE( *, FMT = 9999 )SRNAME, INFO
+*
+      STOP
+*
+ 9999 FORMAT( ' ** On entry to ', A6, ' parameter number ', I2, ' had ',
+     $      'an illegal value' )
+*
+*     End of XERBLA
+*
+      END
+      subroutine zaxpy(n,za,zx,incx,zy,incy)
+c
+c     constant times a vector plus a vector.
+c     jack dongarra, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      double complex zx(*),zy(*),za
+      integer i,incx,incy,ix,iy,n
+      double precision dcabs1
+      if(n.le.0)return
+      if (dcabs1(za) .eq. 0.0d0) return
+      if (incx.eq.1.and.incy.eq.1)go to 20
+c
+c        code for unequal increments or equal increments
+c          not equal to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        zy(iy) = zy(iy) + za*zx(ix)
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      return
+c
+c        code for both increments equal to 1
+c
+   20 do 30 i = 1,n
+        zy(i) = zy(i) + za*zx(i)
+   30 continue
+      return
+      end
+      subroutine  zcopy(n,zx,incx,zy,incy)
+c
+c     copies a vector, x, to a vector, y.
+c     jack dongarra, linpack, 4/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      double complex zx(*),zy(*)
+      integer i,incx,incy,ix,iy,n
+c
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c        code for unequal increments or equal increments
+c          not equal to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        zy(iy) = zx(ix)
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      return
+c
+c        code for both increments equal to 1
+c
+   20 do 30 i = 1,n
+        zy(i) = zx(i)
+   30 continue
+      return
+      end
+      double complex function zdotc(n,zx,incx,zy,incy)
+c
+c     forms the dot product of a vector.
+c     jack dongarra, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      double complex zx(*),zy(*),ztemp
+      integer i,incx,incy,ix,iy,n
+      ztemp = (0.0d0,0.0d0)
+      zdotc = (0.0d0,0.0d0)
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c        code for unequal increments or equal increments
+c          not equal to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        ztemp = ztemp + dconjg(zx(ix))*zy(iy)
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      zdotc = ztemp
+      return
+c
+c        code for both increments equal to 1
+c
+   20 do 30 i = 1,n
+        ztemp = ztemp + dconjg(zx(i))*zy(i)
+   30 continue
+      zdotc = ztemp
+      return
+      end
+      double complex function zdotu(n,zx,incx,zy,incy)
+c
+c     forms the dot product of two vectors.
+c     jack dongarra, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      double complex zx(*),zy(*),ztemp
+      integer i,incx,incy,ix,iy,n
+      ztemp = (0.0d0,0.0d0)
+      zdotu = (0.0d0,0.0d0)
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c        code for unequal increments or equal increments
+c          not equal to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        ztemp = ztemp + zx(ix)*zy(iy)
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      zdotu = ztemp
+      return
+c
+c        code for both increments equal to 1
+c
+   20 do 30 i = 1,n
+        ztemp = ztemp + zx(i)*zy(i)
+   30 continue
+      zdotu = ztemp
+      return
+      end
+      subroutine  zdrot (n,zx,incx,zy,incy,c,s)
+c
+c     applies a plane rotation, where the cos and sin (c and s) are
+c     double precision and the vectors zx and zy are double complex.
+c     jack dongarra, linpack, 3/11/78.
+c
+      double complex zx(1),zy(1),ztemp
+      double precision c,s
+      integer i,incx,incy,ix,iy,n
+c
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c       code for unequal increments or equal increments not equal
+c         to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        ztemp = c*zx(ix) + s*zy(iy)
+        zy(iy) = c*zy(iy) - s*zx(ix)
+        zx(ix) = ztemp
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      return
+c
+c       code for both increments equal to 1
+c
+   20 do 30 i = 1,n
+        ztemp = c*zx(i) + s*zy(i)
+        zy(i) = c*zy(i) - s*zx(i)
+        zx(i) = ztemp
+   30 continue
+      return
+      end
+      subroutine  zdscal(n,da,zx,incx)
+c
+c     scales a vector by a constant.
+c     jack dongarra, 3/11/78.
+c     modified 3/93 to return if incx .le. 0.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      double complex zx(*)
+      double precision da
+      integer i,incx,ix,n
+c
+      if( n.le.0 .or. incx.le.0 )return
+      if(incx.eq.1)go to 20
+c
+c        code for increment not equal to 1
+c
+      ix = 1
+      do 10 i = 1,n
+        zx(ix) = dcmplx(da,0.0d0)*zx(ix)
+        ix = ix + incx
+   10 continue
+      return
+c
+c        code for increment equal to 1
+c
+   20 do 30 i = 1,n
+        zx(i) = dcmplx(da,0.0d0)*zx(i)
+   30 continue
+      return
+      end
+      SUBROUTINE ZGBMV ( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX,
+     $                   BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      COMPLEX*16         ALPHA, BETA
+      INTEGER            INCX, INCY, KL, KU, LDA, M, N
+      CHARACTER*1        TRANS
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZGBMV  performs one of the matrix-vector operations
+*
+*     y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   or
+*
+*     y := alpha*conjg( A' )*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are vectors and A is an
+*  m by n band matrix, with kl sub-diagonals and ku super-diagonals.
+*
+*  Parameters
+*  ==========
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.
+*
+*              TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.
+*
+*              TRANS = 'C' or 'c'   y := alpha*conjg( A' )*x + beta*y.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of the matrix A.
+*           M must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  KL     - INTEGER.
+*           On entry, KL specifies the number of sub-diagonals of the
+*           matrix A. KL must satisfy  0 .le. KL.
+*           Unchanged on exit.
+*
+*  KU     - INTEGER.
+*           On entry, KU specifies the number of super-diagonals of the
+*           matrix A. KU must satisfy  0 .le. KU.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX*16      .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ).
+*           Before entry, the leading ( kl + ku + 1 ) by n part of the
+*           array A must contain the matrix of coefficients, supplied
+*           column by column, with the leading diagonal of the matrix in
+*           row ( ku + 1 ) of the array, the first super-diagonal
+*           starting at position 2 in row ku, the first sub-diagonal
+*           starting at position 1 in row ( ku + 2 ), and so on.
+*           Elements in the array A that do not correspond to elements
+*           in the band matrix (such as the top left ku by ku triangle)
+*           are not referenced.
+*           The following program segment will transfer a band matrix
+*           from conventional full matrix storage to band storage:
+*
+*                 DO 20, J = 1, N
+*                    K = KU + 1 - J
+*                    DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL )
+*                       A( K + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           ( kl + ku + 1 ).
+*           Unchanged on exit.
+*
+*  X      - COMPLEX*16       array of DIMENSION at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
+*           and at least
+*           ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
+*           Before entry, the incremented array X must contain the
+*           vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX*16      .
+*           On entry, BETA specifies the scalar beta. When BETA is
+*           supplied as zero then Y need not be set on input.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX*16       array of DIMENSION at least
+*           ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
+*           and at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
+*           Before entry, the incremented array Y must contain the
+*           vector y. On exit, Y is overwritten by the updated vector y.
+*
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX*16         ONE
+      PARAMETER        ( ONE  = ( 1.0D+0, 0.0D+0 ) )
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     .. Local Scalars ..
+      COMPLEX*16         TEMP
+      INTEGER            I, INFO, IX, IY, J, JX, JY, K, KUP1, KX, KY,
+     $                   LENX, LENY
+      LOGICAL            NOCONJ
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 1
+      ELSE IF( M.LT.0 )THEN
+         INFO = 2
+      ELSE IF( N.LT.0 )THEN
+         INFO = 3
+      ELSE IF( KL.LT.0 )THEN
+         INFO = 4
+      ELSE IF( KU.LT.0 )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.( KL + KU + 1 ) )THEN
+         INFO = 8
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 10
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 13
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZGBMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+      NOCONJ = LSAME( TRANS, 'T' )
+*
+*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set
+*     up the start points in  X  and  Y.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         LENX = N
+         LENY = M
+      ELSE
+         LENX = M
+         LENY = N
+      END IF
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( LENX - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( LENY - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through the band part of A.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, LENY
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, LENY
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, LENY
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, LENY
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      KUP1 = KU + 1
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  y := alpha*A*x + y.
+*
+         JX = KX
+         IF( INCY.EQ.1 )THEN
+            DO 60, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  K    = KUP1 - J
+                  DO 50, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                     Y( I ) = Y( I ) + TEMP*A( K + I, J )
+   50             CONTINUE
+               END IF
+               JX = JX + INCX
+   60       CONTINUE
+         ELSE
+            DO 80, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  IY   = KY
+                  K    = KUP1 - J
+                  DO 70, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                     Y( IY ) = Y( IY ) + TEMP*A( K + I, J )
+                     IY      = IY      + INCY
+   70             CONTINUE
+               END IF
+               JX = JX + INCX
+               IF( J.GT.KU )
+     $            KY = KY + INCY
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y := alpha*A'*x + y  or  y := alpha*conjg( A' )*x + y.
+*
+         JY = KY
+         IF( INCX.EQ.1 )THEN
+            DO 110, J = 1, N
+               TEMP = ZERO
+               K    = KUP1 - J
+               IF( NOCONJ )THEN
+                  DO 90, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                     TEMP = TEMP + A( K + I, J )*X( I )
+   90             CONTINUE
+               ELSE
+                  DO 100, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                     TEMP = TEMP + DCONJG( A( K + I, J ) )*X( I )
+  100             CONTINUE
+               END IF
+               Y( JY ) = Y( JY ) + ALPHA*TEMP
+               JY      = JY      + INCY
+  110       CONTINUE
+         ELSE
+            DO 140, J = 1, N
+               TEMP = ZERO
+               IX   = KX
+               K    = KUP1 - J
+               IF( NOCONJ )THEN
+                  DO 120, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                     TEMP = TEMP + A( K + I, J )*X( IX )
+                     IX   = IX   + INCX
+  120             CONTINUE
+               ELSE
+                  DO 130, I = MAX( 1, J - KU ), MIN( M, J + KL )
+                     TEMP = TEMP + DCONJG( A( K + I, J ) )*X( IX )
+                     IX   = IX   + INCX
+  130             CONTINUE
+               END IF
+               Y( JY ) = Y( JY ) + ALPHA*TEMP
+               JY      = JY      + INCY
+               IF( J.GT.KU )
+     $            KX = KX + INCX
+  140       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZGBMV .
+*
+      END
+      SUBROUTINE ZGEMM ( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        TRANSA, TRANSB
+      INTEGER            M, N, K, LDA, LDB, LDC
+      COMPLEX*16         ALPHA, BETA
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), B( LDB, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZGEMM  performs one of the matrix-matrix operations
+*
+*     C := alpha*op( A )*op( B ) + beta*C,
+*
+*  where  op( X ) is one of
+*
+*     op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),
+*
+*  alpha and beta are scalars, and A, B and C are matrices, with op( A )
+*  an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+*
+*  Parameters
+*  ==========
+*
+*  TRANSA - CHARACTER*1.
+*           On entry, TRANSA specifies the form of op( A ) to be used in
+*           the matrix multiplication as follows:
+*
+*              TRANSA = 'N' or 'n',  op( A ) = A.
+*
+*              TRANSA = 'T' or 't',  op( A ) = A'.
+*
+*              TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).
+*
+*           Unchanged on exit.
+*
+*  TRANSB - CHARACTER*1.
+*           On entry, TRANSB specifies the form of op( B ) to be used in
+*           the matrix multiplication as follows:
+*
+*              TRANSB = 'N' or 'n',  op( B ) = B.
+*
+*              TRANSB = 'T' or 't',  op( B ) = B'.
+*
+*              TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry,  M  specifies  the number  of rows  of the  matrix
+*           op( A )  and of the  matrix  C.  M  must  be at least  zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry,  N  specifies the number  of columns of the matrix
+*           op( B ) and the number of columns of the matrix C. N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry,  K  specifies  the number of columns of the matrix
+*           op( A ) and the number of rows of the matrix op( B ). K must
+*           be at least  zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX*16      .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, ka ), where ka is
+*           k  when  TRANSA = 'N' or 'n',  and is  m  otherwise.
+*           Before entry with  TRANSA = 'N' or 'n',  the leading  m by k
+*           part of the array  A  must contain the matrix  A,  otherwise
+*           the leading  k by m  part of the array  A  must contain  the
+*           matrix A.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. When  TRANSA = 'N' or 'n' then
+*           LDA must be at least  max( 1, m ), otherwise  LDA must be at
+*           least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  B      - COMPLEX*16       array of DIMENSION ( LDB, kb ), where kb is
+*           n  when  TRANSB = 'N' or 'n',  and is  k  otherwise.
+*           Before entry with  TRANSB = 'N' or 'n',  the leading  k by n
+*           part of the array  B  must contain the matrix  B,  otherwise
+*           the leading  n by k  part of the array  B  must contain  the
+*           matrix B.
+*           Unchanged on exit.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in the calling (sub) program. When  TRANSB = 'N' or 'n' then
+*           LDB must be at least  max( 1, k ), otherwise  LDB must be at
+*           least  max( 1, n ).
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX*16      .
+*           On entry,  BETA  specifies the scalar  beta.  When  BETA  is
+*           supplied as zero then C need not be set on input.
+*           Unchanged on exit.
+*
+*  C      - COMPLEX*16       array of DIMENSION ( LDC, n ).
+*           Before entry, the leading  m by n  part of the array  C must
+*           contain the matrix  C,  except when  beta  is zero, in which
+*           case C need not be set on entry.
+*           On exit, the array  C  is overwritten by the  m by n  matrix
+*           ( alpha*op( A )*op( B ) + beta*C ).
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, MAX
+*     .. Local Scalars ..
+      LOGICAL            CONJA, CONJB, NOTA, NOTB
+      INTEGER            I, INFO, J, L, NCOLA, NROWA, NROWB
+      COMPLEX*16         TEMP
+*     .. Parameters ..
+      COMPLEX*16         ONE
+      PARAMETER        ( ONE  = ( 1.0D+0, 0.0D+0 ) )
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
+*     conjugated or transposed, set  CONJA and CONJB  as true if  A  and
+*     B  respectively are to be  transposed but  not conjugated  and set
+*     NROWA, NCOLA and  NROWB  as the number of rows and  columns  of  A
+*     and the number of rows of  B  respectively.
+*
+      NOTA  = LSAME( TRANSA, 'N' )
+      NOTB  = LSAME( TRANSB, 'N' )
+      CONJA = LSAME( TRANSA, 'C' )
+      CONJB = LSAME( TRANSB, 'C' )
+      IF( NOTA )THEN
+         NROWA = M
+         NCOLA = K
+      ELSE
+         NROWA = K
+         NCOLA = M
+      END IF
+      IF( NOTB )THEN
+         NROWB = K
+      ELSE
+         NROWB = N
+      END IF
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF(      ( .NOT.NOTA                 ).AND.
+     $         ( .NOT.CONJA                ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'T' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.NOTB                 ).AND.
+     $         ( .NOT.CONJB                ).AND.
+     $         ( .NOT.LSAME( TRANSB, 'T' ) )      )THEN
+         INFO = 2
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( K  .LT.0               )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 8
+      ELSE IF( LDB.LT.MAX( 1, NROWB ) )THEN
+         INFO = 10
+      ELSE IF( LDC.LT.MAX( 1, M     ) )THEN
+         INFO = 13
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZGEMM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( BETA.EQ.ZERO )THEN
+            DO 20, J = 1, N
+               DO 10, I = 1, M
+                  C( I, J ) = ZERO
+   10          CONTINUE
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               DO 30, I = 1, M
+                  C( I, J ) = BETA*C( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( NOTB )THEN
+         IF( NOTA )THEN
+*
+*           Form  C := alpha*A*B + beta*C.
+*
+            DO 90, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 50, I = 1, M
+                     C( I, J ) = ZERO
+   50             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 60, I = 1, M
+                     C( I, J ) = BETA*C( I, J )
+   60             CONTINUE
+               END IF
+               DO 80, L = 1, K
+                  IF( B( L, J ).NE.ZERO )THEN
+                     TEMP = ALPHA*B( L, J )
+                     DO 70, I = 1, M
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+   70                CONTINUE
+                  END IF
+   80          CONTINUE
+   90       CONTINUE
+         ELSE IF( CONJA )THEN
+*
+*           Form  C := alpha*conjg( A' )*B + beta*C.
+*
+            DO 120, J = 1, N
+               DO 110, I = 1, M
+                  TEMP = ZERO
+                  DO 100, L = 1, K
+                     TEMP = TEMP + DCONJG( A( L, I ) )*B( L, J )
+  100             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  110          CONTINUE
+  120       CONTINUE
+         ELSE
+*
+*           Form  C := alpha*A'*B + beta*C
+*
+            DO 150, J = 1, N
+               DO 140, I = 1, M
+                  TEMP = ZERO
+                  DO 130, L = 1, K
+                     TEMP = TEMP + A( L, I )*B( L, J )
+  130             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  140          CONTINUE
+  150       CONTINUE
+         END IF
+      ELSE IF( NOTA )THEN
+         IF( CONJB )THEN
+*
+*           Form  C := alpha*A*conjg( B' ) + beta*C.
+*
+            DO 200, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 160, I = 1, M
+                     C( I, J ) = ZERO
+  160             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 170, I = 1, M
+                     C( I, J ) = BETA*C( I, J )
+  170             CONTINUE
+               END IF
+               DO 190, L = 1, K
+                  IF( B( J, L ).NE.ZERO )THEN
+                     TEMP = ALPHA*DCONJG( B( J, L ) )
+                     DO 180, I = 1, M
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  180                CONTINUE
+                  END IF
+  190          CONTINUE
+  200       CONTINUE
+         ELSE
+*
+*           Form  C := alpha*A*B'          + beta*C
+*
+            DO 250, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 210, I = 1, M
+                     C( I, J ) = ZERO
+  210             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 220, I = 1, M
+                     C( I, J ) = BETA*C( I, J )
+  220             CONTINUE
+               END IF
+               DO 240, L = 1, K
+                  IF( B( J, L ).NE.ZERO )THEN
+                     TEMP = ALPHA*B( J, L )
+                     DO 230, I = 1, M
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  230                CONTINUE
+                  END IF
+  240          CONTINUE
+  250       CONTINUE
+         END IF
+      ELSE IF( CONJA )THEN
+         IF( CONJB )THEN
+*
+*           Form  C := alpha*conjg( A' )*conjg( B' ) + beta*C.
+*
+            DO 280, J = 1, N
+               DO 270, I = 1, M
+                  TEMP = ZERO
+                  DO 260, L = 1, K
+                     TEMP = TEMP +
+     $                      DCONJG( A( L, I ) )*DCONJG( B( J, L ) )
+  260             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  270          CONTINUE
+  280       CONTINUE
+         ELSE
+*
+*           Form  C := alpha*conjg( A' )*B' + beta*C
+*
+            DO 310, J = 1, N
+               DO 300, I = 1, M
+                  TEMP = ZERO
+                  DO 290, L = 1, K
+                     TEMP = TEMP + DCONJG( A( L, I ) )*B( J, L )
+  290             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  300          CONTINUE
+  310       CONTINUE
+         END IF
+      ELSE
+         IF( CONJB )THEN
+*
+*           Form  C := alpha*A'*conjg( B' ) + beta*C
+*
+            DO 340, J = 1, N
+               DO 330, I = 1, M
+                  TEMP = ZERO
+                  DO 320, L = 1, K
+                     TEMP = TEMP + A( L, I )*DCONJG( B( J, L ) )
+  320             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  330          CONTINUE
+  340       CONTINUE
+         ELSE
+*
+*           Form  C := alpha*A'*B' + beta*C
+*
+            DO 370, J = 1, N
+               DO 360, I = 1, M
+                  TEMP = ZERO
+                  DO 350, L = 1, K
+                     TEMP = TEMP + A( L, I )*B( J, L )
+  350             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  360          CONTINUE
+  370       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZGEMM .
+*
+      END
+      SUBROUTINE ZGEMV ( TRANS, M, N, ALPHA, A, LDA, X, INCX,
+     $                   BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      COMPLEX*16         ALPHA, BETA
+      INTEGER            INCX, INCY, LDA, M, N
+      CHARACTER*1        TRANS
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZGEMV  performs one of the matrix-vector operations
+*
+*     y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   or
+*
+*     y := alpha*conjg( A' )*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are vectors and A is an
+*  m by n matrix.
+*
+*  Parameters
+*  ==========
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.
+*
+*              TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.
+*
+*              TRANS = 'C' or 'c'   y := alpha*conjg( A' )*x + beta*y.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of the matrix A.
+*           M must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX*16      .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ).
+*           Before entry, the leading m by n part of the array A must
+*           contain the matrix of coefficients.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*  X      - COMPLEX*16       array of DIMENSION at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
+*           and at least
+*           ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
+*           Before entry, the incremented array X must contain the
+*           vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX*16      .
+*           On entry, BETA specifies the scalar beta. When BETA is
+*           supplied as zero then Y need not be set on input.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX*16       array of DIMENSION at least
+*           ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
+*           and at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
+*           Before entry with BETA non-zero, the incremented array Y
+*           must contain the vector y. On exit, Y is overwritten by the
+*           updated vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX*16         ONE
+      PARAMETER        ( ONE  = ( 1.0D+0, 0.0D+0 ) )
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     .. Local Scalars ..
+      COMPLEX*16         TEMP
+      INTEGER            I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY
+      LOGICAL            NOCONJ
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 1
+      ELSE IF( M.LT.0 )THEN
+         INFO = 2
+      ELSE IF( N.LT.0 )THEN
+         INFO = 3
+      ELSE IF( LDA.LT.MAX( 1, M ) )THEN
+         INFO = 6
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 8
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 11
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZGEMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+      NOCONJ = LSAME( TRANS, 'T' )
+*
+*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set
+*     up the start points in  X  and  Y.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         LENX = N
+         LENY = M
+      ELSE
+         LENX = M
+         LENY = N
+      END IF
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( LENX - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( LENY - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, LENY
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, LENY
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, LENY
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, LENY
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  y := alpha*A*x + y.
+*
+         JX = KX
+         IF( INCY.EQ.1 )THEN
+            DO 60, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  DO 50, I = 1, M
+                     Y( I ) = Y( I ) + TEMP*A( I, J )
+   50             CONTINUE
+               END IF
+               JX = JX + INCX
+   60       CONTINUE
+         ELSE
+            DO 80, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*X( JX )
+                  IY   = KY
+                  DO 70, I = 1, M
+                     Y( IY ) = Y( IY ) + TEMP*A( I, J )
+                     IY      = IY      + INCY
+   70             CONTINUE
+               END IF
+               JX = JX + INCX
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y := alpha*A'*x + y  or  y := alpha*conjg( A' )*x + y.
+*
+         JY = KY
+         IF( INCX.EQ.1 )THEN
+            DO 110, J = 1, N
+               TEMP = ZERO
+               IF( NOCONJ )THEN
+                  DO 90, I = 1, M
+                     TEMP = TEMP + A( I, J )*X( I )
+   90             CONTINUE
+               ELSE
+                  DO 100, I = 1, M
+                     TEMP = TEMP + DCONJG( A( I, J ) )*X( I )
+  100             CONTINUE
+               END IF
+               Y( JY ) = Y( JY ) + ALPHA*TEMP
+               JY      = JY      + INCY
+  110       CONTINUE
+         ELSE
+            DO 140, J = 1, N
+               TEMP = ZERO
+               IX   = KX
+               IF( NOCONJ )THEN
+                  DO 120, I = 1, M
+                     TEMP = TEMP + A( I, J )*X( IX )
+                     IX   = IX   + INCX
+  120             CONTINUE
+               ELSE
+                  DO 130, I = 1, M
+                     TEMP = TEMP + DCONJG( A( I, J ) )*X( IX )
+                     IX   = IX   + INCX
+  130             CONTINUE
+               END IF
+               Y( JY ) = Y( JY ) + ALPHA*TEMP
+               JY      = JY      + INCY
+  140       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZGEMV .
+*
+      END
+      SUBROUTINE ZGERC ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
+*     .. Scalar Arguments ..
+      COMPLEX*16         ALPHA
+      INTEGER            INCX, INCY, LDA, M, N
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZGERC  performs the rank 1 operation
+*
+*     A := alpha*x*conjg( y' ) + A,
+*
+*  where alpha is a scalar, x is an m element vector, y is an n element
+*  vector and A is an m by n matrix.
+*
+*  Parameters
+*  ==========
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of the matrix A.
+*           M must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX*16      .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( m - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the m
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y.
+*           Unchanged on exit.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ).
+*           Before entry, the leading m by n part of the array A must
+*           contain the matrix of coefficients. On exit, A is
+*           overwritten by the updated matrix.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     .. Local Scalars ..
+      COMPLEX*16         TEMP
+      INTEGER            I, INFO, IX, J, JY, KX
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( M.LT.0 )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 7
+      ELSE IF( LDA.LT.MAX( 1, M ) )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZGERC ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( INCY.GT.0 )THEN
+         JY = 1
+      ELSE
+         JY = 1 - ( N - 1 )*INCY
+      END IF
+      IF( INCX.EQ.1 )THEN
+         DO 20, J = 1, N
+            IF( Y( JY ).NE.ZERO )THEN
+               TEMP = ALPHA*DCONJG( Y( JY ) )
+               DO 10, I = 1, M
+                  A( I, J ) = A( I, J ) + X( I )*TEMP
+   10          CONTINUE
+            END IF
+            JY = JY + INCY
+   20    CONTINUE
+      ELSE
+         IF( INCX.GT.0 )THEN
+            KX = 1
+         ELSE
+            KX = 1 - ( M - 1 )*INCX
+         END IF
+         DO 40, J = 1, N
+            IF( Y( JY ).NE.ZERO )THEN
+               TEMP = ALPHA*DCONJG( Y( JY ) )
+               IX   = KX
+               DO 30, I = 1, M
+                  A( I, J ) = A( I, J ) + X( IX )*TEMP
+                  IX        = IX        + INCX
+   30          CONTINUE
+            END IF
+            JY = JY + INCY
+   40    CONTINUE
+      END IF
+*
+      RETURN
+*
+*     End of ZGERC .
+*
+      END
+      SUBROUTINE ZGERU ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
+*     .. Scalar Arguments ..
+      COMPLEX*16         ALPHA
+      INTEGER            INCX, INCY, LDA, M, N
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZGERU  performs the rank 1 operation
+*
+*     A := alpha*x*y' + A,
+*
+*  where alpha is a scalar, x is an m element vector, y is an n element
+*  vector and A is an m by n matrix.
+*
+*  Parameters
+*  ==========
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of the matrix A.
+*           M must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX*16      .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( m - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the m
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y.
+*           Unchanged on exit.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ).
+*           Before entry, the leading m by n part of the array A must
+*           contain the matrix of coefficients. On exit, A is
+*           overwritten by the updated matrix.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     .. Local Scalars ..
+      COMPLEX*16         TEMP
+      INTEGER            I, INFO, IX, J, JY, KX
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( M.LT.0 )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 7
+      ELSE IF( LDA.LT.MAX( 1, M ) )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZGERU ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( INCY.GT.0 )THEN
+         JY = 1
+      ELSE
+         JY = 1 - ( N - 1 )*INCY
+      END IF
+      IF( INCX.EQ.1 )THEN
+         DO 20, J = 1, N
+            IF( Y( JY ).NE.ZERO )THEN
+               TEMP = ALPHA*Y( JY )
+               DO 10, I = 1, M
+                  A( I, J ) = A( I, J ) + X( I )*TEMP
+   10          CONTINUE
+            END IF
+            JY = JY + INCY
+   20    CONTINUE
+      ELSE
+         IF( INCX.GT.0 )THEN
+            KX = 1
+         ELSE
+            KX = 1 - ( M - 1 )*INCX
+         END IF
+         DO 40, J = 1, N
+            IF( Y( JY ).NE.ZERO )THEN
+               TEMP = ALPHA*Y( JY )
+               IX   = KX
+               DO 30, I = 1, M
+                  A( I, J ) = A( I, J ) + X( IX )*TEMP
+                  IX        = IX        + INCX
+   30          CONTINUE
+            END IF
+            JY = JY + INCY
+   40    CONTINUE
+      END IF
+*
+      RETURN
+*
+*     End of ZGERU .
+*
+      END
+      SUBROUTINE ZHBMV ( UPLO, N, K, ALPHA, A, LDA, X, INCX,
+     $                   BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      COMPLEX*16         ALPHA, BETA
+      INTEGER            INCX, INCY, K, LDA, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZHBMV  performs the matrix-vector  operation
+*
+*     y := alpha*A*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are n element vectors and
+*  A is an n by n hermitian band matrix, with k super-diagonals.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the band matrix A is being supplied as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   The upper triangular part of A is
+*                                  being supplied.
+*
+*              UPLO = 'L' or 'l'   The lower triangular part of A is
+*                                  being supplied.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry, K specifies the number of super-diagonals of the
+*           matrix A. K must satisfy  0 .le. K.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX*16      .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ).
+*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
+*           by n part of the array A must contain the upper triangular
+*           band part of the hermitian matrix, supplied column by
+*           column, with the leading diagonal of the matrix in row
+*           ( k + 1 ) of the array, the first super-diagonal starting at
+*           position 2 in row k, and so on. The top left k by k triangle
+*           of the array A is not referenced.
+*           The following program segment will transfer the upper
+*           triangular part of a hermitian band matrix from conventional
+*           full matrix storage to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = K + 1 - J
+*                    DO 10, I = MAX( 1, J - K ), J
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
+*           by n part of the array A must contain the lower triangular
+*           band part of the hermitian matrix, supplied column by
+*           column, with the leading diagonal of the matrix in row 1 of
+*           the array, the first sub-diagonal starting at position 1 in
+*           row 2, and so on. The bottom right k by k triangle of the
+*           array A is not referenced.
+*           The following program segment will transfer the lower
+*           triangular part of a hermitian band matrix from conventional
+*           full matrix storage to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = 1 - J
+*                    DO 10, I = J, MIN( N, J + K )
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set and are assumed to be zero.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           ( k + 1 ).
+*           Unchanged on exit.
+*
+*  X      - COMPLEX*16       array of DIMENSION at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the
+*           vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX*16      .
+*           On entry, BETA specifies the scalar beta.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX*16       array of DIMENSION at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the
+*           vector y. On exit, Y is overwritten by the updated vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX*16         ONE
+      PARAMETER        ( ONE  = ( 1.0D+0, 0.0D+0 ) )
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     .. Local Scalars ..
+      COMPLEX*16         TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, MAX, MIN, DBLE
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( K.LT.0 )THEN
+         INFO = 3
+      ELSE IF( LDA.LT.( K + 1 ) )THEN
+         INFO = 6
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 8
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 11
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZHBMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     Set up the start points in  X  and  Y.
+*
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( N - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( N - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of the array A
+*     are accessed sequentially with one pass through A.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, N
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, N
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, N
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, N
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  y  when upper triangle of A is stored.
+*
+         KPLUS1 = K + 1
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               TEMP1 = ALPHA*X( J )
+               TEMP2 = ZERO
+               L     = KPLUS1 - J
+               DO 50, I = MAX( 1, J - K ), J - 1
+                  Y( I ) = Y( I ) + TEMP1*A( L + I, J )
+                  TEMP2  = TEMP2  + DCONJG( A( L + I, J ) )*X( I )
+   50          CONTINUE
+               Y( J ) = Y( J ) + TEMP1*DBLE( A( KPLUS1, J ) )
+     $                         + ALPHA*TEMP2
+   60       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 80, J = 1, N
+               TEMP1 = ALPHA*X( JX )
+               TEMP2 = ZERO
+               IX    = KX
+               IY    = KY
+               L     = KPLUS1 - J
+               DO 70, I = MAX( 1, J - K ), J - 1
+                  Y( IY ) = Y( IY ) + TEMP1*A( L + I, J )
+                  TEMP2   = TEMP2   + DCONJG( A( L + I, J ) )*X( IX )
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+   70          CONTINUE
+               Y( JY ) = Y( JY ) + TEMP1*DBLE( A( KPLUS1, J ) )
+     $                           + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+               IF( J.GT.K )THEN
+                  KX = KX + INCX
+                  KY = KY + INCY
+               END IF
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y  when lower triangle of A is stored.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 100, J = 1, N
+               TEMP1  = ALPHA*X( J )
+               TEMP2  = ZERO
+               Y( J ) = Y( J ) + TEMP1*DBLE( A( 1, J ) )
+               L      = 1      - J
+               DO 90, I = J + 1, MIN( N, J + K )
+                  Y( I ) = Y( I ) + TEMP1*A( L + I, J )
+                  TEMP2  = TEMP2  + DCONJG( A( L + I, J ) )*X( I )
+   90          CONTINUE
+               Y( J ) = Y( J ) + ALPHA*TEMP2
+  100       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 120, J = 1, N
+               TEMP1   = ALPHA*X( JX )
+               TEMP2   = ZERO
+               Y( JY ) = Y( JY ) + TEMP1*DBLE( A( 1, J ) )
+               L       = 1       - J
+               IX      = JX
+               IY      = JY
+               DO 110, I = J + 1, MIN( N, J + K )
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+                  Y( IY ) = Y( IY ) + TEMP1*A( L + I, J )
+                  TEMP2   = TEMP2   + DCONJG( A( L + I, J ) )*X( IX )
+  110          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+  120       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZHBMV .
+*
+      END
+      SUBROUTINE ZHEMM ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        SIDE, UPLO
+      INTEGER            M, N, LDA, LDB, LDC
+      COMPLEX*16         ALPHA, BETA
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), B( LDB, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZHEMM  performs one of the matrix-matrix operations
+*
+*     C := alpha*A*B + beta*C,
+*
+*  or
+*
+*     C := alpha*B*A + beta*C,
+*
+*  where alpha and beta are scalars, A is an hermitian matrix and  B and
+*  C are m by n matrices.
+*
+*  Parameters
+*  ==========
+*
+*  SIDE   - CHARACTER*1.
+*           On entry,  SIDE  specifies whether  the  hermitian matrix  A
+*           appears on the  left or right  in the  operation as follows:
+*
+*              SIDE = 'L' or 'l'   C := alpha*A*B + beta*C,
+*
+*              SIDE = 'R' or 'r'   C := alpha*B*A + beta*C,
+*
+*           Unchanged on exit.
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of  the  hermitian  matrix   A  is  to  be
+*           referenced as follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of the
+*                                  hermitian matrix is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of the
+*                                  hermitian matrix is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry,  M  specifies the number of rows of the matrix  C.
+*           M  must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix C.
+*           N  must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX*16      .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, ka ), where ka is
+*           m  when  SIDE = 'L' or 'l'  and is n  otherwise.
+*           Before entry  with  SIDE = 'L' or 'l',  the  m by m  part of
+*           the array  A  must contain the  hermitian matrix,  such that
+*           when  UPLO = 'U' or 'u', the leading m by m upper triangular
+*           part of the array  A  must contain the upper triangular part
+*           of the  hermitian matrix and the  strictly  lower triangular
+*           part of  A  is not referenced,  and when  UPLO = 'L' or 'l',
+*           the leading  m by m  lower triangular part  of the  array  A
+*           must  contain  the  lower triangular part  of the  hermitian
+*           matrix and the  strictly upper triangular part of  A  is not
+*           referenced.
+*           Before entry  with  SIDE = 'R' or 'r',  the  n by n  part of
+*           the array  A  must contain the  hermitian matrix,  such that
+*           when  UPLO = 'U' or 'u', the leading n by n upper triangular
+*           part of the array  A  must contain the upper triangular part
+*           of the  hermitian matrix and the  strictly  lower triangular
+*           part of  A  is not referenced,  and when  UPLO = 'L' or 'l',
+*           the leading  n by n  lower triangular part  of the  array  A
+*           must  contain  the  lower triangular part  of the  hermitian
+*           matrix and the  strictly upper triangular part of  A  is not
+*           referenced.
+*           Note that the imaginary parts  of the diagonal elements need
+*           not be set, they are assumed to be zero.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the  calling (sub) program. When  SIDE = 'L' or 'l'  then
+*           LDA must be at least  max( 1, m ), otherwise  LDA must be at
+*           least max( 1, n ).
+*           Unchanged on exit.
+*
+*  B      - COMPLEX*16       array of DIMENSION ( LDB, n ).
+*           Before entry, the leading  m by n part of the array  B  must
+*           contain the matrix B.
+*           Unchanged on exit.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   LDB  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX*16      .
+*           On entry,  BETA  specifies the scalar  beta.  When  BETA  is
+*           supplied as zero then C need not be set on input.
+*           Unchanged on exit.
+*
+*  C      - COMPLEX*16       array of DIMENSION ( LDC, n ).
+*           Before entry, the leading  m by n  part of the array  C must
+*           contain the matrix  C,  except when  beta  is zero, in which
+*           case C need not be set on entry.
+*           On exit, the array  C  is overwritten by the  m by n updated
+*           matrix.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, MAX, DBLE
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, K, NROWA
+      COMPLEX*16         TEMP1, TEMP2
+*     .. Parameters ..
+      COMPLEX*16         ONE
+      PARAMETER        ( ONE  = ( 1.0D+0, 0.0D+0 ) )
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Set NROWA as the number of rows of A.
+*
+      IF( LSAME( SIDE, 'L' ) )THEN
+         NROWA = M
+      ELSE
+         NROWA = N
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF(      ( .NOT.LSAME( SIDE, 'L' ) ).AND.
+     $         ( .NOT.LSAME( SIDE, 'R' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.UPPER              ).AND.
+     $         ( .NOT.LSAME( UPLO, 'L' ) )      )THEN
+         INFO = 2
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 7
+      ELSE IF( LDB.LT.MAX( 1, M     ) )THEN
+         INFO = 9
+      ELSE IF( LDC.LT.MAX( 1, M     ) )THEN
+         INFO = 12
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZHEMM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( BETA.EQ.ZERO )THEN
+            DO 20, J = 1, N
+               DO 10, I = 1, M
+                  C( I, J ) = ZERO
+   10          CONTINUE
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               DO 30, I = 1, M
+                  C( I, J ) = BETA*C( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( SIDE, 'L' ) )THEN
+*
+*        Form  C := alpha*A*B + beta*C.
+*
+         IF( UPPER )THEN
+            DO 70, J = 1, N
+               DO 60, I = 1, M
+                  TEMP1 = ALPHA*B( I, J )
+                  TEMP2 = ZERO
+                  DO 50, K = 1, I - 1
+                     C( K, J ) = C( K, J ) + TEMP1*A( K, I )
+                     TEMP2     = TEMP2     +
+     $                           B( K, J )*DCONJG( A( K, I ) )
+   50             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = TEMP1*DBLE( A( I, I ) ) +
+     $                           ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J )         +
+     $                           TEMP1*DBLE( A( I, I ) ) +
+     $                           ALPHA*TEMP2
+                  END IF
+   60          CONTINUE
+   70       CONTINUE
+         ELSE
+            DO 100, J = 1, N
+               DO 90, I = M, 1, -1
+                  TEMP1 = ALPHA*B( I, J )
+                  TEMP2 = ZERO
+                  DO 80, K = I + 1, M
+                     C( K, J ) = C( K, J ) + TEMP1*A( K, I )
+                     TEMP2     = TEMP2     +
+     $                           B( K, J )*DCONJG( A( K, I ) )
+   80             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = TEMP1*DBLE( A( I, I ) ) +
+     $                           ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J )         +
+     $                           TEMP1*DBLE( A( I, I ) ) +
+     $                           ALPHA*TEMP2
+                  END IF
+   90          CONTINUE
+  100       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*B*A + beta*C.
+*
+         DO 170, J = 1, N
+            TEMP1 = ALPHA*DBLE( A( J, J ) )
+            IF( BETA.EQ.ZERO )THEN
+               DO 110, I = 1, M
+                  C( I, J ) = TEMP1*B( I, J )
+  110          CONTINUE
+            ELSE
+               DO 120, I = 1, M
+                  C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J )
+  120          CONTINUE
+            END IF
+            DO 140, K = 1, J - 1
+               IF( UPPER )THEN
+                  TEMP1 = ALPHA*A( K, J )
+               ELSE
+                  TEMP1 = ALPHA*DCONJG( A( J, K ) )
+               END IF
+               DO 130, I = 1, M
+                  C( I, J ) = C( I, J ) + TEMP1*B( I, K )
+  130          CONTINUE
+  140       CONTINUE
+            DO 160, K = J + 1, N
+               IF( UPPER )THEN
+                  TEMP1 = ALPHA*DCONJG( A( J, K ) )
+               ELSE
+                  TEMP1 = ALPHA*A( K, J )
+               END IF
+               DO 150, I = 1, M
+                  C( I, J ) = C( I, J ) + TEMP1*B( I, K )
+  150          CONTINUE
+  160       CONTINUE
+  170    CONTINUE
+      END IF
+*
+      RETURN
+*
+*     End of ZHEMM .
+*
+      END
+      SUBROUTINE ZHEMV ( UPLO, N, ALPHA, A, LDA, X, INCX,
+     $                   BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      COMPLEX*16         ALPHA, BETA
+      INTEGER            INCX, INCY, LDA, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZHEMV  performs the matrix-vector  operation
+*
+*     y := alpha*A*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are n element vectors and
+*  A is an n by n hermitian matrix.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the array A is to be referenced as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of A
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of A
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX*16      .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular part of the hermitian matrix and the strictly
+*           lower triangular part of A is not referenced.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular part of the hermitian matrix and the strictly
+*           upper triangular part of A is not referenced.
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set and are assumed to be zero.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*  X      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX*16      .
+*           On entry, BETA specifies the scalar beta. When BETA is
+*           supplied as zero then Y need not be set on input.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y. On exit, Y is overwritten by the updated
+*           vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX*16         ONE
+      PARAMETER        ( ONE  = ( 1.0D+0, 0.0D+0 ) )
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     .. Local Scalars ..
+      COMPLEX*16         TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, KX, KY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, MAX, DBLE
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 5
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 7
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 10
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZHEMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     Set up the start points in  X  and  Y.
+*
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( N - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( N - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through the triangular part
+*     of A.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, N
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, N
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, N
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, N
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  y  when A is stored in upper triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               TEMP1 = ALPHA*X( J )
+               TEMP2 = ZERO
+               DO 50, I = 1, J - 1
+                  Y( I ) = Y( I ) + TEMP1*A( I, J )
+                  TEMP2  = TEMP2  + DCONJG( A( I, J ) )*X( I )
+   50          CONTINUE
+               Y( J ) = Y( J ) + TEMP1*DBLE( A( J, J ) ) + ALPHA*TEMP2
+   60       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 80, J = 1, N
+               TEMP1 = ALPHA*X( JX )
+               TEMP2 = ZERO
+               IX    = KX
+               IY    = KY
+               DO 70, I = 1, J - 1
+                  Y( IY ) = Y( IY ) + TEMP1*A( I, J )
+                  TEMP2   = TEMP2   + DCONJG( A( I, J ) )*X( IX )
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+   70          CONTINUE
+               Y( JY ) = Y( JY ) + TEMP1*DBLE( A( J, J ) ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y  when A is stored in lower triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 100, J = 1, N
+               TEMP1  = ALPHA*X( J )
+               TEMP2  = ZERO
+               Y( J ) = Y( J ) + TEMP1*DBLE( A( J, J ) )
+               DO 90, I = J + 1, N
+                  Y( I ) = Y( I ) + TEMP1*A( I, J )
+                  TEMP2  = TEMP2  + DCONJG( A( I, J ) )*X( I )
+   90          CONTINUE
+               Y( J ) = Y( J ) + ALPHA*TEMP2
+  100       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 120, J = 1, N
+               TEMP1   = ALPHA*X( JX )
+               TEMP2   = ZERO
+               Y( JY ) = Y( JY ) + TEMP1*DBLE( A( J, J ) )
+               IX      = JX
+               IY      = JY
+               DO 110, I = J + 1, N
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+                  Y( IY ) = Y( IY ) + TEMP1*A( I, J )
+                  TEMP2   = TEMP2   + DCONJG( A( I, J ) )*X( IX )
+  110          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+  120       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZHEMV .
+*
+      END
+      SUBROUTINE ZHER2 ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA )
+*     .. Scalar Arguments ..
+      COMPLEX*16         ALPHA
+      INTEGER            INCX, INCY, LDA, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZHER2  performs the hermitian rank 2 operation
+*
+*     A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A,
+*
+*  where alpha is a scalar, x and y are n element vectors and A is an n
+*  by n hermitian matrix.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the array A is to be referenced as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of A
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of A
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX*16      .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y.
+*           Unchanged on exit.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular part of the hermitian matrix and the strictly
+*           lower triangular part of A is not referenced. On exit, the
+*           upper triangular part of the array A is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular part of the hermitian matrix and the strictly
+*           upper triangular part of A is not referenced. On exit, the
+*           lower triangular part of the array A is overwritten by the
+*           lower triangular part of the updated matrix.
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set, they are assumed to be zero, and on exit they
+*           are set to zero.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     .. Local Scalars ..
+      COMPLEX*16         TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, KX, KY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, MAX, DBLE
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 7
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZHER2 ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Set up the start points in X and Y if the increments are not both
+*     unity.
+*
+      IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN
+         IF( INCX.GT.0 )THEN
+            KX = 1
+         ELSE
+            KX = 1 - ( N - 1 )*INCX
+         END IF
+         IF( INCY.GT.0 )THEN
+            KY = 1
+         ELSE
+            KY = 1 - ( N - 1 )*INCY
+         END IF
+         JX = KX
+         JY = KY
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through the triangular part
+*     of A.
+*
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  A  when A is stored in the upper triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 20, J = 1, N
+               IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*DCONJG( Y( J ) )
+                  TEMP2 = DCONJG( ALPHA*X( J ) )
+                  DO 10, I = 1, J - 1
+                     A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2
+   10             CONTINUE
+                  A( J, J ) = DBLE( A( J, J ) ) +
+     $                        DBLE( X( J )*TEMP1 + Y( J )*TEMP2 )
+               ELSE
+                  A( J, J ) = DBLE( A( J, J ) )
+               END IF
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*DCONJG( Y( JY ) )
+                  TEMP2 = DCONJG( ALPHA*X( JX ) )
+                  IX    = KX
+                  IY    = KY
+                  DO 30, I = 1, J - 1
+                     A( I, J ) = A( I, J ) + X( IX )*TEMP1
+     $                                     + Y( IY )*TEMP2
+                     IX        = IX        + INCX
+                     IY        = IY        + INCY
+   30             CONTINUE
+                  A( J, J ) = DBLE( A( J, J ) ) +
+     $                        DBLE( X( JX )*TEMP1 + Y( JY )*TEMP2 )
+               ELSE
+                  A( J, J ) = DBLE( A( J, J ) )
+               END IF
+               JX = JX + INCX
+               JY = JY + INCY
+   40       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  A  when A is stored in the lower triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN
+                  TEMP1     = ALPHA*DCONJG( Y( J ) )
+                  TEMP2     = DCONJG( ALPHA*X( J ) )
+                  A( J, J ) = DBLE( A( J, J ) ) +
+     $                        DBLE( X( J )*TEMP1 + Y( J )*TEMP2 )
+                  DO 50, I = J + 1, N
+                     A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2
+   50             CONTINUE
+               ELSE
+                  A( J, J ) = DBLE( A( J, J ) )
+               END IF
+   60       CONTINUE
+         ELSE
+            DO 80, J = 1, N
+               IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN
+                  TEMP1     = ALPHA*DCONJG( Y( JY ) )
+                  TEMP2     = DCONJG( ALPHA*X( JX ) )
+                  A( J, J ) = DBLE( A( J, J ) ) +
+     $                        DBLE( X( JX )*TEMP1 + Y( JY )*TEMP2 )
+                  IX        = JX
+                  IY        = JY
+                  DO 70, I = J + 1, N
+                     IX        = IX        + INCX
+                     IY        = IY        + INCY
+                     A( I, J ) = A( I, J ) + X( IX )*TEMP1
+     $                                     + Y( IY )*TEMP2
+   70             CONTINUE
+               ELSE
+                  A( J, J ) = DBLE( A( J, J ) )
+               END IF
+               JX = JX + INCX
+               JY = JY + INCY
+   80       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZHER2 .
+*
+      END
+      SUBROUTINE ZHER2K( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, BETA,
+     $                   C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER          TRANS, UPLO
+      INTEGER            K, LDA, LDB, LDC, N
+      DOUBLE PRECISION   BETA
+      COMPLEX*16         ALPHA
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), B( LDB, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZHER2K  performs one of the hermitian rank 2k operations
+*
+*     C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + beta*C,
+*
+*  or
+*
+*     C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A + beta*C,
+*
+*  where  alpha and beta  are scalars with  beta  real,  C is an  n by n
+*  hermitian matrix and  A and B  are  n by k matrices in the first case
+*  and  k by n  matrices in the second case.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of the  array  C  is to be  referenced  as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the  upper triangular part of  C
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the  lower triangular part of  C
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry,  TRANS  specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'    C := alpha*A*conjg( B' )          +
+*                                         conjg( alpha )*B*conjg( A' ) +
+*                                         beta*C.
+*
+*              TRANS = 'C' or 'c'    C := alpha*conjg( A' )*B          +
+*                                         conjg( alpha )*conjg( B' )*A +
+*                                         beta*C.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry,  N specifies the order of the matrix C.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with  TRANS = 'N' or 'n',  K  specifies  the number
+*           of  columns  of the  matrices  A and B,  and on  entry  with
+*           TRANS = 'C' or 'c',  K  specifies  the number of rows of the
+*           matrices  A and B.  K must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX*16         .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, ka ), where ka is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  A  must contain the matrix  A,  otherwise
+*           the leading  k by n  part of the array  A  must contain  the
+*           matrix A.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDA must be at least  max( 1, n ), otherwise  LDA must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  B      - COMPLEX*16       array of DIMENSION ( LDB, kb ), where kb is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  B  must contain the matrix  B,  otherwise
+*           the leading  k by n  part of the array  B  must contain  the
+*           matrix B.
+*           Unchanged on exit.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDB must be at least  max( 1, n ), otherwise  LDB must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  BETA   - DOUBLE PRECISION            .
+*           On entry, BETA specifies the scalar beta.
+*           Unchanged on exit.
+*
+*  C      - COMPLEX*16          array of DIMENSION ( LDC, n ).
+*           Before entry  with  UPLO = 'U' or 'u',  the leading  n by n
+*           upper triangular part of the array C must contain the upper
+*           triangular part  of the  hermitian matrix  and the strictly
+*           lower triangular part of C is not referenced.  On exit, the
+*           upper triangular part of the array  C is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry  with  UPLO = 'L' or 'l',  the leading  n by n
+*           lower triangular part of the array C must contain the lower
+*           triangular part  of the  hermitian matrix  and the strictly
+*           upper triangular part of C is not referenced.  On exit, the
+*           lower triangular part of the array  C is overwritten by the
+*           lower triangular part of the updated matrix.
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set,  they are assumed to be zero,  and on exit they
+*           are set to zero.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*  -- Modified 8-Nov-93 to set C(J,J) to DBLE( C(J,J) ) when BETA = 1.
+*     Ed Anderson, Cray Research Inc.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DBLE, DCONJG, MAX
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, L, NROWA
+      COMPLEX*16         TEMP1, TEMP2
+*     ..
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE
+      PARAMETER          ( ONE = 1.0D+0 )
+      COMPLEX*16         ZERO
+      PARAMETER          ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      IF( LSAME( TRANS, 'N' ) ) THEN
+         NROWA = N
+      ELSE
+         NROWA = K
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+      INFO = 0
+      IF( ( .NOT.UPPER ) .AND. ( .NOT.LSAME( UPLO, 'L' ) ) ) THEN
+         INFO = 1
+      ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ) .AND.
+     $         ( .NOT.LSAME( TRANS, 'C' ) ) ) THEN
+         INFO = 2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = 3
+      ELSE IF( K.LT.0 ) THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) ) THEN
+         INFO = 7
+      ELSE IF( LDB.LT.MAX( 1, NROWA ) ) THEN
+         INFO = 9
+      ELSE IF( LDC.LT.MAX( 1, N ) ) THEN
+         INFO = 12
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZHER2K', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ) .OR. ( ( ( ALPHA.EQ.ZERO ) .OR. ( K.EQ.0 ) ) .AND.
+     $    ( BETA.EQ.ONE ) ) )RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO ) THEN
+         IF( UPPER ) THEN
+            IF( BETA.EQ.DBLE( ZERO ) ) THEN
+               DO 20 J = 1, N
+                  DO 10 I = 1, J
+                     C( I, J ) = ZERO
+   10             CONTINUE
+   20          CONTINUE
+            ELSE
+               DO 40 J = 1, N
+                  DO 30 I = 1, J - 1
+                     C( I, J ) = BETA*C( I, J )
+   30             CONTINUE
+                  C( J, J ) = BETA*DBLE( C( J, J ) )
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( BETA.EQ.DBLE( ZERO ) ) THEN
+               DO 60 J = 1, N
+                  DO 50 I = J, N
+                     C( I, J ) = ZERO
+   50             CONTINUE
+   60          CONTINUE
+            ELSE
+               DO 80 J = 1, N
+                  C( J, J ) = BETA*DBLE( C( J, J ) )
+                  DO 70 I = J + 1, N
+                     C( I, J ) = BETA*C( I, J )
+   70             CONTINUE
+   80          CONTINUE
+            END IF
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( TRANS, 'N' ) ) THEN
+*
+*        Form  C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) +
+*                   C.
+*
+         IF( UPPER ) THEN
+            DO 130 J = 1, N
+               IF( BETA.EQ.DBLE( ZERO ) ) THEN
+                  DO 90 I = 1, J
+                     C( I, J ) = ZERO
+   90             CONTINUE
+               ELSE IF( BETA.NE.ONE ) THEN
+                  DO 100 I = 1, J - 1
+                     C( I, J ) = BETA*C( I, J )
+  100             CONTINUE
+                  C( J, J ) = BETA*DBLE( C( J, J ) )
+               ELSE
+                  C( J, J ) = DBLE( C( J, J ) )
+               END IF
+               DO 120 L = 1, K
+                  IF( ( A( J, L ).NE.ZERO ) .OR. ( B( J, L ).NE.ZERO ) )
+     $                 THEN
+                     TEMP1 = ALPHA*DCONJG( B( J, L ) )
+                     TEMP2 = DCONJG( ALPHA*A( J, L ) )
+                     DO 110 I = 1, J - 1
+                        C( I, J ) = C( I, J ) + A( I, L )*TEMP1 +
+     $                              B( I, L )*TEMP2
+  110                CONTINUE
+                     C( J, J ) = DBLE( C( J, J ) ) +
+     $                           DBLE( A( J, L )*TEMP1+B( J, L )*TEMP2 )
+                  END IF
+  120          CONTINUE
+  130       CONTINUE
+         ELSE
+            DO 180 J = 1, N
+               IF( BETA.EQ.DBLE( ZERO ) ) THEN
+                  DO 140 I = J, N
+                     C( I, J ) = ZERO
+  140             CONTINUE
+               ELSE IF( BETA.NE.ONE ) THEN
+                  DO 150 I = J + 1, N
+                     C( I, J ) = BETA*C( I, J )
+  150             CONTINUE
+                  C( J, J ) = BETA*DBLE( C( J, J ) )
+               ELSE
+                  C( J, J ) = DBLE( C( J, J ) )
+               END IF
+               DO 170 L = 1, K
+                  IF( ( A( J, L ).NE.ZERO ) .OR. ( B( J, L ).NE.ZERO ) )
+     $                 THEN
+                     TEMP1 = ALPHA*DCONJG( B( J, L ) )
+                     TEMP2 = DCONJG( ALPHA*A( J, L ) )
+                     DO 160 I = J + 1, N
+                        C( I, J ) = C( I, J ) + A( I, L )*TEMP1 +
+     $                              B( I, L )*TEMP2
+  160                CONTINUE
+                     C( J, J ) = DBLE( C( J, J ) ) +
+     $                           DBLE( A( J, L )*TEMP1+B( J, L )*TEMP2 )
+                  END IF
+  170          CONTINUE
+  180       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A +
+*                   C.
+*
+         IF( UPPER ) THEN
+            DO 210 J = 1, N
+               DO 200 I = 1, J
+                  TEMP1 = ZERO
+                  TEMP2 = ZERO
+                  DO 190 L = 1, K
+                     TEMP1 = TEMP1 + DCONJG( A( L, I ) )*B( L, J )
+                     TEMP2 = TEMP2 + DCONJG( B( L, I ) )*A( L, J )
+  190             CONTINUE
+                  IF( I.EQ.J ) THEN
+                     IF( BETA.EQ.DBLE( ZERO ) ) THEN
+                        C( J, J ) = DBLE( ALPHA*TEMP1+DCONJG( ALPHA )*
+     $                              TEMP2 )
+                     ELSE
+                        C( J, J ) = BETA*DBLE( C( J, J ) ) +
+     $                              DBLE( ALPHA*TEMP1+DCONJG( ALPHA )*
+     $                              TEMP2 )
+                     END IF
+                  ELSE
+                     IF( BETA.EQ.DBLE( ZERO ) ) THEN
+                        C( I, J ) = ALPHA*TEMP1 + DCONJG( ALPHA )*TEMP2
+                     ELSE
+                        C( I, J ) = BETA*C( I, J ) + ALPHA*TEMP1 +
+     $                              DCONJG( ALPHA )*TEMP2
+                     END IF
+                  END IF
+  200          CONTINUE
+  210       CONTINUE
+         ELSE
+            DO 240 J = 1, N
+               DO 230 I = J, N
+                  TEMP1 = ZERO
+                  TEMP2 = ZERO
+                  DO 220 L = 1, K
+                     TEMP1 = TEMP1 + DCONJG( A( L, I ) )*B( L, J )
+                     TEMP2 = TEMP2 + DCONJG( B( L, I ) )*A( L, J )
+  220             CONTINUE
+                  IF( I.EQ.J ) THEN
+                     IF( BETA.EQ.DBLE( ZERO ) ) THEN
+                        C( J, J ) = DBLE( ALPHA*TEMP1+DCONJG( ALPHA )*
+     $                              TEMP2 )
+                     ELSE
+                        C( J, J ) = BETA*DBLE( C( J, J ) ) +
+     $                              DBLE( ALPHA*TEMP1+DCONJG( ALPHA )*
+     $                              TEMP2 )
+                     END IF
+                  ELSE
+                     IF( BETA.EQ.DBLE( ZERO ) ) THEN
+                        C( I, J ) = ALPHA*TEMP1 + DCONJG( ALPHA )*TEMP2
+                     ELSE
+                        C( I, J ) = BETA*C( I, J ) + ALPHA*TEMP1 +
+     $                              DCONJG( ALPHA )*TEMP2
+                     END IF
+                  END IF
+  230          CONTINUE
+  240       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZHER2K.
+*
+      END
+      SUBROUTINE ZHER  ( UPLO, N, ALPHA, X, INCX, A, LDA )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   ALPHA
+      INTEGER            INCX, LDA, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZHER   performs the hermitian rank 1 operation
+*
+*     A := alpha*x*conjg( x' ) + A,
+*
+*  where alpha is a real scalar, x is an n element vector and A is an
+*  n by n hermitian matrix.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the array A is to be referenced as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of A
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of A
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular part of the hermitian matrix and the strictly
+*           lower triangular part of A is not referenced. On exit, the
+*           upper triangular part of the array A is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular part of the hermitian matrix and the strictly
+*           upper triangular part of A is not referenced. On exit, the
+*           lower triangular part of the array A is overwritten by the
+*           lower triangular part of the updated matrix.
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set, they are assumed to be zero, and on exit they
+*           are set to zero.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     .. Local Scalars ..
+      COMPLEX*16         TEMP
+      INTEGER            I, INFO, IX, J, JX, KX
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, MAX, DBLE
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 7
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZHER  ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ALPHA.EQ.DBLE( ZERO ) ) )
+     $   RETURN
+*
+*     Set the start point in X if the increment is not unity.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through the triangular part
+*     of A.
+*
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  A  when A is stored in upper triangle.
+*
+         IF( INCX.EQ.1 )THEN
+            DO 20, J = 1, N
+               IF( X( J ).NE.ZERO )THEN
+                  TEMP = ALPHA*DCONJG( X( J ) )
+                  DO 10, I = 1, J - 1
+                     A( I, J ) = A( I, J ) + X( I )*TEMP
+   10             CONTINUE
+                  A( J, J ) = DBLE( A( J, J ) ) + DBLE( X( J )*TEMP )
+               ELSE
+                  A( J, J ) = DBLE( A( J, J ) )
+               END IF
+   20       CONTINUE
+         ELSE
+            JX = KX
+            DO 40, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*DCONJG( X( JX ) )
+                  IX   = KX
+                  DO 30, I = 1, J - 1
+                     A( I, J ) = A( I, J ) + X( IX )*TEMP
+                     IX        = IX        + INCX
+   30             CONTINUE
+                  A( J, J ) = DBLE( A( J, J ) ) + DBLE( X( JX )*TEMP )
+               ELSE
+                  A( J, J ) = DBLE( A( J, J ) )
+               END IF
+               JX = JX + INCX
+   40       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  A  when A is stored in lower triangle.
+*
+         IF( INCX.EQ.1 )THEN
+            DO 60, J = 1, N
+               IF( X( J ).NE.ZERO )THEN
+                  TEMP      = ALPHA*DCONJG( X( J ) )
+                  A( J, J ) = DBLE( A( J, J ) ) + DBLE( TEMP*X( J ) )
+                  DO 50, I = J + 1, N
+                     A( I, J ) = A( I, J ) + X( I )*TEMP
+   50             CONTINUE
+               ELSE
+                  A( J, J ) = DBLE( A( J, J ) )
+               END IF
+   60       CONTINUE
+         ELSE
+            JX = KX
+            DO 80, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP      = ALPHA*DCONJG( X( JX ) )
+                  A( J, J ) = DBLE( A( J, J ) ) + DBLE( TEMP*X( JX ) )
+                  IX        = JX
+                  DO 70, I = J + 1, N
+                     IX        = IX        + INCX
+                     A( I, J ) = A( I, J ) + X( IX )*TEMP
+   70             CONTINUE
+               ELSE
+                  A( J, J ) = DBLE( A( J, J ) )
+               END IF
+               JX = JX + INCX
+   80       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZHER  .
+*
+      END
+      SUBROUTINE ZHERK( UPLO, TRANS, N, K, ALPHA, A, LDA, BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER          TRANS, UPLO
+      INTEGER            K, LDA, LDC, N
+      DOUBLE PRECISION   ALPHA, BETA
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZHERK  performs one of the hermitian rank k operations
+*
+*     C := alpha*A*conjg( A' ) + beta*C,
+*
+*  or
+*
+*     C := alpha*conjg( A' )*A + beta*C,
+*
+*  where  alpha and beta  are  real scalars,  C is an  n by n  hermitian
+*  matrix and  A  is an  n by k  matrix in the  first case and a  k by n
+*  matrix in the second case.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of the  array  C  is to be  referenced  as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the  upper triangular part of  C
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the  lower triangular part of  C
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry,  TRANS  specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   C := alpha*A*conjg( A' ) + beta*C.
+*
+*              TRANS = 'C' or 'c'   C := alpha*conjg( A' )*A + beta*C.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry,  N specifies the order of the matrix C.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with  TRANS = 'N' or 'n',  K  specifies  the number
+*           of  columns   of  the   matrix   A,   and  on   entry   with
+*           TRANS = 'C' or 'c',  K  specifies  the number of rows of the
+*           matrix A.  K must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION            .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, ka ), where ka is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  A  must contain the matrix  A,  otherwise
+*           the leading  k by n  part of the array  A  must contain  the
+*           matrix A.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDA must be at least  max( 1, n ), otherwise  LDA must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  BETA   - DOUBLE PRECISION.
+*           On entry, BETA specifies the scalar beta.
+*           Unchanged on exit.
+*
+*  C      - COMPLEX*16          array of DIMENSION ( LDC, n ).
+*           Before entry  with  UPLO = 'U' or 'u',  the leading  n by n
+*           upper triangular part of the array C must contain the upper
+*           triangular part  of the  hermitian matrix  and the strictly
+*           lower triangular part of C is not referenced.  On exit, the
+*           upper triangular part of the array  C is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry  with  UPLO = 'L' or 'l',  the leading  n by n
+*           lower triangular part of the array C must contain the lower
+*           triangular part  of the  hermitian matrix  and the strictly
+*           upper triangular part of C is not referenced.  On exit, the
+*           lower triangular part of the array  C is overwritten by the
+*           lower triangular part of the updated matrix.
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set,  they are assumed to be zero,  and on exit they
+*           are set to zero.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*  -- Modified 8-Nov-93 to set C(J,J) to DBLE( C(J,J) ) when BETA = 1.
+*     Ed Anderson, Cray Research Inc.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DBLE, DCMPLX, DCONJG, MAX
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, L, NROWA
+      DOUBLE PRECISION   RTEMP
+      COMPLEX*16         TEMP
+*     ..
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE, ZERO
+      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      IF( LSAME( TRANS, 'N' ) ) THEN
+         NROWA = N
+      ELSE
+         NROWA = K
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+      INFO = 0
+      IF( ( .NOT.UPPER ) .AND. ( .NOT.LSAME( UPLO, 'L' ) ) ) THEN
+         INFO = 1
+      ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ) .AND.
+     $         ( .NOT.LSAME( TRANS, 'C' ) ) ) THEN
+         INFO = 2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = 3
+      ELSE IF( K.LT.0 ) THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) ) THEN
+         INFO = 7
+      ELSE IF( LDC.LT.MAX( 1, N ) ) THEN
+         INFO = 10
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZHERK ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ) .OR. ( ( ( ALPHA.EQ.ZERO ) .OR. ( K.EQ.0 ) ) .AND.
+     $    ( BETA.EQ.ONE ) ) )RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO ) THEN
+         IF( UPPER ) THEN
+            IF( BETA.EQ.ZERO ) THEN
+               DO 20 J = 1, N
+                  DO 10 I = 1, J
+                     C( I, J ) = ZERO
+   10             CONTINUE
+   20          CONTINUE
+            ELSE
+               DO 40 J = 1, N
+                  DO 30 I = 1, J - 1
+                     C( I, J ) = BETA*C( I, J )
+   30             CONTINUE
+                  C( J, J ) = BETA*DBLE( C( J, J ) )
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( BETA.EQ.ZERO ) THEN
+               DO 60 J = 1, N
+                  DO 50 I = J, N
+                     C( I, J ) = ZERO
+   50             CONTINUE
+   60          CONTINUE
+            ELSE
+               DO 80 J = 1, N
+                  C( J, J ) = BETA*DBLE( C( J, J ) )
+                  DO 70 I = J + 1, N
+                     C( I, J ) = BETA*C( I, J )
+   70             CONTINUE
+   80          CONTINUE
+            END IF
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( TRANS, 'N' ) ) THEN
+*
+*        Form  C := alpha*A*conjg( A' ) + beta*C.
+*
+         IF( UPPER ) THEN
+            DO 130 J = 1, N
+               IF( BETA.EQ.ZERO ) THEN
+                  DO 90 I = 1, J
+                     C( I, J ) = ZERO
+   90             CONTINUE
+               ELSE IF( BETA.NE.ONE ) THEN
+                  DO 100 I = 1, J - 1
+                     C( I, J ) = BETA*C( I, J )
+  100             CONTINUE
+                  C( J, J ) = BETA*DBLE( C( J, J ) )
+               ELSE
+                  C( J, J ) = DBLE( C( J, J ) )
+               END IF
+               DO 120 L = 1, K
+                  IF( A( J, L ).NE.DCMPLX( ZERO ) ) THEN
+                     TEMP = ALPHA*DCONJG( A( J, L ) )
+                     DO 110 I = 1, J - 1
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  110                CONTINUE
+                     C( J, J ) = DBLE( C( J, J ) ) +
+     $                           DBLE( TEMP*A( I, L ) )
+                  END IF
+  120          CONTINUE
+  130       CONTINUE
+         ELSE
+            DO 180 J = 1, N
+               IF( BETA.EQ.ZERO ) THEN
+                  DO 140 I = J, N
+                     C( I, J ) = ZERO
+  140             CONTINUE
+               ELSE IF( BETA.NE.ONE ) THEN
+                  C( J, J ) = BETA*DBLE( C( J, J ) )
+                  DO 150 I = J + 1, N
+                     C( I, J ) = BETA*C( I, J )
+  150             CONTINUE
+               ELSE
+                  C( J, J ) = DBLE( C( J, J ) )
+               END IF
+               DO 170 L = 1, K
+                  IF( A( J, L ).NE.DCMPLX( ZERO ) ) THEN
+                     TEMP = ALPHA*DCONJG( A( J, L ) )
+                     C( J, J ) = DBLE( C( J, J ) ) +
+     $                           DBLE( TEMP*A( J, L ) )
+                     DO 160 I = J + 1, N
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  160                CONTINUE
+                  END IF
+  170          CONTINUE
+  180       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*conjg( A' )*A + beta*C.
+*
+         IF( UPPER ) THEN
+            DO 220 J = 1, N
+               DO 200 I = 1, J - 1
+                  TEMP = ZERO
+                  DO 190 L = 1, K
+                     TEMP = TEMP + DCONJG( A( L, I ) )*A( L, J )
+  190             CONTINUE
+                  IF( BETA.EQ.ZERO ) THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  200          CONTINUE
+               RTEMP = ZERO
+               DO 210 L = 1, K
+                  RTEMP = RTEMP + DCONJG( A( L, J ) )*A( L, J )
+  210          CONTINUE
+               IF( BETA.EQ.ZERO ) THEN
+                  C( J, J ) = ALPHA*RTEMP
+               ELSE
+                  C( J, J ) = ALPHA*RTEMP + BETA*DBLE( C( J, J ) )
+               END IF
+  220       CONTINUE
+         ELSE
+            DO 260 J = 1, N
+               RTEMP = ZERO
+               DO 230 L = 1, K
+                  RTEMP = RTEMP + DCONJG( A( L, J ) )*A( L, J )
+  230          CONTINUE
+               IF( BETA.EQ.ZERO ) THEN
+                  C( J, J ) = ALPHA*RTEMP
+               ELSE
+                  C( J, J ) = ALPHA*RTEMP + BETA*DBLE( C( J, J ) )
+               END IF
+               DO 250 I = J + 1, N
+                  TEMP = ZERO
+                  DO 240 L = 1, K
+                     TEMP = TEMP + DCONJG( A( L, I ) )*A( L, J )
+  240             CONTINUE
+                  IF( BETA.EQ.ZERO ) THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  250          CONTINUE
+  260       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZHERK .
+*
+      END
+      SUBROUTINE ZHPMV ( UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY )
+*     .. Scalar Arguments ..
+      COMPLEX*16         ALPHA, BETA
+      INTEGER            INCX, INCY, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      COMPLEX*16         AP( * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZHPMV  performs the matrix-vector operation
+*
+*     y := alpha*A*x + beta*y,
+*
+*  where alpha and beta are scalars, x and y are n element vectors and
+*  A is an n by n hermitian matrix, supplied in packed form.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the matrix A is supplied in the packed
+*           array AP as follows:
+*
+*              UPLO = 'U' or 'u'   The upper triangular part of A is
+*                                  supplied in AP.
+*
+*              UPLO = 'L' or 'l'   The lower triangular part of A is
+*                                  supplied in AP.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX*16      .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  AP     - COMPLEX*16       array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular part of the hermitian matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
+*           and a( 2, 2 ) respectively, and so on.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular part of the hermitian matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
+*           and a( 3, 1 ) respectively, and so on.
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set and are assumed to be zero.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX*16      .
+*           On entry, BETA specifies the scalar beta. When BETA is
+*           supplied as zero then Y need not be set on input.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y. On exit, Y is overwritten by the updated
+*           vector y.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX*16         ONE
+      PARAMETER        ( ONE  = ( 1.0D+0, 0.0D+0 ) )
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     .. Local Scalars ..
+      COMPLEX*16         TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, DBLE
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 6
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZHPMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     Set up the start points in  X  and  Y.
+*
+      IF( INCX.GT.0 )THEN
+         KX = 1
+      ELSE
+         KX = 1 - ( N - 1 )*INCX
+      END IF
+      IF( INCY.GT.0 )THEN
+         KY = 1
+      ELSE
+         KY = 1 - ( N - 1 )*INCY
+      END IF
+*
+*     Start the operations. In this version the elements of the array AP
+*     are accessed sequentially with one pass through AP.
+*
+*     First form  y := beta*y.
+*
+      IF( BETA.NE.ONE )THEN
+         IF( INCY.EQ.1 )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 10, I = 1, N
+                  Y( I ) = ZERO
+   10          CONTINUE
+            ELSE
+               DO 20, I = 1, N
+                  Y( I ) = BETA*Y( I )
+   20          CONTINUE
+            END IF
+         ELSE
+            IY = KY
+            IF( BETA.EQ.ZERO )THEN
+               DO 30, I = 1, N
+                  Y( IY ) = ZERO
+                  IY      = IY   + INCY
+   30          CONTINUE
+            ELSE
+               DO 40, I = 1, N
+                  Y( IY ) = BETA*Y( IY )
+                  IY      = IY           + INCY
+   40          CONTINUE
+            END IF
+         END IF
+      END IF
+      IF( ALPHA.EQ.ZERO )
+     $   RETURN
+      KK = 1
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  y  when AP contains the upper triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               TEMP1 = ALPHA*X( J )
+               TEMP2 = ZERO
+               K     = KK
+               DO 50, I = 1, J - 1
+                  Y( I ) = Y( I ) + TEMP1*AP( K )
+                  TEMP2  = TEMP2  + DCONJG( AP( K ) )*X( I )
+                  K      = K      + 1
+   50          CONTINUE
+               Y( J ) = Y( J ) + TEMP1*DBLE( AP( KK + J - 1 ) )
+     $                         + ALPHA*TEMP2
+               KK     = KK     + J
+   60       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 80, J = 1, N
+               TEMP1 = ALPHA*X( JX )
+               TEMP2 = ZERO
+               IX    = KX
+               IY    = KY
+               DO 70, K = KK, KK + J - 2
+                  Y( IY ) = Y( IY ) + TEMP1*AP( K )
+                  TEMP2   = TEMP2   + DCONJG( AP( K ) )*X( IX )
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+   70          CONTINUE
+               Y( JY ) = Y( JY ) + TEMP1*DBLE( AP( KK + J - 1 ) )
+     $                           + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+               KK      = KK      + J
+   80       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  y  when AP contains the lower triangle.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 100, J = 1, N
+               TEMP1  = ALPHA*X( J )
+               TEMP2  = ZERO
+               Y( J ) = Y( J ) + TEMP1*DBLE( AP( KK ) )
+               K      = KK     + 1
+               DO 90, I = J + 1, N
+                  Y( I ) = Y( I ) + TEMP1*AP( K )
+                  TEMP2  = TEMP2  + DCONJG( AP( K ) )*X( I )
+                  K      = K      + 1
+   90          CONTINUE
+               Y( J ) = Y( J ) + ALPHA*TEMP2
+               KK     = KK     + ( N - J + 1 )
+  100       CONTINUE
+         ELSE
+            JX = KX
+            JY = KY
+            DO 120, J = 1, N
+               TEMP1   = ALPHA*X( JX )
+               TEMP2   = ZERO
+               Y( JY ) = Y( JY ) + TEMP1*DBLE( AP( KK ) )
+               IX      = JX
+               IY      = JY
+               DO 110, K = KK + 1, KK + N - J
+                  IX      = IX      + INCX
+                  IY      = IY      + INCY
+                  Y( IY ) = Y( IY ) + TEMP1*AP( K )
+                  TEMP2   = TEMP2   + DCONJG( AP( K ) )*X( IX )
+  110          CONTINUE
+               Y( JY ) = Y( JY ) + ALPHA*TEMP2
+               JX      = JX      + INCX
+               JY      = JY      + INCY
+               KK      = KK      + ( N - J + 1 )
+  120       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZHPMV .
+*
+      END
+      SUBROUTINE ZHPR2 ( UPLO, N, ALPHA, X, INCX, Y, INCY, AP )
+*     .. Scalar Arguments ..
+      COMPLEX*16         ALPHA
+      INTEGER            INCX, INCY, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      COMPLEX*16         AP( * ), X( * ), Y( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZHPR2  performs the hermitian rank 2 operation
+*
+*     A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A,
+*
+*  where alpha is a scalar, x and y are n element vectors and A is an
+*  n by n hermitian matrix, supplied in packed form.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the matrix A is supplied in the packed
+*           array AP as follows:
+*
+*              UPLO = 'U' or 'u'   The upper triangular part of A is
+*                                  supplied in AP.
+*
+*              UPLO = 'L' or 'l'   The lower triangular part of A is
+*                                  supplied in AP.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX*16      .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  Y      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCY ) ).
+*           Before entry, the incremented array Y must contain the n
+*           element vector y.
+*           Unchanged on exit.
+*
+*  INCY   - INTEGER.
+*           On entry, INCY specifies the increment for the elements of
+*           Y. INCY must not be zero.
+*           Unchanged on exit.
+*
+*  AP     - COMPLEX*16       array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with  UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular part of the hermitian matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
+*           and a( 2, 2 ) respectively, and so on. On exit, the array
+*           AP is overwritten by the upper triangular part of the
+*           updated matrix.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular part of the hermitian matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
+*           and a( 3, 1 ) respectively, and so on. On exit, the array
+*           AP is overwritten by the lower triangular part of the
+*           updated matrix.
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set, they are assumed to be zero, and on exit they
+*           are set to zero.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     .. Local Scalars ..
+      COMPLEX*16         TEMP1, TEMP2
+      INTEGER            I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, DBLE
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      ELSE IF( INCY.EQ.0 )THEN
+         INFO = 7
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZHPR2 ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) )
+     $   RETURN
+*
+*     Set up the start points in X and Y if the increments are not both
+*     unity.
+*
+      IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN
+         IF( INCX.GT.0 )THEN
+            KX = 1
+         ELSE
+            KX = 1 - ( N - 1 )*INCX
+         END IF
+         IF( INCY.GT.0 )THEN
+            KY = 1
+         ELSE
+            KY = 1 - ( N - 1 )*INCY
+         END IF
+         JX = KX
+         JY = KY
+      END IF
+*
+*     Start the operations. In this version the elements of the array AP
+*     are accessed sequentially with one pass through AP.
+*
+      KK = 1
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  A  when upper triangle is stored in AP.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 20, J = 1, N
+               IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*DCONJG( Y( J ) )
+                  TEMP2 = DCONJG( ALPHA*X( J ) )
+                  K     = KK
+                  DO 10, I = 1, J - 1
+                     AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2
+                     K       = K       + 1
+   10             CONTINUE
+                  AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) +
+     $                               DBLE( X( J )*TEMP1 + Y( J )*TEMP2 )
+               ELSE
+                  AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) )
+               END IF
+               KK = KK + J
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN
+                  TEMP1 = ALPHA*DCONJG( Y( JY ) )
+                  TEMP2 = DCONJG( ALPHA*X( JX ) )
+                  IX    = KX
+                  IY    = KY
+                  DO 30, K = KK, KK + J - 2
+                     AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2
+                     IX      = IX      + INCX
+                     IY      = IY      + INCY
+   30             CONTINUE
+                  AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) +
+     $                               DBLE( X( JX )*TEMP1 +
+     $                                     Y( JY )*TEMP2 )
+               ELSE
+                  AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) )
+               END IF
+               JX = JX + INCX
+               JY = JY + INCY
+               KK = KK + J
+   40       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  A  when lower triangle is stored in AP.
+*
+         IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN
+            DO 60, J = 1, N
+               IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN
+                  TEMP1   = ALPHA*DCONJG( Y( J ) )
+                  TEMP2   = DCONJG( ALPHA*X( J ) )
+                  AP( KK ) = DBLE( AP( KK ) ) +
+     $                       DBLE( X( J )*TEMP1 + Y( J )*TEMP2 )
+                  K        = KK               + 1
+                  DO 50, I = J + 1, N
+                     AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2
+                     K       = K       + 1
+   50             CONTINUE
+               ELSE
+                  AP( KK ) = DBLE( AP( KK ) )
+               END IF
+               KK = KK + N - J + 1
+   60       CONTINUE
+         ELSE
+            DO 80, J = 1, N
+               IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN
+                  TEMP1    = ALPHA*DCONJG( Y( JY ) )
+                  TEMP2    = DCONJG( ALPHA*X( JX ) )
+                  AP( KK ) = DBLE( AP( KK ) ) +
+     $                       DBLE( X( JX )*TEMP1 + Y( JY )*TEMP2 )
+                  IX       = JX
+                  IY       = JY
+                  DO 70, K = KK + 1, KK + N - J
+                     IX      = IX      + INCX
+                     IY      = IY      + INCY
+                     AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2
+   70             CONTINUE
+               ELSE
+                  AP( KK ) = DBLE( AP( KK ) )
+               END IF
+               JX = JX + INCX
+               JY = JY + INCY
+               KK = KK + N - J + 1
+   80       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZHPR2 .
+*
+      END
+      SUBROUTINE ZHPR  ( UPLO, N, ALPHA, X, INCX, AP )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   ALPHA
+      INTEGER            INCX, N
+      CHARACTER*1        UPLO
+*     .. Array Arguments ..
+      COMPLEX*16         AP( * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZHPR    performs the hermitian rank 1 operation
+*
+*     A := alpha*x*conjg( x' ) + A,
+*
+*  where alpha is a real scalar, x is an n element vector and A is an
+*  n by n hermitian matrix, supplied in packed form.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the upper or lower
+*           triangular part of the matrix A is supplied in the packed
+*           array AP as follows:
+*
+*              UPLO = 'U' or 'u'   The upper triangular part of A is
+*                                  supplied in AP.
+*
+*              UPLO = 'L' or 'l'   The lower triangular part of A is
+*                                  supplied in AP.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - DOUBLE PRECISION.
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x.
+*           Unchanged on exit.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*  AP     - COMPLEX*16       array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with  UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular part of the hermitian matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
+*           and a( 2, 2 ) respectively, and so on. On exit, the array
+*           AP is overwritten by the upper triangular part of the
+*           updated matrix.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular part of the hermitian matrix
+*           packed sequentially, column by column, so that AP( 1 )
+*           contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
+*           and a( 3, 1 ) respectively, and so on. On exit, the array
+*           AP is overwritten by the lower triangular part of the
+*           updated matrix.
+*           Note that the imaginary parts of the diagonal elements need
+*           not be set, they are assumed to be zero, and on exit they
+*           are set to zero.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     .. Local Scalars ..
+      COMPLEX*16         TEMP
+      INTEGER            I, INFO, IX, J, JX, K, KK, KX
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, DBLE
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO, 'U' ).AND.
+     $         .NOT.LSAME( UPLO, 'L' )      )THEN
+         INFO = 1
+      ELSE IF( N.LT.0 )THEN
+         INFO = 2
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 5
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZHPR  ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.( ALPHA.EQ.DBLE( ZERO ) ) )
+     $   RETURN
+*
+*     Set the start point in X if the increment is not unity.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of the array AP
+*     are accessed sequentially with one pass through AP.
+*
+      KK = 1
+      IF( LSAME( UPLO, 'U' ) )THEN
+*
+*        Form  A  when upper triangle is stored in AP.
+*
+         IF( INCX.EQ.1 )THEN
+            DO 20, J = 1, N
+               IF( X( J ).NE.ZERO )THEN
+                  TEMP = ALPHA*DCONJG( X( J ) )
+                  K    = KK
+                  DO 10, I = 1, J - 1
+                     AP( K ) = AP( K ) + X( I )*TEMP
+                     K       = K       + 1
+   10             CONTINUE
+                  AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) )
+     $                               + DBLE( X( J )*TEMP )
+               ELSE
+                  AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) )
+               END IF
+               KK = KK + J
+   20       CONTINUE
+         ELSE
+            JX = KX
+            DO 40, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP = ALPHA*DCONJG( X( JX ) )
+                  IX   = KX
+                  DO 30, K = KK, KK + J - 2
+                     AP( K ) = AP( K ) + X( IX )*TEMP
+                     IX      = IX      + INCX
+   30             CONTINUE
+                  AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) )
+     $                               + DBLE( X( JX )*TEMP )
+               ELSE
+                  AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) )
+               END IF
+               JX = JX + INCX
+               KK = KK + J
+   40       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  A  when lower triangle is stored in AP.
+*
+         IF( INCX.EQ.1 )THEN
+            DO 60, J = 1, N
+               IF( X( J ).NE.ZERO )THEN
+                  TEMP     = ALPHA*DCONJG( X( J ) )
+                  AP( KK ) = DBLE( AP( KK ) ) + DBLE( TEMP*X( J ) )
+                  K        = KK               + 1
+                  DO 50, I = J + 1, N
+                     AP( K ) = AP( K ) + X( I )*TEMP
+                     K       = K       + 1
+   50             CONTINUE
+               ELSE
+                  AP( KK ) = DBLE( AP( KK ) )
+               END IF
+               KK = KK + N - J + 1
+   60       CONTINUE
+         ELSE
+            JX = KX
+            DO 80, J = 1, N
+               IF( X( JX ).NE.ZERO )THEN
+                  TEMP    = ALPHA*DCONJG( X( JX ) )
+                  AP( KK ) = DBLE( AP( KK ) ) + DBLE( TEMP*X( JX ) )
+                  IX      = JX
+                  DO 70, K = KK + 1, KK + N - J
+                     IX      = IX      + INCX
+                     AP( K ) = AP( K ) + X( IX )*TEMP
+   70             CONTINUE
+               ELSE
+                  AP( KK ) = DBLE( AP( KK ) )
+               END IF
+               JX = JX + INCX
+               KK = KK + N - J + 1
+   80       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZHPR  .
+*
+      END
+      subroutine zrotg(ca,cb,c,s)
+      double complex ca,cb,s
+      double precision c
+      double precision norm,scale
+      double complex alpha
+      if (cdabs(ca) .ne. 0.0d0) go to 10
+         c = 0.0d0
+         s = (1.0d0,0.0d0)
+         ca = cb
+         go to 20
+   10 continue
+         scale = cdabs(ca) + cdabs(cb)
+         norm = scale*dsqrt((cdabs(ca/dcmplx(scale,0.0d0)))**2 +
+     *                      (cdabs(cb/dcmplx(scale,0.0d0)))**2)
+         alpha = ca /cdabs(ca)
+         c = cdabs(ca) / norm
+         s = alpha * dconjg(cb) / norm
+         ca = alpha * norm
+   20 continue
+      return
+      end
+      subroutine  zscal(n,za,zx,incx)
+c
+c     scales a vector by a constant.
+c     jack dongarra, 3/11/78.
+c     modified 3/93 to return if incx .le. 0.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      double complex za,zx(*)
+      integer i,incx,ix,n
+c
+      if( n.le.0 .or. incx.le.0 )return
+      if(incx.eq.1)go to 20
+c
+c        code for increment not equal to 1
+c
+      ix = 1
+      do 10 i = 1,n
+        zx(ix) = za*zx(ix)
+        ix = ix + incx
+   10 continue
+      return
+c
+c        code for increment equal to 1
+c
+   20 do 30 i = 1,n
+        zx(i) = za*zx(i)
+   30 continue
+      return
+      end
+      subroutine  zswap (n,zx,incx,zy,incy)
+c
+c     interchanges two vectors.
+c     jack dongarra, 3/11/78.
+c     modified 12/3/93, array(1) declarations changed to array(*)
+c
+      double complex zx(*),zy(*),ztemp
+      integer i,incx,incy,ix,iy,n
+c
+      if(n.le.0)return
+      if(incx.eq.1.and.incy.eq.1)go to 20
+c
+c       code for unequal increments or equal increments not equal
+c         to 1
+c
+      ix = 1
+      iy = 1
+      if(incx.lt.0)ix = (-n+1)*incx + 1
+      if(incy.lt.0)iy = (-n+1)*incy + 1
+      do 10 i = 1,n
+        ztemp = zx(ix)
+        zx(ix) = zy(iy)
+        zy(iy) = ztemp
+        ix = ix + incx
+        iy = iy + incy
+   10 continue
+      return
+c
+c       code for both increments equal to 1
+   20 do 30 i = 1,n
+        ztemp = zx(i)
+        zx(i) = zy(i)
+        zy(i) = ztemp
+   30 continue
+      return
+      end
+      SUBROUTINE ZSYMM ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        SIDE, UPLO
+      INTEGER            M, N, LDA, LDB, LDC
+      COMPLEX*16         ALPHA, BETA
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), B( LDB, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZSYMM  performs one of the matrix-matrix operations
+*
+*     C := alpha*A*B + beta*C,
+*
+*  or
+*
+*     C := alpha*B*A + beta*C,
+*
+*  where  alpha and beta are scalars, A is a symmetric matrix and  B and
+*  C are m by n matrices.
+*
+*  Parameters
+*  ==========
+*
+*  SIDE   - CHARACTER*1.
+*           On entry,  SIDE  specifies whether  the  symmetric matrix  A
+*           appears on the  left or right  in the  operation as follows:
+*
+*              SIDE = 'L' or 'l'   C := alpha*A*B + beta*C,
+*
+*              SIDE = 'R' or 'r'   C := alpha*B*A + beta*C,
+*
+*           Unchanged on exit.
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of  the  symmetric  matrix   A  is  to  be
+*           referenced as follows:
+*
+*              UPLO = 'U' or 'u'   Only the upper triangular part of the
+*                                  symmetric matrix is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the lower triangular part of the
+*                                  symmetric matrix is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry,  M  specifies the number of rows of the matrix  C.
+*           M  must be at least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of the matrix C.
+*           N  must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX*16      .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, ka ), where ka is
+*           m  when  SIDE = 'L' or 'l'  and is n  otherwise.
+*           Before entry  with  SIDE = 'L' or 'l',  the  m by m  part of
+*           the array  A  must contain the  symmetric matrix,  such that
+*           when  UPLO = 'U' or 'u', the leading m by m upper triangular
+*           part of the array  A  must contain the upper triangular part
+*           of the  symmetric matrix and the  strictly  lower triangular
+*           part of  A  is not referenced,  and when  UPLO = 'L' or 'l',
+*           the leading  m by m  lower triangular part  of the  array  A
+*           must  contain  the  lower triangular part  of the  symmetric
+*           matrix and the  strictly upper triangular part of  A  is not
+*           referenced.
+*           Before entry  with  SIDE = 'R' or 'r',  the  n by n  part of
+*           the array  A  must contain the  symmetric matrix,  such that
+*           when  UPLO = 'U' or 'u', the leading n by n upper triangular
+*           part of the array  A  must contain the upper triangular part
+*           of the  symmetric matrix and the  strictly  lower triangular
+*           part of  A  is not referenced,  and when  UPLO = 'L' or 'l',
+*           the leading  n by n  lower triangular part  of the  array  A
+*           must  contain  the  lower triangular part  of the  symmetric
+*           matrix and the  strictly upper triangular part of  A  is not
+*           referenced.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the  calling (sub) program. When  SIDE = 'L' or 'l'  then
+*           LDA must be at least  max( 1, m ), otherwise  LDA must be at
+*           least max( 1, n ).
+*           Unchanged on exit.
+*
+*  B      - COMPLEX*16       array of DIMENSION ( LDB, n ).
+*           Before entry, the leading  m by n part of the array  B  must
+*           contain the matrix B.
+*           Unchanged on exit.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   LDB  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX*16      .
+*           On entry,  BETA  specifies the scalar  beta.  When  BETA  is
+*           supplied as zero then C need not be set on input.
+*           Unchanged on exit.
+*
+*  C      - COMPLEX*16       array of DIMENSION ( LDC, n ).
+*           Before entry, the leading  m by n  part of the array  C must
+*           contain the matrix  C,  except when  beta  is zero, in which
+*           case C need not be set on entry.
+*           On exit, the array  C  is overwritten by the  m by n updated
+*           matrix.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, K, NROWA
+      COMPLEX*16         TEMP1, TEMP2
+*     .. Parameters ..
+      COMPLEX*16         ONE
+      PARAMETER        ( ONE  = ( 1.0D+0, 0.0D+0 ) )
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Set NROWA as the number of rows of A.
+*
+      IF( LSAME( SIDE, 'L' ) )THEN
+         NROWA = M
+      ELSE
+         NROWA = N
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF(      ( .NOT.LSAME( SIDE, 'L' ) ).AND.
+     $         ( .NOT.LSAME( SIDE, 'R' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.UPPER              ).AND.
+     $         ( .NOT.LSAME( UPLO, 'L' ) )      )THEN
+         INFO = 2
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 7
+      ELSE IF( LDB.LT.MAX( 1, M     ) )THEN
+         INFO = 9
+      ELSE IF( LDC.LT.MAX( 1, M     ) )THEN
+         INFO = 12
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZSYMM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.
+     $    ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( BETA.EQ.ZERO )THEN
+            DO 20, J = 1, N
+               DO 10, I = 1, M
+                  C( I, J ) = ZERO
+   10          CONTINUE
+   20       CONTINUE
+         ELSE
+            DO 40, J = 1, N
+               DO 30, I = 1, M
+                  C( I, J ) = BETA*C( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( SIDE, 'L' ) )THEN
+*
+*        Form  C := alpha*A*B + beta*C.
+*
+         IF( UPPER )THEN
+            DO 70, J = 1, N
+               DO 60, I = 1, M
+                  TEMP1 = ALPHA*B( I, J )
+                  TEMP2 = ZERO
+                  DO 50, K = 1, I - 1
+                     C( K, J ) = C( K, J ) + TEMP1    *A( K, I )
+                     TEMP2     = TEMP2     + B( K, J )*A( K, I )
+   50             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J ) +
+     $                           TEMP1*A( I, I ) + ALPHA*TEMP2
+                  END IF
+   60          CONTINUE
+   70       CONTINUE
+         ELSE
+            DO 100, J = 1, N
+               DO 90, I = M, 1, -1
+                  TEMP1 = ALPHA*B( I, J )
+                  TEMP2 = ZERO
+                  DO 80, K = I + 1, M
+                     C( K, J ) = C( K, J ) + TEMP1    *A( K, I )
+                     TEMP2     = TEMP2     + B( K, J )*A( K, I )
+   80             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J ) +
+     $                           TEMP1*A( I, I ) + ALPHA*TEMP2
+                  END IF
+   90          CONTINUE
+  100       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*B*A + beta*C.
+*
+         DO 170, J = 1, N
+            TEMP1 = ALPHA*A( J, J )
+            IF( BETA.EQ.ZERO )THEN
+               DO 110, I = 1, M
+                  C( I, J ) = TEMP1*B( I, J )
+  110          CONTINUE
+            ELSE
+               DO 120, I = 1, M
+                  C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J )
+  120          CONTINUE
+            END IF
+            DO 140, K = 1, J - 1
+               IF( UPPER )THEN
+                  TEMP1 = ALPHA*A( K, J )
+               ELSE
+                  TEMP1 = ALPHA*A( J, K )
+               END IF
+               DO 130, I = 1, M
+                  C( I, J ) = C( I, J ) + TEMP1*B( I, K )
+  130          CONTINUE
+  140       CONTINUE
+            DO 160, K = J + 1, N
+               IF( UPPER )THEN
+                  TEMP1 = ALPHA*A( J, K )
+               ELSE
+                  TEMP1 = ALPHA*A( K, J )
+               END IF
+               DO 150, I = 1, M
+                  C( I, J ) = C( I, J ) + TEMP1*B( I, K )
+  150          CONTINUE
+  160       CONTINUE
+  170    CONTINUE
+      END IF
+*
+      RETURN
+*
+*     End of ZSYMM .
+*
+      END
+      SUBROUTINE ZSYR2K( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        UPLO, TRANS
+      INTEGER            N, K, LDA, LDB, LDC
+      COMPLEX*16         ALPHA, BETA
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), B( LDB, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZSYR2K  performs one of the symmetric rank 2k operations
+*
+*     C := alpha*A*B' + alpha*B*A' + beta*C,
+*
+*  or
+*
+*     C := alpha*A'*B + alpha*B'*A + beta*C,
+*
+*  where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
+*  and  A and B  are  n by k  matrices  in the  first  case  and  k by n
+*  matrices in the second case.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of the  array  C  is to be  referenced  as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the  upper triangular part of  C
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the  lower triangular part of  C
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry,  TRANS  specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'    C := alpha*A*B' + alpha*B*A' +
+*                                         beta*C.
+*
+*              TRANS = 'T' or 't'    C := alpha*A'*B + alpha*B'*A +
+*                                         beta*C.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry,  N specifies the order of the matrix C.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with  TRANS = 'N' or 'n',  K  specifies  the number
+*           of  columns  of the  matrices  A and B,  and on  entry  with
+*           TRANS = 'T' or 't',  K  specifies  the number of rows of the
+*           matrices  A and B.  K must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX*16      .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, ka ), where ka is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  A  must contain the matrix  A,  otherwise
+*           the leading  k by n  part of the array  A  must contain  the
+*           matrix A.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDA must be at least  max( 1, n ), otherwise  LDA must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  B      - COMPLEX*16       array of DIMENSION ( LDB, kb ), where kb is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  B  must contain the matrix  B,  otherwise
+*           the leading  k by n  part of the array  B  must contain  the
+*           matrix B.
+*           Unchanged on exit.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDB must be at least  max( 1, n ), otherwise  LDB must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX*16      .
+*           On entry, BETA specifies the scalar beta.
+*           Unchanged on exit.
+*
+*  C      - COMPLEX*16       array of DIMENSION ( LDC, n ).
+*           Before entry  with  UPLO = 'U' or 'u',  the leading  n by n
+*           upper triangular part of the array C must contain the upper
+*           triangular part  of the  symmetric matrix  and the strictly
+*           lower triangular part of C is not referenced.  On exit, the
+*           upper triangular part of the array  C is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry  with  UPLO = 'L' or 'l',  the leading  n by n
+*           lower triangular part of the array C must contain the lower
+*           triangular part  of the  symmetric matrix  and the strictly
+*           upper triangular part of C is not referenced.  On exit, the
+*           lower triangular part of the array  C is overwritten by the
+*           lower triangular part of the updated matrix.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, L, NROWA
+      COMPLEX*16         TEMP1, TEMP2
+*     .. Parameters ..
+      COMPLEX*16         ONE
+      PARAMETER        ( ONE  = ( 1.0D+0, 0.0D+0 ) )
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         NROWA = N
+      ELSE
+         NROWA = K
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+      INFO = 0
+      IF(      ( .NOT.UPPER               ).AND.
+     $         ( .NOT.LSAME( UPLO , 'L' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANS, 'T' ) )      )THEN
+         INFO = 2
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( K  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 7
+      ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN
+         INFO = 9
+      ELSE IF( LDC.LT.MAX( 1, N     ) )THEN
+         INFO = 12
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZSYR2K', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.
+     $    ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( UPPER )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 20, J = 1, N
+                  DO 10, I = 1, J
+                     C( I, J ) = ZERO
+   10             CONTINUE
+   20          CONTINUE
+            ELSE
+               DO 40, J = 1, N
+                  DO 30, I = 1, J
+                     C( I, J ) = BETA*C( I, J )
+   30             CONTINUE
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( BETA.EQ.ZERO )THEN
+               DO 60, J = 1, N
+                  DO 50, I = J, N
+                     C( I, J ) = ZERO
+   50             CONTINUE
+   60          CONTINUE
+            ELSE
+               DO 80, J = 1, N
+                  DO 70, I = J, N
+                     C( I, J ) = BETA*C( I, J )
+   70             CONTINUE
+   80          CONTINUE
+            END IF
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  C := alpha*A*B' + alpha*B*A' + C.
+*
+         IF( UPPER )THEN
+            DO 130, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 90, I = 1, J
+                     C( I, J ) = ZERO
+   90             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 100, I = 1, J
+                     C( I, J ) = BETA*C( I, J )
+  100             CONTINUE
+               END IF
+               DO 120, L = 1, K
+                  IF( ( A( J, L ).NE.ZERO ).OR.
+     $                ( B( J, L ).NE.ZERO )     )THEN
+                     TEMP1 = ALPHA*B( J, L )
+                     TEMP2 = ALPHA*A( J, L )
+                     DO 110, I = 1, J
+                        C( I, J ) = C( I, J ) + A( I, L )*TEMP1 +
+     $                                          B( I, L )*TEMP2
+  110                CONTINUE
+                  END IF
+  120          CONTINUE
+  130       CONTINUE
+         ELSE
+            DO 180, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 140, I = J, N
+                     C( I, J ) = ZERO
+  140             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 150, I = J, N
+                     C( I, J ) = BETA*C( I, J )
+  150             CONTINUE
+               END IF
+               DO 170, L = 1, K
+                  IF( ( A( J, L ).NE.ZERO ).OR.
+     $                ( B( J, L ).NE.ZERO )     )THEN
+                     TEMP1 = ALPHA*B( J, L )
+                     TEMP2 = ALPHA*A( J, L )
+                     DO 160, I = J, N
+                        C( I, J ) = C( I, J ) + A( I, L )*TEMP1 +
+     $                                          B( I, L )*TEMP2
+  160                CONTINUE
+                  END IF
+  170          CONTINUE
+  180       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*A'*B + alpha*B'*A + C.
+*
+         IF( UPPER )THEN
+            DO 210, J = 1, N
+               DO 200, I = 1, J
+                  TEMP1 = ZERO
+                  TEMP2 = ZERO
+                  DO 190, L = 1, K
+                     TEMP1 = TEMP1 + A( L, I )*B( L, J )
+                     TEMP2 = TEMP2 + B( L, I )*A( L, J )
+  190             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J ) +
+     $                           ALPHA*TEMP1 + ALPHA*TEMP2
+                  END IF
+  200          CONTINUE
+  210       CONTINUE
+         ELSE
+            DO 240, J = 1, N
+               DO 230, I = J, N
+                  TEMP1 = ZERO
+                  TEMP2 = ZERO
+                  DO 220, L = 1, K
+                     TEMP1 = TEMP1 + A( L, I )*B( L, J )
+                     TEMP2 = TEMP2 + B( L, I )*A( L, J )
+  220             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2
+                  ELSE
+                     C( I, J ) = BETA *C( I, J ) +
+     $                           ALPHA*TEMP1 + ALPHA*TEMP2
+                  END IF
+  230          CONTINUE
+  240       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZSYR2K.
+*
+      END
+      SUBROUTINE ZSYRK ( UPLO, TRANS, N, K, ALPHA, A, LDA,
+     $                   BETA, C, LDC )
+*     .. Scalar Arguments ..
+      CHARACTER*1        UPLO, TRANS
+      INTEGER            N, K, LDA, LDC
+      COMPLEX*16         ALPHA, BETA
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), C( LDC, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZSYRK  performs one of the symmetric rank k operations
+*
+*     C := alpha*A*A' + beta*C,
+*
+*  or
+*
+*     C := alpha*A'*A + beta*C,
+*
+*  where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
+*  and  A  is an  n by k  matrix in the first case and a  k by n  matrix
+*  in the second case.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On  entry,   UPLO  specifies  whether  the  upper  or  lower
+*           triangular  part  of the  array  C  is to be  referenced  as
+*           follows:
+*
+*              UPLO = 'U' or 'u'   Only the  upper triangular part of  C
+*                                  is to be referenced.
+*
+*              UPLO = 'L' or 'l'   Only the  lower triangular part of  C
+*                                  is to be referenced.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry,  TRANS  specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   C := alpha*A*A' + beta*C.
+*
+*              TRANS = 'T' or 't'   C := alpha*A'*A + beta*C.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry,  N specifies the order of the matrix C.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with  TRANS = 'N' or 'n',  K  specifies  the number
+*           of  columns   of  the   matrix   A,   and  on   entry   with
+*           TRANS = 'T' or 't',  K  specifies  the number of rows of the
+*           matrix A.  K must be at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX*16      .
+*           On entry, ALPHA specifies the scalar alpha.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, ka ), where ka is
+*           k  when  TRANS = 'N' or 'n',  and is  n  otherwise.
+*           Before entry with  TRANS = 'N' or 'n',  the  leading  n by k
+*           part of the array  A  must contain the matrix  A,  otherwise
+*           the leading  k by n  part of the array  A  must contain  the
+*           matrix A.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in  the  calling  (sub)  program.   When  TRANS = 'N' or 'n'
+*           then  LDA must be at least  max( 1, n ), otherwise  LDA must
+*           be at least  max( 1, k ).
+*           Unchanged on exit.
+*
+*  BETA   - COMPLEX*16      .
+*           On entry, BETA specifies the scalar beta.
+*           Unchanged on exit.
+*
+*  C      - COMPLEX*16       array of DIMENSION ( LDC, n ).
+*           Before entry  with  UPLO = 'U' or 'u',  the leading  n by n
+*           upper triangular part of the array C must contain the upper
+*           triangular part  of the  symmetric matrix  and the strictly
+*           lower triangular part of C is not referenced.  On exit, the
+*           upper triangular part of the array  C is overwritten by the
+*           upper triangular part of the updated matrix.
+*           Before entry  with  UPLO = 'L' or 'l',  the leading  n by n
+*           lower triangular part of the array C must contain the lower
+*           triangular part  of the  symmetric matrix  and the strictly
+*           upper triangular part of C is not referenced.  On exit, the
+*           lower triangular part of the array  C is overwritten by the
+*           lower triangular part of the updated matrix.
+*
+*  LDC    - INTEGER.
+*           On entry, LDC specifies the first dimension of C as declared
+*           in  the  calling  (sub)  program.   LDC  must  be  at  least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Local Scalars ..
+      LOGICAL            UPPER
+      INTEGER            I, INFO, J, L, NROWA
+      COMPLEX*16         TEMP
+*     .. Parameters ..
+      COMPLEX*16         ONE
+      PARAMETER        ( ONE  = ( 1.0D+0, 0.0D+0 ) )
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+         NROWA = N
+      ELSE
+         NROWA = K
+      END IF
+      UPPER = LSAME( UPLO, 'U' )
+*
+      INFO = 0
+      IF(      ( .NOT.UPPER               ).AND.
+     $         ( .NOT.LSAME( UPLO , 'L' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANS, 'T' ) )      )THEN
+         INFO = 2
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 3
+      ELSE IF( K  .LT.0               )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 7
+      ELSE IF( LDC.LT.MAX( 1, N     ) )THEN
+         INFO = 10
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZSYRK ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( ( N.EQ.0 ).OR.
+     $    ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         IF( UPPER )THEN
+            IF( BETA.EQ.ZERO )THEN
+               DO 20, J = 1, N
+                  DO 10, I = 1, J
+                     C( I, J ) = ZERO
+   10             CONTINUE
+   20          CONTINUE
+            ELSE
+               DO 40, J = 1, N
+                  DO 30, I = 1, J
+                     C( I, J ) = BETA*C( I, J )
+   30             CONTINUE
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( BETA.EQ.ZERO )THEN
+               DO 60, J = 1, N
+                  DO 50, I = J, N
+                     C( I, J ) = ZERO
+   50             CONTINUE
+   60          CONTINUE
+            ELSE
+               DO 80, J = 1, N
+                  DO 70, I = J, N
+                     C( I, J ) = BETA*C( I, J )
+   70             CONTINUE
+   80          CONTINUE
+            END IF
+         END IF
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  C := alpha*A*A' + beta*C.
+*
+         IF( UPPER )THEN
+            DO 130, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 90, I = 1, J
+                     C( I, J ) = ZERO
+   90             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 100, I = 1, J
+                     C( I, J ) = BETA*C( I, J )
+  100             CONTINUE
+               END IF
+               DO 120, L = 1, K
+                  IF( A( J, L ).NE.ZERO )THEN
+                     TEMP = ALPHA*A( J, L )
+                     DO 110, I = 1, J
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  110                CONTINUE
+                  END IF
+  120          CONTINUE
+  130       CONTINUE
+         ELSE
+            DO 180, J = 1, N
+               IF( BETA.EQ.ZERO )THEN
+                  DO 140, I = J, N
+                     C( I, J ) = ZERO
+  140             CONTINUE
+               ELSE IF( BETA.NE.ONE )THEN
+                  DO 150, I = J, N
+                     C( I, J ) = BETA*C( I, J )
+  150             CONTINUE
+               END IF
+               DO 170, L = 1, K
+                  IF( A( J, L ).NE.ZERO )THEN
+                     TEMP      = ALPHA*A( J, L )
+                     DO 160, I = J, N
+                        C( I, J ) = C( I, J ) + TEMP*A( I, L )
+  160                CONTINUE
+                  END IF
+  170          CONTINUE
+  180       CONTINUE
+         END IF
+      ELSE
+*
+*        Form  C := alpha*A'*A + beta*C.
+*
+         IF( UPPER )THEN
+            DO 210, J = 1, N
+               DO 200, I = 1, J
+                  TEMP = ZERO
+                  DO 190, L = 1, K
+                     TEMP = TEMP + A( L, I )*A( L, J )
+  190             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  200          CONTINUE
+  210       CONTINUE
+         ELSE
+            DO 240, J = 1, N
+               DO 230, I = J, N
+                  TEMP = ZERO
+                  DO 220, L = 1, K
+                     TEMP = TEMP + A( L, I )*A( L, J )
+  220             CONTINUE
+                  IF( BETA.EQ.ZERO )THEN
+                     C( I, J ) = ALPHA*TEMP
+                  ELSE
+                     C( I, J ) = ALPHA*TEMP + BETA*C( I, J )
+                  END IF
+  230          CONTINUE
+  240       CONTINUE
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZSYRK .
+*
+      END
+      SUBROUTINE ZTBMV ( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, K, LDA, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZTBMV  performs one of the matrix-vector operations
+*
+*     x := A*x,   or   x := A'*x,   or   x := conjg( A' )*x,
+*
+*  where x is an n element vector and  A is an n by n unit, or non-unit,
+*  upper or lower triangular band matrix, with ( k + 1 ) diagonals.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   x := A*x.
+*
+*              TRANS = 'T' or 't'   x := A'*x.
+*
+*              TRANS = 'C' or 'c'   x := conjg( A' )*x.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with UPLO = 'U' or 'u', K specifies the number of
+*           super-diagonals of the matrix A.
+*           On entry with UPLO = 'L' or 'l', K specifies the number of
+*           sub-diagonals of the matrix A.
+*           K must satisfy  0 .le. K.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ).
+*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
+*           by n part of the array A must contain the upper triangular
+*           band part of the matrix of coefficients, supplied column by
+*           column, with the leading diagonal of the matrix in row
+*           ( k + 1 ) of the array, the first super-diagonal starting at
+*           position 2 in row k, and so on. The top left k by k triangle
+*           of the array A is not referenced.
+*           The following program segment will transfer an upper
+*           triangular band matrix from conventional full matrix storage
+*           to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = K + 1 - J
+*                    DO 10, I = MAX( 1, J - K ), J
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
+*           by n part of the array A must contain the lower triangular
+*           band part of the matrix of coefficients, supplied column by
+*           column, with the leading diagonal of the matrix in row 1 of
+*           the array, the first sub-diagonal starting at position 1 in
+*           row 2, and so on. The bottom right k by k triangle of the
+*           array A is not referenced.
+*           The following program segment will transfer a lower
+*           triangular band matrix from conventional full matrix storage
+*           to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = 1 - J
+*                    DO 10, I = J, MIN( N, J + K )
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Note that when DIAG = 'U' or 'u' the elements of the array A
+*           corresponding to the diagonal elements of the matrix are not
+*           referenced, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           ( k + 1 ).
+*           Unchanged on exit.
+*
+*  X      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x. On exit, X is overwritten with the
+*           tranformed vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     .. Local Scalars ..
+      COMPLEX*16         TEMP
+      INTEGER            I, INFO, IX, J, JX, KPLUS1, KX, L
+      LOGICAL            NOCONJ, NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( K.LT.0 )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.( K + 1 ) )THEN
+         INFO = 7
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZTBMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOCONJ = LSAME( TRANS, 'T' )
+      NOUNIT = LSAME( DIAG , 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX   too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*         Form  x := A*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KPLUS1 = K + 1
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     L    = KPLUS1 - J
+                     DO 10, I = MAX( 1, J - K ), J - 1
+                        X( I ) = X( I ) + TEMP*A( L + I, J )
+   10                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*A( KPLUS1, J )
+                  END IF
+   20          CONTINUE
+            ELSE
+               JX = KX
+               DO 40, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     L    = KPLUS1  - J
+                     DO 30, I = MAX( 1, J - K ), J - 1
+                        X( IX ) = X( IX ) + TEMP*A( L + I, J )
+                        IX      = IX      + INCX
+   30                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*A( KPLUS1, J )
+                  END IF
+                  JX = JX + INCX
+                  IF( J.GT.K )
+     $               KX = KX + INCX
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     L    = 1      - J
+                     DO 50, I = MIN( N, J + K ), J + 1, -1
+                        X( I ) = X( I ) + TEMP*A( L + I, J )
+   50                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*A( 1, J )
+                  END IF
+   60          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 80, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     L    = 1       - J
+                     DO 70, I = MIN( N, J + K ), J + 1, -1
+                        X( IX ) = X( IX ) + TEMP*A( L + I, J )
+                        IX      = IX      - INCX
+   70                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*A( 1, J )
+                  END IF
+                  JX = JX - INCX
+                  IF( ( N - J ).GE.K )
+     $               KX = KX - INCX
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := A'*x  or  x := conjg( A' )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KPLUS1 = K + 1
+            IF( INCX.EQ.1 )THEN
+               DO 110, J = N, 1, -1
+                  TEMP = X( J )
+                  L    = KPLUS1 - J
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( KPLUS1, J )
+                     DO 90, I = J - 1, MAX( 1, J - K ), -1
+                        TEMP = TEMP + A( L + I, J )*X( I )
+   90                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*DCONJG( A( KPLUS1, J ) )
+                     DO 100, I = J - 1, MAX( 1, J - K ), -1
+                        TEMP = TEMP + DCONJG( A( L + I, J ) )*X( I )
+  100                CONTINUE
+                  END IF
+                  X( J ) = TEMP
+  110          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 140, J = N, 1, -1
+                  TEMP = X( JX )
+                  KX   = KX      - INCX
+                  IX   = KX
+                  L    = KPLUS1  - J
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( KPLUS1, J )
+                     DO 120, I = J - 1, MAX( 1, J - K ), -1
+                        TEMP = TEMP + A( L + I, J )*X( IX )
+                        IX   = IX   - INCX
+  120                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*DCONJG( A( KPLUS1, J ) )
+                     DO 130, I = J - 1, MAX( 1, J - K ), -1
+                        TEMP = TEMP + DCONJG( A( L + I, J ) )*X( IX )
+                        IX   = IX   - INCX
+  130                CONTINUE
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+  140          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 170, J = 1, N
+                  TEMP = X( J )
+                  L    = 1      - J
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( 1, J )
+                     DO 150, I = J + 1, MIN( N, J + K )
+                        TEMP = TEMP + A( L + I, J )*X( I )
+  150                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*DCONJG( A( 1, J ) )
+                     DO 160, I = J + 1, MIN( N, J + K )
+                        TEMP = TEMP + DCONJG( A( L + I, J ) )*X( I )
+  160                CONTINUE
+                  END IF
+                  X( J ) = TEMP
+  170          CONTINUE
+            ELSE
+               JX = KX
+               DO 200, J = 1, N
+                  TEMP = X( JX )
+                  KX   = KX      + INCX
+                  IX   = KX
+                  L    = 1       - J
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( 1, J )
+                     DO 180, I = J + 1, MIN( N, J + K )
+                        TEMP = TEMP + A( L + I, J )*X( IX )
+                        IX   = IX   + INCX
+  180                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*DCONJG( A( 1, J ) )
+                     DO 190, I = J + 1, MIN( N, J + K )
+                        TEMP = TEMP + DCONJG( A( L + I, J ) )*X( IX )
+                        IX   = IX   + INCX
+  190                CONTINUE
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+  200          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZTBMV .
+*
+      END
+      SUBROUTINE ZTBSV ( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, K, LDA, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZTBSV  solves one of the systems of equations
+*
+*     A*x = b,   or   A'*x = b,   or   conjg( A' )*x = b,
+*
+*  where b and x are n element vectors and A is an n by n unit, or
+*  non-unit, upper or lower triangular band matrix, with ( k + 1 )
+*  diagonals.
+*
+*  No test for singularity or near-singularity is included in this
+*  routine. Such tests must be performed before calling this routine.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the equations to be solved as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   A*x = b.
+*
+*              TRANS = 'T' or 't'   A'*x = b.
+*
+*              TRANS = 'C' or 'c'   conjg( A' )*x = b.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  K      - INTEGER.
+*           On entry with UPLO = 'U' or 'u', K specifies the number of
+*           super-diagonals of the matrix A.
+*           On entry with UPLO = 'L' or 'l', K specifies the number of
+*           sub-diagonals of the matrix A.
+*           K must satisfy  0 .le. K.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ).
+*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
+*           by n part of the array A must contain the upper triangular
+*           band part of the matrix of coefficients, supplied column by
+*           column, with the leading diagonal of the matrix in row
+*           ( k + 1 ) of the array, the first super-diagonal starting at
+*           position 2 in row k, and so on. The top left k by k triangle
+*           of the array A is not referenced.
+*           The following program segment will transfer an upper
+*           triangular band matrix from conventional full matrix storage
+*           to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = K + 1 - J
+*                    DO 10, I = MAX( 1, J - K ), J
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
+*           by n part of the array A must contain the lower triangular
+*           band part of the matrix of coefficients, supplied column by
+*           column, with the leading diagonal of the matrix in row 1 of
+*           the array, the first sub-diagonal starting at position 1 in
+*           row 2, and so on. The bottom right k by k triangle of the
+*           array A is not referenced.
+*           The following program segment will transfer a lower
+*           triangular band matrix from conventional full matrix storage
+*           to band storage:
+*
+*                 DO 20, J = 1, N
+*                    M = 1 - J
+*                    DO 10, I = J, MIN( N, J + K )
+*                       A( M + I, J ) = matrix( I, J )
+*              10    CONTINUE
+*              20 CONTINUE
+*
+*           Note that when DIAG = 'U' or 'u' the elements of the array A
+*           corresponding to the diagonal elements of the matrix are not
+*           referenced, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           ( k + 1 ).
+*           Unchanged on exit.
+*
+*  X      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element right-hand side vector b. On exit, X is overwritten
+*           with the solution vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     .. Local Scalars ..
+      COMPLEX*16         TEMP
+      INTEGER            I, INFO, IX, J, JX, KPLUS1, KX, L
+      LOGICAL            NOCONJ, NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( K.LT.0 )THEN
+         INFO = 5
+      ELSE IF( LDA.LT.( K + 1 ) )THEN
+         INFO = 7
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 9
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZTBSV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOCONJ = LSAME( TRANS, 'T' )
+      NOUNIT = LSAME( DIAG , 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed by sequentially with one pass through A.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x := inv( A )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KPLUS1 = K + 1
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     L = KPLUS1 - J
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/A( KPLUS1, J )
+                     TEMP = X( J )
+                     DO 10, I = J - 1, MAX( 1, J - K ), -1
+                        X( I ) = X( I ) - TEMP*A( L + I, J )
+   10                CONTINUE
+                  END IF
+   20          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 40, J = N, 1, -1
+                  KX = KX - INCX
+                  IF( X( JX ).NE.ZERO )THEN
+                     IX = KX
+                     L  = KPLUS1 - J
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/A( KPLUS1, J )
+                     TEMP = X( JX )
+                     DO 30, I = J - 1, MAX( 1, J - K ), -1
+                        X( IX ) = X( IX ) - TEMP*A( L + I, J )
+                        IX      = IX      - INCX
+   30                CONTINUE
+                  END IF
+                  JX = JX - INCX
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     L = 1 - J
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/A( 1, J )
+                     TEMP = X( J )
+                     DO 50, I = J + 1, MIN( N, J + K )
+                        X( I ) = X( I ) - TEMP*A( L + I, J )
+   50                CONTINUE
+                  END IF
+   60          CONTINUE
+            ELSE
+               JX = KX
+               DO 80, J = 1, N
+                  KX = KX + INCX
+                  IF( X( JX ).NE.ZERO )THEN
+                     IX = KX
+                     L  = 1  - J
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/A( 1, J )
+                     TEMP = X( JX )
+                     DO 70, I = J + 1, MIN( N, J + K )
+                        X( IX ) = X( IX ) - TEMP*A( L + I, J )
+                        IX      = IX      + INCX
+   70                CONTINUE
+                  END IF
+                  JX = JX + INCX
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := inv( A' )*x  or  x := inv( conjg( A') )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KPLUS1 = K + 1
+            IF( INCX.EQ.1 )THEN
+               DO 110, J = 1, N
+                  TEMP = X( J )
+                  L    = KPLUS1 - J
+                  IF( NOCONJ )THEN
+                     DO 90, I = MAX( 1, J - K ), J - 1
+                        TEMP = TEMP - A( L + I, J )*X( I )
+   90                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( KPLUS1, J )
+                  ELSE
+                     DO 100, I = MAX( 1, J - K ), J - 1
+                        TEMP = TEMP - DCONJG( A( L + I, J ) )*X( I )
+  100                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/DCONJG( A( KPLUS1, J ) )
+                  END IF
+                  X( J ) = TEMP
+  110          CONTINUE
+            ELSE
+               JX = KX
+               DO 140, J = 1, N
+                  TEMP = X( JX )
+                  IX   = KX
+                  L    = KPLUS1  - J
+                  IF( NOCONJ )THEN
+                     DO 120, I = MAX( 1, J - K ), J - 1
+                        TEMP = TEMP - A( L + I, J )*X( IX )
+                        IX   = IX   + INCX
+  120                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( KPLUS1, J )
+                  ELSE
+                     DO 130, I = MAX( 1, J - K ), J - 1
+                        TEMP = TEMP - DCONJG( A( L + I, J ) )*X( IX )
+                        IX   = IX   + INCX
+  130                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/DCONJG( A( KPLUS1, J ) )
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+                  IF( J.GT.K )
+     $               KX = KX + INCX
+  140          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 170, J = N, 1, -1
+                  TEMP = X( J )
+                  L    = 1      - J
+                  IF( NOCONJ )THEN
+                     DO 150, I = MIN( N, J + K ), J + 1, -1
+                        TEMP = TEMP - A( L + I, J )*X( I )
+  150                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( 1, J )
+                  ELSE
+                     DO 160, I = MIN( N, J + K ), J + 1, -1
+                        TEMP = TEMP - DCONJG( A( L + I, J ) )*X( I )
+  160                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/DCONJG( A( 1, J ) )
+                  END IF
+                  X( J ) = TEMP
+  170          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 200, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = KX
+                  L    = 1       - J
+                  IF( NOCONJ )THEN
+                     DO 180, I = MIN( N, J + K ), J + 1, -1
+                        TEMP = TEMP - A( L + I, J )*X( IX )
+                        IX   = IX   - INCX
+  180                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( 1, J )
+                  ELSE
+                     DO 190, I = MIN( N, J + K ), J + 1, -1
+                        TEMP = TEMP - DCONJG( A( L + I, J ) )*X( IX )
+                        IX   = IX   - INCX
+  190                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/DCONJG( A( 1, J ) )
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+                  IF( ( N - J ).GE.K )
+     $               KX = KX - INCX
+  200          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZTBSV .
+*
+      END
+      SUBROUTINE ZTPMV ( UPLO, TRANS, DIAG, N, AP, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      COMPLEX*16         AP( * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZTPMV  performs one of the matrix-vector operations
+*
+*     x := A*x,   or   x := A'*x,   or   x := conjg( A' )*x,
+*
+*  where x is an n element vector and  A is an n by n unit, or non-unit,
+*  upper or lower triangular matrix, supplied in packed form.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   x := A*x.
+*
+*              TRANS = 'T' or 't'   x := A'*x.
+*
+*              TRANS = 'C' or 'c'   x := conjg( A' )*x.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  AP     - COMPLEX*16       array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with  UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular matrix packed sequentially,
+*           column by column, so that AP( 1 ) contains a( 1, 1 ),
+*           AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 )
+*           respectively, and so on.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular matrix packed sequentially,
+*           column by column, so that AP( 1 ) contains a( 1, 1 ),
+*           AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 )
+*           respectively, and so on.
+*           Note that when  DIAG = 'U' or 'u', the diagonal elements of
+*           A are not referenced, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x. On exit, X is overwritten with the
+*           tranformed vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     .. Local Scalars ..
+      COMPLEX*16         TEMP
+      INTEGER            I, INFO, IX, J, JX, K, KK, KX
+      LOGICAL            NOCONJ, NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 7
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZTPMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOCONJ = LSAME( TRANS, 'T' )
+      NOUNIT = LSAME( DIAG , 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of AP are
+*     accessed sequentially with one pass through AP.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x:= A*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KK = 1
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     K    = KK
+                     DO 10, I = 1, J - 1
+                        X( I ) = X( I ) + TEMP*AP( K )
+                        K      = K      + 1
+   10                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*AP( KK + J - 1 )
+                  END IF
+                  KK = KK + J
+   20          CONTINUE
+            ELSE
+               JX = KX
+               DO 40, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     DO 30, K = KK, KK + J - 2
+                        X( IX ) = X( IX ) + TEMP*AP( K )
+                        IX      = IX      + INCX
+   30                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*AP( KK + J - 1 )
+                  END IF
+                  JX = JX + INCX
+                  KK = KK + J
+   40          CONTINUE
+            END IF
+         ELSE
+            KK = ( N*( N + 1 ) )/2
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     K    = KK
+                     DO 50, I = N, J + 1, -1
+                        X( I ) = X( I ) + TEMP*AP( K )
+                        K      = K      - 1
+   50                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*AP( KK - N + J )
+                  END IF
+                  KK = KK - ( N - J + 1 )
+   60          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 80, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     DO 70, K = KK, KK - ( N - ( J + 1 ) ), -1
+                        X( IX ) = X( IX ) + TEMP*AP( K )
+                        IX      = IX      - INCX
+   70                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*AP( KK - N + J )
+                  END IF
+                  JX = JX - INCX
+                  KK = KK - ( N - J + 1 )
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := A'*x  or  x := conjg( A' )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KK = ( N*( N + 1 ) )/2
+            IF( INCX.EQ.1 )THEN
+               DO 110, J = N, 1, -1
+                  TEMP = X( J )
+                  K    = KK     - 1
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*AP( KK )
+                     DO 90, I = J - 1, 1, -1
+                        TEMP = TEMP + AP( K )*X( I )
+                        K    = K    - 1
+   90                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*DCONJG( AP( KK ) )
+                     DO 100, I = J - 1, 1, -1
+                        TEMP = TEMP + DCONJG( AP( K ) )*X( I )
+                        K    = K    - 1
+  100                CONTINUE
+                  END IF
+                  X( J ) = TEMP
+                  KK     = KK   - J
+  110          CONTINUE
+            ELSE
+               JX = KX + ( N - 1 )*INCX
+               DO 140, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = JX
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*AP( KK )
+                     DO 120, K = KK - 1, KK - J + 1, -1
+                        IX   = IX   - INCX
+                        TEMP = TEMP + AP( K )*X( IX )
+  120                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*DCONJG( AP( KK ) )
+                     DO 130, K = KK - 1, KK - J + 1, -1
+                        IX   = IX   - INCX
+                        TEMP = TEMP + DCONJG( AP( K ) )*X( IX )
+  130                CONTINUE
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+                  KK      = KK   - J
+  140          CONTINUE
+            END IF
+         ELSE
+            KK = 1
+            IF( INCX.EQ.1 )THEN
+               DO 170, J = 1, N
+                  TEMP = X( J )
+                  K    = KK     + 1
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*AP( KK )
+                     DO 150, I = J + 1, N
+                        TEMP = TEMP + AP( K )*X( I )
+                        K    = K    + 1
+  150                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*DCONJG( AP( KK ) )
+                     DO 160, I = J + 1, N
+                        TEMP = TEMP + DCONJG( AP( K ) )*X( I )
+                        K    = K    + 1
+  160                CONTINUE
+                  END IF
+                  X( J ) = TEMP
+                  KK     = KK   + ( N - J + 1 )
+  170          CONTINUE
+            ELSE
+               JX = KX
+               DO 200, J = 1, N
+                  TEMP = X( JX )
+                  IX   = JX
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*AP( KK )
+                     DO 180, K = KK + 1, KK + N - J
+                        IX   = IX   + INCX
+                        TEMP = TEMP + AP( K )*X( IX )
+  180                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*DCONJG( AP( KK ) )
+                     DO 190, K = KK + 1, KK + N - J
+                        IX   = IX   + INCX
+                        TEMP = TEMP + DCONJG( AP( K ) )*X( IX )
+  190                CONTINUE
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+                  KK      = KK   + ( N - J + 1 )
+  200          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZTPMV .
+*
+      END
+      SUBROUTINE ZTPSV ( UPLO, TRANS, DIAG, N, AP, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      COMPLEX*16         AP( * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZTPSV  solves one of the systems of equations
+*
+*     A*x = b,   or   A'*x = b,   or   conjg( A' )*x = b,
+*
+*  where b and x are n element vectors and A is an n by n unit, or
+*  non-unit, upper or lower triangular matrix, supplied in packed form.
+*
+*  No test for singularity or near-singularity is included in this
+*  routine. Such tests must be performed before calling this routine.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the equations to be solved as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   A*x = b.
+*
+*              TRANS = 'T' or 't'   A'*x = b.
+*
+*              TRANS = 'C' or 'c'   conjg( A' )*x = b.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  AP     - COMPLEX*16       array of DIMENSION at least
+*           ( ( n*( n + 1 ) )/2 ).
+*           Before entry with  UPLO = 'U' or 'u', the array AP must
+*           contain the upper triangular matrix packed sequentially,
+*           column by column, so that AP( 1 ) contains a( 1, 1 ),
+*           AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 )
+*           respectively, and so on.
+*           Before entry with UPLO = 'L' or 'l', the array AP must
+*           contain the lower triangular matrix packed sequentially,
+*           column by column, so that AP( 1 ) contains a( 1, 1 ),
+*           AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 )
+*           respectively, and so on.
+*           Note that when  DIAG = 'U' or 'u', the diagonal elements of
+*           A are not referenced, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  X      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element right-hand side vector b. On exit, X is overwritten
+*           with the solution vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     .. Local Scalars ..
+      COMPLEX*16         TEMP
+      INTEGER            I, INFO, IX, J, JX, K, KK, KX
+      LOGICAL            NOCONJ, NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 7
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZTPSV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOCONJ = LSAME( TRANS, 'T' )
+      NOUNIT = LSAME( DIAG , 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of AP are
+*     accessed sequentially with one pass through AP.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x := inv( A )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KK = ( N*( N + 1 ) )/2
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/AP( KK )
+                     TEMP = X( J )
+                     K    = KK     - 1
+                     DO 10, I = J - 1, 1, -1
+                        X( I ) = X( I ) - TEMP*AP( K )
+                        K      = K      - 1
+   10                CONTINUE
+                  END IF
+                  KK = KK - J
+   20          CONTINUE
+            ELSE
+               JX = KX + ( N - 1 )*INCX
+               DO 40, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/AP( KK )
+                     TEMP = X( JX )
+                     IX   = JX
+                     DO 30, K = KK - 1, KK - J + 1, -1
+                        IX      = IX      - INCX
+                        X( IX ) = X( IX ) - TEMP*AP( K )
+   30                CONTINUE
+                  END IF
+                  JX = JX - INCX
+                  KK = KK - J
+   40          CONTINUE
+            END IF
+         ELSE
+            KK = 1
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/AP( KK )
+                     TEMP = X( J )
+                     K    = KK     + 1
+                     DO 50, I = J + 1, N
+                        X( I ) = X( I ) - TEMP*AP( K )
+                        K      = K      + 1
+   50                CONTINUE
+                  END IF
+                  KK = KK + ( N - J + 1 )
+   60          CONTINUE
+            ELSE
+               JX = KX
+               DO 80, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/AP( KK )
+                     TEMP = X( JX )
+                     IX   = JX
+                     DO 70, K = KK + 1, KK + N - J
+                        IX      = IX      + INCX
+                        X( IX ) = X( IX ) - TEMP*AP( K )
+   70                CONTINUE
+                  END IF
+                  JX = JX + INCX
+                  KK = KK + ( N - J + 1 )
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := inv( A' )*x  or  x := inv( conjg( A' ) )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            KK = 1
+            IF( INCX.EQ.1 )THEN
+               DO 110, J = 1, N
+                  TEMP = X( J )
+                  K    = KK
+                  IF( NOCONJ )THEN
+                     DO 90, I = 1, J - 1
+                        TEMP = TEMP - AP( K )*X( I )
+                        K    = K    + 1
+   90                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/AP( KK + J - 1 )
+                  ELSE
+                     DO 100, I = 1, J - 1
+                        TEMP = TEMP - DCONJG( AP( K ) )*X( I )
+                        K    = K    + 1
+  100                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/DCONJG( AP( KK + J - 1 ) )
+                  END IF
+                  X( J ) = TEMP
+                  KK     = KK   + J
+  110          CONTINUE
+            ELSE
+               JX = KX
+               DO 140, J = 1, N
+                  TEMP = X( JX )
+                  IX   = KX
+                  IF( NOCONJ )THEN
+                     DO 120, K = KK, KK + J - 2
+                        TEMP = TEMP - AP( K )*X( IX )
+                        IX   = IX   + INCX
+  120                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/AP( KK + J - 1 )
+                  ELSE
+                     DO 130, K = KK, KK + J - 2
+                        TEMP = TEMP - DCONJG( AP( K ) )*X( IX )
+                        IX   = IX   + INCX
+  130                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/DCONJG( AP( KK + J - 1 ) )
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+                  KK      = KK   + J
+  140          CONTINUE
+            END IF
+         ELSE
+            KK = ( N*( N + 1 ) )/2
+            IF( INCX.EQ.1 )THEN
+               DO 170, J = N, 1, -1
+                  TEMP = X( J )
+                  K    = KK
+                  IF( NOCONJ )THEN
+                     DO 150, I = N, J + 1, -1
+                        TEMP = TEMP - AP( K )*X( I )
+                        K    = K    - 1
+  150                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/AP( KK - N + J )
+                  ELSE
+                     DO 160, I = N, J + 1, -1
+                        TEMP = TEMP - DCONJG( AP( K ) )*X( I )
+                        K    = K    - 1
+  160                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/DCONJG( AP( KK - N + J ) )
+                  END IF
+                  X( J ) = TEMP
+                  KK     = KK   - ( N - J + 1 )
+  170          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 200, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = KX
+                  IF( NOCONJ )THEN
+                     DO 180, K = KK, KK - ( N - ( J + 1 ) ), -1
+                        TEMP = TEMP - AP( K )*X( IX )
+                        IX   = IX   - INCX
+  180                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/AP( KK - N + J )
+                  ELSE
+                     DO 190, K = KK, KK - ( N - ( J + 1 ) ), -1
+                        TEMP = TEMP - DCONJG( AP( K ) )*X( IX )
+                        IX   = IX   - INCX
+  190                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/DCONJG( AP( KK - N + J ) )
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+                  KK      = KK   - ( N - J + 1 )
+  200          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZTPSV .
+*
+      END
+      SUBROUTINE ZTRMM ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA,
+     $                   B, LDB )
+*     .. Scalar Arguments ..
+      CHARACTER*1        SIDE, UPLO, TRANSA, DIAG
+      INTEGER            M, N, LDA, LDB
+      COMPLEX*16         ALPHA
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), B( LDB, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZTRMM  performs one of the matrix-matrix operations
+*
+*     B := alpha*op( A )*B,   or   B := alpha*B*op( A )
+*
+*  where  alpha  is a scalar,  B  is an m by n matrix,  A  is a unit, or
+*  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+*
+*     op( A ) = A   or   op( A ) = A'   or   op( A ) = conjg( A' ).
+*
+*  Parameters
+*  ==========
+*
+*  SIDE   - CHARACTER*1.
+*           On entry,  SIDE specifies whether  op( A ) multiplies B from
+*           the left or right as follows:
+*
+*              SIDE = 'L' or 'l'   B := alpha*op( A )*B.
+*
+*              SIDE = 'R' or 'r'   B := alpha*B*op( A ).
+*
+*           Unchanged on exit.
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix A is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANSA - CHARACTER*1.
+*           On entry, TRANSA specifies the form of op( A ) to be used in
+*           the matrix multiplication as follows:
+*
+*              TRANSA = 'N' or 'n'   op( A ) = A.
+*
+*              TRANSA = 'T' or 't'   op( A ) = A'.
+*
+*              TRANSA = 'C' or 'c'   op( A ) = conjg( A' ).
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit triangular
+*           as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of B. M must be at
+*           least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of B.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX*16      .
+*           On entry,  ALPHA specifies the scalar  alpha. When  alpha is
+*           zero then  A is not referenced and  B need not be set before
+*           entry.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, k ), where k is m
+*           when  SIDE = 'L' or 'l'  and is  n  when  SIDE = 'R' or 'r'.
+*           Before entry  with  UPLO = 'U' or 'u',  the  leading  k by k
+*           upper triangular part of the array  A must contain the upper
+*           triangular matrix  and the strictly lower triangular part of
+*           A is not referenced.
+*           Before entry  with  UPLO = 'L' or 'l',  the  leading  k by k
+*           lower triangular part of the array  A must contain the lower
+*           triangular matrix  and the strictly upper triangular part of
+*           A is not referenced.
+*           Note that when  DIAG = 'U' or 'u',  the diagonal elements of
+*           A  are not referenced either,  but are assumed to be  unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program.  When  SIDE = 'L' or 'l'  then
+*           LDA  must be at least  max( 1, m ),  when  SIDE = 'R' or 'r'
+*           then LDA must be at least max( 1, n ).
+*           Unchanged on exit.
+*
+*  B      - COMPLEX*16       array of DIMENSION ( LDB, n ).
+*           Before entry,  the leading  m by n part of the array  B must
+*           contain the matrix  B,  and  on exit  is overwritten  by the
+*           transformed matrix.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   LDB  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, MAX
+*     .. Local Scalars ..
+      LOGICAL            LSIDE, NOCONJ, NOUNIT, UPPER
+      INTEGER            I, INFO, J, K, NROWA
+      COMPLEX*16         TEMP
+*     .. Parameters ..
+      COMPLEX*16         ONE
+      PARAMETER        ( ONE  = ( 1.0D+0, 0.0D+0 ) )
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      LSIDE  = LSAME( SIDE  , 'L' )
+      IF( LSIDE )THEN
+         NROWA = M
+      ELSE
+         NROWA = N
+      END IF
+      NOCONJ = LSAME( TRANSA, 'T' )
+      NOUNIT = LSAME( DIAG  , 'N' )
+      UPPER  = LSAME( UPLO  , 'U' )
+*
+      INFO   = 0
+      IF(      ( .NOT.LSIDE                ).AND.
+     $         ( .NOT.LSAME( SIDE  , 'R' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.UPPER                ).AND.
+     $         ( .NOT.LSAME( UPLO  , 'L' ) )      )THEN
+         INFO = 2
+      ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'T' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'C' ) )      )THEN
+         INFO = 3
+      ELSE IF( ( .NOT.LSAME( DIAG  , 'U' ) ).AND.
+     $         ( .NOT.LSAME( DIAG  , 'N' ) )      )THEN
+         INFO = 4
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 5
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 6
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 9
+      ELSE IF( LDB.LT.MAX( 1, M     ) )THEN
+         INFO = 11
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZTRMM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         DO 20, J = 1, N
+            DO 10, I = 1, M
+               B( I, J ) = ZERO
+   10       CONTINUE
+   20    CONTINUE
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSIDE )THEN
+         IF( LSAME( TRANSA, 'N' ) )THEN
+*
+*           Form  B := alpha*A*B.
+*
+            IF( UPPER )THEN
+               DO 50, J = 1, N
+                  DO 40, K = 1, M
+                     IF( B( K, J ).NE.ZERO )THEN
+                        TEMP = ALPHA*B( K, J )
+                        DO 30, I = 1, K - 1
+                           B( I, J ) = B( I, J ) + TEMP*A( I, K )
+   30                   CONTINUE
+                        IF( NOUNIT )
+     $                     TEMP = TEMP*A( K, K )
+                        B( K, J ) = TEMP
+                     END IF
+   40             CONTINUE
+   50          CONTINUE
+            ELSE
+               DO 80, J = 1, N
+                  DO 70 K = M, 1, -1
+                     IF( B( K, J ).NE.ZERO )THEN
+                        TEMP      = ALPHA*B( K, J )
+                        B( K, J ) = TEMP
+                        IF( NOUNIT )
+     $                     B( K, J ) = B( K, J )*A( K, K )
+                        DO 60, I = K + 1, M
+                           B( I, J ) = B( I, J ) + TEMP*A( I, K )
+   60                   CONTINUE
+                     END IF
+   70             CONTINUE
+   80          CONTINUE
+            END IF
+         ELSE
+*
+*           Form  B := alpha*A'*B   or   B := alpha*conjg( A' )*B.
+*
+            IF( UPPER )THEN
+               DO 120, J = 1, N
+                  DO 110, I = M, 1, -1
+                     TEMP = B( I, J )
+                     IF( NOCONJ )THEN
+                        IF( NOUNIT )
+     $                     TEMP = TEMP*A( I, I )
+                        DO 90, K = 1, I - 1
+                           TEMP = TEMP + A( K, I )*B( K, J )
+   90                   CONTINUE
+                     ELSE
+                        IF( NOUNIT )
+     $                     TEMP = TEMP*DCONJG( A( I, I ) )
+                        DO 100, K = 1, I - 1
+                           TEMP = TEMP + DCONJG( A( K, I ) )*B( K, J )
+  100                   CONTINUE
+                     END IF
+                     B( I, J ) = ALPHA*TEMP
+  110             CONTINUE
+  120          CONTINUE
+            ELSE
+               DO 160, J = 1, N
+                  DO 150, I = 1, M
+                     TEMP = B( I, J )
+                     IF( NOCONJ )THEN
+                        IF( NOUNIT )
+     $                     TEMP = TEMP*A( I, I )
+                        DO 130, K = I + 1, M
+                           TEMP = TEMP + A( K, I )*B( K, J )
+  130                   CONTINUE
+                     ELSE
+                        IF( NOUNIT )
+     $                     TEMP = TEMP*DCONJG( A( I, I ) )
+                        DO 140, K = I + 1, M
+                           TEMP = TEMP + DCONJG( A( K, I ) )*B( K, J )
+  140                   CONTINUE
+                     END IF
+                     B( I, J ) = ALPHA*TEMP
+  150             CONTINUE
+  160          CONTINUE
+            END IF
+         END IF
+      ELSE
+         IF( LSAME( TRANSA, 'N' ) )THEN
+*
+*           Form  B := alpha*B*A.
+*
+            IF( UPPER )THEN
+               DO 200, J = N, 1, -1
+                  TEMP = ALPHA
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( J, J )
+                  DO 170, I = 1, M
+                     B( I, J ) = TEMP*B( I, J )
+  170             CONTINUE
+                  DO 190, K = 1, J - 1
+                     IF( A( K, J ).NE.ZERO )THEN
+                        TEMP = ALPHA*A( K, J )
+                        DO 180, I = 1, M
+                           B( I, J ) = B( I, J ) + TEMP*B( I, K )
+  180                   CONTINUE
+                     END IF
+  190             CONTINUE
+  200          CONTINUE
+            ELSE
+               DO 240, J = 1, N
+                  TEMP = ALPHA
+                  IF( NOUNIT )
+     $               TEMP = TEMP*A( J, J )
+                  DO 210, I = 1, M
+                     B( I, J ) = TEMP*B( I, J )
+  210             CONTINUE
+                  DO 230, K = J + 1, N
+                     IF( A( K, J ).NE.ZERO )THEN
+                        TEMP = ALPHA*A( K, J )
+                        DO 220, I = 1, M
+                           B( I, J ) = B( I, J ) + TEMP*B( I, K )
+  220                   CONTINUE
+                     END IF
+  230             CONTINUE
+  240          CONTINUE
+            END IF
+         ELSE
+*
+*           Form  B := alpha*B*A'   or   B := alpha*B*conjg( A' ).
+*
+            IF( UPPER )THEN
+               DO 280, K = 1, N
+                  DO 260, J = 1, K - 1
+                     IF( A( J, K ).NE.ZERO )THEN
+                        IF( NOCONJ )THEN
+                           TEMP = ALPHA*A( J, K )
+                        ELSE
+                           TEMP = ALPHA*DCONJG( A( J, K ) )
+                        END IF
+                        DO 250, I = 1, M
+                           B( I, J ) = B( I, J ) + TEMP*B( I, K )
+  250                   CONTINUE
+                     END IF
+  260             CONTINUE
+                  TEMP = ALPHA
+                  IF( NOUNIT )THEN
+                     IF( NOCONJ )THEN
+                        TEMP = TEMP*A( K, K )
+                     ELSE
+                        TEMP = TEMP*DCONJG( A( K, K ) )
+                     END IF
+                  END IF
+                  IF( TEMP.NE.ONE )THEN
+                     DO 270, I = 1, M
+                        B( I, K ) = TEMP*B( I, K )
+  270                CONTINUE
+                  END IF
+  280          CONTINUE
+            ELSE
+               DO 320, K = N, 1, -1
+                  DO 300, J = K + 1, N
+                     IF( A( J, K ).NE.ZERO )THEN
+                        IF( NOCONJ )THEN
+                           TEMP = ALPHA*A( J, K )
+                        ELSE
+                           TEMP = ALPHA*DCONJG( A( J, K ) )
+                        END IF
+                        DO 290, I = 1, M
+                           B( I, J ) = B( I, J ) + TEMP*B( I, K )
+  290                   CONTINUE
+                     END IF
+  300             CONTINUE
+                  TEMP = ALPHA
+                  IF( NOUNIT )THEN
+                     IF( NOCONJ )THEN
+                        TEMP = TEMP*A( K, K )
+                     ELSE
+                        TEMP = TEMP*DCONJG( A( K, K ) )
+                     END IF
+                  END IF
+                  IF( TEMP.NE.ONE )THEN
+                     DO 310, I = 1, M
+                        B( I, K ) = TEMP*B( I, K )
+  310                CONTINUE
+                  END IF
+  320          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZTRMM .
+*
+      END
+      SUBROUTINE ZTRMV ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, LDA, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZTRMV  performs one of the matrix-vector operations
+*
+*     x := A*x,   or   x := A'*x,   or   x := conjg( A' )*x,
+*
+*  where x is an n element vector and  A is an n by n unit, or non-unit,
+*  upper or lower triangular matrix.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the operation to be performed as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   x := A*x.
+*
+*              TRANS = 'T' or 't'   x := A'*x.
+*
+*              TRANS = 'C' or 'c'   x := conjg( A' )*x.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular matrix and the strictly lower triangular part of
+*           A is not referenced.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular matrix and the strictly upper triangular part of
+*           A is not referenced.
+*           Note that when  DIAG = 'U' or 'u', the diagonal elements of
+*           A are not referenced either, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*  X      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element vector x. On exit, X is overwritten with the
+*           tranformed vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     .. Local Scalars ..
+      COMPLEX*16         TEMP
+      INTEGER            I, INFO, IX, J, JX, KX
+      LOGICAL            NOCONJ, NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 6
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 8
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZTRMV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOCONJ = LSAME( TRANS, 'T' )
+      NOUNIT = LSAME( DIAG , 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x := A*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     DO 10, I = 1, J - 1
+                        X( I ) = X( I ) + TEMP*A( I, J )
+   10                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*A( J, J )
+                  END IF
+   20          CONTINUE
+            ELSE
+               JX = KX
+               DO 40, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     DO 30, I = 1, J - 1
+                        X( IX ) = X( IX ) + TEMP*A( I, J )
+                        IX      = IX      + INCX
+   30                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*A( J, J )
+                  END IF
+                  JX = JX + INCX
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     TEMP = X( J )
+                     DO 50, I = N, J + 1, -1
+                        X( I ) = X( I ) + TEMP*A( I, J )
+   50                CONTINUE
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )*A( J, J )
+                  END IF
+   60          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 80, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     TEMP = X( JX )
+                     IX   = KX
+                     DO 70, I = N, J + 1, -1
+                        X( IX ) = X( IX ) + TEMP*A( I, J )
+                        IX      = IX      - INCX
+   70                CONTINUE
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )*A( J, J )
+                  END IF
+                  JX = JX - INCX
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := A'*x  or  x := conjg( A' )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            IF( INCX.EQ.1 )THEN
+               DO 110, J = N, 1, -1
+                  TEMP = X( J )
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( J, J )
+                     DO 90, I = J - 1, 1, -1
+                        TEMP = TEMP + A( I, J )*X( I )
+   90                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*DCONJG( A( J, J ) )
+                     DO 100, I = J - 1, 1, -1
+                        TEMP = TEMP + DCONJG( A( I, J ) )*X( I )
+  100                CONTINUE
+                  END IF
+                  X( J ) = TEMP
+  110          CONTINUE
+            ELSE
+               JX = KX + ( N - 1 )*INCX
+               DO 140, J = N, 1, -1
+                  TEMP = X( JX )
+                  IX   = JX
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( J, J )
+                     DO 120, I = J - 1, 1, -1
+                        IX   = IX   - INCX
+                        TEMP = TEMP + A( I, J )*X( IX )
+  120                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*DCONJG( A( J, J ) )
+                     DO 130, I = J - 1, 1, -1
+                        IX   = IX   - INCX
+                        TEMP = TEMP + DCONJG( A( I, J ) )*X( IX )
+  130                CONTINUE
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+  140          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 170, J = 1, N
+                  TEMP = X( J )
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( J, J )
+                     DO 150, I = J + 1, N
+                        TEMP = TEMP + A( I, J )*X( I )
+  150                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*DCONJG( A( J, J ) )
+                     DO 160, I = J + 1, N
+                        TEMP = TEMP + DCONJG( A( I, J ) )*X( I )
+  160                CONTINUE
+                  END IF
+                  X( J ) = TEMP
+  170          CONTINUE
+            ELSE
+               JX = KX
+               DO 200, J = 1, N
+                  TEMP = X( JX )
+                  IX   = JX
+                  IF( NOCONJ )THEN
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*A( J, J )
+                     DO 180, I = J + 1, N
+                        IX   = IX   + INCX
+                        TEMP = TEMP + A( I, J )*X( IX )
+  180                CONTINUE
+                  ELSE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP*DCONJG( A( J, J ) )
+                     DO 190, I = J + 1, N
+                        IX   = IX   + INCX
+                        TEMP = TEMP + DCONJG( A( I, J ) )*X( IX )
+  190                CONTINUE
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+  200          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZTRMV .
+*
+      END
+      SUBROUTINE ZTRSM ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA,
+     $                   B, LDB )
+*     .. Scalar Arguments ..
+      CHARACTER*1        SIDE, UPLO, TRANSA, DIAG
+      INTEGER            M, N, LDA, LDB
+      COMPLEX*16         ALPHA
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), B( LDB, * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZTRSM  solves one of the matrix equations
+*
+*     op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
+*
+*  where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+*  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+*
+*     op( A ) = A   or   op( A ) = A'   or   op( A ) = conjg( A' ).
+*
+*  The matrix X is overwritten on B.
+*
+*  Parameters
+*  ==========
+*
+*  SIDE   - CHARACTER*1.
+*           On entry, SIDE specifies whether op( A ) appears on the left
+*           or right of X as follows:
+*
+*              SIDE = 'L' or 'l'   op( A )*X = alpha*B.
+*
+*              SIDE = 'R' or 'r'   X*op( A ) = alpha*B.
+*
+*           Unchanged on exit.
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix A is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANSA - CHARACTER*1.
+*           On entry, TRANSA specifies the form of op( A ) to be used in
+*           the matrix multiplication as follows:
+*
+*              TRANSA = 'N' or 'n'   op( A ) = A.
+*
+*              TRANSA = 'T' or 't'   op( A ) = A'.
+*
+*              TRANSA = 'C' or 'c'   op( A ) = conjg( A' ).
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit triangular
+*           as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  M      - INTEGER.
+*           On entry, M specifies the number of rows of B. M must be at
+*           least zero.
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the number of columns of B.  N must be
+*           at least zero.
+*           Unchanged on exit.
+*
+*  ALPHA  - COMPLEX*16      .
+*           On entry,  ALPHA specifies the scalar  alpha. When  alpha is
+*           zero then  A is not referenced and  B need not be set before
+*           entry.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, k ), where k is m
+*           when  SIDE = 'L' or 'l'  and is  n  when  SIDE = 'R' or 'r'.
+*           Before entry  with  UPLO = 'U' or 'u',  the  leading  k by k
+*           upper triangular part of the array  A must contain the upper
+*           triangular matrix  and the strictly lower triangular part of
+*           A is not referenced.
+*           Before entry  with  UPLO = 'L' or 'l',  the  leading  k by k
+*           lower triangular part of the array  A must contain the lower
+*           triangular matrix  and the strictly upper triangular part of
+*           A is not referenced.
+*           Note that when  DIAG = 'U' or 'u',  the diagonal elements of
+*           A  are not referenced either,  but are assumed to be  unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program.  When  SIDE = 'L' or 'l'  then
+*           LDA  must be at least  max( 1, m ),  when  SIDE = 'R' or 'r'
+*           then LDA must be at least max( 1, n ).
+*           Unchanged on exit.
+*
+*  B      - COMPLEX*16       array of DIMENSION ( LDB, n ).
+*           Before entry,  the leading  m by n part of the array  B must
+*           contain  the  right-hand  side  matrix  B,  and  on exit  is
+*           overwritten by the solution matrix  X.
+*
+*  LDB    - INTEGER.
+*           On entry, LDB specifies the first dimension of B as declared
+*           in  the  calling  (sub)  program.   LDB  must  be  at  least
+*           max( 1, m ).
+*           Unchanged on exit.
+*
+*
+*  Level 3 Blas routine.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, MAX
+*     .. Local Scalars ..
+      LOGICAL            LSIDE, NOCONJ, NOUNIT, UPPER
+      INTEGER            I, INFO, J, K, NROWA
+      COMPLEX*16         TEMP
+*     .. Parameters ..
+      COMPLEX*16         ONE
+      PARAMETER        ( ONE  = ( 1.0D+0, 0.0D+0 ) )
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      LSIDE  = LSAME( SIDE  , 'L' )
+      IF( LSIDE )THEN
+         NROWA = M
+      ELSE
+         NROWA = N
+      END IF
+      NOCONJ = LSAME( TRANSA, 'T' )
+      NOUNIT = LSAME( DIAG  , 'N' )
+      UPPER  = LSAME( UPLO  , 'U' )
+*
+      INFO   = 0
+      IF(      ( .NOT.LSIDE                ).AND.
+     $         ( .NOT.LSAME( SIDE  , 'R' ) )      )THEN
+         INFO = 1
+      ELSE IF( ( .NOT.UPPER                ).AND.
+     $         ( .NOT.LSAME( UPLO  , 'L' ) )      )THEN
+         INFO = 2
+      ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'T' ) ).AND.
+     $         ( .NOT.LSAME( TRANSA, 'C' ) )      )THEN
+         INFO = 3
+      ELSE IF( ( .NOT.LSAME( DIAG  , 'U' ) ).AND.
+     $         ( .NOT.LSAME( DIAG  , 'N' ) )      )THEN
+         INFO = 4
+      ELSE IF( M  .LT.0               )THEN
+         INFO = 5
+      ELSE IF( N  .LT.0               )THEN
+         INFO = 6
+      ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN
+         INFO = 9
+      ELSE IF( LDB.LT.MAX( 1, M     ) )THEN
+         INFO = 11
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZTRSM ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+*     And when  alpha.eq.zero.
+*
+      IF( ALPHA.EQ.ZERO )THEN
+         DO 20, J = 1, N
+            DO 10, I = 1, M
+               B( I, J ) = ZERO
+   10       CONTINUE
+   20    CONTINUE
+         RETURN
+      END IF
+*
+*     Start the operations.
+*
+      IF( LSIDE )THEN
+         IF( LSAME( TRANSA, 'N' ) )THEN
+*
+*           Form  B := alpha*inv( A )*B.
+*
+            IF( UPPER )THEN
+               DO 60, J = 1, N
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 30, I = 1, M
+                        B( I, J ) = ALPHA*B( I, J )
+   30                CONTINUE
+                  END IF
+                  DO 50, K = M, 1, -1
+                     IF( B( K, J ).NE.ZERO )THEN
+                        IF( NOUNIT )
+     $                     B( K, J ) = B( K, J )/A( K, K )
+                        DO 40, I = 1, K - 1
+                           B( I, J ) = B( I, J ) - B( K, J )*A( I, K )
+   40                   CONTINUE
+                     END IF
+   50             CONTINUE
+   60          CONTINUE
+            ELSE
+               DO 100, J = 1, N
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 70, I = 1, M
+                        B( I, J ) = ALPHA*B( I, J )
+   70                CONTINUE
+                  END IF
+                  DO 90 K = 1, M
+                     IF( B( K, J ).NE.ZERO )THEN
+                        IF( NOUNIT )
+     $                     B( K, J ) = B( K, J )/A( K, K )
+                        DO 80, I = K + 1, M
+                           B( I, J ) = B( I, J ) - B( K, J )*A( I, K )
+   80                   CONTINUE
+                     END IF
+   90             CONTINUE
+  100          CONTINUE
+            END IF
+         ELSE
+*
+*           Form  B := alpha*inv( A' )*B
+*           or    B := alpha*inv( conjg( A' ) )*B.
+*
+            IF( UPPER )THEN
+               DO 140, J = 1, N
+                  DO 130, I = 1, M
+                     TEMP = ALPHA*B( I, J )
+                     IF( NOCONJ )THEN
+                        DO 110, K = 1, I - 1
+                           TEMP = TEMP - A( K, I )*B( K, J )
+  110                   CONTINUE
+                        IF( NOUNIT )
+     $                     TEMP = TEMP/A( I, I )
+                     ELSE
+                        DO 120, K = 1, I - 1
+                           TEMP = TEMP - DCONJG( A( K, I ) )*B( K, J )
+  120                   CONTINUE
+                        IF( NOUNIT )
+     $                     TEMP = TEMP/DCONJG( A( I, I ) )
+                     END IF
+                     B( I, J ) = TEMP
+  130             CONTINUE
+  140          CONTINUE
+            ELSE
+               DO 180, J = 1, N
+                  DO 170, I = M, 1, -1
+                     TEMP = ALPHA*B( I, J )
+                     IF( NOCONJ )THEN
+                        DO 150, K = I + 1, M
+                           TEMP = TEMP - A( K, I )*B( K, J )
+  150                   CONTINUE
+                        IF( NOUNIT )
+     $                     TEMP = TEMP/A( I, I )
+                     ELSE
+                        DO 160, K = I + 1, M
+                           TEMP = TEMP - DCONJG( A( K, I ) )*B( K, J )
+  160                   CONTINUE
+                        IF( NOUNIT )
+     $                     TEMP = TEMP/DCONJG( A( I, I ) )
+                     END IF
+                     B( I, J ) = TEMP
+  170             CONTINUE
+  180          CONTINUE
+            END IF
+         END IF
+      ELSE
+         IF( LSAME( TRANSA, 'N' ) )THEN
+*
+*           Form  B := alpha*B*inv( A ).
+*
+            IF( UPPER )THEN
+               DO 230, J = 1, N
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 190, I = 1, M
+                        B( I, J ) = ALPHA*B( I, J )
+  190                CONTINUE
+                  END IF
+                  DO 210, K = 1, J - 1
+                     IF( A( K, J ).NE.ZERO )THEN
+                        DO 200, I = 1, M
+                           B( I, J ) = B( I, J ) - A( K, J )*B( I, K )
+  200                   CONTINUE
+                     END IF
+  210             CONTINUE
+                  IF( NOUNIT )THEN
+                     TEMP = ONE/A( J, J )
+                     DO 220, I = 1, M
+                        B( I, J ) = TEMP*B( I, J )
+  220                CONTINUE
+                  END IF
+  230          CONTINUE
+            ELSE
+               DO 280, J = N, 1, -1
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 240, I = 1, M
+                        B( I, J ) = ALPHA*B( I, J )
+  240                CONTINUE
+                  END IF
+                  DO 260, K = J + 1, N
+                     IF( A( K, J ).NE.ZERO )THEN
+                        DO 250, I = 1, M
+                           B( I, J ) = B( I, J ) - A( K, J )*B( I, K )
+  250                   CONTINUE
+                     END IF
+  260             CONTINUE
+                  IF( NOUNIT )THEN
+                     TEMP = ONE/A( J, J )
+                     DO 270, I = 1, M
+                       B( I, J ) = TEMP*B( I, J )
+  270                CONTINUE
+                  END IF
+  280          CONTINUE
+            END IF
+         ELSE
+*
+*           Form  B := alpha*B*inv( A' )
+*           or    B := alpha*B*inv( conjg( A' ) ).
+*
+            IF( UPPER )THEN
+               DO 330, K = N, 1, -1
+                  IF( NOUNIT )THEN
+                     IF( NOCONJ )THEN
+                        TEMP = ONE/A( K, K )
+                     ELSE
+                        TEMP = ONE/DCONJG( A( K, K ) )
+                     END IF
+                     DO 290, I = 1, M
+                        B( I, K ) = TEMP*B( I, K )
+  290                CONTINUE
+                  END IF
+                  DO 310, J = 1, K - 1
+                     IF( A( J, K ).NE.ZERO )THEN
+                        IF( NOCONJ )THEN
+                           TEMP = A( J, K )
+                        ELSE
+                           TEMP = DCONJG( A( J, K ) )
+                        END IF
+                        DO 300, I = 1, M
+                           B( I, J ) = B( I, J ) - TEMP*B( I, K )
+  300                   CONTINUE
+                     END IF
+  310             CONTINUE
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 320, I = 1, M
+                        B( I, K ) = ALPHA*B( I, K )
+  320                CONTINUE
+                  END IF
+  330          CONTINUE
+            ELSE
+               DO 380, K = 1, N
+                  IF( NOUNIT )THEN
+                     IF( NOCONJ )THEN
+                        TEMP = ONE/A( K, K )
+                     ELSE
+                        TEMP = ONE/DCONJG( A( K, K ) )
+                     END IF
+                     DO 340, I = 1, M
+                        B( I, K ) = TEMP*B( I, K )
+  340                CONTINUE
+                  END IF
+                  DO 360, J = K + 1, N
+                     IF( A( J, K ).NE.ZERO )THEN
+                        IF( NOCONJ )THEN
+                           TEMP = A( J, K )
+                        ELSE
+                           TEMP = DCONJG( A( J, K ) )
+                        END IF
+                        DO 350, I = 1, M
+                           B( I, J ) = B( I, J ) - TEMP*B( I, K )
+  350                   CONTINUE
+                     END IF
+  360             CONTINUE
+                  IF( ALPHA.NE.ONE )THEN
+                     DO 370, I = 1, M
+                        B( I, K ) = ALPHA*B( I, K )
+  370                CONTINUE
+                  END IF
+  380          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZTRSM .
+*
+      END
+      SUBROUTINE ZTRSV ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX )
+*     .. Scalar Arguments ..
+      INTEGER            INCX, LDA, N
+      CHARACTER*1        DIAG, TRANS, UPLO
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), X( * )
+*     ..
+*
+*  Purpose
+*  =======
+*
+*  ZTRSV  solves one of the systems of equations
+*
+*     A*x = b,   or   A'*x = b,   or   conjg( A' )*x = b,
+*
+*  where b and x are n element vectors and A is an n by n unit, or
+*  non-unit, upper or lower triangular matrix.
+*
+*  No test for singularity or near-singularity is included in this
+*  routine. Such tests must be performed before calling this routine.
+*
+*  Parameters
+*  ==========
+*
+*  UPLO   - CHARACTER*1.
+*           On entry, UPLO specifies whether the matrix is an upper or
+*           lower triangular matrix as follows:
+*
+*              UPLO = 'U' or 'u'   A is an upper triangular matrix.
+*
+*              UPLO = 'L' or 'l'   A is a lower triangular matrix.
+*
+*           Unchanged on exit.
+*
+*  TRANS  - CHARACTER*1.
+*           On entry, TRANS specifies the equations to be solved as
+*           follows:
+*
+*              TRANS = 'N' or 'n'   A*x = b.
+*
+*              TRANS = 'T' or 't'   A'*x = b.
+*
+*              TRANS = 'C' or 'c'   conjg( A' )*x = b.
+*
+*           Unchanged on exit.
+*
+*  DIAG   - CHARACTER*1.
+*           On entry, DIAG specifies whether or not A is unit
+*           triangular as follows:
+*
+*              DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*
+*              DIAG = 'N' or 'n'   A is not assumed to be unit
+*                                  triangular.
+*
+*           Unchanged on exit.
+*
+*  N      - INTEGER.
+*           On entry, N specifies the order of the matrix A.
+*           N must be at least zero.
+*           Unchanged on exit.
+*
+*  A      - COMPLEX*16       array of DIMENSION ( LDA, n ).
+*           Before entry with  UPLO = 'U' or 'u', the leading n by n
+*           upper triangular part of the array A must contain the upper
+*           triangular matrix and the strictly lower triangular part of
+*           A is not referenced.
+*           Before entry with UPLO = 'L' or 'l', the leading n by n
+*           lower triangular part of the array A must contain the lower
+*           triangular matrix and the strictly upper triangular part of
+*           A is not referenced.
+*           Note that when  DIAG = 'U' or 'u', the diagonal elements of
+*           A are not referenced either, but are assumed to be unity.
+*           Unchanged on exit.
+*
+*  LDA    - INTEGER.
+*           On entry, LDA specifies the first dimension of A as declared
+*           in the calling (sub) program. LDA must be at least
+*           max( 1, n ).
+*           Unchanged on exit.
+*
+*  X      - COMPLEX*16       array of dimension at least
+*           ( 1 + ( n - 1 )*abs( INCX ) ).
+*           Before entry, the incremented array X must contain the n
+*           element right-hand side vector b. On exit, X is overwritten
+*           with the solution vector x.
+*
+*  INCX   - INTEGER.
+*           On entry, INCX specifies the increment for the elements of
+*           X. INCX must not be zero.
+*           Unchanged on exit.
+*
+*
+*  Level 2 Blas routine.
+*
+*  -- Written on 22-October-1986.
+*     Jack Dongarra, Argonne National Lab.
+*     Jeremy Du Croz, Nag Central Office.
+*     Sven Hammarling, Nag Central Office.
+*     Richard Hanson, Sandia National Labs.
+*
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER        ( ZERO = ( 0.0D+0, 0.0D+0 ) )
+*     .. Local Scalars ..
+      COMPLEX*16         TEMP
+      INTEGER            I, INFO, IX, J, JX, KX
+      LOGICAL            NOCONJ, NOUNIT
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCONJG, MAX
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF     ( .NOT.LSAME( UPLO , 'U' ).AND.
+     $         .NOT.LSAME( UPLO , 'L' )      )THEN
+         INFO = 1
+      ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND.
+     $         .NOT.LSAME( TRANS, 'T' ).AND.
+     $         .NOT.LSAME( TRANS, 'C' )      )THEN
+         INFO = 2
+      ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND.
+     $         .NOT.LSAME( DIAG , 'N' )      )THEN
+         INFO = 3
+      ELSE IF( N.LT.0 )THEN
+         INFO = 4
+      ELSE IF( LDA.LT.MAX( 1, N ) )THEN
+         INFO = 6
+      ELSE IF( INCX.EQ.0 )THEN
+         INFO = 8
+      END IF
+      IF( INFO.NE.0 )THEN
+         CALL XERBLA( 'ZTRSV ', INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible.
+*
+      IF( N.EQ.0 )
+     $   RETURN
+*
+      NOCONJ = LSAME( TRANS, 'T' )
+      NOUNIT = LSAME( DIAG , 'N' )
+*
+*     Set up the start point in X if the increment is not unity. This
+*     will be  ( N - 1 )*INCX  too small for descending loops.
+*
+      IF( INCX.LE.0 )THEN
+         KX = 1 - ( N - 1 )*INCX
+      ELSE IF( INCX.NE.1 )THEN
+         KX = 1
+      END IF
+*
+*     Start the operations. In this version the elements of A are
+*     accessed sequentially with one pass through A.
+*
+      IF( LSAME( TRANS, 'N' ) )THEN
+*
+*        Form  x := inv( A )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            IF( INCX.EQ.1 )THEN
+               DO 20, J = N, 1, -1
+                  IF( X( J ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/A( J, J )
+                     TEMP = X( J )
+                     DO 10, I = J - 1, 1, -1
+                        X( I ) = X( I ) - TEMP*A( I, J )
+   10                CONTINUE
+                  END IF
+   20          CONTINUE
+            ELSE
+               JX = KX + ( N - 1 )*INCX
+               DO 40, J = N, 1, -1
+                  IF( X( JX ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/A( J, J )
+                     TEMP = X( JX )
+                     IX   = JX
+                     DO 30, I = J - 1, 1, -1
+                        IX      = IX      - INCX
+                        X( IX ) = X( IX ) - TEMP*A( I, J )
+   30                CONTINUE
+                  END IF
+                  JX = JX - INCX
+   40          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 60, J = 1, N
+                  IF( X( J ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( J ) = X( J )/A( J, J )
+                     TEMP = X( J )
+                     DO 50, I = J + 1, N
+                        X( I ) = X( I ) - TEMP*A( I, J )
+   50                CONTINUE
+                  END IF
+   60          CONTINUE
+            ELSE
+               JX = KX
+               DO 80, J = 1, N
+                  IF( X( JX ).NE.ZERO )THEN
+                     IF( NOUNIT )
+     $                  X( JX ) = X( JX )/A( J, J )
+                     TEMP = X( JX )
+                     IX   = JX
+                     DO 70, I = J + 1, N
+                        IX      = IX      + INCX
+                        X( IX ) = X( IX ) - TEMP*A( I, J )
+   70                CONTINUE
+                  END IF
+                  JX = JX + INCX
+   80          CONTINUE
+            END IF
+         END IF
+      ELSE
+*
+*        Form  x := inv( A' )*x  or  x := inv( conjg( A' ) )*x.
+*
+         IF( LSAME( UPLO, 'U' ) )THEN
+            IF( INCX.EQ.1 )THEN
+               DO 110, J = 1, N
+                  TEMP = X( J )
+                  IF( NOCONJ )THEN
+                     DO 90, I = 1, J - 1
+                        TEMP = TEMP - A( I, J )*X( I )
+   90                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( J, J )
+                  ELSE
+                     DO 100, I = 1, J - 1
+                        TEMP = TEMP - DCONJG( A( I, J ) )*X( I )
+  100                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/DCONJG( A( J, J ) )
+                  END IF
+                  X( J ) = TEMP
+  110          CONTINUE
+            ELSE
+               JX = KX
+               DO 140, J = 1, N
+                  IX   = KX
+                  TEMP = X( JX )
+                  IF( NOCONJ )THEN
+                     DO 120, I = 1, J - 1
+                        TEMP = TEMP - A( I, J )*X( IX )
+                        IX   = IX   + INCX
+  120                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( J, J )
+                  ELSE
+                     DO 130, I = 1, J - 1
+                        TEMP = TEMP - DCONJG( A( I, J ) )*X( IX )
+                        IX   = IX   + INCX
+  130                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/DCONJG( A( J, J ) )
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   + INCX
+  140          CONTINUE
+            END IF
+         ELSE
+            IF( INCX.EQ.1 )THEN
+               DO 170, J = N, 1, -1
+                  TEMP = X( J )
+                  IF( NOCONJ )THEN
+                     DO 150, I = N, J + 1, -1
+                        TEMP = TEMP - A( I, J )*X( I )
+  150                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( J, J )
+                  ELSE
+                     DO 160, I = N, J + 1, -1
+                        TEMP = TEMP - DCONJG( A( I, J ) )*X( I )
+  160                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/DCONJG( A( J, J ) )
+                  END IF
+                  X( J ) = TEMP
+  170          CONTINUE
+            ELSE
+               KX = KX + ( N - 1 )*INCX
+               JX = KX
+               DO 200, J = N, 1, -1
+                  IX   = KX
+                  TEMP = X( JX )
+                  IF( NOCONJ )THEN
+                     DO 180, I = N, J + 1, -1
+                        TEMP = TEMP - A( I, J )*X( IX )
+                        IX   = IX   - INCX
+  180                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/A( J, J )
+                  ELSE
+                     DO 190, I = N, J + 1, -1
+                        TEMP = TEMP - DCONJG( A( I, J ) )*X( IX )
+                        IX   = IX   - INCX
+  190                CONTINUE
+                     IF( NOUNIT )
+     $                  TEMP = TEMP/DCONJG( A( J, J ) )
+                  END IF
+                  X( JX ) = TEMP
+                  JX      = JX   - INCX
+  200          CONTINUE
+            END IF
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZTRSV .
+*
+      END
diff --git a/src/byte_mpi.f b/src/byte_mpi.f
new file mode 100644
index 0000000..5815646
--- /dev/null
+++ b/src/byte_mpi.f
@@ -0,0 +1,209 @@
+      subroutine byte_sync_mpi(mpi_fh)
+
+#ifdef MPIIO
+      include 'mpif.h'
+      call MPI_file_sync(mpi_fh,ierr)
+#endif
+
+      return
+      end
+C--------------------------------------------------------------------------
+      subroutine byte_open_mpi(fname,mpi_fh)
+
+      include 'SIZE'
+      include 'RESTART'
+
+#ifdef MPIIO
+      include 'mpif.h'
+
+      character*132 fname
+
+      if(nid.eq.pid0 .or. nid.eq.pid0r) then
+c        write(*,*) nid, 'call MPI_file_open',fname
+        call MPI_file_open(nekcomm_io,fname,
+     &                     MPI_MODE_RDWR+MPI_MODE_CREATE,
+     &                     MPI_INFO_NULL,mpi_fh,ierr)
+        if(ierr.ne.0) then
+          write(6,*) 'ABORT: Error in byte_open_mpi ', ierr
+          call exitt
+        endif
+      endif
+#else
+      write(6,*) 'byte_open_mpi: No MPI-IO support!'
+      call exitt
+#endif
+      return
+      end
+C--------------------------------------------------------------------------
+      subroutine byte_read_mpi(buf,icount,iorank,mpi_fh)
+
+      include 'SIZE'
+      include 'RESTART'
+
+#ifdef MPIIO
+      include 'mpif.h'
+
+      real*4 buf(1)          ! buffer
+
+      if(nid.eq.pid0 .or. nid.eq.pid0r) then
+        iout = 4*icount ! icount is in 4-byte words
+        if(iorank.ge.0 .and. nid.ne.iorank) iout = 0
+c        write(*,*) 'byte_read_mpi', nid, iout/4
+#ifdef MPIIO_NOCOL
+        call MPI_file_read(mpi_fh,buf,iout,MPI_BYTE,
+     &                     MPI_STATUS_IGNORE,ierr)
+#else
+        call MPI_file_read_all(mpi_fh,buf,iout,MPI_BYTE,
+     &                         MPI_STATUS_IGNORE,ierr)
+#endif
+        if(ierr.ne.0) then
+          write(6,*) 'ABORT: Error in byte_read_mpi ', ierr
+          call exitt
+        endif
+      endif
+#else
+      write(6,*) 'byte_read_mpi: No MPI-IO support!'
+      call exitt
+#endif
+
+      return
+      end
+C--------------------------------------------------------------------------
+      subroutine byte_write_mpi(buf,icount,iorank,mpi_fh)
+
+      include 'SIZE'
+      include 'RESTART'
+
+#ifdef MPIIO
+      include 'mpif.h'
+
+      real*4 buf(1)          ! buffer
+
+      if(nid.eq.pid0 .or. nid.eq.pid0r) then
+        iout = 4*icount ! icount is in 4-byte words
+        if(iorank.ge.0 .and. nid.ne.iorank) iout = 0
+c        write(*,*) 'byte_write', nid, iout/4
+#ifdef MPIIO_NOCOL
+        call MPI_file_write(mpi_fh,buf,iout,MPI_BYTE,
+     &                      MPI_STATUS_IGNORE,ierr)
+#else
+        call MPI_file_write_all(mpi_fh,buf,iout,MPI_BYTE,
+     &                          MPI_STATUS_IGNORE,ierr)
+#endif
+        if(ierr.ne.0) then
+          write(6,*) 'ABORT: Error in byte_write_mpi ', ierr
+          call exitt
+        endif
+      endif
+#else
+      write(6,*) 'byte_write_mpi: No MPI-IO support!'
+      call exitt
+#endif
+
+      return
+      end
+C--------------------------------------------------------------------------
+      subroutine byte_close_mpi(mpi_fh)
+
+      include 'SIZE'
+      include 'RESTART'
+
+#ifdef MPIIO
+      include 'mpif.h'
+      if(nid.eq.pid0 .or. nid.eq.pid0r) then
+        call MPI_file_close(mpi_fh,ierr)
+      endif
+#else
+      if(nid.eq.0) write(6,*) 'byte_close_mpi: No MPI-IO support!'
+      call exitt
+#endif
+
+      return
+      end
+C--------------------------------------------------------------------------
+      subroutine byte_set_view(ioff_in,mpi_fh)
+
+      include 'SIZE'
+      include 'RESTART'
+
+#ifdef MPIIO
+      include 'mpif.h'
+      integer*8 ioff_in
+    
+      if(nid.eq.pid0 .or. nid.eq.pid0r) then
+         if(ioff_in.lt.0) then
+           write(6,*) 'byte_set_view: offset<0!'
+           call exitt
+         endif
+c         write(*,*) 'dataoffset', nid, ioff_in
+         call MPI_file_set_view(mpi_fh,ioff_in,MPI_BYTE,MPI_BYTE,
+     &                          'native',MPI_INFO_NULL,ierr)
+         if(ierr.ne.0) then
+           write(6,*) 'ABORT: Error in byte_set_view ', ierr
+           call exitt
+         endif
+      endif
+#endif
+
+      return
+      end
+C--------------------------------------------------------------------------
+      subroutine nek_comm_io(nn)
+
+      include 'SIZE'
+      include 'RESTART'
+      include 'PARALLEL'
+
+#ifdef MPIIO
+      include 'mpif.h'
+      common /nekmpi/ mid,mp,nekcomm,nekgroup,nekreal
+      common /scrns/  irank_io(0:lp-1)
+
+#ifdef MPIIO_NOCOL
+      if(nid.eq.0) then
+        j = 0
+        if(nid.eq.pid0 .or. nid.eq.pid0r) then
+          irank_io(j) = nid
+          j = j + 1
+        endif
+        do ir = 1,np-1
+          call csend(ir,idum,4,ir,0)           ! handshake
+          call crecv(ir,ibuf,4)
+          if(ibuf.gt.0) then 
+            irank_io(j) = ibuf
+            j = j + 1
+          endif 
+        enddo
+      else
+         mtype = nid
+         ibuf = -1
+         if(nid.eq.pid0) then
+           ibuf = nid
+         endif
+         call crecv(mtype,idum,4)                ! hand-shake
+         call csend(mtype,ibuf,4,0,0)            ! u4 :=: u8
+      endif
+
+      call bcast(irank_io,isize*nn)
+
+c      write(6,*) 'nid', nid, (irank_io(i),i=0,nn-1)
+
+      call mpi_comm_group (nekcomm,nekgroup,ierr)
+      if(ierr.gt.0) call exitt
+      call mpi_group_incl (nekgroup,nn,irank_io,nekgroup_io,ierr)
+      if(ierr.gt.0) call exitt
+      call mpi_comm_create(nekcomm,nekgroup_io,nekcomm_io,ierr)
+      if(ierr.gt.0) call exitt
+      call mpi_group_free (nekgroup_io,ierr)
+      if(ierr.gt.0) call exitt
+      call mpi_group_free (nekgroup,ierr)
+      if(ierr.gt.0) call exitt
+#else
+      nekcomm_io = nekcomm
+      return    
+#endif
+
+#endif
+
+      return
+      end
diff --git a/src/cg.f b/src/cg.f
new file mode 100644
index 0000000..b4214dd
--- /dev/null
+++ b/src/cg.f
@@ -0,0 +1,335 @@
+#ifdef TIMERS
+#define NBTIMER(a) a = dnekclock()
+#define STIMER(a) a = dnekclock_sync()
+#define ACCUMTIMER(b,a) b = b + (dnekclock()- a )
+#else
+#define NBTIMER(a)
+#define STIMER(a)
+#define ACCUMTIMER(a,b)
+#endif
+
+
+c-----------------------------------------------------------------------
+      subroutine cg(x,f,g,c,r,w,p,z,n,niter,flop_cg)
+
+#if defined(XSMM_DISPATCH)
+      USE :: LIBXSMM
+#endif
+
+      include 'SIZE'
+      include 'TIMER'
+
+
+c     Solve Ax=f where A is SPD and is invoked by ax()
+c
+c     Output:  x - vector of length n
+c
+c     Input:   f - vector of length n
+c     Input:   g - geometric factors for SEM operator
+c     Input:   c - inverse of the counting matrix
+c
+c     Work arrays:   r,w,p,z  - vectors of length n
+c
+c     User-provided ax(w,z,n) returns  w := Az,  
+c
+c     User-provided solveM(z,r,n) ) returns  z := M^-1 r,  
+c
+      parameter (lt=lx1*ly1*lz1*lelt)
+c     real ur(lt),us(lt),ut(lt)
+
+c     parameter (lxyz=lx1*ly1*lz1)
+c     real ur(lxyz),us(lxyz),ut(lxyz),wk(lxyz)
+
+      real x(n),f(n),r(n),w(n),p(n),z(n),g(1),c(n)
+      real rnorminit, fbeta, fpap, falpha, frnorm
+
+      integer*8 flop_cg
+      integer thread, numth, find, lind, fel, lel
+      integer omp_get_thread_num, omp_get_num_threads
+      integer fiter, tmt
+
+      pap = 0.0
+
+c     set machine tolerances
+      one = 1.
+      eps = 1.e-20
+      if (one+eps .eq. one) eps = 1.e-14
+      if (one+eps .eq. one) eps = 1.e-7
+
+      rtz1=1.0
+      miter = niter
+
+c$OMP PARALLEL DEFAULT(shared) PRIVATE(thread,numth,find,lind,iter,
+c$OMP&  fel,lel,rtz2,beta,alpha,alphm,rlim2,rtr0,tmt,ttemp1)
+
+      thread = 0
+      numth = 1
+#ifdef _OPENMP
+      thread = omp_get_thread_num()
+      numth = omp_get_num_threads()
+#endif
+      tmt = thread + 1
+
+      if (numth < nelt) then
+        fel = (thread*nelt)/numth + 1
+        lel = ((thread+1)*nelt)/numth
+      else
+        if (thread < nelt) then
+          fel = thread + 1
+          lel = fel
+        else
+          fel = nelt+1
+          lel = nelt
+        end if
+      end if
+
+      find = (fel-1) *(nx1*ny1*nz1)+1
+      lind = lel * (nx1*ny1*nz1)
+
+      NBTIMER(ttemp1)
+      call rzeroi(x,n,find,lind)
+      ACCUMTIMER(trzero(tmt), ttemp1)
+
+      NBTIMER(ttemp1)
+      call copyi(r,f,n,find,lind)
+      ACCUMTIMER(tcopy(tmt), ttemp1)
+
+      if (thread == 0) call mask (r)   ! Zero out Dirichlet conditions
+
+      gopi(tmt)=1
+      NBTIMER(ttemp1)
+      call glsc3i(rnorminit,r,c,r,n,find,lind)
+      ACCUMTIMER(tglsc3a(tmt), ttemp1)
+
+
+      do iter=1,miter
+#ifdef LOG
+         if ((nid.eq.0) .and. (thread.eq.0)) write(*,*) "iter = ", iter
+#endif
+         NBTIMER(ttemp1)
+         call solveMi(z,r,n,find,lind)    ! preconditioner here
+         ACCUMTIMER(tsolvem(tmt), ttemp1)
+
+         rtz2=rtz1                                                       ! OPS
+         gopi(tmt)=2
+         NBTIMER(ttemp1)
+         call glsc3i(rtz1,r,c,z,n,find,lind)
+         ACCUMTIMER(tglsc3b(tmt), ttemp1)
+
+         beta = rtz1/rtz2
+         if (iter.eq.1) beta=0.0
+
+         NBTIMER(ttemp1)
+         call add2s1i(p,z,beta,n,find,lind)                              ! 2n
+         ACCUMTIMER(tadd2s1(tmt), ttemp1)
+
+         call axi(w,p,g,n,fel,lel,find,lind)                             ! flopa
+
+         gopi(tmt)=3
+         NBTIMER(ttemp1)
+         call glsc3i(pap, w,c,p,n,find,lind)                             ! 3n
+         ACCUMTIMER(tglsc3c(tmt), ttemp1)
+
+         alpha=rtz1/pap
+         alphm=-alpha
+
+         NBTIMER(ttemp1)
+         call add2s2i(x,p,alpha,n,find,lind)                             ! 2n
+         ACCUMTIMER(tadd2s2b(tmt), ttemp1)
+
+         NBTIMER(ttemp1)
+         call add2s2i(r,w,alphm,n,find,lind)                             ! 2n
+         ACCUMTIMER(tadd2s2c(tmt), ttemp1)
+
+         gopi(tmt)=4
+         NBTIMER(ttemp1)
+         call  glsc3i(rtr, r,c,r,n,find,lind)                            ! 3n
+         ACCUMTIMER(tglsc3d(tmt), ttemp1)
+
+         if (iter.eq.1) rlim2 = rtr*eps**2
+         if (iter.eq.1) rtr0  = rtr
+         rnorm = sqrt(rtr)
+
+      enddo
+
+      if (thread == 0) then
+        fiter = iter
+        fbeta = beta
+        falpha= alpha
+        fpap  = pap
+        frnorm = rnorm
+      end if
+
+c$OMP END PARALLEL
+
+    6    format('cg:',i4,1p4e12.4)
+
+      if (nid.eq.0) then
+        write(6,6) 0,sqrt(rnorminit)
+        write(6,6) fiter,frnorm,falpha,fbeta,fpap
+      end if
+
+      flop_cg = flop_cg + (fiter-1)*15_8*n + 3_8*n
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine solveM(z,r,n)
+      real z(n),r(n)
+
+      call copy(z,r,n)
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine axi(w,u,gxyz,n,fel,lel,find,lind) ! Matrix-vector product: w=A*u
+
+      include 'SIZE'
+      include 'TOTAL'
+      include 'TIMER'
+
+      parameter (lxyz=lx1*ly1*lz1)
+      real w(nx1*ny1*nz1,nelt),u(nx1*ny1*nz1,nelt)
+      real gxyz(2*ldim,nx1*ny1*nz1,nelt)
+      parameter (lt=lx1*ly1*lz1*lelt)
+
+      integer fel, lel, find, lind
+      integer e,thread, tmt, omp_get_thread_num
+
+      thread = 0
+#ifdef _OPENMP
+      thread = omp_get_thread_num()
+#endif
+      tmt = thread + 1
+
+      do e= fel, lel
+         call ax_e( w(1,e),u(1,e),gxyz(1,1,e))
+      enddo
+
+      NBTIMER(ttemp2)
+      call gs_op(gsh,w,1,1,0)  ! Gather-scatter operation  ! w   = QQ  w
+      ACCUMTIMER(tgsop(tmt),ttemp2)
+                                                           !            L
+      NBTIMER(ttemp2)
+      call add2s2i(w,u,.1,n,find,lind)
+      ACCUMTIMER(tadd2s2a(tmt),ttemp2)
+
+      if (find == 1) then
+        call mask(w)             ! Zero out Dirichlet conditions
+        nxyz=nx1*ny1*nz1
+        flop_a = flop_a + (19_8*nxyz+12_8*nx1*nxyz)*nelt
+      end if
+
+      return
+      end
+c-------------------------------------------------------------------------
+      subroutine ax1(w,u,n)
+      include 'SIZE'
+      real w(n),u(n)
+      real h2i
+  
+      h2i = (n+1)*(n+1)  
+      do i = 2,n-1
+         w(i)=h2i*(2*u(i)-u(i-1)-u(i+1))
+      enddo
+      w(1)  = h2i*(2*u(1)-u(2  ))
+      w(n)  = h2i*(2*u(n)-u(n-1))
+
+      return
+      end
+c-------------------------------------------------------------------------
+      subroutine ax_e(w,u,g) ! Local matrix-vector product
+
+      include 'SIZE'
+      include 'TOTAL'
+      include 'TIMER'
+
+      parameter (lxyz=lx1*ly1*lz1)
+      real w(lxyz),u(lxyz),g(2*ldim,lxyz)
+      real ur(nx1*ny1*nz1),us(nx1*ny1*nz1),ut(nx1*ny1*nz1)
+      integer thread, tmt, omp_get_thread_num
+
+      thread = 0
+#ifdef _OPENMP
+      thread = omp_get_thread_num()
+#endif
+      tmt = thread + 1
+
+      nxyz = nx1*ny1*nz1
+      n    = nx1-1
+
+      NBTIMER(ttemp3)
+      call local_grad3(ur,us,ut,u,n,dxm1,dxtm1)
+      ACCUMTIMER(tlocalgrad3(tmt),ttemp3)
+
+      NBTIMER(ttemp3)
+      do i=1,nxyz
+         wr = g(1,i)*ur(i) + g(2,i)*us(i) + g(3,i)*ut(i)
+         ws = g(2,i)*ur(i) + g(4,i)*us(i) + g(5,i)*ut(i)
+         wt = g(3,i)*ur(i) + g(5,i)*us(i) + g(6,i)*ut(i)
+         ur(i) = wr
+         us(i) = ws
+         ut(i) = wt
+      enddo
+      ACCUMTIMER(twrwswt(tmt),ttemp3)
+
+      NBTIMER(ttemp3)
+      call local_grad3_t(w,ur,us,ut,n,dxm1,dxtm1)
+      ACCUMTIMER(tlocalgrad3t(tmt),ttemp3)
+
+      return
+      end
+c-------------------------------------------------------------------------
+      subroutine local_grad3(ur,us,ut,u,n,D,Dt)
+c     Output: ur,us,ut         Input:u,n,D,Dt
+      real ur(0:n,0:n,0:n),us(0:n,0:n,0:n),ut(0:n,0:n,0:n)
+      real u (0:n,0:n,0:n)
+      real D (0:n,0:n),Dt(0:n,0:n)
+      integer e
+
+      m1 = n+1
+      m2 = m1*m1
+
+      call mxm(D ,m1,u,m1,ur,m2)
+      do k=0,n
+         call mxm(u(0,0,k),m1,Dt,m1,us(0,0,k),m1)
+      enddo
+      call mxm(u,m2,Dt,m1,ut,m1)
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine local_grad3_t(u,ur,us,ut,N,D,Dt)
+c     Output: ur,us,ut         Input:u,N,D,Dt
+      real u (0:N,0:N,0:N)
+      real ur(0:N,0:N,0:N),us(0:N,0:N,0:N),ut(0:N,0:N,0:N)
+      real D (0:N,0:N),Dt(0:N,0:N)
+      real w (0:N,0:N,0:N)
+      integer e
+
+      m1 = N+1
+      m2 = m1*m1
+      m3 = m1*m1*m1
+
+      call mxm(Dt,m1,ur,m1,u,m2)
+
+      do k=0,N
+         call mxm(us(0,0,k),m1,D ,m1,w(0,0,k),m1)
+      enddo
+      call add2(u,w,m3)
+
+      call mxm(ut,m2,D ,m1,w,m1)
+      call add2(u,w,m3)
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mask(w)   ! Zero out Dirichlet conditions
+      include 'SIZE'
+      real w(1)
+
+      if (nid.eq.0) w(1) = 0.  ! suitable for solvability
+
+      return
+      end
+c-----------------------------------------------------------------------
diff --git a/src/comm_mpi.f b/src/comm_mpi.f
new file mode 100644
index 0000000..26f1bc0
--- /dev/null
+++ b/src/comm_mpi.f
@@ -0,0 +1,1212 @@
+c-----------------------------------------------------------------------
+      subroutine iniproc(intracomm)
+      include 'SIZE'
+      include 'PARALLEL'
+      include 'mpif.h'
+
+      common /nekmpi/ nid_,np_,nekcomm,nekgroup,nekreal
+
+      logical flag
+      integer provided
+
+      call mpi_initialized(mpi_is_initialized, ierr) !  Initialize MPI
+      if ( mpi_is_initialized .eq. 0 ) then
+#ifdef MPITHREADS
+        call mpi_init_thread (MPI_THREAD_MULTIPLE,provided,ierr)
+#else 
+        call mpi_init (ierr)
+#endif
+      endif
+
+      ! create communicator
+      call init_nek_comm(intracomm)
+      np  = np_
+      nid = nid_
+
+      if(nid.eq.0) call printHeader
+
+      ! check upper tag size limit
+      call mpi_attr_get(MPI_COMM_WORLD,MPI_TAG_UB,nval,flag,ierr)
+      if (nval.lt.(10000+max(lp,lelg))) then
+         if(nid.eq.0) write(6,*) 'WARNING: MPI_TAG_UB too small!', nval
+c        call exitt
+      endif
+
+      IF (NP.GT.LP) THEN
+         WRITE(6,*) 
+     $   'ERROR: Code compiled for a max of',LP,' processors.'
+         WRITE(6,*) 
+     $   'Recompile with LP =',NP,' or run with fewer processors.'
+         WRITE(6,*) 
+     $   'Aborting in routine INIPROC.'
+         call exitt
+      endif
+
+      ! set word size for REAL
+      wdsize=4
+      eps=1.0e-12
+      oneeps = 1.0+eps
+      if (oneeps.ne.1.0) then
+         wdsize=8
+      else
+         if(nid.eq.0) 
+     &     write(6,*) 'ABORT: single precision mode not supported!'
+         call exitt
+      endif
+      nekreal = mpi_real
+      if (wdsize.eq.8) nekreal = mpi_double_precision
+
+      ifdblas = .false.
+      if (wdsize.eq.8) ifdblas = .true.
+
+      ! set word size for INTEGER
+      ! HARDCODED since there is no secure way to detect an int overflow
+      isize = 4
+
+      ! set word size for LOGICAL
+      lsize = 4
+
+      ! set word size for CHARACTER
+      csize = 1
+c
+      PID = 0
+      NULLPID=0
+      NODE0=0
+      NODE= NID+1
+
+      if (nid.eq.0) then 
+         write(6,*) 'Number of processors:',np
+         WRITE(6,*) 'REAL    wdsize      :',WDSIZE
+         WRITE(6,*) 'INTEGER wdsize      :',ISIZE
+      endif
+
+      call crystal_setup(cr_h,nekcomm,np)  ! set cr handle to new instance
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine init_nek_comm(intracomm)
+      include 'mpif.h'
+      common /nekmpi/ nid_,np_,nekcomm,nekgroup,nekreal
+C
+      call create_comm(intracomm) ! set up nekton specific communicator
+c
+      nid_  = mynode()
+      np_   = numnodes()
+c
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine gop( x, w, op, n)
+c
+c     Global vector commutative operation
+c
+      include 'mpif.h'
+      common /nekmpi/ nid,np,nekcomm,nekgroup,nekreal
+c
+      real x(n), w(n)
+      character*3 op
+c
+      if (op.eq.'+  ') then
+         call mpi_allreduce (x,w,n,nekreal,mpi_sum ,nekcomm,ierr)
+      elseif (op.EQ.'M  ') then
+         call mpi_allreduce (x,w,n,nekreal,mpi_max ,nekcomm,ierr)
+      elseif (op.EQ.'m  ') then
+         call mpi_allreduce (x,w,n,nekreal,mpi_min ,nekcomm,ierr)
+      elseif (op.EQ.'*  ') then
+         call mpi_allreduce (x,w,n,nekreal,mpi_prod,nekcomm,ierr)
+      else
+         write(6,*) nid,' OP ',op,' not supported.  ABORT in GOP.'
+         call exitt
+      endif
+
+      call copy(x,w,n)
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine igop( x, w, op, n)
+c
+c     Global vector commutative operation
+c
+      include 'mpif.h'
+      common /nekmpi/ nid,np,nekcomm,nekgroup,nekreal
+
+      integer x(n), w(n)
+      character*3 op
+
+      if     (op.eq.'+  ') then
+        call mpi_allreduce (x,w,n,mpi_integer,mpi_sum ,nekcomm,ierr)
+      elseif (op.EQ.'M  ') then
+        call mpi_allreduce (x,w,n,mpi_integer,mpi_max ,nekcomm,ierr)
+      elseif (op.EQ.'m  ') then
+        call mpi_allreduce (x,w,n,mpi_integer,mpi_min ,nekcomm,ierr)
+      elseif (op.EQ.'*  ') then
+        call mpi_allreduce (x,w,n,mpi_integer,mpi_prod,nekcomm,ierr)
+      else
+        write(6,*) nid,' OP ',op,' not supported.  ABORT in igop.'
+        call exitt
+      endif
+
+      call icopy(x,w,n)
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine i8gop( x, w, op, n)
+c
+c     Global vector commutative operation
+c
+      include 'mpif.h'
+      common /nekmpi/ nid,np,nekcomm,nekgroup,nekreal
+
+      integer*8 x(n), w(n)
+      character*3 op
+
+      if     (op.eq.'+  ') then
+        call mpi_allreduce (x,w,n,mpi_integer8,mpi_sum ,nekcomm,ierr)
+      elseif (op.EQ.'M  ') then
+        call mpi_allreduce (x,w,n,mpi_integer8,mpi_max ,nekcomm,ierr)
+      elseif (op.EQ.'m  ') then
+        call mpi_allreduce (x,w,n,mpi_integer8,mpi_min ,nekcomm,ierr)
+      elseif (op.EQ.'*  ') then
+        call mpi_allreduce (x,w,n,mpi_integer8,mpi_prod,nekcomm,ierr)
+      else
+        write(6,*) nid,' OP ',op,' not supported.  ABORT in igop.'
+        call exitt
+      endif
+
+      call i8copy(x,w,n)
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine csend(mtype,buf,len,jnid,jpid)
+      include 'mpif.h'
+      common /nekmpi/ nid,np,nekcomm,nekgroup,nekreal
+      real*4 buf(1)
+
+      call mpi_send (buf,len,mpi_byte,jnid,mtype,nekcomm,ierr)
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine crecv(mtype,buf,lenm)
+      include 'mpif.h'
+      common /nekmpi/ nid,np,nekcomm,nekgroup,nekreal
+      integer status(mpi_status_size)
+C
+      real*4 buf(1)
+      len = lenm
+      jnid = mpi_any_source
+
+      call mpi_recv (buf,len,mpi_byte
+     $              ,jnid,mtype,nekcomm,status,ierr)
+c
+      if (len.gt.lenm) then 
+          write(6,*) nid,'long message in mpi_crecv:',len,lenm
+          call exitt
+      endif
+c
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine crecv3(mtype,buf,len,lenm)
+      include 'mpif.h'
+      common /nekmpi/ nid,np,nekcomm,nekgroup,nekreal
+      integer status(mpi_status_size)
+C
+      real*4 buf(1)
+      len = lenm
+      jnid = mpi_any_source
+
+      call mpi_recv (buf,len,mpi_byte
+     $            ,jnid,mtype,nekcomm,status,ierr)
+      call mpi_get_count (status,mpi_byte,len,ierr)
+c
+      if (len.gt.lenm) then 
+          write(6,*) nid,'long message in mpi_crecv:',len,lenm
+          call exitt
+      endif
+c
+      return
+      end
+c-----------------------------------------------------------------------
+      integer function numnodes()
+      include 'mpif.h'
+      common /nekmpi/ nid,np,nekcomm,nekgroup,nekreal
+
+      call mpi_comm_size (nekcomm, numnodes , ierr)
+
+      return
+      end
+c-----------------------------------------------------------------------
+      integer function mynode()
+      include 'mpif.h'
+      common /nekmpi/ nid,np,nekcomm,nekgroup,nekreal
+      integer myid
+
+      call mpi_comm_rank (nekcomm, myid, ierr)
+      mynode = myid
+
+      return
+      end
+c-----------------------------------------------------------------------
+      real*8 function dnekclock()
+      implicit none
+
+#if defined (MPITIMER)
+      include 'mpif.h'
+      dnekclock=mpi_wtime()
+#elif defined (BGQTIMER)
+      double precision readtimebase_double
+      external readtimebase_double
+      dnekclock = 0.625D-9*ReadTimeBase_Double()
+#elif defined (CGTTIMER)
+      double precision fclock_gettime
+      external fclock_gettime
+      dnekclock = fclock_gettime()
+#else 
+      integer*8 countval, countrate, countmax
+      double precision countd
+      call system_clock(countval, countrate, countmax)
+      countd = countval
+      dnekclock = countd/countrate
+#endif
+
+      return
+      end
+c-----------------------------------------------------------------------
+      real*8 function dnekclock_sync()
+      real*8 dnekclock
+      external dnekclock
+c
+      call nekgsync()
+      dnekclock_sync=dnekclock()
+c
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine lbcast(ifif)
+C
+C     Broadcast logical variable to all processors.
+C
+      include 'SIZE'
+      include 'PARALLEL'
+      include 'mpif.h'
+
+      logical ifif
+
+      if (np.eq.1) return
+
+      item=0
+      if (ifif) item=1
+      call bcast(item,isize)
+      ifif=.false.
+      if (item.eq.1) ifif=.true.
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine bcast(buf,len)
+      include 'mpif.h'
+      common /nekmpi/ nid,np,nekcomm,nekgroup,nekreal
+      real*4 buf(1)
+
+      call mpi_bcast (buf,len,mpi_byte,0,nekcomm,ierr)
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine create_comm(intracomm)
+      include 'mpif.h'
+      common /nekmpi/ nid,np,nekcomm,nekgroup,nekreal
+
+c      call mpi_comm_group (mpi_comm_world,itmp,ierr)
+c      call mpi_comm_create (mpi_comm_world,itmp,icomm,ierr)
+c      call mpi_group_free (itmp,ierr)
+
+      call mpi_comm_dup(intracomm,nekcomm,ierr)
+
+c     write(6,*) 'nekcomm:',nekcomm
+
+      return
+      end
+c-----------------------------------------------------------------------
+      function isend(msgtag,x,len,jnid,jpid)
+c
+c     Note: len in bytes
+c
+      integer x(1)
+C
+      include 'mpif.h'
+      common /nekmpi/ nid,np,nekcomm,nekgroup,nekreal
+C
+      call mpi_isend (x,len,mpi_byte,jnid,msgtag
+     $       ,nekcomm,imsg,ierr)
+      isend = imsg
+c     write(6,*) nid,' isend:',imsg,msgtag,len,jnid,(x(k),k=1,len/4)
+c
+      return
+      end
+c-----------------------------------------------------------------------
+      function irecv(msgtag,x,len)
+c
+c     Note: len in bytes
+c
+      integer x(1)
+C
+      include 'mpif.h'
+      common /nekmpi/ nid,np,nekcomm,nekgroup,nekreal
+C
+      call mpi_irecv (x,len,mpi_byte,mpi_any_source,msgtag
+     $       ,nekcomm,imsg,ierr)
+      irecv = imsg
+c     write(6,*) nid,' irecv:',imsg,msgtag,len
+c
+c
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine msgwait(imsg)
+c
+      include 'mpif.h'
+      common /nekmpi/ nid,np,nekcomm,nekgroup,nekreal
+      integer status(mpi_status_size)
+c
+c     write(6,*) nid,' msgwait:',imsg
+c
+      call mpi_wait (imsg,status,ierr)
+c
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine nekgsync()
+
+      include 'mpif.h'
+      common /nekmpi/ nid,np,nekcomm,nekgroup,nekreal
+
+      call mpi_barrier(nekcomm,ierr)
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine exittr(stringi,rdata,idata)
+      character*1 stringi(132)
+      character*1 stringo(132)
+      character*25 s25
+      include 'SIZE'
+      include 'TOTAL'
+
+      call blank(stringo,132)
+      call chcopy(stringo,stringi,132)
+      len = indx1(stringo,'$',1)
+      write(s25,25) rdata,idata
+   25 format(1x,1p1e14.6,i10)
+      call chcopy(stringo(len),s25,25)
+
+      if (nid.eq.0) write(6,1) (stringo(k),k=1,len+24)
+    1 format('EXIT: ',132a1)
+
+      call exitt
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine exitti(stringi,idata)
+      character*1 stringi(132)
+      character*1 stringo(132)
+      character*11 s11
+      include 'SIZE'
+      include 'TOTAL'
+
+      call blank(stringo,132)
+      call chcopy(stringo,stringi,132)
+      len = indx1(stringo,'$',1)
+      write(s11,11) idata
+   11 format(1x,i10)
+      call chcopy(stringo(len),s11,11)
+
+      if (nid.eq.0) write(6,1) (stringo(k),k=1,len+10)
+    1 format('EXIT: ',132a1)
+
+      call exitt
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine err_chk(ierr,string)
+      character*1 string(132)
+      character*1 ostring(132)
+      character*10 s10
+      include 'SIZE'
+      include 'TOTAL'
+
+      ierr = iglsum(ierr,1)
+      if(ierr.eq.0) return 
+
+      len = indx1(string,'$',1)
+      call blank(ostring,132)
+      write(s10,11) ierr
+   11 format(1x,' ierr=',i3)
+
+      call chcopy(ostring,string,len-1)
+      call chcopy(ostring(len),s10,10)
+
+      if (nid.eq.0) write(6,1) (ostring(k),k=1,len+10)
+    1 format('ERROR: ',132a1)
+
+      call exitt
+
+      return
+      end
+c
+c-----------------------------------------------------------------------
+      subroutine exitt0
+      include 'SIZE'
+      include 'TOTAL'
+      include 'mpif.h'
+
+#ifdef BGQ
+#define M_EXIT(X) exit_((X))
+#else
+#define M_EXIT(X) exit((X))
+#endif
+
+      real*4 papi_mflops
+      integer*8 papi_flops
+
+      if(nid.eq.0) write(6,*) 'Exitting....'
+
+c     call print_stack()
+
+      call mpi_finalize (ierr)
+      call M_EXIT(0)
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine exitt
+      include 'SIZE'
+      include 'TOTAL'
+      include 'mpif.h'
+
+      real*4 papi_mflops
+      integer*8 papi_flops
+c
+      call nekgsync()
+
+#ifdef PAPI
+      call nek_flops(papi_flops,papi_mflops)
+#endif
+
+      tstop  = dnekclock()
+      ttotal = tstop-etimes
+      nxyz   = nx1*ny1*nz1
+
+      if (nid.eq.0) then 
+         dtmp1 = 0
+         dtmp2 = 0
+         dtmp3 = 0
+         if(istep.gt.0) then
+           dgp   = nvtot
+           dgp   = max(dgp,1.)
+           dtmp1 = np*ttime/(dgp*max(istep,1))
+           dtmp2 = ttime/max(istep,1)
+           dtmp3 = 1.*papi_flops/1e6
+         endif 
+         write(6,*) ' '
+         write(6,'(A)') 'call exitt: dying ...'
+         write(6,*) ' '
+c        call print_stack()
+         write(6,*) ' '
+         write(6,'(4(A,1p1e13.5,A,/))') 
+     &       'total elapsed time             : ',ttotal, ' sec'
+     &      ,'total solver time incl. I/O    : ',ttime , ' sec'
+     &      ,'time/timestep                  : ',dtmp2 , ' sec'
+     &      ,'CPU seconds/timestep/gridpt    : ',dtmp1 , ' sec'
+#ifdef PAPI
+         write(6,'(2(A,1g13.5,/))') 
+     &       'Gflops                         : ',dtmp3/1000.
+     &      ,'Gflops/s                       : ',papi_mflops/1000.
+#endif
+      endif 
+
+      nz1 = 1/(nx1-ny1)
+
+      call mpi_finalize (ierr)
+      call exit(0)
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine printHeader
+
+
+      return
+      end
+c-----------------------------------------------------------------------
+      function igl_running_sum(in)
+c
+      include 'mpif.h'
+      common /nekmpi/ nid,np,nekcomm,nekgroup,nekreal
+      integer status(mpi_status_size)
+      integer x,w,r
+
+      x = in  ! running sum
+      w = in  ! working buff
+      r = 0   ! recv buff
+
+      call mpi_scan(x,r,1,mpi_integer,mpi_sum,nekcomm,ierr)
+      igl_running_sum = r
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine platform_timer(ivb) ! mxm, ping-pong, and all_reduce timer
+
+      include 'SIZE'
+      include 'TOTAL'
+
+
+      call mxm_test_all(nid,ivb)  ! measure mxm times
+c     call exitti('done mxm_test_all$',ivb)
+
+      call comm_test(ivb)         ! measure message-passing and all-reduce times
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine comm_test(ivb) ! measure message-passing and all-reduce times
+                                ! ivb = 0 --> minimal verbosity
+                                ! ivb = 1 --> fully verbose
+                                ! ivb = 2 --> smaller sample set(shorter)
+
+      include 'SIZE'
+      include 'PARALLEL'
+
+      call gop_test(ivb)   ! added, Jan. 8, 2008
+
+      log_np=log2(np)
+      np2 = 2**log_np
+      if (np2.eq.np) call gp2_test(ivb)   ! added, Jan. 8, 2008
+
+      io = 6
+      n512 = min(512,np-1)
+
+      do nodec=1,n512
+         nodeb=nodec
+         if (nodec.gt.256.and.np.gt.512)
+     $      nodeb = 256+(np-256)*(nodec-256)/(n512-256) - 1
+         nodeb = min(nodeb,np-1)
+
+         call pingpong(alphas,betas,0,nodeb,.0005,io,ivb)
+         if (nid.eq.0) write(6,1) nodeb,np,alphas,betas
+    1    format(2i10,1p2e15.7,' alpha beta')
+      enddo
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine pingpong(alphas,betas,nodea,nodeb,dt,io,ivb)
+
+      include 'SIZE'
+      common /nekmpi/ mid,np,nekcomm,nekgroup,nekreal
+
+      parameter  (lt=lx1*ly1*lz1*lelt)
+      parameter (mwd1 = 3*lt,mwd2=100 000, mwd = max(mwd1,mwd2))
+      common /scrns/ x(mwd),y(mwd)
+
+      include 'mpif.h'
+      integer status(mpi_status_size)
+
+      character*10 fname
+
+      if (nid.eq.nodea) then
+         write(fname,3) np,nodeb
+    3    format('t',i4.4,'.',i4.4)
+         if (io.ne.6) open (unit=io,file=fname)
+      endif
+
+      call nekgsync
+      call get_msg_vol(msg_vol,dt,nodea,nodeb) ! Est. msg vol for dt s
+
+      nwds = 0
+      if (nid.eq.nodea.and.ivb.gt.0) write(io,*)
+
+      betas = 0  ! Reported inverse bandwidth
+      count = 0
+
+      do itest = 1,500
+
+         nloop = msg_vol/(nwds+2)
+         nloop = min(nloop,1000)
+         nloop = max(nloop,1)
+
+         len   = 8*nwds
+     
+         call ping_loop(t1,t0,len,nloop,nodea,nodeb,nid,x,y,x,y)
+
+         if (nid.eq.nodea) then
+            tmsg = (t1-t0)/(2*nloop)   ! 2*nloop--> Double Buffer
+            tmsg = tmsg / 2.           ! one-way cost = 1/2 round-trip
+            tpwd = tmsg                ! time-per-word
+            if (nwds.gt.0) tpwd = tmsg/nwds
+            if (ivb.gt.0) write(io,1) nodeb,np,nloop,nwds,tmsg,tpwd
+    1       format(3i6,i12,1p2e16.8,' pg')
+
+            if (nwds.eq.1) then
+               alphas = tmsg
+            elseif (nwds.gt.10000) then   ! "average" beta
+               betas = (betas*count + tpwd)/(count+1)
+               count = count + 1
+            endif
+         endif
+
+         if (ivb.eq.2) then
+            nwds = (nwds+1)*1.25
+         else
+            nwds = (nwds+1)*1.016
+         endif
+         if (nwds.gt.mwd) then
+c        if (nwds.gt.1024) then
+            if (nid.eq.nodea.and.io.ne.6) close(unit=io)
+            call nekgsync
+            return
+         endif
+
+      enddo
+
+      if (nid.eq.nodea.and.io.ne.6) close(unit=io)
+      call nekgsync
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine pingpongo(alphas,betas,nodea,nodeb,dt,io,ivb)
+
+      include 'SIZE'
+      common /nekmpi/ mid,np,nekcomm,nekgroup,nekreal
+
+      parameter  (lt=lx1*ly1*lz1*lelt)
+      parameter (mwd1 = 3*lt,mwd2=100 000, mwd = max(mwd1,mwd2))
+      common /scrns/ x(mwd),y(mwd)
+
+      include 'mpif.h'
+      integer status(mpi_status_size)
+
+      character*10 fname
+
+      if (nid.eq.nodea) then
+         write(fname,3) np,nodeb
+    3    format('t',i4.4,'.',i4.4)
+         if (io.ne.6) open (unit=io,file=fname)
+      endif
+
+      call nekgsync
+      call get_msg_vol(msg_vol,dt,nodea,nodeb) ! Est. msg vol for dt s
+
+      nwds = 0
+      if (nid.eq.nodea.and.ivb.gt.0) write(io,*)
+
+      betas = 0  ! Reported inverse bandwidth
+      count = 0
+
+      do itest = 1,500
+         call nekgsync
+         nloop = msg_vol/(nwds+2)
+         nloop = min(nloop,1000)
+         nloop = max(nloop,1)
+
+         len   = 8*nwds
+         jnid = mpi_any_source
+
+         if (nid.eq.nodea) then
+
+            msg  = irecv(itest,y,1)
+            call csend(itest,x,1,nodeb,0)   ! Initiate send, to synch.
+            call msgwait(msg)
+
+            t0 = mpi_wtime ()
+            do i=1,nloop
+               call mpi_irecv(y,len,mpi_byte,mpi_any_source,i
+     $                        ,nekcomm,msg,ierr)
+               call mpi_send (x,len,mpi_byte,nodeb,i,nekcomm,ierr)
+               call mpi_wait (msg,status,ierr)
+            enddo
+            t1 = mpi_wtime ()
+            tmsg = (t1-t0)/nloop
+            tmsg = tmsg / 2.       ! Round-trip message time = twice one-way
+            tpwd = tmsg
+            if (nwds.gt.0) tpwd = tmsg/nwds
+            if (ivb.gt.0) write(io,1) nodeb,np,nloop,nwds,tmsg,tpwd
+    1       format(3i6,i12,1p2e16.8,' pg')
+
+            if (nwds.eq.1) then
+               alphas = tmsg
+            elseif (nwds.gt.10000) then
+               betas = (betas*count + tpwd)/(count+1)
+               count = count + 1
+            endif
+
+         elseif (nid.eq.nodeb) then
+
+            call crecv(itest,y,1)           ! Initiate send, to synch.
+            call csend(itest,x,1,nodea,0)
+
+            t0 = dnekclock()
+            do i=1,nloop
+               call mpi_recv (y,len,mpi_byte
+     $               ,jnid,i,nekcomm,status,ierr)
+               call mpi_send (x,len,mpi_byte,nodea,i,nekcomm,ierr)
+            enddo
+            t1 = dnekclock()
+            tmsg = (t1-t0)/nloop
+
+         endif
+
+         nwds = (nwds+1)*1.016
+         if (nwds.gt.mwd) then
+            if (nid.eq.nodea.and.io.ne.6) close(unit=io)
+            call nekgsync
+            return
+         endif
+
+      enddo
+
+      if (nid.eq.nodea.and.io.ne.6) close(unit=io)
+      call nekgsync
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine get_msg_vol(msg_vol,dt,nodea,nodeb)
+      include 'SIZE'
+      common /nekmpi/ mid,np,nekcomm,nekgroup,nekreal
+      parameter (lt=lx1*ly1*lz1*lelt)
+      common /scrns/ x(3*lt),y(3*lt)
+!
+!     Est. msg vol for dt s
+!
+      msg_vol = 1000
+
+      nwds  = min(1000,lt)
+      nloop = 50
+ 
+      tmsg = 0.
+      call gop(tmsg,t1,'+  ',1)
+
+      len = 8*nwds
+      if (nid.eq.nodea) then
+
+         msg  = irecv(1,y,1)
+         call csend(1,x,1,nodeb,0)   ! Initiate send, to synch.
+         call msgwait(msg)
+
+         t0 = dnekclock()
+         do i=1,nloop
+            msg  = irecv(i,y,len)
+            call csend(i,x,len,nodeb,0)
+            call msgwait(msg)
+         enddo
+         t1   = dnekclock()
+         tmsg = (t1-t0)/nloop
+         tpwd = tmsg/nwds
+
+      elseif (nid.eq.nodeb) then
+
+         call crecv(1,y,1)           ! Initiate send, to synch.
+         call csend(1,x,1,nodea,0)
+
+         t0 = dnekclock()
+         do i=1,nloop
+            call crecv(i,y,len)
+            call csend(i,x,len,nodea,0)
+         enddo
+         t1   = dnekclock()
+         tmsg = (t1-t0)/nloop
+         tmsg = 0.
+
+      endif
+
+      call gop(tmsg,t1,'+  ',1)
+      msg_vol = nwds*(dt/tmsg)
+c     if (nid.eq.nodea) write(6,*) nid,msg_vol,nwds,dt,tmsg,' msgvol'
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine gop_test(ivb)
+      include 'SIZE'
+      common /nekmpi/ mid,np,nekcomm,nekgroup,nekreal
+      include 'mpif.h'
+      integer status(mpi_status_size)
+
+      parameter  (lt=lx1*ly1*lz1*lelt)
+      parameter (mwd = 3*lt)
+      common /scrns/ x(mwd),y(mwd)
+      common /scruz/ times(2,500)
+      common/scrcg/nwd(500)
+
+
+      nwds = 1
+      mtest = 0
+      do itest = 1,500
+         nwds = (nwds+1)*1.016
+         if (nwds.gt.mwd) goto 100
+         mtest = mtest+1
+         nwd(mtest) = nwds
+      enddo
+  100 continue
+
+      nwds = 1
+      do itest = mtest,1,-1
+
+         tiny = 1.e-27
+         call cfill(x,tiny,mwd)
+         nwds = nwd(itest)
+         call nekgsync
+
+
+         t0 = mpi_wtime ()
+         call gop(x,y,'+  ',nwds)
+         call gop(x,y,'+  ',nwds)
+         call gop(x,y,'+  ',nwds)
+         call gop(x,y,'+  ',nwds)
+         call gop(x,y,'+  ',nwds)
+         call gop(x,y,'+  ',nwds)
+         t1 = mpi_wtime ()
+
+         tmsg = (t1-t0)/6 ! six calls
+         tpwd = tmsg
+         if (nwds.gt.0) tpwd = tmsg/nwds
+         times(1,itest) = tmsg
+         times(2,itest) = tpwd
+
+      enddo
+  101 continue
+
+
+      if (nid.eq.0) then
+         nwds = 1
+         do itest=1,500
+            if (ivb.gt.0.or.itest.eq.1) 
+     $         write(6,1) np,nwds,(times(k,itest),k=1,2)
+    1       format(i9,i12,1p2e16.8,' gop')
+            nwds = (nwds+1)*1.016
+            if (nwds.gt.mwd) goto 102
+         enddo
+  102    continue
+      endif
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine gp2_test(ivb)
+
+      include 'SIZE'
+      include 'mpif.h'
+
+      common /nekmpi/ mid,np,nekcomm,nekgroup,nekreal
+      integer status(mpi_status_size)
+
+      parameter  (lt=lx1*ly1*lz1*lelt)
+      parameter (mwd = 3*lt)
+      common /scrns/ x(mwd),y(mwd)
+      common /scruz/ times(2,500)
+
+      call rzero(x,mwd)
+
+      nwds = 1
+      do itest = 1,500
+         call gp2(x,y,'+  ',1,nid,np)
+
+         t0 = mpi_wtime ()
+         call gp2(x,y,'+  ',nwds,nid,np)
+         call gp2(x,y,'+  ',nwds,nid,np)
+         call gp2(x,y,'+  ',nwds,nid,np)
+         call gp2(x,y,'+  ',nwds,nid,np)
+         t1 = mpi_wtime ()
+
+         tmsg = (t1-t0)/4 ! four calls
+         tpwd = tmsg
+         if (nwds.gt.0) tpwd = tmsg/nwds
+         times(1,itest) = tmsg
+         times(2,itest) = tpwd
+
+         nwds = (nwds+1)*1.016
+         if (nwds.gt.mwd) goto 101
+      enddo
+  101 continue
+
+
+      if (nid.eq.0) then
+         nwds = 1
+         do itest=1,500
+            if (ivb.gt.0.or.itest.eq.1) 
+     $         write(6,1) np,nwds,(times(k,itest),k=1,2)
+    1       format(i9,i12,1p2e16.8,' gp2')
+            nwds = (nwds+1)*1.016
+            if (nwds.gt.mwd) goto 102
+         enddo
+  102    continue
+      endif
+
+      return
+      end
+c-----------------------------------------------------------------------
+      integer function xor(m,n)
+c
+c  If NOT running on a parallel processor, it is sufficient to
+c  have this routine return a value of XOR=1.
+c
+c  Pick one of the following:
+c
+c  UNIX 4.2, f77:
+       XOR = OR(M,N)-AND(M,N)
+c
+c  Intel FTN286:
+c     XOR = M.NEQV.N
+c
+c  Ryan-McFarland Fortran
+C      XOR = IEOR(M,N)
+c
+c     XOR = 0
+c     IF(M.EQ.1 .OR.  N.EQ.1) XOR=1
+c     IF(M.EQ.0 .AND. N.EQ.0) XOR=0
+c     IF(M.EQ.1 .AND. N.EQ.1) XOR=0
+c     IF(M.GT.1 .OR.N.GT.1 .OR.M.LT.0.OR.N.LT.0) THEN
+c        PRINT*,'ERROR IN XOR'
+c        STOP
+c     ENDIF
+C
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine gp2( x, w, op, n, nid, np)
+c
+c     Global vector commutative operation using spanning tree.
+c
+c     Std. fan-in/fan-out
+
+      real x(n), w(n)
+      character*3 op
+
+      integer bit, bytes, cnt, diff, spsize, i, 
+     *   parent, troot, xor, root, lnp, log2
+      logical ifgot
+
+      integer type
+      save    type
+      data    type  /998/
+
+      type  = type+100
+      if (type.gt.9992) type=type-998
+      typer = type-1
+      bytes = 8*n
+
+      root    = 0
+      troot   = max0((nid/np)*np, root)
+      diff    = xor(nid,troot)
+      nullpid = 0
+
+c     Accumulate contributions from children, if any
+      level2=1
+    5 continue
+         level=level2
+         level2=level+level
+         if (mod(nid,level2).ne.0) goto 20
+            call crecv(type,w,bytes)
+            if (op.eq.'+  ') then
+               do i=1,n
+                  x(i) = x(i) + w(i)
+               enddo
+            elseif (op.eq.'*  ') then
+               do i=1,n
+                  x(i) = x(i) * w(i)
+               enddo
+            elseif (op.eq.'M  ') then
+               do i=1,n
+                  x(i) = max(x(i),w(i))
+               enddo
+            elseif (op.eq.'m  ') then
+               do i=1,n
+                  x(i) = min(x(i),w(i))
+               enddo
+            endif
+         if (level2.lt.np) goto 5
+
+c     Pass result back to parent
+   20 parent = nid-level
+      if (nid .ne. 0) call csend(type,x,bytes,parent,nullpid)
+
+c     Await final answer from node 0 via log_2 fan out
+      level=np/2
+      ifgot=.false.
+      if (nid.eq.root) ifgot=.true.
+
+      lnp = log2(np)
+      do i=1,lnp
+        if (ifgot) then
+           jnid=nid+level
+           call csend(typer,x,bytes,jnid,nullpid)
+        elseif (mod(nid,level).eq.0) then
+           call crecv(typer,x,bytes)
+           ifgot=.true.
+        endif
+        level=level/2
+      enddo
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine ping_loop1(t1,t0,len,nloop,nodea,nodeb,nid,x,y)
+
+      common /nekmpi/ mid,np,nekcomm,nekgroup,nekreal
+
+      real x(1),y(1)
+
+      include 'mpif.h'
+      integer status(mpi_status_size)
+
+      i=0
+      if (nid.eq.nodea) then
+         call nekgsync
+         call mpi_irecv(y,len,mpi_byte,nodeb,i,nekcomm,msg,ierr)    ! 1b
+         call mpi_send (x,len,mpi_byte,nodeb,i,nekcomm,ierr)        ! 1a
+c        call mpi_rsend(x,len,mpi_byte,nodeb,i,nekcomm,ierr)        ! 1a
+         call msgwait(msg)                                          ! 1b
+
+         t0 = mpi_wtime ()
+         do i=1,nloop
+            call mpi_irecv(y,len,mpi_byte,nodeb,i,nekcomm,msg,ierr) ! 2b
+            call mpi_send (x,len,mpi_byte,nodeb,i,nekcomm,ierr)     ! 2a
+c           call mpi_rsend(x,len,mpi_byte,nodeb,i,nekcomm,ierr)     ! 2a
+            call mpi_wait (msg,status,ierr)                         ! 2b
+         enddo
+         t1 = mpi_wtime ()
+
+      elseif (nid.eq.nodeb) then
+
+         call mpi_irecv(y,len,mpi_byte,nodea,i,nekcomm,msg,ierr)    ! 1a
+         call nekgsync
+         call mpi_wait (msg,status,ierr)                            ! 1a
+
+         j=i
+         do i=1,nloop
+            call mpi_irecv(y,len,mpi_byte,nodea,i,nekcomm,msg,ierr) ! 2a
+c           call mpi_rsend(x,len,mpi_byte,nodea,j,nekcomm,ierr)     ! 1b
+            call mpi_send (x,len,mpi_byte,nodea,j,nekcomm,ierr)     ! 1b
+            call mpi_wait (msg,status,ierr)                         ! 2a
+            j=i
+         enddo
+c        call mpi_rsend(x,len,mpi_byte,nodea,j,nekcomm,ierr)        ! nb
+         call mpi_send (x,len,mpi_byte,nodea,j,nekcomm,ierr)        ! nb
+
+      else
+         call nekgsync
+      endif
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine ping_loop2(t1,t0,len,nloop,nodea,nodeb,nid,x,y)
+
+      common /nekmpi/ mid,np,nekcomm,nekgroup,nekreal
+
+      real x(1),y(1)
+
+      include 'mpif.h'
+      integer status(mpi_status_size)
+
+      i=0
+      if (nid.eq.nodea) then
+         call nekgsync
+         call mpi_irecv(y,len,mpi_byte,nodeb,i,nekcomm,msg,ierr)    ! 1b
+         call mpi_send (x,len,mpi_byte,nodeb,i,nekcomm,ierr)        ! 1a
+         call msgwait(msg)                                          ! 1b
+
+         t0 = mpi_wtime ()
+         do i=1,nloop
+            call mpi_send (x,len,mpi_byte,nodeb,i,nekcomm,ierr)     ! 2a
+            call mpi_irecv(y,len,mpi_byte,nodeb,i,nekcomm,msg,ierr) ! 2b
+            call mpi_wait (msg,status,ierr)                         ! 2b
+         enddo
+         t1 = mpi_wtime ()
+
+      elseif (nid.eq.nodeb) then
+
+         call mpi_irecv(y,len,mpi_byte,nodea,i,nekcomm,msg,ierr)    ! 1a
+         call nekgsync
+         call mpi_wait (msg,status,ierr)                            ! 1a
+
+         j=i
+         do i=1,nloop
+            call mpi_send (x,len,mpi_byte,nodea,j,nekcomm,ierr)     ! 1b
+            call mpi_irecv(y,len,mpi_byte,nodea,i,nekcomm,msg,ierr) ! 2a
+            call mpi_wait (msg,status,ierr)                         ! 2a
+            j=i
+         enddo
+         call mpi_send (x,len,mpi_byte,nodea,j,nekcomm,ierr)        ! nb
+
+      else
+         call nekgsync
+      endif
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine ping_loop(t1,t0,len,nloop,nodea,nodeb,nid,x1,y1,x2,y2)
+c     Double Buffer : does 2*nloop timings
+
+      common /nekmpi/ mid,np,nekcomm,nekgroup,nekreal
+
+      real x1(1),y1(1),x2(1),y2(1)
+
+      include 'mpif.h'
+      integer status(mpi_status_size)
+
+      itag=1
+      if (nid.eq.nodea) then
+         call mpi_irecv(y1,len,mpi_byte,nodeb,itag,nekcomm,msg1,ierr)   ! 1b 
+         call nekgsync
+
+
+         t0 = mpi_wtime ()
+         do i=1,nloop
+            call mpi_send (x1,len,mpi_byte,nodeb,itag,nekcomm,ierr)     ! 1a 
+            call mpi_irecv(y2,len,mpi_byte,nodeb,itag,nekcomm,msg2,ierr)! 2b 
+            call mpi_wait (msg1,status,ierr)                            ! 1b
+            call mpi_send (x2,len,mpi_byte,nodeb,itag,nekcomm,ierr)     ! 2a 
+            call mpi_irecv(y1,len,mpi_byte,nodeb,itag,nekcomm,msg1,ierr)! 3b 
+            call mpi_wait (msg2,status,ierr)                            ! 2b
+         enddo
+         t1 = mpi_wtime ()
+         call mpi_send (x1,len,mpi_byte,nodeb,itag,nekcomm,ierr)        ! nb
+         call mpi_wait (msg1,status,ierr)                              ! nb
+
+      elseif (nid.eq.nodeb) then
+
+         call mpi_irecv(y1,len,mpi_byte,nodea,itag,nekcomm,msg1,ierr)   ! nb 
+         call nekgsync
+
+
+         do i=1,nloop
+            call mpi_wait (msg1,status,ierr)                            ! 1a
+            call mpi_send (x1,len,mpi_byte,nodea,itag,nekcomm,ierr)     ! 1b
+            call mpi_irecv(y2,len,mpi_byte,nodea,itag,nekcomm,msg2,ierr)! 2a
+            call mpi_wait (msg2,status,ierr)                            ! 2a 
+            call mpi_send (x2,len,mpi_byte,nodea,itag,nekcomm,ierr)     ! 2b
+            call mpi_irecv(y1,len,mpi_byte,nodea,itag,nekcomm,msg1,ierr)! 3a
+         enddo
+         call mpi_wait (msg1,status,ierr)                            ! 2a 
+         call mpi_send (x1,len,mpi_byte,nodea,itag,nekcomm,ierr)        ! nb
+
+      else
+         call nekgsync
+      endif
+
+      return
+      end
+
+
diff --git a/src/driver.f b/src/driver.f
new file mode 100644
index 0000000..f45aa50
--- /dev/null
+++ b/src/driver.f
@@ -0,0 +1,660 @@
+c-----------------------------------------------------------------------
+      program nekbone
+      
+      include 'SIZE'
+      include 'TOTAL'
+      include 'mpif.h'
+
+      parameter (lxyz = lx1*ly1*lz1)
+      parameter (lt=lxyz*lelt)
+
+      real ah(lx1*lx1),bh(lx1),ch(lx1*lx1),dh(lx1*lx1)
+     $    ,zpts(2*lx1),wght(2*lx1)
+      
+      real x(lt),f(lt),r(lt),w(lt),p(lt),z(lt),c(lt)
+      real g(6,lt)
+      real mfloplist(1024), avmflop
+      real tstart, tstop
+      integer icount
+
+      logical ifbrick
+      integer iel0,ielN,ielD   ! element range per proc.
+      integer nx0,nxN,nxD      ! poly. order range
+      integer npx,npy,npz      ! poly. order range
+      integer mx,my,mz         ! poly. order range
+      integer numthreads, omp_get_max_threads
+
+      call iniproc(mpi_comm_world)    ! has nekmpi common block
+      tstart = dnekclock()
+      call read_param(ifbrick,iel0,ielN,ielD,nx0,nxN,nxD,
+     +                npx,npy,npz,mx,my,mz)
+
+      numthreads = 1
+#ifdef _OPENMP
+      numthreads= omp_get_max_threads()
+#endif 
+
+      if (nid.eq.0) then
+        write(*,*) "Max number of threads: ", numthreads
+      end if
+
+c     GET PLATFORM CHARACTERISTICS
+c     iverbose = 1
+c     call platform_timer(iverbose)   ! iverbose=0 or 1
+
+      icount = 0
+
+#ifndef NITER 
+#define NITER 100
+#endif
+      niter = NITER
+
+      if (nid.eq.0) then
+        write(*,*) "Number of iterations: ", niter
+      end if
+
+#ifdef LOG
+#define WLOG(X) if (nid .eq. 0) write(*,*) X 
+#else 
+#define WLOG(X) 
+#endif
+
+c     SET UP and RUN NEKBONE
+      do nx1=nx0,nxN,nxD
+         WLOG("calling init_dim")
+         call init_dim
+         do nelt=iel0,ielN,ielD
+           WLOG("calling init_mesh")
+           call init_mesh(ifbrick,npx,npy,npz,mx,my,mz)
+           WLOG("calling prox_setupds")
+           call proxy_setupds    (gsh)     ! Has nekmpi common block
+           WLOG("calling set_multiplicity")
+           call set_multiplicity (c)       ! Inverse of counting matrix
+
+           WLOG("calling proxy_setup")
+           call proxy_setup(ah,bh,ch,dh,zpts,wght,g) 
+
+           n     = nx1*ny1*nz1*nelt
+
+           WLOG("calling set_f")
+           call set_f(f,c,n)
+           WLOG("calling cg")
+           call cg(x,f,g,c,r,w,p,z,n,niter,flop_cg)
+
+           WLOG("calling nekgsync")
+           call nekgsync()
+
+           WLOG("calling set_timer_flop_count")
+           call set_timer_flop_cnt(0)
+           WLOG("calling cg")
+           call cg(x,f,g,c,r,w,p,z,n,niter,flop_cg)
+           WLOG("calling set_timer_flop_count")
+           call set_timer_flop_cnt(1)
+
+           WLOG("calling gs_free")
+           call gs_free(gsh)
+
+           icount = icount + 1
+           mfloplist(icount)= mflops*np
+         enddo
+      enddo
+
+      avmflop = 0.0
+      do i = 1, icount
+        avmflop = avmflop + mfloplist(i)
+      end do
+
+      if (icount .ne. 0) then
+        avmflop = avmflop/icount
+      end if
+
+      if (nid .eq. 0) then
+        write(6,1) avmflop
+      end if
+    1 format('Av MFlops = ', 1pe12.4)
+
+c     TEST BANDWIDTH BISECTION CAPACITY
+c     call xfer(np,cr_h)
+
+      call nekgsync()
+      tstop = dnekclock()
+      if (nid .eq.0) write(*,*) "Total run time = ", tstop-tstart
+
+      call exitt0
+
+      end
+c--------------------------------------------------------------
+      subroutine set_f(f,c,n)
+      real f(n),c(n)
+      integer i
+      integer, allocatable :: seed(:)
+
+      call RANDOM_SEED(SIZE=i)
+      allocate(seed(i))
+      seed = 5
+      call RANDOM_SEED(PUT=seed(1:i))
+
+      do i=1,n
+         call RANDOM_NUMBER(f(i))
+      enddo
+
+      call dssum(f)
+      call col2 (f,c,n)
+
+      deallocate(seed)
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine init_dim
+
+C     Transfer array dimensions to common
+
+      include 'SIZE'
+      include 'INPUT'
+ 
+      ny1=nx1
+      nz1=nx1
+ 
+      ndim=ldim
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine init_mesh(ifbrick,npx,npy,npz,mx,my,mz)
+      include 'SIZE'
+      include 'TOTAL'
+      logical ifbrick
+      integer e,eg,offs
+ 
+
+      if(.not.ifbrick) then   ! A 1-D array of elements of length P*lelt
+  10     continue
+         nelx = nelt*np
+         nely = 1
+         nelz = 1
+   
+         do e=1,nelt
+            eg = e + nid*nelt
+            lglel(e) = eg
+         enddo
+      else              ! A 3-D block of elements 
+        if (npx*npy*npz .ne. np) then
+          call cubic(npx,npy,npz,np)  !xyz distribution of total proc
+        end if 
+        if (mx*my*mz .ne. nelt) then
+          call cubic(mx,my,mz,nelt)   !xyz distribution of elements per proc
+        end if 
+      
+c       if(mx.eq.nelt) goto 10
+
+        nelx = mx*npx
+        nely = my*npy 
+        nelz = mz*npz
+
+        e = 1
+        offs = (mod(nid,npx)*mx) + npx*(my*mx)*(mod(nid/npx,npy)) 
+     $      + (npx*npy)*(mx*my*mz)*(nid/(npx*npy))
+        do k = 0,mz-1
+        do j = 0,my-1
+        do i = 0,mx-1
+           eg = offs+i+(j*nelx)+(k*nelx*nely)+1
+           lglel(e) = eg
+           e        = e+1
+        enddo
+        enddo
+        enddo
+      endif
+
+      if (nid.eq.0) then
+        write(6,*) "Processes: npx= ", npx, " npy= ", npy, " npz= ", npz
+        write(6,*) "Local Elements: mx= ", mx, " my= ", my, " mz= ", mz
+        write(6,*) "Elements: nelx= ", nelx, " nely= ", nely,
+     &             " nelz= ", nelz
+      end if
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine cubic(mx,my,mz,np)
+
+      mx = np
+      my = 1
+      mz = 1
+      ratio = np
+
+      iroot3 = np**(1./3.) + 0.000001
+      do i= iroot3,1,-1
+        iz = i
+        myx = np/iz
+        nrem = np-myx*iz
+
+        if (nrem.eq.0) then
+          iroot2 = myx**(1./2.) + 0.000001
+          do j=iroot2,1,-1
+            iy = j
+            ix = myx/iy
+            nrem = myx-ix*iy
+            if (nrem.eq.0) goto 20
+          enddo
+   20     continue
+
+          if (ix < iy) then
+            it = ix
+            ix = iy
+            iy = it
+          end if      
+
+          if (ix < iz) then
+            it = ix
+            ix = iz
+            iz = it
+          end if      
+
+          if (iy < iz) then
+            it = iy
+            iy = iz
+            iz = it
+          end if      
+
+          if ( REAL(ix)/iz < ratio) then
+            ratio = REAL(ix)/iz
+            mx = ix
+            my = iy
+            mz = iz
+          end if 
+
+        end if
+      enddo
+
+      return
+      end
+
+c-----------------------------------------------------------------------
+      subroutine set_multiplicity (c)       ! Inverse of counting matrix
+      include 'SIZE'
+      include 'TOTAL'
+
+      real c(1)
+
+      n = nx1*ny1*nz1*nelt
+
+      call rone(c,n)
+      call gs_op(gsh,c,1,1,0)  ! Gather-scatter operation  ! w   = QQ  w
+
+      do i=1,n
+         c(i) = 1./c(i)
+      enddo
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine set_timer_flop_cnt(iset)
+      include 'SIZE'
+      include 'TOTAL'
+      include 'TIMER'
+
+      integer i, numThrd, totThd
+      integer omp_get_max_threads
+      real tmp1(20), tmp2(20), tmp3(20), tmp4(20)
+
+      real time0,time1
+      save time0,time1
+
+      if (iset.eq.0) then
+         flop_a  = 0
+         flop_cg = 0
+
+         do i = 1, tmax
+           trzero(i) = 0
+           tcopy(i) = 0
+           tsolvem(i) = 0
+           tglsc3a(i) = 0
+           tglsc3b(i) = 0
+           tglsc3c(i) = 0
+           tglsc3d(i) = 0
+           tadd2s1(i) = 0
+           tadd2s2a(i) = 0
+           tadd2s2b(i) = 0
+           tadd2s2c(i) = 0
+           tlocalgrad3(i) = 0
+           twrwswt(i) = 0
+           tlocalgrad3t(i) = 0
+           tgsop(i) = 0
+           tgop(1,i) = 0
+           tgop(2,i) = 0
+           tgop(3,i) = 0
+           tgop(4,i) = 0
+         end do
+
+         time0   = dnekclock()
+      else
+        time1   = dnekclock()-time0
+        if (time1.gt.0) mflops = (1.0*flop_a+1.0*flop_cg)/(1.e6*time1)
+
+        if (nid.eq.0) then
+          write(6,1) nelt,np,nx1, nelt*np
+          write(6,2) mflops*np, mflops
+          write(6,3) REAL(flop_a),REAL(flop_cg),time1
+        end if
+
+    1   format('nelt = ', i7, ', np = ', i9, ', nx1 = ', i7,
+     &         ', elements =', i10 )
+    2   format('Tot MFlops = ', 1pe12.5, ', MFlops = ', e12.5)
+    3   format('Ax FOp = ', 1pe12.5, ', CG FOp = ', e12.5,
+     &         ', Solve Time = ', e12.5)
+
+#ifdef TIMERS
+        numThrd = 1
+#ifdef _OPENMP
+        numThrd = omp_get_max_threads()
+#endif
+        totThd = numThrd*np
+
+        do i = 1, numThrd
+          tglsc3a(i) = tglsc3a(i) - tgop(1,i)
+          tglsc3b(i) = tglsc3b(i) - tgop(2,i)
+          tglsc3c(i) = tglsc3c(i) - tgop(3,i)
+          tglsc3d(i) = tglsc3d(i) - tgop(4,i)
+        end do
+
+        do i = 1,20
+          tmp1(i) = 0.0
+        end do
+        
+        tmp1(1)= time1
+        do i = 1, numThrd
+          tmp1(2)= tmp1(2) + trzero(i)
+          tmp1(3)= tmp1(3) + tcopy(i)
+          tmp1(4)= tmp1(4) + tsolvem(i)
+          tmp1(5)= tmp1(5) + tglsc3a(i)
+          tmp1(6)= tmp1(6) + tglsc3b(i)
+          tmp1(7)= tmp1(7) + tglsc3c(i)
+          tmp1(8)= tmp1(8) + tglsc3d(i)
+          tmp1(9)= tmp1(9) + tadd2s1(i)
+          tmp1(10)= tmp1(10) + tadd2s2a(i)
+          tmp1(11)= tmp1(11) + tadd2s2b(i)
+          tmp1(12)= tmp1(12) + tadd2s2c(i)
+          tmp1(13)= tmp1(13) + tlocalgrad3(i)
+          tmp1(14)= tmp1(14) + twrwswt(i)
+          tmp1(15)= tmp1(15) + tlocalgrad3t(i)
+          tmp1(16)= tmp1(16) + tgsop(i)
+          tmp1(17)= tmp1(17) + tgop(1,i)
+          tmp1(18)= tmp1(18) + tgop(2,i)
+          tmp1(19)= tmp1(19) + tgop(3,i)
+          tmp1(20)= tmp1(20) + tgop(4,i)
+        end do
+
+        call gop(tmp1, tmp4, '+  ', 20)
+
+        tmp2(1)= time1
+        tmp2(2)= trzero(1)
+        tmp2(3)= tcopy(1)
+        tmp2(4)= tsolvem(1)
+        tmp2(5)= tglsc3a(1)
+        tmp2(6)= tglsc3b(1)
+        tmp2(7)= tglsc3c(1)
+        tmp2(8)= tglsc3d(1)
+        tmp2(9)= tadd2s1(1)
+        tmp2(10)= tadd2s2a(1)
+        tmp2(11)= tadd2s2b(1)
+        tmp2(12)= tadd2s2c(1)
+        tmp2(13)= tlocalgrad3(1)
+        tmp2(14)= twrwswt(1)
+        tmp2(15)= tlocalgrad3t(1)
+        tmp2(16)= tgsop(1)
+        tmp2(17)= tgop(1,1)
+        tmp2(18)= tgop(2,1)
+        tmp2(19)= tgop(3,1)
+        tmp2(20)= tgop(4,1)
+
+        do i = 2, numThrd
+          if (trzero(i) < tmp2(2)) tmp2(2)= trzero(i)
+          if (tcopy(i) < tmp2(3)) tmp2(3)= tcopy(i)
+          if (tsolvem(i) < tmp2(4)) tmp2(4)= tsolvem(i)
+          if (tglsc3a(i) < tmp2(5)) tmp2(5)= tglsc3a(i)
+          if (tglsc3b(i) < tmp2(6)) tmp2(6)= tglsc3b(i)
+          if (tglsc3c(i) < tmp2(7)) tmp2(7)= tglsc3c(i)
+          if (tglsc3d(i) < tmp2(8)) tmp2(8)= tglsc3d(i)
+          if (tadd2s1(i) < tmp2(9)) tmp2(9)= tadd2s1(i)
+          if (tadd2s2a(i) < tmp2(10)) tmp2(10)= tadd2s2a(i)
+          if (tadd2s2b(i) < tmp2(11)) tmp2(11)= tadd2s2b(i)
+          if (tadd2s2c(i) < tmp2(12)) tmp2(12)= tadd2s2c(i)
+          if (tlocalgrad3(i) < tmp2(13)) tmp2(13)= tlocalgrad3(i)
+          if (twrwswt(i) < tmp2(14)) tmp2(14)= twrwswt(i)
+          if (tlocalgrad3t(i) < tmp2(15)) tmp2(15)= tlocalgrad3t(i)
+          if (tgsop(i) < tmp2(16)) tmp2(16)= tgsop(i)
+          if (tgop(1,i) < tmp2(17)) tmp2(17)= tgop(1,i)
+          if (tgop(2,i) < tmp2(18)) tmp2(18)= tgop(2,i)
+          if (tgop(3,i) < tmp2(19)) tmp2(19)= tgop(3,i)
+          if (tgop(4,i) < tmp2(20)) tmp2(20)= tgop(4,i)
+        end do
+
+        call gop(tmp2, tmp4, 'm  ', 20)
+
+        tmp3(1)= time1
+        tmp3(2)= trzero(1)
+        tmp3(3)= tcopy(1)
+        tmp3(4)= tsolvem(1)
+        tmp3(5)= tglsc3a(1)
+        tmp3(6)= tglsc3b(1)
+        tmp3(7)= tglsc3c(1)
+        tmp3(8)= tglsc3d(1)
+        tmp3(9)= tadd2s1(1)
+        tmp3(10)= tadd2s2a(1)
+        tmp3(11)= tadd2s2b(1)
+        tmp3(12)= tadd2s2c(1)
+        tmp3(13)= tlocalgrad3(1)
+        tmp3(14)= twrwswt(1)
+        tmp3(15)= tlocalgrad3t(1)
+        tmp3(16)= tgsop(1)
+        tmp3(17)= tgop(1,1)
+        tmp3(18)= tgop(2,1)
+        tmp3(19)= tgop(3,1)
+        tmp3(20)= tgop(4,1)
+
+        do i = 2, numThrd
+          if (trzero(i) > tmp3(2)) tmp3(2)= trzero(i)
+          if (tcopy(i) > tmp3(3)) tmp3(3)= tcopy(i)
+          if (tsolvem(i) > tmp3(4)) tmp3(4)= tsolvem(i)
+          if (tglsc3a(i) > tmp3(5)) tmp3(5)= tglsc3a(i)
+          if (tglsc3b(i) > tmp3(6)) tmp3(6)= tglsc3b(i)
+          if (tglsc3c(i) > tmp3(7)) tmp3(7)= tglsc3c(i)
+          if (tglsc3d(i) > tmp3(8)) tmp3(8)= tglsc3d(i)
+          if (tadd2s1(i) > tmp3(9)) tmp3(9)= tadd2s1(i)
+          if (tadd2s2a(i) > tmp3(10)) tmp3(10)= tadd2s2a(i)
+          if (tadd2s2b(i) > tmp3(11)) tmp3(11)= tadd2s2b(i)
+          if (tadd2s2c(i) > tmp3(12)) tmp3(12)= tadd2s2c(i)
+          if (tlocalgrad3(i) > tmp3(13)) tmp3(13)= tlocalgrad3(i)
+          if (twrwswt(i) > tmp3(14)) tmp3(14)= twrwswt(i)
+          if (tlocalgrad3t(i) > tmp3(15)) tmp3(15)= tlocalgrad3t(i)
+          if (tgsop(i) > tmp3(16)) tmp3(16)= tgsop(i)
+          if (tgop(1,i) > tmp3(17)) tmp3(17)= tgop(1,i)
+          if (tgop(2,i) > tmp3(18)) tmp3(18)= tgop(2,i)
+          if (tgop(3,i) > tmp3(19)) tmp3(19)= tgop(3,i)
+          if (tgop(4,i) > tmp3(20)) tmp3(20)= tgop(4,i)
+        end do
+
+        call gop(tmp3, tmp4, 'M  ', 20)
+
+        if (nid.eq.0) then
+          write(6,4) "time       = ",tmp1(1)/np, tmp2(1), tmp3(1)
+          write(6,4) "rzero      = ",tmp1(2)/totThd, tmp2(2), tmp3(2)
+          write(6,4) "copy       = ",tmp1(3)/totThd, tmp2(3), tmp3(3)
+          write(6,4) "glsc3a     = ",tmp1(5)/totThd, tmp2(5), tmp3(5)
+          write(6,4) "gopa       = ",tmp1(17)/totThd, tmp2(17), tmp3(17)
+          write(6,4) "solveM     = ",tmp1(4)/totThd, tmp2(4), tmp3(4)
+          write(6,4) "glsc3b     = ",tmp1(6)/totThd, tmp2(6), tmp3(6)
+          write(6,4) "gopb       = ",tmp1(18)/totThd, tmp2(18), tmp3(18)
+          write(6,4) "add2s1     = ",tmp1(9)/totThd, tmp2(9), tmp3(9)
+          write(6,4) "localgrad3 = ",tmp1(13)/totThd, tmp2(13), tmp3(13)
+          write(6,4) "wrwswt     = ",tmp1(14)/totThd, tmp2(14), tmp3(14)
+          write(6,4) "localgradt = ",tmp1(15)/totThd, tmp2(15), tmp3(15)
+          write(6,4) "gsop       = ",tmp1(16)/totThd, tmp2(16), tmp3(16)
+          write(6,4) "add2s2a    = ",tmp1(10)/totThd, tmp2(10), tmp3(10)
+          write(6,4) "glsc3c     = ",tmp1(7)/totThd, tmp2(7), tmp3(7)
+          write(6,4) "gopc       = ",tmp1(19)/totThd, tmp2(19), tmp3(19)
+          write(6,4) "add2s2b    = ",tmp1(11)/totThd, tmp2(11), tmp3(11)
+          write(6,4) "add2s2c    = ",tmp1(12)/totThd, tmp2(12), tmp3(12)
+          write(6,4) "glsc3d     = ",tmp1(8)/totThd, tmp2(8), tmp3(8)
+          write(6,4) "gopd       = ",tmp1(20)/totThd, tmp2(20), tmp3(20)
+        endif
+
+    4   format(A, 1pe12.4, e12.4, e12.4)
+
+c       if (nid.eq.0) then
+c         write(6,4) "av time: ", tmp2(1)/np, tmp2(2)/totThd,
+c    &               tmp2(3)/totThd, tmp2(4)/totThd, tmp2(5)/totThd
+c         write(6,5) "av time: ", tmp2(5)/totThd, tmp2(6)/totThd,
+c    &               tmp2(7)/totThd, tmp2(8)/totThd
+c       endif
+
+c       if (nid.eq.0) then
+c         write(6,4) "min time: ", tmp2(1), tmp2(2), tmp2(3),
+c    &               tmp2(4), tmp2(5)
+c         write(6,5) "min time: ", tmp2(5), tmp2(6), tmp2(7),
+c    &               tmp2(8)
+c       endif
+
+c       if (nid.eq.0) then
+c         write(6,4) "max time: ", tmp2(1), tmp2(2), tmp2(3),
+c    &               tmp2(4), tmp2(5)
+c         write(6,5) "max time: ", tmp2(5), tmp2(6), tmp2(7),
+c    &               tmp2(8)
+c       endif
+
+c   4   format(A, ' cg= ', 1pe12.4, ', zcm= ', e12.4,
+c    &         ', glsc3= ', e12.4, ', add2sx= ', e12.4,
+c    &         ', ax= ', e12.4)
+c   5   format(A, ' ax= ', 1pe12.4, ', add2s2= ', e12.4,
+c    &         ', gsop= ', e12.4, ', axe= ', e12.4)
+#endif
+      endif
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine xfer(np,gsh)
+      include 'SIZE'
+      parameter(npts_max = lx1*ly1*lz1*lelt)
+
+      real buffer(2,npts_max)
+      integer ikey(npts_max)
+
+
+      nbuf = 800
+      npts = 1
+      do itest=1,200
+         npoints = npts*np
+
+         call load_points(buffer,nppp,npoints,npts,nbuf)
+         iend   = mod1(npoints,nbuf)
+         istart = 1
+         if(nid.ne.0)istart = iend+(nid-1)*nbuf+1
+         do i = 1,nppp
+            icount=istart+(i-1)
+            ikey(i)=mod(icount,np)
+         enddo
+
+         call nekgsync
+         time0 = dnekclock()
+         do loop=1,50
+            call crystal_tuple_transfer(gsh,nppp,npts_max,
+     $                ikey,1,ifake,0,buffer,2,1)
+         enddo
+         time1 = dnekclock()
+         etime = (time1-time0)/50
+
+         if (nid.eq.0) write(6,1) np,npts,npoints,etime
+   1     format(2i7,i10,1p1e12.4,' bandwidth' )
+         npts = 1.02*(npts+1)
+         if (npts.gt.npts_max) goto 100
+      enddo
+ 100  continue
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine load_points(buffer,nppp,npoints,npts,nbuf)
+      include 'SIZE'
+      include 'PARALLEL'
+
+      real buffer(2,nbuf)
+
+      nppp=0
+      if(nbuf.gt.npts) then
+       npass = 1+npoints/nbuf
+
+       do ipass = 1,npass
+          if(nid.eq.ipass.and.ipass.ne.npass) then
+            do i = 1,nbuf
+             buffer(1,i)=i
+             buffer(2,i)=nid
+            enddo
+            nppp=nbuf
+          elseif (npass.eq.ipass.and.nid.eq.0) then
+            mbuf=mod1(npoints,nbuf)
+            do i=1,mbuf
+               buffer(1,i)=i
+               buffer(2,i)=nid
+            enddo
+            nppp=mbuf
+          endif
+       enddo
+      else
+       do i = 1,npts
+          buffer(1,i)=i
+          buffer(2,i)=nid
+       enddo
+       nppp=npts
+      endif
+
+      return
+      end
+c----------------------------------------------------------------------
+      subroutine read_param(ifbrick,iel0,ielN,ielD,nx0,nxN,nxD,
+     +                      npx,npy,npz,mx,my,mz)
+      include 'SIZE'
+      logical ifbrick
+      integer iel0,ielN,ielD,nx0,nxN,nxD,npx,npy,npz,mx,my,mz
+
+      !open .rea
+      if(nid.eq.0) then
+         open(unit=9,file='data.rea',status='old') 
+         read(9,*,err=100) ifbrick
+         read(9,*,err=100) iel0,ielN,ielD
+         read(9,*,err=100) nx0,nxN,nxD
+         read(9,*,err=100) npx,npy,npz
+         read(9,*,err=100) mx,my,mz
+         close(9)
+      endif
+      call bcast(ifbrick,4)
+      call bcast(iel0,4)
+      call bcast(ielN,4)
+      call bcast(ielD,4)
+c     nx0=lx1
+c     nxN=lx1
+      call bcast(nx0,4)
+      call bcast(nxN,4)
+      call bcast(nxD,4)
+      call bcast(npx,4)
+      call bcast(npy,4)
+      call bcast(npz,4)
+      call bcast(mx,4)
+      call bcast(my,4)
+      call bcast(mz,4)
+      if(iel0.gt.ielN.or.nx0.gt.nxN) goto 200
+
+      return
+
+  100 continue
+      write(6,*) "ERROR READING data.rea....ABORT"
+      call exitt0
+
+  200 continue
+      write(6,*) "ERROR data.rea :: iel0 > ielN or nx0 > nxN :: ABORT"
+      call exitt0
+  
+      return
+      end
+c-----------------------------------------------------------------------
diff --git a/src/driver_comm.f b/src/driver_comm.f
new file mode 100644
index 0000000..d66e436
--- /dev/null
+++ b/src/driver_comm.f
@@ -0,0 +1,21 @@
+c-----------------------------------------------------------------------
+      program nekbone
+      
+      include 'SIZE'
+      include 'TOTAL'
+      include 'mpif.h'
+
+      parameter (lxyz = lx1*ly1*lz1)
+      parameter (lt=lxyz*lelt)
+
+
+      call iniproc(mpi_comm_world)    ! has nekmpi common block
+
+c     GET PLATFORM CHARACTERISTICS
+      iverbose = 1
+      call platform_timer(iverbose)   ! iverbose=0 or 1
+
+      call exitt0
+
+      end
+c--------------------------------------------------------------
diff --git a/src/jl/Makefile b/src/jl/Makefile
new file mode 100644
index 0000000..96c2f75
--- /dev/null
+++ b/src/jl/Makefile
@@ -0,0 +1,91 @@
+CC=mpicc -std=c99 --pedantic
+CFLAGS+=-DMPI
+CFLAGS+=-DPREFIX=jl_
+CFLAGS+=-DNO_NEK_EXITT
+CFLAGS+=-DGLOBAL_LONG
+LDFLAGS+=-lm
+
+#CFLAGS+=-DPRINT_MALLOCS=1
+
+CFLAGS+=-DUSE_NAIVE_BLAS
+#CFLAGS+=-DUSE_CBLAS
+#LDFLAGS+=-lcblas
+
+#CFLAGS+=-DAMG_DUMP
+CFLAGS+=-DGS_TIMING -DGS_BARRIER
+
+#CFLAGS+=-O0 -g
+CFLAGS+=-O3 -march=native
+
+CFLAGS+=-W -Wall -Wno-unused-function -Wno-unused-parameter
+#CFLAGS+=-Minform=warn
+
+CCCMD=$(CC) $(G) $(CFLAGS)
+LINKCMD=$(CC) $(G) $(LDFLAGS)
+#RLINKCMD = $(LD) -r
+.PHONY: cmds deps tests clean objects odepinfo
+
+TESTS=sort_test sort_test2 sarray_sort_test spchol_test \
+      comm_test crystal_test sarray_transfer_test \
+      gs_test gs_test_old gs_unique_test \
+      xxt_test xxt_test2 crs_test \
+      poly_test poly_test2 lob_bnd_test obbox_test \
+      findpts_el_2_test findpts_el_2_test2 \
+      findpts_el_3_test findpts_el_3_test2 \
+      findpts_local_test findpts_test
+
+CRS=$(AMG)
+
+tests: $(TESTS) ;
+clean: ; @$(RM) $(TESTS) *.o *.s
+
+cmds: ; @echo CC = $(CCCMD); echo LINK = $(LINKCMD);
+
+deps: ; ./cdep.py *.c > makefile.cdep;
+
+odepinfo: deps objects; @./odep_info.py *.o
+
+-include makefile.cdep
+
+%.o: %.c ; @echo CC $<; $(CCCMD) -c $< -o $@
+%.s: %.c ; @echo CC -S $<; $(CCCMD) -S $< -o $@
+objects: $(OBJECTS) ;
+
+poly_imp.h: gen_poly_imp.c
+	$(RM) poly_imp.h;
+	$(CC) -lgmp -lm gen_poly_imp.c -o gen_poly_imp;
+	./gen_poly_imp > poly_imp.h;
+	$(RM) gen_poly_imp
+
+GS_OBJECTS=gs.o sort.o sarray_transfer.o sarray_sort.o \
+           gs_local.o fail.o crystal.o comm.o tensor.o
+
+XXT=sparse_cholesky.o xxt.o
+AMG=amg.o
+
+sort_test: sort.o fail.o comm.o tensor.o gs_local.o sort_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@
+sort_test2: sort.o fail.o comm.o tensor.o gs_local.o sort_test2.o ; @echo LINK $@; $(LINKCMD) $^ -o $@
+sarray_sort_test: sort.o fail.o comm.o tensor.o gs_local.o sarray_sort.o sarray_sort_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@
+spchol_test: sparse_cholesky.o sort.o fail.o comm.o tensor.o gs_local.o spchol_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@
+comm_test: fail.o comm.o tensor.o gs_local.o comm_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@
+crystal_test: fail.o crystal.o comm.o tensor.o gs_local.o crystal_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@
+sarray_transfer_test: sarray_transfer.o sarray_sort.o sort.o fail.o crystal.o comm.o tensor.o gs_local.o sarray_transfer_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@
+
+gs_test: gs_test.o $(GS_OBJECTS);		@echo LINK $@; $(LINKCMD) $^ -o $@
+gs_test_old: gs_test_old.o $(GS_OBJECTS);	@echo LINK $@; $(LINKCMD) $^ -o $@
+gs_unique_test: gs_unique_test.o $(GS_OBJECTS);	@echo LINK $@; $(LINKCMD) $^ -o $@
+xxt_test: xxt_test.o $(CRS) $(GS_OBJECTS);	@echo LINK $@; $(LINKCMD) $^ -o $@
+xxt_test2: xxt_test2.o $(CRS) $(GS_OBJECTS);	@echo LINK $@; $(LINKCMD) $^ -o $@
+crs_test: crs_test.o $(CRS) $(GS_OBJECTS);	@echo LINK $@; $(LINKCMD) $^ -o $@
+
+poly_test2: poly.o fail.o comm.o tensor.o gs_local.o poly_test2.o ; @echo LINK $@; $(LINKCMD) $^ -o $@
+poly_test: poly.o fail.o comm.o tensor.o gs_local.o poly_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@
+lob_bnd_test: tensor.o poly.o lob_bnd.o fail.o comm.o gs_local.o lob_bnd_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@
+obbox_test: rand_elt_test.o poly.o obbox.o tensor.o lob_bnd.o fail.o comm.o gs_local.o obbox_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@
+findpts_el_2_test2: tensor.o rand_elt_test.o lob_bnd.o fail.o comm.o gs_local.o poly.o findpts_el_2.o findpts_el_2_test2.o ; @echo LINK $@; $(LINKCMD) $^ -o $@
+findpts_el_2_test: poly.o fail.o comm.o tensor.o gs_local.o findpts_el_2.o findpts_el_2_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@
+findpts_el_3_test2: tensor.o rand_elt_test.o lob_bnd.o fail.o comm.o gs_local.o poly.o findpts_el_3.o findpts_el_3_test2.o ; @echo LINK $@; $(LINKCMD) $^ -o $@
+findpts_el_3_test: poly.o fail.o comm.o tensor.o gs_local.o findpts_el_3.o findpts_el_3_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@
+findpts_local_test: rand_elt_test.o lob_bnd.o fail.o comm.o tensor.o gs_local.o poly.o findpts_local.o sort.o sarray_sort.o obbox.o findpts_el_3.o findpts_el_2.o findpts_local_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@
+findpts_test: sarray_transfer.o sort.o rand_elt_test.o lob_bnd.o poly.o findpts.o sarray_sort.o findpts_local.o obbox.o tensor.o findpts_el_3.o findpts_el_2.o fail.o crystal.o comm.o gs_local.o findpts_test.o ; @echo LINK $@; $(LINKCMD) $^ -o $@
+
diff --git a/src/jl/README b/src/jl/README
new file mode 100644
index 0000000..36c66ab
--- /dev/null
+++ b/src/jl/README
@@ -0,0 +1,69 @@
+
+A high-level view of the code in this directory is as follows. See each header
+file listed for more documentation.
+
+The following headers are fundamental to most of the code.
+
+  name.h:    a given prefix is added to all external symbols;
+             determines how FORTRAN routines are named
+  types.h:   defines the integer types used everywhere (e.g., for array indices)
+  mem.h:     memory-management wrappers;
+             "array" type (generic dynamically sized array);
+             "buffer" type ( = char array )
+  comm.h:    wrappers for MPI calls (with alternative single proc versions)
+
+The Gather/Scatter library top-level interface is defined in "gs.h".
+The file "gs_defs.h" defines the datatypes and operations that it supports.
+
+There are two coarse solvers (XXT and AMG), which are not currently very well
+documented. The interface is given in "crs.h". 
+ 
+"findpts" is documented in "findpts.c". The idea is that during a run of an
+SEM code, we have a geometry map
+  (processor, element, r, s, t) -> (x, y, z)
+that defines our mesh. Within each element, the xyz coordinate is a
+polynomial function of the parametric r,s,t coordinates.
+"findpts" takes a distributed list of "(x,y,z)" points and computes the inverse
+of the above map.
+"findpts_eval" takes a list of "(proc,el,r,s,t)" coords, e.g., as returned by
+  "findpts", and interpolates a given field at each point.
+
+
+The "workhorses" of the implementations of much of the above are the
+"sarray_sort" and "sarray_transfer" routines, documented in the respective
+headers. The "array" type, defined in "mem.h", can be used to keep track of a
+dynamically sized array of (arbitrary) structs.
+
+  sarray_sort.h:     
+    sort an array of structs (locally/sequentially) by one or two of its fields
+  sarray_transfer.h:
+    transfer each struct in array to the processor specified by a given field
+    
+These in turn, are implemented using the lower-level routines of
+"sort.h", and "crystal_router.h".
+
+
+The "findpts" algorithm makes use of a number of lower-level routines
+possibly useful on their own.
+
+  poly.h:     computation of quadrature nodes; fast polynomial interpolation
+  lob_bnd.h:  (relatively) fast yet robust bounds for polynomials on [-1,1]^d
+  obbox.h:    oriented as well as axis-aligned bounding boxes for spectral els
+  tensor.h:   some tensor-product applications,
+                with BLAS ops delegated to Nek, cblas, or a naive imp
+
+All of the preprocessor macros that affect compilation are:
+  name.h:  PREFIX="..."    prefix added to all C external symbols
+          FPREFIX="..."    prefix added to all FORTRAN routines
+    UPCASE, UNDERSCORE   determines FORTRAN naming convention
+  types.h: USE_LONG, USE_LONG_LONG, GLOBAL_LONG, GLOBAL_LONG_LONG
+           determine the integer types used by all code
+  mem.h: PRINT_MALLOCS=1   (print all mem mngmt to stdout)
+  comm.h: MPI  (use MPI when defined;
+                otherwise, use a dummy single-proc implementation)
+  tensor.h: USE_CBLAS, USE_NAIVE_BLAS
+            (select BLAS implementation; default is Nek's mxm)
+  fail.c: NO_NEK_EXITT    when defined, don't call Nek's exitt routine
+  amg.c: AMG_BLOCK_ROWS   number of rows to read at a time (default=1200)
+         GS_TIMING        record timings for the matrix multiplies
+         GS_BARRIER       use a barrier to improve the quality of the timings
diff --git a/src/jl/c99.h b/src/jl/c99.h
new file mode 100644
index 0000000..a5a44e3
--- /dev/null
+++ b/src/jl/c99.h
@@ -0,0 +1,16 @@
+#ifndef C99_H
+#define C99_H
+
+#ifndef __STDC_VERSION__
+#  define NO_C99
+#elif __STDC_VERSION__ < 199901L
+#  define NO_C99
+#endif
+
+#ifdef NO_C99
+#  define restrict
+#  define inline
+#  undef NO_C99
+#endif
+
+#endif
diff --git a/src/jl/cdep.py b/src/jl/cdep.py
new file mode 100755
index 0000000..a0dd87a
--- /dev/null
+++ b/src/jl/cdep.py
@@ -0,0 +1,33 @@
+#!/usr/bin/python
+
+import sys, os, re
+
+#mergestr = lambda x: reduce((lambda a,b: a+" "+b),x,"")
+
+pathjoin = lambda a,b: os.path.normpath(os.path.join(a,b))
+include_re = re.compile("\s*#\s*include\s*\"([^\"]*)\"")
+incmatch = lambda x: ( include_re.match(line) for line in open(x) )
+incline = lambda x,m: pathjoin(os.path.split(x)[0],m.group(1))
+incl = lambda x: [ incline(x,m) for m in incmatch(x) if m!=None ]
+includes = {}
+def get_include(x):
+	if not includes.has_key(x): includes[x] = incl(x)
+	return includes[x]
+
+def closure(seq,f):
+	v = [], [x for x in seq], set(x for x in seq)
+	while len(v[1]): [(v[1].append(y),v[2].add(y)) for y in 
+	  f((lambda x: (v[0].append(x),x)[1])(v[1].pop())) if not y in v[2]]
+	return v[0]
+
+src_files = sys.argv[1:]
+files = closure(src_files, get_include)
+deps = dict((x,closure(includes[x],lambda y: includes[y])) for x in src_files)
+
+obj = lambda x: os.path.splitext(x)[0]+".o"
+
+for x in src_files:
+	print obj(x)+": "+x+reduce((lambda a,b: a+" "+b),deps[x],"")
+
+print
+print "OBJECTS="+reduce((lambda a,b: a+" "+obj(b)),src_files,"")
diff --git a/src/jl/comm.c b/src/jl/comm.c
new file mode 100644
index 0000000..8e5c9a3
--- /dev/null
+++ b/src/jl/comm.c
@@ -0,0 +1,175 @@
+#include <stddef.h> /* for size_t */
+#include <stdlib.h> /* for exit */
+#include <string.h> /* memcpy */
+#include <limits.h> /* for gs identities */
+#include <float.h>  /* for gs identities */
+#include "name.h"
+#include "fail.h"
+#include "types.h"
+#include "tensor.h"
+#include "gs_defs.h"
+#include "gs_local.h"
+#include "comm.h"
+
+uint comm_gbl_id=0, comm_gbl_np=1;
+
+GS_DEFINE_IDENTITIES()
+GS_DEFINE_DOM_SIZES()
+
+static void scan_imp(void *scan, const struct comm *com, gs_dom dom, gs_op op,
+                     const void *v, uint vn, void *buffer)
+{
+  comm_req req[2];
+  size_t vsize = vn*gs_dom_size[dom];
+  const uint id=com->id, np=com->np;
+  uint n = np, c=1, odd=0, base=0;
+  void *buf[2];
+  void *red = (char*)scan+vsize;
+  buf[0]=buffer,buf[1]=(char*)buffer+vsize;
+  while(n>1) {
+    odd=(odd<<1)|(n&1);
+    c<<=1, n>>=1;
+    if(id>=base+n) c|=1, base+=n, n+=(odd&1);
+  }
+  gs_init_array(scan,vn,dom,op);
+  memcpy(red,v,vsize);
+  while(n<np) {
+    if(c&1) n-=(odd&1), base-=n;
+    c>>=1, n<<=1, n+=(odd&1);
+    odd>>=1;
+    if(base==id) {
+      comm_irecv(&req[0],com, buf[0],vsize, id+n/2,id+n/2);
+      comm_isend(&req[1],com, red   ,vsize, id+n/2,id);
+      comm_wait(req,2);
+      gs_gather_array(red,buf[0],vn,dom,op);
+    } else {
+      comm_irecv(&req[0],com, scan,vsize, base,base);
+      comm_isend(&req[1],com, red ,vsize, base,id);
+      comm_wait(req,2);
+      break;
+    }
+  }
+  while(n>1) {
+    if(base==id) {
+      comm_send(com, scan  ,2*vsize, id+n/2,id);
+    } else {
+      comm_recv(com, buffer,2*vsize, base,base);
+      gs_gather_array(scan,buf[0],vn,dom,op);
+      memcpy(red,buf[1],vsize);
+    }
+    odd=(odd<<1)|(n&1);
+    c<<=1, n>>=1;
+    if(id>=base+n) c|=1, base+=n, n+=(odd&1);
+  }
+}
+
+
+static void allreduce_imp(const struct comm *com, gs_dom dom, gs_op op,
+                          void *v, uint vn, void *buf)
+{
+  size_t total_size = vn*gs_dom_size[dom];
+  const uint id=com->id, np=com->np;
+  uint n = np, c=1, odd=0, base=0;
+  while(n>1) {
+    odd=(odd<<1)|(n&1);
+    c<<=1, n>>=1;
+    if(id>=base+n) c|=1, base+=n, n+=(odd&1);
+  }
+  while(n<np) {
+    if(c&1) n-=(odd&1), base-=n;
+    c>>=1, n<<=1, n+=(odd&1);
+    odd>>=1;
+    if(base==id) {
+      comm_recv(com, buf,total_size, id+n/2,id+n/2);
+      gs_gather_array(v,buf,vn, dom,op);
+    } else {
+      comm_send(com, v,total_size, base,id);
+      break;
+    }
+  }
+  while(n>1) {
+    if(base==id)
+      comm_send(com, v,total_size, id+n/2,id);
+    else
+      comm_recv(com, v,total_size, base,base);
+    odd=(odd<<1)|(n&1);
+    c<<=1, n>>=1;
+    if(id>=base+n) c|=1, base+=n, n+=(odd&1);
+  }
+}
+
+void comm_scan(void *scan, const struct comm *com, gs_dom dom, gs_op op,
+               const void *v, uint vn, void *buffer)
+{
+  scan_imp(scan, com,dom,op, v,vn, buffer);
+}
+
+void comm_allreduce(const struct comm *com, gs_dom dom, gs_op op,
+                          void *v, uint vn, void *buf)
+{
+  if(vn==0) return;
+#ifdef MPI
+  {
+    MPI_Datatype mpitype;
+    MPI_Op mpiop;
+    #define DOMAIN_SWITCH() do { \
+      switch(dom) { case gs_double:    mpitype=MPI_DOUBLE;    break; \
+                    case gs_float:     mpitype=MPI_FLOAT;     break; \
+                    case gs_int:       mpitype=MPI_INT;       break; \
+                    case gs_long:      mpitype=MPI_LONG;      break; \
+     WHEN_LONG_LONG(case gs_long_long: mpitype=MPI_LONG_LONG; break;) \
+                  default:        goto comm_allreduce_byhand; \
+      } \
+    } while(0)
+    DOMAIN_SWITCH();
+    #undef DOMAIN_SWITCH
+    switch(op) { case gs_add: mpiop=MPI_SUM;  break;
+                 case gs_mul: mpiop=MPI_PROD; break;
+                 case gs_min: mpiop=MPI_MIN;  break;
+                 case gs_max: mpiop=MPI_MAX;  break;
+                 default:        goto comm_allreduce_byhand;
+    }
+    MPI_Allreduce(v,buf,vn,mpitype,mpiop,com->c);
+    memcpy(v,buf,vn*gs_dom_size[dom]);
+    return;
+  }
+#endif
+#ifdef MPI
+comm_allreduce_byhand:
+  allreduce_imp(com,dom,op, v,vn, buf);
+#endif
+}
+
+double comm_dot(const struct comm *comm, double *v, double *w, uint n)
+{
+  double s=tensor_dot(v,w,n),b;
+  comm_allreduce(comm,gs_double,gs_add, &s,1, &b);
+  return s;
+}
+
+/* T comm_reduce__T(const struct comm *comm, gs_op op, const T *in, uint n) */
+
+#define SWITCH_OP_CASE(T,OP) case gs_##OP: WITH_OP(T,OP); break;
+#define SWITCH_OP(T,op) do switch(op) { \
+    GS_FOR_EACH_OP(T,SWITCH_OP_CASE) case gs_op_n: break; } while(0)
+
+#define WITH_OP(T,OP) \
+  do { T v = *in++; GS_DO_##OP(accum,v); } while(--n)
+
+#define DEFINE_REDUCE(T) \
+T PREFIXED_NAME(comm_reduce__##T)( \
+    const struct comm *comm, gs_op op, const T *in, uint n) \
+{                                                           \
+  T accum = gs_identity_##T[op], buf;                       \
+  if(n!=0) SWITCH_OP(T,op);                                 \
+  comm_allreduce(comm,gs_##T,op, &accum,1, &buf);           \
+  return accum;                                             \
+}
+
+GS_FOR_EACH_DOMAIN(DEFINE_REDUCE)
+
+#undef DEFINE_REDUCE
+#undef WITH_OP
+#undef SWITCH_OP
+#undef SWITCH_OP_CASE
+
diff --git a/src/jl/comm.h b/src/jl/comm.h
new file mode 100644
index 0000000..4d0ed3e
--- /dev/null
+++ b/src/jl/comm.h
@@ -0,0 +1,255 @@
+#ifndef COMM_H
+#define COMM_H
+
+/* requires:
+     <stddef.h>            for size_t
+     <stdlib.h>            for exit
+     "fail.h", "types.h"
+     "gs_defs.h"           for comm_allreduce, comm_scan, comm_reduce_T
+*/
+
+#if !defined(FAIL_H) || !defined(TYPES_H)
+#warning "comm.h" requires "fail.h" and "types.h"
+#endif
+
+/*
+  When the preprocessor macro MPI is defined, defines (very) thin wrappers
+  for the handful of used MPI routines. Alternatively, when MPI is not defined,
+  these wrappers become dummy routines suitable for a single process run.
+  No code outside of "comm.h" and "comm.c" makes use of MPI at all.
+
+  Basic usage:
+  
+    struct comm c;
+  
+    comm_init(&c, MPI_COMM_WORLD);  // initializes c using MPI_Comm_dup
+
+    comm_free(&c);
+  
+  Very thin MPI wrappers: (see below for implementation)
+
+    comm_send,_recv,_isend,_irecv,_time,_barrier
+    
+  Additionally, some reduction and scan routines are provided making use
+    of the definitions in "gs_defs.h" (provided this has been included first).
+
+  Example comm_allreduce usage:
+    
+    double v[5], buf[5];
+    comm_allreduce(&c, gs_double,gs_add, v,5,buf);
+      // Computes the vector sum of v across all procs, using
+      // buf as a scratch area. Delegates to MPI_Allreduce if possible.
+    
+  Example comm_scan usage:
+    
+    long in[5], out[2][5], buf[2][5];
+    comm_scan(out, &c,gs_long,gs_add, in,5,buf);
+      // out[0] will be the vector sum of "in" across procs with ids
+           *strictly* less than this one (exclusive behavior),
+         and out[1] will be the vector sum across all procs, as would
+           be computed with comm_allreduce.
+         Note: differs from MPI_Scan which has inclusive behavior
+  
+  Example comm_reduce_double, etc. usage:
+  
+    T out, in[10];
+    out = comm_reduce_T(&c, gs_max, in, 10);
+      // out will equal the largest element of "in",
+         across all processors
+      // T can be "double", "float", "int", "long", "slong", "sint", etc.
+         as defined in "gs_defs.h"
+         
+*/
+
+#ifdef MPI
+#include <mpi.h>
+typedef MPI_Comm comm_ext;
+typedef MPI_Request comm_req;
+#else
+typedef int comm_ext;
+typedef int comm_req;
+typedef int MPI_Fint;
+#endif
+
+#define comm_allreduce PREFIXED_NAME(comm_allreduce)
+#define comm_scan      PREFIXED_NAME(comm_scan     )
+#define comm_dot       PREFIXED_NAME(comm_dot      )
+
+/* global id, np vars strictly for diagnostic messages (fail.c) */
+#ifndef comm_gbl_id
+#define comm_gbl_id PREFIXED_NAME(comm_gbl_id)
+#define comm_gbl_np PREFIXED_NAME(comm_gbl_np)
+extern uint comm_gbl_id, comm_gbl_np;
+#endif
+
+struct comm {
+  uint id, np;
+  comm_ext c;
+};
+
+static void comm_init(struct comm *c, comm_ext ce);
+/* (macro) static void comm_init_check(struct comm *c, MPI_Fint ce, uint np); */
+/* (macro) static void comm_dup(struct comm *d, const struct comm *s); */
+static void comm_free(struct comm *c);
+static double comm_time(void);
+static void comm_barrier(const struct comm *c);
+static void comm_recv(const struct comm *c, void *p, size_t n,
+                      uint src, int tag);
+static void comm_send(const struct comm *c, void *p, size_t n,
+                      uint dst, int tag);
+static void comm_irecv(comm_req *req, const struct comm *c,
+                       void *p, size_t n, uint src, int tag);
+static void comm_isend(comm_req *req, const struct comm *c,
+                       void *p, size_t n, uint dst, int tag);
+static void comm_wait(comm_req *req, int n);
+
+double comm_dot(const struct comm *comm, double *v, double *w, uint n);
+
+#ifdef GS_DEFS_H
+void comm_allreduce(const struct comm *com, gs_dom dom, gs_op op,
+                          void *v, uint vn, void *buf);
+void comm_scan(void *scan, const struct comm *com, gs_dom dom, gs_op op,
+               const void *v, uint vn, void *buffer);
+
+#define DEFINE_REDUCE(T) \
+T PREFIXED_NAME(comm_reduce__##T)( \
+    const struct comm *comm, gs_op op, const T *in, uint n); \
+static T comm_reduce_##T(const struct comm *c, gs_op op, const T *v, uint vn) \
+{ return PREFIXED_NAME(comm_reduce__##T)(c,op,v,vn); }
+GS_FOR_EACH_DOMAIN(DEFINE_REDUCE)
+#undef DEFINE_REDUCE
+
+#define comm_reduce_sint \
+    TYPE_LOCAL(comm_reduce_int,comm_reduce_long,comm_reduce_long_long)
+#define comm_reduce_slong \
+   TYPE_GLOBAL(comm_reduce_int,comm_reduce_long,comm_reduce_long_long)
+
+#endif
+
+/*----------------------------------------------------------------------------
+  Code for static (inline) functions
+  ----------------------------------------------------------------------------*/
+
+static void comm_init(struct comm *c, comm_ext ce)
+{
+#ifdef MPI
+  int i;
+  MPI_Comm_dup(ce, &c->c);
+  MPI_Comm_rank(c->c,&i), comm_gbl_id=c->id=i;
+  MPI_Comm_size(c->c,&i), comm_gbl_np=c->np=i;
+#else
+  c->id = 0, c->np = 1;
+#endif
+}
+
+static void comm_init_check_(struct comm *c, MPI_Fint ce, uint np,
+                             const char *file, unsigned line)
+{
+#ifdef MPI
+  comm_init(c,MPI_Comm_f2c(ce));
+  if(c->np != np)
+    fail(1,file,line,"comm_init_check: passed P=%u, "
+                     "but MPI_Comm_size gives P=%u",np,c->np);
+#else
+  comm_init(c,0);
+  if(np != 1)
+    fail(1,file,line,"comm_init_check: passed P=%u, "
+                     "but not compiled with -DMPI",np);
+#endif
+}
+#define comm_init_check(c,ce,np) comm_init_check_(c,ce,np,__FILE__,__LINE__)
+
+
+static void comm_dup_(struct comm *d, const struct comm *s,
+                      const char *file, unsigned line)
+{
+  d->id = s->id, d->np = s->np;
+#ifdef MPI
+  MPI_Comm_dup(s->c,&d->c);
+#else
+  if(s->np!=1) fail(1,file,line,"%s not compiled with -DMPI\n",file);
+#endif
+}
+#define comm_dup(d,s) comm_dup_(d,s,__FILE__,__LINE__)
+
+static void comm_free(struct comm *c)
+{
+#ifdef MPI
+  MPI_Comm_free(&c->c);
+#endif
+}
+
+static double comm_time(void)
+{
+#ifdef MPI
+  return MPI_Wtime();
+#else
+  return 0;
+#endif
+}
+
+static void comm_barrier(const struct comm *c)
+{
+#ifdef MPI
+  MPI_Barrier(c->c);
+#endif
+}
+
+static void comm_recv(const struct comm *c, void *p, size_t n,
+                      uint src, int tag)
+{
+#ifdef MPI
+# ifndef MPI_STATUS_IGNORE
+  MPI_Status stat;
+  MPI_Recv(p,n,MPI_UNSIGNED_CHAR,src,tag,c->c,&stat);
+# else  
+  MPI_Recv(p,n,MPI_UNSIGNED_CHAR,src,tag,c->c,MPI_STATUS_IGNORE);
+# endif
+#endif
+}
+
+static void comm_send(const struct comm *c, void *p, size_t n,
+                      uint dst, int tag)
+{
+#ifdef MPI
+  MPI_Send(p,n,MPI_UNSIGNED_CHAR,dst,tag,c->c);
+#endif
+}
+
+static void comm_irecv(comm_req *req, const struct comm *c,
+                       void *p, size_t n, uint src, int tag)
+{
+#ifdef MPI
+  MPI_Irecv(p,n,MPI_UNSIGNED_CHAR,src,tag,c->c,req);
+#endif
+}
+
+static void comm_isend(comm_req *req, const struct comm *c,
+                       void *p, size_t n, uint dst, int tag)
+{
+#ifdef MPI
+  MPI_Isend(p,n,MPI_UNSIGNED_CHAR,dst,tag,c->c,req);
+#endif
+}
+
+static void comm_wait(comm_req *req, int n)
+{
+#ifdef MPI
+# ifndef MPI_STATUSES_IGNORE
+  MPI_Status status[8];
+  while(n>=8) MPI_Waitall(8,req,status), req+=8, n-=8;
+  if(n>0) MPI_Waitall(n,req,status);
+# else
+  MPI_Waitall(n,req,MPI_STATUSES_IGNORE);
+# endif  
+#endif
+}
+
+static void comm_bcast(const struct comm *c, void *p, size_t n, uint root)
+{
+#ifdef MPI
+  MPI_Bcast(p,n,MPI_UNSIGNED_CHAR,root,c->c);
+#endif
+}
+
+#endif
diff --git a/src/jl/crs.h b/src/jl/crs.h
new file mode 100644
index 0000000..e2d0d36
--- /dev/null
+++ b/src/jl/crs.h
@@ -0,0 +1,24 @@
+#ifndef CRS_H
+#define CRS_H
+
+#if !defined(COMM_H)
+#warning "crs.h" requires "comm.h"
+#endif
+
+#define crs_setup PREFIXED_NAME(crs_setup)
+#define crs_solve PREFIXED_NAME(crs_solve)
+#define crs_stats PREFIXED_NAME(crs_stats)
+#define crs_free  PREFIXED_NAME(crs_free )
+
+struct crs_data;
+
+struct crs_data *crs_setup(
+  uint n, const ulong *id,
+  uint nz, const uint *Ai, const uint *Aj, const double *A,
+  uint null_space, const struct comm *comm);
+void crs_solve(double *x, struct crs_data *data, double *b);
+void crs_stats(struct crs_data *data);
+void crs_free(struct crs_data *data);
+
+#endif
+
diff --git a/src/jl/crs_test.c b/src/jl/crs_test.c
new file mode 100644
index 0000000..e5367d2
--- /dev/null
+++ b/src/jl/crs_test.c
@@ -0,0 +1,116 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include "c99.h"
+#include "name.h"
+#include "fail.h"
+#include "types.h"
+#include "mem.h"
+#include "gs_defs.h"
+#include "comm.h"
+#include "gs.h"
+#include "crs.h"
+
+void test(const struct comm *const comm)
+{
+  const double A[16] = {  2, -1, -1,  0,
+                         -1,  2,  0, -1,
+                         -1,  0,  2, -1,
+                          0, -1, -1,  2 };
+  const uint Ai[16]  = { 0, 0, 0, 0,
+                         1, 1, 1, 1,
+                         2, 2, 2, 2,
+                         3, 3, 3, 3 },
+             Aj[16]  = { 0, 1, 2, 3,
+                         0, 1, 2, 3,
+                         0, 1, 2, 3,
+                         0, 1, 2, 3 };
+  ulong xid[4]; slong uid[4];
+  double x[4]={1,1,1,1}, b[4], bmean;
+  uint i, w, gn, px, py;
+  
+  slong *xgid=0; double *xg=0; struct gs_data *gsh;
+  
+  struct crs_data *crs;
+
+  w = ceil(sqrt(comm->np)); gn = (w+1)*(w+1);
+  
+  if(comm->id==0) printf("arranging procs in a %u x %u square\n", w, w);
+  
+  px = comm->id%w, py = comm->id/w;
+  b[0] = xid[0] = (w+1)*py    +px+1;
+  b[1] = xid[1] = (w+1)*py    +px+2;
+  b[2] = xid[2] = (w+1)*(py+1)+px+1;
+  b[3] = xid[3] = (w+1)*(py+1)+px+2;
+
+  gn = comm_reduce_slong(comm, gs_max, (const slong*)&xid[3],1);
+  bmean = comm_reduce_double(comm, gs_add, b,4)/gn;
+
+  gsh = gs_setup((const slong*)xid,4, comm,0,gs_crystal_router,0);
+  gs(x,gs_double,gs_add,0,gsh,0);
+  gs(b,gs_double,gs_add,0,gsh,0);
+  for(i=0;i<4;++i) b[i]=xid[i]-bmean/x[i];
+  gs(b,gs_double,gs_add,0,gsh,0);
+  gs_free(gsh);
+  
+  gsh = gs_setup((const slong*)xid,4, comm,1,gs_crystal_router,0);
+  for(i=0;i<4;++i) uid[i]=comm->id;
+  gs(uid,gs_slong,gs_min,0,gsh,0);
+  gs_free(gsh);
+  for(i=0;i<4;++i) uid[i] = (uid[i]==comm->id?(slong)xid[i]:-(slong)xid[i]);
+
+  if(comm->id==0) {
+    xgid = tmalloc(slong, gn);
+    xg   = tmalloc(double,gn);
+    for(i=0;i<gn;++i) xgid[i] = -(slong)(i+1);
+    for(i=0;i<4;++i) xgid[xid[i]-1] = uid[i];
+  }
+  gsh = gs_setup(comm->id?uid:xgid,comm->id?4:gn, comm,0,gs_crystal_router,0);
+
+
+  if(comm->id==0) for(i=0;i<4;++i) xg[xid[i]-1]=b[i];
+  gs(comm->id?b:xg,gs_double,gs_add, 0, gsh, 0);
+  if(comm->id==0) for(i=0;i<gn;++i) printf("b[%u] = %g\n",i,xg[i]);
+  for(i=0;i<4;++i) b[i]=xid[i]-bmean/x[i];
+
+  crs = crs_setup(4,xid, 16,Ai,Aj,A, 1, comm);
+
+  crs_solve(x,crs,b);
+
+  crs_stats(crs);
+
+  crs_free(crs);
+
+  if(comm->id==0) for(i=0;i<4;++i) xg[xid[i]-1]=x[i];
+  gs(comm->id?x:xg,gs_double,gs_add, 0, gsh, 0);
+  if(comm->id==0) for(i=0;i<gn;++i) printf("x[%u] = %g\n",i,xg[i]);
+
+  gs_free(gsh);  
+  if(comm->id==0) free(xg), free(xgid);
+}
+
+int main(int narg, char* arg[])
+{
+  comm_ext world; int np;
+  struct comm comm;
+#ifdef MPI
+  MPI_Init(&narg,&arg);
+  world = MPI_COMM_WORLD;
+  MPI_Comm_size(world,&np);
+#else
+  world=0, np=1;
+#endif
+
+  comm_init(&comm,world);
+  test(&comm);
+  comm_free(&comm);
+  
+#ifdef MPI
+  MPI_Finalize();
+#endif
+
+  return 0;
+}
+
diff --git a/src/jl/crystal.c b/src/jl/crystal.c
new file mode 100644
index 0000000..a0e8135
--- /dev/null
+++ b/src/jl/crystal.c
@@ -0,0 +1,141 @@
+/*------------------------------------------------------------------------------
+  
+  Crystal Router
+  
+  Accomplishes all-to-all communication in log P msgs per proc
+  The routine is low-level; the format of the input/output is an
+  array of integers, consisting of a sequence of messages with format:
+  
+      target proc
+      source proc
+      m
+      integer
+      integer
+      ...
+      integer  (m integers in total)
+
+  Before crystal_router is called, the source of each message should be
+  set to this proc id; upon return from crystal_router, the target of each
+  message will be this proc id.
+  
+  Example Usage:
+  
+    struct crystal cr;
+    
+    crystal_init(&cr, &comm);  // makes an internal copy of comm
+    
+    crystal.data.n = ... ;  // total number of integers (not bytes!)
+    buffer_reserve(&cr.data, crystal.n * sizeof(uint));
+    ... // fill cr.data.ptr with messages
+    crystal_router(&cr);
+    
+    crystal_free(&cr);
+    
+  ----------------------------------------------------------------------------*/
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include "c99.h"
+#include "name.h"
+#include "fail.h"
+#include "types.h"
+#include "comm.h"
+#include "mem.h"
+
+#define crystal_init   PREFIXED_NAME(crystal_init  )
+#define crystal_free   PREFIXED_NAME(crystal_free  )
+#define crystal_router PREFIXED_NAME(crystal_router)
+
+struct crystal {
+  struct comm comm;
+  buffer data, work;
+};
+
+void crystal_init(struct crystal *p, const struct comm *comm)
+{
+  comm_dup(&p->comm, comm);
+  buffer_init(&p->data,1000);
+  buffer_init(&p->work,1000);
+}
+
+void crystal_free(struct crystal *p)
+{
+  comm_free(&p->comm);
+  buffer_free(&p->data);
+  buffer_free(&p->work);
+}
+
+static void uintcpy(uint *dst, const uint *src, uint n)
+{
+  if(dst+n<=src)    memcpy (dst,src,n*sizeof(uint));
+  else if(dst!=src) memmove(dst,src,n*sizeof(uint));
+}
+
+static uint crystal_move(struct crystal *p, uint cutoff, int send_hi)
+{
+  uint len, *src, *end;
+  uint *keep = p->data.ptr, *send;
+  uint n = p->data.n;
+  send = buffer_reserve(&p->work,n*sizeof(uint));
+  if(send_hi) { /* send hi, keep lo */
+    for(src=keep,end=keep+n; src<end; src+=len) {
+      len = 3 + src[2];
+      if(src[0]>=cutoff) memcpy (send,src,len*sizeof(uint)), send+=len;
+      else               uintcpy(keep,src,len),              keep+=len;
+    }
+  } else      { /* send lo, keep hi */
+    for(src=keep,end=keep+n; src<end; src+=len) {
+      len = 3 + src[2];
+      if(src[0]< cutoff) memcpy (send,src,len*sizeof(uint)), send+=len;
+      else               uintcpy(keep,src,len),              keep+=len;
+    }
+  }
+  p->data.n = keep - (uint*)p->data.ptr;
+  return      send - (uint*)p->work.ptr;
+}
+
+static void crystal_exchange(struct crystal *p, uint send_n, uint targ,
+                             int recvn, int tag)
+{
+  comm_req req[3];
+  uint count[2] = {0,0}, sum, *recv[2];
+
+  if(recvn)   
+    comm_irecv(&req[1],&p->comm, &count[0],sizeof(uint), targ        ,tag);
+  if(recvn==2)
+    comm_irecv(&req[2],&p->comm, &count[1],sizeof(uint), p->comm.id-1,tag);
+  comm_isend(&req[0],&p->comm, &send_n,sizeof(uint), targ,tag);
+  comm_wait(req,recvn+1);
+  
+  sum = p->data.n + count[0] + count[1];
+  buffer_reserve(&p->data,sum*sizeof(uint));
+  recv[0] = (uint*)p->data.ptr + p->data.n, recv[1] = recv[0] + count[0];
+  p->data.n = sum;
+  
+  if(recvn)    comm_irecv(&req[1],&p->comm,
+                          recv[0],count[0]*sizeof(uint), targ        ,tag+1);
+  if(recvn==2) comm_irecv(&req[2],&p->comm,
+                          recv[1],count[1]*sizeof(uint), p->comm.id-1,tag+1);
+  comm_isend(&req[0],&p->comm, p->work.ptr,send_n*sizeof(uint), targ,tag+1);
+  comm_wait(req,recvn+1);
+}
+
+void crystal_router(struct crystal *p)
+{
+  uint bl=0, bh, nl;
+  uint id = p->comm.id, n=p->comm.np;
+  uint send_n, targ, tag = 0;
+  int send_hi, recvn;
+  while(n>1) {
+    nl = (n+1)/2, bh = bl+nl;
+    send_hi = id<bh;
+    send_n = crystal_move(p,bh,send_hi);
+    recvn = 1, targ = n-1-(id-bl)+bl;
+    if(id==targ) targ=bh, recvn=0;
+    if(n&1 && id==bh) recvn=2;
+    crystal_exchange(p,send_n,targ,recvn,tag);
+    if(id<bh) n=nl; else n-=nl,bl=bh;
+    tag += 2;
+  }
+}
diff --git a/src/jl/crystal.h b/src/jl/crystal.h
new file mode 100644
index 0000000..b6d4582
--- /dev/null
+++ b/src/jl/crystal.h
@@ -0,0 +1,21 @@
+#ifndef CRYSTAL_H
+#define CRYSTAL_H
+
+#if !defined(COMM_H) || !defined(MEM_H)
+#warning "crystal.h" requires "comm.h" and "mem.h"
+#endif
+
+#define crystal_init   PREFIXED_NAME(crystal_init  )
+#define crystal_free   PREFIXED_NAME(crystal_free  )
+#define crystal_router PREFIXED_NAME(crystal_router)
+
+struct crystal {
+  struct comm comm;
+  buffer data, work;
+};
+
+void crystal_init(struct crystal *cr, const struct comm *comm);
+void crystal_free(struct crystal *cr);
+void crystal_router(struct crystal *cr);
+
+#endif
diff --git a/src/jl/crystal_test.c b/src/jl/crystal_test.c
new file mode 100644
index 0000000..c7f50df
--- /dev/null
+++ b/src/jl/crystal_test.c
@@ -0,0 +1,88 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "c99.h"
+#include "name.h"
+#include "fail.h"
+#include "types.h"
+#include "comm.h"
+#include "mem.h"
+#include "crystal.h"
+
+int main(int narg, char *arg[])
+{
+  comm_ext world; int np;
+  struct comm comm;
+  struct crystal cr;
+  uint i,sum, *data, *end;
+#ifdef MPI
+  MPI_Init(&narg,&arg);
+  world = MPI_COMM_WORLD;
+  MPI_Comm_size(world,&np);
+#else
+  world=0, np=1;
+#endif
+
+  comm_init(&comm,world);
+  
+  crystal_init(&cr,&comm);
+
+  cr.data.n = (4+(comm.id&1))*comm.np;
+  buffer_reserve(&cr.data,cr.data.n*sizeof(uint));
+  data = cr.data.ptr;
+  for(i=0;i<comm.np;++i, data+=3+data[2]) {
+    data[0] = i, data[1] = comm.id, data[2] = 1;
+    data[3] = 2*comm.id;
+    if(comm.id&1) data[2] = 2, data[4] = data[3]+1;
+  }
+
+#if 0
+  data = cr.data.ptr, end = data + cr.data.n;
+  for(;data!=end; data+=3+data[2]) {
+    uint i;
+    printf("%u -> %u:",data[1],data[0]);
+    for(i=0;i<data[2];++i) printf(" %u",data[3+i]);
+    printf("\n");
+  }
+#endif
+  
+  crystal_router(&cr);
+
+#if 0
+  printf("\n");
+  data = cr.data.ptr, end = data + cr.data.n;
+  for(;data!=end; data+=3+data[2]) {
+    uint i;
+    printf("%u <- %u:",data[0],data[1]);
+    for(i=0;i<data[2];++i) printf(" %u",data[3+i]);
+    printf("\n");
+  }
+#endif
+  
+  if(cr.data.n != comm.np*4 + (comm.np/2))
+    fail(1,__FILE__,__LINE__,"failure on %u",comm.id);
+  sum = 0;
+  data = cr.data.ptr, end = data + cr.data.n;
+  for(;data!=end; data+=3+data[2]) {
+    sum+=data[1];
+    if(data[3]!=data[1]*2)
+      fail(1,__FILE__,__LINE__,"failure on %u",comm.id);
+    if(data[1]&1 && (data[2]!=2 || data[4]!=data[3]+1))
+      fail(1,__FILE__,__LINE__,"failure on %u",comm.id);
+  }
+  if(sum != comm.np*(comm.np-1)/2)
+    fail(1,__FILE__,__LINE__,"failure on %u",comm.id);
+
+  crystal_free(&cr);
+  comm_free(&comm);
+
+  diagnostic("",__FILE__,__LINE__,
+    "test successful %u/%u",(unsigned)comm.id,(unsigned)comm.np);
+  
+#ifdef MPI
+  MPI_Finalize();
+#endif
+
+  return 0;
+}
diff --git a/src/jl/fail.c b/src/jl/fail.c
new file mode 100644
index 0000000..718aa36
--- /dev/null
+++ b/src/jl/fail.c
@@ -0,0 +1,53 @@
+#include <stdio.h>  /* sprintf, vfprintf, stdout */
+#include <stdarg.h> /* va_list, va_start, ... */
+#include <stdlib.h> /* exit */
+#include <string.h> /* memcpy, and str* functions in comm_fail */
+#include "name.h"
+#include "fail.h"
+#include "types.h"
+#include "comm.h"
+
+#define nek_exitt FORTRAN_UNPREFIXED(exitt,EXITT)
+void die(int status)
+{
+#ifdef NO_NEK_EXITT
+  if(comm_gbl_id==0) exit(status); else for(;;) ;
+#else
+  exit(status);
+#endif  
+}
+
+void vdiagnostic(const char *prefix, const char *file, unsigned line,
+                 const char *fmt, va_list ap)
+{
+  static char buf[2048]; int n,na,i=0;
+  sprintf(buf,"%s(proc %04d, %s:%d): ",prefix,(int)comm_gbl_id,file,line);
+  vsprintf(buf+strlen(buf),fmt,ap);
+  strcat(buf,"\n");
+  n=strlen(buf);
+  while(n && (na=fwrite(buf+i,1,n,stdout))) n-=na, i+=na;
+  fflush(stdout);
+}
+
+void diagnostic(const char *prefix, const char *file, unsigned line,
+                const char *fmt, ...)
+{
+  va_list ap; va_start(ap,fmt);
+  vdiagnostic(prefix,file,line,fmt,ap);
+  va_end(ap);
+}
+
+void vfail(int status, const char *file, unsigned line,
+           const char *fmt, va_list ap)
+{
+  vdiagnostic("ERROR ",file,line,fmt,ap);
+  die(status);
+}
+
+void fail(int status, const char *file, unsigned line,
+          const char *fmt, ...)
+{
+  va_list ap; va_start(ap,fmt);
+  vfail(status,file,line,fmt,ap);
+  va_end(ap);
+}
diff --git a/src/jl/fail.h b/src/jl/fail.h
new file mode 100644
index 0000000..0185110
--- /dev/null
+++ b/src/jl/fail.h
@@ -0,0 +1,52 @@
+#ifndef FAIL_H
+#define FAIL_H
+
+#if !defined(NAME_H)
+#warning "fail.h" requires "name.h"
+#endif
+
+#define  die        PREFIXED_NAME( die       )
+#define vdiagnostic PREFIXED_NAME(vdiagnostic)
+#define  diagnostic PREFIXED_NAME( diagnostic)
+#define vfail       PREFIXED_NAME(vfail      )
+#define  fail       PREFIXED_NAME( fail      )
+
+#ifdef __GNUC__
+#  define ATTRBD   __attribute__ ((noreturn))
+#  define ATTRB4V  __attribute__ ((format(printf,4,0)))
+#  define ATTRB4   __attribute__ ((format(printf,4,5)))
+#  define ATTRB4DV __attribute__ ((noreturn,format(printf,4,0)))
+#  define ATTRB4D  __attribute__ ((noreturn,format(printf,4,5)))
+#else
+#  define ATTRBD
+#  define ATTRB4V
+#  define ATTRB4
+#  define ATTRB4DV
+#  define ATTRB4D
+#endif
+
+#define DEF_FUNS() \
+  void  die(int status) ATTRBD; \
+  void  diagnostic(const char *prefix, const char *file, unsigned line, \
+                   const char *fmt, ...) ATTRB4  ; \
+  void  fail      (int status,         const char *file, unsigned line, \
+                   const char *fmt, ...) ATTRB4D ;
+#define VDEF_FUNS() \
+  void vdiagnostic(const char *prefix, const char *file, unsigned line, \
+                   const char *fmt, va_list ap) ATTRB4V  ; \
+  void vfail      (int status,         const char *file, unsigned line, \
+                   const char *fmt, va_list ap) ATTRB4DV ;
+DEF_FUNS()
+#ifdef va_arg
+VDEF_FUNS()
+#endif
+
+#undef VDEF_FUNS
+#undef DEF_FUNS
+#undef ATTRB4D
+#undef ATTRB4DV
+#undef ATTRB4
+#undef ATTRB4V
+#undef ATTRBD
+
+#endif
diff --git a/src/jl/fcrystal.c b/src/jl/fcrystal.c
new file mode 100644
index 0000000..3fe4c9a
--- /dev/null
+++ b/src/jl/fcrystal.c
@@ -0,0 +1,191 @@
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include "c99.h"
+#include "name.h"
+#include "fail.h"
+#include "types.h"
+#include "mem.h"
+#include "comm.h"
+#include "crystal.h"
+#include "sort.h"
+#include "sarray_sort.h"
+#include "sarray_transfer.h"
+
+/*--------------------------------------------------------------------------
+
+  FORTRAN Interface to crystal router
+   
+  integer h, np
+  MPI_Comm comm
+  call crystal_setup(h,comm,np)  ! set h to handle to new instance
+  ! it is a runtime error if MPI_Comm_size gives a value different than np
+  call crystal_free(h)         ! release instance
+
+  integer*? ituple(m,max)   ! integer type matching sint from "types.h"
+  call crystal_ituple_transfer(h, ituple,m,n,max, kp)
+    - moves each column ituple(:,i), 1 <= i <= n,
+      to proc ituple(kp,i)
+    - sets n to the number of columns received,
+      which may be larger than max (indicating loss of n-max columns)
+    - also sets ituple(kp,i) to the source proc of column ituple(:,i)
+
+  call crystal_ituple_sort(h, ituple,m,n, key,nkey)
+    - locally sorts columns ituple(:,1...n) in ascending order,
+      ranked by ituple(key(1),i),
+           then ituple(key(2),i),
+           ...
+           then ituple(key(nkey),i)
+    - no communication; h used for scratch area
+    - linear time
+    - assumes nonnegative integers
+
+  integer*? vi(mi,max)   ! integer type matching sint  from "types.h"
+  integer*? vl(ml,max)   ! integer type matching slong from "types.h"
+  real      vr(mr,max)
+  call crystal_tuple_transfer(h,n,max, vi,mi,vl,ml,vr,mr, kp)
+    - moves each column vi(:,i),vl(:,i),vr(:,i) 1 <= i <= n,
+      to proc vi(kp,i)
+    - sets n to the number of columns received,
+      which may be larger than max (indicating loss of n-max columns)
+    - also sets vi(kp,i) to the source proc of columns vi(:,i),vl(:,i),vr(:,i)
+
+  call crystal_tuple_sort(h,n, vi,mi,vl,ml,vr,mr, key,nkey)
+    - locally sorts columns vi/vl/vr (:,1...n) in ascending order,
+      ranked by vi(key(1),i) [ or vl(key(1)-mi,i) if key(1)>mi ],
+           then vi(key(2),i) [ or vl(key(2)-mi,i) if key(2)>mi ],
+           ...
+           then vi(key(nkey),i) or vl(key(nkey)-mi,i)
+    - no communication; h used for scratch area
+    - linear time
+    - assumes nonnegative integers
+    - sorting on reals not yet implemented
+
+  --------------------------------------------------------------------------*/
+
+#undef   crystal_free
+#define ccrystal_free  PREFIXED_NAME(crystal_free)
+
+#define fcrystal_setup           \
+  FORTRAN_NAME(crystal_setup          ,CRYSTAL_SETUP          )
+#define fcrystal_ituple_sort     \
+  FORTRAN_NAME(crystal_ituple_sort    ,CRYSTAL_ITUPLE_SORT    )
+#define fcrystal_tuple_sort      \
+  FORTRAN_NAME(crystal_tuple_sort     ,CRYSTAL_TUPLE_SORT     )
+#define fcrystal_ituple_transfer \
+  FORTRAN_NAME(crystal_ituple_transfer,CRYSTAL_ITUPLE_TRANSFER)
+#define fcrystal_tuple_transfer  \
+  FORTRAN_NAME(crystal_tuple_transfer ,CRYSTAL_TUPLE_TRANSFER )
+#define fcrystal_free            \
+  FORTRAN_NAME(crystal_free           ,CRYSTAL_FREE           )
+
+static struct crystal **handle_array = 0;
+static int handle_max = 0;
+static int handle_n = 0;
+
+void fcrystal_setup(sint *handle, const MPI_Fint *comm, const sint *np)
+{
+  struct crystal *p;
+  if(handle_n==handle_max)
+    handle_max+=handle_max/2+1,
+    handle_array=trealloc(struct crystal*,handle_array,handle_max);
+  handle_array[handle_n]=p=tmalloc(struct crystal,1);
+  comm_init_check(&p->comm, *comm, *np);
+  buffer_init(&p->data,1000);
+  buffer_init(&p->work,1000);
+  *handle = handle_n++;
+}
+
+#define CHECK_HANDLE(func) do \
+  if(*handle<0 || *handle>=handle_n || !handle_array[*handle]) \
+    fail(1,__FILE__,__LINE__,func ": invalid handle"); \
+while(0)
+
+void fcrystal_ituple_sort(const sint *handle,
+                          sint A[], const sint *m, const sint *n,
+                          const sint keys[], const sint *nkey)
+{
+  const size_t size = (*m)*sizeof(sint);
+  sint nk = *nkey;
+  buffer *buf;
+  CHECK_HANDLE("crystal_ituple_sort");
+  buf = &handle_array[*handle]->data;
+  if(--nk>=0) {
+    sortp(buf,0, (uint*)&A[keys[nk]-1],*n,size);
+    while(--nk>=0)
+      sortp(buf,1, (uint*)&A[keys[nk]-1],*n,size);
+    sarray_permute_buf_(ALIGNOF(sint),size,A,*n, buf);
+  }
+}
+
+void fcrystal_tuple_sort(const sint *const handle, const sint *const n,
+                         sint   Ai[], const sint *const mi,
+                         slong  Al[], const sint *const ml,
+                         double Ad[], const sint *const md,
+                         const sint keys[], const sint *const nkey)
+{
+  const size_t size_i = (*mi)*sizeof(sint),
+               size_l = (*ml)*sizeof(slong),
+               size_d = (*md)*sizeof(double);
+  int init=0;
+  sint nk = *nkey;
+  buffer *buf;
+  CHECK_HANDLE("crystal_tuple_sort");
+  buf = &handle_array[*handle]->data;
+  if(nk<=0) return;
+  while(--nk>=0) {
+    sint k = keys[nk]-1;
+    if(k<0 || k>=*mi+*ml)
+      fail(1,__FILE__,__LINE__,"crystal_tuple_sort: invalid key");
+    else if(k<*mi) sortp     (buf,init, (uint *)&Ai[k],    *n,size_i);
+    else           sortp_long(buf,init, (ulong*)&Al[k-*mi],*n,size_l);
+    init=1;
+  }
+  if(*mi) sarray_permute_buf_(ALIGNOF(sint  ),size_i,Ai,*n, buf);
+  if(*ml) sarray_permute_buf_(ALIGNOF(slong ),size_l,Al,*n, buf);
+  if(*md) sarray_permute_buf_(ALIGNOF(double),size_d,Ad,*n, buf);
+}
+
+void fcrystal_ituple_transfer(const sint *handle,
+                              sint A[], const sint *m, sint *n,
+                              const sint *nmax, const sint *proc_key)
+{
+  struct array ar, *const ar_ptr = &ar;
+  const unsigned size=(*m)*sizeof(sint);
+  CHECK_HANDLE("crystal_ituple_transfer");
+  ar.ptr=A, ar.n=*n, ar.max=*nmax;
+  *n = sarray_transfer_many(&ar_ptr,&size,1, 1,0,1,(*proc_key-1)*sizeof(sint),
+         (uint*)&A[*proc_key-1],size, handle_array[*handle]);
+}
+
+void fcrystal_tuple_transfer(
+  const sint *const handle, sint *const n, const sint *const max,
+  sint   Ai[], const sint *const mi,
+  slong  Al[], const sint *const ml,
+  double Ad[], const sint *const md,
+  const sint *const proc_key)
+{
+  struct array ar_i, ar_l, ar_d, *ar[3];
+  unsigned size[3];
+  CHECK_HANDLE("crystal_tuple_transfer");
+  size[0]=*mi*sizeof(sint);
+  size[1]=*ml*sizeof(slong);
+  size[2]=*md*sizeof(double);
+  ar[0]=&ar_i, ar[1]=&ar_l, ar[2]=&ar_d;
+  ar_i.ptr=Ai,ar_l.ptr=Al,ar_d.ptr=Ad;
+  ar_i.n=ar_l.n=ar_d.n = *n;
+  ar_i.max=ar_l.max=ar_d.max=*max;
+  *n = sarray_transfer_many(ar,size,3, 1,0,1,(*proc_key-1)*sizeof(sint),
+         (uint*)&Ai[*proc_key-1],size[0], handle_array[*handle]);
+}
+
+void fcrystal_free(sint *handle)
+{
+  CHECK_HANDLE("crystal_free");
+  ccrystal_free(handle_array[*handle]);
+  free(handle_array[*handle]);
+  handle_array[*handle] = 0;
+}
+
+
diff --git a/src/jl/gen_poly_imp.c b/src/jl/gen_poly_imp.c
new file mode 100644
index 0000000..21a7410
--- /dev/null
+++ b/src/jl/gen_poly_imp.c
@@ -0,0 +1,227 @@
+#include <math.h>
+#include <stdio.h>
+#include <gmp.h>
+
+#define PREC_BITS 256
+#define DIGITS 50
+
+#define GLL_LAG_FIX_MAX 16
+
+#if 1
+#  define STATIC "static "
+#else
+#  define STATIC ""
+#endif
+
+
+#define PI 3.1415926535897932384626433832795028841971693993751058209749445923
+
+#define DECLARE_1VAR(a)        static int init=0; static mpf_t a; \
+                               if(!init) init=1, mpf_init(a)
+#define DECLARE_2VARS(a,b)     static int init=0; static mpf_t a,b; \
+                               if(!init) init=1, mpf_init(a), mpf_init(b)
+#define DECLARE_3VARS(a,b,c)   static int init=0; static mpf_t a,b,c; \
+                               if(!init) init=1, mpf_init(a), mpf_init(b), \
+                                                 mpf_init(c)
+#define DECLARE_4VARS(a,b,c,d) static int init=0; static mpf_t a,b,c,d; \
+                               if(!init) init=1, mpf_init(a), mpf_init(b), \
+                                                 mpf_init(c), mpf_init(d)
+                                                 
+static int is_small(const mpf_t x, const mpf_t y) {
+  DECLARE_2VARS(xa,ya);
+  mpf_abs(xa,x);
+  mpf_abs(ya,y);
+  mpf_div_2exp(ya,ya,PREC_BITS-mp_bits_per_limb);
+  return mpf_cmp(xa,ya) < 0;
+}
+
+typedef void fun_3term(mpf_t Pn, int n, const mpf_t x);
+
+#define DECLARE_THREE_TERM(name, i0_init, init_Ps, a_ip1,a_i,a_im1) \
+static void name(mpf_t Pn, int n, const mpf_t x) \
+{ \
+  int i, i0_init; \
+  DECLARE_4VARS(a,b,P_im1,P_i); \
+  init_Ps; \
+  for(i=i0+1; i<n; ++i) { \
+    mpf_mul(a, x,P_i); \
+    mpf_mul_ui(a, a,a_i); \
+    mpf_mul_ui(b, P_im1,a_im1); \
+    mpf_sub(a, a,b); \
+    mpf_swap(P_im1, P_i); \
+    mpf_div_ui(P_i, a,a_ip1); \
+  } \
+  mpf_set(Pn, n>i0?P_i:P_im1); \
+}
+
+DECLARE_THREE_TERM(legendre,    i0=0,(mpf_set_ui(P_im1,1),mpf_set   (P_i,x)),
+                   i+1, 2*i+1, i  )
+DECLARE_THREE_TERM(legendre_d1, i0=0,(mpf_set_ui(P_im1,0),mpf_set_ui(P_i,1)),
+                   i  , 2*i+1, i+1)
+DECLARE_THREE_TERM(legendre_d2, i0=1,(mpf_set_ui(P_im1,0),mpf_set_ui(P_i,3)),
+                   i-1, 2*i+1, i+2)
+
+static void newton(mpf_t x, double seed,
+                   fun_3term *fun, fun_3term *der, int n)
+{
+  DECLARE_3VARS(ox,f,df);
+  mpf_set_d(x, seed);
+  do {
+    mpf_set(ox, x);
+    fun(f, n,x), der(df, n,x), mpf_div(f, f,df), mpf_sub(x, x,f);
+  } while(!is_small(f,x));
+  fun( f, n,x), der(df, n,x), mpf_div(f, f,df), mpf_sub(x, x,f);
+}
+
+static void gauss_node(mpf_t z, int n, int i) {
+  if( (n&1) && i==n/2 ) mpf_set_ui(z,0);
+  else newton(z, cos( (2*n-2*i-1)*(PI/2)/n ), legendre,legendre_d1,n);
+}
+
+static void lobatto_node(mpf_t z, int n, int i) {
+  if( (n&1) && i==n/2 ) mpf_set_ui(z,0);
+  else if(i==0)   mpf_set_d(z,-(double)1);
+  else if(i==n-1) mpf_set_ui(z,1);
+  else newton(z, cos( (n-1-i)*PI/(n-1) ), legendre_d1,legendre_d2,n-1);
+}
+
+#define PRINT_LIST(i, i0,nline,n, printi,sep,sepline) \
+  do { \
+    int i; \
+    for(i=i0;i<n;++i) { \
+      printi; \
+      printf("%s",i==n-1?"":((i-i0)%nline==nline-1?sepline:sep)); \
+    } \
+  } while(0)
+
+static void print_gll_lag_fix(int n)
+{
+  int i;
+  DECLARE_1VAR(z);
+  if(n>3) {
+    printf("static const double gllz_%02d[%2d] = {\n  ",n,n/2-1);
+    for(i=1;i<=n/2-1;++i) {
+      lobatto_node(z, n,n-1-i);
+      if(i!=1) printf(",\n  ");
+      gmp_printf("%.*Fg",DIGITS,z);
+    }
+    puts("\n};\n");
+  }
+  printf(STATIC "void gll_lag_%02d(double *restrict p, double *restrict data,\n"
+           "                       unsigned n, int d, double xh)\n{\n",n);
+  printf("  const double *restrict w = data;\n");
+  printf("  const double x = xh*2;\n");
+  #define PRINT_D(i) do { \
+    printf("d%02d=x",i); \
+    if(2*i+1==n)    printf("              "); \
+    else if(i==0)   printf("+2            "); \
+    else if(i==n-1) printf("-2            "); \
+    else if(i<n/2)  printf("+2*gllz_%02d[%2d]",n,i-1); \
+    else            printf("-2*gllz_%02d[%2d]",n,n-2-i); \
+  } while(0)
+  printf("%s",                            "  const double ");
+  PRINT_LIST(i, 0,3,n, PRINT_D(i),",",",\n               ");
+  #undef PRINT_D
+  #define PRINT_U0(i) (i==0  ?printf("    1"):printf("u0_%02d",i))
+  #define PRINT_V0(i) (i==n-1?printf("    1"):printf("v0_%02d",i))
+  #define PRINT_U1(i) (i<=1  ?printf("    %d",i      ):printf("u1_%02d",i))
+  #define PRINT_V1(i) (i>=n-2?printf("    %d",n-1-(i)):printf("v1_%02d",i))
+  #define PRINT_U2(i) (i<=1  ?printf("    0"): \
+                      (i==2  ?printf("    2"):printf("u2_%02d",i)))
+  #define PRINT_V2(i) (i>=n-2?printf("    0"): \
+                      (i==n-3?printf("    2"):printf("v2_%02d",i)))
+  printf("%s",";\n  const double ");
+  PRINT_LIST(i, 1,3,n,
+    (PRINT_U0(i),putchar('='),PRINT_U0(i-1),printf("*d%02d",i-1)),
+    ",",",\n               ");
+  printf("%s",";\n  const double ");
+  PRINT_LIST(i, 1,3,n,
+    (PRINT_V0(n-1-i),putchar('='),printf("d%02d*",n-i),PRINT_V0(n-i)),
+    ",",",\n               ");
+  printf("%s",";\n  ");
+  PRINT_LIST(i, 0,3,n, 
+    (printf("p[%2d]=w[%2d]*",i,i),PRINT_U0(i),putchar('*'),
+     PRINT_V0(i)),"; ",";\n  ");
+  puts(";\n  if(d>0) {");
+  if(n>2) {
+    printf("%s","    const double ");
+    PRINT_LIST(i, 2,2,n,
+      (PRINT_U1(i),putchar('='),PRINT_U1(i-1),printf("*d%02d",i-1),
+       putchar('+'),PRINT_U0(i-1)),
+      ",",",\n                 ");
+    printf("%s",";\n    const double ");
+    PRINT_LIST(i, 2,2,n,
+      (PRINT_V1(n-1-i),putchar('='),printf("d%02d*",n-i),PRINT_V1(n-i),
+       putchar('+'),PRINT_V0(n-i)),
+      ",",",\n                 ");
+    puts(";");
+  }
+  for(i=0;i<n;++i) {
+    printf("    p[%d+%2d]=2*w[%2d]*(",n,i,i);
+    if(i==0)        printf("                  "),PRINT_V1(0);
+    else if(i==n-1) PRINT_U1(i),printf("                  ");
+    else PRINT_U1(i),putchar('*'),PRINT_V0(i),putchar('+'),
+         PRINT_U0(i),putchar('*'),PRINT_V1(i);
+    puts(");");
+  }
+  puts("    if(d>1) {");
+  if(n>3) {
+    printf("%s","      const double ");
+    PRINT_LIST(i, 3,2,n,
+      (PRINT_U2(i),putchar('='),PRINT_U2(i-1),printf("*d%02d",i-1),
+       printf("+2*"),PRINT_U1(i-1)),
+      ",",",\n                   ");
+    printf("%s",";\n      const double ");
+    PRINT_LIST(i, 3,2,n,
+      (PRINT_V2(n-1-i),putchar('='),printf("d%02d*",n-i),PRINT_V2(n-i),
+       printf("+2*"),PRINT_V1(n-i)),
+      ",",",\n                   ");
+    puts(";");
+  }  
+  if(n<3) for(i=0;i<n;++i) printf("      p[2*%d+%2d]=0;\n",n,i);
+  else for(i=0;i<n;++i) {
+      printf("      p[2*%d+%2d]=4*w[%2d]*(",n,i,i);
+      if(i>1)
+        PRINT_U2(i),putchar('*'),PRINT_V0(i);
+      else printf("           ");
+      if(i>0 && i<n-1)
+        printf("+2*"),PRINT_U1(i),putchar('*'),PRINT_V1(i);
+      else printf("              ");
+      if(i<n-2)
+        putchar('+'),PRINT_U0(i),putchar('*'),PRINT_V2(i);
+      else printf("            ");
+      puts(");");
+  }
+  #undef PRINT_U0
+  #undef PRINT_V0
+  #undef PRINT_U1
+  #undef PRINT_V1
+  #undef PRINT_U2
+  #undef PRINT_V2
+  puts("    }\n  }\n}");
+}
+
+
+int main()
+{
+  int n;
+  mpf_set_default_prec(PREC_BITS);
+  puts("/* generated by gen_poly_imp.c */\n");
+  printf("#define GLL_LAG_FIX_MAX %d\n\n",GLL_LAG_FIX_MAX);
+  /*puts("typedef void gll_lag_fun(double *p, int d, int n, double x);\n");*/
+  for(n=2;n<=GLL_LAG_FIX_MAX;++n)
+      print_gll_lag_fix(n), puts("");
+  printf(STATIC "const double *const gllz_table[%d] = {\n  ",
+    GLL_LAG_FIX_MAX-3);
+  PRINT_LIST(i, 4,8,(GLL_LAG_FIX_MAX+1),
+    printf("gllz_%02d",i), ", ",",\n  ");
+  puts("\n};");
+  puts("");
+  printf(STATIC "lagrange_fun *const gll_lag_table[%d] = {\n  ",
+    GLL_LAG_FIX_MAX-1);
+  PRINT_LIST(i, 2,6,(GLL_LAG_FIX_MAX+1),
+    printf("&gll_lag_%02d",i), ", ",",\n  ");
+  puts("\n};");
+  puts("");
+  return 0;
+}
diff --git a/src/jl/gs.c b/src/jl/gs.c
new file mode 100644
index 0000000..d39051a
--- /dev/null
+++ b/src/jl/gs.c
@@ -0,0 +1,1503 @@
+#include <stdio.h>
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include "c99.h"
+#include "name.h"
+#include "fail.h"
+#include "types.h"
+
+#ifdef _OPENMP
+#include "omp.h"
+#endif
+
+#define gs_op gs_op_t   /* fix conflict with fortran */
+
+#include "gs_defs.h"
+#include "gs_local.h"
+#include "comm.h"
+#include "mem.h"
+#include "sort.h"
+#include "crystal.h"
+#include "sarray_sort.h"
+#include "sarray_transfer.h"
+
+#define gs         PREFIXED_NAME(gs       )
+#define gs_vec     PREFIXED_NAME(gs_vec   )
+#define gs_many    PREFIXED_NAME(gs_many  )
+#define gs_setup   PREFIXED_NAME(gs_setup )
+#define gs_free    PREFIXED_NAME(gs_free  )
+#define gs_unique  PREFIXED_NAME(gs_unique)
+
+GS_DEFINE_DOM_SIZES()
+
+typedef enum { mode_plain, mode_vec, mode_many,
+               mode_dry_run } gs_mode;
+
+static buffer static_buffer = null_buffer;
+
+static void gather_noop(
+  void *out, const void *in, const unsigned vn,
+  const uint *map, gs_dom dom, gs_op op)
+{}
+
+static void scatter_noop(
+  void *out, const void *in, const unsigned vn,
+  const uint *map, gs_dom dom)
+{}
+
+static void init_noop(
+  void *out, const unsigned vn,
+  const uint *map, gs_dom dom, gs_op op)
+{}
+
+/*------------------------------------------------------------------------------
+  Topology Discovery
+------------------------------------------------------------------------------*/
+
+struct gs_topology {
+  ulong total_shared; /* number of globally unique shared ids */
+  struct array nz; /* array of nonzero_id's, grouped by id, 
+                      sorted by primary index, then flag, then index */
+  struct array sh; /* array of shared_id's, arbitrary ordering */
+  struct array pr; /* array of primary_shared_id's */
+};
+
+static void gs_topology_free(struct gs_topology *top)
+{
+  array_free(&top->pr);
+  array_free(&top->sh);
+  array_free(&top->nz);
+}
+
+/************** Local topology **************/
+
+/* nonzero_ids    (local part)
+
+   Creates an array of s_nonzeros, one per nonzero in user id array. The
+   output array is grouped by id. Within each group, non-flagged entries come
+   first; otherwise the entries within the group are sorted by the index into
+   the user id array. The first index in each group is the primary index, and
+   is stored along with each entry. The groups themselves are ordered in
+   increasing order of the primary index associated with the group (as opposed
+   to the user id). */
+
+struct nonzero_id {
+  ulong id; uint i, flag, primary;
+};
+
+static void nonzero_ids(struct array *nz,
+                        const slong *id, const uint n, buffer *buf)
+{
+  ulong last_id = -(ulong)1;
+  uint i, primary = -(uint)1;
+  struct nonzero_id *row, *end;
+  array_init(struct nonzero_id,nz,n), end=row=nz->ptr;
+  for(i=0;i<n;++i) {
+    slong id_i = id[i], abs_id = iabsl(id_i);
+    if(id_i==0) continue;
+    end->i = i;
+    end->id = abs_id;
+    end->flag = id_i!=abs_id;
+    ++end;
+  }
+  nz->n = end-row;
+  array_resize(struct nonzero_id,nz,nz->n);
+  sarray_sort_2(struct nonzero_id,nz->ptr,nz->n, id,1, flag,0, buf);
+  for(row=nz->ptr,end=row+nz->n;row!=end;++row) {
+    ulong this_id = row->id;
+    if(this_id!=last_id) primary = row->i;
+    row->primary = primary;
+    last_id = this_id;
+  }
+  sarray_sort(struct nonzero_id,nz->ptr,nz->n, primary,0, buf);
+}
+
+/************** Global topology **************/
+
+/* construct list of all unique id's on this proc */
+struct unique_id { ulong id; uint work_proc, src_if; };
+static void unique_ids(struct array *un, const struct array *nz, const uint np)
+{
+  struct unique_id *un_row;
+  const struct nonzero_id *nz_row, *nz_end;
+  array_init(struct unique_id,un,nz->n), un_row=un->ptr;
+  for(nz_row=nz->ptr,nz_end=nz_row+nz->n;nz_row!=nz_end;++nz_row) {
+    if(nz_row->i != nz_row->primary) continue;
+    un_row->id = nz_row->id;
+    un_row->work_proc = nz_row->id%np;
+    un_row->src_if = nz_row->flag ? ~nz_row->i : nz_row->i;
+    ++un_row;
+  }
+  un->n = un_row - (struct unique_id*)un->ptr;
+}
+
+/* shared_ids    (global part)
+
+   Creates an array of shared_id's from an array of nonzero_id's. Each entry
+   in the output identifies one id shared with one other processor p.
+   Note: two procs share an id only when at least one of them has it unflagged.
+   The primary index is i locally and ri remotely. Bit 1 of flags indicates
+   the local flag, bit 2 indicates the remote flag. The output has no
+   particular ordering.
+
+   Also creates an array of primary_shared_id's, one for each shared id.
+   This struct includes ord, a global rank of the id (arbitrary, but unique). */
+
+#define FLAGS_LOCAL  1
+#define FLAGS_REMOTE 2
+
+/* i  : local primary index
+   p  : remote proc
+   ri : remote primary index
+   bi : buffer index (set and used during pw setup) */
+struct shared_id {
+  ulong id; uint i, p, ri, bi; unsigned flags;
+};
+
+struct primary_shared_id {
+  ulong id, ord; uint i; unsigned flag;
+};
+
+
+
+struct shared_id_work { ulong id,ord; uint p1, p2, i1f, i2f; };
+static void shared_ids_aux(struct array *sh, struct array *pr, uint pr_n,
+                           struct array *wa, buffer *buf)
+{
+  const struct shared_id_work *w, *we;
+  struct shared_id *s;
+  struct primary_shared_id *p;
+  ulong last_id = -(ulong)1;
+  /* translate work array to output arrays */
+  sarray_sort(struct shared_id_work,wa->ptr,wa->n, id,1, buf);
+  array_init(struct shared_id,sh,wa->n), sh->n=wa->n, s=sh->ptr;
+  array_init(struct primary_shared_id,pr,pr_n), p=pr->ptr;
+  for(w=wa->ptr,we=w+wa->n;w!=we;++w) {
+    uint i1f = w->i1f, i2f = w->i2f;
+    uint i1 = ~i1f<i1f?~i1f:i1f, i2 = ~i2f<i2f?~i2f:i2f;
+    s->id=w->id, s->i=i1, s->p=w->p2, s->ri=i2;
+    s->flags = ((i2f^i2)&FLAGS_REMOTE) | ((i1f^i1)&FLAGS_LOCAL);
+    ++s;
+    if(w->id!=last_id) {
+      last_id=w->id;
+      p->id=last_id, p->ord=w->ord, p->i=i1, p->flag=(i1f^i1)&FLAGS_LOCAL;
+      ++p;
+    }
+  }
+  pr->n = p-(struct primary_shared_id*)pr->ptr;
+  sarray_sort(struct primary_shared_id,pr->ptr,pr->n, i,0, buf);
+}
+
+static ulong shared_ids(struct array *sh, struct array *pr,
+                        const struct array *nz, struct crystal *cr)
+{
+  struct array un; struct unique_id *un_row, *un_end, *other;
+  ulong last_id = -(ulong)1;
+  ulong ordinal[2], n_shared=0, scan_buf[2];
+  struct array wa; struct shared_id_work *w;
+  uint n_unique;
+  /* construct list of all unique id's on this proc */
+  unique_ids(&un,nz,cr->comm.np);
+  n_unique = un.n;
+  /* transfer list to work procs */
+  sarray_transfer(struct unique_id,&un, work_proc,1, cr);
+  /* group by id, put flagged entries after unflagged (within each group) */
+  sarray_sort_2(struct unique_id,un.ptr,un.n, id,1, src_if,0, &cr->data);
+  /* count shared id's */
+  for(un_row=un.ptr,un_end=un_row+un.n;un_row!=un_end;++un_row) {
+    ulong id = un_row->id;
+    if(~un_row->src_if<un_row->src_if) continue;
+    if(id==last_id) continue;
+    other=un_row+1;
+    if(other!=un_end&&other->id==id) last_id=id, ++n_shared;
+  }
+  comm_scan(ordinal, &cr->comm,gs_slong,gs_add, &n_shared,1, scan_buf);
+  /* there are ordinal[1] globally shared unique ids;
+           and ordinal[0] of those are seen by work procs of lower rank;
+     i.e., this work processor sees the range ordinal[0] + (0,n_shared-1) */
+  /* construct list of shared ids */
+  last_id = -(ulong)1;
+  array_init(struct shared_id_work,&wa,un.n), wa.n=0, w=wa.ptr;
+  for(un_row=un.ptr,un_end=un_row+un.n;un_row!=un_end;++un_row) {
+    ulong id = un_row->id;
+    uint p1 = un_row->work_proc, i1f = un_row->src_if;
+    if(~i1f<i1f) continue;
+    for(other=un_row+1;other!=un_end&&other->id==id;++other) {
+      uint p2 = other->work_proc, i2f = other->src_if;
+      ulong ord;
+      if(id!=last_id) last_id=id, ++ordinal[0];
+      ord=ordinal[0]-1;
+      if(wa.n+2>wa.max)
+        array_reserve(struct shared_id_work,&wa,wa.n+2),
+        w=(struct shared_id_work*)wa.ptr+wa.n;
+      w->id=id, w->ord=ord, w->p1=p1, w->p2=p2, w->i1f=i1f, w->i2f=i2f, ++w;
+      w->id=id, w->ord=ord, w->p1=p2, w->p2=p1, w->i1f=i2f, w->i2f=i1f, ++w;
+      wa.n+=2;
+    }
+  }
+  /* transfer shared list to source procs */
+  sarray_transfer(struct shared_id_work,&wa, p1,0, cr);
+  /* fill output arrays from work array */
+  shared_ids_aux(sh,pr,n_unique,&wa,&cr->data);
+  array_free(&un);
+  array_free(&wa);
+  return ordinal[1];
+}
+
+static void get_topology(struct gs_topology *top,
+                         const slong *id, uint n, struct crystal *cr)
+{
+  nonzero_ids(&top->nz,id,n,&cr->data);
+  top->total_shared = shared_ids(&top->sh,&top->pr, &top->nz,cr);
+}
+
+static void make_topology_unique(struct gs_topology *top, slong *id,
+                                 uint pid, buffer *buf)
+{
+  struct array *const nz=&top->nz, *const sh=&top->sh, *const pr=&top->pr;
+  struct nonzero_id *pnz;
+  struct shared_id *pb, *pe, *e, *out;
+  struct primary_shared_id *q;
+
+  /* flag local non-primaries */
+  sarray_sort(struct nonzero_id,nz->ptr,nz->n, i,0, buf);
+  if(id) {
+    struct nonzero_id *p,*e;
+    for(p=nz->ptr,e=p+nz->n;p!=e;++p)
+      if(p->i != p->primary) id[p->i]=-(slong)p->id,p->flag=1;
+  } else {
+    struct nonzero_id *p,*e;
+    for(p=nz->ptr,e=p+nz->n;p!=e;++p)
+      if(p->i != p->primary) p->flag=1;
+  }
+  sarray_sort(struct nonzero_id,nz->ptr,nz->n, primary,0, buf);
+
+  /* assign owner among shared primaries */
+  
+  /* create sentinel with i = -1 */
+  array_reserve(struct shared_id,sh,sh->n+1);
+  ((struct shared_id*)sh->ptr)[sh->n].i = -(uint)1;
+  /* in the sorted list of procs sharing a given id,
+     the owner is chosen to be the j^th unflagged proc,
+     where j = id mod (length of list) */
+  sarray_sort_2(struct shared_id,sh->ptr,sh->n, i,0, p,0, buf);
+  out=sh->ptr; pnz=top->nz.ptr;
+  for(pb=sh->ptr,e=pb+sh->n;pb!=e;pb=pe) {
+    uint i = pb->i, lt=0,gt=0, owner; struct shared_id *p;
+    while(pnz->i!=i) ++pnz;
+    /* note: current proc not in list */
+    for(pe=pb; pe->i==i && pe->p<pid; ++pe) if(!(pe->flags&FLAGS_REMOTE)) ++lt;
+    for(     ; pe->i==i             ; ++pe) if(!(pe->flags&FLAGS_REMOTE)) ++gt;
+    if(!(pb->flags&FLAGS_LOCAL)) {
+      owner = pb->id%(lt+gt+1);
+      if(owner==lt) goto make_sh_unique_mine;
+      if(owner>lt) --owner;
+    } else
+      owner = pb->id%(lt+gt);
+    /* we don't own pb->id */
+    if(id) id[i] = -(slong)pb->id;
+    pnz->flag=1;
+    /* we only share this id with the owner now; remove the other entries */
+    for(p=pb; p!=pe; ++p) if(!(p->flags&FLAGS_REMOTE) && !(owner--)) break;
+    if(p!=pe) *out=*p, out->flags=FLAGS_LOCAL, ++out;
+    continue;
+  make_sh_unique_mine:
+    /* we own pb->id */
+    if(out==pb) { out=pe; for(p=pb; p!=pe; ++p) p->flags=FLAGS_REMOTE; }
+    else        for(p=pb; p!=pe; ++p) *out=*p,out->flags=FLAGS_REMOTE,++out;
+  }
+  sh->n = out - ((struct shared_id*)sh->ptr);
+
+  /* set primary_shared_id flags to match */
+  ((struct shared_id*)sh->ptr)[sh->n].i = -(uint)1;
+  sarray_sort(struct shared_id,sh->ptr,sh->n, id,1, buf);
+  sarray_sort(struct primary_shared_id,pr->ptr,pr->n, id,1, buf);
+  q=pr->ptr;
+  for(pb=sh->ptr,e=pb+sh->n;pb!=e;pb=pe) {
+    uint i=pb->i;
+    pe=pb; while(pe->i==i) ++pe;
+    if(q->id!=pb->id) printf("FAIL!!!\n");
+    q->flag=pb->flags&FLAGS_LOCAL;
+    ++q;
+  }
+}
+
+
+/*------------------------------------------------------------------------------
+  Divide lists for parallel execution
+------------------------------------------------------------------------------*/
+
+void sublist(const uint *map, uint ***slPtr) {
+
+  // Iterate over array and count items and lists
+
+  uint i,j;
+  int itemCount = 0, listCount = 0;
+  const uint *lmap = map;
+  while((i=*lmap++)!=-(unsigned int)1) {
+    listCount++;
+  
+    j=*lmap++;
+    do {
+      itemCount++;
+    } while ((j=*lmap++)!=-(unsigned int)1);
+  }
+
+  // Determine number of threads and lists
+
+  int maxThreads = 1;
+#ifdef _OPENMP
+  maxThreads = omp_get_max_threads();
+#endif
+  int max = (maxThreads <= listCount) ? maxThreads : listCount;
+  if (max == 0) max = 1;
+
+  // Setup sublists
+
+  uint *subListData = tmalloc(uint, max+itemCount+2*listCount);
+  *slPtr = tmalloc(uint*, maxThreads);
+  uint **subListPtr = *slPtr;
+
+  subListData[0] = -(unsigned int)1;
+  subListPtr[0] = subListData;
+  int nextSubList = 1;
+
+  // Populate sublists
+
+  int copyItemCount = 0;
+  lmap = map;
+  while((i=*lmap++)!=-(unsigned int)1) {
+    *subListData++ = i;
+
+    j=*lmap++;
+    do {
+      *subListData++ = j;
+      copyItemCount++;
+    } while ((j=*lmap++)!=-(unsigned int)1);
+    *subListData++ = -(unsigned int) 1;
+  
+    if ( copyItemCount*max >= itemCount*nextSubList ) {
+      *subListData= -(unsigned int)1;
+  
+      if (copyItemCount != itemCount) {
+        subListData++;
+        subListPtr[nextSubList] = subListData;
+        nextSubList++;
+      }
+    }
+  }
+
+  // Terminate unused sublists
+
+  for (; nextSubList < maxThreads; nextSubList++) {
+    subListPtr[nextSubList] = subListData;
+  }
+
+  return;
+}
+
+void subflagged(const uint *map, uint ***slPtr) {
+
+  // Iterate over map and count items
+
+  int count = 0;
+  const uint *lmap = map;
+  while(*lmap++ !=-(unsigned int)1) count++;
+
+  // Determine number of threads and sublists
+
+  int maxThreads = 1;
+#ifdef _OPENMP
+  maxThreads = omp_get_max_threads();
+#endif
+  int maxLists = (maxThreads <= count) ? maxThreads : count;
+  if (maxLists == 0) maxLists = 1;
+
+  // Setup empty sublists
+
+  uint *subFlaggedData = tmalloc(uint, maxLists+count);
+  *slPtr = tmalloc(uint*, maxThreads);
+
+  subFlaggedData[0] = -(unsigned int)1;
+  (*slPtr)[0] = subFlaggedData;
+  int nextList = 1;
+
+  // Populate sublists
+
+  int copyCount=0;
+  uint i;
+  lmap = map;
+  while((i=*lmap++)!=-(unsigned int)1) {
+    *subFlaggedData++ = i;
+    copyCount++;
+  
+    if (copyCount*maxLists >= count*nextList) {
+      *subFlaggedData = -(unsigned int)1;
+      
+      if (copyCount != count) {
+        subFlaggedData++;
+        (*slPtr)[nextList] = subFlaggedData;
+        nextList++;
+      }
+    }
+  }
+
+  // Terminate unused sublists
+  
+  for (; nextList < maxThreads; nextList++) {
+    (*slPtr)[nextList] = subFlaggedData;
+  }
+
+  return;
+}
+
+/*------------------------------------------------------------------------------
+  Local setup
+------------------------------------------------------------------------------*/
+
+/* assumes nz is sorted by primary, then flag, then index */
+static const uint *local_map(const struct array *nz, const int ignore_flagged)
+{
+  uint *map, *p, count = 1;
+  const struct nonzero_id *row, *other, *end;
+#define DO_COUNT(cond) do \
+    for(row=nz->ptr,end=row+nz->n;row!=end;) {                     \
+      ulong row_id = row->id; int any=0;                           \
+      for(other=row+1;other!=end&&other->id==row_id&&cond;++other) \
+        any=2, ++count;                                            \
+      count+=any, row=other;                                       \
+    } while(0)
+  if(ignore_flagged) DO_COUNT(other->flag==0); else DO_COUNT(1);
+#undef DO_COUNT
+  p = map = tmalloc(uint,count);
+#define DO_SET(cond) do \
+    for(row=nz->ptr,end=row+nz->n;row!=end;) {                     \
+      ulong row_id = row->id; int any=0;                           \
+      *p++ = row->i;                                               \
+      for(other=row+1;other!=end&&other->id==row_id&&cond;++other) \
+        any=1, *p++ = other->i;                                    \
+      if(any) *p++ = -(uint)1; else --p;                           \
+      row=other;                                                   \
+    } while(0)
+  if(ignore_flagged) DO_SET(other->flag==0); else DO_SET(1);
+#undef DO_SET
+  *p = -(uint)1;
+  return map;
+}
+
+static const uint *flagged_primaries_map(const struct array *nz)
+{
+  uint *map, *p, count=1;
+  const struct nonzero_id *row, *end;
+  for(row=nz->ptr,end=row+nz->n;row!=end;++row)
+    if(row->i==row->primary && row->flag==1) ++count;
+  p = map = tmalloc(uint,count);
+  for(row=nz->ptr,end=row+nz->n;row!=end;++row)
+    if(row->i==row->primary && row->flag==1) *p++ = row->i;
+  *p = -(uint)1;
+  return map;
+}
+
+/*------------------------------------------------------------------------------
+  Remote execution and setup
+------------------------------------------------------------------------------*/
+
+typedef void exec_fun(
+  void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op,
+  unsigned transpose, const void *execdata, const struct comm *comm, char *buf);
+typedef void fin_fun(void *data);
+
+struct gs_remote {
+  uint buffer_size;
+  void *data;
+  exec_fun *exec;
+  fin_fun *fin;
+};
+
+typedef void setup_fun(struct gs_remote *r, struct gs_topology *top,
+                       const struct comm *comm, buffer *buf);
+
+/*------------------------------------------------------------------------------
+  Pairwise Execution
+------------------------------------------------------------------------------*/
+struct pw_comm_data {
+  uint n;      /* number of messages */
+  uint *p;     /* message source/dest proc */
+  uint *size;  /* size of message */
+  uint total;  /* sum of message sizes */
+  size_t *offsets;
+};
+
+struct pw_data {
+  struct pw_comm_data comm[2];
+  const uint *map[2];
+  comm_req *req;
+  uint buffer_size;
+  uint **submap[2];
+};
+
+static char *pw_exec_recvs(char *buf, const unsigned unit_size,
+                           const struct comm *comm,
+                           const struct pw_comm_data *c, comm_req *req)
+{
+  const uint *p=c->p, *size=c->size;
+  int i;
+  char *retVal = buf;
+
+#ifdef MPITHREADS
+#pragma omp for
+#endif
+  for (i = 0; i < c->n; i++) {
+    comm_irecv(&(req[i]),comm,buf+c->offsets[i]*unit_size,size[i]*unit_size,p[i],p[i]);
+  }
+
+  if (c->n != 0) {
+   retVal += c->offsets[c->n-1]*unit_size + size[c->n-1]*unit_size;
+  }
+
+  return retVal;
+}
+
+static char *pw_exec_sends(char *buf, const unsigned unit_size,
+                           const struct comm *comm,
+                           const struct pw_comm_data *c, comm_req *req)
+{
+  const uint *p=c->p, *size=c->size;
+  int i;
+  char *retVal = buf;
+
+#ifdef MPITHREADS
+#pragma omp for
+#endif
+  for(i = 0; i < c->n; i++) {
+    comm_isend(&(req[i]),comm,buf+c->offsets[i]*unit_size,size[i]*unit_size,p[i],comm->id);
+  }
+
+  if (c->n != 0) {
+   retVal += c->offsets[c->n-1]*unit_size + size[c->n-1]*unit_size;
+  }
+
+  return retVal;
+}
+
+static void pw_exec(
+  void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op,
+  unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
+{
+  const struct pw_data *pwd = execdata;
+  static gs_scatter_fun *const scatter_to_buf[] =
+    { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop };
+  static gs_gather_fun *const gather_from_buf[] =
+    { &gs_gather, &gs_gather_vec, &gs_gather_vec_to_many, &gather_noop };
+  const unsigned recv = 0^transpose, send = 1^transpose;
+  unsigned unit_size = vn*gs_dom_size[dom];
+
+#ifdef MPITHREADS
+  char *sendbuf;
+#else
+  static char *sendbuf;
+#endif
+
+  int thd = 0;
+  int inp = 0;
+  #ifdef _OPENMP
+    thd = omp_get_thread_num();
+    inp = omp_in_parallel();
+  #endif
+
+  if (inp) {
+    /* post receives */
+#ifndef MPITHREADS
+    #pragma omp master
+#endif
+    {
+      sendbuf = pw_exec_recvs(buf,unit_size,comm,&pwd->comm[recv],pwd->req);
+    }
+    #pragma omp barrier
+
+    /* fill send buffer */
+    scatter_to_buf[mode](sendbuf,data,vn,(pwd->submap[send])[thd],dom);
+    #pragma omp barrier
+
+    /* post sends */
+#ifndef MPITHREADS
+    #pragma omp master
+#endif
+    {
+      pw_exec_sends(sendbuf,unit_size,comm,&pwd->comm[send],
+                      &pwd->req[pwd->comm[recv].n]);
+    }
+    #pragma omp barrier
+
+    #pragma omp master 
+    {
+      comm_wait(pwd->req,pwd->comm[0].n+pwd->comm[1].n);
+    }
+    #pragma omp barrier
+
+    /* gather using recv buffer */
+    gather_from_buf[mode](data,buf,vn,(pwd->submap[recv])[thd],dom,op);
+  } else {
+    /* post receives */
+    sendbuf = pw_exec_recvs(buf,unit_size,comm,&pwd->comm[recv],pwd->req);
+    /* fill send buffer */
+    scatter_to_buf[mode](sendbuf,data,vn,pwd->map[send],dom);
+    /* post sends */
+    pw_exec_sends(sendbuf,unit_size,comm,&pwd->comm[send],
+                  &pwd->req[pwd->comm[recv].n]);
+    comm_wait(pwd->req,pwd->comm[0].n+pwd->comm[1].n);
+    /* gather using recv buffer */
+    gather_from_buf[mode](data,buf,vn,pwd->map[recv],dom,op);
+  }
+
+}
+
+/*------------------------------------------------------------------------------
+  Pairwise setup
+------------------------------------------------------------------------------*/
+static void pw_comm_setup(struct pw_comm_data *data, struct array *sh,
+                          const unsigned flags_mask, buffer *buf)
+{
+  uint n=0,count=0, lp=-(uint)1;
+  struct shared_id *s, *se;
+  /* sort by remote processor and id (a globally consistent ordering) */
+  sarray_sort_2(struct shared_id,sh->ptr,sh->n, p,0, id,1, buf);
+  /* assign index into buffer */
+  for(s=sh->ptr,se=s+sh->n;s!=se;++s) {
+    if(s->flags&flags_mask) { s->bi = -(uint)1; continue; }
+    s->bi = count++;
+    if(s->p!=lp) lp=s->p, ++n;
+  }
+  data->n = n;
+  data->p = tmalloc(uint,2*n);
+  data->size = data->p + n;
+  data->total = count;
+  n = 0, lp=-(uint)1;
+  for(s=sh->ptr,se=s+sh->n;s!=se;++s) {
+    if(s->flags&flags_mask) continue;
+    if(s->p!=lp) {
+      lp=s->p;
+      if(n!=0) data->size[n-1] = count;
+      count=0, data->p[n++]=lp;
+    }
+    ++count;
+  }
+  if(n!=0) data->size[n-1] = count;
+
+  data->offsets = malloc(sizeof(size_t)*data->n);
+  int i;
+  size_t len = 0;
+  for (i = 0; i < data->n; i++) {
+    data->offsets[i] = len;
+    len += data->size[i];
+  }
+}
+
+static void pw_comm_free(struct pw_comm_data *data) { free(data->p); free(data->offsets);}
+
+/* assumes that the bi field of sh is set */
+static const uint *pw_map_setup(struct array *sh, buffer *buf)
+{
+  uint count=0, *map, *p;
+  struct shared_id *s, *se;
+  sarray_sort(struct shared_id,sh->ptr,sh->n, i,0, buf);
+  /* calculate map size */
+  count=1;
+  for(s=sh->ptr,se=s+sh->n;s!=se;) {
+    uint i=s->i;
+    if(s->bi==-(uint)1) { ++s; continue; }
+    count+=3;
+    for(++s;s!=se&&s->i==i;++s) if(s->bi!=-(uint)1) ++count;
+  }
+  /* write map */
+  p = map = tmalloc(uint,count);
+  for(s=sh->ptr,se=s+sh->n;s!=se;) {
+    uint i=s->i;
+    if(s->bi==-(uint)1) { ++s; continue; }
+    *p++ = i, *p++ = s->bi;
+    for(++s;s!=se&&s->i==i;++s) if(s->bi!=-(uint)1) *p++ = s->bi;
+    *p++ = -(uint)1;
+  }
+  *p = -(uint)1;
+  return map;
+}
+
+
+static struct pw_data *pw_setup_aux(struct array *sh, buffer *buf)
+{
+  struct pw_data *pwd = tmalloc(struct pw_data,1);
+  
+  /* default behavior: receive only remotely unflagged data */
+  pw_comm_setup(&pwd->comm[0],sh, FLAGS_REMOTE, buf);
+  pwd->map[0] = pw_map_setup(sh, buf);
+  sublist(pwd->map[0], &(pwd->submap[0]));
+
+  /* default behavior: send only locally unflagged data */
+  pw_comm_setup(&pwd->comm[1],sh, FLAGS_LOCAL, buf);
+  pwd->map[1] = pw_map_setup(sh, buf);
+  sublist(pwd->map[1], &(pwd->submap[1]));
+
+  pwd->req = tmalloc(comm_req,pwd->comm[0].n+pwd->comm[1].n);
+  pwd->buffer_size = pwd->comm[0].total + pwd->comm[1].total;
+
+  return pwd;
+}
+
+static void pw_free(struct pw_data *data)
+{
+  pw_comm_free(&data->comm[0]);
+  pw_comm_free(&data->comm[1]);
+  free((uint*)data->map[0]);
+  free((uint*)data->map[1]);
+  free(data->req);
+  free(data);
+
+  free((data->submap[0])[0]);
+  free(data->submap[0]);
+  free((data->submap[1])[0]);
+  free(data->submap[1]);
+}
+
+static void pw_setup(struct gs_remote *r, struct gs_topology *top,
+                     const struct comm *comm, buffer *buf)
+{
+  struct pw_data *pwd = pw_setup_aux(&top->sh,buf);
+  r->buffer_size = pwd->buffer_size;
+  r->data = pwd;
+  r->exec = (exec_fun*)&pw_exec;
+  r->fin = (fin_fun*)&pw_free;
+}
+
+/*------------------------------------------------------------------------------
+  Crystal-Router Execution
+------------------------------------------------------------------------------*/
+struct cr_stage {
+  const uint *scatter_map, *gather_map;
+  uint size_r, size_r1, size_r2;
+  uint size_sk, size_s, size_total;
+  uint p1, p2;
+  unsigned nrecvn;
+};
+
+struct cr_data {
+  struct cr_stage *stage[2];
+  unsigned nstages;
+  uint buffer_size, stage_buffer_size;
+};
+
+static void cr_exec(
+  void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op,
+  unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
+{
+  const struct cr_data *crd = execdata;
+  static gs_scatter_fun *const scatter_user_to_buf[] =
+    { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop };
+  static gs_scatter_fun *const scatter_buf_to_buf[] =
+    { &gs_scatter, &gs_scatter_vec, &gs_scatter_vec, &gs_scatter };
+  static gs_scatter_fun *const scatter_buf_to_user[] =
+    { &gs_scatter, &gs_scatter_vec, &gs_scatter_vec_to_many, &scatter_noop };
+  static gs_gather_fun *const gather_buf_to_user[] =
+    { &gs_gather, &gs_gather_vec, &gs_gather_vec_to_many, &gather_noop };
+  static gs_gather_fun *const gather_buf_to_buf[] =
+    { &gs_gather, &gs_gather_vec, &gs_gather_vec, &gs_gather };
+  const unsigned unit_size = vn*gs_dom_size[dom], nstages=crd->nstages;
+  unsigned k;
+  char *sendbuf, *buf_old, *buf_new;
+  const struct cr_stage *stage = crd->stage[transpose];
+  buf_old = buf;
+  buf_new = buf_old + unit_size*crd->stage_buffer_size;
+  /* crystal router */
+  for(k=0;k<nstages;++k) {
+    comm_req req[3];
+    if(stage[k].nrecvn)
+      comm_irecv(&req[1],comm,buf_new,unit_size*stage[k].size_r1,
+               stage[k].p1, comm->np+k);
+    if(stage[k].nrecvn==2)
+      comm_irecv(&req[2],comm,buf_new+unit_size*stage[k].size_r1,
+               unit_size*stage[k].size_r2, stage[k].p2, comm->np+k);
+    sendbuf = buf_new+unit_size*stage[k].size_r;
+    if(k==0)
+      scatter_user_to_buf[mode](sendbuf,data,vn,stage[0].scatter_map,dom);
+    else
+      scatter_buf_to_buf[mode](sendbuf,buf_old,vn,stage[k].scatter_map,dom),
+      gather_buf_to_buf [mode](sendbuf,buf_old,vn,stage[k].gather_map ,dom,op);
+
+    comm_isend(&req[0],comm,sendbuf,unit_size*stage[k].size_s,
+               stage[k].p1, comm->np+k);
+    comm_wait(&req[0],1+stage[k].nrecvn);
+    { char *t = buf_old; buf_old=buf_new; buf_new=t; }
+  }
+  scatter_buf_to_user[mode](data,buf_old,vn,stage[k].scatter_map,dom);
+  gather_buf_to_user [mode](data,buf_old,vn,stage[k].gather_map ,dom,op);
+}
+
+/*------------------------------------------------------------------------------
+  Crystal-Router setup
+------------------------------------------------------------------------------*/
+static void cr_schedule(struct cr_data *data, const struct comm *comm)
+{
+  const uint id = comm->id;
+  uint bl=0, n=comm->np;
+  unsigned k = 0;
+  while(n>1) {
+    uint nl = (n+1)/2, bh = bl+nl;
+    if(id<bh) n=nl; else n-=nl,bl=bh;
+    ++k;
+  }
+  data->nstages = k;
+  data->stage[0] = tmalloc(struct cr_stage,2*(k+1));
+  data->stage[1] = data->stage[0] + (k+1);
+  bl=0, n=comm->np, k=0;
+  while(n>1) {
+    uint nl = (n+1)/2, bh = bl+nl;
+    uint targ; unsigned recvn;
+    recvn = 1, targ = n-1-(id-bl)+bl;
+    if(id==targ) targ=bh, recvn=0;
+    if(n&1 && id==bh) recvn=2;
+    data->stage[1][k].nrecvn=data->stage[0][k].nrecvn=recvn;
+    data->stage[1][k].p1    =data->stage[0][k].p1    =targ;
+    data->stage[1][k].p2    =data->stage[0][k].p2    =comm->id-1;
+    if(id<bh) n=nl; else n-=nl,bl=bh;
+    ++k;
+  }
+}
+
+struct crl_id {
+  ulong id; uint p, ri, si, bi, send;
+};
+
+/* assumes sh is grouped by i (e.g., sorted by i or by id) */
+static void crl_work_init(struct array *cw, struct array *sh,
+                          const unsigned send_mask, uint this_p)
+{
+  const unsigned recv_mask = send_mask^(FLAGS_REMOTE|FLAGS_LOCAL);
+  uint last_i=-(uint)1; int added_myself;
+  uint cw_n = 0, cw_max = cw->max;
+  struct crl_id *w = cw->ptr;
+  struct shared_id *s, *se;
+
+#define CW_ADD(aid,ap,ari,asi) do { \
+    if(cw_n==cw_max)                                         \
+      array_reserve(struct crl_id,cw,cw_n+1),cw_max=cw->max, \
+      w=(struct crl_id*)cw->ptr+cw_n;                        \
+    w->id=aid, w->p=ap, w->ri=ari, w->si=asi;                \
+    ++w, ++cw_n;                                             \
+  } while(0)
+  
+  for(s=sh->ptr,se=s+sh->n;s!=se;++s) {
+    int send = (s->flags&send_mask)==0;
+    int recv = (s->flags&recv_mask)==0;
+    if(s->i!=last_i) last_i=s->i, added_myself=0;
+    if(!added_myself && recv && (s->flags&FLAGS_LOCAL)==0) {
+      added_myself=1;
+      CW_ADD(s->id,this_p,s->i,s->i);
+    }
+    if(send) CW_ADD(s->id,s->p,s->ri,s->i);
+  }
+  cw->n=cw_n;
+#undef CW_ADD  
+}
+
+static void crl_maps(struct cr_stage *stage, struct array *cw, buffer *buf)
+{
+  struct crl_id *w, *we, *other;
+  uint scount=1, gcount=1, *sp, *gp;
+  sarray_sort_2(struct crl_id,cw->ptr,cw->n, bi,0, si,0, buf);
+  for(w=cw->ptr,we=w+cw->n;w!=we;w=other) {
+    uint bi=w->bi,any=0,si=w->si;
+    scount+=3;
+    for(other=w+1;other!=we&&other->bi==bi;++other)
+      if(other->si!=si) si=other->si, any=2, ++gcount;
+    gcount+=any;
+  }
+  stage->scatter_map = sp = tmalloc(uint,scount+gcount);
+  stage->gather_map  = gp = sp + scount;
+  for(w=cw->ptr,we=w+cw->n;w!=we;w=other) {
+    uint bi=w->bi,any=0,si=w->si;
+    *sp++ = w->si, *sp++ = bi;
+    *gp++ = bi;
+    for(other=w+1;other!=we&&other->bi==bi;++other)
+      if(other->si!=si) si=other->si, any=1, *gp++ = si;
+    if(any) *gp++ = -(uint)1; else --gp;
+    *sp++ = -(uint)1;
+  }
+  *sp=-(uint)1, *gp=-(uint)1;
+}
+
+static uint crl_work_label(struct array *cw, struct cr_stage *stage,
+                           uint cutoff, int send_hi, buffer *buf)
+{
+  struct crl_id *w, *we, *start;
+  uint nsend, nkeep = 0, nks = 0, bi=0;
+  /* here w->send has a reverse meaning */
+  if(send_hi) for(w=cw->ptr,we=w+cw->n;w!=we;++w) w->send = w->p< cutoff;
+         else for(w=cw->ptr,we=w+cw->n;w!=we;++w) w->send = w->p>=cutoff;
+  sarray_sort_2(struct crl_id,cw->ptr,cw->n, id,1, send,0, buf);
+  for(start=cw->ptr,w=start,we=w+cw->n;w!=we;++w) {
+    nkeep += w->send;
+    if(w->id!=start->id) start=w;
+    if(w->send!=start->send) w->send=0,w->bi=1, ++nks; else w->bi=0;
+  }
+  nsend = cw->n-nkeep;
+  /* assign indices; sent ids have priority (hence w->send is reversed) */
+  sarray_sort(struct crl_id,cw->ptr,cw->n, send,0, buf);
+  for(start=cw->ptr,w=start,we=w+nsend+nks;w!=we;++w) {
+    if(w->id!=start->id) start=w, ++bi;
+    if(w->bi!=1) w->send=1;   /* switch back to the usual semantics */
+    w->bi = bi;
+  }
+  stage->size_s = nsend+nks==0 ? 0 : bi+1;
+  for(we=(struct crl_id*)cw->ptr+cw->n;w!=we;++w) {
+    if(w->id!=start->id) start=w, ++bi;
+    w->send = 0;              /* switch back to the usual semantics */
+    w->bi = bi;
+  }
+  stage->size_sk = cw->n==0 ? 0 : bi+1;
+  crl_maps(stage,cw,buf);
+  return nsend;
+}
+
+static void crl_bi_to_si(struct crl_id *w, uint n, uint v) {
+  for(;n;--n) w->si=w->bi+v, ++w;
+}
+
+static void crl_ri_to_bi(struct crl_id *w, uint n) {
+  for(;n;--n) w->bi=w->ri, ++w;
+}
+
+static uint cr_learn(struct array *cw, struct cr_stage *stage,
+                     const struct comm *comm, buffer *buf)
+{
+  comm_req req[3];
+  const uint id = comm->id;
+  uint bl=0, n=comm->np;
+  uint size_max=0;
+  uint tag = comm->np;
+  while(n>1) {
+    uint nl = (n+1)/2, bh = bl+nl;
+    uint nkeep, nsend[2], nrecv[2][2] = {{0,0},{0,0}};
+    struct crl_id *wrecv[2], *wsend;
+    nsend[0] = crl_work_label(cw,stage,bh,id<bh,buf);
+    nsend[1] = stage->size_s;
+    nkeep = cw->n - nsend[0];
+
+    if(stage->nrecvn   ) comm_irecv(&req[1],comm,nrecv[0],2*sizeof(uint),
+                                    stage->p1,tag);
+    if(stage->nrecvn==2) comm_irecv(&req[2],comm,nrecv[1],2*sizeof(uint),
+                                    stage->p2,tag);
+    comm_isend(&req[0],comm,nsend,2*sizeof(uint),stage->p1,tag);
+    comm_wait(req,1+stage->nrecvn),++tag;
+    
+    stage->size_r1 = nrecv[0][1], stage->size_r2 = nrecv[1][1];
+    stage->size_r = stage->size_r1 + stage->size_r2;
+    stage->size_total = stage->size_r + stage->size_sk;
+    if(stage->size_total>size_max) size_max=stage->size_total;
+    
+    array_reserve(struct crl_id,cw,cw->n+nrecv[0][0]+nrecv[1][0]);
+    wrecv[0] = cw->ptr, wrecv[0] += cw->n, wrecv[1] = wrecv[0]+nrecv[0][0];
+    wsend = cw->ptr, wsend += nkeep;
+    if(stage->nrecvn   )
+      comm_irecv(&req[1],comm,wrecv[0],nrecv[0][0]*sizeof(struct crl_id),
+                 stage->p1,tag);
+    if(stage->nrecvn==2)
+      comm_irecv(&req[2],comm,wrecv[1],nrecv[1][0]*sizeof(struct crl_id),
+                 stage->p2,tag);
+    sarray_sort_2(struct crl_id,cw->ptr,cw->n, send,0, bi,0, buf);
+    comm_isend(&req[0],comm,wsend,nsend[0]*sizeof(struct crl_id),stage->p1,tag);
+    comm_wait(req,1+stage->nrecvn),++tag;
+
+    crl_bi_to_si(cw->ptr,nkeep,stage->size_r);
+    if(stage->nrecvn)    crl_bi_to_si(wrecv[0],nrecv[0][0],0);
+    if(stage->nrecvn==2) crl_bi_to_si(wrecv[1],nrecv[1][0],stage->size_r1);
+    memmove(wsend,wrecv[0],(nrecv[0][0]+nrecv[1][0])*sizeof(struct crl_id));
+    cw->n += nrecv[0][0] + nrecv[1][0];
+    cw->n -= nsend[0];
+    
+    if(id<bh) n=nl; else n-=nl,bl=bh;
+    ++stage;
+  }
+  crl_ri_to_bi(cw->ptr,cw->n);
+  crl_maps(stage,cw,buf);
+  return size_max;
+}
+
+static struct cr_data *cr_setup_aux(
+  struct array *sh, const struct comm *comm, buffer *buf)
+{
+  uint size_max[2];
+  struct array cw = null_array;
+  struct cr_data *crd = tmalloc(struct cr_data,1);
+  
+  /* default behavior: receive only remotely unflagged data */
+  /* default behavior: send only locally unflagged data */
+  
+  cr_schedule(crd,comm);
+
+  sarray_sort(struct shared_id,sh->ptr,sh->n, i,0, buf);
+  crl_work_init(&cw,sh, FLAGS_LOCAL , comm->id);
+  size_max[0]=cr_learn(&cw,crd->stage[0],comm,buf);
+  crl_work_init(&cw,sh, FLAGS_REMOTE, comm->id);
+  size_max[1]=cr_learn(&cw,crd->stage[1],comm,buf);
+  
+  crd->stage_buffer_size = size_max[1]>size_max[0]?size_max[1]:size_max[0];
+
+  array_free(&cw);
+  
+  crd->buffer_size = 2*crd->stage_buffer_size;
+  return crd;
+}
+
+static void cr_free_stage_maps(struct cr_stage *stage, unsigned kmax)
+{
+  unsigned k;
+  for(k=0; k<kmax; ++k) {
+    free((uint*)stage->scatter_map);
+    ++stage;
+  }
+  free((uint*)stage->scatter_map);
+}
+
+static void cr_free(struct cr_data *data)
+{
+  cr_free_stage_maps(data->stage[0],data->nstages);
+  cr_free_stage_maps(data->stage[1],data->nstages);
+  free(data->stage[0]);
+  free(data);
+}
+
+static void cr_setup(struct gs_remote *r, struct gs_topology *top,
+                     const struct comm *comm, buffer *buf)
+{
+  struct cr_data *crd = cr_setup_aux(&top->sh,comm,buf);
+  r->buffer_size = crd->buffer_size;
+  r->data = crd;
+  r->exec = (exec_fun*)&cr_exec;
+  r->fin = (fin_fun*)&cr_free;
+}
+
+/*------------------------------------------------------------------------------
+  All-reduce Execution
+------------------------------------------------------------------------------*/
+struct allreduce_data {
+  const uint *map_to_buf[2], *map_from_buf[2];
+  uint buffer_size;
+};
+
+static void allreduce_exec(
+  void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op,
+  unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
+{
+  const struct allreduce_data *ard = execdata;
+  static gs_scatter_fun *const scatter_to_buf[] =
+    { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop };
+  static gs_scatter_fun *const scatter_from_buf[] =
+    { &gs_scatter, &gs_scatter_vec, &gs_scatter_vec_to_many, &scatter_noop };
+  uint gvn = vn*(ard->buffer_size/2);
+  unsigned unit_size = gs_dom_size[dom];
+  char *ardbuf;
+  ardbuf = buf+unit_size*gvn;
+  /* user array -> buffer */
+  gs_init_array(buf,gvn,dom,op);
+  scatter_to_buf[mode](buf,data,vn,ard->map_to_buf[transpose],dom);
+  /* all reduce */
+  comm_allreduce(comm,dom,op, buf,gvn, ardbuf);
+  /* buffer -> user array */
+  scatter_from_buf[mode](data,buf,vn,ard->map_from_buf[transpose],dom);
+}
+
+/*------------------------------------------------------------------------------
+  All-reduce setup
+------------------------------------------------------------------------------*/
+static const uint *allreduce_map_setup(
+  struct array *pr, const unsigned flags_mask, int to_buf)
+{
+  struct primary_shared_id *p, *pe;
+  uint count=1, *map, *m;
+  for(p=pr->ptr,pe=p+pr->n;p!=pe;++p)
+    if((p->flag&flags_mask)==0) count+=3;
+  m=map=tmalloc(uint,count);
+  if(to_buf) {
+    for(p=pr->ptr,pe=p+pr->n;p!=pe;++p)
+      if((p->flag&flags_mask)==0)
+        *m++ = p->i, *m++ = p->ord, *m++ = -(uint)1;
+  } else {
+    for(p=pr->ptr,pe=p+pr->n;p!=pe;++p)
+      if((p->flag&flags_mask)==0)
+        *m++ = p->ord, *m++ = p->i, *m++ = -(uint)1;
+  }
+  *m=-(uint)1;
+  return map;
+}
+
+static struct allreduce_data *allreduce_setup_aux(
+  struct array *pr, ulong total_shared)
+{
+  struct allreduce_data *ard = tmalloc(struct allreduce_data,1);
+  
+  /* default behavior: reduce only unflagged data, copy to all */
+  ard->map_to_buf  [0] = allreduce_map_setup(pr,1,1);
+  ard->map_from_buf[0] = allreduce_map_setup(pr,0,0);
+
+  /* transpose behavior: reduce all data, copy to unflagged */
+  ard->map_to_buf  [1] = allreduce_map_setup(pr,0,1);
+  ard->map_from_buf[1] = allreduce_map_setup(pr,1,0);
+  
+  ard->buffer_size = total_shared*2;
+  return ard;
+}
+
+static void allreduce_free(struct allreduce_data *ard)
+{
+  free((uint*)ard->map_to_buf[0]);
+  free((uint*)ard->map_to_buf[1]);
+  free((uint*)ard->map_from_buf[0]);
+  free((uint*)ard->map_from_buf[1]);
+  free(ard);
+}
+
+static void allreduce_setup(struct gs_remote *r, struct gs_topology *top,
+                            const struct comm *comm, buffer *buf)
+{
+  struct allreduce_data *ard = allreduce_setup_aux(&top->pr,top->total_shared);
+  r->buffer_size = ard->buffer_size;
+  r->data = ard;
+  r->exec = (exec_fun*)&allreduce_exec;
+  r->fin = (fin_fun*)&allreduce_free;
+}
+
+/*------------------------------------------------------------------------------
+  Automatic Setup --- dynamically picks the fastest method
+------------------------------------------------------------------------------*/
+
+static void dry_run_time(double times[3], const struct gs_remote *r,
+                         const struct comm *comm, buffer *buf)
+{
+  int i; double t;
+  buffer_reserve(buf,gs_dom_size[gs_double]*r->buffer_size);
+  for(i= 2;i;--i)
+    r->exec(0,mode_dry_run,1,gs_double,gs_add,0,r->data,comm,buf->ptr);
+  comm_barrier(comm);
+  t = comm_time();
+  for(i=10;i;--i)
+    r->exec(0,mode_dry_run,1,gs_double,gs_add,0,r->data,comm,buf->ptr);
+  t = (comm_time() - t)/10;
+  times[0] = t/comm->np, times[1] = t, times[2] = t;
+  comm_allreduce(comm,gs_double,gs_add, &times[0],1, &t);
+  comm_allreduce(comm,gs_double,gs_min, &times[1],1, &t);
+  comm_allreduce(comm,gs_double,gs_max, &times[2],1, &t);
+}
+
+static void auto_setup(struct gs_remote *r, struct gs_topology *top,
+                       const struct comm *comm, buffer *buf)
+{
+  pw_setup(r, top,comm,buf);
+  
+  if(comm->np>1) {
+    const char *name = "pairwise";
+    struct gs_remote r_alt;
+    double time[2][3];
+
+    #define DRY_RUN(i,gsr,str) do { \
+      if(comm->id==0) printf("   " str ": "); \
+      dry_run_time(time[i],gsr,comm,buf); \
+      if(comm->id==0) \
+        printf("%g %g %g\n",time[i][0],time[i][1],time[i][2]); \
+    } while(0)
+    
+    #define DRY_RUN_CHECK(str,new_name) do { \
+      DRY_RUN(1,&r_alt,str); \
+      if(time[1][2]<time[0][2]) \
+        time[0][2]=time[1][2], name=new_name, \
+        r->fin(r->data), *r = r_alt; \
+      else \
+        r_alt.fin(r_alt.data); \
+    } while(0)
+
+    DRY_RUN(0, r, "pairwise times (avg, min, max)");
+
+    cr_setup(&r_alt, top,comm,buf);
+    DRY_RUN_CHECK(      "crystal router                ", "crystal router");
+    
+    if(top->total_shared<100000) {
+      allreduce_setup(&r_alt, top,comm,buf);
+      DRY_RUN_CHECK(    "all reduce                    ", "allreduce");
+    }
+
+    #undef DRY_RUN_CHECK
+    #undef DRY_RUN
+
+    if(comm->id==0) printf("   used all_to_all method: %s\n",name);
+  }
+}
+
+/*------------------------------------------------------------------------------
+  Main Execution
+------------------------------------------------------------------------------*/
+struct gs_data {
+  struct comm comm;
+  const uint *map_local[2]; /* 0=unflagged, 1=all */
+  const uint *flagged_primaries;
+  struct gs_remote r;
+  uint **submap_local[2]; /* 0=unflagged, 1=all */
+  uint **subflagged_primaries;
+};
+
+
+static void gs_aux(
+  void *u, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose,
+  struct gs_data *gsh, buffer *buf)
+{
+  static gs_scatter_fun *const local_scatter[] =
+    { &gs_scatter, &gs_scatter_vec, &gs_scatter_many, &scatter_noop };
+  static gs_gather_fun  *const local_gather [] =
+    { &gs_gather,  &gs_gather_vec,  &gs_gather_many, &gather_noop  };
+  static gs_init_fun *const init[] =
+    { &gs_init, &gs_init_vec, &gs_init_many, &init_noop };
+
+
+  int thd = 0;
+  int inp = 0;
+  #ifdef _OPENMP
+    thd = omp_get_thread_num();
+    inp = omp_in_parallel();
+  #endif
+
+  if(!buf) buf = &static_buffer;
+
+  #pragma omp single
+  {
+    buffer_reserve(buf,vn*gs_dom_size[dom]*gsh->r.buffer_size);
+  }
+
+  if (inp) {
+    local_gather [mode](u,u,vn,(gsh->submap_local[0^transpose])[thd],dom,op);
+    #pragma omp barrier
+
+    if(transpose==0) init[mode](u,vn,(gsh->subflagged_primaries)[thd],dom,op);
+    #pragma omp barrier
+
+    gsh->r.exec(u,mode,vn,dom,op,transpose,gsh->r.data,&gsh->comm,buf->ptr);
+    #pragma omp barrier
+
+    local_scatter[mode](u,u,vn,(gsh->submap_local[1^transpose])[thd],dom);
+    #pragma omp barrier
+
+  } else { 
+    local_gather [mode](u,u,vn,gsh->map_local[0^transpose],dom,op);
+    if(transpose==0) init[mode](u,vn,gsh->flagged_primaries,dom,op);
+    gsh->r.exec(u,mode,vn,dom,op,transpose,gsh->r.data,&gsh->comm,buf->ptr);
+    local_scatter[mode](u,u,vn,gsh->map_local[1^transpose],dom);
+  }
+
+}
+
+void gs(void *u, gs_dom dom, gs_op op, unsigned transpose,
+        struct gs_data *gsh, buffer *buf)
+{
+  gs_aux(u,mode_plain,1,dom,op,transpose,gsh,buf);
+}
+
+void gs_vec(void *u, unsigned vn, gs_dom dom, gs_op op,
+            unsigned transpose, struct gs_data *gsh, buffer *buf)
+{
+  gs_aux(u,mode_vec,vn,dom,op,transpose,gsh,buf);
+}
+
+void gs_many(void *const*u, unsigned vn, gs_dom dom, gs_op op,
+             unsigned transpose, struct gs_data *gsh, buffer *buf)
+{
+  gs_aux((void*)u,mode_many,vn,dom,op,transpose,gsh,buf);
+}
+
+/*------------------------------------------------------------------------------
+  Main Setup
+------------------------------------------------------------------------------*/
+typedef enum { gs_pairwise, gs_crystal_router, gs_all_reduce,
+               gs_auto } gs_method;
+
+
+
+static void local_setup(struct gs_data *gsh, const struct array *nz)
+{
+  gsh->map_local[0] = local_map(nz,1);
+  gsh->map_local[1] = local_map(nz,0);
+  gsh->flagged_primaries = flagged_primaries_map(nz);
+  sublist(gsh->map_local[0], &(gsh->submap_local[0]));
+  sublist(gsh->map_local[1], &(gsh->submap_local[1]));
+  subflagged(gsh->flagged_primaries, &(gsh->subflagged_primaries));
+}
+
+static void gs_setup_aux(struct gs_data *gsh, const slong *id, uint n,
+                         int unique, gs_method method, int verbose)
+{
+  static setup_fun *const remote_setup[] =
+    { &pw_setup, &cr_setup, &allreduce_setup, &auto_setup };
+
+  struct gs_topology top;
+  struct crystal cr;
+  
+  crystal_init(&cr,&gsh->comm);
+
+  get_topology(&top, id,n, &cr);
+  if(unique) make_topology_unique(&top,0,gsh->comm.id,&cr.data);
+
+  local_setup(gsh,&top.nz);
+
+  if(verbose && gsh->comm.id==0)
+    printf("gs_setup: %ld unique labels shared\n",(long)top.total_shared);
+
+  remote_setup[method](&gsh->r, &top,&gsh->comm,&cr.data);
+
+  gs_topology_free(&top);
+  crystal_free(&cr);
+}
+
+struct gs_data *gs_setup(const slong *id, uint n, const struct comm *comm,
+                         int unique, gs_method method, int verbose)
+{
+  struct gs_data *gsh = tmalloc(struct gs_data,1);
+  comm_dup(&gsh->comm,comm);
+  gs_setup_aux(gsh,id,n,unique,method,verbose);
+  return gsh;
+}
+
+void gs_free(struct gs_data *gsh)
+{
+  comm_free(&gsh->comm);
+  free((uint*)gsh->map_local[0]), free((uint*)gsh->map_local[1]);
+  free((uint*)gsh->flagged_primaries);
+  gsh->r.fin(gsh->r.data);
+  free((gsh->submap_local[0])[0]);
+  free(gsh->submap_local[0]);
+  free((gsh->submap_local[1])[0]);
+  free(gsh->submap_local[1]);
+  free((gsh->subflagged_primaries)[0]);
+  free(gsh->subflagged_primaries);
+  free(gsh);
+}
+
+void gs_unique(slong *id, uint n, const struct comm *comm)
+{
+  struct gs_topology top;
+  struct crystal cr;
+  crystal_init(&cr,comm);
+  get_topology(&top, id,n, &cr);
+  make_topology_unique(&top,id,comm->id,&cr.data);
+  gs_topology_free(&top);
+  crystal_free(&cr);
+}
+
+/*------------------------------------------------------------------------------
+  FORTRAN interface
+------------------------------------------------------------------------------*/
+
+#undef gs_op
+
+#undef gs_free
+#undef gs_setup
+#undef gs_many
+#undef gs_vec
+#undef gs
+#define cgs       PREFIXED_NAME(gs      )
+#define cgs_vec   PREFIXED_NAME(gs_vec  )
+#define cgs_many  PREFIXED_NAME(gs_many )
+#define cgs_setup PREFIXED_NAME(gs_setup)
+#define cgs_free  PREFIXED_NAME(gs_free )
+
+#define fgs_setup  FORTRAN_NAME(gs_setup    ,GS_SETUP    )
+#define fgs        FORTRAN_NAME(gs_op       ,GS_OP       )
+#define fgs_vec    FORTRAN_NAME(gs_op_vec   ,GS_OP_VEC   )
+#define fgs_many   FORTRAN_NAME(gs_op_many  ,GS_OP_MANY  )
+#define fgs_fields FORTRAN_NAME(gs_op_fields,GS_OP_FIELDS)
+#define fgs_free   FORTRAN_NAME(gs_free     ,GS_FREE     )
+
+static struct gs_data **fgs_info = 0;
+static int fgs_max = 0;
+static int fgs_n = 0;
+
+void fgs_setup(sint *handle, const slong id[], const sint *n,
+               const MPI_Fint *comm, const sint *np)
+{
+  struct gs_data *gsh;
+  if(fgs_n==fgs_max) fgs_max+=fgs_max/2+1,
+                     fgs_info=trealloc(struct gs_data*,fgs_info,fgs_max);
+  gsh=fgs_info[fgs_n]=tmalloc(struct gs_data,1);
+  comm_init_check(&gsh->comm,*comm,*np);
+  gs_setup_aux(gsh,id,*n,0,gs_pairwise,1);
+  *handle = fgs_n++;
+}
+
+static void fgs_check_handle(sint handle, const char *func, unsigned line)
+{
+  if(handle<0 || handle>=fgs_n || !fgs_info[handle])
+    fail(1,__FILE__,line,"%s: invalid handle", func);
+}
+
+static void fgs_check_parms(sint handle, sint dom, sint op,
+                            const char *func, unsigned line)
+{
+  if(dom<1 || dom>4)
+    fail(1,__FILE__,line,"%s: datatype %d not in valid range 1-4",func,dom);
+  if(op <1 || op >4)
+    fail(1,__FILE__,line,"%s: op %d not in valid range 1-4",func,op);
+  fgs_check_handle(handle,func,line);
+}
+
+void fgs(const sint *handle, void *u, const sint *dom, const sint *op,
+         const sint *transpose)
+{
+  fgs_check_parms(*handle,*dom,*op,"gs_op",__LINE__);
+  cgs(u,(gs_dom)(*dom-1),(gs_op_t)(*op-1),*transpose!=0,fgs_info[*handle],0);
+}
+
+void fgs_vec(const sint *handle, void *u, const sint *n,
+             const sint *dom, const sint *op, const sint *transpose)
+{
+  fgs_check_parms(*handle,*dom,*op,"gs_op_vec",__LINE__);
+  cgs_vec(u,*n,(gs_dom)(*dom-1),(gs_op_t)(*op-1),*transpose!=0,
+          fgs_info[*handle],0);
+}
+
+void fgs_many(const sint *handle, void *u1, void *u2, void *u3,
+              void *u4, void *u5, void *u6, const sint *n,
+              const sint *dom, const sint *op, const sint *transpose)
+{
+  void *uu[6];
+  uu[0]=u1,uu[1]=u2,uu[2]=u3,uu[3]=u4,uu[4]=u5,uu[5]=u6;
+  fgs_check_parms(*handle,*dom,*op,"gs_op_many",__LINE__);
+  cgs_many((void *const*)uu,*n,(gs_dom)(*dom-1),(gs_op_t)(*op-1),*transpose!=0,
+           fgs_info[*handle],0);
+}
+
+static struct array fgs_fields_array = null_array;
+
+void fgs_fields(const sint *handle,
+                void *u, const sint *stride, const sint *n,
+                const sint *dom, const sint *op, const sint *transpose)
+{
+  size_t offset;
+  void **p;
+  uint i;
+  
+  fgs_check_parms(*handle,*dom,*op,"gs_op_fields",__LINE__);
+  if(*n<0) return;
+
+  array_reserve(void*,&fgs_fields_array,*n);
+  p = fgs_fields_array.ptr;
+  offset = *stride * gs_dom_size[*dom-1];
+  for(i=*n;i;--i) *p++ = u, u = (char*)u + offset;
+
+  cgs_many((void *const*)fgs_fields_array.ptr,*n,
+           (gs_dom)(*dom-1),(gs_op_t)(*op-1),
+           *transpose!=0, fgs_info[*handle],0);
+}
+
+void fgs_free(const sint *handle)
+{
+  fgs_check_handle(*handle,"gs_free",__LINE__);
+  cgs_free(fgs_info[*handle]);
+  fgs_info[*handle] = 0;
+}
+
diff --git a/src/jl/gs.h b/src/jl/gs.h
new file mode 100644
index 0000000..43fc142
--- /dev/null
+++ b/src/jl/gs.h
@@ -0,0 +1,141 @@
+#ifndef GS_H
+#define GS_H
+
+#if !defined(COMM_H) || !defined(GS_DEFS_H) || !defined(MEM_H)
+#warning "gs.h" requires "comm.h", "gs_defs.h", and "mem.h"
+#endif
+
+/*
+  Gather/Scatter Library
+
+  The code
+  
+    struct comm c;  // see "comm.h"
+    slong id[n];    // the slong type is defined in "types.h"
+    ...
+    struct gs_data *g = gs_setup(id,n, &c, 0,gs_auto,1);
+    
+  defines a partition of the set of (processor, local index) pairs,
+    (p,i) \in S_j  iff   abs(id[i]) == j  on processor p
+  That is, all (p,i) pairs are grouped together (in group S_j) that have the
+    same id (=j).
+  S_0 is treated specially --- it is ignored completely
+    (i.e., when id[i] == 0, local index i does not participate in any
+    gather/scatter operation
+  If id[i] on proc p is negative then the pair (p,i) is "flagged". This
+  determines the non-symmetric behavior. For the simpler, symmetric case,
+  all id's should be positive.
+  
+  The second to last argument to gs_setup is the method to use, one of
+    gs_pairwise, gs_crystal_router, gs_all_reduce, gs_auto
+  The method "gs_auto" tries ~10 runs of each and chooses the fastest.
+  For a single-use handle, it makes more sense to use "gs_crystal_router".
+  
+  When "g" is no longer needed, free it with
+  
+    gs_free(g);
+  
+  A basic gather/scatter operation is, e.g.,
+  
+    double v[n]; buffer buf;  // see "mem.h" for "buffer"
+    ...
+    gs(v, gs_double,gs_add, 0, g,&buf);
+    
+  The buffer pointer can be null, in which case, a static buffer is used,
+  shared across all gs handles.
+  This gs call has the effect, (in the simple, symmetric, unflagged case)
+  
+    v[i] <--  \sum_{ (p,j) \in S_{id[i]} } v_(p) [j]
+    
+  where v_(p) [j] means v[j] on proc p. In other words, every v[i] is replaced
+  by the sum of all v[j]'s with the same id, given by id[i]. This accomplishes
+  "direct stiffness summation" corresponding to the action of QQ^T, where
+  "Q" is a boolean matrix that copies from a global vector (indexed by id)
+  to the local vectors indexed by (p,i) pairs.
+  
+  Summation on doubles is not the only operation and datatype supported. The
+  full list is defined in "gs_defs.h", and includes the operations
+    gs_add, gs_mul, gs_max, gs_min
+  and datatypes
+    gs_double, gs_float, gs_int, gs_long, gs_sint, gs_slong.
+  (The int and long types are the plain C types, whereas sint and slong
+   are defined in "types.h").
+   
+  For the nonsymmetric behavior, the "transpose" parameter is important:
+  
+    gs(v, gs_double,gs_add, transpose, g,&buf);
+    
+  When transpose == 0, any "flagged" (p,i) pairs (id[i] negative on p)
+  do not participate in the sum, but *do* still receive the sum on output.
+  As a special case, when only one (p,i) pair is unflagged per group this
+  corresponds to the rectangular "Q" matrix referred to above.
+  
+  When transpose == 1, the "flagged" (p,i) pairs *do* participate in the sum,
+  but do *not* get set on output. In the special case of only one unflagged
+  (p,i) pair, this corresponds to the transpose of "Q" referred to above.
+
+
+
+  A version for vectors (contiguously packed) is, e.g.,
+  
+    double v[n][k];
+    gs_vec(v,k, gs_double,gs_add, transpose, g,&buf);
+  
+  which is like "gs" operating on the datatype double[k],
+  with summation here being vector summation. Number of messages sent
+  is independent of k.
+  
+  For combining the communication for "gs" on multiple arrays:
+  
+    double v1[n], v2[n], ..., vk[n];
+    double (*vs)[k] = {v1, v2, ..., vk};
+    
+    gs_many(vs,k, gs_double,op, t, g,&buf);
+  
+  This call is equivalent to
+  
+    gs(v1, gs_double,op, t, g, &buf);
+    gs(v2, gs_double,op, t, g, &buf);
+    ...
+    gs(vk, gs_double,op, t, g, &buf);
+    
+  except that all communication is done together.
+  
+
+
+  Finally, gs_unique has the same basic signature as gs_setup:
+  
+    gs_unique(id,n, &c);
+    
+  This call modifies id, "flagging" (by negating id[i]) all (p,i) pairs in
+  each group except one. The sole "unflagged" member of the group is chosen
+  in an arbitrary but consistent way. If the "unique" flag is set when
+  calling gs_setup, the behavior is equivalent to first calling gs_unique,
+  except that the id array is left unmodified.
+  
+
+*/  
+
+#define gs         PREFIXED_NAME(gs       )
+#define gs_vec     PREFIXED_NAME(gs_vec   )
+#define gs_many    PREFIXED_NAME(gs_many  )
+#define gs_setup   PREFIXED_NAME(gs_setup )
+#define gs_free    PREFIXED_NAME(gs_free  )
+#define gs_unique  PREFIXED_NAME(gs_unique)
+
+struct gs_data;
+typedef enum { gs_pairwise, gs_crystal_router, gs_all_reduce,
+               gs_auto } gs_method;
+
+void gs(void *u, gs_dom dom, gs_op op, unsigned transpose,
+        struct gs_data *gsh, buffer *buf);
+void gs_vec(void *u, unsigned vn, gs_dom dom, gs_op op,
+            unsigned transpose, struct gs_data *gsh, buffer *buf);
+void gs_many(void *const*u, unsigned vn, gs_dom dom, gs_op op,
+             unsigned transpose, struct gs_data *gsh, buffer *buf);
+struct gs_data *gs_setup(const slong *id, uint n, const struct comm *comm,
+                         int unique, gs_method method, int verbose);
+void gs_free(struct gs_data *gsh);
+void gs_unique(slong *id, uint n, const struct comm *comm);
+
+#endif
diff --git a/src/jl/gs_defs.h b/src/jl/gs_defs.h
new file mode 100644
index 0000000..df4ad7b
--- /dev/null
+++ b/src/jl/gs_defs.h
@@ -0,0 +1,81 @@
+#ifndef GS_DEFS_H
+#define GS_DEFS_H
+
+/* requires:
+     <limits.h>, <float.h>   for GS_DEFINE_IDENTITIES()
+     "types.h"               for gs_sint, gs_slong
+*/
+   
+/*------------------------------------------------------------------------------
+  Monoid Definitions
+  
+  Here are defined the domains and operations, each combination being a
+  commutative semigroup, as well as the identity element making each a 
+  commutative monoid.
+------------------------------------------------------------------------------*/
+
+/* the supported domains */
+#define GS_FOR_EACH_DOMAIN(macro) \
+  macro(double) \
+  macro(float ) \
+  macro(int   ) \
+  macro(long  ) \
+  WHEN_LONG_LONG(macro(long_long))
+  
+/* the supported ops */
+#define GS_FOR_EACH_OP(T,macro) \
+  macro(T,add) \
+  macro(T,mul) \
+  macro(T,min) \
+  macro(T,max) \
+  macro(T,bpr)
+
+#define GS_DO_add(a,b) a+=b
+#define GS_DO_mul(a,b) a*=b
+#define GS_DO_min(a,b) if(b<a) a=b
+#define GS_DO_max(a,b) if(b>a) a=b
+#define GS_DO_bpr(a,b) \
+  do if(b!=0) { uint a_ = a; uint b_ = b; \
+       if(a_==0) { a=b_; break; } \
+       for(;;) { if(a_<b_) b_>>=1; else if(b_<a_) a_>>=1; else break; } \
+       a = a_; \
+     } while(0)
+
+/* the monoid identity elements */
+#define GS_DEFINE_MONOID_ID(T,min,max) \
+  static const T gs_identity_##T[] = { 0, 1, max, min, 0 };
+#define GS_DEFINE_IDENTITIES() \
+  GS_DEFINE_MONOID_ID(double, -DBL_MAX,  DBL_MAX) \
+  GS_DEFINE_MONOID_ID(float , -FLT_MAX,  FLT_MAX) \
+  GS_DEFINE_MONOID_ID(int   ,  INT_MIN,  INT_MAX) \
+  GS_DEFINE_MONOID_ID(long  , LONG_MIN, LONG_MAX) \
+  WHEN_LONG_LONG(GS_DEFINE_MONOID_ID(long_long,LLONG_MIN,LLONG_MAX))
+
+/*------------------------------------------------------------------------------
+  Enums and constants
+------------------------------------------------------------------------------*/
+
+/* domain enum */
+#define LIST GS_FOR_EACH_DOMAIN(ITEM) gs_dom_n
+#define ITEM(T) gs_##T,
+typedef enum { LIST } gs_dom;
+#undef ITEM
+#undef LIST
+
+#define gs_sint   TYPE_LOCAL(gs_int,gs_long,gs_long_long)
+#define gs_slong TYPE_GLOBAL(gs_int,gs_long,gs_long_long)
+
+/* domain type size array */
+#define GS_DOM_SIZE_ITEM(T) sizeof(T),
+#define GS_DEFINE_DOM_SIZES() \
+  static const unsigned gs_dom_size[] = \
+    { GS_FOR_EACH_DOMAIN(GS_DOM_SIZE_ITEM) 0 };
+
+/* operation enum */
+#define LIST GS_FOR_EACH_OP(T,ITEM) gs_op_n
+#define ITEM(T,op) gs_##op,
+typedef enum { LIST } gs_op;
+#undef ITEM
+#undef LIST
+
+#endif
diff --git a/src/jl/gs_local.c b/src/jl/gs_local.c
new file mode 100644
index 0000000..2bc246d
--- /dev/null
+++ b/src/jl/gs_local.c
@@ -0,0 +1,336 @@
+#include <string.h>
+#include <limits.h>
+#include <float.h>
+#include "c99.h"
+#include "name.h"
+#include "types.h"
+
+#define gs_gather_array        PREFIXED_NAME(gs_gather_array       )
+#define gs_init_array          PREFIXED_NAME(gs_init_array         )
+#define gs_gather              PREFIXED_NAME(gs_gather             )
+#define gs_scatter             PREFIXED_NAME(gs_scatter            )
+#define gs_init                PREFIXED_NAME(gs_init               )
+#define gs_gather_vec          PREFIXED_NAME(gs_gather_vec         )
+#define gs_scatter_vec         PREFIXED_NAME(gs_scatter_vec        )
+#define gs_init_vec            PREFIXED_NAME(gs_init_vec           )
+#define gs_gather_many         PREFIXED_NAME(gs_gather_many        )
+#define gs_scatter_many        PREFIXED_NAME(gs_scatter_many       )
+#define gs_init_many           PREFIXED_NAME(gs_init_many          )
+#define gs_gather_vec_to_many  PREFIXED_NAME(gs_gather_vec_to_many )
+#define gs_scatter_many_to_vec PREFIXED_NAME(gs_scatter_many_to_vec)
+#define gs_scatter_vec_to_many PREFIXED_NAME(gs_scatter_vec_to_many)
+
+#include "gs_defs.h"
+GS_DEFINE_IDENTITIES()
+GS_DEFINE_DOM_SIZES()
+
+/*------------------------------------------------------------------------------
+  The array gather kernel
+------------------------------------------------------------------------------*/
+#define DEFINE_GATHER(T,OP) \
+static void gather_array_##T##_##OP( \
+  T *restrict out, const T *restrict in, uint n) \
+{                                                                \
+  for(;n;--n) { T q = *in++, *p = out++; GS_DO_##OP(*p,q); }      \
+}
+
+/*------------------------------------------------------------------------------
+  The array initialization kernel
+------------------------------------------------------------------------------*/
+#define DEFINE_INIT(T) \
+static void init_array_##T(T *restrict out, uint n, gs_op op) \
+{                                                             \
+  const T e = gs_identity_##T[op];                            \
+  for(;n;--n) *out++=e;                                       \
+}
+
+#define DEFINE_PROCS(T) \
+  GS_FOR_EACH_OP(T,DEFINE_GATHER) \
+  DEFINE_INIT(T)
+
+GS_FOR_EACH_DOMAIN(DEFINE_PROCS)
+
+#undef DEFINE_PROCS
+#undef DEFINE_INIT
+#undef DEFINE_GATHER
+
+/*------------------------------------------------------------------------------
+  The basic gather kernel
+------------------------------------------------------------------------------*/
+#define DEFINE_GATHER(T,OP) \
+static void gather_##T##_##OP( \
+  T *restrict out, const T *restrict in, const unsigned in_stride,           \
+  const uint *restrict map)                                                  \
+{                                                                            \
+  uint i,j;                                                                  \
+  while((i=*map++)!=-(uint)1) {                                              \
+    T t=out[i];                                                              \
+    j=*map++; do GS_DO_##OP(t,in[j*in_stride]); while((j=*map++)!=-(uint)1); \
+    out[i]=t;                                                                \
+  }                                                                          \
+}
+
+/*------------------------------------------------------------------------------
+  The basic scatter kernel
+------------------------------------------------------------------------------*/
+#define DEFINE_SCATTER(T) \
+static void scatter_##T( \
+  T *restrict out, const unsigned out_stride,                      \
+  const T *restrict in, const unsigned in_stride,                  \
+  const uint *restrict map)                                        \
+{                                                                  \
+  uint i,j;                                                        \
+  while((i=*map++)!=-(uint)1) {                                    \
+    T t=in[i*in_stride];                                           \
+    j=*map++; do out[j*out_stride]=t; while((j=*map++)!=-(uint)1); \
+  }                                                                \
+}
+
+/*------------------------------------------------------------------------------
+  The basic initialization kernel
+------------------------------------------------------------------------------*/
+#define DEFINE_INIT(T) \
+static void init_##T(T *restrict out, const uint *restrict map, gs_op op) \
+{                                                       \
+  uint i; const T e = gs_identity_##T[op];              \
+  while((i=*map++)!=-(uint)1) out[i]=e;                 \
+}
+
+#define DEFINE_PROCS(T) \
+  GS_FOR_EACH_OP(T,DEFINE_GATHER) \
+  DEFINE_SCATTER(T) \
+  DEFINE_INIT(T)
+
+GS_FOR_EACH_DOMAIN(DEFINE_PROCS)
+
+#undef DEFINE_PROCS
+#undef DEFINE_INIT
+#undef DEFINE_SCATTER
+#undef DEFINE_GATHER
+
+/*------------------------------------------------------------------------------
+  The vector gather kernel
+------------------------------------------------------------------------------*/
+#define DEFINE_GATHER(T,OP) \
+static void gather_vec_##T##_##OP( \
+  T *restrict out, const T *restrict in, const unsigned vn,                  \
+  const uint *restrict map)                                                  \
+{                                                                            \
+  uint i,j;                                                                  \
+  while((i=*map++)!=-(uint)1) {                                              \
+    T *restrict p = &out[i*vn], *pe = p+vn;                                  \
+    j=*map++; do {                                                           \
+      const T *restrict q = &in[j*vn];                                       \
+      T *restrict pk=p; do { GS_DO_##OP(*pk,*q); ++pk, ++q; } while(pk!=pe); \
+    } while((j=*map++)!=-(uint)1);                                           \
+  }                                                                          \
+}
+
+/*------------------------------------------------------------------------------
+  The vector scatter kernel
+------------------------------------------------------------------------------*/
+void gs_scatter_vec(
+  void *restrict out, const void *restrict in, const unsigned vn,
+  const uint *restrict map, gs_dom dom)
+{
+  unsigned unit_size = vn*gs_dom_size[dom];
+  uint i,j;
+  while((i=*map++)!=-(uint)1) {
+    const char *t = (const char *)in + i*unit_size;
+    j=*map++; do
+      memcpy((char *)out+j*unit_size,t,unit_size);
+    while((j=*map++)!=-(uint)1);
+  }
+}
+
+/*------------------------------------------------------------------------------
+  The vector initialization kernel
+------------------------------------------------------------------------------*/
+#define DEFINE_INIT(T) \
+static void init_vec_##T(T *restrict out, const unsigned vn, \
+                         const uint *restrict map, gs_op op) \
+{                                                            \
+  uint i; const T e = gs_identity_##T[op];                   \
+  while((i=*map++)!=-(uint)1) {                              \
+    T *restrict u = (T*)out + vn*i, *ue = u+vn;              \
+    do *u++ = e; while(u!=ue);                               \
+  }                                                          \
+}
+
+#define DEFINE_PROCS(T) \
+  GS_FOR_EACH_OP(T,DEFINE_GATHER) \
+  DEFINE_INIT(T)
+
+GS_FOR_EACH_DOMAIN(DEFINE_PROCS)
+
+#undef DEFINE_PROCS
+#undef DEFINE_INIT
+#undef DEFINE_GATHER
+
+#undef DO_bpr
+#undef DO_max
+#undef DO_min
+#undef DO_mul
+#undef DO_add
+
+#define SWITCH_DOMAIN_CASE(T) case gs_##T: WITH_DOMAIN(T); break;
+#define SWITCH_DOMAIN(dom) do switch(dom) { \
+    GS_FOR_EACH_DOMAIN(SWITCH_DOMAIN_CASE) case gs_dom_n: break; } while(0)
+
+#define SWITCH_OP_CASE(T,OP) case gs_##OP: WITH_OP(T,OP); break;
+#define SWITCH_OP(T,op) do switch(op) { \
+    GS_FOR_EACH_OP(T,SWITCH_OP_CASE) case gs_op_n: break; } while(0)
+
+/*------------------------------------------------------------------------------
+  Array kernels
+------------------------------------------------------------------------------*/
+void gs_gather_array(void *out, const void *in, uint n, gs_dom dom, gs_op op)
+{
+#define WITH_OP(T,OP) gather_array_##T##_##OP(out,in,n)
+#define WITH_DOMAIN(T) SWITCH_OP(T,op)
+  SWITCH_DOMAIN(dom);
+#undef  WITH_DOMAIN
+#undef  WITH_OP
+}
+
+void gs_init_array(void *out, uint n, gs_dom dom, gs_op op)
+{
+#define WITH_DOMAIN(T) init_array_##T(out,n,op)
+  SWITCH_DOMAIN(dom);
+#undef  WITH_DOMAIN
+}
+
+/*------------------------------------------------------------------------------
+  Plain kernels; vn parameter ignored but present for consistent signatures
+------------------------------------------------------------------------------*/
+void gs_gather(void *out, const void *in, const unsigned vn,
+               const uint *map, gs_dom dom, gs_op op)
+{
+#define WITH_OP(T,OP) gather_##T##_##OP(out,in,1,map)
+#define WITH_DOMAIN(T) SWITCH_OP(T,op)
+  SWITCH_DOMAIN(dom);
+#undef  WITH_DOMAIN
+#undef  WITH_OP
+}
+
+void gs_scatter(void *out, const void *in, const unsigned vn,
+                const uint *map, gs_dom dom)
+{
+#define WITH_DOMAIN(T) scatter_##T(out,1,in,1,map)
+  SWITCH_DOMAIN(dom);
+#undef  WITH_DOMAIN
+}
+
+void gs_init(void *out, const unsigned vn, const uint *map,
+             gs_dom dom, gs_op op)
+{
+#define WITH_DOMAIN(T) init_##T(out,map,op)
+  SWITCH_DOMAIN(dom);
+#undef  WITH_DOMAIN
+}
+
+/*------------------------------------------------------------------------------
+  Vector kernels
+------------------------------------------------------------------------------*/
+void gs_gather_vec(void *out, const void *in, const unsigned vn,
+                   const uint *map, gs_dom dom, gs_op op)
+{
+#define WITH_OP(T,OP) gather_vec_##T##_##OP(out,in,vn,map)
+#define WITH_DOMAIN(T) SWITCH_OP(T,op)
+  SWITCH_DOMAIN(dom);
+#undef  WITH_DOMAIN
+#undef  WITH_OP
+}
+
+void gs_init_vec(void *out, const unsigned vn, const uint *map,
+                 gs_dom dom, gs_op op)
+{
+#define WITH_DOMAIN(T) init_vec_##T(out,vn,map,op)
+  SWITCH_DOMAIN(dom);
+#undef  WITH_DOMAIN
+}
+
+/*------------------------------------------------------------------------------
+  Multiple array kernels
+------------------------------------------------------------------------------*/
+void gs_gather_many(void *out, const void *in, const unsigned vn,
+                    const uint *map, gs_dom dom, gs_op op)
+{
+  uint k;
+  typedef void *ptr_to_void; typedef const void *ptr_to_const_void;
+  const ptr_to_void *p = out; const ptr_to_const_void *q = in;
+#define WITH_OP(T,OP) for(k=0;k<vn;++k) gather_##T##_##OP(p[k],q[k],1,map)
+#define WITH_DOMAIN(T) SWITCH_OP(T,op)
+  SWITCH_DOMAIN(dom);
+#undef  WITH_DOMAIN
+#undef  WITH_OP
+}
+
+void gs_scatter_many(void *out, const void *in, const unsigned vn,
+                     const uint *map, gs_dom dom)
+{
+  uint k;
+  typedef void *ptr_to_void; typedef const void *ptr_to_const_void;
+  const ptr_to_void *p = out; const ptr_to_const_void *q = in;
+#define WITH_DOMAIN(T) for(k=0;k<vn;++k) scatter_##T(p[k],1,q[k],1,map)
+  SWITCH_DOMAIN(dom);
+#undef  WITH_DOMAIN
+}
+
+void gs_init_many(void *out, const unsigned vn, const uint *map,
+                  gs_dom dom, gs_op op)
+{
+  uint k;
+  typedef void *ptr_to_void; const ptr_to_void *p = out;
+#define WITH_DOMAIN(T) for(k=0;k<vn;++k) init_##T(p[k],map,op)
+  SWITCH_DOMAIN(dom);
+#undef  WITH_DOMAIN
+}
+
+/*------------------------------------------------------------------------------
+  Gather from strided array -> multiple arrays
+  Scatter from multiple arrays -> strided array,
+  Scatter from strided array -> multiple arrays,
+------------------------------------------------------------------------------*/
+void gs_gather_vec_to_many(void *out, const void *in, const unsigned vn,
+                           const uint *map, gs_dom dom, gs_op op)
+{
+  unsigned i; const unsigned unit_size = gs_dom_size[dom];
+  typedef void *ptr_to_void;
+  const ptr_to_void *p = out; const char *q = in;
+#define WITH_OP(T,OP) \
+  for(i=vn;i;--i) gather_##T##_##OP(*p++,(const T*)q,vn,map), q+=unit_size
+#define WITH_DOMAIN(T) SWITCH_OP(T,op)
+  SWITCH_DOMAIN(dom);
+#undef  WITH_DOMAIN
+#undef  WITH_OP
+}
+
+void gs_scatter_many_to_vec(void *out, const void *in, const unsigned vn,
+                            const uint *map, gs_dom dom)
+{
+  unsigned i; const unsigned unit_size = gs_dom_size[dom];
+  typedef const void *ptr_to_const_void;
+  char *p = out; const ptr_to_const_void *q = in;
+#define WITH_DOMAIN(T) \
+  for(i=vn;i;--i) scatter_##T((T*)p,vn,*q++,1,map), p+=unit_size
+  SWITCH_DOMAIN(dom);
+#undef  WITH_DOMAIN
+}
+
+void gs_scatter_vec_to_many(void *out, const void *in, const unsigned vn,
+                            const uint *map, gs_dom dom)
+{
+  unsigned i; const unsigned unit_size = gs_dom_size[dom];
+  typedef void *ptr_to_void;
+  const ptr_to_void *p = out; const char *q = in;
+#define WITH_DOMAIN(T) \
+  for(i=vn;i;--i) scatter_##T(*p++,1,(const T*)q,vn,map), q+=unit_size
+  SWITCH_DOMAIN(dom);
+#undef  WITH_DOMAIN
+}
+
+#undef SWITCH_OP
+#undef SWITCH_OP_CASE
+#undef SWITCH_DOMAIN
+#undef SWITCH_DOMAIN_CASE
diff --git a/src/jl/gs_local.h b/src/jl/gs_local.h
new file mode 100644
index 0000000..fc7c414
--- /dev/null
+++ b/src/jl/gs_local.h
@@ -0,0 +1,43 @@
+#ifndef GS_LOCAL_H
+#define GS_LOCAL_H
+
+#if !defined(NAME_H) || !defined(TYPES_H) || !defined(GS_DEFS_H)
+#warning "gs_local.h" requires "name.h", "types.h", and "gs_defs.h"
+#endif
+
+#define gs_gather_array        PREFIXED_NAME(gs_gather_array       )
+#define gs_init_array          PREFIXED_NAME(gs_init_array         )
+#define gs_gather              PREFIXED_NAME(gs_gather             )
+#define gs_scatter             PREFIXED_NAME(gs_scatter            )
+#define gs_init                PREFIXED_NAME(gs_init               )
+#define gs_gather_vec          PREFIXED_NAME(gs_gather_vec         )
+#define gs_scatter_vec         PREFIXED_NAME(gs_scatter_vec        )
+#define gs_init_vec            PREFIXED_NAME(gs_init_vec           )
+#define gs_gather_many         PREFIXED_NAME(gs_gather_many        )
+#define gs_scatter_many        PREFIXED_NAME(gs_scatter_many       )
+#define gs_init_many           PREFIXED_NAME(gs_init_many          )
+#define gs_gather_vec_to_many  PREFIXED_NAME(gs_gather_vec_to_many )
+#define gs_scatter_many_to_vec PREFIXED_NAME(gs_scatter_many_to_vec)
+#define gs_scatter_vec_to_many PREFIXED_NAME(gs_scatter_vec_to_many)
+
+void gs_gather_array(void *out, const void *in, uint n,
+                     gs_dom dom, gs_op op);
+void gs_init_array(void *out, uint n, gs_dom dom, gs_op op);
+
+typedef void gs_gather_fun(
+  void *out, const void *in, const unsigned vn,
+  const uint *map, gs_dom dom, gs_op op);
+typedef void gs_scatter_fun(
+  void *out, const void *in, const unsigned vn,
+  const uint *map, gs_dom dom);
+typedef void gs_init_fun(
+  void *out, const unsigned vn,
+  const uint *map, gs_dom dom, gs_op op);
+
+extern gs_gather_fun gs_gather, gs_gather_vec, gs_gather_many,
+                     gs_gather_vec_to_many;
+extern gs_scatter_fun gs_scatter, gs_scatter_vec, gs_scatter_many,
+                      gs_scatter_many_to_vec, gs_scatter_vec_to_many;
+extern gs_init_fun gs_init, gs_init_vec, gs_init_many;
+
+#endif
diff --git a/src/jl/gs_test.c b/src/jl/gs_test.c
new file mode 100644
index 0000000..588a52b
--- /dev/null
+++ b/src/jl/gs_test.c
@@ -0,0 +1,68 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "c99.h"
+#include "name.h"
+#include "fail.h"
+#include "types.h"
+#include "comm.h"
+#include "mem.h"
+#include "gs_defs.h"
+#include "gs.h"
+
+typedef double T;
+const gs_dom dom = gs_double;
+
+static void test(const struct comm *comm)
+{
+  struct gs_data *gsh;
+  const uint np = comm->np;
+  slong *id = tmalloc(slong,np+4);
+  T *v = tmalloc(T,np+4);
+  uint i;
+  id[0] = -(slong)(np+10+3*comm->id);
+  for(i=0;i<np;++i) id[i+1] = -(sint)(i+1);
+  id[np+1] = comm->id+1;
+  id[np+2] = comm->id+1;
+  id[np+3] = np-comm->id;
+  gsh = gs_setup(id,np+4,comm,0,gs_auto,1);
+  free(id);
+  
+  for(i=0;i<np+4;++i) v[i] = 1;
+  gs(v,dom,gs_add,0,gsh,0);
+  if(comm->id==0) for(i=0;i<np+4;++i) printf("%g\n",v[i]);
+  if(comm->id==0) printf("\n");
+  for(i=0;i<np+4;++i) v[i] = 1;
+  gs(v,dom,gs_add,1,gsh,0);
+  if(comm->id==0) for(i=0;i<np+4;++i) printf("%g\n",v[i]);
+
+  gs_free(gsh);
+  free(v);
+}
+
+int main(int narg, char *arg[])
+{
+  comm_ext world; int np;
+  struct comm comm;
+  
+#ifdef MPI
+  MPI_Init(&narg,&arg);
+  world = MPI_COMM_WORLD;
+  MPI_Comm_size(world,&np);
+#else
+  world=0, np=1;
+#endif
+
+  comm_init(&comm,world);
+
+  test(&comm);
+  
+  comm_free(&comm);
+
+#ifdef MPI
+  MPI_Finalize();
+#endif
+
+  return 0;
+}
diff --git a/src/jl/gs_test_old.c b/src/jl/gs_test_old.c
new file mode 100644
index 0000000..b75a2af
--- /dev/null
+++ b/src/jl/gs_test_old.c
@@ -0,0 +1,147 @@
+/* simple stand-alone test for parallel gather-scatter routines
+   assumes gather-scatter routines were compiled with default names
+   can compile to sequential version if MPI is not defined
+   
+   the test is as follows, where N is the number of procs:
+     there are N physical nodes (vertices)
+     each proc has 2 local/virtual nodes mapping to each physical node,
+       for a total of 2*N*N virtual nodes
+     virtual nodes are given values that correspond to a sequential ordering
+       (so that they range from 0 to 2*N*N-1)
+       the addition operation is performed and the result is checked,
+       the correct result being known a priori
+     the addition operation is also checked, in a similar manner, for
+       both the cpgs_op_vec and cpgs_op_many routines with vector dimension 3
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#ifdef MPI
+#  include <mpi.h>
+#else
+   typedef void MPI_Comm;
+#endif
+#include "name.h"
+#include "types.h"
+
+typedef long real;
+sint datatype = 4;
+
+#define fgs_setup     FORTRAN_NAME(gs_setup    ,GS_SETUP    )
+#define fgs_op        FORTRAN_NAME(gs_op       ,GS_OP       )
+#define fgs_op_vec    FORTRAN_NAME(gs_op_vec   ,GS_OP_VEC   )
+#define fgs_op_many   FORTRAN_NAME(gs_op_many  ,GS_OP_MANY  )
+#define fgs_op_fields FORTRAN_NAME(gs_op_fields,GS_OP_FIELDS)
+#define fgs_free      FORTRAN_NAME(gs_free     ,GS_FREE     )
+
+void fgs_setup(sint *handle, const slong id[], const sint *n,
+               const MPI_Comm *comm, const sint *np);
+void fgs_op(const sint *handle, void *u, const sint *dom, const sint *op,
+            const sint *transpose);
+void fgs_op_vec(const sint *handle, void *u, const sint *n,
+                const sint *dom, const sint *op, const sint *transpose);
+void fgs_op_many(const sint *handle, void *u1, void *u2, void *u3,
+                 void *u4, void *u5, void *u6, const sint *n,
+                 const sint *dom, const sint *op, const sint *transpose);
+void fgs_free(const sint *handle);
+
+void assert_is_zero(real v)
+{
+  if(fabs(v) < 1e-20) return;
+  printf("test failed\n");
+  exit(1);
+}
+
+int main(int narg, char* arg[])
+{
+  sint transpose=0;
+  sint id=0,np=1;
+  sint i,handle,maxv=3;
+  real *u;
+  slong *glindex;
+#ifndef MPI
+  int comm;
+#else
+  MPI_Comm comm;
+  MPI_Init(&narg,&arg);
+  MPI_Comm_dup(MPI_COMM_WORLD,&comm);
+  { int i;
+    MPI_Comm_rank(comm,&i); id=i;
+    MPI_Comm_size(comm,&i); np=i;
+  }
+#endif
+
+  glindex = malloc(np*2*sizeof(slong));
+  for(i=0;i<np;++i) glindex[2*i+1] = glindex[2*i] = i+1;
+  i=np*2;
+  fgs_setup(&handle,glindex,&i,&comm,&np);
+  free(glindex);
+  
+  u = malloc(np*2*sizeof(real));
+  for(i=0;i<np;++i) u[2*i  ] = (real)( 2*np*id + 2*i ),
+                    u[2*i+1] = (real)( 2*np*id + 2*i+1 );
+  /*for(i=0;i<np;++i) printf(" (%g %g)", u[2*i], u[2*i+1]); printf("\n");*/
+  i=1, fgs_op(&handle,u,&datatype,&i,&transpose);
+  /*for(i=0;i<np;++i) printf(" (%g %g)", u[2*i], u[2*i+1]); printf("\n");*/
+  for(i=0;i<np;++i) assert_is_zero( np*(2*np*(np-1)+4*i+1) - u[2*i] ),
+                    assert_is_zero( np*(2*np*(np-1)+4*i+1) - u[2*i+1]  );
+  free(u);
+
+  u = malloc(np*2*3*sizeof(real));
+  for(i=0;i<np;++i)
+    u[3*(2*i  )+0] = (real)( 3*(2*np*id + 2*i  ) + 0 ),
+    u[3*(2*i  )+1] = (real)( 3*(2*np*id + 2*i  ) + 1 ),
+    u[3*(2*i  )+2] = (real)( 3*(2*np*id + 2*i  ) + 2 ),
+    u[3*(2*i+1)+0] = (real)( 3*(2*np*id + 2*i+1) + 0 ),
+    u[3*(2*i+1)+1] = (real)( 3*(2*np*id + 2*i+1) + 1 ),
+    u[3*(2*i+1)+2] = (real)( 3*(2*np*id + 2*i+1) + 2 );
+  /*for(i=0;i<np;++i) {
+    int j;
+    printf("%d: ( ", id);
+    for(j=3*(2*i);j<=3*(2*i+1)+2;++j) printf("%g ",u[j]);
+    printf(")\n");
+  }*/
+  i=1, maxv=3, fgs_op_vec(&handle,u,&maxv,&datatype,&i,&transpose);
+  /*for(i=0;i<np;++i) {
+    int j;
+    printf("%d: ( ", id);
+    for(j=3*(2*i);j<=3*(2*i+1)+2;++j) printf("%g ",u[j]);
+    printf(")\n");
+  }*/
+  for(i=0;i<np;++i)
+    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*0) - u[3*(2*i  )+0] ),
+    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*1) - u[3*(2*i  )+1] ),
+    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*2) - u[3*(2*i  )+2] ),
+    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*0) - u[3*(2*i+1)+0] ),
+    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*1) - u[3*(2*i+1)+1] ),
+    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*2) - u[3*(2*i+1)+2] );
+  free(u);
+
+  u = malloc(np*2*3*sizeof(real));
+  for(i=0;i<np;++i)
+    u[2*np*0+(2*i  )] = (real)( 3*(2*np*id + 2*i  ) + 0 ),
+    u[2*np*1+(2*i  )] = (real)( 3*(2*np*id + 2*i  ) + 1 ),
+    u[2*np*2+(2*i  )] = (real)( 3*(2*np*id + 2*i  ) + 2 ),
+    u[2*np*0+(2*i+1)] = (real)( 3*(2*np*id + 2*i+1) + 0 ),
+    u[2*np*1+(2*i+1)] = (real)( 3*(2*np*id + 2*i+1) + 1 ),
+    u[2*np*2+(2*i+1)] = (real)( 3*(2*np*id + 2*i+1) + 2 );
+  i=1, maxv=3, fgs_op_many(&handle,u,u+2*np,u+4*np,0,0,0,&maxv,
+                           &datatype,&i,&transpose);
+  for(i=0;i<np;++i)
+    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*0) - u[2*np*0+(2*i  )] ),
+    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*1) - u[2*np*1+(2*i  )] ),
+    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*2) - u[2*np*2+(2*i  )] ),
+    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*0) - u[2*np*0+(2*i+1)] ),
+    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*1) - u[2*np*1+(2*i+1)] ),
+    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*2) - u[2*np*2+(2*i+1)] );
+  free(u);
+  
+  fgs_free(&handle);
+  printf("test on node %d/%d succeeded\n", (int)id+1, (int)np);
+#ifdef MPI  
+  MPI_Comm_free(&comm);
+  MPI_Finalize();
+#endif
+  return 0;
+}
diff --git a/src/jl/gs_unique_test.c b/src/jl/gs_unique_test.c
new file mode 100644
index 0000000..ea416dd
--- /dev/null
+++ b/src/jl/gs_unique_test.c
@@ -0,0 +1,72 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "c99.h"
+#include "name.h"
+#include "fail.h"
+#include "types.h"
+#include "comm.h"
+#include "mem.h"
+#include "gs_defs.h"
+#include "gs.h"
+
+static void test(const struct comm *comm)
+{
+  uint i,np=comm->np,id=comm->id;
+  slong *glindex = tmalloc(slong,np*2);
+  char *out, *buf = tmalloc(char,80+np*2*30);
+  struct gs_data *gsh;
+  
+  for(i=0;i<np;++i) glindex[2*i+1]=glindex[2*i]=i+1;
+  
+  out = buf+sprintf(buf, "%03d bgn : [", (int)comm->id);
+  for(i=0;i<np*2;++i) out += sprintf(out, " %+d", (int)glindex[i]);
+  sprintf(out," ]"), puts(buf);
+  
+  gs_unique(glindex,np*2,comm);
+
+  out = buf+sprintf(buf, "%03d end : [", (int)comm->id);
+  for(i=0;i<np*2;++i) out += sprintf(out, " %+d", (int)glindex[i]);
+  sprintf(out," ]"), puts(buf);
+
+
+  for(i=0;i<np;++i) glindex[2*i+1]=glindex[2*i]=i+1;
+  gsh=gs_setup(glindex,np*2,comm,1,gs_auto,1);
+  for(i=0;i<np;++i) glindex[2*i+1]=glindex[2*i]=id;
+  gs(glindex,gs_slong,gs_add,0,gsh,0);
+  gs_free(gsh);
+
+  out = buf+sprintf(buf, "%03d own : [", (int)comm->id);
+  for(i=0;i<np*2;++i) out += sprintf(out, " %+d", (int)glindex[i]);
+  sprintf(out," ]"), puts(buf);
+
+  free(buf);
+  free(glindex);
+}
+
+int main(int narg, char *arg[])
+{
+  comm_ext world; int np;
+  struct comm comm;
+  
+#ifdef MPI
+  MPI_Init(&narg,&arg);
+  world = MPI_COMM_WORLD;
+  MPI_Comm_size(world,&np);
+#else
+  world=0, np=1;
+#endif
+
+  comm_init(&comm,world);
+
+  test(&comm);
+  
+  comm_free(&comm);
+
+#ifdef MPI
+  MPI_Finalize();
+#endif
+
+  return 0;
+}
diff --git a/src/jl/makefile.cdep b/src/jl/makefile.cdep
new file mode 100644
index 0000000..ae6672b
--- /dev/null
+++ b/src/jl/makefile.cdep
@@ -0,0 +1,48 @@
+amg.o: amg.c gs.h sarray_transfer.h crystal.h comm.h gs_defs.h sarray_sort.h sort.h mem.h fail.h types.h name.h c99.h
+comm.o: comm.c comm.h gs_local.h gs_defs.h tensor.h types.h fail.h name.h
+comm_test.o: comm_test.c comm.h gs_defs.h types.h fail.h name.h
+crs_test.o: crs_test.c crs.h gs.h comm.h gs_defs.h mem.h types.h fail.h name.h c99.h
+crystal.o: crystal.c mem.h comm.h types.h fail.h name.h c99.h
+crystal_test.o: crystal_test.c crystal.h mem.h comm.h types.h fail.h name.h c99.h
+fail.o: fail.c comm.h types.h fail.h name.h
+fcrs.o: fcrs.c crs.h comm.h mem.h types.h fail.h name.h c99.h
+fcrystal.o: fcrystal.c sarray_transfer.h sarray_sort.h sort.h crystal.h comm.h mem.h types.h fail.h name.h c99.h
+findpts.o: findpts.c findpts_imp.h findpts_imp.h sarray_sort.h sort.h sarray_transfer.h crystal.h comm.h gs_defs.h findpts_local.h findpts_el.h obbox.h poly.h mem.h fail.h types.h name.h c99.h
+findpts_el_2.o: findpts_el_2.c poly.h tensor.h mem.h types.h fail.h name.h c99.h
+findpts_el_2_test2.o: findpts_el_2_test2.c rdtsc.h rand_elt_test.h findpts_el.h obbox.h lob_bnd.h poly.h tensor.h mem.h fail.h name.h types.h c99.h
+findpts_el_2_test.o: findpts_el_2_test.c findpts_el.h poly.h mem.h fail.h types.h name.h c99.h
+findpts_el_3.o: findpts_el_3.c poly.h tensor.h mem.h types.h fail.h name.h c99.h
+findpts_el_3_test2.o: findpts_el_3_test2.c rdtsc.h rand_elt_test.h findpts_el.h obbox.h lob_bnd.h poly.h tensor.h mem.h fail.h name.h types.h c99.h
+findpts_el_3_test.o: findpts_el_3_test.c findpts_el.h poly.h mem.h fail.h types.h name.h c99.h
+findpts_local.o: findpts_local.c findpts_local_imp.h findpts_local_imp.h findpts_el.h sarray_sort.h sort.h poly.h obbox.h mem.h fail.h name.h types.h c99.h
+findpts_local_test.o: findpts_local_test.c rand_elt_test.h findpts_local.h findpts_el.h obbox.h poly.h types.h mem.h fail.h name.h c99.h
+findpts_test.o: findpts_test.c sarray_transfer.h crystal.h findpts.h rand_elt_test.h comm.h gs_defs.h poly.h mem.h types.h fail.h name.h c99.h
+gen_poly_imp.o: gen_poly_imp.c
+gs.o: gs.c sarray_transfer.h sarray_sort.h crystal.h sort.h mem.h comm.h gs_local.h gs_defs.h types.h fail.h name.h c99.h
+gs_local.o: gs_local.c gs_defs.h types.h name.h c99.h
+gs_test.o: gs_test.c gs.h gs_defs.h mem.h comm.h types.h fail.h name.h c99.h
+gs_test_old.o: gs_test_old.c types.h name.h
+gs_unique_test.o: gs_unique_test.c gs.h gs_defs.h mem.h comm.h types.h fail.h name.h c99.h
+lob_bnd.o: lob_bnd.c poly.h mem.h fail.h types.h name.h c99.h
+lob_bnd_test.o: lob_bnd_test.c lob_bnd.h poly.h tensor.h mem.h fail.h name.h types.h c99.h
+obbox.o: obbox.c lob_bnd.h poly.h tensor.h mem.h types.h fail.h name.h c99.h
+obbox_test.o: obbox_test.c rand_elt_test.h obbox.h lob_bnd.h poly.h mem.h fail.h name.h types.h c99.h
+poly.o: poly.c poly_imp.h mem.h types.h fail.h name.h c99.h
+poly_test2.o: poly_test2.c rdtsc.h poly.h mem.h fail.h name.h types.h c99.h
+poly_test.o: poly_test.c poly.h types.h name.h c99.h
+rand_elt_test.o: rand_elt_test.c lob_bnd.h poly.h name.h types.h c99.h
+sarray_sort.o: sarray_sort.c sort.h mem.h fail.h types.h name.h c99.h
+sarray_sort_test.o: sarray_sort_test.c sarray_sort.h sort.h mem.h types.h fail.h name.h c99.h
+sarray_transfer.o: sarray_transfer.c sort.h crystal.h mem.h comm.h types.h fail.h name.h c99.h
+sarray_transfer_test.o: sarray_transfer_test.c sarray_transfer.h crystal.h sarray_sort.h sort.h mem.h comm.h types.h fail.h name.h c99.h
+sort.o: sort.c sort_imp.h sort_imp.h sort_imp.h mem.h types.h fail.h name.h c99.h
+sort_test2.o: sort_test2.c rdtsc.h sort.h mem.h types.h fail.h name.h c99.h
+sort_test.o: sort_test.c sort.h mem.h types.h fail.h name.h c99.h
+sparse_cholesky.o: sparse_cholesky.c sort.h mem.h types.h fail.h name.h c99.h
+spchol_test.o: spchol_test.c sparse_cholesky.h mem.h types.h fail.h name.h c99.h
+tensor.o: tensor.c types.h name.h c99.h
+xxt.o: xxt.c gs.h sparse_cholesky.h sarray_sort.h sort.h mem.h comm.h gs_defs.h tensor.h types.h fail.h name.h c99.h
+xxt_test2.o: xxt_test2.c crs.h mem.h comm.h types.h fail.h name.h c99.h
+xxt_test.o: xxt_test.c crs.h comm.h types.h fail.h name.h
+
+OBJECTS= amg.o comm.o comm_test.o crs_test.o crystal.o crystal_test.o fail.o fcrs.o fcrystal.o findpts.o findpts_el_2.o findpts_el_2_test2.o findpts_el_2_test.o findpts_el_3.o findpts_el_3_test2.o findpts_el_3_test.o findpts_local.o findpts_local_test.o findpts_test.o gen_poly_imp.o gs.o gs_local.o gs_test.o gs_test_old.o gs_unique_test.o lob_bnd.o lob_bnd_test.o obbox.o obbox_test.o poly.o poly_test2.o poly_test.o rand_elt_test.o sarray_sort.o sarray_sort_test.o sarray_transfer.o sarray_transfer_test.o sort.o sort_test2.o sort_test.o sparse_cholesky.o spchol_test.o tensor.o xxt.o xxt_test2.o xxt_test.o
diff --git a/src/jl/mem.h b/src/jl/mem.h
new file mode 100644
index 0000000..e55b81a
--- /dev/null
+++ b/src/jl/mem.h
@@ -0,0 +1,168 @@
+#ifndef MEM_H
+#define MEM_H
+
+/* requires:
+     <stddef.h> for size_t, offsetof
+     <stdlib.h> for malloc, calloc, realloc, free
+     <string.h> for memcpy
+     "c99.h"
+     "fail.h"
+*/
+
+#if !defined(C99_H) || !defined(FAIL_H)
+#error "mem.h" requires "c99.h" and "fail.h"
+#endif
+
+/* 
+   All memory management goes through the wrappers defined in this
+   header. Diagnostics can be turned on with
+     -DPRINT_MALLOCS=1
+   Then all memory management operations will be printed to stdout.
+   
+   Most memory management occurs through use of the "array" type,
+   defined below, which defines a generic dynamically-sized array
+   that grows in bursts. The "buffer" type is a "char" array and
+   is often passed around by code to provide a common area for
+   scratch work.
+*/
+
+#ifndef PRINT_MALLOCS
+#  define PRINT_MALLOCS 0
+#else
+#  include <stdio.h>
+#  ifndef comm_gbl_id
+#    define comm_gbl_id PREFIXED_NAME(comm_gbl_id)
+#    define comm_gbl_np PREFIXED_NAME(comm_gbl_np)
+#    include "types.h"
+     extern uint comm_gbl_id, comm_gbl_np;
+#  endif
+#endif
+
+/*--------------------------------------------------------------------------
+   Memory Allocation Wrappers to Catch Out-of-memory
+  --------------------------------------------------------------------------*/
+
+static inline void *smalloc(size_t size, const char *file, unsigned line)
+{
+  void *restrict res = malloc(size);
+  #if PRINT_MALLOCS
+  fprintf(stdout,"MEM: proc %04d: %p = malloc(%ld) @ %s(%u)\n",
+          (int)comm_gbl_id,res,(long)size,file,line), fflush(stdout);
+  #endif
+  if(!res && size)
+    fail(1,file,line,"allocation of %ld bytes failed\n",(long)size);
+  return res;
+}
+
+static inline void *scalloc(
+  size_t nmemb, size_t size, const char *file, unsigned line)
+{
+  void *restrict res = calloc(nmemb, size);
+  #if PRINT_MALLOCS
+  fprintf(stdout,"MEM: proc %04d: %p = calloc(%ld) @ %s(%u)\n",
+          (int)comm_gbl_id,res,(long)size*nmemb,file,line), fflush(stdout);
+  #endif
+  if(!res && nmemb)
+    fail(1,file,line,"allocation of %ld bytes failed\n",
+           (long)size*nmemb);
+  return res;
+}
+
+static inline void *srealloc(
+  void *restrict ptr, size_t size, const char *file, unsigned line)
+{
+  void *restrict res = realloc(ptr, size);
+  #if PRINT_MALLOCS
+  if(res!=ptr) {
+    if(ptr)
+      fprintf(stdout,"MEM: proc %04d: %p freed by realloc @ %s(%u)\n",
+              (int)comm_gbl_id,ptr,file,line), fflush(stdout);
+    fprintf(stdout,"MEM: proc %04d: %p = realloc of %p to %lu @ %s(%u)\n",
+            (int)comm_gbl_id,res,ptr,(long)size,file,line), fflush(stdout);
+  } else
+    fprintf(stdout,"MEM: proc %04d: %p realloc'd to %lu @ %s(%u)\n",
+            (int)comm_gbl_id,res,(long)size,file,line), fflush(stdout);
+  #endif
+  if(!res && size)
+    fail(1,file,line,"allocation of %ld bytes failed\n",(long)size);
+  return res;
+}
+
+#define tmalloc(type, count) \
+  ((type*) smalloc((count)*sizeof(type),__FILE__,__LINE__) )
+#define tcalloc(type, count) \
+  ((type*) scalloc((count),sizeof(type),__FILE__,__LINE__) )
+#define trealloc(type, ptr, count) \
+  ((type*) srealloc((ptr),(count)*sizeof(type),__FILE__,__LINE__) )
+
+#if PRINT_MALLOCS
+static inline void sfree(void *restrict ptr, const char *file, unsigned line)
+{
+  free(ptr);
+  fprintf(stdout,"MEM: proc %04d: %p freed @ %s(%u)\n",
+          (int)comm_gbl_id,ptr,file,line), fflush(stdout);
+}
+#define free(x) sfree(x,__FILE__,__LINE__)
+#endif
+
+/*--------------------------------------------------------------------------
+   A dynamic array
+  --------------------------------------------------------------------------*/
+struct array { void *ptr; size_t n,max; };
+#define null_array {0,0,0}
+static void array_init_(struct array *a, size_t max, size_t size,
+                        const char *file, unsigned line)
+{
+  a->n=0, a->max=max, a->ptr=smalloc(max*size,file,line);
+}
+static void array_resize_(struct array *a, size_t max, size_t size,
+                          const char *file, unsigned line)
+{
+  a->max=max, a->ptr=srealloc(a->ptr,max*size,file,line);
+}
+static void *array_reserve_(struct array *a, size_t min, size_t size,
+                            const char *file, unsigned line)
+{
+  size_t max = a->max;
+  if(max<min) {
+    max+=max/2+1;
+    if(max<min) max=min;
+    array_resize_(a,max,size,file,line);
+  }
+  return a->ptr;
+}
+
+#define array_free(a) (free((a)->ptr))
+#define array_init(T,a,max) array_init_(a,max,sizeof(T),__FILE__,__LINE__)
+#define array_resize(T,a,max) array_resize_(a,max,sizeof(T),__FILE__,__LINE__)
+#define array_reserve(T,a,min) array_reserve_(a,min,sizeof(T),__FILE__,__LINE__)
+
+static void array_cat_(size_t size, struct array *d, const void *s, size_t n,
+                       const char *file, unsigned line)
+{
+  char *out = array_reserve_(d,d->n+n,size, file,line);
+  memcpy(out+d->n*size, s, n*size);
+  d->n+=n;
+}
+
+#define array_cat(T,d,s,n) array_cat_(sizeof(T),d,s,n,__FILE__,__LINE__)
+
+/*--------------------------------------------------------------------------
+   Buffer = char array
+  --------------------------------------------------------------------------*/
+typedef struct array buffer;
+#define null_buffer null_array
+#define buffer_init(b,max) array_init(char,b,max)
+#define buffer_resize(b,max) array_resize(char,b,max)
+#define buffer_reserve(b,max) array_reserve(char,b,max)
+#define buffer_free(b) array_free(b)
+
+/*--------------------------------------------------------------------------
+   Alignment routines
+  --------------------------------------------------------------------------*/
+#define ALIGNOF(T) offsetof(struct { char c; T x; }, x)
+static size_t align_as_(size_t a, size_t n) { return (n+a-1)/a*a; }
+#define align_as(T,n) align_as_(ALIGNOF(T),n)
+#define align_ptr(T,base,offset) ((T*)((char*)(base)+align_as(T,offset)))
+#endif
+
diff --git a/src/jl/name.h b/src/jl/name.h
new file mode 100644
index 0000000..b4bcd91
--- /dev/null
+++ b/src/jl/name.h
@@ -0,0 +1,44 @@
+#ifndef NAME_H
+#define NAME_H
+
+/* establishes some macros to establish
+   * the FORTRAN naming convention
+     default      gs_setup, etc.
+     -DUPCASE     GS_SETUP, etc.
+     -DUNDERSCORE gs_setup_, etc.
+   * a prefix for all external (non-FORTRAN) function names
+     for example, -DPREFIX=jl_   transforms fail -> jl_fail
+   * a prefix for all external FORTRAN function names     
+     for example, -DFPREFIX=jlf_ transforms gs_setup_ -> jlf_gs_setup_
+*/
+
+/* the following macro functions like a##b,
+   but will expand a and/or b if they are themselves macros */
+#define TOKEN_PASTE_(a,b) a##b
+#define TOKEN_PASTE(a,b) TOKEN_PASTE_(a,b)
+
+#ifdef PREFIX
+#  define PREFIXED_NAME(x) TOKEN_PASTE(PREFIX,x)
+#else
+#  define PREFIXED_NAME(x) x
+#endif
+
+#ifdef FPREFIX
+#  define FPREFIXED_NAME(x) TOKEN_PASTE(FPREFIX,x)
+#else
+#  define FPREFIXED_NAME(x) x
+#endif
+
+#if defined(UPCASE)
+#  define FORTRAN_NAME(low,up) FPREFIXED_NAME(up)
+#  define FORTRAN_UNPREFIXED(low,up) up
+#elif defined(UNDERSCORE)
+#  define FORTRAN_NAME(low,up) FPREFIXED_NAME(TOKEN_PASTE(low,_))
+#  define FORTRAN_UNPREFIXED(low,up) TOKEN_PASTE(low,_)
+#else
+#  define FORTRAN_NAME(low,up) FPREFIXED_NAME(low)
+#  define FORTRAN_UNPREFIXED(low,up) low
+#endif
+
+#endif
+
diff --git a/src/jl/odep_info.py b/src/jl/odep_info.py
new file mode 100755
index 0000000..620d0ec
--- /dev/null
+++ b/src/jl/odep_info.py
@@ -0,0 +1,50 @@
+#!/usr/bin/python
+
+import sys, os, re
+
+obj_files = sys.argv[1:]
+
+defined = dict((x,set([])) for x in obj_files)
+undefined = dict((x,set([])) for x in obj_files)
+nm_re = re.compile("[0-9a-fA-F]*\s*([BCDRTU])\s+([A-Za-z_][A-Za-z_0-9]*)\s*")
+def nm_match(x): return ( nm_re.match(line) for line in os.popen('nm -g '+x) )
+def nm_line(x,m):
+	if m.group(1)=='U': undefined[x].add(m.group(2))
+	else: defined[x].add(m.group(2))
+[ [ nm_line(x,m) for m in nm_match(x) if m!=None ] for x in obj_files ]
+
+def closure(seq,f):
+	v = [], [x for x in seq], set(x for x in seq)
+	while len(v[1]): [(v[1].append(y),v[2].add(y)) for y in 
+	  f((lambda x: (v[0].append(x),x)[1])(v[1].pop())) if not y in v[2]]
+	return v[0]
+
+needs={}
+def get_needs(x):
+	if not needs.has_key(x):
+		needs[x]=[y for y in obj_files if len(defined[y]&undefined[x])]
+	return needs[x]
+deps = dict((x,closure(get_needs(x),get_needs)) for x in obj_files)
+
+for x in deps:
+	print x,'depends on',reduce((lambda a,b: a+" "+b),deps[x],"")
+print
+
+results = [ os.path.splitext(x)[0] for x in obj_files if 'main' in defined[x] ]
+print "RESULTS="+reduce((lambda a,b: a+" "+b),results,"")
+print
+
+def need_X(objs):
+	for x in objs:
+		if "XOpenDisplay" in undefined[x]: return True
+	return False
+
+for x in results:
+	objs = deps[x+'.o'];
+	if not (x+'.o') in objs: objs.append(x+'.o')
+	sobjs = reduce((lambda a,b: a+" "+b),objs,"")
+	if need_X(objs):
+		print x+":"+sobjs+" ; @echo LINK $@; $(LINKCMD) $^ -lX11 -o $@"
+	else:
+		print x+":"+sobjs+" ; @echo LINK $@; $(LINKCMD) $^ -o $@"
+
diff --git a/src/jl/rand_elt_test.c b/src/jl/rand_elt_test.c
new file mode 100644
index 0000000..1e11dae
--- /dev/null
+++ b/src/jl/rand_elt_test.c
@@ -0,0 +1,169 @@
+#include <stdlib.h>
+#include <math.h>
+#include "c99.h"
+#include "types.h"
+#include "name.h"
+#include "poly.h"
+#include "lob_bnd.h"
+
+static double det_2(const double A[4]) { return A[0]*A[3]-A[1]*A[2]; }
+
+static double quad_2(const double x0, const double g[2], const double H[3],
+                     const double r[2])
+{
+  return x0 + (g[0]*r[0]+g[1]*r[1])
+            + (  r[0] * (H[0]*r[0]+H[1]*r[1])
+               + r[1] * (H[1]*r[0]+H[2]*r[1]) )/2;
+}
+
+static void quad_2_grad(double grad[2], const double g[2], const double H[3],
+                        const double r[2])
+{
+  grad[0] = g[0] + (H[0]*r[0]+H[1]*r[1]);
+  grad[1] = g[1] + (H[1]*r[0]+H[2]*r[1]);
+}
+
+static double quad_2_jac(const double g[4], const double H[6],
+                         const double r[2])
+{
+  double J[4];
+  quad_2_grad(J  ,g  ,H  ,r);
+  quad_2_grad(J+2,g+2,H+3,r);
+  return det_2(J);
+}
+
+static double det_3(const double A[9])
+{
+  const double a = A[4]*A[8]-A[5]*A[7],
+               b = A[5]*A[6]-A[3]*A[8],
+               c = A[3]*A[7]-A[4]*A[6];
+  return A[0]*a+A[1]*b+A[2]*c;
+}
+
+static double quad_3(const double x0, const double g[3], const double H[6],
+                     const double r[3])
+{
+  return x0 + (g[0]*r[0]+g[1]*r[1]+g[2]*r[2])
+            + (  r[0] * (H[0]*r[0]+H[1]*r[1]+H[2]*r[2])
+               + r[1] * (H[1]*r[0]+H[3]*r[1]+H[4]*r[2])
+               + r[2] * (H[2]*r[0]+H[4]*r[1]+H[5]*r[2]) )/2;
+}
+
+static void quad_3_grad(double grad[3], const double g[3], const double H[6],
+                        const double r[3])
+{
+  grad[0] = g[0] + (H[0]*r[0]+H[1]*r[1]+H[2]*r[2]);
+  grad[1] = g[1] + (H[1]*r[0]+H[3]*r[1]+H[4]*r[2]);
+  grad[2] = g[2] + (H[2]*r[0]+H[4]*r[1]+H[5]*r[2]);
+}
+
+static double quad_3_jac(const double g[9], const double H[18],
+                         const double r[3])
+{
+  double J[9];
+  quad_3_grad(J  ,g  ,H   ,r);
+  quad_3_grad(J+3,g+3,H+ 6,r);
+  quad_3_grad(J+6,g+6,H+12,r);
+  return det_3(J);
+}
+
+void rand_elt_2(double *x, double *y,
+                const double *zr, unsigned nr,
+                const double *zs, unsigned ns)
+{
+  static int init=0;
+  static double z4[4], lob_bnd_data[16+3*4*(2*16+1)],
+                work[2*16*(4+16+1)];
+  unsigned i,j;
+  double x0[2], g[4], H[6], jac[4*4], r[2];
+  struct dbl_range jr;
+  if(!init) {
+    init=1;
+    lobatto_nodes(z4,4);
+    lob_bnd_setup(lob_bnd_data,4,16);
+  }
+  do {
+    for(i=0;i<4;++i) g[i] = -1+2*(rand()/(double)RAND_MAX);
+    for(i=0;i<6;++i) H[i] =.5*(-1+2*(rand()/(double)RAND_MAX));
+    for(j=0;j<4;++j) { r[1] = z4[j];
+      for(i=0;i<4;++i) { r[0] = z4[i];
+        jac[j*4+i] = quad_2_jac(g,H,r);
+      }
+    }
+    jr = lob_bnd_2(lob_bnd_data,4,16, lob_bnd_data,4,16, jac, work);
+    /*printf("Jacobian range %g, %g\n", jr.min, jr.max);*/
+  } while(jr.max*jr.min<=0);
+  for(i=0;i< 2;++i) x0[i] = -1+2*(rand()/(double)RAND_MAX);
+  for(j=0;j<ns;++j) {   r[1] = zs[j];
+    for(i=0;i<nr;++i) { r[0] = zr[i];
+      x[j*nr+i] = quad_2(x0[0],g  ,H  ,r);
+      y[j*nr+i] = quad_2(x0[1],g+2,H+3,r);
+    }
+  }
+}
+
+void rand_elt_3(double *x, double *y, double *z,
+                const double *zr, unsigned nr,
+                const double *zs, unsigned ns,
+                const double *zt, unsigned nt)
+{
+  static int init=0;
+  static double z4[4], lob_bnd_data[16+3*4*(2*16+1)],
+                work[2*16*16*(4+16+1)];
+  unsigned i,j,k;
+  double x0[3], g[9], H[18], jac[4*4*4], r[3];
+  struct dbl_range jr;
+  if(!init) {
+    init=1;
+    lobatto_nodes(z4,4);
+    lob_bnd_setup(lob_bnd_data,4,16);
+  }
+  do {
+    for(i=0;i< 9;++i) g[i] = -1+2*(rand()/(double)RAND_MAX);
+    for(i=0;i<18;++i) H[i] =.5*(-1+2*(rand()/(double)RAND_MAX));
+    for(k=0;k<4;++k) { r[2] = z4[k];
+      for(j=0;j<4;++j) { r[1] = z4[j];
+        for(i=0;i<4;++i) { r[0] = z4[i];
+          jac[(k*4+j)*4+i] = quad_3_jac(g,H,r);
+        }
+      }
+    }
+    jr = lob_bnd_3(lob_bnd_data,4,16, lob_bnd_data,4,16, lob_bnd_data,4,16,
+                   jac, work);
+    /*printf("Jacobian range %g, %g\n", jr.min, jr.max);*/
+  } while(jr.max*jr.min<=0);
+  for(i=0;i< 3;++i) x0[i] = -1+2*(rand()/(double)RAND_MAX);
+  for(k=0;k<nt;++k) {     r[2] = zt[k];
+    for(j=0;j<ns;++j) {   r[1] = zs[j];
+      for(i=0;i<nr;++i) { r[0] = zr[i];
+        x[(k*ns+j)*nr+i] = quad_3(x0[0],g  ,H   ,r);
+        y[(k*ns+j)*nr+i] = quad_3(x0[1],g+3,H+ 6,r);
+        z[(k*ns+j)*nr+i] = quad_3(x0[2],g+6,H+12,r);
+      }
+    }
+  }
+}
+
+#define PI 3.1415926535897932384626433832795028841971693993751058209749445923
+
+void bubble_elt(double *x, double *y, double *z,
+                const double *zr, unsigned nr,
+                const double *zs, unsigned ns,
+                const double *zt, unsigned nt, int type)
+{
+  unsigned i,j,k;
+  for(k=0;k<nt;++k) for(j=0;j<ns;++j) for(i=0;i<nr;++i) {
+    double dx=0,dy=0,dz=0;
+    switch(type) {
+      case 0: dx =  cos(PI*zs[j]/2)*cos(PI*zt[k]/2); break;
+      case 1: dx = -cos(PI*zs[j]/2)*cos(PI*zt[k]/2); break;
+      case 2: dy =  cos(PI*zt[k]/2)*cos(PI*zr[i]/2); break;
+      case 3: dy = -cos(PI*zt[k]/2)*cos(PI*zr[i]/2); break;
+      case 4: dz =  cos(PI*zr[i]/2)*cos(PI*zs[j]/2); break;
+      case 5: dz = -cos(PI*zr[i]/2)*cos(PI*zs[j]/2); break;
+    }
+    x[(k*ns+j)*nr+i] = zr[i] + dx;
+    y[(k*ns+j)*nr+i] = zs[j] + dy;
+    z[(k*ns+j)*nr+i] = zt[k] + dz;
+  }
+}
diff --git a/src/jl/rand_elt_test.h b/src/jl/rand_elt_test.h
new file mode 100644
index 0000000..40c5325
--- /dev/null
+++ b/src/jl/rand_elt_test.h
@@ -0,0 +1,18 @@
+#ifndef RAND_ELT_TEST_H
+#define RAND_ELT_TEST_H
+
+void rand_elt_2(double *x, double *y,
+                const double *zr, unsigned nr,
+                const double *zs, unsigned ns);
+
+void rand_elt_3(double *x, double *y, double *z,
+                const double *zr, unsigned nr,
+                const double *zs, unsigned ns,
+                const double *zt, unsigned nt);
+
+void bubble_elt(double *x, double *y, double *z,
+                const double *zr, unsigned nr,
+                const double *zs, unsigned ns,
+                const double *zt, unsigned nt, int type);
+
+#endif
diff --git a/src/jl/rdtsc.h b/src/jl/rdtsc.h
new file mode 100644
index 0000000..b39663e
--- /dev/null
+++ b/src/jl/rdtsc.h
@@ -0,0 +1,12 @@
+#ifndef RDTSC_H
+#define RDTSC_H
+
+#define DEFINE_HW_COUNTER() \
+static __inline__ unsigned long long getticks(void) \
+{ \
+   volatile unsigned low, high; \
+   __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high)); \
+   return ((unsigned long long)high)<<32 | low; \
+}
+
+#endif
diff --git a/src/jl/sarray_sort.c b/src/jl/sarray_sort.c
new file mode 100644
index 0000000..0ec26d1
--- /dev/null
+++ b/src/jl/sarray_sort.c
@@ -0,0 +1,45 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include "c99.h"
+#include "name.h"
+#include "types.h"
+#include "fail.h"
+#include "mem.h"
+#include "sort.h"
+
+#define sarray_permute_     PREFIXED_NAME(sarray_permute_)
+#define sarray_permute_buf_ PREFIXED_NAME(sarray_permute_buf_)
+
+void sarray_permute_(size_t size, void *A, size_t n, uint *perm, void *work)
+{
+  char *const ar = A, *const item = work;
+  sint *const fperm = (sint*)perm;
+  uint i;
+  for(i=0;i<n;++i) {
+    sint pi = fperm[i];
+    if(pi<0) { fperm[i] = -pi-1; continue; }
+    else if((uint)pi==i) continue;
+    else {
+      char *dst = ar+i*size, *src = ar+pi*size;
+      memcpy(item, dst, size);
+      for(;;) {
+        sint ppi;
+        memcpy(dst, src, size);
+        dst=src;
+        ppi=fperm[pi], fperm[pi]=-ppi-1, pi=ppi;
+        if((uint)pi==i) break;
+        src=ar+pi*size;
+      }
+      memcpy(dst, item, size);
+    }
+  }
+}
+
+void sarray_permute_buf_(size_t align, size_t size, void *A, size_t n,
+                         buffer *buf)
+{
+  buffer_reserve(buf,align_as_(align,n*sizeof(uint)+size));
+  sarray_permute_(size,A,n, buf->ptr,
+                 (char*)buf->ptr + align_as_(align,n*sizeof(uint)));
+}
diff --git a/src/jl/sarray_sort.h b/src/jl/sarray_sort.h
new file mode 100644
index 0000000..77dc653
--- /dev/null
+++ b/src/jl/sarray_sort.h
@@ -0,0 +1,89 @@
+#ifndef SARRAY_SORT_H
+#define SARRAY_SORT_H
+
+#if !defined(SORT_H)
+#warning "sarray_sort.h" requires "sort.h"
+#endif
+
+/*------------------------------------------------------------------------------
+  
+  Array of Structs Sort
+  
+  buffer *buf;
+  typedef struct { ... } T;
+  T A[n];
+
+  sarray_sort(T,A,n, field_name,is_long, buf)
+    - sort A according to the struct field "field_name",
+      which is a ulong/uint field according as is_long is true/false
+
+  sarray_sort_2(T,A,n, field1,is_long1, field2,is_long2, buf)
+    - sort A by field1 then field2
+
+  sarray_permute(T,A,n, perm, work)
+    - permute A  (in-place)
+      A[0] <- A[perm[0]], etc.
+      work needs to hold sizeof(T) bytes  (i.e., 1 T)
+
+  sarray_permute_buf(T,A,n, buf);
+    - permute A according to the permutation in buf
+      A[0] <- A[perm[0]], etc.
+      where uint *perm = buf->ptr   (see "sort.h")
+
+  ----------------------------------------------------------------------------*/
+
+
+#define sarray_permute_     PREFIXED_NAME(sarray_permute_)
+#define sarray_permute_buf_ PREFIXED_NAME(sarray_permute_buf_)
+
+void sarray_permute_(size_t size, void *A, size_t n, uint *perm, void *work);
+void sarray_permute_buf_(
+  size_t align, size_t size, void *A, size_t n, buffer *buf);
+
+#define sarray_permute(T,A,n, perm, work) \
+  sarray_permute_(sizeof(T),A,n, perm, work)
+#define sarray_permute_buf(T,A,n, buf) \
+  sarray_permute_buf_(ALIGNOF(T),sizeof(T),A,n,buf)
+
+#define sarray_sort_field(T,A,n, field,is_long, buf,keep) do { \
+  if(is_long) \
+    sortp_long(buf,keep, (ulong*)((char*)(A)+offsetof(T,field)),n,sizeof(T)); \
+  else \
+    sortp     (buf,keep, (uint *)((char*)(A)+offsetof(T,field)),n,sizeof(T)); \
+} while (0)
+
+#define sarray_sort(T,A,n, field,is_long, buf) do { \
+  sarray_sort_field(T,A,n, field,is_long, buf,0); \
+  sarray_permute_buf(T,A,n, buf); \
+} while (0)
+
+#define sarray_sort_2(T,A,n, field1,is_long1, field2,is_long2, buf) do { \
+  sarray_sort_field(T,A,n, field2,is_long2, buf,0); \
+  sarray_sort_field(T,A,n, field1,is_long1, buf,1); \
+  sarray_permute_buf(T,A,n, buf); \
+} while (0)
+
+#define sarray_sort_3(T,A,n, field1,is_long1, field2,is_long2, \
+                             field3,is_long3, buf) do { \
+  sarray_sort_field(T,A,n, field3,is_long3, buf,0); \
+  sarray_sort_field(T,A,n, field2,is_long2, buf,1); \
+  sarray_sort_field(T,A,n, field1,is_long1, buf,1); \
+  sarray_permute_buf(T,A,n, buf); \
+} while (0)
+
+#define sarray_sort_4(T,A,n, field1,is_long1, field2,is_long2, \
+                             field3,is_long3, field4,is_long4, buf) do { \
+  sarray_sort_field(T,A,n, field4,is_long4, buf,0); \
+  sarray_sort_field(T,A,n, field3,is_long3, buf,1); \
+  sarray_sort_field(T,A,n, field2,is_long2, buf,1); \
+  sarray_sort_field(T,A,n, field1,is_long1, buf,1); \
+  sarray_permute_buf(T,A,n, buf); \
+} while (0)
+
+static void sarray_perm_invert(
+  uint *const pinv, const uint *const perm, const uint n)
+{
+  uint i; for(i=0;i<n;++i) pinv[perm[i]] = i;
+}
+
+#endif
diff --git a/src/jl/sarray_sort_test.c b/src/jl/sarray_sort_test.c
new file mode 100644
index 0000000..15fa780
--- /dev/null
+++ b/src/jl/sarray_sort_test.c
@@ -0,0 +1,47 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include "c99.h"
+#include "name.h"
+#include "fail.h"
+#include "types.h"
+#include "mem.h"
+#include "sort.h"
+#include "sarray_sort.h"
+
+int main()
+{
+  struct rec { double d; slong l; sint i; float f; };
+  buffer buf = {0,0,0};
+  struct rec rec[500];
+  uint i;
+  
+  for(i=0;i<500;++i) {
+    sint num1 = rand() & 0xff;
+    slong num2 = rand();
+    num2<<=(CHAR_BIT)*sizeof(int)-1;
+    num2|=rand();
+    num2<<=(CHAR_BIT)*sizeof(int)-1;
+    num2|=rand();
+    num2= num2<0?-num2:num2;
+    rec[i].d = num2;
+    rec[i].f = num2;
+    rec[i].l = num2;
+    rec[i].i = num1;
+  }
+  sarray_sort_2(struct rec,rec,500, i,0, l,1, &buf);
+  for(i=0;i<500;++i)
+    printf("%g\t%g\t%ld\t%d\n",
+      rec[i].d,rec[i].f,(long)rec[i].l,(int)rec[i].i);
+
+  printf("\n");
+  sarray_sort(struct rec,rec,500, l,1, &buf);
+  for(i=0;i<500;++i)
+    printf("%g\t%g\t%ld\t%d\n",
+      rec[i].d,rec[i].f,(long)rec[i].l,(int)rec[i].i);
+  buffer_free(&buf);
+  return 0;
+}
+
diff --git a/src/jl/sarray_transfer.c b/src/jl/sarray_transfer.c
new file mode 100644
index 0000000..9eed6ba
--- /dev/null
+++ b/src/jl/sarray_transfer.c
@@ -0,0 +1,197 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include "c99.h"
+#include "name.h"
+#include "fail.h"
+#include "types.h"
+#include "comm.h"
+#include "mem.h"
+#include "crystal.h"
+#include "sort.h"
+
+#define sarray_transfer_many PREFIXED_NAME(sarray_transfer_many)
+#define sarray_transfer_     PREFIXED_NAME(sarray_transfer_    )
+#define sarray_transfer_ext_ PREFIXED_NAME(sarray_transfer_ext_)
+
+static void pack_int(
+  buffer *const data, const unsigned row_size, const uint id,
+  const char *const restrict input, const uint n, const unsigned size,
+  const unsigned p_off, const uint *const restrict perm)
+{
+  const unsigned after = p_off + sizeof(uint), after_len = size-after;
+
+#define GET_P() memcpy(&p,row+p_off,sizeof(uint))
+#define COPY_ROW() memcpy(out,row,p_off), \
+                   memcpy((char*)out + p_off,row+after,after_len)
+
+#define PACK_BODY() do {                                                  \
+  uint dummy, *len_ptr=&dummy;                                            \
+  uint i, p,lp = -(uint)1, len=0;                                         \
+  uint *restrict out = buffer_reserve(data, n*(row_size+3)*sizeof(uint)); \
+  for(i=0;i<n;++i) {                                                      \
+    const char *row = input + size*perm[i];                               \
+    GET_P();                                                              \
+    if(p!=lp) {                                                           \
+      lp = p;                                                             \
+      *len_ptr = len;       /* previous message length */                 \
+      *out++ = p;           /* target */                                  \
+      *out++ = id;          /* source */                                  \
+      len_ptr=out++; len=0; /* length (t.b.d.) */                         \
+    }                                                                     \
+    COPY_ROW();                                                           \
+    out += row_size, len += row_size;                                     \
+  }                                                                       \
+  *len_ptr = len; /* last message length */                               \
+  data->n = out - (uint*)data->ptr;                                       \
+} while(0)
+  PACK_BODY();
+#undef COPY_ROW
+#undef GET_P
+}
+
+static void pack_ext(
+  buffer *const data, const unsigned row_size, const uint id,
+  const char *const restrict input, const uint n, const unsigned size,
+  const uint *const restrict proc, const unsigned proc_stride,
+  const uint *const restrict perm)
+{
+  #define GET_P() p=*(const uint*)((const char*)proc+proc_stride*perm[i])
+  #define COPY_ROW() memcpy(out,row,size)
+  PACK_BODY();
+  #undef PACK_BODY
+  #undef COPY_ROW
+  #undef GET_P
+}
+
+static void pack_more(
+  buffer *const data, const unsigned off, const unsigned row_size,
+  const char *const restrict input, const unsigned size,
+  const uint *restrict perm)
+{
+  uint *restrict buf = data->ptr, *buf_end = buf+data->n;
+  while(buf!=buf_end) {
+    uint *msg_end = buf+3+buf[2]; buf+=3;
+    while(buf!=msg_end)
+      memcpy((char*)buf+off, input+size*(*perm++), size), buf+=row_size;
+  }
+}
+
+static void unpack_more(
+  char *restrict out, const unsigned size,
+  const buffer *const data, const unsigned off, const unsigned row_size)
+{
+  const uint *restrict buf = data->ptr, *buf_end = buf+data->n;
+  while(buf!=buf_end) {
+    const uint *msg_end = buf+3+buf[2]; buf+=3;
+    while(buf!=msg_end)
+      memcpy(out, (char*)buf+off, size), out+=size, buf+=row_size;
+  }
+}
+
+static void unpack_int(
+  char *restrict out, const unsigned size, const unsigned p_off,
+  const buffer *const data, const unsigned row_size, int set_src)
+{
+  const unsigned after = p_off + sizeof(uint), after_len = size-after;
+  const uint *restrict buf = data->ptr, *buf_end = buf+data->n;
+  const unsigned pi = set_src ? 1:0;
+  while(buf!=buf_end) {
+    const uint p=buf[pi], *msg_end = buf+3+buf[2]; buf+=3;
+    while(buf!=msg_end) {
+      memcpy(out,buf,p_off);
+      memcpy(out+p_off,&p,sizeof(uint));
+      memcpy(out+after,(const char *)buf+p_off,after_len);
+      out+=size, buf+=row_size;
+    }
+  }
+}
+
+static uint num_rows(const buffer *const data, const unsigned row_size)
+{
+  const uint *buf = data->ptr, *buf_end = buf + data->n;
+  uint n=0;
+  while(buf!=buf_end) { uint len=buf[2]; n+=len, buf+=len+3; }
+  return n/row_size;
+}
+
+static uint cap_rows(buffer *const data, const unsigned row_size,const uint max)
+{
+  uint *buf = data->ptr, *buf_end = buf + data->n;
+  const uint maxn = max*row_size;
+  uint n=0;
+  while(buf!=buf_end) {
+    uint len=buf[2]; n+=len;
+    if(n<maxn) buf+=len+3;
+    else {
+      buf[2]-=(maxn-n); data->n = (buf-(uint*)data->ptr)+3+buf[2];
+      buf+=len+3;
+      while(buf!=buf_end) { uint len=buf[2]; n+=len, buf+=len+3; }
+      break;
+    }
+  }
+  return n/row_size;
+}
+
+/* An must be >= 1 */
+uint sarray_transfer_many(
+  struct array *const *const A, const unsigned *const size, const unsigned An,
+  const int fixed, const int ext, const int set_src, const unsigned p_off,
+  const uint *const restrict proc, const unsigned proc_stride,
+  struct crystal *const cr)
+{
+  uint n, *perm;
+  unsigned i,row_size,off,off1;
+
+  off1 = size[0];
+  if(!ext) off1 -= sizeof(uint);
+  row_size=off1; for(i=1;i<An;++i) row_size += size[i];
+  row_size = (row_size+sizeof(uint)-1)/sizeof(uint);
+  
+  perm = sortp(&cr->work,0, proc,A[0]->n,proc_stride);
+
+  if(!ext) pack_int(&cr->data, row_size, cr->comm.id, A[0]->ptr,A[0]->n,size[0],
+                    p_off, perm);
+  else     pack_ext(&cr->data, row_size, cr->comm.id, A[0]->ptr,A[0]->n,size[0],
+                    proc,proc_stride, perm);
+  for(off=off1,i=1;i<An;++i) if(size[i])
+    pack_more(&cr->data,off,row_size, A[i]->ptr,size[i], perm),off+=size[i];
+    
+  crystal_router(cr);
+  
+  if(!fixed) {
+    n = num_rows(&cr->data,row_size);
+    for(i=0;i<An;++i)
+      array_reserve_(A[i],n,size[i],__FILE__,__LINE__), A[i]->n=n;
+  } else {
+    uint max=A[0]->max, an;
+    for(i=1;i<An;++i) if(A[i]->max<max) max=A[i]->max;
+    n = cap_rows(&cr->data,row_size, max);
+    an = n>max?max:n;
+    for(i=0;i<An;++i) A[i]->n=an;
+  }
+  
+  if(!ext) unpack_int (A[0]->ptr,size[0],p_off, &cr->data,  row_size, set_src);
+  else     unpack_more(A[0]->ptr,size[0],       &cr->data,0,row_size);
+  for(off=off1,i=1;i<An;++i) if(size[i])
+    unpack_more(A[i]->ptr,size[i], &cr->data,off,row_size),off+=size[i];
+    
+  return n;
+}
+  
+
+void sarray_transfer_(struct array *const A, const unsigned size,
+                      const unsigned p_off, const int set_src,
+                      struct crystal *const cr)
+{
+  sarray_transfer_many(&A,&size,1, 0,0,set_src,p_off,
+                       (uint*)((char*)A->ptr+p_off),size, cr);
+}
+
+void sarray_transfer_ext_(struct array *const A, const unsigned size,
+                          const uint *const proc, const unsigned proc_stride,
+                          struct crystal *const cr)
+{
+  sarray_transfer_many(&A,&size,1, 0,1,0,0, proc,proc_stride, cr);
+}
+
diff --git a/src/jl/sarray_transfer.h b/src/jl/sarray_transfer.h
new file mode 100644
index 0000000..c195e21
--- /dev/null
+++ b/src/jl/sarray_transfer.h
@@ -0,0 +1,95 @@
+#ifndef SARRAY_TRANSFER_H
+#define SARRAY_TRANSFER_H
+
+#if !defined(CRYSTAL_H)
+#warning "sarray_transfer.h" requires "crystal.h"
+#endif
+
+/*
+  High-level interface for the crystal router.
+  Given an array of structs, transfers each to the process indicated
+  by a field of the struct, which gets set to the source process on output.
+  
+  For the dynamic "array" type, see "mem.h".
+  
+  Requires a "crystal router" object:
+  
+    struct comm c;
+    struct crystal cr;
+    
+    comm_init(&c, MPI_COMM_WORLD);
+    crystal_init(&cr, &c);
+    
+  Example sarray_transfer usage:
+  
+    struct T { ...; uint proc; ...; };
+    struct array A = null_array;
+    struct T *p, *e;
+    
+    // resize A to 100 struct T's, fill up with data
+    p = array_reserve(struct T, &A, 100), A.n=100;
+    for(e=p+A.n;p!=e;++p) {
+      ...
+      p->proc = ...;
+      ...
+    }
+    
+    // array A represents the array
+    //   struct T ar[A.n]    where &ar[0] == A.ptr
+    // transfer ar[i] to processor ar[i].proc  for each i=0,...,A.n-1:
+    
+    sarray_transfer(struct T, A, proc,set_src, &cr);
+    
+    // now array A represents a different array with a different size
+    //   struct T ar[A.n]    where &ar[0] == A.ptr
+    // the ordering is arbitrary
+    // if set_src != 0, ar[i].proc is set to the proc where ar[i] came from
+    // otherwise ar[i].proc is unchanged (and == this proc id)
+    
+    // note: two calls of
+    sarray_transfer(struct T, A, proc,1, &cr);
+    // in a row should return A to its original state, up to ordering
+ 
+  Cleanup:
+    array_free(&A);
+    crystal_free(&cr);
+    comm_free(&c);
+
+  Example sarray_transfer_ext usage:
+  
+    struct T { ... };
+    struct array A;
+    uint proc[A.n];
+    
+    // array A represents the array
+    //   struct T ar[A.n]    where &ar[0] == A.ptr
+    // transfer ar[i] to processor proc[i]  for each i=0,...,A.n-1:
+    sarray_transfer_ext(struct T, &A, proc, &cr);
+    
+    // no information is available now on where each struct came from
+
+*/
+
+#define sarray_transfer_many PREFIXED_NAME(sarray_transfer_many)
+#define sarray_transfer_     PREFIXED_NAME(sarray_transfer_    )
+#define sarray_transfer_ext_ PREFIXED_NAME(sarray_transfer_ext_)
+
+uint sarray_transfer_many(
+  struct array *const *const A, const unsigned *const size, const unsigned An,
+  const int fixed, const int ext, const int set_src, const unsigned p_off,
+  const uint *const restrict proc, const unsigned proc_stride,
+  struct crystal *const cr);
+void sarray_transfer_(struct array *const A, const unsigned size,
+                      const unsigned p_off, const int set_src,
+                      struct crystal *const cr);
+void sarray_transfer_ext_(struct array *const A, const unsigned size,
+                          const uint *const proc, const unsigned proc_stride,
+                          struct crystal *const cr);
+
+#define sarray_transfer(T,A,proc_field,set_src,cr) \
+  sarray_transfer_(A,sizeof(T),offsetof(T,proc_field),set_src,cr)
+
+#define sarray_transfer_ext(T,A,proc,proc_stride,cr) \
+  sarray_transfer_ext_(A,sizeof(T),proc,proc_stride,cr)
+
+#endif
diff --git a/src/jl/sarray_transfer_test.c b/src/jl/sarray_transfer_test.c
new file mode 100644
index 0000000..aaf3b7f
--- /dev/null
+++ b/src/jl/sarray_transfer_test.c
@@ -0,0 +1,93 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "c99.h"
+#include "name.h"
+#include "fail.h"
+#include "types.h"
+#include "comm.h"
+#include "mem.h"
+#include "sort.h"
+#include "sarray_sort.h"
+#include "crystal.h"
+#include "sarray_transfer.h"
+
+typedef struct {
+  double d;
+  ulong l,l2;
+  uint i;
+  uint p;
+} r_work;
+
+int main(int narg, char *arg[])
+{
+  comm_ext world; int np;
+  struct comm comm;
+  struct crystal crystal;
+  struct array A, A0=null_array; r_work *row, *row_0;
+  uint i;
+#ifdef MPI
+  MPI_Init(&narg,&arg);
+  world = MPI_COMM_WORLD;
+  MPI_Comm_size(world,&np);
+#else
+  world=0, np=1;
+#endif
+
+  comm_init(&comm,world);
+  crystal_init(&crystal,&comm);
+
+  array_init(r_work,&A,np*3), A.n=np*3, row=A.ptr;
+  for(i=0;i<A.n;++i) {
+    row[i].i = rand();
+    row[i].l = row[i].l2 = rand();
+    row[i].p = rand()%np;
+    row[i].d = rand()/(double)rand();
+  }
+  
+  sarray_sort_3(r_work,row,A.n, i,0, l,1, p,0, &crystal.data);
+  
+  for(i=0;i<A.n;++i)
+    printf("%02d send -> %02d: %08x %08x %d %g\n",
+      (int)comm.id,(int)row[i].p,(int)row[i].i,
+      (int)row[i].l,(int)row[i].p,row[i].d);
+  
+  array_cat(r_work,&A0, row,A.n);
+  
+  sarray_transfer(r_work,&A, p,1, &crystal);
+
+  row=A.ptr;
+  for(i=0;i<A.n;++i)
+    printf("%02d recv <- %02d: %08x %08x %d %g\n",
+      (int)comm.id,(int)row[i].p,(int)row[i].i,
+      (int)row[i].l,(int)row[i].p,row[i].d);
+
+  sarray_transfer(r_work,&A, p,1, &crystal);
+  sarray_sort_3(r_work,row,A.n, i,0, l,1, p,0, &crystal.data);
+  if(A.n!=A0.n)
+    fail(1,__FILE__,__LINE__,"final array has different length than original");
+  row=A.ptr, row_0=A0.ptr;
+  for(i=0;i<A.n;++i)
+    if(   row[i].d != row_0[i].d
+       || row[i].l != row_0[i].l
+       || row[i].l2!= row_0[i].l2
+       || row[i].i != row_0[i].i
+       || row[i].p != row_0[i].p)
+      fail(1,__FILE__,__LINE__,"final array differs from original");
+      
+  array_free(&A0);
+  array_free(&A);
+  crystal_free(&crystal);
+
+  fflush(stdout); comm_barrier(&comm);
+  if(comm.id==0) printf("tests passed\n"), fflush(stdout);
+  
+  comm_free(&comm);
+  
+#ifdef MPI
+  MPI_Finalize();
+#endif
+
+  return 0;
+}
diff --git a/src/jl/sort.c b/src/jl/sort.c
new file mode 100644
index 0000000..5b25f42
--- /dev/null
+++ b/src/jl/sort.c
@@ -0,0 +1,31 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "c99.h"
+#include "name.h"
+#include "fail.h"
+#include "types.h"
+#include "mem.h"
+
+#define T unsigned int
+#define SORT_SUFFIX _ui
+#include "sort_imp.h"
+#undef SORT_SUFFIX
+#undef T
+
+#if defined(USE_LONG) || defined(GLOBAL_LONG)
+#  define T unsigned long
+#  define SORT_SUFFIX _ul
+#  include "sort_imp.h"
+#  undef SORT_SUFFIX
+#  undef T
+#endif
+
+#if defined(USE_LONG_LONG) || defined(GLOBAL_LONG_LONG)
+#  define T unsigned long long
+#  define SORT_SUFFIX _ull
+#  include "sort_imp.h"
+#  undef SORT_SUFFIX
+#  undef T
+#endif
diff --git a/src/jl/sort.h b/src/jl/sort.h
new file mode 100644
index 0000000..eaeeb95
--- /dev/null
+++ b/src/jl/sort.h
@@ -0,0 +1,76 @@
+#ifndef SORT_H
+#define SORT_H
+
+#if !defined(TYPES_H) || !defined(MEM_H)
+#warning "sort.h" requires "types.h" and "mem.h"
+/* types.h defines uint, ulong
+   mem.h   defines buffer */
+#endif
+
+/*------------------------------------------------------------------------------
+  
+  Sort
+  
+  O(n) stable sort with good performance for all n
+
+  sortv     (uint  *out,  const uint  *A, uint n, uint stride,  buffer *buf)
+  sortv_long(ulong *out,  const ulong *A, uint n, uint stride,  buffer *buf)
+
+  sortp     (buffer *buf, int perm_start,  const uint  *A, uint n, uint stride)
+  sortp_long(buffer *buf, int perm_start,  const ulong *A, uint n, uint stride)
+
+  A, n, stride : specifices the input (stride is in bytes!)
+  out : the sorted values on output
+
+  For the value sort, (sortv*)
+    A and out may alias (A == out) exactly when stride == sizeof(T)
+
+  For the permutation sort, (sortp*)
+    The permutation can be both input (when start_perm!=0) and output,
+    following the convention that it is always at the start of the buffer buf:
+      uint *perm = buf->ptr;
+    The permutation denotes the ordering
+      A[perm[0]], A[perm[1]], ..., A[perm[n-1]]
+    (assuming stride == sizeof(uint) or sizeof(ulong) as appropriate)
+    and is re-arranged stably to give a sorted ordering.
+    Specifying start_perm==0 is equivalent to specifying
+      perm[i] = i,   i=0,...,n-1
+    for an initial permutation (but may be faster).
+    The buffer will be expanded as necessary to accomodate the permutation
+    and the required scratch space.
+  
+  Most code calls these routines indirectly via the higher-level routine
+    sarray_sort for sorting arrays of structs (see "sarray_sort.h").
+
+  ----------------------------------------------------------------------------*/
+
+#define sortv_ui  PREFIXED_NAME(sortv_ui)
+#define sortv_ul  PREFIXED_NAME(sortv_ul)
+#define sortv_ull PREFIXED_NAME(sortv_ull)
+#define sortp_ui  PREFIXED_NAME(sortp_ui)
+#define sortp_ul  PREFIXED_NAME(sortp_ul)
+#define sortp_ull PREFIXED_NAME(sortp_ull)
+
+#define sortv TYPE_LOCAL(sortv_ui,sortv_ul,sortv_ull)
+#define sortp TYPE_LOCAL(sortp_ui,sortp_ul,sortp_ull)
+#define sortv_long TYPE_GLOBAL(sortv_ui,sortv_ul,sortv_ull)
+#define sortp_long TYPE_GLOBAL(sortp_ui,sortp_ul,sortp_ull)
+
+void sortv_ui(unsigned *out, const unsigned *A, uint n, unsigned stride,
+              buffer *restrict buf);
+void sortv_ul(unsigned long *out,
+              const unsigned long *A, uint n, unsigned stride,
+              buffer *restrict buf);
+uint *sortp_ui(buffer *restrict buf, int start_perm,
+               const unsigned *restrict A, uint n, unsigned stride);
+uint *sortp_ul(buffer *restrict buf, int start_perm,
+               const unsigned long *restrict A, uint n, unsigned stride);
+#if defined(USE_LONG_LONG) || defined(GLOBAL_LONG_LONG)
+void sortv_ull(unsigned long long *out,
+               const unsigned long long *A, uint n, unsigned stride,
+               buffer *restrict buf);
+uint *sortp_ull(buffer *restrict buf, int start_perm,
+                const unsigned long long *restrict A, uint n, unsigned stride);
+#endif
+
+#endif
diff --git a/src/jl/sort_imp.h b/src/jl/sort_imp.h
new file mode 100644
index 0000000..08b05d1
--- /dev/null
+++ b/src/jl/sort_imp.h
@@ -0,0 +1,543 @@
+#if !defined(T) || !defined(SORT_SUFFIX)
+#error sort_imp.h not meant to be compiled by itself
+#endif
+
+#define sort_data       TOKEN_PASTE(sort_data      ,SORT_SUFFIX)
+#define radix_count     TOKEN_PASTE(radix_count    ,SORT_SUFFIX)
+#define radix_offsets   TOKEN_PASTE(radix_offsets  ,SORT_SUFFIX)
+#define radix_zeros     TOKEN_PASTE(radix_zeros    ,SORT_SUFFIX)
+#define radix_passv     TOKEN_PASTE(radix_passv    ,SORT_SUFFIX)
+#define radix_sortv     TOKEN_PASTE(radix_sortv    ,SORT_SUFFIX)
+#define radix_passp0_b  TOKEN_PASTE(radix_passp0_b ,SORT_SUFFIX)
+#define radix_passp_b   TOKEN_PASTE(radix_passp_b  ,SORT_SUFFIX)
+#define radix_passp_m   TOKEN_PASTE(radix_passp_m  ,SORT_SUFFIX)
+#define radix_passp_e   TOKEN_PASTE(radix_passp_e  ,SORT_SUFFIX)
+#define radix_passp0_be TOKEN_PASTE(radix_passp0_be,SORT_SUFFIX)
+#define radix_passp_be  TOKEN_PASTE(radix_passp_be, SORT_SUFFIX)
+#define radix_sortp     TOKEN_PASTE(radix_sortp    ,SORT_SUFFIX)
+#define merge_sortv     TOKEN_PASTE(merge_sortv    ,SORT_SUFFIX)
+#define merge_copy_perm TOKEN_PASTE(merge_copy_perm,SORT_SUFFIX)
+#define merge_sortp0    TOKEN_PASTE(merge_sortp0   ,SORT_SUFFIX)
+#define merge_sortp     TOKEN_PASTE(merge_sortp    ,SORT_SUFFIX)
+#define heap_sortv      TOKEN_PASTE(heap_sortv     ,SORT_SUFFIX)
+
+#define sortv PREFIXED_NAME(TOKEN_PASTE(sortv,SORT_SUFFIX))
+#define sortp PREFIXED_NAME(TOKEN_PASTE(sortp,SORT_SUFFIX))
+
+typedef struct { T v; uint i; } sort_data;
+
+#define INC_PTR(A,stride) ((A)=(T*)((char*)(A)+(stride)))
+#define INDEX_PTR(A,stride,i) (*(T*)((char*)(A)+(i)*(stride)))
+
+/*------------------------------------------------------------------------------
+  
+  Radix Sort
+  
+  stable; O(n+k) time and extra storage
+    where k = (digits in an int) * 2^(bits per digit)
+    (e.g. k = 4 * 256 = 1024 for 32-bit ints with 8-bit digits)
+
+  brief description:
+    input sorted stably on each digit, starting with the least significant
+    counting sort is used for each digit:
+      a pass through the input counts the occurences of each digit value
+      on a second pass, each input has a known destination
+  
+  tricks:
+    all counting passes are combined into one
+    the counting pass also computes the inclusive bit-wise or of all inputs,
+      which is used to skip digit positions for which all inputs have zeros
+
+  ----------------------------------------------------------------------------*/
+
+#define STATIC_DIGIT_BUCKETS 1
+
+#define DIGIT_BITS   8
+#define DIGIT_VALUES (1<<DIGIT_BITS)
+#define DIGIT_MASK   ((T)(DIGIT_VALUES-1))
+#define CEILDIV(a,b) (((a)+(b)-1)/(b))
+#define DIGITS       CEILDIV(CHAR_BIT*sizeof(T),DIGIT_BITS)
+#define VALUE_BITS   (DIGIT_BITS*DIGITS)
+#define COUNT_SIZE   (DIGITS*DIGIT_VALUES)
+
+/* used to unroll a tiny loop: */
+#define COUNT_DIGIT_01(n,i) \
+    if(n>i) count[i][val&DIGIT_MASK]++, val>>=DIGIT_BITS
+#define COUNT_DIGIT_02(n,i) COUNT_DIGIT_01(n,i); COUNT_DIGIT_01(n,i+ 1)
+#define COUNT_DIGIT_04(n,i) COUNT_DIGIT_02(n,i); COUNT_DIGIT_02(n,i+ 2)
+#define COUNT_DIGIT_08(n,i) COUNT_DIGIT_04(n,i); COUNT_DIGIT_04(n,i+ 4)
+#define COUNT_DIGIT_16(n,i) COUNT_DIGIT_08(n,i); COUNT_DIGIT_08(n,i+ 8)
+#define COUNT_DIGIT_32(n,i) COUNT_DIGIT_16(n,i); COUNT_DIGIT_16(n,i+16)
+#define COUNT_DIGIT_64(n,i) COUNT_DIGIT_32(n,i); COUNT_DIGIT_32(n,i+32)
+
+static T radix_count(
+  uint (*restrict count)[DIGIT_VALUES],
+  const T *restrict A, const T *const end, const unsigned stride)
+{
+  T bitorkey = 0;
+  memset(count,0,COUNT_SIZE*sizeof(uint));
+  do {
+    T val=*A;
+    bitorkey|=val;
+    COUNT_DIGIT_64(DIGITS,0);
+    /* above macro expands to:
+    if(DIGITS> 0) count[ 0][val&DIGIT_MASK]++, val>>=DIGIT_BITS;
+    if(DIGITS> 1) count[ 1][val&DIGIT_MASK]++, val>>=DIGIT_BITS;
+      ...
+    if(DIGITS>63) count[63][val&DIGIT_MASK]++, val>>=DIGIT_BITS;
+    */
+  } while(INC_PTR(A,stride),A!=end);
+  return bitorkey;
+}
+
+#undef COUNT_DIGIT_01
+#undef COUNT_DIGIT_02
+#undef COUNT_DIGIT_04
+#undef COUNT_DIGIT_08
+#undef COUNT_DIGIT_16
+#undef COUNT_DIGIT_32
+#undef COUNT_DIGIT_64
+
+static void radix_offsets(uint *restrict c)
+{
+  uint *const ce = c+DIGIT_VALUES;
+  uint sum = 0;
+  do {
+    const uint c0=c[0], c1=c[1], c2=c[2], c3=c[3]; 
+    const uint o1=sum+c0, o2=o1+c1, o3=o2+c2;
+    c[0]=sum, c[1]=o1, c[2]=o2, c[3]=o3;
+    sum = o3+c3;
+    c+=4;
+  } while(c!=ce);
+}
+
+static unsigned radix_zeros(
+  T bitorkey, uint (*restrict count)[DIGIT_VALUES],
+  unsigned *restrict shift, uint **restrict offsets)
+{
+  unsigned digits=0, sh=0; uint *c = &count[0][0];
+  do {
+    if(bitorkey&DIGIT_MASK) *shift++ = sh, *offsets++ = c, ++digits,
+                            radix_offsets(c);
+  } while(bitorkey>>=DIGIT_BITS,sh+=DIGIT_BITS,c+=DIGIT_VALUES,sh!=VALUE_BITS);
+  return digits;
+}
+
+static void radix_passv(
+  const T *restrict A, const T *const end, const unsigned stride,
+  const unsigned sh, uint *const restrict off, T *const restrict out)
+{
+  do out[off[(*A>>sh)&DIGIT_MASK]++] = *A; while(INC_PTR(A,stride),A!=end);
+}
+
+static void radix_sortv(
+  T *out, const T *A, const uint n, const unsigned stride,
+  T *work, uint (*restrict count)[DIGIT_VALUES])
+{
+  const T *const end = &INDEX_PTR(A,stride,n);
+  T bitorkey = radix_count(count, A,end,stride);
+  unsigned shift[DIGITS]; uint *offsets[DIGITS];
+  const unsigned digits = radix_zeros(bitorkey,count,shift,offsets);
+  if(digits==0) {
+    memset(out,0,n*sizeof(T));
+  } else {
+    T *src, *dst; unsigned d;
+    if(out==A || (digits&1)==0) dst=out,src=work;
+                           else src=out,dst=work;
+    radix_passv(A,end,stride,shift[0],offsets[0],src);
+    for(d=1;d!=digits;++d) {
+      T *t;
+      radix_passv(src,src+n,sizeof(T),shift[d],offsets[d],dst);
+      t=src,src=dst,dst=t;
+    }
+    if(src!=out) memcpy(out,src,n*sizeof(T));
+  }
+}
+
+static void radix_passp0_b(
+  const T *restrict A, const uint n, const unsigned stride,
+  const unsigned sh, uint *const restrict off,
+  sort_data *const restrict out)
+{
+  uint i=0;
+  do {
+    T v = *A;
+    sort_data *d = &out[off[(v>>sh)&DIGIT_MASK]++];
+    d->v=v, d->i=i++;
+  } while(INC_PTR(A,stride),i!=n);
+}
+
+static void radix_passp_b(
+  const uint *restrict p,
+  const T *const restrict A, const uint n, const unsigned stride,
+  const unsigned sh, uint *const restrict off,
+  sort_data *const out)
+{
+  const uint *const pe = p+n;
+  do {
+    uint j = *p++;
+    T v = INDEX_PTR(A,stride,j);
+    sort_data *d = &out[off[(v>>sh)&DIGIT_MASK]++];
+    d->v=v, d->i=j;
+  } while(p!=pe);
+}
+
+static void radix_passp_m(
+  const sort_data *restrict src, const sort_data *const end,
+  const unsigned sh, uint *const restrict off,
+  sort_data *const restrict out)
+{
+  do {
+    sort_data *d = &out[off[(src->v>>sh)&DIGIT_MASK]++];
+    d->v=src->v,d->i=src->i;
+  } while(++src!=end);
+}
+
+static void radix_passp_e(
+  const sort_data *restrict src, const sort_data *const end,
+  const unsigned sh, uint *const restrict off,
+  uint *const restrict out)
+{
+  do out[off[(src->v>>sh)&DIGIT_MASK]++]=src->i; while(++src!=end);
+}
+
+static void radix_passp0_be(
+  uint *const restrict out,
+  const T *restrict A, const uint n, const unsigned stride,
+  const unsigned sh, uint *const restrict off)
+{
+  uint i=0;
+  do out[off[(*A>>sh)&DIGIT_MASK]++]=i++; while(INC_PTR(A,stride),i!=n);
+}
+
+static void radix_passp_be(
+  uint *restrict p,
+  const T *restrict A, const uint n, const unsigned stride,
+  const unsigned sh, uint *const restrict off,
+  sort_data *restrict work)
+{
+  uint *q = p, *const qe = p+n;
+  uint *w = &work[0].i;
+  do {
+    uint j = *q++;
+    T v = INDEX_PTR(A,stride,j);
+    w[off[(v>>sh)&DIGIT_MASK]++]=j;
+  } while(q!=qe);
+  memcpy(p,w,n*sizeof(uint));
+}
+
+static void radix_sortp(
+  uint *restrict idx, uint perm_start,
+  const T *restrict A, const uint n, const unsigned stride,
+  sort_data *restrict work,
+  uint (*restrict count)[DIGIT_VALUES])
+{
+  T bitorkey = radix_count(count, A,&INDEX_PTR(A,stride,n),stride);
+  unsigned shift[DIGITS]; uint *offsets[DIGITS];
+  unsigned digits = radix_zeros(bitorkey,count,shift,offsets);
+  if(digits==0) {
+    if(!perm_start) { uint i=0; do *idx++=i++; while(i!=n); }
+  } else if(digits==1) {
+    if(perm_start) radix_passp_be (idx,A,n,stride,shift[0],offsets[0],work);
+              else radix_passp0_be(idx,A,n,stride,shift[0],offsets[0]);
+  } else {
+    sort_data *src, *dst; unsigned d;
+    if((digits&1)==0) dst=work,src=dst+n;
+                 else src=work,dst=src+n;
+    if(perm_start) radix_passp_b (idx,A,n,stride,shift[0],offsets[0],src);
+              else radix_passp0_b(    A,n,stride,shift[0],offsets[0],src);
+    for(d=1;d!=digits-1;++d) {
+      sort_data *t;
+      radix_passp_m(src,src+n,shift[d],offsets[d],dst);
+      t=src,src=dst,dst=t;
+    }
+    radix_passp_e(src,src+n,shift[d],offsets[d],idx);
+  }
+}
+
+/*------------------------------------------------------------------------------
+  
+  Merge Sort
+  
+  stable; O(n log n) time
+
+  ----------------------------------------------------------------------------*/
+
+#define MERGE_2(p,v)                           \
+  if(VAL(v[1])<VAL(v[0])) p[0]=v[1],p[1]=v[0]; \
+                     else p[0]=v[0],p[1]=v[1]
+#define MERGE_3(p,v) do                                              \
+  if(VAL(v[1])<VAL(v[0])) {                                          \
+    if(VAL(v[2])<VAL(v[1]))        p[0]=v[2],p[1]=v[1],p[2]=v[0];    \
+    else { if(VAL(v[2])<VAL(v[0])) p[0]=v[1],p[1]=v[2],p[2]=v[0];    \
+                              else p[0]=v[1],p[1]=v[0],p[2]=v[2]; }  \
+  } else {                                                           \
+     if(VAL(v[2])<VAL(v[0]))        p[0]=v[2],p[1]=v[0],p[2]=v[1];   \
+     else { if(VAL(v[2])<VAL(v[1])) p[0]=v[0],p[1]=v[2],p[2]=v[1];   \
+                               else p[0]=v[0],p[1]=v[1],p[2]=v[2]; } \
+  } while(0)
+#define MERGE_SORT() \
+  do {                                                                 \
+    uint i=0, n=An, base=-n, odd=0, c=0, b=1;                          \
+    for(;;) {                                                          \
+      DATA *restrict p;                                                \
+      if((c&1)==0) {                                                   \
+        base+=n, n+=(odd&1), c|=1, b^=1;                               \
+        while(n>3) odd<<=1,odd|=(n&1),n>>=1,c<<=1,b^=1;                \
+      } else                                                           \
+        base-=n-(odd&1),n<<=1,n-=(odd&1),odd>>=1,c>>=1;                \
+      if(c==0) break;                                                  \
+      p = buf[b]+base;                                                 \
+      if(n==2) {                                                       \
+        DATA v[2]; SETVAL(v[0],i), SETVAL(v[1],i+1);                   \
+        MERGE_2(p,v);                                                  \
+        i+=2;                                                          \
+      } else if(n==3) {                                                \
+        DATA v[3]; SETVAL(v[0],i), SETVAL(v[1],i+1), SETVAL(v[2],i+2); \
+        MERGE_3(p,v);                                                  \
+        i+=3;                                                          \
+      } else {                                                         \
+        const uint na = n>>1, nb = (n+1)>>1;                           \
+        const DATA *restrict ap = buf[b^1]+base, *const ae = ap+na;    \
+        DATA *restrict bp = p+na, *const be = bp+nb;                   \
+        for(;;) {                                                      \
+          if(VAL((*bp))<VAL((*ap))) {                                  \
+            *p++=*bp++;                                                \
+            if(bp!=be) continue;                                       \
+            do *p++=*ap++; while(ap!=ae);                              \
+            break;                                                     \
+          } else {                                                     \
+            *p++=*ap++;                                                \
+            if(ap==ae) break;                                          \
+          }                                                            \
+        }                                                              \
+      }                                                                \
+    }                                                                  \
+  } while(0)
+
+static void merge_sortv(
+  T *restrict out,
+  const T *restrict A, const uint An, const unsigned stride,
+  T *restrict work)
+{
+  T *buf[2]; buf[0]=out, buf[1]=work;
+#define DATA T
+#define VAL(x) x
+#define SETVAL(x,ai) x=*A,INC_PTR(A,stride)
+  MERGE_SORT();
+#undef SETVAL
+#undef VAL
+#undef DATA
+}
+
+static void merge_copy_perm(
+  uint *restrict idx, const sort_data *restrict p, uint n)
+{
+  /*const sort_data *pe = p+n;
+  do *idx++ = (p++)->i; while(p!=pe);*/
+  uint n_by_8 = (n+7)/8;
+  switch(n%8) {
+    case 0: do { *idx++ = (p++)->i;
+    case 7:      *idx++ = (p++)->i;
+    case 6:      *idx++ = (p++)->i;
+    case 5:      *idx++ = (p++)->i;
+    case 4:      *idx++ = (p++)->i;
+    case 3:      *idx++ = (p++)->i;
+    case 2:      *idx++ = (p++)->i;
+    case 1:      *idx++ = (p++)->i;
+    } while (--n_by_8 > 0);
+  }
+}
+
+static void merge_sortp0(
+  uint *restrict idx,
+  const T *restrict A, const uint An, const unsigned stride,
+  sort_data *restrict work)
+{
+  sort_data *buf[2]; buf[0]=work+An,buf[1]=work;
+#define DATA sort_data
+#define VAL(x) x.v
+#define SETVAL(x,ai) x.v=*A,INC_PTR(A,stride),x.i=ai
+  MERGE_SORT();
+#undef SETVAL
+#undef VAL
+#undef DATA
+  merge_copy_perm(idx,buf[0],An);
+}
+
+static void merge_sortp(
+  uint *restrict idx,
+  const T *const restrict A, const uint An, const unsigned stride,
+  sort_data *restrict work)
+{
+  sort_data *buf[2]; buf[0]=work+An,buf[1]=work;
+#define DATA sort_data
+#define VAL(x) x.v
+#define SETVAL(x,ai) x.i=idx[ai],x.v=INDEX_PTR(A,stride,x.i)
+  MERGE_SORT();
+#undef SETVAL
+#undef VAL
+#undef DATA
+  merge_copy_perm(idx,buf[0],An);
+}
+
+#undef MERGE_SORT
+#undef MERGE_3
+#undef MERGE_2
+
+/*------------------------------------------------------------------------------
+  
+  Heap Sort
+  
+  in-place, stability unobservable; O(n log n) time
+
+  ----------------------------------------------------------------------------*/
+static void heap_sortv(T *const restrict A, unsigned n)
+{
+  unsigned i;
+  /* build heap */
+  for(i=1;i<n;++i) {
+    T item = A[i];
+    unsigned h=i, p = (h-1)>>1;
+    if(A[p] >= item) continue;
+    do A[h]=A[p], h=p, p=(p-1)>>1; while(h && A[p] < item);
+    A[h] = item;
+  }
+  /* extract */
+  for(i=n-1;i;--i) {
+    T item = A[i];
+    unsigned h = 0;
+    A[i] = A[0];
+    for(;;) {
+      unsigned ch = 1+(h<<1), r = ch+1;
+      if(r<i && A[ch] < A[r]) ch=r;
+      if(ch>=i || item >= A[ch]) break;
+      A[h]=A[ch], h=ch;
+    }
+    A[h] = item;
+  }
+}
+
+
+/*------------------------------------------------------------------------------
+  
+  Hybrid Stable Sort
+  
+  low-overhead merge sort when n is small,
+  otherwise asymptotically superior radix sort
+
+  result = O(n) sort with good performance for all n
+  
+  A, n, stride : specifices the input, stride in bytes
+  out : the sorted values on output
+
+  For the value sort,
+    A and out may alias (A == out) exactly when stride == sizeof(T),
+      in which case heap sort is used for small sizes
+
+  For the permutation sort,
+    the permutation can be both input (when start_perm!=0) and output,
+    following the convention that it is always at the start of the buffer buf;
+    the buffer will be expanded as necessary to accomodate the permutation
+    and the required scratch space
+
+  ----------------------------------------------------------------------------*/
+
+void sortv(T *out, const T *A, uint n, unsigned stride, buffer *restrict buf)
+{
+  if(n<DIGIT_VALUES) {
+    if(n<2) {
+      if(n==0) return;
+      *out = *A;
+    } else {
+      if(out==A) {
+        if(stride!=sizeof(T))
+          fail(1,__FILE__,__LINE__,"in-place sort with non-unit stride");
+        heap_sortv(out,n);
+      } else {
+        buffer_reserve(buf,n*sizeof(T));
+        merge_sortv(out, A,n,stride, (T*)buf->ptr);
+      }
+    }
+  } else if(STATIC_DIGIT_BUCKETS) {
+    static uint count[DIGITS][DIGIT_VALUES];
+    buffer_reserve(buf,n*sizeof(T));
+    radix_sortv(out, A,n,stride, (T*)buf->ptr,count);
+  } else {
+    T *restrict work;
+    uint (*restrict count)[DIGIT_VALUES];
+    const size_t count_off=align_as(uint,n*sizeof(T));
+    buffer_reserve(buf,count_off+sizeof(uint[DIGITS][DIGIT_VALUES]));
+    work = buf->ptr;
+    count = (uint(*)[DIGIT_VALUES])((char*)buf->ptr+count_off);
+    radix_sortv(out, A,n,stride, work,count);
+  }
+}
+
+uint *sortp(buffer *restrict buf, int start_perm,
+            const T *restrict A, uint n, unsigned stride)
+{
+  uint *restrict perm;
+  sort_data *restrict work;
+  size_t work_off=align_as(sort_data,n*sizeof(uint));
+  if(n<DIGIT_VALUES) {
+    buffer_reserve(buf,work_off+2*n*sizeof(sort_data));
+    perm = buf->ptr;
+    work = (sort_data*)((char*)buf->ptr+work_off);
+    if(n<2) {
+      if(n==1) *perm=0;
+    } else {
+      if(start_perm) merge_sortp (perm, A,n,stride, work);
+      else           merge_sortp0(perm, A,n,stride, work);
+    }
+  } else if(STATIC_DIGIT_BUCKETS){
+    static uint count[DIGITS][DIGIT_VALUES];
+    buffer_reserve(buf,work_off+2*n*sizeof(sort_data));
+    perm = buf->ptr;
+    work = (sort_data*)((char*)buf->ptr+work_off);
+    radix_sortp(perm,start_perm, A,n,stride, work,count);
+  } else {
+    uint (*restrict count)[DIGIT_VALUES];
+    const size_t count_off=align_as(uint,work_off+2*n*sizeof(sort_data));
+    buffer_reserve(buf,count_off+sizeof(uint[DIGITS][DIGIT_VALUES]));
+    perm = buf->ptr;
+    work = (sort_data*)((char*)buf->ptr+work_off);
+    count = (uint(*)[DIGIT_VALUES])((char*)buf->ptr+count_off);
+    radix_sortp(perm,start_perm, A,n,stride, work,count);
+  }  
+  return perm;
+}
+
+#undef STATIC_DIGIT_BUCKETS
+
+#undef DIGIT_BITS
+#undef DIGIT_VALUES
+#undef DIGIT_MASK
+#undef CEILDIV
+#undef DIGITS
+#undef VALUE_BITS
+#undef COUNT_SIZE
+
+#undef INDEX_PTR
+#undef INC_PTR
+
+#undef sortp
+#undef sortv
+
+#undef merge_sortp
+#undef merge_sortp0
+#undef merge_sortv
+#undef radix_sortp
+#undef radix_passp_be
+#undef radix_passp0_be
+#undef radix_passp_e
+#undef radix_passp_m
+#undef radix_passp_b
+#undef radix_passp0_b
+#undef radix_sortv
+#undef radix_passv
+#undef radix_zeros
+#undef radix_offsets
+#undef radix_count
+#undef sort_data
+
diff --git a/src/jl/sort_test.c b/src/jl/sort_test.c
new file mode 100644
index 0000000..acd0bb3
--- /dev/null
+++ b/src/jl/sort_test.c
@@ -0,0 +1,113 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <limits.h>
+#include <string.h>
+#include "c99.h"
+#include "name.h"
+#include "fail.h"
+#include "types.h"
+#include "mem.h"
+#include "sort.h"
+
+#define SMALL 22
+#define NUM   500
+#define SI 9
+
+ulong A[NUM][SI], Av[NUM];
+uint  B[NUM][SI], Bv[NUM];
+
+uint P[NUM], Q[NUM];
+
+int main()
+{
+  buffer buf = {0,0,0};
+  uint i;
+
+  /*buffer_init(&buf, sortp_long_worksize(NUM,0));*/
+
+#if 0
+  printf("\nsource:\n");
+#endif
+  for(i=0;i!=NUM;++i) {
+    A[i][0]=rand();
+    A[i][0]<<=CHAR_BIT*sizeof(int)-1;
+    A[i][0]^=rand();
+    A[i][0]<<=CHAR_BIT*sizeof(int)-1;
+    A[i][0]^=rand();
+    if(0) A[i][0]&=0x000ff00;
+    B[i][0]=A[i][0];
+#if 0    
+    printf("%016lx\t%016lx\n",(unsigned long)A[i][0],(unsigned long)B[i][0]);
+#endif
+  }
+#if 0
+  printf("\n");
+#endif
+  printf("merge sort:\n");
+  for(i=0;i!=SMALL;++i) Q[i]=SMALL-1-i;
+  sortv_long(Av,  &A[0][0],SMALL,sizeof(ulong[SI]), &buf);
+  sortp_long(&buf,0, &A[0][0],SMALL,sizeof(ulong[SI]));
+    memcpy(P,buf.ptr,SMALL*sizeof(uint));
+  memcpy(buf.ptr,Q,SMALL*sizeof(uint));
+  sortp_long(&buf,1, &A[0][0],SMALL,sizeof(ulong[SI]));
+    memcpy(Q,buf.ptr,SMALL*sizeof(uint));
+  for(i=0;i!=SMALL;++i)
+    printf("%u\t%u\t%016lx\t%d\t%d\n",(unsigned)P[i],(unsigned)Q[i],
+           (unsigned long)A[P[i]][0],
+           A[P[i]][0]==A[Q[i]][0],
+           Av[i]==A[P[i]][0]);
+  printf("\n");
+  printf("radix sort:\n");
+  for(i=0;i!=NUM;++i) Q[i]=NUM-1-i;
+  sortv_long(Av,  &A[0][0],NUM,sizeof(ulong[SI]), &buf);
+  sortp_long(&buf,0, &A[0][0],NUM,sizeof(ulong[SI]));
+    memcpy(P,buf.ptr,NUM*sizeof(uint));
+  memcpy(buf.ptr,Q,NUM*sizeof(uint));
+  sortp_long(&buf,1, &A[0][0],NUM,sizeof(ulong[SI]));
+    memcpy(Q,buf.ptr,NUM*sizeof(uint));
+  for(i=0;i!=NUM;++i)
+    printf("%u\t%u\t%016lx\t%d\t%d\n",(unsigned)P[i],(unsigned)Q[i],
+           (unsigned long)A[P[i]][0],
+           A[P[i]][0]==A[Q[i]][0],
+           Av[i]==A[P[i]][0]);
+
+  printf("\nsmall integers:\n");
+  printf("\n");
+
+  printf("heap sort:\n");
+  for(i=0;i!=SMALL;++i) Q[i]=SMALL-1-i;
+  sortv(Q,  Q,SMALL,sizeof(uint), &buf);
+  for(i=0;i!=SMALL;++i) printf("\t%u\n",(unsigned)Q[i]);
+
+  printf("merge sort:\n");
+  for(i=0;i!=SMALL;++i) Q[i]=SMALL-1-i;
+  sortv(Bv,  &B[0][0],SMALL,sizeof(uint[SI]), &buf);
+  sortp(&buf,0, &B[0][0],SMALL,sizeof(uint[SI]));
+    memcpy(P,buf.ptr,SMALL*sizeof(uint));
+  memcpy(buf.ptr,Q,SMALL*sizeof(uint));
+  sortp(&buf,1, &B[0][0],SMALL,sizeof(uint[SI]));
+    memcpy(Q,buf.ptr,SMALL*sizeof(uint));
+  for(i=0;i!=SMALL;++i)
+    printf("%u\t%u\t%016lx\t%d\t%d\n",(unsigned)P[i],(unsigned)Q[i],
+           (unsigned long)B[P[i]][0],
+           B[P[i]][0]==B[Q[i]][0],
+           B[P[i]][0]==Bv[i]);
+  printf("\n");
+  printf("radix sort:\n");
+  for(i=0;i!=NUM;++i) Q[i]=NUM-1-i;
+  sortv(Bv,  &B[0][0],NUM,sizeof(uint[SI]), &buf);
+  sortp(&buf,0, &B[0][0],NUM,sizeof(uint[SI]));
+    memcpy(P,buf.ptr,NUM*sizeof(uint));
+  memcpy(buf.ptr,Q,NUM*sizeof(uint));
+  sortp(&buf,1, &B[0][0],NUM,sizeof(uint[SI]));
+    memcpy(Q,buf.ptr,NUM*sizeof(uint));
+  for(i=0;i!=NUM;++i)
+    printf("%u\t%u\t%016lx\t%d\t%d\n",(unsigned)P[i],(unsigned)Q[i],
+           (unsigned long)B[P[i]][0],
+           B[P[i]][0]==B[Q[i]][0],
+           B[P[i]][0]==Bv[i]);
+  buffer_free(&buf);
+  return 0;
+}
+
diff --git a/src/jl/sort_test2.c b/src/jl/sort_test2.c
new file mode 100644
index 0000000..4481a16
--- /dev/null
+++ b/src/jl/sort_test2.c
@@ -0,0 +1,74 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <limits.h>
+#include <string.h>
+#include "c99.h"
+#include "name.h"
+#include "fail.h"
+#include "types.h"
+#include "mem.h"
+#include "sort.h"
+#include "rdtsc.h"
+
+#if 1
+
+DEFINE_HW_COUNTER()
+
+#define N (1<<20)
+
+ulong A[N], out[N];
+uint P[N];
+
+int main()
+{
+  buffer buf = null_buffer;
+  uint i;
+  unsigned long long tic, toc;
+  unsigned r;
+  #define TIME(t, repeat, what) do { \
+    for(r=repeat;r;--r) { what; } \
+    tic = getticks(); \
+    for(r=repeat;r;--r) { what; } \
+    toc = getticks(); \
+    t = toc-tic; \
+  } while(0)
+
+  for(i=0;i!=N;++i) {
+    A[i]=rand();
+    A[i]<<=CHAR_BIT*sizeof(int)-1;
+    A[i]^=rand();
+    A[i]<<=CHAR_BIT*sizeof(int)-1;
+    A[i]^=rand();
+    if(0) A[i]&=0x000ff00;
+  }
+
+  for(i=N;i;i>>=1) {
+    unsigned long long t;
+    TIME(t, (N/i), 
+      sortv_long(out, A,i,sizeof(ulong), &buf));
+    printf("sortv %d : %g cycles per item\n",
+      (int)i, t/(double)(N/i)/(double)i);
+  }
+
+  for(i=N;i;i>>=1) {
+    unsigned long long t;
+    TIME(t, (N/i), 
+      sortp_long(&buf,0, A,i,sizeof(ulong)));
+    printf("sortp %d : %g cycles per item\n",
+      (int)i, t/(double)(N/i)/(double)i);
+  }
+
+  buffer_free(&buf);
+  return 0;
+}
+
+#else
+
+int main()
+{
+  return 0;
+}
+
+#endif
+
diff --git a/src/jl/spchol_test.c b/src/jl/spchol_test.c
new file mode 100644
index 0000000..5fc6596
--- /dev/null
+++ b/src/jl/spchol_test.c
@@ -0,0 +1,54 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "c99.h"
+#include "name.h"
+#include "fail.h"
+#include "types.h"
+#include "mem.h"
+#include "sparse_cholesky.h"
+
+int main()
+{
+#define x -1
+
+  uint i,n=7;
+  uint Aj [] = {0,2, 1,2,6, 0,1,2, 3,5,6, 4,5, 3,4,5, 1,3,6};
+  double A[] = {2,x, 2,x,x, x,x,2, 2,x,x, 2,x, x,x,2, x,x,2};
+#undef x
+  uint Arp[] = {0,2,5,8,11,13,16,19};
+  double x[7], b[7] = {0,0,0,0, 0,0,0};
+  uint o[7] = {0,2,1,6,3,5,4};
+/*
+  uint i,n=10;
+  uint Aj [] = {0,2,7, 1,4,9, 0,2,6, 3,8,9, 1,4,8,9, 5,6,7, 2,5,6, 0,5,7,8,9, 3,4,7,8, 1,3,4,7,9};
+  real A  [] = {3,x,x, 2,x,x, x,2,x, 2,x,x, x,3,x,x, 2,x,x, x,x,2, x,x,4,x,x, x,x,x,3, x,x,x,x,4};
+#undef x
+  uint Arp[] = {0,     3,     6,     9,     12,      16,    19,    22,        27,      31,    36};
+  real b[] = {1,2,3,4,5, 6,7,8,9,10};
+*/
+  struct sparse_cholesky data;
+  buffer buf;
+  buffer_init(&buf,4);
+  sparse_cholesky_factor(n,Arp,Aj,A,&data,&buf);
+  
+  for(i=0;i<n;++i) { uint j;
+    b[o[i]]=1;
+    sparse_cholesky_solve(x,&data,b);
+    for(j=0;j<n;++j) printf("\t%g",(double)x[o[j]]);
+    printf("\n");
+    b[o[i]]=0;
+  }
+  sparse_cholesky_free(&data);
+  buffer_free(&buf);
+  /*
+  sparse_cholesky_solve(b,&data,b);
+  sparse_cholesky_free(&data);
+  for(i=0;i<n;++i) printf("%g\n", b[i]);
+  */
+
+  return 0;
+}
+
+
diff --git a/src/jl/tensor.c b/src/jl/tensor.c
new file mode 100644
index 0000000..a724714
--- /dev/null
+++ b/src/jl/tensor.c
@@ -0,0 +1,82 @@
+#include "c99.h"
+#include "name.h"
+#include "types.h"
+
+#if !defined(USE_CBLAS)
+
+#define tensor_dot  PREFIXED_NAME(tensor_dot )
+#define tensor_mtxm PREFIXED_NAME(tensor_mtxm)
+
+/* Matrices are always column-major (FORTRAN style) */
+
+double tensor_dot(const double *a, const double *b, uint n)
+{
+  double sum = 0;
+  for(;n;--n) sum += *a++ * *b++;
+  return sum;
+}
+
+#  if defined(USE_NAIVE_BLAS)
+#    define tensor_mxv  PREFIXED_NAME(tensor_mxv )
+#    define tensor_mtxv PREFIXED_NAME(tensor_mtxv)
+#    define tensor_mxm  PREFIXED_NAME(tensor_mxm )
+
+/* y = A x */
+void tensor_mxv(
+  double *restrict y, uint ny,
+  const double *restrict A, const double *restrict x, uint nx)
+{
+  uint i;
+  for(i=0;i<ny;++i) y[i]=0;
+  for(;nx;--nx) {
+    const double xk = *x++;
+    for(i=0;i<ny;++i) y[i] += (*A++)*xk;
+  }
+}
+
+/* y = A^T x */
+void tensor_mtxv(
+  double *restrict y, uint ny,
+  const double *restrict A, const double *restrict x, uint nx)
+{
+  for(;ny;--ny) {
+    const double *restrict xp = x;
+    uint n = nx;
+    double sum = *A++ * *xp++;
+    for(--n;n;--n) sum += *A++ * *xp++;
+    *y++ = sum;
+  }
+}
+
+/* C = A * B */
+void tensor_mxm(
+  double *restrict C, uint nc,
+  const double *restrict A, uint na, const double *restrict B, uint nb)
+{
+  uint i,j,k;
+  for(i=0;i<nc*nb;++i) C[i]=0;
+  for(j=0;j<nb;++j,C+=nc) {
+    const double *restrict A_ = A;
+    for(k=0;k<na;++k) {
+      const double b = *B++;
+      for(i=0;i<nc;++i) C[i] += (*A_++) * b;
+    }
+  }
+}
+
+#  endif
+
+/* C = A^T * B */
+void tensor_mtxm(
+  double *restrict C, uint nc,
+  const double *restrict A, uint na, const double *restrict B, uint nb)
+{
+  uint i,j;
+  for(j=0;j<nb;++j,B+=na) {
+    const double *restrict A_ = A;
+    for(i=0;i<nc;++i,A_+=na) *C++ = tensor_dot(A_,B,na);
+  }
+}
+
+#endif
+
diff --git a/src/jl/tensor.h b/src/jl/tensor.h
new file mode 100644
index 0000000..bbff4ca
--- /dev/null
+++ b/src/jl/tensor.h
@@ -0,0 +1,199 @@
+#ifndef TENSOR_H
+#define TENSOR_H
+
+#if !defined(TYPES_H) || !defined(NAME_H)
+#warning "tensor.h" requires "types.h" and "name.h"
+#endif
+
+#if defined(USE_CBLAS)
+#  include <cblas.h>
+#  define tensor_dot(a,b,n) cblas_ddot((int)(n),a,1,b,1)
+#  define tensor_mxv(y,ny,A,x,nx) \
+     cblas_dgemv(CblasColMajor,CblasNoTrans,(int)ny,(int)nx, \
+                 1.0,A,(int)ny,x,1,0.0,y,1)
+#  define tensor_mtxv(y,ny,A,x,nx) \
+     cblas_dgemv(CblasColMajor,CblasTrans,(int)nx,(int)ny, \
+                 1.0,A,(int)nx,x,1,0.0,y,1)
+#  define tensor_mxm(C,nc,A,na,B,nb) \
+     cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans, \
+                 (int)nc,(int)nb,(int)na,1.0, \
+                 A,(int)nc,B,(int)na,0.0,C,(int)nc)
+#  define tensor_mtxm(C,nc,A,na,B,nb) \
+     cblas_dgemm(CblasColMajor,CblasTrans,CblasNoTrans, \
+                 (int)nc,(int)nb,(int)na,1.0, \
+                 A,(int)na,B,(int)na,0.0,C,(int)nc)
+#else
+#  define tensor_dot  PREFIXED_NAME(tensor_dot )
+#  define tensor_mtxm PREFIXED_NAME(tensor_mtxm)
+double tensor_dot(const double *a, const double *b, uint n);
+
+/* C (nc x nb) = [A (na x nc)]^T * B (na x nb); all column-major */
+void tensor_mtxm(double *C, uint nc,
+                 const double *A, uint na, const double *B, uint nb);
+#  if defined(USE_NAIVE_BLAS)
+#    define tensor_mxv  PREFIXED_NAME(tensor_mxv )
+#    define tensor_mtxv PREFIXED_NAME(tensor_mtxv)
+#    define tensor_mxm  PREFIXED_NAME(tensor_mxm )
+/* y = A x */
+void tensor_mxv(double *y, uint ny, const double *A, const double *x, uint nx);
+
+/* y = A^T x */
+void tensor_mtxv(double *y, uint ny, const double *A, const double *x, uint nx);
+
+/* C (nc x nb) = A (nc x na) * B (na x nb); all column-major */
+void tensor_mxm(double *C, uint nc,
+                const double *A, uint na, const double *B, uint nb);
+#  else
+#    define nek_mxm FORTRAN_UNPREFIXED(mxm,MXM)
+/* C (na x nc) = A (na x nb) * B (nb x nc); all column-major */
+void nek_mxm(const double *A, const uint *na,
+             const double *B, const uint *nb,
+             double *C, const uint *nc);
+/* C (nc x nb) = A (nc x na) * B (na x nb); all column-major */
+static void tensor_mxm(double *C, uint nc,
+                       const double *A, uint na, const double *B, uint nb)
+{ nek_mxm(A,&nc,B,&na,C,&nb); }
+
+/* y = A x */
+static void tensor_mxv(double *y, uint ny,
+                       const double *A, const double *x, uint nx)
+{ uint one=1; nek_mxm(A,&ny,x,&nx,y,&one); }
+
+/* y = A^T x */
+static void tensor_mtxv(double *y, uint ny,
+                        const double *A, const double *x, uint nx)
+{ uint one=1; nek_mxm(x,&one,A,&nx,y,&ny); }
+
+#  endif
+#endif
+
+/*--------------------------------------------------------------------------
+   1-,2-,3-d Tensor Application of Row Vectors (for Interpolation)
+   
+   the 3d case:
+   v = tensor_i3(Jr,nr, Js,ns, Jt,nt, u, work)
+     gives v = [ Jr (x) Js (x) Jt ] u
+     where Jr, Js, Jt are row vectors (interpolation weights)
+     u is nr x ns x nt in column-major format (inner index is r)
+     v is a scalar
+  --------------------------------------------------------------------------*/
+
+static double tensor_i1(const double *Jr, uint nr, const double *u)
+{
+  return tensor_dot(Jr,u,nr);
+}
+
+/* work holds ns doubles */
+static double tensor_i2(const double *Jr, uint nr,
+                        const double *Js, uint ns,
+                        const double *u, double *work)
+{
+  tensor_mtxv(work,ns, u, Jr,nr);
+  return tensor_dot(Js,work,ns);
+}
+
+/* work holds ns*nt + nt doubles */
+static double tensor_i3(const double *Jr, uint nr,
+                        const double *Js, uint ns,
+                        const double *Jt, uint nt,
+                        const double *u, double *work)
+{
+  double *work2 = work+nt;
+  tensor_mtxv(work2,ns*nt,   u,     Jr,nr);
+  tensor_mtxv(work ,nt   ,   work2, Js,ns);
+  return tensor_dot(Jt,work,nt);
+}
+
+/*--------------------------------------------------------------------------
+   1-,2-,3-d Tensor Application of Row Vectors
+             for simultaneous Interpolation and Gradient computation
+   
+   the 3d case:
+   v = tensor_ig3(g, wtr,nr, wts,ns, wtt,nt, u, work)
+     gives v   = [ Jr (x) Js (x) Jt ] u
+           g_0 = [ Dr (x) Js (x) Jt ] u
+           g_1 = [ Jr (x) Ds (x) Jt ] u
+           g_2 = [ Jr (x) Js (x) Dt ] u
+     where Jr,Dr,Js,Ds,Jt,Dt are row vectors,
+       Jr=wtr, Dr=wtr+nr, etc.
+       (interpolation & derivative weights)
+     u is nr x ns x nt in column-major format (inner index is r)
+     v is a scalar, g is an array of 3 doubles
+  --------------------------------------------------------------------------*/
+
+static double tensor_ig1(double g[1],
+                         const double *wtr, uint nr,
+                         const double *u)
+{
+  g[0] = tensor_dot(wtr+nr,u,nr);
+  return tensor_dot(wtr   ,u,nr);
+}
+
+/* work holds 2*nr doubles */
+static double tensor_ig2(double g[2],
+                         const double *wtr, uint nr,
+                         const double *wts, uint ns,
+                         const double *u, double *work)
+{
+  tensor_mxm(work,nr, u,ns, wts,2);
+  g[0] = tensor_dot(wtr+nr,work   ,nr);
+  g[1] = tensor_dot(wtr   ,work+nr,nr);
+  return tensor_dot(wtr   ,work   ,nr);
+}
+
+/* work holds 2*nr*ns + 3*nr doubles */
+static double tensor_ig3(double g[3],
+                         const double *wtr, uint nr,
+                         const double *wts, uint ns,
+                         const double *wtt, uint nt,
+                         const double *u, double *work)
+{
+  const uint nrs = nr*ns;
+  double *a = work, *b = work+2*nrs, *c=b+2*nr;
+  tensor_mxm(a,nrs, u,nt, wtt,2);
+  tensor_mxm(b,nr,  a,ns, wts,2);
+  tensor_mxv(c,nr, a+nrs, wts,ns);
+  g[0] = tensor_dot(b   , wtr+nr, nr);
+  g[1] = tensor_dot(b+nr, wtr   , nr);
+  g[2] = tensor_dot(c   , wtr   , nr);
+  return tensor_dot(b   , wtr   , nr);
+}
+
+/*
+  out - nr x ns
+  u   - mr x ms
+  Jrt - mr x nr, Jst - ms x ns
+  work - nr x ms
+*/
+static void tensor_2t(double *out,
+                      const double *Jrt, uint nr, uint mr,
+                      const double *Jst, uint ns, uint ms,
+                      const double *u, double *work)
+{
+  tensor_mtxm(work,nr, Jrt,mr, u,ms);
+  tensor_mxm(out,nr, work,ms, Jst,ns);
+}
+
+/*
+  out - nr x ns x nt
+  u   - mr x ms x mt
+  Jrt - mr x nr, Jst - ms x ns, Jtt - mt x nt
+  work - nr*ms*mt + nr*ns*mt = nr*(ms+ns)*mt
+*/
+static void tensor_3t(double *out,
+                      const double *Jrt, uint nr, uint mr,
+                      const double *Jst, uint ns, uint ms,
+                      const double *Jtt, uint nt, uint mt,
+                      const double *u, double *work)
+{
+  const uint nrs=nr*ns, mst=ms*mt, nrms=nr*ms;
+  uint k;
+  double *work2 = work+nr*mst;
+  double *p; const double *q;
+  tensor_mtxm(work,nr, Jrt,mr, u,mst);
+  for(k=0,p=work2,q=work;k<mt;++k,p+=nrs,q+=nrms)
+    tensor_mxm(p,nr, q,ms, Jst,ns);
+  tensor_mxm(out,nrs, work2,mt, Jtt,nt);
+}
+
+#endif
diff --git a/src/jl/types.h b/src/jl/types.h
new file mode 100644
index 0000000..21977fa
--- /dev/null
+++ b/src/jl/types.h
@@ -0,0 +1,79 @@
+#ifndef TYPES_H
+#define TYPES_H
+
+/* 
+  Define the integer types used throughout the code,
+  controlled by preprocessor macros.
+  
+  The integer type sint/uint (signed/unsigned) is used
+  most frequently, e.g., for indexing into local arrays,
+  and for processor ids. It can be one of
+  
+    macro             sint/uint type
+    
+    (default)         int
+    USE_LONG          long
+    USE_LONG_LONG     long long
+    
+  The slong/ulong type is used in relatively few places
+  for global identifiers and indices. It can be one of
+
+    macro             slong/ulong type
+    
+    (default)         int
+    GLOBAL_LONG       long
+    GLOBAL_LONG_LONG  long long
+
+  Since the long long type is not ISO C90, it is never
+  used unless explicitly asked for.
+*/
+
+#if defined(USE_LONG_LONG) || defined(GLOBAL_LONG_LONG)
+typedef long long long_long;
+#  define WHEN_LONG_LONG(x) x
+#  if !defined(LLONG_MAX)
+#    if defined(LONG_LONG_MAX)
+#      define LLONG_MAX LONG_LONG_MAX
+#    else
+#      define LLONG_MAX 9223372036854775807
+#    endif
+#  endif
+#  if !defined(LLONG_MIN)
+#    if defined(LONG_LONG_MIN)
+#      define LLONG_MIN LONG_LONG_MIN
+#    else
+#      define LLONG_MIN -9223372036854775807
+#    endif
+#  endif
+#else
+#  define WHEN_LONG_LONG(x)
+#endif
+
+#if !defined(USE_LONG) && !defined(USE_LONG_LONG)
+#  define TYPE_LOCAL(i,l,ll) i
+#elif defined(USE_LONG)
+#  define TYPE_LOCAL(i,l,ll) l
+#elif defined(USE_LONG_LONG)
+#  define TYPE_LOCAL(i,l,ll) ll
+#endif
+
+#if !defined(GLOBAL_LONG) && !defined(GLOBAL_LONG_LONG)
+#  define TYPE_GLOBAL(i,l,ll) i
+#elif defined(GLOBAL_LONG)
+#  define TYPE_GLOBAL(i,l,ll) l
+#else
+#  define TYPE_GLOBAL(i,l,ll) ll
+#endif
+
+/* local integer type: for quantities O(N/P) */
+#define sint   signed TYPE_LOCAL(int,long,long long)
+#define uint unsigned TYPE_LOCAL(int,long,long long)
+#define iabs TYPE_LOCAL(abs,labs,llabs)
+
+/* global integer type: for quantities O(N) */
+#define slong   signed TYPE_GLOBAL(int,long,long long)
+#define ulong unsigned TYPE_GLOBAL(int,long,long long)
+#define iabsl TYPE_GLOBAL(abs,labs,llabs)
+
+#endif
+
diff --git a/src/k10_mxm.c b/src/k10_mxm.c
new file mode 100644
index 0000000..a8ba95d
--- /dev/null
+++ b/src/k10_mxm.c
@@ -0,0 +1,56 @@
+#ifndef FNAME_H
+#define FNAME_H
+
+#ifdef UPCASE
+#  define FORTRAN_NAME(low,up) up
+#else
+#ifdef UNDERSCORE
+#  define FORTRAN_NAME(low,up) low##_
+#else
+#  define FORTRAN_NAME(low,up) low
+#endif
+#endif
+
+#endif
+
+#define k10_mxm FORTRAN_NAME(k10_mxm, K10_MXM)
+
+void tune_mxm888    (double*, double*, double*);
+void tune_mxm8864   (double*, double*, double*);
+void tune_mxm6488   (double*, double*, double*);
+void tune_mxm101010 (double*, double*, double*);
+void tune_mxm1010100(double*, double*, double*);
+void tune_mxm1001010(double*, double*, double*);
+
+
+int k10_mxm(double* a, int* sz1, double* b, int* sz2, double* c, int* sz3)
+{
+    int m, k, n;
+
+    m = *sz1;
+    k = *sz2;
+    n = *sz3;
+
+    if ((unsigned)a%16 != 0 || (unsigned)b%16 != 0 || (unsigned)c%16 != 0){
+       return 1;}
+
+    // 8,8,8  8,8,64  64,8,8
+    if (k == 8){
+       if (m == 8){
+          if     (n ==  8){tune_mxm888 (a,b,c); return 0;}
+          else if(n == 64){tune_mxm8864(a,b,c); return 0;}
+       }
+       else if (m == 64 && n == 8){tune_mxm6488(a,b,c); return 0;}
+    }
+
+    // 10,10,10  10,10,100  100,10,10
+    if (k == 10){
+       if (m == 10){
+          if     (n ==  10){tune_mxm101010 (a,b,c); return 0;}
+          else if(n == 100){tune_mxm1010100(a,b,c); return 0;}
+       }
+       else if (m == 100 && n == 10){tune_mxm1001010(a,b,c); return 0;}
+    }
+
+    return 1;
+}
diff --git a/src/makenek.inc b/src/makenek.inc
new file mode 100644
index 0000000..6d97ace
--- /dev/null
+++ b/src/makenek.inc
@@ -0,0 +1,324 @@
+# This include file is used by the makenek script
+# to automatically create a makefile for Nek5000 
+# (c) 2008,2009,2010 UCHICAGO ARGONNE, LLC
+# ------------------------------------------------
+
+echo "makenek - automatic build tool for Nek5000"
+
+if [ "$PPLIST" == "?" ]; then
+  echo "available pre-processor symbols:" 
+  echo "  BGP       enable Blue Gene/P optimizations "
+  echo "  BGQ       compile for Blue Gene/Q "
+  echo "  K10_MXM   use optimized MxM kernel for AMD Family 10h processors" 
+  echo "  TIMERS    turns on detailed routine timers"
+  echo "  MPITIMER  use MPI_Wtime for timing"
+  echo "  BGQTIMER  use BG/Q cycle counter for timing"
+  echo "  CGTTIMER  use clock_gettime for timing"
+  echo "  NITER=x   number of cg iterations, default 100"
+  echo "  LOG       output progress"
+  exit 1
+fi
+
+
+if [ "$1" == "clean" ]; then
+  make clean
+  exit 0
+fi
+
+NOCOMPILE=0
+if [ "$1" == "-nocompile" ]; then
+  NOCOMPILE=1
+fi 
+
+CASENAME=$1
+CASEDIR=`pwd`
+APATH_SRC=`cd $SOURCE_ROOT; pwd`
+SOURCE_ROOT=$APATH_SRC
+
+# do some basic checks
+if [ "$CASEDIR" == "$SOURCE_ROOT" ]; then
+   echo "FATAL ERROR: Working directory has to be different from the source!"
+   exit 1
+fi
+
+if [ ! -f SIZE ]; then
+   echo "FATAL ERROR: Cannot find SIZE!"
+   exit 1
+fi
+
+if [ ! -f ./makefile.template ]; then
+  echo "FATAL ERROR: Cannot find ./makefile.template!"
+  exit 1
+fi
+
+# test F77 compiler
+which `echo $F77 | awk '{print $1}'` 1>/dev/null
+if [ $? -ne 0 ]; then
+  echo "FATAL ERROR: Cannot find $F77!"
+  exit 1
+fi
+\rm test_f77.o 2>/dev/null
+
+# basic compiler test
+cat > test_f77.f << _ACEOF
+      subroutine test
+      end
+_ACEOF
+$F77 -c test_f77.f >/dev/null
+if [ ! -f test_f77.o ]; then
+  echo "FATAL ERROR: Basic compiler test for $F77 failed!"
+  exit 1
+fi
+\rm test_f77.* 2>/dev/null
+
+# test C compiler
+which `echo $CC | awk '{print $1}'` 1>/dev/null
+if [ $? -ne 0 ]; then
+  echo "FATAL ERROR: Cannot find $CC!"
+  exit 1
+fi
+\rm test_cc.o 2>/dev/null
+
+cat > test_cc.c << _ACEOF
+      void function(){}
+_ACEOF
+$CC -c test_cc.c 1>/dev/null
+if [ ! -f test_cc.o ]; then
+  echo "FATAL ERROR: Basic compiler test for $CC failed!"
+  exit 1
+fi
+\rm test_cc.* 2>/dev/null
+
+# initial clean-up
+rm -f nekbone 2>/dev/null
+
+# Check if the compiler adds an underscore to external functions
+UNDERSCORE=false
+cat > test_underscore.f << _ACEOF
+      subroutine underscore_test
+        call byte_write
+      end
+_ACEOF
+$F77 -c test_underscore.f 2>&1 >/dev/null 
+nm test_underscore.o | grep byte_write_ 1>/dev/null
+if [ $? -eq 0 ] 
+then
+  UNDERSCORE=true
+fi
+\rm test_underscore.* 2>/dev/null
+
+# trying to figure which compiler the wrapper is using
+F77ok=0
+
+F77comp_=`$F77 -showme 2>/dev/null 1>.tmp || true`
+F77comp=`cat .tmp | awk '{print $1}' | awk -F/ '{print $NF}' || true`
+if [ -f "`which $F77comp 2>/dev/null`" ]; then
+  F77ok=1
+fi
+
+if [ $F77ok -eq 0 ]; then
+  F77comp_=`$F77 -show 2>/dev/null 1>.tmp || true`
+  F77comp=`cat .tmp | awk '{print $1}' | awk -F/ '{print $NF}' || true`
+  if [ -f "`which $F77comp 2>/dev/null`" ]; then
+    F77ok=1
+  fi
+fi
+
+if [ $F77ok -eq 0 ]; then
+  F77comp_=`$F77 -craype-verbose 2>/dev/null 1>.tmp || true`
+  F77comp=`cat .tmp | awk '{print $1}' | awk -F/ '{print $NF}' || true`
+  if [ -f "`which $F77comp 2>/dev/null`" ]; then
+    F77ok=1
+  fi
+fi
+
+if [ $F77ok -eq 0 ]; then
+  F77comp=`echo $F77 | awk '{print $1}'`
+  if [ -f "`which $F77comp 2>/dev/null`" ]; then
+    F77ok=1
+  fi
+fi
+
+\rm -f .tmp
+if [ $F77ok -eq 0 ]; then
+  F77comp="unknown"
+fi
+
+CFE_FLAG=""
+# assign F77 compiler specific flags
+case $F77comp in
+  *pgf*)        P="-r8 -Mpreprocess"
+               ;;
+  *gfortran*)   P="-fcray-pointer -fdefault-real-8 -x f77-cpp-input"
+               ;;
+  *ifort*)      P="-r8 -fpconstant -fpp"
+                CFE_FLAG="-xHost"
+               ;;
+  *pathf*)      P="-r8 -cpp -fno-second-underscore"
+               ;;
+  *xlf*)       P="-qrealsize=8 -qdpc=e -qsuffix=cpp=f"
+               PPPO="-WF,"
+               F77="${F77} -qsuppress=cmpmsg"
+               ;;
+  *ftn*)        P="-r8 -Mpreprocess"
+               ;;
+  *sunf*)       P="-r8const -xtypemap=real:64 -fpp"
+               ;;
+  *open*)       P="-r8 -cpp -fno-second-underscore"
+               ;;
+  *)  echo "ERROR: Unable to detect compiler!"
+      echo "        - don't know how to promote datatype REAL to 8 bytes"
+      echo "        - don't know how to invoke the C pre-processor (CPP) before compilation"
+      echo "       Please edit the makefile and specify the requested compiler flags using the P variable."
+      echo ""
+      P="<specify your compiler flags here>"
+      NOCOMPILE=1
+      read;;
+esac
+export PPPO
+
+# Check ptr size
+cat > tmp.c << _ACEOF
+#include <stdlib.h>
+#include <stdio.h>
+int main()
+{
+  int *p;printf("%li\n",sizeof(p));
+}
+_ACEOF
+$CC $CFE_FLAG tmp.c 2>&1>/dev/null
+ptrSize=`./a.out`
+rm tmp.c a.out
+if [ "$ptrSize" == "8" ]
+then
+  PPLIST="${PPLIST} PTRSIZE8"
+fi
+
+# set preprocessor symbols
+if [ "$IFMPI" == "false" -o "$IFMPI" == "no" ]; then
+  IFMPI=false
+else
+  # default
+  IFMPI=true
+  PPLIST="${PPLIST} MPI"
+fi
+export IFMPI
+
+# Check size of long int
+cat > tmp.c << _ACEOF
+#include <stdlib.h>
+#include <stdio.h>
+int main()
+{
+  int i;
+  i=sizeof(long int);
+  printf("%i\n",i);
+}
+_ACEOF
+$CC $CFE_FLAG tmp.c 2>&1>/dev/null
+longIntTest=`./a.out`
+rm tmp.c a.out
+if [ "$longIntTest" == "8" ]
+then
+  PPLIST="${PPLIST} LONGINT8"
+fi
+
+if [ "$UNDERSCORE" == "true" ]; then
+  PPLIST="${PPLIST} UNDERSCORE"
+fi 
+
+PPLIST="${PPLIST} GLOBAL_LONG_LONG"
+
+MXM_USER="mxm_std.o blas.o"
+echo $PPLIST | grep 'BGP' >/dev/null 
+if [ $? -eq 0 ]; then
+   MXM_USER="mxm_std.o bg_aligned3.o bg_mxm44.o bg_mxm44_uneven.o bg_mxm3.o blas.o" 
+   OPT_FLAGS_STD="-qarch=450 -qtune=450"
+   OPT_FLAGS_MAG="-O5 -qarch=450d -qtune=450"
+fi
+echo $PPLIST | grep 'BLAS_MXM' >/dev/null 
+if [ $? -eq 0 ]; then
+   MXM_USER="mxm_std.o" 
+fi
+
+# set optimization flags
+L0="\$(G) -O0"
+L2="\$(G) -O2"
+L3="\$(G) -O3" 
+L4="\$(L3)"
+
+# user specified opt flags
+if [ "$OPT_FLAGS_STD" != "" ]; then
+  echo $OPT_FLAGS_STD | grep "\-O." 1>/dev/null
+  if [ $? -eq 0 ]; then
+    L2="\$(G) $OPT_FLAGS_STD"
+    L3="\$(G) $OPT_FLAGS_STD" 
+  else
+    L2="\$(G) -O2 $OPT_FLAGS_STD"
+    L3="\$(G) -O3 $OPT_FLAGS_STD"
+  fi
+fi
+
+if [ "$OPT_FLAGS_MAG" != "" ]; then
+    L4="\$(G) $OPT_FLAGS_MAG"
+fi
+
+if [ "$USR_LIB" != "" ]; then
+    USR_LFLAGS="${USR_LFLAGS} ${USR_LIB}"
+fi
+
+# tweak makefile template 
+echo "generating makefile ..."
+rm -rf makefile 2>/dev/null
+
+sed -e "s:^F77[ ]*=.*:F77\:=$F77:" \
+-e "s:^CC[ ]*=.*:CC\:=$CC:" \
+-e "s:^G[ ]*=.*:G\:=$G:" \
+-e "s:^OPT_FLAGS[ ]*=.*:OPT_FLAGS\:=$OPT_FLAGS:" \
+-e "s/^P[ ]*=.*/P:=$P/" \
+-e "s/^L0[ ]*=.*/L0=$L0/" \
+-e "s/^L2[ ]*=.*/L2=$L2/" \
+-e "s/^L3[ ]*=.*/L3=$L3/" \
+-e "s/^L4[ ]*=.*/L4=$L4/" \
+-e "s/^PPPO[ ]*=.*/PPPO=$PPPO/" \
+-e "s/^PPS[ ]*=.*/PPS=$PPLIST/" \
+-e "s:^MXM[ ]*=.*:MXM=$MXM_USER:" \
+-e "s/^IFMPI[ ]*=.*/IFMPI:=$IFMPI/" \
+-e "s:^USR[ ]*=.*:USR\:=$USR:" \
+-e "s:^USR_LFLAGS[ ]*=.*:USR_LFLAGS\:=$USR_LFLAGS:" \
+-e "s:^S[ ]*=.*:S\:=${SOURCE_ROOT}:" ./makefile.template >.makefile
+
+echo $G | grep '\-g' 1>/dev/null
+if [ $? -eq 0 ]; then
+  sed 's/-O[1-4]/-O0/g' .makefile > .makefile.tmp
+  mv .makefile.tmp .makefile
+  echo "Activate DEBUG mode"
+fi
+
+if [ "$USR" != "" ]; then
+  echo "###########################################################" >> makefile
+  echo "include makefile_usr.inc" >> .makefile
+fi
+
+if [ -f .makefile ]; then
+  sed -e "1i\\
+### makefile automatically created by makenek `date +"%m/%d/%Y %T"` ###" \
+-e "s:^CASEDIR[ ]*=.*:CASEDIR\:=${CASEDIR}:" \
+-e "s:^CASENAME[ ]*=.*:CASENAME\:=${CASENAME}:" .makefile > makefile 
+else
+  echo "ERROR: Nek Makefile could not be created!"
+  exit 1 
+fi
+\rm .makefile 2>/dev/null
+
+# tweak SIZE file
+if [ -f "./SIZE" ]; then
+  cat SIZE | grep -i 'lxo' >/dev/null
+else
+  echo "FATAL ERROR: Cannot find SIZE"
+  exit 1
+fi
+
+if [ $NOCOMPILE -eq 1 ]; then
+  exit 0
+fi 
diff --git a/src/math.f b/src/math.f
new file mode 100644
index 0000000..48a24c3
--- /dev/null
+++ b/src/math.f
@@ -0,0 +1,1402 @@
+c-----------------------------------------------------------------------
+      SUBROUTINE BLANK(A,N)
+      CHARACTER*1 A(1)
+      CHARACTER*1 BLNK
+      SAVE        BLNK
+      DATA        BLNK /' '/
+ 
+      DO 10 I=1,N
+         A(I)=BLNK
+   10 CONTINUE
+      RETURN
+      END
+c-----------------------------------------------------------------------
+      SUBROUTINE VSQ (A,N)
+      DIMENSION  A(1)
+ 
+      DO 100 I = 1, N
+ 100     A(I) = A(I)**2
+      RETURN
+      END
+c-----------------------------------------------------------------------
+      SUBROUTINE VSQRT(A,N)
+      DIMENSION  A(1)
+ 
+      DO 100 I = 1, N
+ 100     A(I) = SQRT(A(I))
+      RETURN
+      END
+c-----------------------------------------------------------------------
+      subroutine invers2(a,b,n)
+      REAL A(1),B(1)
+ 
+      DO 100 I=1,N
+         A(I)=1./B(I)
+ 100  CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine invcol1(a,n)
+      REAL A(1)
+ 
+      DO 100 I=1,N
+         A(I)=1./A(I)
+ 100  CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine invcol2(a,b,n)
+ 
+      REAL A(1),B(1)
+ 
+      DO 100 I=1,N
+         A(I)=A(I)/B(I)
+ 100  CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine invcol3(a,b,c,n)
+      REAL A(1),B(1),C(1)
+ 
+ 
+      DO 100 I=1,N
+         A(I)=B(I)/C(I)
+ 100  CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine col4(a,b,c,d,n)
+      REAL A(1),B(1),C(1),D(1)
+ 
+      DO 100 I=1,N
+         A(I)=B(I)*C(I)*D(I)
+  100 CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine Xaddcol3(a,b,c,n)
+      REAL A(1),B(1),C(1)
+ 
+      DO 100 I=1,N
+         A(I)=A(I)+B(I)*C(I)
+  100 CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine addcol4(a,b,c,d,n)
+      REAL A(1),B(1),C(1),D(1)
+ 
+      DO 100 I=1,N
+         A(I)=A(I)+B(I)*C(I)*D(I)
+  100 CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine ascol5 (a,b,c,d,e,n)
+      REAL A(1),B(1),C(1),D(1),E(1)
+ 
+      DO 100 I=1,N
+         A(I) = B(I)*C(I)-D(I)*E(I)
+ 100  CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine sub2(a,b,n)
+      REAL A(1),B(1)
+ 
+      DO 100 I=1,N
+         A(I)=A(I)-B(I)
+ 100  CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine sub3(a,b,c,n)
+      REAL A(1),B(1),C(1)
+ 
+      DO 100 I=1,N
+         A(I)=B(I)-C(I)
+ 100  CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine subcol3(a,b,c,n)
+      REAL A(1),B(1),C(1)
+ 
+ 
+      DO 100 I=1,N
+         A(I)=A(I)-B(I)*C(I)
+  100 CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine subcol4(a,b,c,d,n)
+      REAL A(1),B(1),C(1),D(1)
+ 
+ 
+      DO 100 I=1,N
+         A(I)=A(I)-B(I)*C(I)*D(I)
+  100 CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine rzero(a,n)
+      DIMENSION  A(1)
+
+      DO 100 I = 1, N
+ 100     A(I ) = 0.0
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine izero(a,n)
+      INTEGER A(1)
+ 
+      DO 100 I = 1, N
+ 100     A(I ) = 0
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine ione(a,n)
+      INTEGER   A(1)
+      DO 100 I = 1, N
+ 100     A(I ) = 1
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine rone(a,n)
+      DIMENSION  A(1)
+      DO 100 I = 1, N
+ 100     A(I ) = 1.0
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine cfill(a,b,n)
+      DIMENSION  A(1)
+ 
+      DO 100 I = 1, N
+ 100     A(I) = B
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine ifill(ia,ib,n)
+      DIMENSION IA(1)
+ 
+      DO 100 I = 1, N
+ 100     IA(I) = IB
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine copy(a,b,n)
+      real a(1),b(1)
+
+      do i=1,n
+         a(i)=b(i)
+      enddo
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine chcopy(a,b,n)
+      CHARACTER*1 A(1), B(1)
+ 
+      DO 100 I = 1, N
+ 100     A(I) = B(I)
+      return
+      END
+ 
+      subroutine icopy(a,b,n)
+      INTEGER A(1), B(1)
+ 
+      DO 100 I = 1, N
+ 100     A(I) = B(I)
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine i8copy(a,b,n)
+      INTEGER*8 A(1), B(1)
+ 
+      DO 100 I = 1, N
+ 100     A(I) = B(I)
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine chsign(a,n)
+      REAL A(1)
+ 
+      DO 100 I=1,N
+         A(I) = -A(I)
+ 100  CONTINUE
+      return
+      END
+ 
+c-----------------------------------------------------------------------
+      subroutine cmult(a,const,n)
+      REAL A(1)
+ 
+      DO 100 I=1,N
+         A(I)=A(I)*CONST
+ 100  CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine cadd(a,const,n)
+      REAL A(1)
+ 
+      DO 100 I=1,N
+         A(I)=A(I)+CONST
+ 100  CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine iadd(i1,iscal,n)
+      DIMENSION I1(1)
+ 
+      DO 10 I=1,N
+         I1(I)=I1(I)+ISCAL
+   10 CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine cadd2(a,b,const,n)
+      REAL A(1),B(1)
+ 
+      DO 100 I=1,N
+         A(I)=B(I)+CONST
+ 100  CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      real function vlmin(vec,n)
+      REAL VEC(1)
+      TMIN = 99.0E20
+ 
+      DO 100 I=1,N
+         TMIN = MIN(TMIN,VEC(I))
+ 100  CONTINUE
+      VLMIN = TMIN
+      return
+      END
+c-----------------------------------------------------------------------
+      integer function ivlmin(vec,n)
+      integer vec(1),tmin
+      if (n.eq.0) then
+         ivlmin=0
+         return
+      endif
+      tmin = 8888888
+      do i=1,n
+         tmin = min(tmin,vec(i))
+      enddo
+      ivlmin = tmin
+      return
+      end
+c-----------------------------------------------------------------------
+      integer function ivlmax(vec,n)
+      integer vec(1),tmax
+      if (n.eq.0) then
+         ivlmax=0
+         return
+      endif
+      TMAX =-8888888
+      do i=1,n
+         TMAX = MAX(TMAX,VEC(I))
+      enddo
+      Ivlmax = tmax
+      return
+      end
+c-----------------------------------------------------------------------
+      real function vlmax(vec,n)
+      REAL VEC(1)
+      TMAX =-99.0E20
+      do i=1,n
+         TMAX = MAX(TMAX,VEC(I))
+      enddo
+      VLMAX = TMAX
+      return
+      END
+c-----------------------------------------------------------------------
+      real function vlamax(vec,n)
+      REAL VEC(1)
+      TAMAX = 0.0
+ 
+      DO 100 I=1,N
+         TAMAX = MAX(TAMAX,ABS(VEC(I)))
+ 100  CONTINUE
+      VLAMAX = TAMAX
+      return
+      END
+c-----------------------------------------------------------------------
+      real function vlsum(vec,n)
+      REAL VEC(1)
+ 
+      SUM = 0.
+ 
+      DO 100 I=1,N
+         SUM=SUM+VEC(I)
+ 100  CONTINUE
+      VLSUM = SUM
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine vcross (u1,u2,u3,v1,v2,v3,w1,w2,w3,n)
+C
+C     Compute a Cartesian vector cross product.
+C
+      DIMENSION U1(1),U2(1),U3(1)
+      DIMENSION V1(1),V2(1),V3(1)
+      DIMENSION W1(1),W2(1),W3(1)
+ 
+ 
+      DO 100 I=1,N
+         U1(I) = V2(I)*W3(I) - V3(I)*W2(I)
+         U2(I) = V3(I)*W1(I) - V1(I)*W3(I)
+         U3(I) = V1(I)*W2(I) - V2(I)*W1(I)
+  100 CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine vdot2 (dot,u1,u2,v1,v2,n)
+C
+C     Compute a Cartesian vector dot product. 2-d version
+C
+      DIMENSION DOT(1)
+      DIMENSION U1(1),U2(1)
+      DIMENSION V1(1),V2(1)
+ 
+ 
+      DO 100 I=1,N
+         DOT(I) = U1(I)*V1(I) + U2(I)*V2(I) 
+  100 CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine vdot3 (dot,u1,u2,u3,v1,v2,v3,n)
+C
+C     Compute a Cartesian vector dot product. 3-d version
+C
+      DIMENSION DOT(1)
+      DIMENSION U1(1),U2(1),U3(1)
+      DIMENSION V1(1),V2(1),V3(1)
+ 
+ 
+      DO 100 I=1,N
+         DOT(I) = U1(I)*V1(I) + U2(I)*V2(I) + U3(I)*V3(I)
+  100 CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine addtnsr(s,h1,h2,h3,nx,ny,nz)
+C
+C     Map and add to S a tensor product form of the three functions H1,H2,H3.
+C     This is a single element routine used for deforming geometry.
+C
+      DIMENSION H1(1),H2(1),H3(1)
+      DIMENSION S(NX,NY,NZ)
+ 
+      DO 200 IZ=1,NZ
+      DO 200 IY=1,NY
+         HH = H2(IY)*H3(IZ)
+         DO 100 IX=1,NX
+            S(IX,IY,IZ)=S(IX,IY,IZ)+HH*H1(IX)
+  100    CONTINUE
+  200 CONTINUE
+      return
+      END
+      function ltrunc(string,l)
+      CHARACTER*1 STRING(L)
+      CHARACTER*1   BLNK
+      DATA BLNK/' '/
+ 
+      DO 100 I=L,1,-1
+         L1=I
+         IF (STRING(I).NE.BLNK) GOTO 200
+  100 CONTINUE
+      L1=0
+  200 CONTINUE
+      LTRUNC=L1
+      return
+      END
+c-----------------------------------------------------------------------
+      function mod1(i,n)
+C
+C     Yields MOD(I,N) with the exception that if I=K*N, result is N.
+C
+      MOD1=0
+      IF (I.EQ.0) THEN
+         return
+      ENDIF
+      IF (N.EQ.0) THEN
+         WRITE(6,*) 
+     $  'WARNING:  Attempt to take MOD(I,0) in function mod1.'
+         return
+      ENDIF
+      II = I+N-1
+      MOD1 = MOD(II,N)+1
+      return
+      END
+c-----------------------------------------------------------------------
+      integer function log2(k)
+      RK=(K)
+      RLOG=LOG10(RK)
+      RLOG2=LOG10(2.0)
+      RLOG=RLOG/RLOG2+0.5
+      LOG2=INT(RLOG)
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine iflip(i1,n)
+      DIMENSION I1(1)
+      N1=N+1
+      N2=N/2
+      DO 10 I=1,N2
+         ILAST=N1-I
+         ITMP=I1(ILAST)
+         I1(ILAST)=I1(I)
+         I1(I)=ITMP
+   10 CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine iswap(b,ind,n,temp)
+      INTEGER B(1),IND(1),TEMP(1)
+C***
+C***  SORT ASSOCIATED ELEMENTS BY PUTTING ITEM(JJ)
+C***  INTO ITEM(I), WHERE JJ=IND(I).
+C***
+      DO 20 I=1,N
+         JJ=IND(I)
+         TEMP(I)=B(JJ)
+   20 CONTINUE
+      DO 30 I=1,N
+   30 B(I)=TEMP(I)
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine col2(a,b,n)
+      real a(1),b(1)
+
+!xbm* unroll (10)
+      do i=1,n
+         a(i)=a(i)*b(i)
+      enddo
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine col2c(a,b,c,n)
+      real a(1),b(1),c
+
+      do i=1,n
+         a(i)=a(i)*b(i)*c
+      enddo
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine col3(a,b,c,n)
+      real a(1),b(1),c(1)
+
+!xbm* unroll (10)
+      do i=1,n
+         a(i)=b(i)*c(i)
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine add2(a,b,n)
+      real a(1),b(1)
+
+!xbm* unroll (10)
+      do i=1,n
+         a(i)=a(i)+b(i)
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine add3(a,b,c,n)
+      real a(1),b(1),c(1)
+
+!xbm* unroll (10)
+      do i=1,n
+         a(i)=b(i)+c(i)
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine addcol3(a,b,c,n)
+      real a(1),b(1),c(1)
+
+!xbm* unroll (10)
+      do i=1,n
+         a(i)=a(i)+b(i)*c(i)
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine add2s1(a,b,c1,n)
+      real a(1),b(1)
+ 
+      DO 100 I=1,N
+        A(I)=C1*A(I)+B(I)
+  100 CONTINUE
+      return
+      END
+ 
+c-----------------------------------------------------------------------
+      subroutine add2s2(a,b,c1,n)
+      real a(1),b(1)
+ 
+      DO 100 I=1,N
+        A(I)=A(I)+C1*B(I)
+  100 CONTINUE
+      return
+      END
+ 
+c-----------------------------------------------------------------------
+      subroutine add3s2(a,b,c,c1,c2,n)
+      real a(1),b(1),c(1)
+ 
+      DO 100 I=1,N
+        A(I)=C1*B(I)+C2*C(I)
+  100 CONTINUE
+      return
+      END
+ 
+c-----------------------------------------------------------------------
+      subroutine add4(a,b,c,d,n)
+      REAL A(1),B(1),C(1),D(1)
+ 
+      DO 100 I=1,N
+         A(I)=B(I)+C(I)+D(I)
+ 100  CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      real function vlsc2(x,y,n)
+      REAL X(1),Y(1)
+ 
+      s = 0.
+      do i=1,n
+         s = s + x(i)*y(i)
+      enddo
+      vlsc2=s
+      return
+      end
+c-----------------------------------------------------------------------
+      real function vlsc21(x,y,n)
+      real x(1),y(1)
+ 
+      s = 0.
+      do i=1,n
+         s = s + x(i)*x(i)*y(i)
+      enddo
+      vlsc21=s
+      return
+      end
+
+
+C----------------------------------------------------------------------------
+C
+C     Vector reduction routines which require communication 
+C     on a parallel machine. These routines must be substituted with
+C     appropriate routines which take into account the specific architecture.
+C
+C----------------------------------------------------------------------------
+      function glsc3(a,b,mult,n)
+C
+C     Perform inner-product in double precision
+C
+      real a(1),b(1),mult(1)
+      real tmp,work(1)
+ 
+      tmp = 0.0
+      do 10 i=1,n
+         tmp = tmp + a(i)*b(i)*mult(i)
+ 10   continue
+      call gop(tmp,work,'+  ',1)
+      glsc3 = tmp
+      return
+      end
+c-----------------------------------------------------------------------
+      function glsc2(x,y,n)
+C
+C     Perform inner-product in double precision
+C
+      real x(1), y(1)
+      real tmp,work(1)
+ 
+      tmp=0.0
+      do 10 i=1,n
+         tmp = tmp+ x(i)*y(i)
+   10 continue
+      CALL GOP(TMP,WORK,'+  ',1)
+      GLSC2 = TMP
+      return
+      END
+c-----------------------------------------------------------------------
+      function glsc23(x,y,z,n)
+c
+C     Perform inner-product  x*x*y*z
+c
+      real x(1), y(1),z(1)
+      real tmp,work(1)
+
+      ds = 0.0
+      do 10 i=1,n
+         ds=ds+x(i)*x(i)*y(i)*z(i)
+   10 continue
+      tmp=ds
+      call gop(tmp,work,'+  ',1)
+      glsc23 = tmp
+      return
+      end
+c-----------------------------------------------------------------------
+c     real function gl2norm(a,n)
+
+c     include 'SIZE'
+c     include 'MASS'
+
+c     real a(1)
+
+c     common /scrsf/ w1 (lx1,ly1,lz1,lelt)
+
+c     call col3 (w1,a,a,n)
+c     call col2 (w1,bm1,n)
+c     gl2norm = sqrt(glsum (w1,n)/volvm1)
+
+c     return
+c     end
+c-----------------------------------------------------------------------
+      function glsum (x,n)
+      DIMENSION X(1)
+      DIMENSION TMP(1),WORK(1)
+      TSUM = 0.
+      DO 100 I=1,N
+         TSUM = TSUM+X(I)
+ 100  CONTINUE
+      TMP(1)=TSUM
+      CALL GOP(TMP,WORK,'+  ',1)
+      GLSUM = TMP(1)
+      return
+      END
+c-----------------------------------------------------------------------
+      real function glamax(a,n)
+      REAL A(1)
+      DIMENSION TMP(1),WORK(1)
+      TMAX = 0.0
+      DO 100 I=1,N
+         TMAX = MAX(TMAX,ABS(A(I)))
+ 100  CONTINUE
+      TMP(1)=TMAX
+      CALL GOP(TMP,WORK,'M  ',1)
+      GLAMAX=ABS(TMP(1))
+      return
+      END
+c-----------------------------------------------------------------------
+      real function glamin(a,n)
+      real a(1)
+      dimension tmp(1),work(1)
+      tmin = 9.e28
+      do 100 i=1,n
+         tmin = min(tmin,abs(a(i)))
+ 100  continue
+      tmp(1)=tmin
+      call gop(tmp,work,'m  ',1)
+      glamin=abs(tmp(1))
+      return
+      end
+c-----------------------------------------------------------------------
+      function iglmin(a,n)
+      integer a(1),tmin
+      integer tmp(1),work(1)
+      tmin=  999999999
+      do i=1,n
+         tmin=min(tmin,a(i))
+      enddo
+      tmp(1)=tmin
+      call igop(tmp,work,'m  ',1)
+      iglmin=tmp(1)
+      return
+      end
+c-----------------------------------------------------------------------
+      function iglmax(a,n)
+      integer a(1),tmax
+      integer tmp(1),work(1)
+      tmax= -999999999
+      do i=1,n
+         tmax=max(tmax,a(i))
+      enddo
+      tmp(1)=tmax
+      call igop(tmp,work,'M  ',1)
+      iglmax=tmp(1)
+      return
+      end
+c-----------------------------------------------------------------------
+      function iglsum(a,n)
+      integer a(1),tsum
+      integer tmp(1),work(1)
+      tsum= 0
+      do i=1,n
+         tsum=tsum+a(i)
+      enddo
+      tmp(1)=tsum
+      call igop(tmp,work,'+  ',1)
+      iglsum=tmp(1)
+      return
+      end
+C-----------------------------------------------------------------------
+      integer*8 function i8glsum(a,n)
+      integer*8 a(1),tsum
+      integer*8 tmp(1),work(1)
+      tsum= 0
+      do i=1,n
+         tsum=tsum+a(i)
+      enddo
+      tmp(1)=tsum
+      call i8gop(tmp,work,'+  ',1)
+      i8glsum=tmp(1)
+      return
+      end
+C-----------------------------------------------------------------------
+      function glmax(a,n)
+      REAL A(1)
+      DIMENSION TMP(1),WORK(1)
+      TMAX=-99.0e20
+      DO 100 I=1,N
+         TMAX=MAX(TMAX,A(I))
+  100 CONTINUE
+      TMP(1)=TMAX
+      CALL GOP(TMP,WORK,'M  ',1)
+      GLMAX=TMP(1)
+      return
+      END
+c-----------------------------------------------------------------------
+      function glmin(a,n)
+      REAL A(1)
+      DIMENSION TMP(1),WORK(1)
+      TMIN=99.0e20
+      DO 100 I=1,N
+         TMIN=MIN(TMIN,A(I))
+  100 CONTINUE
+      TMP(1)=TMIN
+      CALL GOP(TMP,WORK,'m  ',1)
+      GLMIN = TMP(1)
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine gllog(la,lb)
+C
+C     If ANY LA=LB, then ALL LA=LB.
+C
+      LOGICAL LA,LB
+      DIMENSION TMP(1),WORK(1)
+ 
+      TMP(1)=1.0
+      IF (LB) THEN
+         IF (LA) TMP(1)=0.0
+      ELSE
+         IF (.NOT.LA) TMP(1)=0.0
+      ENDIF
+      CALL GOP(TMP,WORK,'*  ',1)
+      IF (TMP(1).EQ.0.0) LA=LB
+      return
+      END
+c-----------------------------------------------------------------------
+      function fmdian(a,n,ifok)
+C     find the Median of the (global) set A
+      include 'SIZE'
+      DIMENSION A(1)
+      DIMENSION WORK1(5),WORK2(5)
+      DIMENSION GUES(100)
+      LOGICAL IFOK
+ 
+      AMP  =1.5
+      AFAC =1.5
+      GMIN =GLMIN(A,N)
+      GMAX =GLMAX(A,N)
+      GMIN0=GLMIN(A,N)
+      GMAX0=GLMAX(A,N)
+      GUESS=(GMAX+GMIN)/2.0
+      EPS  =(GMAX-GMIN)
+      IF (EPS.EQ.0.0) THEN
+         FMDIAN=GMAX
+         return
+      ENDIF
+      WORK1(1)=N
+      CALL GOP(WORK1,WORK2,'+  ',1)
+      NTOT=WORK1(1)
+      N2 = (NTOT+1)/2
+      IF (.NOT.IFOK) THEN
+        WRITE(6,8) NID,N,(A(I),I=1,N)
+        WRITE(6,9) NID,NTOT,N2,N,GMIN,GMAX
+    8   FORMAT(I5,'N,A:',I5,10(6F10.5,/)) 
+    9   FORMAT(I5,'mnx:',3I6,2F10.5)
+      ENDIF
+C
+C     This is the trial loop
+C
+      ITRY=-1
+   10 CONTINUE
+      ITRY=ITRY+1
+      II=ITRY+1
+      IF (II.LE.100) GUES(II)=GUESS
+C     error check for infinite loop
+      IF (ITRY.GT.2*NTOT) GOTO 9000
+      CALL RZERO(WORK1,5)
+      NLT=0
+      NGT=0
+      CLT=GMIN0
+      CGT=GMAX0
+      DO 100 I=1,N
+         AA=A(I)
+         IF (AA.NE.GUESS) THEN
+            IF (AA.LT.GUESS) THEN
+               NLT=NLT+1
+C              CLT - closest value to GUESS Less Than GUESS
+               IF (AA.GT.CLT) CLT=AA
+            ENDIF
+            IF (AA.GT.GUESS) THEN
+               NGT=NGT+1
+C              CGT - closest value to GUESS Greater Than GUESS
+               IF (AA.LT.CGT) CGT=AA
+            ENDIF
+            DUM=1./(EPS+ABS(AA-GUESS))
+            WORK1(1)=WORK1(1)+DUM
+            WORK1(2)=WORK1(2)+DUM*AA
+         ELSE
+C           detected values equaling the guess.
+            WORK1(5)=WORK1(5)+1.0
+         ENDIF
+  100 CONTINUE
+C     Invoke vector reduction across processors:
+      WORK2(1)=CLT
+      CLT=GLMAX(WORK2,1)
+      WORK2(1)=CGT
+      CGT=GLMIN(WORK2,1)
+      WORK1(3)=NLT
+      WORK1(4)=NGT
+      CALL GOP(WORK1,WORK2,'+  ',5)
+      NLT=WORK1(3)
+      NGT=WORK1(4)
+      IF (.NOT.IFOK) THEN
+         WRITE(6,101) NID,GUESS,CLT,CGT
+         WRITE(6,102) NID,(WORK1(I),I=1,5)
+  101    FORMAT(I5,'Glg:',3F12.5)
+  102    FORMAT(I5,'WORK1:',5F12.5)
+      ENDIF
+C
+C     Done?
+C
+      IF (NLT.GT.N2.OR.NGT.GT.N2) THEN
+C        we're not done.....
+         IF (NGT.GT.NLT) THEN
+C           guess is too low
+            GMIN=CGT
+            G2=CGT+MAX(0.,WORK1(2)/WORK1(1)-GUESS)*AMP
+            IF (G2.GT.GMAX) G2=0.5*(GUESS+GMAX)
+            EPS=AFAC*ABS(G2-GUESS)
+C           see that we move at least as far as the next closest value.
+            GUESS=MAX(G2,CGT)
+            GOTO 10
+         ELSE IF (NLT.GT.NGT) THEN
+C           guess is too high
+            GMAX=CLT
+            G2=CLT+MIN(0.,WORK1(2)/WORK1(1)-GUESS)*AMP
+            IF (G2.LT.GMIN) G2=0.5*(GUESS+GMIN)
+            EPS=AFAC*ABS(G2-GUESS)
+C           see that we move at least as far as the next closest value.
+            GUESS=MIN(G2,CLT)
+            GOTO 10
+         ENDIF
+      ELSE
+C
+C        we're done....
+         IF (WORK1(5).NE.0) THEN
+C           the median is (usually) one of the values 
+            FMDIAN=GUESS
+            IF (WORK1(5).EQ.1.0) THEN
+               IF (MOD(NTOT,2).EQ.0) THEN
+                  IF (NGT.GT.NLT) THEN
+                     FMDIAN=0.5*(GUESS+CGT)
+                  ELSE
+                     FMDIAN=0.5*(GUESS+CLT)
+                  ENDIF
+               ELSE
+                  IF (NGT.EQ.NLT) THEN
+                     FMDIAN=GUESS
+                  ELSE IF(NGT.GT.NLT) THEN
+                     FMDIAN=CGT
+                  ELSE
+                     FMDIAN=CLT
+                  ENDIF
+               ENDIF
+            ENDIF
+         ELSE
+            IF (MOD(NTOT,2).EQ.0) THEN
+               IF (NGT.EQ.NLT) THEN
+                  FMDIAN=0.5*(CLT+CGT)
+               ELSE IF(NGT.GT.NLT) THEN
+                  FMDIAN=0.5*(GUESS+CGT)
+               ELSE
+                  FMDIAN=0.5*(GUESS+CLT)
+               ENDIF
+            ELSE
+               IF (NGT.EQ.NLT) THEN
+                  FMDIAN=GUESS
+               ELSE IF(NGT.GT.NLT) THEN
+                  FMDIAN=CGT
+               ELSE
+                  FMDIAN=CLT
+               ENDIF
+           ENDIF
+         ENDIF
+ 
+      ENDIF
+       IF (.NOT.IFOK) WRITE(6,*) NID,'FMDIAN2',FMDIAN,(A(I),I=1,N)
+      return
+C
+C     Error handling
+C
+ 9000 CONTINUE
+      WRITE(6,11) NTOT,GMIN0,GMAX0,GUESS
+   11 FORMAT('ABORTING IN FMDIAN: N,AMIN,AMAX:',I6,3G14.6)
+      DO 13 I1=1,N,5
+        IN=I1+5 
+        IN=MIN(IN,N)
+        WRITE(6,12) NID,(A(I),I=I1,IN)
+   12   FORMAT(I4,' FMA:',5G14.6)
+   13 CONTINUE
+      DO 15 I1=1,ITRY,5
+        IN=I1+5
+        IN=MIN(IN,ITRY)
+        WRITE(6,14) NID,(GUES(I),I=I1,IN)
+   14   FORMAT(I4,' FMG:',5G14.6)
+   15 CONTINUE
+      call exitt
+      END
+
+C========================================================================
+C     Double precision matrix and vector routines
+C========================================================================
+
+c-----------------------------------------------------------------------
+      subroutine dcadd(a,const,n)
+      real*8 A(1),CONST
+ 
+      DO 100 I=1,N
+         A(I)=A(I)+CONST
+ 100  CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine dsub2(a,b,n)
+      real*8 A(1), B(1)
+ 
+      DO 100 I=1,N
+         A(I)=A(I)-B(I)
+ 100  CONTINUE
+      return
+      END
+ 
+c-----------------------------------------------------------------------
+      subroutine dadd2(a,b,n)
+      real*8 A(1), B(1)
+ 
+      DO 100 I=1,N
+         A(I)=A(I)+B(I)
+ 100  CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine chswapr(b,L,ind,n,temp)
+      INTEGER IND(1)
+      CHARACTER*6 B(1),TEMP(1)
+    
+C***  SORT ASSOCIATED ELEMENTS BY PUTTING ITEM(JJ)
+C***  INTO ITEM(I), WHERE JJ=IND(I).
+C***
+      DO 20 I=1,N
+         JJ=IND(I)
+         TEMP(I)=B(JJ)
+   20 CONTINUE
+      DO 30 I=1,N
+   30 B(I)=TEMP(I)
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine drcopy(r,d,N)
+      real*8    d(1)
+      dimension r(1)
+      do 10 i=1,n
+         r(i)=d(i)
+   10 continue
+      return
+      end
+      subroutine sorts(xout,xin,work,n)
+      real xout(1),xin(1),work(1)
+      call copy(xout,xin,n)
+      call sort(xout,work,n)
+      return
+      end
+C
+c-----------------------------------------------------------------------
+      function ivlsum(a,n)
+      INTEGER A(1)
+      INTEGER TSUM
+      if (n.eq.0) then
+         ivlsum = 0
+         return
+      endif
+      TSUM=A(1)
+      DO 100 I=2,N
+         TSUM=TSUM+A(I)
+  100 CONTINUE
+      IVLSUM=TSUM
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine icadd(a,c,n)
+      INTEGER A(1),C
+      DO 100 I = 1, N
+ 100     A(I) = A(I) + C
+      return
+      END
+      subroutine isort(a,ind,n)
+C
+C     Use Heap Sort (p 231 Num. Rec., 1st Ed.)
+C
+      integer a(1),ind(1)
+      integer aa
+ 
+      dO 10 j=1,n
+         ind(j)=j
+   10 continue
+ 
+      if (n.le.1) return
+      L=n/2+1
+      ir=n
+  100 continue
+         if (l.gt.1) then
+            l=l-1
+            aa  = a  (l)
+            ii  = ind(l)
+         else
+                 aa =   a(ir)
+                 ii = ind(ir)
+              a(ir) =   a( 1)
+            ind(ir) = ind( 1)
+            ir=ir-1
+            if (ir.eq.1) then
+                 a(1) = aa
+               ind(1) = ii
+               return
+            endif
+         endif
+         i=l
+         j=l+l
+  200    continue
+         if (j.le.ir) then
+            if (j.lt.ir) then
+               if ( a(j).lt.a(j+1) ) j=j+1
+            endif
+            if (aa.lt.a(j)) then
+                 a(i) = a(j)
+               ind(i) = ind(j)
+               i=j
+               j=j+j
+            else
+               j=ir+1
+            endif
+         GOTO 200
+         endif
+           a(i) = aa
+         ind(i) = ii
+      GOTO 100
+      end
+      subroutine sort(a,ind,n)
+C
+C     Use Heap Sort (p 231 Num. Rec., 1st Ed.)
+C
+      real a(1),aa
+      integer ind(1)
+ 
+      dO 10 j=1,n
+         ind(j)=j
+   10 continue
+ 
+      if (n.le.1) return
+      L=n/2+1
+      ir=n
+  100 continue
+         if (l.gt.1) then
+            l=l-1
+            aa  = a  (l)
+            ii  = ind(l)
+         else
+                 aa =   a(ir)
+                 ii = ind(ir)
+              a(ir) =   a( 1)
+            ind(ir) = ind( 1)
+            ir=ir-1
+            if (ir.eq.1) then
+                 a(1) = aa
+               ind(1) = ii
+               return
+            endif
+         endif
+         i=l
+         j=l+l
+  200    continue
+         if (j.le.ir) then
+            if (j.lt.ir) then
+               if ( a(j).lt.a(j+1) ) j=j+1
+            endif
+            if (aa.lt.a(j)) then
+                 a(i) = a(j)
+               ind(i) = ind(j)
+               i=j
+               j=j+j
+            else
+               j=ir+1
+            endif
+         GOTO 200
+         endif
+           a(i) = aa
+         ind(i) = ii
+      GOTO 100
+      end
+c-----------------------------------------------------------------------
+      subroutine iswap_ip(x,p,n)
+      integer x(1),xstart
+      integer p(1)
+c
+c     In-place permutation: x' = x(p)
+c
+      do k=1,n
+         if (p(k).gt.0) then   ! not swapped
+            xstart     = x(k)
+            loop_start = k
+            last       = k
+            do j=k,n
+               next    = p(last)
+               if (next.lt.0) then
+                  write(6,*) 'Hey! iswap_ip problem.',j,k,n,next
+                  call exitt
+               elseif (next.eq.loop_start) then
+                  x(last) = xstart
+                  p(last) = -p(last)
+                  goto 10
+               else
+                  x(last) = x(next)
+                  p(last) = -p(last)
+                  last    = next
+               endif
+            enddo
+   10       continue
+         endif
+      enddo
+c
+      do k=1,n
+         p(k) = -p(k)
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine iswapt_ip(x,p,n)
+      integer x(1),t1,t2
+      integer p(1)
+c
+c     In-place permutation: x'(p) = x
+c
+
+      do k=1,n
+         if (p(k).gt.0) then   ! not swapped
+            loop_start = k
+            next       = p(loop_start)
+            t1         = x(loop_start)
+            do j=1,n
+               if (next.lt.0) then
+                  write(6,*) 'Hey! iswapt_ip problem.',j,k,n,next
+                  call exitt
+               elseif (next.eq.loop_start) then
+                  x(next) = t1
+                  p(next) = -p(next)
+                  goto 10
+               else
+                  t2      =  x(next)
+                  x(next) =  t1
+                  t1      =  t2
+                  nextp   =  p(next)
+                  p(next) = -p(next)
+                  next    =  nextp
+               endif
+            enddo
+   10       continue
+         endif
+      enddo
+ 
+      do k=1,n
+         p(k) = -p(k)
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine swap_ip(x,p,n)
+      real    x(1),xstart
+      integer p(1)
+c
+c     In-place permutation: x' = x(p)
+c
+      do k=1,n
+         if (p(k).gt.0) then   ! not swapped
+            xstart     = x(k)
+            loop_start = k
+            last       = k
+            do j=k,n
+               next    = p(last)
+               if (next.lt.0) then
+                  write(6,*) 'Hey! swap_ip problem.',j,k,n,next
+                  call exitt
+               elseif (next.eq.loop_start) then
+                  x(last) = xstart
+                  p(last) = -p(last)
+                  goto 10
+               else
+                  x(last) = x(next)
+                  p(last) = -p(last)
+                  last    = next
+               endif
+            enddo
+   10       continue
+         endif
+      enddo
+ 
+      do k=1,n
+         p(k) = -p(k)
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine swapt_ip(x,p,n)
+      real    x(1),t1,t2
+      integer p(1)
+c
+c     In-place permutation: x'(p) = x
+c
+
+      do k=1,n
+         if (p(k).gt.0) then   ! not swapped
+            loop_start = k
+            next       = p(loop_start)
+            t1         = x(loop_start)
+            do j=1,n
+               if (next.lt.0) then
+                  write(6,*) 'Hey! swapt_ip problem.',j,k,n,next
+                  call exitt
+               elseif (next.eq.loop_start) then
+                  x(next) = t1
+                  p(next) = -p(next)
+                  goto 10
+               else
+                  t2      =  x(next)
+                  x(next) =  t1
+                  t1      =  t2
+                  nextp   =  p(next)
+                  p(next) = -p(next)
+                  next    =  nextp
+               endif
+            enddo
+   10       continue
+         endif
+      enddo
+ 
+      do k=1,n
+         p(k) = -p(k)
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine glvadd(x,w,n)
+      real x(1),w(1)
+      call gop(x,w,'+  ',1)
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine add3s12(x,y,z,c1,c2,n)
+      real x(1),y(1),z(1),c1,c2
+      do i=1,n
+         x(i) = c1*y(i)+c2*z(i)
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      integer*8 function i8glmax(a,n)
+      integer*8 a(1),tmax
+      integer*8 tmp(1),work(1)
+      tmax= -999999
+      do i=1,n
+         tmax=max(tmax,a(i))
+      enddo
+      tmp(1)=tmax
+      call i8gop(tmp,work,'M  ',1)
+      i8glmax=tmp(1)
+      if (i8glmax .eq. -999999) i8glmax=0
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine admcol3(a,b,c,d,n)
+      REAL A(1),B(1),C(1),D
+C
+      DO 100 I=1,N
+         A(I)=A(I)+B(I)*C(I)*D
+  100 CONTINUE
+      return
+      END
+c-----------------------------------------------------------------------
+      subroutine add2col2(a,b,c,n)
+      real a(1),b(1),c(1)
+ 
+      do i=1,n
+         a(i) = a(i) + b(i)*c(i)
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine add2sxy(x,a,y,b,n)
+      real x(1),y(1)
+ 
+      do i=1,n
+         x(i) = a*x(i) + b*y(i)
+      enddo
+ 
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine col2s2(x,y,s,n)
+      real x(n),y(n)
+ 
+      do i=1,n
+         x(i)=s*x(i)*y(i)
+      enddo
+ 
+      return
+      end
+c-----------------------------------------------------------------------
+      INTEGER FUNCTION INDX1(S1,S2,L2)
+      CHARACTER*132 S1,S2
+ 
+      N1=132-L2+1
+      INDX1=0
+      IF (N1.LT.1) return
+ 
+      DO 100 I=1,N1
+         I2=I+L2-1
+         IF (S1(I:I2).EQ.S2(1:L2)) THEN
+            INDX1=I
+            return
+         ENDIF
+  100 CONTINUE
+ 
+      return
+      END
+c-----------------------------------------------------------------------
+
diff --git a/src/mpi_dummy.f b/src/mpi_dummy.f
new file mode 100644
index 0000000..f6257df
--- /dev/null
+++ b/src/mpi_dummy.f
@@ -0,0 +1,1053 @@
+c*********************************************************************72
+      subroutine mpi_scan(data1, data2, n, datatype,
+     &  operation, comm, ierror )
+
+      integer data1,data2  ! currently hardwired only for integer
+
+      data2 = data1
+
+      return
+      end
+
+c*********************************************************************72
+      subroutine mpi_abort ( comm, errorcode, ierror )
+
+c*********************************************************************72
+c
+cc MPI_ABORT shuts down the processes in a given communicator.
+c
+      implicit none
+
+      integer comm
+      integer errorcode
+      integer ierror
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_SUCCESS
+
+      write ( *, '(a)' ) ' '
+      write ( *, '(a)' ) 'MPI_ABORT:'
+      write ( *, '(a,i12)' ) 
+     &  '  Shut down with error code = ', errorcode
+
+      stop
+      end
+      subroutine mpi_allgather ( data1, nsend, sendtype, data2, 
+     &  nrecv, recvtype, comm, ierror )
+
+c*********************************************************************72
+c
+cc MPI_ALLGATHER gathers data from all the processes in a communicator.
+c
+      implicit none
+
+      include "mpi_dummy.h"
+
+      integer nsend
+
+      integer comm
+      integer data1(nsend)
+      integer data2(nsend)
+      integer ierror
+      integer nrecv
+      integer recvtype
+      integer sendtype
+
+      ierror = MPI_SUCCESS
+
+      if ( sendtype .eq. mpi_double_precision ) then
+        call mpi_copy_double_precision ( data1, data2, nsend, ierror )
+      else if ( sendtype .eq. mpi_integer ) then
+        call mpi_copy_integer ( data1, data2, nsend, ierror )
+      else if ( sendtype .eq. mpi_real ) then
+        call mpi_copy_real ( data1, data2, nsend, ierror )
+      else
+        ierror = MPI_FAILURE
+      end if
+
+      return
+      end
+      subroutine mpi_allgatherv ( data1, nsend, sendtype,
+     &  data2, nrecv, ndispls, recvtype, comm, ierror )
+
+c*********************************************************************72
+c
+cc MPI_ALLGATHERV gathers data from all the processes in a communicator.
+c
+      implicit none
+
+      include "mpi_dummy.h"
+
+      integer nsend
+
+      integer comm
+      integer data1(nsend)
+      integer data2(nsend)
+      integer ierror
+      integer ndispls
+      integer nrecv
+      integer recvtype
+      integer sendtype
+
+      ierror = MPI_SUCCESS
+
+      if ( sendtype .eq. mpi_double_precision ) then
+        call mpi_copy_double_precision ( data1, data2, nsend, ierror )
+      else if ( sendtype .eq. mpi_integer ) then
+        call mpi_copy_integer ( data1, data2, nsend, ierror )
+      else if ( sendtype .eq. mpi_real ) then
+        call mpi_copy_real ( data1, data2, nsend, ierror )
+      else
+        ierror = MPI_FAILURE
+      end if
+
+      return
+      end
+      subroutine mpi_allreduce ( data1, data2, n, datatype,
+     &  operation, comm, ierror )
+
+c*********************************************************************72
+c
+cc MPI_ALLREDUCE carries out a reduction operation.
+c
+      implicit none
+
+      include "mpi_dummy.h"
+
+      integer n
+
+      integer comm
+      integer data1(n)
+      integer data2(n)
+      integer datatype
+      integer ierror
+      integer operation
+
+      ierror = MPI_SUCCESS
+
+      if ( datatype .eq. mpi_double_precision ) then
+
+        call mpi_reduce_double_precision ( 
+     &    data1, data2, n, operation, ierror )
+
+      else if ( datatype .eq. mpi_integer ) then
+
+        call mpi_reduce_integer ( 
+     &    data1, data2, n, operation, ierror )
+
+      else if ( datatype .eq. mpi_integer8 ) then
+
+        call mpi_reduce_integer8( 
+     &    data1, data2, n, operation, ierror )
+
+      else if ( datatype .eq. mpi_real ) then
+
+        call mpi_reduce_real ( 
+     &    data1, data2, n, operation, ierror )
+
+      else
+
+        ierror = MPI_FAILURE
+
+      end if
+
+      return
+      end
+
+      subroutine mpi_barrier ( comm, ierror )
+
+c*********************************************************************72
+c
+cc MPI_BARRIER forces processes within a communicator to wait together.
+c
+      implicit none
+
+      integer comm
+      integer ierror
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_FAILURE
+
+      return
+      end
+      subroutine mpi_bcast ( data, n, datatype, node, comm, ierror )
+
+c*********************************************************************72
+c
+cc MPI_BCAST broadcasts data from one process to all others.
+c
+      implicit none
+
+      integer n
+
+      integer comm
+      integer data(n)
+      integer datatype
+      integer ierror
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+      integer node
+
+      ierror = MPI_SUCCESS
+
+      return
+      end
+      subroutine mpi_bsend ( data, n, datatype, iproc, itag,
+     &  comm, ierror )
+
+c*********************************************************************72
+c
+cc MPI_BSEND sends data from one process to another, using buffering.
+c
+      implicit none
+
+      integer n
+
+      integer comm
+      integer data(n)
+      integer datatype
+      integer ierror
+      integer iproc
+      integer itag
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_FAILURE
+
+      write ( *, '(a)' ) ' '
+      write ( *, '(a)' ) 'MPI_BSEND - Error!'
+      write ( *, '(a)' )  '  Should not send message to self.'
+
+      return
+      end
+      subroutine mpi_cart_create ( comm, ndims, dims, periods,
+     &  reorder, comm_cart, ierror )
+
+c*********************************************************************72
+c
+cc MPI_CART_CREATE creates a communicator for a Cartesian topology.
+c
+      implicit none
+
+      integer ndims
+
+      integer comm
+      integer comm_cart
+      integer dims(*)
+      integer ierror
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+      logical periods(*)
+      logical reorder
+
+      ierror = MPI_SUCCESS
+
+      return
+      end
+      subroutine mpi_cart_get ( comm, ndims, dims, periods,
+     &  coords, ierror )
+
+c*********************************************************************72
+c
+cc MPI_CART_GET returns the "Cartesian coordinates" of the calling process.
+c
+      implicit none
+
+      integer ndims
+
+      integer comm
+      integer coords(*)
+      integer dims(*)
+      integer i
+      integer ierror
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+      logical periods(*)
+
+      ierror = MPI_SUCCESS
+
+      do i = 1, ndims
+        coords(i) = 0
+      end do
+
+      return
+      end
+      subroutine mpi_cart_shift ( comm, idir, idisp, isource, 
+     &  idest, ierror )
+
+c*********************************************************************72
+c
+cc MPI_CART_SHIFT finds the destination and source for Cartesian shifts.
+c
+      implicit none
+
+      integer comm
+      integer idest
+      integer idir
+      integer idisp
+      integer ierror
+      integer isource
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_SUCCESS
+      isource = 0
+      idest = 0
+
+      return
+      end
+      subroutine mpi_comm_dup ( comm, comm_out, ierror )
+
+c*********************************************************************72
+c
+cc MPI_COMM_DUP duplicates a communicator.
+c
+      implicit none
+
+      integer comm
+      integer comm_out
+      integer ierror
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_SUCCESS
+      comm_out = comm
+
+      return
+      end
+      subroutine mpi_comm_free ( comm, ierror )
+
+c*********************************************************************72
+c
+cc MPI_COMM_FREE "frees" a communicator.
+c
+      implicit none
+
+      integer comm
+      integer ierror
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_SUCCESS
+
+      return
+      end
+      subroutine mpi_comm_rank ( comm, me, ierror )
+
+c*********************************************************************72
+c
+cc MPI_COMM_RANK reports the rank of the calling process.
+c
+      implicit none
+
+      integer comm
+      integer ierror
+      integer me
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_SUCCESS
+      me = 0
+
+      return
+      end
+      subroutine mpi_comm_size ( comm, nprocs, ierror )
+
+c*********************************************************************72
+c
+cc MPI_COMM_SIZE reports the number of processes in a communicator.
+c
+      implicit none
+
+      integer comm
+      integer ierror
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+      integer nprocs
+
+      ierror = MPI_SUCCESS
+      nprocs = 1
+
+      return
+      end
+      subroutine mpi_comm_split ( comm, icolor, ikey, comm_new,
+     &  ierror )
+
+c*********************************************************************72
+c
+cc MPI_COMM_SPLIT splits up a communicator based on a key.
+c
+      implicit none
+
+      integer comm
+      integer comm_new
+      integer icolor
+      integer ierror
+      integer ikey
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_SUCCESS
+
+      return
+      end
+      subroutine mpi_copy_double_precision ( data1, data2, n, ierror )
+
+c*********************************************************************72
+c
+cc MPI_COPY_DOUBLE copies a double precision vector.
+c
+      implicit none
+
+      integer n
+
+      double precision data1(n)
+      double precision data2(n)
+      integer i
+      integer ierror
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_SUCCESS
+
+      do i = 1, n
+        data2(i) = data1(i)
+      end do
+
+      return
+      end
+      subroutine mpi_copy_integer ( data1, data2, n, ierror )
+
+c*********************************************************************72
+c
+cc MPI_COPY_INTEGER copies an integer vector.
+c
+      implicit none
+
+      integer n
+
+      integer data1(n)
+      integer data2(n)
+      integer i
+      integer ierror
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_SUCCESS
+
+      do i = 1, n
+        data2(i) = data1(i)
+      end do
+
+      return
+      end
+      subroutine mpi_copy_real ( data1, data2, n, ierror )
+
+c*********************************************************************72
+c
+      implicit none
+
+      integer n
+
+      real data1(n)
+      real data2(n)
+      integer i
+      integer ierror
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_SUCCESS
+
+      do i = 1, n
+        data2(i) = data1(i)
+      end do
+
+      return
+      end
+      subroutine mpi_finalize ( ierror )
+
+c*********************************************************************72
+c
+cc MPI_FINALIZE shuts down the MPI library.
+c
+      implicit none
+
+      integer ierror
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_SUCCESS
+
+      return
+      end
+      subroutine mpi_get_count ( istatus, datatype, icount, ierror )
+
+c*********************************************************************72
+c
+cc MPI_GET_COUNT reports the actual number of items transmitted.
+c
+      implicit none
+
+      integer datatype
+      integer icount
+      integer ierror
+      integer istatus
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_FAILURE
+
+      write ( *, '(a)' ) ' '
+      write ( *, '(a)' ) 'MPI_GET_COUNT - Error!'
+      write ( *, '(a)' ) '  Should not query message from self.'
+
+      return
+      end
+      subroutine mpi_init ( ierror )
+
+c*********************************************************************72
+c
+cc MPI_INIT initializes the MPI library.
+c
+      implicit none
+
+      integer ierror
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_SUCCESS
+
+      return
+      end
+      subroutine mpi_irecv ( data, n, datatype, iproc, itag,
+     &  comm, irequest, ierror )
+
+c*********************************************************************72
+c
+cc MPI_IRECV receives data from another process.
+c
+      implicit none
+
+      integer n
+
+      integer comm
+      integer data(n)
+      integer datatype
+      integer ierror
+      integer iproc
+      integer irequest
+      integer itag
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_FAILURE
+
+      write ( *, '(a)' ) ' '
+      write ( *, '(a)' ) 'MPI_IRECV - Error!'
+      write ( *, '(a)' ) '  Should not recv message from self.'
+
+      return
+      end
+      subroutine mpi_isend ( data, n, datatype, iproc, itag,
+     &  comm, request, ierror )
+
+c*********************************************************************72
+c
+cc MPI_ISEND sends data from one process to another using nonblocking transmission.
+c
+      implicit none
+
+      integer n
+
+      integer comm
+      integer data(n)
+      integer datatype
+      integer ierror
+      integer iproc
+      integer itag
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+      integer request
+
+      request = 0
+      ierror = MPI_FAILURE
+
+      write ( *, '(a)' ) ' '
+      write ( *, '(a)' ) 'MPI_ISEND - Error!'
+      write ( *, '(a)' )  '  Should not send message to self.'
+
+      return
+      end
+      subroutine mpi_recv ( data, n, datatype, iproc, itag,
+     &  comm, istatus, ierror )
+
+c*********************************************************************72
+c
+cc MPI_RECV receives data from another process within a communicator.
+c
+      implicit none
+
+      integer n
+
+      integer comm
+      integer data(n)
+      integer datatype
+      integer ierror
+      integer iproc
+      integer istatus
+      integer itag
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_FAILURE
+
+      write ( *, '(a)' ) ' '
+      write ( *, '(a)' ) 'MPI_RECV - Error!'
+      write ( *, '(a)' ) '  Should not recv message from self.'
+
+      return
+      end
+      subroutine mpi_reduce ( data1, data2, n, datatype, operation,
+     &  receiver, comm, ierror )
+
+c*********************************************************************72
+c
+cc MPI_REDUCE carries out a reduction operation.
+c
+      implicit none
+
+      include "mpi_dummy.h"
+
+      integer n
+
+      integer comm
+      integer data1(n)
+      integer data2
+      integer datatype
+      integer ierror
+      integer operation
+      integer receiver
+
+      ierror = MPI_SUCCESS
+
+      if ( datatype .eq. mpi_double_precision ) then
+
+        call mpi_reduce_double_precision ( 
+     &    data1, data2, n, operation, ierror )
+
+      else if ( datatype .eq. mpi_integer ) then
+
+        call mpi_reduce_integer ( 
+     &    data1, data2, n, operation, ierror )
+
+      else if ( datatype .eq. mpi_real ) then
+
+        call mpi_reduce_real ( 
+     &    data1, data2, n, operation, ierror )
+
+      else
+
+        ierror = MPI_FAILURE
+
+      end if
+
+      return
+      end
+      subroutine mpi_reduce_double_precision ( 
+     &  data1, data2, n, operation, ierror )
+
+c*********************************************************************72
+c
+cc MPI_REDUCE_DOUBLE_PRECISION carries out a reduction operation on double precision values.
+c
+      implicit none
+
+      include "mpi_dummy.h"
+
+      integer n
+
+      double precision data1(n)
+      double precision data2(n)
+      integer i
+      integer ierror
+      integer operation
+
+
+      ierror = MPI_SUCCESS
+
+      do i = 1, n
+        data2(i) = data1(i)
+      end do
+
+      return
+      end
+
+      subroutine mpi_reduce_integer8 ( 
+     &  data1, data2, n, operation, ierror )
+
+c*********************************************************************72
+c
+      implicit none
+
+      include "mpi_dummy.h"
+
+      integer n
+
+      integer*8 data1(n)
+      integer*8 data2(n)
+      integer i
+      integer ierror
+      integer operation
+
+      ierror = MPI_SUCCESS
+
+      do i = 1, n
+         data2(i) = data1(i)
+      end do
+
+      ierror = MPI_FAILURE
+
+      return
+      end
+ 
+      subroutine mpi_reduce_integer ( 
+     &  data1, data2, n, operation, ierror )
+
+c*********************************************************************72
+c
+      implicit none
+
+      include "mpi_dummy.h"
+
+      integer n
+
+      integer data1(n)
+      integer data2(n)
+      integer i
+      integer ierror
+      integer operation
+
+      ierror = MPI_SUCCESS
+
+      do i = 1, n
+         data2(i) = data1(i)
+      end do
+
+      ierror = MPI_FAILURE
+
+      return
+      end
+
+      subroutine mpi_reduce_real ( 
+     &  data1, data2, n, operation, ierror )
+
+c*********************************************************************72
+c
+cc MPI_REDUCE_REAL carries out a reduction operation on reals.
+c
+c  Discussion:
+c
+      implicit none
+
+      include "mpi_dummy.h"
+
+      integer n
+
+      real data1(n)
+      real data2(n)
+      integer i
+      integer ierror
+      integer operation
+
+      ierror = MPI_SUCCESS
+
+        do i = 1, n
+          data2(i) = data1(i)
+        end do
+
+      return
+      end
+      subroutine mpi_reduce_scatter ( data1, data2, n, datatype,
+     &  operation, comm, ierror )
+
+c*********************************************************************72
+c
+cc MPI_REDUCE_SCATTER collects a message of the same length from each process.
+c
+      implicit none
+
+      include "mpi_dummy.h"
+
+      integer n
+
+      integer comm
+      integer data1(n)
+      integer data2(n)
+      integer datatype
+      integer ierror
+      integer operation
+
+      ierror = MPI_SUCCESS
+
+      if ( datatype .eq. mpi_double_precision ) then
+        call mpi_copy_double_precision ( data1, data2, n, ierror )
+      else if ( datatype .eq. mpi_integer ) then
+        call mpi_copy_integer ( data1, data2, n, ierror )
+      else if ( datatype .eq. mpi_real ) then
+        call mpi_copy_real ( data1, data2, n, ierror )
+      else
+        ierror = MPI_FAILURE
+      end if
+
+      return
+      end
+      subroutine mpi_rsend ( data, n, datatype, iproc, itag,
+     &  comm, ierror )
+
+c*********************************************************************72
+c
+cc MPI_RSEND "ready sends" data from one process to another.
+c
+      implicit none
+
+      integer n
+
+      integer comm
+      integer data(n)
+      integer datatype
+      integer ierror
+      integer iproc
+      integer itag
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_FAILURE
+
+      write ( *, '(a)' ) ' '
+      write ( *, '(a)' ) 'MPI_RSEND - Error!'
+      write ( *, '(a)' ) '  Should not send message to self.'
+
+      return
+      end
+      subroutine mpi_send ( data, n, datatype, iproc, itag,
+     &  comm, ierror )
+
+c*********************************************************************72
+c
+cc MPI_SEND sends data from one process to another.
+c
+      implicit none
+
+      integer n
+
+      integer comm
+      integer data(n)
+      integer datatype
+      integer ierror
+      integer iproc
+      integer itag
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_FAILURE
+
+      write ( *, '(a)' ) ' '
+      write ( *, '(a)' ) 'MPI_SEND - Error!'
+      write ( *, '(a)' )  '  Should not send message to self.'
+
+      return
+      end
+      subroutine mpi_wait ( irequest, istatus, ierror )
+
+c*********************************************************************72
+c
+cc MPI_WAIT waits for an I/O request to complete.
+c
+      implicit none
+
+      integer ierror
+      integer irequest
+      integer istatus
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_FAILURE
+
+      write ( *, '(a)' ) ' '
+      write ( *, '(a)' ) 'MPI_WAIT - Error!'
+      write ( *, '(a)' ) '  Should not wait on message from self.'
+
+      return
+      end
+      subroutine mpi_waitall ( icount, irequest, istatus, ierror )
+
+c*********************************************************************72
+c
+cc MPI_WAITALL waits until all I/O requests have completed.
+c
+      implicit none
+
+      integer icount
+      integer ierror
+      integer irequest
+      integer istatus
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_FAILURE
+
+      write ( *, '(a)' ) ' '
+      write ( *, '(a)' ) 'MPI_WAITALL - Error!'
+      write ( *, '(a)' ) '  Should not wait on message from self.'
+
+      return
+      end
+      subroutine mpi_waitany ( icount, array_of_requests, index, 
+     &  istatus, ierror )
+
+c*********************************************************************72
+c
+cc MPI_WAITANY waits until one I/O requests has completed.
+c
+      implicit none
+
+      integer array_of_requests(*)
+      integer icount
+      integer ierror
+      integer index
+      integer istatus
+      integer MPI_FAILURE
+      parameter ( MPI_FAILURE = 1 )
+      integer MPI_SUCCESS
+      parameter ( MPI_SUCCESS = 0 )
+
+      ierror = MPI_FAILURE
+
+      write ( *, '(a)' ) ' '
+      write ( *, '(a)' ) 'MPI_WAITANY - Error!'
+      write ( *, '(a)' ) '  Should not wait on message from self.'
+
+      return
+      end
+      function mpi_wtick ( )
+
+c*********************************************************************72
+c
+cc MPI_WTICK returns the time between clock ticks.
+c
+      implicit none
+      
+      double precision mpi_wtick
+      
+      mpi_wtick = 1.0D+00
+      
+      return
+      end
+      function mpi_wtime ( )
+
+c*********************************************************************72
+c
+cc MPI_WTIME returns the elapsed wall clock time.
+c
+      implicit none
+
+      real*8  mpi_wtime
+      real*8 a
+      integer*8 countval, countrate, countmax
+
+      call system_clock(countval, countrate, countmax)
+      a = countval
+      mpi_wtime = a/countrate
+
+      return
+      end
+
+      subroutine mpi_initialized(mpi_is_initialized, ierr)
+
+      mpi_is_initialized = 0 
+      ierr = 0
+
+      return
+      end
+
+      subroutine mpi_comm_create(icomm,igroup,icommd,ierr)
+
+      icommd = 1
+
+      return
+      end
+
+      subroutine mpi_comm_group(icomm,igroup,ierr)
+
+      igroup = 1
+      ierr = 0
+
+      return
+      end
+
+      subroutine mpi_group_free
+
+      return
+      end
+
+      subroutine mpi_attr_get(icomm,ikey,ival,iflag,ierr)
+ 
+      logical iflag
+
+      ival =  999 999 999  ! dummy
+ 
+      return
+      end
+c-----------------------------------------------------------------------
diff --git a/src/mpi_dummy.h b/src/mpi_dummy.h
new file mode 100644
index 0000000..0a92b81
--- /dev/null
+++ b/src/mpi_dummy.h
@@ -0,0 +1,61 @@
+c
+c  Dummy parameters for MPI F77 stubs
+c
+      integer mpi_comm_world
+      parameter ( mpi_comm_world = 0 )
+c
+c  Return values.
+c
+      integer mpi_failure
+      parameter ( mpi_failure = 1 )
+      integer mpi_success
+      parameter ( mpi_success = 0 )
+c
+c  recv message status
+c
+      integer mpi_status_size
+      parameter ( mpi_status_size = 3 )
+      integer mpi_source
+      parameter ( mpi_source = 1 )
+      integer mpi_tag
+      parameter ( mpi_tag = 2 )
+      integer mpi_count
+      parameter ( mpi_count = 3 )
+c
+c  recv flags
+c
+      integer mpi_any_source
+      parameter ( mpi_any_source = -1 )
+      integer mpi_any_tag
+      parameter ( mpi_any_tag = -1 )
+c
+c  data types and sizes
+c
+      integer mpi_integer
+      parameter ( mpi_integer = 1 )
+      integer mpi_integer8
+      parameter ( mpi_integer8 = 6 )
+      integer mpi_real
+      parameter ( mpi_real = 2 )
+      integer mpi_double_precision
+      parameter ( mpi_double_precision = 3 )
+      integer mpi_logical
+      parameter ( mpi_logical = 4 )
+      integer mpi_character
+      parameter ( mpi_character = 5 )
+c
+c  allreduce operations
+c
+      integer mpi_sum
+      parameter ( mpi_sum = 1 )
+      integer mpi_max
+      parameter ( mpi_max = 2 )
+      integer mpi_min
+      parameter ( mpi_min = 3 )
+      integer mpi_product
+      parameter ( mpi_product = 4 )
+c
+c  timer
+c
+      external mpi_wtime
+      real*8 mpi_wtime
diff --git a/src/mxm_std.f b/src/mxm_std.f
new file mode 100644
index 0000000..5e21cb3
--- /dev/null
+++ b/src/mxm_std.f
@@ -0,0 +1,4123 @@
+      subroutine mxmf2(A,N1,B,N2,C,N3)
+c
+c     unrolled loop version 
+c
+      real a(n1,n2),b(n2,n3),c(n1,n3)
+
+      if (n2.le.8) then
+         if (n2.eq.1) then
+            call mxf1(a,n1,b,n2,c,n3)
+         elseif (n2.eq.2) then
+            call mxf2(a,n1,b,n2,c,n3)
+         elseif (n2.eq.3) then
+            call mxf3(a,n1,b,n2,c,n3)
+         elseif (n2.eq.4) then
+            call mxf4(a,n1,b,n2,c,n3)
+         elseif (n2.eq.5) then
+            call mxf5(a,n1,b,n2,c,n3)
+         elseif (n2.eq.6) then
+            call mxf6(a,n1,b,n2,c,n3)
+         elseif (n2.eq.7) then
+            call mxf7(a,n1,b,n2,c,n3)
+         else
+            call mxf8(a,n1,b,n2,c,n3)
+         endif
+      elseif (n2.le.16) then
+         if (n2.eq.9) then
+            call mxf9(a,n1,b,n2,c,n3)
+         elseif (n2.eq.10) then
+            call mxf10(a,n1,b,n2,c,n3)
+         elseif (n2.eq.11) then
+            call mxf11(a,n1,b,n2,c,n3)
+         elseif (n2.eq.12) then
+            call mxf12(a,n1,b,n2,c,n3)
+         elseif (n2.eq.13) then
+            call mxf13(a,n1,b,n2,c,n3)
+         elseif (n2.eq.14) then
+            call mxf14(a,n1,b,n2,c,n3)
+         elseif (n2.eq.15) then
+            call mxf15(a,n1,b,n2,c,n3)
+         else
+            call mxf16(a,n1,b,n2,c,n3)
+         endif
+      elseif (n2.le.24) then
+         if (n2.eq.17) then
+            call mxf17(a,n1,b,n2,c,n3)
+         elseif (n2.eq.18) then
+            call mxf18(a,n1,b,n2,c,n3)
+         elseif (n2.eq.19) then
+            call mxf19(a,n1,b,n2,c,n3)
+         elseif (n2.eq.20) then
+            call mxf20(a,n1,b,n2,c,n3)
+         elseif (n2.eq.21) then
+            call mxf21(a,n1,b,n2,c,n3)
+         elseif (n2.eq.22) then
+            call mxf22(a,n1,b,n2,c,n3)
+         elseif (n2.eq.23) then
+            call mxf23(a,n1,b,n2,c,n3)
+         elseif (n2.eq.24) then
+            call mxf24(a,n1,b,n2,c,n3)
+         endif
+      else
+         call mxm44_0(a,n1,b,n2,c,n3)
+      endif
+c
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf1(a,n1,b,n2,c,n3)
+c
+      real a(n1,1),b(1,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf2(a,n1,b,n2,c,n3)
+c
+      real a(n1,2),b(2,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf3(a,n1,b,n2,c,n3)
+c
+      real a(n1,3),b(3,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf4(a,n1,b,n2,c,n3)
+c
+      real a(n1,4),b(4,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf5(a,n1,b,n2,c,n3)
+c
+      real a(n1,5),b(5,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf6(a,n1,b,n2,c,n3)
+c
+      real a(n1,6),b(6,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf7(a,n1,b,n2,c,n3)
+c
+      real a(n1,7),b(7,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf8(a,n1,b,n2,c,n3)
+c
+      real a(n1,8),b(8,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf9(a,n1,b,n2,c,n3)
+c
+      real a(n1,9),b(9,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf10(a,n1,b,n2,c,n3)
+c
+      real a(n1,10),b(10,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf11(a,n1,b,n2,c,n3)
+c
+      real a(n1,11),b(11,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf12(a,n1,b,n2,c,n3)
+c
+      real a(n1,12),b(12,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf13(a,n1,b,n2,c,n3)
+c
+      real a(n1,13),b(13,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf14(a,n1,b,n2,c,n3)
+c
+      real a(n1,14),b(14,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf15(a,n1,b,n2,c,n3)
+c
+      real a(n1,15),b(15,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf16(a,n1,b,n2,c,n3)
+c
+      real a(n1,16),b(16,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf17(a,n1,b,n2,c,n3)
+c
+      real a(n1,17),b(17,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf18(a,n1,b,n2,c,n3)
+c
+      real a(n1,18),b(18,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf19(a,n1,b,n2,c,n3)
+c
+      real a(n1,19),b(19,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf20(a,n1,b,n2,c,n3)
+c
+      real a(n1,20),b(20,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+     $             + a(i,20)*b(20,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf21(a,n1,b,n2,c,n3)
+c
+      real a(n1,21),b(21,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+     $             + a(i,20)*b(20,j)
+     $             + a(i,21)*b(21,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf22(a,n1,b,n2,c,n3)
+c
+      real a(n1,22),b(22,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+     $             + a(i,20)*b(20,j)
+     $             + a(i,21)*b(21,j)
+     $             + a(i,22)*b(22,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf23(a,n1,b,n2,c,n3)
+c
+      real a(n1,23),b(23,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+     $             + a(i,20)*b(20,j)
+     $             + a(i,21)*b(21,j)
+     $             + a(i,22)*b(22,j)
+     $             + a(i,23)*b(23,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxf24(a,n1,b,n2,c,n3)
+c
+      real a(n1,24),b(24,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+     $             + a(i,20)*b(20,j)
+     $             + a(i,21)*b(21,j)
+     $             + a(i,22)*b(22,j)
+     $             + a(i,23)*b(23,j)
+     $             + a(i,24)*b(24,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxm44_0(a, m, b, k, c, n)
+c
+c matrix multiply with a 4x4 pencil 
+c
+      real a(m,k), b(k,n), c(m,n)
+      real s11, s12, s13, s14, s21, s22, s23, s24
+      real s31, s32, s33, s34, s41, s42, s43, s44
+
+      mresid = iand(m,3) 
+      nresid = iand(n,3) 
+      m1 = m - mresid + 1
+      n1 = n - nresid + 1
+
+      do i=1,m-mresid,4
+        do j=1,n-nresid,4
+          s11 = 0.0d0
+          s21 = 0.0d0
+          s31 = 0.0d0
+          s41 = 0.0d0
+          s12 = 0.0d0
+          s22 = 0.0d0
+          s32 = 0.0d0
+          s42 = 0.0d0
+          s13 = 0.0d0
+          s23 = 0.0d0
+          s33 = 0.0d0
+          s43 = 0.0d0
+          s14 = 0.0d0
+          s24 = 0.0d0
+          s34 = 0.0d0
+          s44 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(i,l)*b(l,j)
+            s12 = s12 + a(i,l)*b(l,j+1)
+            s13 = s13 + a(i,l)*b(l,j+2)
+            s14 = s14 + a(i,l)*b(l,j+3)
+
+            s21 = s21 + a(i+1,l)*b(l,j)
+            s22 = s22 + a(i+1,l)*b(l,j+1)
+            s23 = s23 + a(i+1,l)*b(l,j+2)
+            s24 = s24 + a(i+1,l)*b(l,j+3)
+
+            s31 = s31 + a(i+2,l)*b(l,j)
+            s32 = s32 + a(i+2,l)*b(l,j+1)
+            s33 = s33 + a(i+2,l)*b(l,j+2)
+            s34 = s34 + a(i+2,l)*b(l,j+3)
+
+            s41 = s41 + a(i+3,l)*b(l,j)
+            s42 = s42 + a(i+3,l)*b(l,j+1)
+            s43 = s43 + a(i+3,l)*b(l,j+2)
+            s44 = s44 + a(i+3,l)*b(l,j+3)
+          enddo
+          c(i,j)     = s11 
+          c(i,j+1)   = s12 
+          c(i,j+2)   = s13
+          c(i,j+3)   = s14
+
+          c(i+1,j)   = s21 
+          c(i+2,j)   = s31 
+          c(i+3,j)   = s41 
+
+          c(i+1,j+1) = s22
+          c(i+2,j+1) = s32
+          c(i+3,j+1) = s42
+
+          c(i+1,j+2) = s23
+          c(i+2,j+2) = s33
+          c(i+3,j+2) = s43
+
+          c(i+1,j+3) = s24
+          c(i+2,j+3) = s34
+          c(i+3,j+3) = s44
+        enddo
+* Residual when n is not multiple of 4
+        if (nresid .ne. 0) then
+          if (nresid .eq. 1) then
+            s11 = 0.0d0
+            s21 = 0.0d0
+            s31 = 0.0d0
+            s41 = 0.0d0
+            do l=1,k
+              s11 = s11 + a(i,l)*b(l,n)
+              s21 = s21 + a(i+1,l)*b(l,n)
+              s31 = s31 + a(i+2,l)*b(l,n)
+              s41 = s41 + a(i+3,l)*b(l,n)
+            enddo
+            c(i,n)     = s11 
+            c(i+1,n)   = s21 
+            c(i+2,n)   = s31 
+            c(i+3,n)   = s41 
+          elseif (nresid .eq. 2) then
+            s11 = 0.0d0
+            s21 = 0.0d0
+            s31 = 0.0d0
+            s41 = 0.0d0
+            s12 = 0.0d0
+            s22 = 0.0d0
+            s32 = 0.0d0
+            s42 = 0.0d0
+            do l=1,k
+              s11 = s11 + a(i,l)*b(l,j)
+              s12 = s12 + a(i,l)*b(l,j+1)
+
+              s21 = s21 + a(i+1,l)*b(l,j)
+              s22 = s22 + a(i+1,l)*b(l,j+1)
+
+              s31 = s31 + a(i+2,l)*b(l,j)
+              s32 = s32 + a(i+2,l)*b(l,j+1)
+
+              s41 = s41 + a(i+3,l)*b(l,j)
+              s42 = s42 + a(i+3,l)*b(l,j+1)
+            enddo
+            c(i,j)     = s11 
+            c(i,j+1)   = s12
+
+            c(i+1,j)   = s21 
+            c(i+2,j)   = s31 
+            c(i+3,j)   = s41 
+
+            c(i+1,j+1) = s22
+            c(i+2,j+1) = s32
+            c(i+3,j+1) = s42
+          else
+            s11 = 0.0d0
+            s21 = 0.0d0
+            s31 = 0.0d0
+            s41 = 0.0d0
+            s12 = 0.0d0
+            s22 = 0.0d0
+            s32 = 0.0d0
+            s42 = 0.0d0
+            s13 = 0.0d0
+            s23 = 0.0d0
+            s33 = 0.0d0
+            s43 = 0.0d0
+            do l=1,k
+              s11 = s11 + a(i,l)*b(l,j)
+              s12 = s12 + a(i,l)*b(l,j+1)
+              s13 = s13 + a(i,l)*b(l,j+2)
+
+              s21 = s21 + a(i+1,l)*b(l,j)
+              s22 = s22 + a(i+1,l)*b(l,j+1)
+              s23 = s23 + a(i+1,l)*b(l,j+2)
+
+              s31 = s31 + a(i+2,l)*b(l,j)
+              s32 = s32 + a(i+2,l)*b(l,j+1)
+              s33 = s33 + a(i+2,l)*b(l,j+2)
+
+              s41 = s41 + a(i+3,l)*b(l,j)
+              s42 = s42 + a(i+3,l)*b(l,j+1)
+              s43 = s43 + a(i+3,l)*b(l,j+2)
+            enddo
+            c(i,j)     = s11 
+            c(i+1,j)   = s21 
+            c(i+2,j)   = s31 
+            c(i+3,j)   = s41 
+            c(i,j+1)   = s12 
+            c(i+1,j+1) = s22
+            c(i+2,j+1) = s32
+            c(i+3,j+1) = s42
+            c(i,j+2)   = s13
+            c(i+1,j+2) = s23
+            c(i+2,j+2) = s33
+            c(i+3,j+2) = s43
+          endif
+        endif
+      enddo
+
+* Residual when m is not multiple of 4
+      if (mresid .eq. 0) then
+        return
+      elseif (mresid .eq. 1) then
+        do j=1,n-nresid,4
+          s11 = 0.0d0
+          s12 = 0.0d0
+          s13 = 0.0d0
+          s14 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m,l)*b(l,j)
+            s12 = s12 + a(m,l)*b(l,j+1)
+            s13 = s13 + a(m,l)*b(l,j+2)
+            s14 = s14 + a(m,l)*b(l,j+3)
+          enddo
+          c(m,j)     = s11 
+          c(m,j+1)   = s12 
+          c(m,j+2)   = s13
+          c(m,j+3)   = s14
+        enddo
+* mresid is 1, check nresid
+        if (nresid .eq. 0) then
+          return
+        elseif (nresid .eq. 1) then
+          s11 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m,l)*b(l,n)
+          enddo
+          c(m,n) = s11
+          return
+        elseif (nresid .eq. 2) then
+          s11 = 0.0d0
+          s12 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m,l)*b(l,n-1)
+            s12 = s12 + a(m,l)*b(l,n)
+          enddo
+          c(m,n-1) = s11
+          c(m,n) = s12
+          return
+        else
+          s11 = 0.0d0
+          s12 = 0.0d0
+          s13 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m,l)*b(l,n-2)
+            s12 = s12 + a(m,l)*b(l,n-1)
+            s13 = s13 + a(m,l)*b(l,n)
+          enddo
+          c(m,n-2) = s11
+          c(m,n-1) = s12
+          c(m,n) = s13
+          return
+        endif          
+      elseif (mresid .eq. 2) then
+        do j=1,n-nresid,4
+          s11 = 0.0d0
+          s12 = 0.0d0
+          s13 = 0.0d0
+          s14 = 0.0d0
+          s21 = 0.0d0
+          s22 = 0.0d0
+          s23 = 0.0d0
+          s24 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m-1,l)*b(l,j)
+            s12 = s12 + a(m-1,l)*b(l,j+1)
+            s13 = s13 + a(m-1,l)*b(l,j+2)
+            s14 = s14 + a(m-1,l)*b(l,j+3)
+
+            s21 = s21 + a(m,l)*b(l,j)
+            s22 = s22 + a(m,l)*b(l,j+1)
+            s23 = s23 + a(m,l)*b(l,j+2)
+            s24 = s24 + a(m,l)*b(l,j+3)
+          enddo
+          c(m-1,j)   = s11 
+          c(m-1,j+1) = s12 
+          c(m-1,j+2) = s13
+          c(m-1,j+3) = s14
+          c(m,j)     = s21
+          c(m,j+1)   = s22 
+          c(m,j+2)   = s23
+          c(m,j+3)   = s24
+        enddo
+* mresid is 2, check nresid
+        if (nresid .eq. 0) then
+          return
+        elseif (nresid .eq. 1) then
+          s11 = 0.0d0
+          s21 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m-1,l)*b(l,n)
+            s21 = s21 + a(m,l)*b(l,n)
+          enddo
+          c(m-1,n) = s11
+          c(m,n) = s21
+          return
+        elseif (nresid .eq. 2) then
+          s11 = 0.0d0
+          s21 = 0.0d0
+          s12 = 0.0d0
+          s22 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m-1,l)*b(l,n-1)
+            s12 = s12 + a(m-1,l)*b(l,n)
+            s21 = s21 + a(m,l)*b(l,n-1)
+            s22 = s22 + a(m,l)*b(l,n)
+          enddo
+          c(m-1,n-1) = s11
+          c(m-1,n)   = s12
+          c(m,n-1)   = s21
+          c(m,n)     = s22
+          return
+        else
+          s11 = 0.0d0
+          s21 = 0.0d0
+          s12 = 0.0d0
+          s22 = 0.0d0
+          s13 = 0.0d0
+          s23 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m-1,l)*b(l,n-2)
+            s12 = s12 + a(m-1,l)*b(l,n-1)
+            s13 = s13 + a(m-1,l)*b(l,n)
+            s21 = s21 + a(m,l)*b(l,n-2)
+            s22 = s22 + a(m,l)*b(l,n-1)
+            s23 = s23 + a(m,l)*b(l,n)
+          enddo
+          c(m-1,n-2) = s11
+          c(m-1,n-1) = s12
+          c(m-1,n)   = s13
+          c(m,n-2)   = s21
+          c(m,n-1)   = s22
+          c(m,n)     = s23
+          return
+        endif
+      else
+* mresid is 3
+        do j=1,n-nresid,4
+          s11 = 0.0d0
+          s21 = 0.0d0
+          s31 = 0.0d0
+
+          s12 = 0.0d0
+          s22 = 0.0d0
+          s32 = 0.0d0
+
+          s13 = 0.0d0
+          s23 = 0.0d0
+          s33 = 0.0d0
+
+          s14 = 0.0d0
+          s24 = 0.0d0
+          s34 = 0.0d0
+
+          do l=1,k
+            s11 = s11 + a(m-2,l)*b(l,j)
+            s12 = s12 + a(m-2,l)*b(l,j+1)
+            s13 = s13 + a(m-2,l)*b(l,j+2)
+            s14 = s14 + a(m-2,l)*b(l,j+3)
+
+            s21 = s21 + a(m-1,l)*b(l,j)
+            s22 = s22 + a(m-1,l)*b(l,j+1)
+            s23 = s23 + a(m-1,l)*b(l,j+2)
+            s24 = s24 + a(m-1,l)*b(l,j+3)
+
+            s31 = s31 + a(m,l)*b(l,j)
+            s32 = s32 + a(m,l)*b(l,j+1)
+            s33 = s33 + a(m,l)*b(l,j+2)
+            s34 = s34 + a(m,l)*b(l,j+3)
+          enddo
+          c(m-2,j)   = s11 
+          c(m-2,j+1) = s12 
+          c(m-2,j+2) = s13
+          c(m-2,j+3) = s14
+
+          c(m-1,j)   = s21 
+          c(m-1,j+1) = s22
+          c(m-1,j+2) = s23
+          c(m-1,j+3) = s24
+
+          c(m,j)     = s31 
+          c(m,j+1)   = s32
+          c(m,j+2)   = s33
+          c(m,j+3)   = s34
+        enddo
+* mresid is 3, check nresid
+        if (nresid .eq. 0) then
+          return
+        elseif (nresid .eq. 1) then
+          s11 = 0.0d0
+          s21 = 0.0d0
+          s31 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m-2,l)*b(l,n)
+            s21 = s21 + a(m-1,l)*b(l,n)
+            s31 = s31 + a(m,l)*b(l,n)
+          enddo
+          c(m-2,n) = s11
+          c(m-1,n) = s21
+          c(m,n) = s31
+          return
+        elseif (nresid .eq. 2) then
+          s11 = 0.0d0
+          s21 = 0.0d0
+          s31 = 0.0d0
+          s12 = 0.0d0
+          s22 = 0.0d0
+          s32 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m-2,l)*b(l,n-1)
+            s12 = s12 + a(m-2,l)*b(l,n)
+            s21 = s21 + a(m-1,l)*b(l,n-1)
+            s22 = s22 + a(m-1,l)*b(l,n)
+            s31 = s31 + a(m,l)*b(l,n-1)
+            s32 = s32 + a(m,l)*b(l,n)
+          enddo
+          c(m-2,n-1) = s11
+          c(m-2,n)   = s12
+          c(m-1,n-1) = s21
+          c(m-1,n)   = s22
+          c(m,n-1)   = s31
+          c(m,n)     = s32
+          return
+        else
+          s11 = 0.0d0
+          s21 = 0.0d0
+          s31 = 0.0d0
+          s12 = 0.0d0
+          s22 = 0.0d0
+          s32 = 0.0d0
+          s13 = 0.0d0
+          s23 = 0.0d0
+          s33 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m-2,l)*b(l,n-2)
+            s12 = s12 + a(m-2,l)*b(l,n-1)
+            s13 = s13 + a(m-2,l)*b(l,n)
+            s21 = s21 + a(m-1,l)*b(l,n-2)
+            s22 = s22 + a(m-1,l)*b(l,n-1)
+            s23 = s23 + a(m-1,l)*b(l,n)
+            s31 = s31 + a(m,l)*b(l,n-2)
+            s32 = s32 + a(m,l)*b(l,n-1)
+            s33 = s33 + a(m,l)*b(l,n)
+          enddo
+          c(m-2,n-2) = s11
+          c(m-2,n-1) = s12
+          c(m-2,n)   = s13
+          c(m-1,n-2) = s21
+          c(m-1,n-1) = s22
+          c(m-1,n)   = s23
+          c(m,n-2)   = s31
+          c(m,n-1)   = s32
+          c(m,n)     = s33
+          return
+        endif
+      endif
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxm44_2(a, m, b, k, c, n)
+      real a(m,2), b(2,n), c(m,n)
+
+      nresid = iand(n,3) 
+      n1 = n - nresid + 1
+
+      do j=1,n-nresid,4
+         do i=1,m
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+            c(i,j+1) = a(i,1)*b(1,j+1)
+     $             + a(i,2)*b(2,j+1)
+            c(i,j+2) = a(i,1)*b(1,j+2)
+     $             + a(i,2)*b(2,j+2)
+            c(i,j+3) = a(i,1)*b(1,j+3)
+     $             + a(i,2)*b(2,j+3)
+         enddo
+      enddo
+      if (nresid .eq. 0) then
+        return
+      elseif (nresid .eq. 1) then
+         do i=1,m
+            c(i,n) = a(i,1)*b(1,n)
+     $             + a(i,2)*b(2,n)
+         enddo
+      elseif (nresid .eq. 2) then
+         do i=1,m
+            c(i,n-1) = a(i,1)*b(1,n-1)
+     $             + a(i,2)*b(2,n-1)
+            c(i,n) = a(i,1)*b(1,n)
+     $             + a(i,2)*b(2,n)
+         enddo
+      else
+         do i=1,m
+            c(i,n-2) = a(i,1)*b(1,n-2)
+     $             + a(i,2)*b(2,n-2)
+            c(i,n-1) = a(i,1)*b(1,n-1)
+     $             + a(i,2)*b(2,n-1)
+            c(i,n) = a(i,1)*b(1,n)
+     $             + a(i,2)*b(2,n)
+         enddo
+      endif
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxm_test_all(nid,ivb)
+c
+c     Collect matrix-matrix product statistics
+c
+      external mxms,mxmur2,mxmur3,mxmd,mxmfb,mxmf3,mxmu4,mxmn2
+      external mxmk2,mxmtr,mxmrg,madd,mxm,mxm44
+c
+      parameter (nn=24)
+      parameter (nt=10)
+      character*5 c(3,nt)
+      real        s(nn,2,nt,3)
+      real        a(nn,2,nt,3)
+
+      call nekgsync
+
+      do k=1,3   ! 3 tests:  N^2 x N, NxN, NxN^2
+         call mxmtest(s(1,1, 1,k),nn,c(k, 1),mxm44 ,'mxm44',k,ivb)
+         call mxmtest(s(1,1, 2,k),nn,c(k, 2),mxms  ,' std ',k,ivb)
+         call mxmtest(s(1,1, 3,k),nn,c(k, 3),mxmur2,'mxmu2',k,ivb)
+         call mxmtest(s(1,1, 4,k),nn,c(k, 4),mxmur3,'mxmu3',k,ivb)
+         call mxmtest(s(1,1, 5,k),nn,c(k, 5),mxmd  ,'mxmd ',k,ivb)
+         call mxmtest(s(1,1, 6,k),nn,c(k, 6),mxmfb ,'mxmfb',k,ivb)
+         call mxmtest(s(1,1, 7,k),nn,c(k, 7),mxmu4 ,'mxmu4',k,ivb)
+         call mxmtest(s(1,1, 8,k),nn,c(k, 8),mxmf3 ,'mxmf3',k,ivb)
+         if (k.eq.2) ! Add works only for NxN case
+     $   call mxmtest(s(1,1, 9,k),nn,c(k, 9),madd  ,'madd ',k,ivb)
+         call mxmtest(s(1,1,10,k),nn,c(k,10),mxm   ,'mxm  ',k,ivb)
+      enddo
+
+      call nekgsync
+      if (nid.eq.0) call mxm_analyze(s,a,nn,c,nt,ivb)
+      call nekgsync
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine initab(a,b,n)
+      real a(1),b(1)
+      do i=1,n-1
+         x  = i
+         k = mod(i,19) + 2
+         l = mod(i,17) + 5
+         m = mod(i,31) + 3
+         a(i) = -.25*(a(i)+a(i+1)) + (x*x + k + l)/(x*x+m)
+         b(i) = -.25*(b(i)+b(i+1)) + (x*x + k + m)/(x*x+l)
+      enddo
+      a(n) = -.25*(a(n)+a(n)) + (x*x + k + l)/(x*x+m)
+      b(n) = -.25*(b(n)+b(n)) + (x*x + k + m)/(x*x+l)
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxms(a,n1,b,n2,c,n3)
+C----------------------------------------------------------------------
+C
+C     Matrix-vector product routine. 
+C     NOTE: Use assembly coded routine if available.
+C
+C---------------------------------------------------------------------
+      REAL A(N1,N2),B(N2,N3),C(N1,N3)
+C
+         N0=N1*N3
+         DO 10 I=1,N0
+            C(I,1)=0.
+ 10      CONTINUE
+         DO 100 J=1,N3
+         DO 100 K=1,N2
+         BB=B(K,J)
+         DO 100 I=1,N1
+            C(I,J)=C(I,J)+A(I,K)*BB
+ 100     CONTINUE
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmu4(a,n1,b,n2,c,n3)
+C----------------------------------------------------------------------
+C
+C     Matrix-vector product routine. 
+C     NOTE: Use assembly coded routine if available.
+C
+C---------------------------------------------------------------------
+      REAL A(N1,N2),B(N2,N3),C(N1,N3)
+C
+         N0=N1*N3
+         DO 10 I=1,N0
+            C(I,1)=0.
+ 10      CONTINUE
+         i1 = n1 - mod(n1,4) + 1
+            DO 100 J=1,N3
+            DO 100 K=1,N2
+            BB=B(K,J)
+               DO I=1,N1-3,4
+                  C(I  ,J)=C(I  ,J)+A(I  ,K)*BB
+                  C(I+1,J)=C(I+1,J)+A(I+1,K)*BB
+                  C(I+2,J)=C(I+2,J)+A(I+2,K)*BB
+                  C(I+3,J)=C(I+3,J)+A(I+3,K)*BB
+               enddo
+               DO i=i1,N1
+                  C(I  ,J)=C(I  ,J)+A(I  ,K)*BB
+               enddo
+ 100        CONTINUE
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine madd (a,n1,b,n2,c,n3)
+c
+      real a(n1,n2),b(n2,n3),c(n1,n3)
+c
+      do j=1,n3
+      do i=1,n1
+         c(i,j) = a(i,j)+b(i,j)
+      enddo
+      enddo
+c
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmUR2(a,n1,b,n2,c,n3)
+C----------------------------------------------------------------------
+C
+C     Matrix-vector product routine. 
+C     NOTE: Use assembly coded routine if available.
+C
+C---------------------------------------------------------------------
+      REAL A(N1,N2),B(N2,N3),C(N1,N3)
+C
+      if (n2.le.8) then
+         if (n2.eq.1) then
+            call mxmur2_1(a,n1,b,n2,c,n3)
+         elseif (n2.eq.2) then
+            call mxmur2_2(a,n1,b,n2,c,n3)
+         elseif (n2.eq.3) then
+            call mxmur2_3(a,n1,b,n2,c,n3)
+         elseif (n2.eq.4) then
+            call mxmur2_4(a,n1,b,n2,c,n3)
+         elseif (n2.eq.5) then
+            call mxmur2_5(a,n1,b,n2,c,n3)
+         elseif (n2.eq.6) then
+            call mxmur2_6(a,n1,b,n2,c,n3)
+         elseif (n2.eq.7) then
+            call mxmur2_7(a,n1,b,n2,c,n3)
+         else
+            call mxmur2_8(a,n1,b,n2,c,n3)
+         endif
+      elseif (n2.le.16) then
+         if (n2.eq.9) then
+            call mxmur2_9(a,n1,b,n2,c,n3)
+         elseif (n2.eq.10) then
+            call mxmur2_10(a,n1,b,n2,c,n3)
+         elseif (n2.eq.11) then
+            call mxmur2_11(a,n1,b,n2,c,n3)
+         elseif (n2.eq.12) then
+            call mxmur2_12(a,n1,b,n2,c,n3)
+         elseif (n2.eq.13) then
+            call mxmur2_13(a,n1,b,n2,c,n3)
+         elseif (n2.eq.14) then
+            call mxmur2_14(a,n1,b,n2,c,n3)
+         elseif (n2.eq.15) then
+            call mxmur2_15(a,n1,b,n2,c,n3)
+         else
+            call mxmur2_16(a,n1,b,n2,c,n3)
+         endif
+      else
+         N0=N1*N3
+         DO 10 I=1,N0
+            C(I,1)=0.
+ 10      CONTINUE
+         DO 100 J=1,N3
+         DO 100 K=1,N2
+         BB=B(K,J)
+         DO 100 I=1,N1
+            C(I,J)=C(I,J)+A(I,K)*BB
+ 100     CONTINUE
+      endif
+      return
+      end
+c
+      subroutine mxmur2_1(a,n1,b,n2,c,n3)
+c
+      real a(n1,1),b(1,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+         enddo
+      enddo
+      return
+      end
+      subroutine mxmur2_2(a,n1,b,n2,c,n3)
+c
+      real a(n1,2),b(2,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+         enddo
+      enddo
+      return
+      end
+      subroutine mxmur2_3(a,n1,b,n2,c,n3)
+c
+      real a(n1,3),b(3,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+         enddo
+      enddo
+      return
+      end
+      subroutine mxmur2_4(a,n1,b,n2,c,n3)
+c
+      real a(n1,4),b(4,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+         enddo
+      enddo
+      return
+      end
+      subroutine mxmur2_5(a,n1,b,n2,c,n3)
+c
+      real a(n1,5),b(5,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+         enddo
+      enddo
+      return
+      end
+      subroutine mxmur2_6(a,n1,b,n2,c,n3)
+c
+      real a(n1,6),b(6,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+         enddo
+      enddo
+      return
+      end
+      subroutine mxmur2_7(a,n1,b,n2,c,n3)
+c
+      real a(n1,7),b(7,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+         enddo
+      enddo
+      return
+      end
+      subroutine mxmur2_8(a,n1,b,n2,c,n3)
+c
+      real a(n1,8),b(8,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+         enddo
+      enddo
+      return
+      end
+      subroutine mxmur2_9(a,n1,b,n2,c,n3)
+c
+      real a(n1,9),b(9,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+         enddo
+      enddo
+      return
+      end
+      subroutine mxmur2_10(a,n1,b,n2,c,n3)
+c
+      real a(n1,10),b(10,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+         enddo
+      enddo
+      return
+      end
+      subroutine mxmur2_11(a,n1,b,n2,c,n3)
+c
+      real a(n1,11),b(11,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+         enddo
+      enddo
+      return
+      end
+      subroutine mxmur2_12(a,n1,b,n2,c,n3)
+c
+      real a(n1,12),b(12,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+         enddo
+      enddo
+      return
+      end
+      subroutine mxmur2_13(a,n1,b,n2,c,n3)
+c
+      real a(n1,13),b(13,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+         enddo
+      enddo
+      return
+      end
+      subroutine mxmur2_14(a,n1,b,n2,c,n3)
+c
+      real a(n1,14),b(14,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+         enddo
+      enddo
+      return
+      end
+      subroutine mxmur2_15(a,n1,b,n2,c,n3)
+c
+      real a(n1,15),b(15,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+         enddo
+      enddo
+      return
+      end
+      subroutine mxmur2_16(a,n1,b,n2,c,n3)
+c
+      real a(n1,16),b(16,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmUR3(a,n1,b,n2,c,n3)
+C----------------------------------------------------------------------
+C
+C     Matrix-vector product routine. 
+C     NOTE: Use assembly coded routine if available.
+C
+C---------------------------------------------------------------------
+      REAL A(N1,N2),B(N2,N3),C(N1,N3)
+C
+      N0=N1*N3
+      DO 10 I=1,N0
+         C(I,1)=0.
+ 10   CONTINUE
+      if (n3.le.8) then
+         if (n3.eq.1) then
+            call mxmur3_1(a,n1,b,n2,c,n3)
+         elseif (n3.eq.2) then
+            call mxmur3_2(a,n1,b,n2,c,n3)
+         elseif (n3.eq.3) then
+            call mxmur3_3(a,n1,b,n2,c,n3)
+         elseif (n3.eq.4) then
+            call mxmur3_4(a,n1,b,n2,c,n3)
+         elseif (n3.eq.5) then
+            call mxmur3_5(a,n1,b,n2,c,n3)
+         elseif (n3.eq.6) then
+            call mxmur3_6(a,n1,b,n2,c,n3)
+         elseif (n3.eq.7) then
+            call mxmur3_7(a,n1,b,n2,c,n3)
+         else
+            call mxmur3_8(a,n1,b,n2,c,n3)
+         endif
+      elseif (n3.le.16) then
+         if (n3.eq.9) then
+            call mxmur3_9(a,n1,b,n2,c,n3)
+         elseif (n3.eq.10) then
+            call mxmur3_10(a,n1,b,n2,c,n3)
+         elseif (n3.eq.11) then
+            call mxmur3_11(a,n1,b,n2,c,n3)
+         elseif (n3.eq.12) then
+            call mxmur3_12(a,n1,b,n2,c,n3)
+         elseif (n3.eq.13) then
+            call mxmur3_13(a,n1,b,n2,c,n3)
+         elseif (n3.eq.14) then
+            call mxmur3_14(a,n1,b,n2,c,n3)
+         elseif (n3.eq.15) then
+            call mxmur3_15(a,n1,b,n2,c,n3)
+         else
+            call mxmur3_16(a,n1,b,n2,c,n3)
+         endif
+      else
+         DO 100 J=1,N3
+         DO 100 K=1,N2
+         BB=B(K,J)
+         DO 100 I=1,N1
+            C(I,J)=C(I,J)+A(I,K)*BB
+ 100     CONTINUE
+      endif
+      return
+      end
+c
+      subroutine mxmur3_16(a,n1,b,n2,c,n3)
+      real a(n1,n2),b(n2,16),c(n1,16)
+c
+      do k=1,n2
+         tmp1  =  b(k, 1)
+         tmp2  =  b(k, 2)
+         tmp3  =  b(k, 3)
+         tmp4  =  b(k, 4)
+         tmp5  =  b(k, 5)
+         tmp6  =  b(k, 6)
+         tmp7  =  b(k, 7)
+         tmp8  =  b(k, 8)
+         tmp9  =  b(k, 9)
+         tmp10 =  b(k,10)
+         tmp11 =  b(k,11)
+         tmp12 =  b(k,12)
+         tmp13 =  b(k,13)
+         tmp14 =  b(k,14)
+         tmp15 =  b(k,15)
+         tmp16 =  b(k,16)
+         do i=1,n1
+            c(i, 1)  =  c(i, 1) + a(i,k) * tmp1
+            c(i, 2)  =  c(i, 2) + a(i,k) * tmp2
+            c(i, 3)  =  c(i, 3) + a(i,k) * tmp3
+            c(i, 4)  =  c(i, 4) + a(i,k) * tmp4
+            c(i, 5)  =  c(i, 5) + a(i,k) * tmp5
+            c(i, 6)  =  c(i, 6) + a(i,k) * tmp6
+            c(i, 7)  =  c(i, 7) + a(i,k) * tmp7
+            c(i, 8)  =  c(i, 8) + a(i,k) * tmp8
+            c(i, 9)  =  c(i, 9) + a(i,k) * tmp9
+            c(i,10)  =  c(i,10) + a(i,k) * tmp10
+            c(i,11)  =  c(i,11) + a(i,k) * tmp11
+            c(i,12)  =  c(i,12) + a(i,k) * tmp12
+            c(i,13)  =  c(i,13) + a(i,k) * tmp13
+            c(i,14)  =  c(i,14) + a(i,k) * tmp14
+            c(i,15)  =  c(i,15) + a(i,k) * tmp15
+            c(i,16)  =  c(i,16) + a(i,k) * tmp16
+         enddo
+c
+      enddo
+c
+      return
+      end
+      subroutine mxmur3_15(a,n1,b,n2,c,n3)
+      real a(n1,n2),b(n2,15),c(n1,15)
+c
+      do k=1,n2
+         tmp1  =  b(k, 1)
+         tmp2  =  b(k, 2)
+         tmp3  =  b(k, 3)
+         tmp4  =  b(k, 4)
+         tmp5  =  b(k, 5)
+         tmp6  =  b(k, 6)
+         tmp7  =  b(k, 7)
+         tmp8  =  b(k, 8)
+         tmp9  =  b(k, 9)
+         tmp10 =  b(k,10)
+         tmp11 =  b(k,11)
+         tmp12 =  b(k,12)
+         tmp13 =  b(k,13)
+         tmp14 =  b(k,14)
+         tmp15 =  b(k,15)
+         do i=1,n1
+            c(i, 1)  =  c(i, 1) + a(i,k) * tmp1
+            c(i, 2)  =  c(i, 2) + a(i,k) * tmp2
+            c(i, 3)  =  c(i, 3) + a(i,k) * tmp3
+            c(i, 4)  =  c(i, 4) + a(i,k) * tmp4
+            c(i, 5)  =  c(i, 5) + a(i,k) * tmp5
+            c(i, 6)  =  c(i, 6) + a(i,k) * tmp6
+            c(i, 7)  =  c(i, 7) + a(i,k) * tmp7
+            c(i, 8)  =  c(i, 8) + a(i,k) * tmp8
+            c(i, 9)  =  c(i, 9) + a(i,k) * tmp9
+            c(i,10)  =  c(i,10) + a(i,k) * tmp10
+            c(i,11)  =  c(i,11) + a(i,k) * tmp11
+            c(i,12)  =  c(i,12) + a(i,k) * tmp12
+            c(i,13)  =  c(i,13) + a(i,k) * tmp13
+            c(i,14)  =  c(i,14) + a(i,k) * tmp14
+            c(i,15)  =  c(i,15) + a(i,k) * tmp15
+         enddo
+c
+      enddo
+c
+      return
+      end
+      subroutine mxmur3_14(a,n1,b,n2,c,n3)
+      real a(n1,n2),b(n2,14),c(n1,14)
+c
+      do k=1,n2
+         tmp1  =  b(k, 1)
+         tmp2  =  b(k, 2)
+         tmp3  =  b(k, 3)
+         tmp4  =  b(k, 4)
+         tmp5  =  b(k, 5)
+         tmp6  =  b(k, 6)
+         tmp7  =  b(k, 7)
+         tmp8  =  b(k, 8)
+         tmp9  =  b(k, 9)
+         tmp10 =  b(k,10)
+         tmp11 =  b(k,11)
+         tmp12 =  b(k,12)
+         tmp13 =  b(k,13)
+         tmp14 =  b(k,14)
+         do i=1,n1
+            c(i, 1)  =  c(i, 1) + a(i,k) * tmp1
+            c(i, 2)  =  c(i, 2) + a(i,k) * tmp2
+            c(i, 3)  =  c(i, 3) + a(i,k) * tmp3
+            c(i, 4)  =  c(i, 4) + a(i,k) * tmp4
+            c(i, 5)  =  c(i, 5) + a(i,k) * tmp5
+            c(i, 6)  =  c(i, 6) + a(i,k) * tmp6
+            c(i, 7)  =  c(i, 7) + a(i,k) * tmp7
+            c(i, 8)  =  c(i, 8) + a(i,k) * tmp8
+            c(i, 9)  =  c(i, 9) + a(i,k) * tmp9
+            c(i,10)  =  c(i,10) + a(i,k) * tmp10
+            c(i,11)  =  c(i,11) + a(i,k) * tmp11
+            c(i,12)  =  c(i,12) + a(i,k) * tmp12
+            c(i,13)  =  c(i,13) + a(i,k) * tmp13
+            c(i,14)  =  c(i,14) + a(i,k) * tmp14
+         enddo
+c
+      enddo
+c
+      return
+      end
+      subroutine mxmur3_13(a,n1,b,n2,c,n3)
+      real a(n1,n2),b(n2,13),c(n1,13)
+c
+      do k=1,n2
+         tmp1  =  b(k, 1)
+         tmp2  =  b(k, 2)
+         tmp3  =  b(k, 3)
+         tmp4  =  b(k, 4)
+         tmp5  =  b(k, 5)
+         tmp6  =  b(k, 6)
+         tmp7  =  b(k, 7)
+         tmp8  =  b(k, 8)
+         tmp9  =  b(k, 9)
+         tmp10 =  b(k,10)
+         tmp11 =  b(k,11)
+         tmp12 =  b(k,12)
+         tmp13 =  b(k,13)
+         do i=1,n1
+            c(i, 1)  =  c(i, 1) + a(i,k) * tmp1
+            c(i, 2)  =  c(i, 2) + a(i,k) * tmp2
+            c(i, 3)  =  c(i, 3) + a(i,k) * tmp3
+            c(i, 4)  =  c(i, 4) + a(i,k) * tmp4
+            c(i, 5)  =  c(i, 5) + a(i,k) * tmp5
+            c(i, 6)  =  c(i, 6) + a(i,k) * tmp6
+            c(i, 7)  =  c(i, 7) + a(i,k) * tmp7
+            c(i, 8)  =  c(i, 8) + a(i,k) * tmp8
+            c(i, 9)  =  c(i, 9) + a(i,k) * tmp9
+            c(i,10)  =  c(i,10) + a(i,k) * tmp10
+            c(i,11)  =  c(i,11) + a(i,k) * tmp11
+            c(i,12)  =  c(i,12) + a(i,k) * tmp12
+            c(i,13)  =  c(i,13) + a(i,k) * tmp13
+         enddo
+c
+      enddo
+c
+      return
+      end
+      subroutine mxmur3_12(a,n1,b,n2,c,n3)
+      real a(n1,n2),b(n2,12),c(n1,12)
+c
+      do k=1,n2
+         tmp1  =  b(k, 1)
+         tmp2  =  b(k, 2)
+         tmp3  =  b(k, 3)
+         tmp4  =  b(k, 4)
+         tmp5  =  b(k, 5)
+         tmp6  =  b(k, 6)
+         tmp7  =  b(k, 7)
+         tmp8  =  b(k, 8)
+         tmp9  =  b(k, 9)
+         tmp10 =  b(k,10)
+         tmp11 =  b(k,11)
+         tmp12 =  b(k,12)
+         do i=1,n1
+            c(i, 1)  =  c(i, 1) + a(i,k) * tmp1
+            c(i, 2)  =  c(i, 2) + a(i,k) * tmp2
+            c(i, 3)  =  c(i, 3) + a(i,k) * tmp3
+            c(i, 4)  =  c(i, 4) + a(i,k) * tmp4
+            c(i, 5)  =  c(i, 5) + a(i,k) * tmp5
+            c(i, 6)  =  c(i, 6) + a(i,k) * tmp6
+            c(i, 7)  =  c(i, 7) + a(i,k) * tmp7
+            c(i, 8)  =  c(i, 8) + a(i,k) * tmp8
+            c(i, 9)  =  c(i, 9) + a(i,k) * tmp9
+            c(i,10)  =  c(i,10) + a(i,k) * tmp10
+            c(i,11)  =  c(i,11) + a(i,k) * tmp11
+            c(i,12)  =  c(i,12) + a(i,k) * tmp12
+         enddo
+c
+      enddo
+c
+      return
+      end
+      subroutine mxmur3_11(a,n1,b,n2,c,n3)
+      real a(n1,n2),b(n2,11),c(n1,11)
+c
+      do k=1,n2
+         tmp1  =  b(k, 1)
+         tmp2  =  b(k, 2)
+         tmp3  =  b(k, 3)
+         tmp4  =  b(k, 4)
+         tmp5  =  b(k, 5)
+         tmp6  =  b(k, 6)
+         tmp7  =  b(k, 7)
+         tmp8  =  b(k, 8)
+         tmp9  =  b(k, 9)
+         tmp10 =  b(k,10)
+         tmp11 =  b(k,11)
+         do i=1,n1
+            c(i, 1)  =  c(i, 1) + a(i,k) * tmp1
+            c(i, 2)  =  c(i, 2) + a(i,k) * tmp2
+            c(i, 3)  =  c(i, 3) + a(i,k) * tmp3
+            c(i, 4)  =  c(i, 4) + a(i,k) * tmp4
+            c(i, 5)  =  c(i, 5) + a(i,k) * tmp5
+            c(i, 6)  =  c(i, 6) + a(i,k) * tmp6
+            c(i, 7)  =  c(i, 7) + a(i,k) * tmp7
+            c(i, 8)  =  c(i, 8) + a(i,k) * tmp8
+            c(i, 9)  =  c(i, 9) + a(i,k) * tmp9
+            c(i,10)  =  c(i,10) + a(i,k) * tmp10
+            c(i,11)  =  c(i,11) + a(i,k) * tmp11
+         enddo
+c
+      enddo
+c
+      return
+      end
+      subroutine mxmur3_10(a,n1,b,n2,c,n3)
+      real a(n1,n2),b(n2,10),c(n1,10)
+c
+      do k=1,n2
+         tmp1  =  b(k, 1)
+         tmp2  =  b(k, 2)
+         tmp3  =  b(k, 3)
+         tmp4  =  b(k, 4)
+         tmp5  =  b(k, 5)
+         tmp6  =  b(k, 6)
+         tmp7  =  b(k, 7)
+         tmp8  =  b(k, 8)
+         tmp9  =  b(k, 9)
+         tmp10 =  b(k,10)
+         do i=1,n1
+            c(i, 1)  =  c(i, 1) + a(i,k) * tmp1
+            c(i, 2)  =  c(i, 2) + a(i,k) * tmp2
+            c(i, 3)  =  c(i, 3) + a(i,k) * tmp3
+            c(i, 4)  =  c(i, 4) + a(i,k) * tmp4
+            c(i, 5)  =  c(i, 5) + a(i,k) * tmp5
+            c(i, 6)  =  c(i, 6) + a(i,k) * tmp6
+            c(i, 7)  =  c(i, 7) + a(i,k) * tmp7
+            c(i, 8)  =  c(i, 8) + a(i,k) * tmp8
+            c(i, 9)  =  c(i, 9) + a(i,k) * tmp9
+            c(i,10)  =  c(i,10) + a(i,k) * tmp10
+         enddo
+c
+      enddo
+c
+      return
+      end
+      subroutine mxmur3_9(a,n1,b,n2,c,n3)
+      real a(n1,n2),b(n2,9),c(n1,9)
+c
+      do k=1,n2
+         tmp1  =  b(k, 1)
+         tmp2  =  b(k, 2)
+         tmp3  =  b(k, 3)
+         tmp4  =  b(k, 4)
+         tmp5  =  b(k, 5)
+         tmp6  =  b(k, 6)
+         tmp7  =  b(k, 7)
+         tmp8  =  b(k, 8)
+         tmp9  =  b(k, 9)
+         do i=1,n1
+            c(i, 1)  =  c(i, 1) + a(i,k) * tmp1
+            c(i, 2)  =  c(i, 2) + a(i,k) * tmp2
+            c(i, 3)  =  c(i, 3) + a(i,k) * tmp3
+            c(i, 4)  =  c(i, 4) + a(i,k) * tmp4
+            c(i, 5)  =  c(i, 5) + a(i,k) * tmp5
+            c(i, 6)  =  c(i, 6) + a(i,k) * tmp6
+            c(i, 7)  =  c(i, 7) + a(i,k) * tmp7
+            c(i, 8)  =  c(i, 8) + a(i,k) * tmp8
+            c(i, 9)  =  c(i, 9) + a(i,k) * tmp9
+         enddo
+c
+      enddo
+c
+      return
+      end
+      subroutine mxmur3_8(a,n1,b,n2,c,n3)
+      real a(n1,n2),b(n2,8),c(n1,8)
+c
+      do k=1,n2
+         tmp1  =  b(k, 1)
+         tmp2  =  b(k, 2)
+         tmp3  =  b(k, 3)
+         tmp4  =  b(k, 4)
+         tmp5  =  b(k, 5)
+         tmp6  =  b(k, 6)
+         tmp7  =  b(k, 7)
+         tmp8  =  b(k, 8)
+         do i=1,n1
+            c(i, 1)  =  c(i, 1) + a(i,k) * tmp1
+            c(i, 2)  =  c(i, 2) + a(i,k) * tmp2
+            c(i, 3)  =  c(i, 3) + a(i,k) * tmp3
+            c(i, 4)  =  c(i, 4) + a(i,k) * tmp4
+            c(i, 5)  =  c(i, 5) + a(i,k) * tmp5
+            c(i, 6)  =  c(i, 6) + a(i,k) * tmp6
+            c(i, 7)  =  c(i, 7) + a(i,k) * tmp7
+            c(i, 8)  =  c(i, 8) + a(i,k) * tmp8
+         enddo
+c
+      enddo
+c
+      return
+      end
+      subroutine mxmur3_7(a,n1,b,n2,c,n3)
+      real a(n1,n2),b(n2,7),c(n1,7)
+c
+      do k=1,n2
+         tmp1  =  b(k, 1)
+         tmp2  =  b(k, 2)
+         tmp3  =  b(k, 3)
+         tmp4  =  b(k, 4)
+         tmp5  =  b(k, 5)
+         tmp6  =  b(k, 6)
+         tmp7  =  b(k, 7)
+         do i=1,n1
+            c(i, 1)  =  c(i, 1) + a(i,k) * tmp1
+            c(i, 2)  =  c(i, 2) + a(i,k) * tmp2
+            c(i, 3)  =  c(i, 3) + a(i,k) * tmp3
+            c(i, 4)  =  c(i, 4) + a(i,k) * tmp4
+            c(i, 5)  =  c(i, 5) + a(i,k) * tmp5
+            c(i, 6)  =  c(i, 6) + a(i,k) * tmp6
+            c(i, 7)  =  c(i, 7) + a(i,k) * tmp7
+         enddo
+c
+      enddo
+c
+      return
+      end
+      subroutine mxmur3_6(a,n1,b,n2,c,n3)
+      real a(n1,n2),b(n2,6),c(n1,6)
+c
+      do k=1,n2
+         tmp1  =  b(k, 1)
+         tmp2  =  b(k, 2)
+         tmp3  =  b(k, 3)
+         tmp4  =  b(k, 4)
+         tmp5  =  b(k, 5)
+         tmp6  =  b(k, 6)
+         do i=1,n1
+            c(i, 1)  =  c(i, 1) + a(i,k) * tmp1
+            c(i, 2)  =  c(i, 2) + a(i,k) * tmp2
+            c(i, 3)  =  c(i, 3) + a(i,k) * tmp3
+            c(i, 4)  =  c(i, 4) + a(i,k) * tmp4
+            c(i, 5)  =  c(i, 5) + a(i,k) * tmp5
+            c(i, 6)  =  c(i, 6) + a(i,k) * tmp6
+         enddo
+c
+      enddo
+c
+      return
+      end
+      subroutine mxmur3_5(a,n1,b,n2,c,n3)
+      real a(n1,n2),b(n2,5),c(n1,5)
+c
+      do k=1,n2
+         tmp1  =  b(k, 1)
+         tmp2  =  b(k, 2)
+         tmp3  =  b(k, 3)
+         tmp4  =  b(k, 4)
+         tmp5  =  b(k, 5)
+         do i=1,n1
+            c(i, 1)  =  c(i, 1) + a(i,k) * tmp1
+            c(i, 2)  =  c(i, 2) + a(i,k) * tmp2
+            c(i, 3)  =  c(i, 3) + a(i,k) * tmp3
+            c(i, 4)  =  c(i, 4) + a(i,k) * tmp4
+            c(i, 5)  =  c(i, 5) + a(i,k) * tmp5
+         enddo
+c
+      enddo
+c
+      return
+      end
+      subroutine mxmur3_4(a,n1,b,n2,c,n3)
+      real a(n1,n2),b(n2,4),c(n1,4)
+c
+      do k=1,n2
+         tmp1  =  b(k, 1)
+         tmp2  =  b(k, 2)
+         tmp3  =  b(k, 3)
+         tmp4  =  b(k, 4)
+         do i=1,n1
+            c(i, 1)  =  c(i, 1) + a(i,k) * tmp1
+            c(i, 2)  =  c(i, 2) + a(i,k) * tmp2
+            c(i, 3)  =  c(i, 3) + a(i,k) * tmp3
+            c(i, 4)  =  c(i, 4) + a(i,k) * tmp4
+         enddo
+c
+      enddo
+c
+      return
+      end
+      subroutine mxmur3_3(a,n1,b,n2,c,n3)
+      real a(n1,n2),b(n2,3),c(n1,3)
+c
+      do k=1,n2
+         tmp1  =  b(k, 1)
+         tmp2  =  b(k, 2)
+         tmp3  =  b(k, 3)
+         do i=1,n1
+            c(i, 1)  =  c(i, 1) + a(i,k) * tmp1
+            c(i, 2)  =  c(i, 2) + a(i,k) * tmp2
+            c(i, 3)  =  c(i, 3) + a(i,k) * tmp3
+         enddo
+c
+      enddo
+c
+      return
+      end
+      subroutine mxmur3_2(a,n1,b,n2,c,n3)
+      real a(n1,n2),b(n2,2),c(n1,2)
+c
+      do k=1,n2
+         tmp1  =  b(k, 1)
+         tmp2  =  b(k, 2)
+         do i=1,n1
+            c(i, 1)  =  c(i, 1) + a(i,k) * tmp1
+            c(i, 2)  =  c(i, 2) + a(i,k) * tmp2
+         enddo
+c
+      enddo
+c
+      return
+      end
+      subroutine mxmur3_1(a,n1,b,n2,c,n3)
+      real a(n1,n2),b(n2,1),c(n1,1)
+c
+      do k=1,n2
+         tmp1  =  b(k, 1)
+         do i=1,n1
+            c(i, 1)  =  c(i, 1) + a(i,k) * tmp1
+         enddo
+      enddo
+c
+      return
+      end
+C----------------------------------------------------------------------
+      subroutine mxmd(a,n1,b,n2,c,n3)
+C
+C     Matrix-vector product routine. 
+C     NOTE: Use assembly coded routine if available.
+C
+C---------------------------------------------------------------------
+      REAL A(N1,N2),B(N2,N3),C(N1,N3)
+      REAL ONE,ZERO,EPS
+C
+C
+C
+      one=1.0
+      zero=0.0
+      call dgemm( 'N','N',n1,n3,n2,ONE,A,N1,B,N2,ZERO,C,N1)
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb(a,n1,b,n2,c,n3)
+C-----------------------------------------------------------------------
+C
+C     Matrix-vector product routine. 
+C     NOTE: Use assembly coded routine if available.
+C
+C----------------------------------------------------------------------
+      REAL A(N1,N2),B(N2,N3),C(N1,N3)
+C
+      integer wdsize
+      save    wdsize
+      data    wdsize/0/
+c
+c     First call: determine word size for dgemm/sgemm discrimination, below.
+c
+      if (wdsize.eq.0) then
+         one = 1.0
+         eps = 1.e-12
+         wdsize = 8
+         if (one+eps.eq.1.0) wdsize = 4
+      endif
+c
+      if (n2.le.8) then
+         if (n2.eq.1) then
+            call mxmfb_1(a,n1,b,n2,c,n3)
+         elseif (n2.eq.2) then
+            call mxmfb_2(a,n1,b,n2,c,n3)
+         elseif (n2.eq.3) then
+            call mxmfb_3(a,n1,b,n2,c,n3)
+         elseif (n2.eq.4) then
+            call mxmfb_4(a,n1,b,n2,c,n3)
+         elseif (n2.eq.5) then
+            call mxmfb_5(a,n1,b,n2,c,n3)
+         elseif (n2.eq.6) then
+            call mxmfb_6(a,n1,b,n2,c,n3)
+         elseif (n2.eq.7) then
+            call mxmfb_7(a,n1,b,n2,c,n3)
+         else
+            call mxmfb_8(a,n1,b,n2,c,n3)
+         endif
+      elseif (n2.le.16) then
+         if (n2.eq.9) then
+            call mxmfb_9(a,n1,b,n2,c,n3)
+         elseif (n2.eq.10) then
+            call mxmfb_10(a,n1,b,n2,c,n3)
+         elseif (n2.eq.11) then
+            call mxmfb_11(a,n1,b,n2,c,n3)
+         elseif (n2.eq.12) then
+            call mxmfb_12(a,n1,b,n2,c,n3)
+         elseif (n2.eq.13) then
+            call mxmfb_13(a,n1,b,n2,c,n3)
+         elseif (n2.eq.14) then
+            call mxmfb_14(a,n1,b,n2,c,n3)
+         elseif (n2.eq.15) then
+            call mxmfb_15(a,n1,b,n2,c,n3)
+         else
+            call mxmfb_16(a,n1,b,n2,c,n3)
+         endif
+      elseif (n2.le.24) then
+         if (n2.eq.17) then
+            call mxmfb_17(a,n1,b,n2,c,n3)
+         elseif (n2.eq.18) then
+            call mxmfb_18(a,n1,b,n2,c,n3)
+         elseif (n2.eq.19) then
+            call mxmfb_19(a,n1,b,n2,c,n3)
+         elseif (n2.eq.20) then
+            call mxmfb_20(a,n1,b,n2,c,n3)
+         elseif (n2.eq.21) then
+            call mxmfb_21(a,n1,b,n2,c,n3)
+         elseif (n2.eq.22) then
+            call mxmfb_22(a,n1,b,n2,c,n3)
+         elseif (n2.eq.23) then
+            call mxmfb_23(a,n1,b,n2,c,n3)
+         elseif (n2.eq.24) then
+            call mxmfb_24(a,n1,b,n2,c,n3)
+         endif
+      else
+c
+         one=1.0
+         zero=0.0
+         if (wdsize.eq.4) then
+            call sgemm( 'N','N',n1,n3,n2,ONE,A,N1,B,N2,ZERO,C,N1)
+         else
+            call dgemm( 'N','N',n1,n3,n2,ONE,A,N1,B,N2,ZERO,C,N1)
+         endif
+ 
+      endif
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_1(a,n1,b,n2,c,n3)
+c
+      real a(n1,1),b(1,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_2(a,n1,b,n2,c,n3)
+c
+      real a(n1,2),b(2,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_3(a,n1,b,n2,c,n3)
+c
+      real a(n1,3),b(3,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_4(a,n1,b,n2,c,n3)
+c
+      real a(n1,4),b(4,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_5(a,n1,b,n2,c,n3)
+c
+      real a(n1,5),b(5,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_6(a,n1,b,n2,c,n3)
+c
+      real a(n1,6),b(6,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_7(a,n1,b,n2,c,n3)
+c
+      real a(n1,7),b(7,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_8(a,n1,b,n2,c,n3)
+c
+      real a(n1,8),b(8,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_9(a,n1,b,n2,c,n3)
+c
+      real a(n1,9),b(9,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_10(a,n1,b,n2,c,n3)
+c
+      real a(n1,10),b(10,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_11(a,n1,b,n2,c,n3)
+c
+      real a(n1,11),b(11,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_12(a,n1,b,n2,c,n3)
+c
+      real a(n1,12),b(12,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_13(a,n1,b,n2,c,n3)
+c
+      real a(n1,13),b(13,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_14(a,n1,b,n2,c,n3)
+c
+      real a(n1,14),b(14,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_15(a,n1,b,n2,c,n3)
+c
+      real a(n1,15),b(15,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_16(a,n1,b,n2,c,n3)
+c
+      real a(n1,16),b(16,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_17(a,n1,b,n2,c,n3)
+c
+      real a(n1,17),b(17,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_18(a,n1,b,n2,c,n3)
+c
+      real a(n1,18),b(18,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_19(a,n1,b,n2,c,n3)
+c
+      real a(n1,19),b(19,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_20(a,n1,b,n2,c,n3)
+c
+      real a(n1,20),b(20,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+     $             + a(i,20)*b(20,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_21(a,n1,b,n2,c,n3)
+c
+      real a(n1,21),b(21,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+     $             + a(i,20)*b(20,j)
+     $             + a(i,21)*b(21,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_22(a,n1,b,n2,c,n3)
+c
+      real a(n1,22),b(22,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+     $             + a(i,20)*b(20,j)
+     $             + a(i,21)*b(21,j)
+     $             + a(i,22)*b(22,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_23(a,n1,b,n2,c,n3)
+c
+      real a(n1,23),b(23,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+     $             + a(i,20)*b(20,j)
+     $             + a(i,21)*b(21,j)
+     $             + a(i,22)*b(22,j)
+     $             + a(i,23)*b(23,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmfb_24(a,n1,b,n2,c,n3)
+c
+      real a(n1,24),b(24,n3),c(n1,n3)
+c
+      do j=1,n3
+         do i=1,n1
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+     $             + a(i,20)*b(20,j)
+     $             + a(i,21)*b(21,j)
+     $             + a(i,22)*b(22,j)
+     $             + a(i,23)*b(23,j)
+     $             + a(i,24)*b(24,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3(a,n1,b,n2,c,n3)
+C-----------------------------------------------------------------------
+C
+C     Matrix-vector product routine. 
+C     NOTE: Use assembly coded routine if available.
+C
+C----------------------------------------------------------------------
+      REAL A(N1,N2),B(N2,N3),C(N1,N3)
+C
+      integer wdsize
+      save    wdsize
+      data    wdsize/0/
+c
+c     First call: determine word size for dgemm/sgemm discrimination, below.
+c
+      if (wdsize.eq.0) then
+         one = 1.0
+         eps = 1.e-12
+         wdsize = 8
+         if (one+eps.eq.1.0) wdsize = 4
+      endif
+c
+      if (n2.le.8) then
+         if (n2.eq.1) then
+            call mxmf3_1(a,n1,b,n2,c,n3)
+         elseif (n2.eq.2) then
+            call mxmf3_2(a,n1,b,n2,c,n3)
+         elseif (n2.eq.3) then
+            call mxmf3_3(a,n1,b,n2,c,n3)
+         elseif (n2.eq.4) then
+            call mxmf3_4(a,n1,b,n2,c,n3)
+         elseif (n2.eq.5) then
+            call mxmf3_5(a,n1,b,n2,c,n3)
+         elseif (n2.eq.6) then
+            call mxmf3_6(a,n1,b,n2,c,n3)
+         elseif (n2.eq.7) then
+            call mxmf3_7(a,n1,b,n2,c,n3)
+         else
+            call mxmf3_8(a,n1,b,n2,c,n3)
+         endif
+      elseif (n2.le.16) then
+         if (n2.eq.9) then
+            call mxmf3_9(a,n1,b,n2,c,n3)
+         elseif (n2.eq.10) then
+            call mxmf3_10(a,n1,b,n2,c,n3)
+         elseif (n2.eq.11) then
+            call mxmf3_11(a,n1,b,n2,c,n3)
+         elseif (n2.eq.12) then
+            call mxmf3_12(a,n1,b,n2,c,n3)
+         elseif (n2.eq.13) then
+            call mxmf3_13(a,n1,b,n2,c,n3)
+         elseif (n2.eq.14) then
+            call mxmf3_14(a,n1,b,n2,c,n3)
+         elseif (n2.eq.15) then
+            call mxmf3_15(a,n1,b,n2,c,n3)
+         else
+            call mxmf3_16(a,n1,b,n2,c,n3)
+         endif
+      elseif (n2.le.24) then
+         if (n2.eq.17) then
+            call mxmf3_17(a,n1,b,n2,c,n3)
+         elseif (n2.eq.18) then
+            call mxmf3_18(a,n1,b,n2,c,n3)
+         elseif (n2.eq.19) then
+            call mxmf3_19(a,n1,b,n2,c,n3)
+         elseif (n2.eq.20) then
+            call mxmf3_20(a,n1,b,n2,c,n3)
+         elseif (n2.eq.21) then
+            call mxmf3_21(a,n1,b,n2,c,n3)
+         elseif (n2.eq.22) then
+            call mxmf3_22(a,n1,b,n2,c,n3)
+         elseif (n2.eq.23) then
+            call mxmf3_23(a,n1,b,n2,c,n3)
+         elseif (n2.eq.24) then
+            call mxmf3_24(a,n1,b,n2,c,n3)
+         endif
+      else
+c
+         one=1.0
+         zero=0.0
+         if (wdsize.eq.4) then
+            call sgemm( 'N','N',n1,n3,n2,ONE,A,N1,B,N2,ZERO,C,N1)
+         else
+            call dgemm( 'N','N',n1,n3,n2,ONE,A,N1,B,N2,ZERO,C,N1)
+         endif
+c
+c        N0=N1*N3
+c        DO 10 I=1,N0
+c           C(I,1)=0.
+c  10    CONTINUE
+c        DO 100 J=1,N3
+c        DO 100 K=1,N2
+c        BB=B(K,J)
+c        DO 100 I=1,N1
+c           C(I,J)=C(I,J)+A(I,K)*BB
+c 100    CONTINUE
+ 
+      endif
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_1(a,n1,b,n2,c,n3)
+c
+      real a(n1,1),b(1,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_2(a,n1,b,n2,c,n3)
+c
+      real a(n1,2),b(2,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_3(a,n1,b,n2,c,n3)
+c
+      real a(n1,3),b(3,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_4(a,n1,b,n2,c,n3)
+c
+      real a(n1,4),b(4,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_5(a,n1,b,n2,c,n3)
+c
+      real a(n1,5),b(5,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_6(a,n1,b,n2,c,n3)
+c
+      real a(n1,6),b(6,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_7(a,n1,b,n2,c,n3)
+c
+      real a(n1,7),b(7,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_8(a,n1,b,n2,c,n3)
+c
+      real a(n1,8),b(8,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_9(a,n1,b,n2,c,n3)
+c
+      real a(n1,9),b(9,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_10(a,n1,b,n2,c,n3)
+c
+      real a(n1,10),b(10,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_11(a,n1,b,n2,c,n3)
+c
+      real a(n1,11),b(11,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_12(a,n1,b,n2,c,n3)
+c
+      real a(n1,12),b(12,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_13(a,n1,b,n2,c,n3)
+c
+      real a(n1,13),b(13,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_14(a,n1,b,n2,c,n3)
+c
+      real a(n1,14),b(14,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_15(a,n1,b,n2,c,n3)
+c
+      real a(n1,15),b(15,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_16(a,n1,b,n2,c,n3)
+c
+      real a(n1,16),b(16,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_17(a,n1,b,n2,c,n3)
+c
+      real a(n1,17),b(17,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_18(a,n1,b,n2,c,n3)
+c
+      real a(n1,18),b(18,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_19(a,n1,b,n2,c,n3)
+c
+      real a(n1,19),b(19,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_20(a,n1,b,n2,c,n3)
+c
+      real a(n1,20),b(20,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+     $             + a(i,20)*b(20,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_21(a,n1,b,n2,c,n3)
+c
+      real a(n1,21),b(21,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+     $             + a(i,20)*b(20,j)
+     $             + a(i,21)*b(21,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_22(a,n1,b,n2,c,n3)
+c
+      real a(n1,22),b(22,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+     $             + a(i,20)*b(20,j)
+     $             + a(i,21)*b(21,j)
+     $             + a(i,22)*b(22,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_23(a,n1,b,n2,c,n3)
+c
+      real a(n1,23),b(23,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+     $             + a(i,20)*b(20,j)
+     $             + a(i,21)*b(21,j)
+     $             + a(i,22)*b(22,j)
+     $             + a(i,23)*b(23,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmf3_24(a,n1,b,n2,c,n3)
+c
+      real a(n1,24),b(24,n3),c(n1,n3)
+c
+      do i=1,n1
+         do j=1,n3
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+     $             + a(i,3)*b(3,j)
+     $             + a(i,4)*b(4,j)
+     $             + a(i,5)*b(5,j)
+     $             + a(i,6)*b(6,j)
+     $             + a(i,7)*b(7,j)
+     $             + a(i,8)*b(8,j)
+     $             + a(i,9)*b(9,j)
+     $             + a(i,10)*b(10,j)
+     $             + a(i,11)*b(11,j)
+     $             + a(i,12)*b(12,j)
+     $             + a(i,13)*b(13,j)
+     $             + a(i,14)*b(14,j)
+     $             + a(i,15)*b(15,j)
+     $             + a(i,16)*b(16,j)
+     $             + a(i,17)*b(17,j)
+     $             + a(i,18)*b(18,j)
+     $             + a(i,19)*b(19,j)
+     $             + a(i,20)*b(20,j)
+     $             + a(i,21)*b(21,j)
+     $             + a(i,22)*b(22,j)
+     $             + a(i,23)*b(23,j)
+     $             + a(i,24)*b(24,j)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxm44(a,n1,b,n2,c,n3)
+C-----------------------------------------------------------------------
+C
+C     NOTE -- this code has been set up with the "mxmf3" routine
+c             referenced in memtime.f.   On most machines, the f2
+c             and f3 versions give the same performance (f2 is the
+c             nekton standard).  On the t3e, f3 is noticeably faster.
+c             pff  10/5/98
+C
+C
+C     Matrix-vector product routine. 
+C     NOTE: Use assembly coded routine if available.
+C
+C----------------------------------------------------------------------
+      REAL A(N1,N2),B(N2,N3),C(N1,N3)
+c
+      if (n2.eq.1) then
+         call mxm44_2_t(a,n1,b,2,c,n3)
+      elseif (n2.eq.2) then
+         call mxm44_2_t(a,n1,b,n2,c,n3)
+      else
+         call mxm44_0_t(a,n1,b,n2,c,n3)
+      endif
+c
+      return
+      end
+c
+c-----------------------------------------------------------------------
+      subroutine mxm44_0_t(a, m, b, k, c, n)
+*      subroutine matmul44(m, n, k, a, lda, b, ldb, c, ldc)
+*      real*8 a(lda,k), b(ldb,n), c(ldc,n)
+      real a(m,k), b(k,n), c(m,n)
+      real s11, s12, s13, s14, s21, s22, s23, s24
+      real s31, s32, s33, s34, s41, s42, s43, s44
+c
+c matrix multiply with a 4x4 pencil 
+c
+
+      mresid = iand(m,3) 
+      nresid = iand(n,3) 
+      m1 = m - mresid + 1
+      n1 = n - nresid + 1
+
+      do i=1,m-mresid,4
+        do j=1,n-nresid,4
+          s11 = 0.0d0
+          s21 = 0.0d0
+          s31 = 0.0d0
+          s41 = 0.0d0
+          s12 = 0.0d0
+          s22 = 0.0d0
+          s32 = 0.0d0
+          s42 = 0.0d0
+          s13 = 0.0d0
+          s23 = 0.0d0
+          s33 = 0.0d0
+          s43 = 0.0d0
+          s14 = 0.0d0
+          s24 = 0.0d0
+          s34 = 0.0d0
+          s44 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(i,l)*b(l,j)
+            s12 = s12 + a(i,l)*b(l,j+1)
+            s13 = s13 + a(i,l)*b(l,j+2)
+            s14 = s14 + a(i,l)*b(l,j+3)
+
+            s21 = s21 + a(i+1,l)*b(l,j)
+            s22 = s22 + a(i+1,l)*b(l,j+1)
+            s23 = s23 + a(i+1,l)*b(l,j+2)
+            s24 = s24 + a(i+1,l)*b(l,j+3)
+
+            s31 = s31 + a(i+2,l)*b(l,j)
+            s32 = s32 + a(i+2,l)*b(l,j+1)
+            s33 = s33 + a(i+2,l)*b(l,j+2)
+            s34 = s34 + a(i+2,l)*b(l,j+3)
+
+            s41 = s41 + a(i+3,l)*b(l,j)
+            s42 = s42 + a(i+3,l)*b(l,j+1)
+            s43 = s43 + a(i+3,l)*b(l,j+2)
+            s44 = s44 + a(i+3,l)*b(l,j+3)
+          enddo
+          c(i,j)     = s11 
+          c(i,j+1)   = s12 
+          c(i,j+2)   = s13
+          c(i,j+3)   = s14
+
+          c(i+1,j)   = s21 
+          c(i+2,j)   = s31 
+          c(i+3,j)   = s41 
+
+          c(i+1,j+1) = s22
+          c(i+2,j+1) = s32
+          c(i+3,j+1) = s42
+
+          c(i+1,j+2) = s23
+          c(i+2,j+2) = s33
+          c(i+3,j+2) = s43
+
+          c(i+1,j+3) = s24
+          c(i+2,j+3) = s34
+          c(i+3,j+3) = s44
+        enddo
+* Residual when n is not multiple of 4
+        if (nresid .ne. 0) then
+          if (nresid .eq. 1) then
+            s11 = 0.0d0
+            s21 = 0.0d0
+            s31 = 0.0d0
+            s41 = 0.0d0
+            do l=1,k
+              s11 = s11 + a(i,l)*b(l,n)
+              s21 = s21 + a(i+1,l)*b(l,n)
+              s31 = s31 + a(i+2,l)*b(l,n)
+              s41 = s41 + a(i+3,l)*b(l,n)
+            enddo
+            c(i,n)     = s11 
+            c(i+1,n)   = s21 
+            c(i+2,n)   = s31 
+            c(i+3,n)   = s41 
+          elseif (nresid .eq. 2) then
+            s11 = 0.0d0
+            s21 = 0.0d0
+            s31 = 0.0d0
+            s41 = 0.0d0
+            s12 = 0.0d0
+            s22 = 0.0d0
+            s32 = 0.0d0
+            s42 = 0.0d0
+            do l=1,k
+              s11 = s11 + a(i,l)*b(l,j)
+              s12 = s12 + a(i,l)*b(l,j+1)
+
+              s21 = s21 + a(i+1,l)*b(l,j)
+              s22 = s22 + a(i+1,l)*b(l,j+1)
+
+              s31 = s31 + a(i+2,l)*b(l,j)
+              s32 = s32 + a(i+2,l)*b(l,j+1)
+
+              s41 = s41 + a(i+3,l)*b(l,j)
+              s42 = s42 + a(i+3,l)*b(l,j+1)
+            enddo
+            c(i,j)     = s11 
+            c(i,j+1)   = s12
+
+            c(i+1,j)   = s21 
+            c(i+2,j)   = s31 
+            c(i+3,j)   = s41 
+
+            c(i+1,j+1) = s22
+            c(i+2,j+1) = s32
+            c(i+3,j+1) = s42
+          else
+            s11 = 0.0d0
+            s21 = 0.0d0
+            s31 = 0.0d0
+            s41 = 0.0d0
+            s12 = 0.0d0
+            s22 = 0.0d0
+            s32 = 0.0d0
+            s42 = 0.0d0
+            s13 = 0.0d0
+            s23 = 0.0d0
+            s33 = 0.0d0
+            s43 = 0.0d0
+            do l=1,k
+              s11 = s11 + a(i,l)*b(l,j)
+              s12 = s12 + a(i,l)*b(l,j+1)
+              s13 = s13 + a(i,l)*b(l,j+2)
+
+              s21 = s21 + a(i+1,l)*b(l,j)
+              s22 = s22 + a(i+1,l)*b(l,j+1)
+              s23 = s23 + a(i+1,l)*b(l,j+2)
+
+              s31 = s31 + a(i+2,l)*b(l,j)
+              s32 = s32 + a(i+2,l)*b(l,j+1)
+              s33 = s33 + a(i+2,l)*b(l,j+2)
+
+              s41 = s41 + a(i+3,l)*b(l,j)
+              s42 = s42 + a(i+3,l)*b(l,j+1)
+              s43 = s43 + a(i+3,l)*b(l,j+2)
+            enddo
+            c(i,j)     = s11 
+            c(i+1,j)   = s21 
+            c(i+2,j)   = s31 
+            c(i+3,j)   = s41 
+            c(i,j+1)   = s12 
+            c(i+1,j+1) = s22
+            c(i+2,j+1) = s32
+            c(i+3,j+1) = s42
+            c(i,j+2)   = s13
+            c(i+1,j+2) = s23
+            c(i+2,j+2) = s33
+            c(i+3,j+2) = s43
+          endif
+        endif
+      enddo
+
+* Residual when m is not multiple of 4
+      if (mresid .eq. 0) then
+        return
+      elseif (mresid .eq. 1) then
+        do j=1,n-nresid,4
+          s11 = 0.0d0
+          s12 = 0.0d0
+          s13 = 0.0d0
+          s14 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m,l)*b(l,j)
+            s12 = s12 + a(m,l)*b(l,j+1)
+            s13 = s13 + a(m,l)*b(l,j+2)
+            s14 = s14 + a(m,l)*b(l,j+3)
+          enddo
+          c(m,j)     = s11 
+          c(m,j+1)   = s12 
+          c(m,j+2)   = s13
+          c(m,j+3)   = s14
+        enddo
+* mresid is 1, check nresid
+        if (nresid .eq. 0) then
+          return
+        elseif (nresid .eq. 1) then
+          s11 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m,l)*b(l,n)
+          enddo
+          c(m,n) = s11
+          return
+        elseif (nresid .eq. 2) then
+          s11 = 0.0d0
+          s12 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m,l)*b(l,n-1)
+            s12 = s12 + a(m,l)*b(l,n)
+          enddo
+          c(m,n-1) = s11
+          c(m,n) = s12
+          return
+        else
+          s11 = 0.0d0
+          s12 = 0.0d0
+          s13 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m,l)*b(l,n-2)
+            s12 = s12 + a(m,l)*b(l,n-1)
+            s13 = s13 + a(m,l)*b(l,n)
+          enddo
+          c(m,n-2) = s11
+          c(m,n-1) = s12
+          c(m,n) = s13
+          return
+        endif          
+      elseif (mresid .eq. 2) then
+        do j=1,n-nresid,4
+          s11 = 0.0d0
+          s12 = 0.0d0
+          s13 = 0.0d0
+          s14 = 0.0d0
+          s21 = 0.0d0
+          s22 = 0.0d0
+          s23 = 0.0d0
+          s24 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m-1,l)*b(l,j)
+            s12 = s12 + a(m-1,l)*b(l,j+1)
+            s13 = s13 + a(m-1,l)*b(l,j+2)
+            s14 = s14 + a(m-1,l)*b(l,j+3)
+
+            s21 = s21 + a(m,l)*b(l,j)
+            s22 = s22 + a(m,l)*b(l,j+1)
+            s23 = s23 + a(m,l)*b(l,j+2)
+            s24 = s24 + a(m,l)*b(l,j+3)
+          enddo
+          c(m-1,j)   = s11 
+          c(m-1,j+1) = s12 
+          c(m-1,j+2) = s13
+          c(m-1,j+3) = s14
+          c(m,j)     = s21
+          c(m,j+1)   = s22 
+          c(m,j+2)   = s23
+          c(m,j+3)   = s24
+        enddo
+* mresid is 2, check nresid
+        if (nresid .eq. 0) then
+          return
+        elseif (nresid .eq. 1) then
+          s11 = 0.0d0
+          s21 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m-1,l)*b(l,n)
+            s21 = s21 + a(m,l)*b(l,n)
+          enddo
+          c(m-1,n) = s11
+          c(m,n) = s21
+          return
+        elseif (nresid .eq. 2) then
+          s11 = 0.0d0
+          s21 = 0.0d0
+          s12 = 0.0d0
+          s22 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m-1,l)*b(l,n-1)
+            s12 = s12 + a(m-1,l)*b(l,n)
+            s21 = s21 + a(m,l)*b(l,n-1)
+            s22 = s22 + a(m,l)*b(l,n)
+          enddo
+          c(m-1,n-1) = s11
+          c(m-1,n)   = s12
+          c(m,n-1)   = s21
+          c(m,n)     = s22
+          return
+        else
+          s11 = 0.0d0
+          s21 = 0.0d0
+          s12 = 0.0d0
+          s22 = 0.0d0
+          s13 = 0.0d0
+          s23 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m-1,l)*b(l,n-2)
+            s12 = s12 + a(m-1,l)*b(l,n-1)
+            s13 = s13 + a(m-1,l)*b(l,n)
+            s21 = s21 + a(m,l)*b(l,n-2)
+            s22 = s22 + a(m,l)*b(l,n-1)
+            s23 = s23 + a(m,l)*b(l,n)
+          enddo
+          c(m-1,n-2) = s11
+          c(m-1,n-1) = s12
+          c(m-1,n)   = s13
+          c(m,n-2)   = s21
+          c(m,n-1)   = s22
+          c(m,n)     = s23
+          return
+        endif
+      else
+* mresid is 3
+        do j=1,n-nresid,4
+          s11 = 0.0d0
+          s21 = 0.0d0
+          s31 = 0.0d0
+
+          s12 = 0.0d0
+          s22 = 0.0d0
+          s32 = 0.0d0
+
+          s13 = 0.0d0
+          s23 = 0.0d0
+          s33 = 0.0d0
+
+          s14 = 0.0d0
+          s24 = 0.0d0
+          s34 = 0.0d0
+
+          do l=1,k
+            s11 = s11 + a(m-2,l)*b(l,j)
+            s12 = s12 + a(m-2,l)*b(l,j+1)
+            s13 = s13 + a(m-2,l)*b(l,j+2)
+            s14 = s14 + a(m-2,l)*b(l,j+3)
+
+            s21 = s21 + a(m-1,l)*b(l,j)
+            s22 = s22 + a(m-1,l)*b(l,j+1)
+            s23 = s23 + a(m-1,l)*b(l,j+2)
+            s24 = s24 + a(m-1,l)*b(l,j+3)
+
+            s31 = s31 + a(m,l)*b(l,j)
+            s32 = s32 + a(m,l)*b(l,j+1)
+            s33 = s33 + a(m,l)*b(l,j+2)
+            s34 = s34 + a(m,l)*b(l,j+3)
+          enddo
+          c(m-2,j)   = s11 
+          c(m-2,j+1) = s12 
+          c(m-2,j+2) = s13
+          c(m-2,j+3) = s14
+
+          c(m-1,j)   = s21 
+          c(m-1,j+1) = s22
+          c(m-1,j+2) = s23
+          c(m-1,j+3) = s24
+
+          c(m,j)     = s31 
+          c(m,j+1)   = s32
+          c(m,j+2)   = s33
+          c(m,j+3)   = s34
+        enddo
+* mresid is 3, check nresid
+        if (nresid .eq. 0) then
+          return
+        elseif (nresid .eq. 1) then
+          s11 = 0.0d0
+          s21 = 0.0d0
+          s31 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m-2,l)*b(l,n)
+            s21 = s21 + a(m-1,l)*b(l,n)
+            s31 = s31 + a(m,l)*b(l,n)
+          enddo
+          c(m-2,n) = s11
+          c(m-1,n) = s21
+          c(m,n) = s31
+          return
+        elseif (nresid .eq. 2) then
+          s11 = 0.0d0
+          s21 = 0.0d0
+          s31 = 0.0d0
+          s12 = 0.0d0
+          s22 = 0.0d0
+          s32 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m-2,l)*b(l,n-1)
+            s12 = s12 + a(m-2,l)*b(l,n)
+            s21 = s21 + a(m-1,l)*b(l,n-1)
+            s22 = s22 + a(m-1,l)*b(l,n)
+            s31 = s31 + a(m,l)*b(l,n-1)
+            s32 = s32 + a(m,l)*b(l,n)
+          enddo
+          c(m-2,n-1) = s11
+          c(m-2,n)   = s12
+          c(m-1,n-1) = s21
+          c(m-1,n)   = s22
+          c(m,n-1)   = s31
+          c(m,n)     = s32
+          return
+        else
+          s11 = 0.0d0
+          s21 = 0.0d0
+          s31 = 0.0d0
+          s12 = 0.0d0
+          s22 = 0.0d0
+          s32 = 0.0d0
+          s13 = 0.0d0
+          s23 = 0.0d0
+          s33 = 0.0d0
+          do l=1,k
+            s11 = s11 + a(m-2,l)*b(l,n-2)
+            s12 = s12 + a(m-2,l)*b(l,n-1)
+            s13 = s13 + a(m-2,l)*b(l,n)
+            s21 = s21 + a(m-1,l)*b(l,n-2)
+            s22 = s22 + a(m-1,l)*b(l,n-1)
+            s23 = s23 + a(m-1,l)*b(l,n)
+            s31 = s31 + a(m,l)*b(l,n-2)
+            s32 = s32 + a(m,l)*b(l,n-1)
+            s33 = s33 + a(m,l)*b(l,n)
+          enddo
+          c(m-2,n-2) = s11
+          c(m-2,n-1) = s12
+          c(m-2,n)   = s13
+          c(m-1,n-2) = s21
+          c(m-1,n-1) = s22
+          c(m-1,n)   = s23
+          c(m,n-2)   = s31
+          c(m,n-1)   = s32
+          c(m,n)     = s33
+          return
+        endif
+      endif
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxm44_2_t(a, m, b, k, c, n)
+      real a(m,2), b(2,n), c(m,n)
+
+      nresid = iand(n,3) 
+      n1 = n - nresid + 1
+
+      do j=1,n-nresid,4
+         do i=1,m
+            c(i,j) = a(i,1)*b(1,j)
+     $             + a(i,2)*b(2,j)
+            c(i,j+1) = a(i,1)*b(1,j+1)
+     $             + a(i,2)*b(2,j+1)
+            c(i,j+2) = a(i,1)*b(1,j+2)
+     $             + a(i,2)*b(2,j+2)
+            c(i,j+3) = a(i,1)*b(1,j+3)
+     $             + a(i,2)*b(2,j+3)
+         enddo
+      enddo
+      if (nresid .eq. 0) then
+        return
+      elseif (nresid .eq. 1) then
+         do i=1,m
+            c(i,n) = a(i,1)*b(1,n)
+     $             + a(i,2)*b(2,n)
+         enddo
+      elseif (nresid .eq. 2) then
+         do i=1,m
+            c(i,n-1) = a(i,1)*b(1,n-1)
+     $             + a(i,2)*b(2,n-1)
+            c(i,n) = a(i,1)*b(1,n)
+     $             + a(i,2)*b(2,n)
+         enddo
+      else
+         do i=1,m
+            c(i,n-2) = a(i,1)*b(1,n-2)
+     $             + a(i,2)*b(2,n-2)
+            c(i,n-1) = a(i,1)*b(1,n-1)
+     $             + a(i,2)*b(2,n-1)
+            c(i,n) = a(i,1)*b(1,n)
+     $             + a(i,2)*b(2,n)
+         enddo
+      endif
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxmtest(s,nn,cn,mxmt,name,k,ivb)
+
+      real        s(nn,2)   ! MFLOPS
+      character*5 cn        ! name
+      character*5 name
+      external mxmt
+
+      include 'SIZE'
+      parameter (lt=4*lx1*ly1*lz1*lelt)
+      common /scrns/ a(lt)
+      common /scruz/ b(lt)
+      common /scrmg/ c(lt)
+
+      integer ll,icalld
+      save    ll,icalld
+      data    ll,icalld  /1,0/
+
+      if (icalld.eq.0) then    !     Initialize matrices:
+         icalld = icalld + 1
+         time1 = dnekclock()
+         call initab(a,b,lt)
+         time2 = dnekclock()-time1
+         if (nid.eq.0) write(6,*) 'mxm test init:',lt,time2,name
+      endif
+
+
+      cn = name
+
+c     Rectangular matrix tests
+
+      nn0 = 1
+      nn1 = nn
+      if (ivb.eq.0) then
+         nn0 = lx1
+         nn1 = lx1
+      endif
+
+      m = k
+      do n=nn0,nn1
+         n1 = n
+         n2 = n
+         n3 = n
+         if (m.eq.1) n1 = n*n
+         if (m.eq.3) n3 = n*n
+         if (lt.gt.n1*n3) then
+          n13 = max(n1,n3)
+          loop = 250000/(n1*n2*n3) + 500
+          if (name.eq.'madd ') loop = 200000/(n1*n3) + 5000
+
+c-------------------------------------------------------
+c         mem test
+c-------------------------------------------------------
+
+          t0    = dnekclock()
+          overh = dnekclock()-t0
+          time1 = dnekclock()
+          do l=1,loop
+            if (ll.ge.lt-n1*n3) ll = 1
+            call mxmt(a(ll),n1,b(ll),n2,c(ll),n3)
+            ll = ll+n1*n3
+          enddo
+          time2 = dnekclock()
+          time = time2-time1 - overh
+          iops=loop*n1*n3*(2*n2-1)
+          if (name.eq.'madd ') iops = loop*n1*n3
+c         write(6,*) loop,time,time2,time1,overh
+          flops=iops/(1.0e6*time)
+          s(n,1) = flops
+c
+          timel = time/loop
+          if (nid.eq.0) write(6,199) n,n1,n2,n3,flops,timel,name
+  199     format(i3,'m',1x,3i6,f10.4,e16.5,3x,a5,' mem')
+c
+c-------------------------------------------------------
+c         fast test
+c-------------------------------------------------------
+c
+          call mxmt(a,n1,b,n2,c,n3)
+          t0    = dnekclock()
+          overh = dnekclock()-t0
+          time1 = dnekclock()
+          do l=1,loop
+            call mxmt(a,n1,b,n2,c,n3)
+          enddo
+          time2 = dnekclock()
+          time = time2-time1 - overh
+          iops=loop*n1*n3*(2*n2-1)
+          if (name.eq.'madd ') iops = loop*n1*n3
+          flops=iops/(1.0e6*time)
+          s(n,2) = flops
+          timel = time/loop
+c
+          if (nid.eq.0) write(6,198) n,n1,n2,n3,flops,timel,name
+  198     format(i3,'f',1x,3i6,f10.4,e16.5,3x,a5,' fast')
+c
+        endif
+       enddo
+c
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine mxm_analyze(s,a,nn,c,nt,ivb)
+      include 'SIZE'
+
+      character*5 c(3,nt)
+      real        s(nn,2,nt,3)  ! Measured Mflops, 3 cases
+      real        a(nn,2,nt,3)
+c                   ^  ^ ^  |__ N^2xN, NxN, NxN^2
+c  matrix order N __|  | |__________which mxm
+c                      |
+c                      |__cached vs. noncached data
+ 
+
+      integer itmax(200)
+
+      nn0 = 1
+      nn1 = nn
+      if (ivb.eq.0) then
+         nn0 = lx1
+         nn1 = lx1
+      endif
+
+      do n = nn0,nn1
+         fmax = 0.   ! Peak mflops
+         do it=1,nt
+            ai = 0.
+            di = 0.
+            do k=1,3
+               if (s(n,1,it,k).gt.0) then  ! Take harmonic means of
+                  ai = ai + 1./s(n,1,it,k) ! case I II and III for 
+                  di = di + 1.             ! mem test, s(n,1...).
+               endif
+            enddo
+            if (ai.gt.0) ai = di/ai
+            a(n,1,it,1) = di/ai
+            if (ai.gt.fmax.and.c(2,it).ne.'madd ') then
+               fmax     = ai
+               itmax(n) = it
+            endif
+         enddo
+         it = itmax(n)
+         if (nid.eq.0) write(6,3) n,it,c(2,it),(s(n,1,it,k),k=1,3),fmax
+    3    format(i3,i2,1x,a5,4f12.0,' Peak harmonic')
+      enddo
+      call out_anal(s,a,nn,c,nt,itmax,'Harmonic',1,ivb)
+c
+c     Case by case
+c
+      do k=1,3
+         do n = nn0,nn1
+            fmax = 0.   ! Peak mflops
+            do it=1,nt
+               ai = s(n,1,it,k)
+               if (ai.gt.fmax.and.c(2,it).ne.'madd ') then
+                  fmax     = ai
+                  itmax(n) = it
+               endif
+            enddo
+         enddo
+         if (k.eq.1) call out_anal(s,a,nn,c,nt,itmax,'Case N2N',k,ivb)
+         if (k.eq.2) call out_anal(s,a,nn,c,nt,itmax,'Case NxN',k,ivb)
+         if (k.eq.3) call out_anal(s,a,nn,c,nt,itmax,'Case NN2',k,ivb)
+      enddo
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine out_anal(s,a,nn,c,nt,itmax,name8,k,ivb)
+      include 'SIZE'
+
+      character*5 c(3,nt)
+      real        s(nn,2,nt,3)
+      real        a(nn,2,nt,3)
+      integer itmax(200)
+      character*8 name8
+
+      if (nid.ne.0) return
+
+      nn0 = 1
+      nn1 = nn
+      if (ivb.eq.0) then
+         nn0 = lx1
+         nn1 = lx1
+      endif
+
+
+      do n=nn0,nn1
+         it = itmax(n)
+         write(6,1) n,s(n,1,it,k),c(2,it),name8
+    1    format(i4,f14.0,4x,a5,4x,a8,'   MxM MFLOPS')
+      enddo
+
+      return
+      end
+c-----------------------------------------------------------------------
diff --git a/src/mxm_wrapper.f b/src/mxm_wrapper.f
new file mode 100644
index 0000000..7c3d111
--- /dev/null
+++ b/src/mxm_wrapper.f
@@ -0,0 +1,165 @@
+      subroutine mxm(a,n1,b,n2,c,n3)
+
+#if defined(XSMM_DISPATCH)
+      USE :: LIBXSMM
+#endif
+
+#define LIBXSMM_DMM1(N, a, b, c)  LIBXSMM_DMM1_str(N, a, b, c) 
+#define LIBXSMM_DMM1_str(N, a, b, c)  libxsmm_dmm_##N##x##N##_##N##_##N(a, b, c)
+#define LIBXSMM_DMM2(N, a, b, c)   LIBXSMM_DMM2_str(N, a, b, c)
+#define LIBXSMM_DMM2_str(N, a, b, c)  libxsmm_dmm_##N##_##N##x##N##_##N(a, b, c)
+#define LIBXSMM_DMM3(N, a, b, c)  LIBXSMM_DMM3_str(N, a, b, c) 
+#define LIBXSMM_DMM3_str(N, a, b, c)  libxsmm_dmm_##N##_##N##_##N(a, b, c)
+
+c
+c     Compute matrix-matrix product C = A*B
+c     for contiguously packed matrices A,B, and C.
+c
+#if defined (MKL)
+#     include  "mkl_direct_call.fi"
+#endif
+      real a(n1,n2),b(n2,n3),c(n1,n3)
+      real alpha, beta
+c
+      include 'SIZE'
+      include 'TOTAL'
+c
+      integer aligned
+      integer K10_mxm
+      integer init, prevn2
+
+#if defined(XSMM_DISPATCH)
+      TYPE(LIBXSMM_DMMFUNCTION) :: xmm1,xmm2,xmm3
+#endif
+
+      data init /0/, prevn2 /0/
+      save init, prevn2
+#if defined(XSMM_DISPATCH)
+      save xsmm1, xsmm2, xsmm3
+#endif
+
+c     write(*,*) "in", init, prevn2, LOC(xsmm1), LOC(xsmm2), LOC(xsmm3)
+
+#if defined (MKL)
+      alpha = 1.0
+      beta = 0.0
+      call dgemm('N','N',n1,n3,n2,alpha,A,n1,B,n2,beta,C,n1)
+#elif defined (BLAS_MXM)
+      alpha = 1.0
+      beta = 0.0
+      call dgemm('N','N',n1,n3,n2,alpha,a,n1,b,n2,beta,c,n1)
+#elif defined (BG)
+      call bg_aligned3(a,b,c,aligned)
+      if (n2.eq.2) then
+         call mxm44_2(a,n1,b,n2,c,n3)
+      else if ((aligned.eq.1) .and.
+     $         (n1.ge.8) .and. (n2.ge.8) .and. (n3.ge.8) .and.
+     $         (modulo(n1,2).eq.0) .and. (modulo(n2,2).eq.0) ) then
+         if (modulo(n3,4).eq.0) then
+            call bg_mxm44(a,n1,b,n2,c,n3)
+         else
+            call bg_mxm44_uneven(a,n1,b,n2,c,n3)
+         endif
+      else if((aligned.eq.1) .and.
+     $        (modulo(n1,6).eq.0) .and. (modulo(n3,6).eq.0) .and.
+     $        (n2.ge.4) .and. (modulo(n2,2).eq.0) ) then
+         call bg_mxm3(a,n1,b,n2,c,n3)
+      else
+         call mxm44_0(a,n1,b,n2,c,n3)
+      endif
+#elif defined (K10_MXM)
+      ! fow now only supported for lx1=8
+      ! tuned for AMD K10
+      ierr = K10_mxm(a,n1,b,n2,c,n3) 
+      if (ierr.gt.0) call mxmf2(a,n1,b,n2,c,n3)
+#elif defined (XSMM_DISPATCH)
+      if (init == 0) then
+        CALL libxsmm_init()
+        init = 1
+        write(*,*) "initializing libxsmm"
+      end if 
+
+      if (prevn2 /= n2) then
+        prevn2 = n2
+
+        CALL libxsmm_dispatch(xmm1, n2, n2, n2*n2, alpha=1D0, beta=0D0)
+c       write(*,*) "initialized xmm1"
+        IF (.NOT. libxsmm_available(xmm1)) THEN
+          write(*,*) "  ** Error: unable to dispatch libxsmm call"
+          STOP 
+        END IF
+
+        CALL libxsmm_dispatch(xmm2, n2, n2, n2, alpha=1D0, beta=0D0)
+c       write(*,*) "initialized xmm2"
+        IF (.NOT. libxsmm_available(xmm2)) THEN
+          write(*,*) "  ** Error: unable to dispatch libxsmm call"
+          STOP 
+        END IF
+
+        CALL libxsmm_dispatch(xmm3, n2*n2, n2, n2, alpha=1D0, beta=0D0)
+c       write(*,*) "initialized xmm3"
+        IF (.NOT. libxsmm_available(xmm3)) THEN
+          write(*,*) "  ** Error: unable to dispatch libxsmm call"
+          STOP 
+        END IF
+      end if
+
+      if (n1 .eq. n2*n2) then
+c       write(*,*) "call to xmm3", n1, n2, n3
+        IF (.NOT. libxsmm_available(xmm3)) THEN
+          write(*,*) "  ** Error: unable to dispatch libxsmm call"
+          STOP 
+        END IF
+        call libxsmm_call(xmm3, C_LOC(a), C_LOC(b), C_LOC(c))
+c       call libxsmm_dmm_256_16_16(a, b, c)
+      else if (n3 .eq. n2*n2) then
+c       write(*,*) "call to xmm1", n1, n2, n3
+        IF (.NOT. libxsmm_available(xmm1)) THEN
+          write(*,*) "  ** Error: unable to dispatch libxsmm call"
+          STOP 
+        END IF
+        call libxsmm_call(xmm1, C_LOC(a), C_LOC(b), C_LOC(c))
+c       call libxsmm_dmm_16_256_16(a, b, c)
+      else
+c       write(*,*) "call to xmm2", n1, n2, n3
+        IF (.NOT. libxsmm_available(xmm2)) THEN
+          write(*,*) "  ** Error: unable to dispatch libxsmm call"
+          STOP 
+        END IF
+        call libxsmm_call(xmm2, C_LOC(a), C_LOC(b), C_LOC(c))
+c       call libxsmm_dmm_16_16_16(a, b, c)
+      end if
+#elif defined (XSMM_FIXED)
+      if (n2 .eq. NPOLY) then
+        if (n1 .eq. n2*n2) then
+          call LIBXSMM_DMM1(NPOLY, a, b, c)
+        else if (n3 .eq. n2*n2) then
+          call LIBXSMM_DMM2(NPOLY, a, b, c)
+        else
+          call LIBXSMM_DMM3(NPOLY, a, b, c)
+        end if
+      else
+        write(*,*) "Invalid matrix size"
+        stop
+      end if
+#elif defined (XSMM)
+      alpha = 1.0
+      beta = 0.0
+      CALL libxsmm_dgemm('N','N',n1,n3,n2,alpha,A,n1,B,n2,beta,C,n1)
+#elif defined (MXMBASIC)
+      do j=1,n3
+        do i=1,n1
+          c(i,j) = 0.0
+          do k=1,n2
+            c(i,j) = c(i,j) + a(i,k)*b(k,j)
+          enddo
+        enddo
+      enddo
+#else
+      call mxmf2(a,n1,b,n2,c,n3)
+#endif
+
+c     write(*,*) "out", init, prevn2, xsmm1, xsmm2, xsmm3
+
+      return
+      end
diff --git a/src/omp.f b/src/omp.f
new file mode 100644
index 0000000..1591214
--- /dev/null
+++ b/src/omp.f
@@ -0,0 +1,128 @@
+#ifdef TIMERS
+#define NBTIMER(a) a = dnekclock()
+#define STIMER(a) a = dnekclock_sync()
+#define ACCUMTIMER(b,a) b = b + (dnekclock()- a)
+#else
+#define NBTIMER(a)
+#define STIMER(a)
+#define ACCUMTIMER(a,b)
+#endif
+
+
+      subroutine rzeroi(a,n,start,fin)
+        implicit none
+  
+        real a(n)
+        integer n, i, start, fin
+
+        do i = start, fin
+          a(i) = 0.0
+        end do 
+
+        return
+      end subroutine
+
+c----------------------------------------------------------
+
+      subroutine copyi(a,b,n, start, fin)
+        implicit none
+
+        real a(n),b(n)
+        integer n, i, start, fin
+
+        do i=start,fin
+          a(i)=b(i)
+        enddo
+
+        return
+      end subroutine
+
+c----------------------------------------------------------
+
+      subroutine glsc3i(val,a,b,mult,n,find,lind)
+      implicit none
+
+      include 'TIMER'
+
+      real val,a(n),b(n),mult(n)
+      real tsum,psum,work(1)
+      integer n,find,lind
+      integer i, tmt, thread
+      integer omp_get_thread_num
+    
+      save psum
+      data psum /0.0/
+
+      thread = 0
+#ifdef _OPENMP
+      thread = omp_get_thread_num()
+#endif
+      tmt = thread + 1
+
+      tsum = 0.0
+      do i=find, lind
+         tsum = tsum + a(i)*b(i)*mult(i)
+      end do
+
+c$OMP ATOMIC update
+      psum = psum + tsum
+c$OMP END ATOMIC
+
+c$OMP BARRIER
+      NBTIMER(ttemp4)
+c$OMP MASTER
+      call gop(psum,work,'+  ',1)
+      val = psum
+      psum = 0.0
+c$OMP END MASTER
+c$OMP BARRIER
+      ACCUMTIMER(tgop(gopi(tmt),tmt), ttemp4)
+
+
+      return
+      end subroutine
+
+c----------------------------------------------------------
+
+      subroutine solveMi(z,r,n,start,fin)
+      implicit none
+
+      real z(n),r(n)
+      integer n,start,fin
+
+      call copyi(z,r,n,start,fin) 
+
+      return
+      end
+
+c----------------------------------------------------------
+
+      subroutine add2s1i(a,b,c1,n,start,fin)
+      implicit none
+
+      real a(n),b(n),c1
+      integer n,start,fin
+      integer i
+
+      do i= start, fin
+        a(i)=c1*a(i)+b(i)
+      end do
+
+      return
+      end subroutine
+
+c----------------------------------------------------------
+
+      subroutine add2s2i(a,b,c1,n,start,fin)
+      implicit none
+ 
+      real a(n),b(n),c1
+      integer n,start,fin
+      integer i
+
+      do i= start,fin
+        a(i)=a(i)+c1*b(i)
+      end do
+
+      return
+      end subroutine
diff --git a/src/prox_dssum.f b/src/prox_dssum.f
new file mode 100644
index 0000000..c3c0402
--- /dev/null
+++ b/src/prox_dssum.f
@@ -0,0 +1,174 @@
+c-----------------------------------------------------------------------
+      subroutine dssum(f)
+      include 'SIZE'
+      include 'TOTAL'
+      real f(1)
+
+c     call nekgsync()
+      call gs_op(gsh,f,1,1,0)  ! Gather-scatter operation  ! w   = QQ  w
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine proxy_setupds(gs_handle)
+      include 'SIZE'
+      include 'INPUT'
+      include 'PARALLEL'
+
+      integer gs_handle,dof
+      integer*8 glo_num(lx1*ly1*lz1*lelt)
+
+      common /nekmpi/ mid,mp,nekcomm,nekgroup,nekreal
+
+      t0 = dnekclock()
+
+      call set_vert_box(glo_num) ! Set global-to-local map
+
+      ntot      = nx1*ny1*nz1*nelt
+      call gs_setup(gs_handle,glo_num,ntot,nekcomm,mp) ! Initialize gather-scatter
+      dof = ntot *mp
+      t1 = dnekclock() - t0
+      if (nid.eq.0) then
+         write(6,1) t1,gs_handle,nx1,dof
+    1    format('   setupds time',1pe11.4,' seconds ',2i3,i12)
+      endif
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine set_vert_box(glo_num)
+
+c     Set up global numbering for elements in a box
+
+      include 'SIZE'
+      include 'PARALLEL'
+
+      integer*8 glo_num(1),ii,kg,jg,ig ! The latter 3 for proper promotion
+
+      integer e,ex,ey,ez,eg
+
+      nn = nx1-1  ! nn := polynomial order
+
+      do e=1,nelt
+        eg = lglel(e)                              
+        call get_exyz(ex,ey,ez,eg,nelx,nely,nelz)  
+        do k=0,nn
+        do j=0,nn
+        do i=0,nn
+           kg = nn*(ez-1) + k                     
+           jg = nn*(ey-1) + j                     
+           ig = nn*(ex-1) + i
+           ii = 1 + ig + jg*(nn*nelx+1) + kg*(nn*nelx+1)*(nn*nely+1) 
+           ll = 1 + i + nx1*j + nx1*ny1*k + nx1*ny1*nz1*(e-1)
+           glo_num(ll) = ii
+        enddo
+        enddo
+        enddo
+      enddo
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine get_exyz(ex,ey,ez,eg,nelx,nely,nelz)
+      integer ex,ey,ez,eg
+
+      nelxy = nelx*nely
+ 
+      ez = 1 +  (eg-1)/nelxy
+      ey = mod1 (eg,nelxy)
+      ey = 1 +  (ey-1)/nelx
+      ex = mod1 (eg,nelx)
+ 
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine outmat_glo_num(glo_num)
+      include 'SIZE'
+      include 'INPUT'
+      include 'PARALLEL'
+
+      integer*8 glo_num(lx1*ly1*lz1,lelt)
+
+      integer e
+
+      do e=1,nelt
+         call outmat_e_i8(glo_num(1,e),e)
+      enddo
+ 
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine outmat_e_i8(gn,e)
+      include 'SIZE'
+      include 'INPUT'
+      include 'PARALLEL'
+
+      integer*8 gn(lx1,ly1,lz1)
+
+      integer e
+
+      write(6,*)
+      write(6,2) e
+      write(6,*)
+
+      do k0=3,1,-2
+
+         k1=k0+1
+         do j=ny1,1,-1
+            write(6,1) ((gn(i,j,k),i=1,4),k=k0,k1)
+         enddo
+         write(6,*)
+
+      enddo
+    1 format('gn: ',4i8,3x,4i8)
+    2 format('gn: element: ',i4)
+ 
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine outmat_r(x,name5)
+      include 'SIZE'
+      include 'INPUT'
+      include 'PARALLEL'
+      character*5 name5
+
+      real x(lx1*ly1*lz1,lelt)
+
+      integer e
+
+      do e=1,nelt
+         call outmat_e_r(x(1,e),name5,e)
+      enddo
+ 
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine outmat_e_r(x,name5,e)
+      include 'SIZE'
+      include 'INPUT'
+      include 'PARALLEL'
+      character*5 name5
+
+      real x(lx1,ly1,lz1)
+
+      integer e
+
+      write(6,*)
+      write(6,2) e,name5
+      write(6,*)
+
+      do k0=3,1,-2
+
+         k1=k0+1
+         do j=ny1,1,-1
+            write(6,1) ((x(i,j,k),i=1,4),k=k0,k1)
+         enddo
+         write(6,*)
+
+      enddo
+    1 format('mat: ',4f8.3,3x,4f8.3)
+    2 format('mat: element: ',i4,2x,a5)
+ 
+      return
+      end
+c-----------------------------------------------------------------------
diff --git a/src/prox_setup.f b/src/prox_setup.f
new file mode 100644
index 0000000..fabe8c5
--- /dev/null
+++ b/src/prox_setup.f
@@ -0,0 +1,113 @@
+c-----------------------------------------------------------------------
+      subroutine proxy_setup(a,b,c,d,z,w,g)
+
+      include 'SIZE'
+      include 'TOTAL'
+
+      real a(lx1*lx1),b(lx1),c(lx1*lx1),d(lx1*lx1),z(lx1)
+     $               , w(lx1*2),g(6,lx1*ly1*lz1*lelt)
+
+      call semhat(a,b,c,d,z,w,nx1-1)
+
+      n = nx1*nx1
+      call copy(dxm1,d,n)
+      call transpose(dxtm1,nx1,dxm1,nx1)
+
+      call copy(zgm1,z,nx1)   ! GLL points
+      call copy(wxm1,b,nx1)   ! GLL weights
+
+      call setup_g(g)
+     
+c     m = nx1*ny1*nz1*nelt
+c     call outmat(g,6,m,'gxyz 1',m)
+
+      return
+      end
+c-------------------------------------------------------------------------
+      subroutine setup_g(g)
+
+      include 'SIZE'
+      include 'TOTAL'
+      real g(6,nx1,ny1,nz1,nelt)
+      integer e
+
+      n = nx1*ny1*nz1*nelt
+
+
+      do e=1,nelt
+      do k=1,nz1
+      do j=1,ny1
+      do i=1,nx1
+         call rzero(g(1,i,j,k,e),6)
+         g(1,i,j,k,e) = wxm1(i)*wxm1(j)*wxm1(k)
+         g(4,i,j,k,e) = wxm1(i)*wxm1(j)*wxm1(k)
+         g(6,i,j,k,e) = wxm1(i)*wxm1(j)*wxm1(k)
+         g(6,i,j,k,e) = wxm1(i)*wxm1(j)*wxm1(k)
+      enddo
+      enddo
+      enddo
+      enddo
+
+      return
+      end
+c-------------------------------------------------------------------------
+      subroutine transpose(a,lda,b,ldb)
+      real a(lda,1),b(ldb,1)
+c
+      do j=1,ldb
+         do i=1,lda
+            a(i,j) = b(j,i)
+         enddo
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine outmat(a,m,n,name6,ie)
+      real a(m,n)
+      character*6 name6
+c
+      n10 = min(n,10)
+      write(6,*) 
+      write(6,*) ie,' matrix: ',name6,m,n
+      do i=1,m
+         write(6,6) ie,name6,(a(i,j),j=1,n10)
+      enddo
+    6 format(i3,1x,a6,1p10e12.4)
+      write(6,*) 
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine outmat1(a,m,n,name6,ie)
+      real a(m,n)
+      character*6 name6
+c
+      n10 = min(n,10)
+      write(ie,*) 
+      write(ie,*) ie,' matrix: ',name6,m,n
+      do i=1,m
+         write(ie,6) ie,name6,(a(i,j),j=1,n10)
+      enddo
+    6 format(i3,1x,a6,1p10e12.4)
+      write(ie,*) 
+      return
+      end
+c-----------------------------------------------------------------------
+      function randx(seed)
+
+#ifdef BGQ
+#define M_SIN(X) _sin((X))
+#define M_COS(X) _cos((X))
+#else
+#define M_SIN(X) sin((X))
+#define M_COS(X) cos((X))
+#endif
+
+      arg   = 1.e9*seed
+      arg   = 1.e9*M_COS(arg)
+      randx = M_SIN(arg)
+      seed  = randx
+      seed  = randx
+
+      return
+      end
+c-----------------------------------------------------------------------
diff --git a/src/semhat.f b/src/semhat.f
new file mode 100644
index 0000000..1ad8d23
--- /dev/null
+++ b/src/semhat.f
@@ -0,0 +1,94 @@
+c-----------------------------------------------------------------------
+      subroutine semhat(a,b,c,d,z,w,n)
+c
+c     Generate matrices for single element, 1D operators:
+c
+c        a    = Laplacian
+c        b    = diagonal mass matrix    (GLL weights)
+c        c    = convection operator b*d
+c        d    = derivative matrix
+c        z    = GLL points
+
+      real a(0:n,0:n),b(0:n),c(0:n,0:n),d(0:n,0:n),z(0:n)
+      real w(0:2*n)
+ 
+      np = n+1
+ 
+      call zwgll (z,b,np)
+ 
+      do i=0,n
+         call fd_weights_full(z(i),z,n,1,w)
+         do j=0,n
+            d(i,j) = w(j+np)                   !  Derivative matrix
+         enddo
+      enddo
+
+      call rzero(a,np*np)
+      do j=0,n
+      do i=0,n
+         do k=0,n
+            a(i,j) = a(i,j) + d(k,i)*b(k)*d(k,j)
+         enddo
+         c(i,j) = b(i)*d(i,j)
+      enddo
+      enddo
+
+      return
+      end
+c-----------------------------------------------------------------------
+      subroutine fd_weights_full(xx,x,n,m,c)
+c
+c     This routine evaluates the derivative based on all points
+c     in the stencils.  It is more memory efficient than "fd_weights"
+c
+c     This set of routines comes from the appendix of 
+c     A Practical Guide to Pseudospectral Methods, B. Fornberg
+c     Cambridge Univ. Press, 1996.   (pff)
+c
+c     Input parameters:
+c       xx -- point at wich the approximations are to be accurate
+c       x  -- array of x-ordinates:   x(0:n)
+c       n  -- polynomial degree of interpolant (# of points := n+1)
+c       m  -- highest order of derivative to be approxxmated at xi
+c
+c     Output:
+c       c  -- set of coefficients c(0:n,0:m).
+c             c(j,k) is to be applied at x(j) when
+c             the kth derivative is approxxmated by a 
+c             stencil extending over x(0),x(1),...x(n).
+c
+c
+      real x(0:n),c(0:n,0:m)
+ 
+      c1       = 1.
+      c4       = x(0) - xx
+ 
+      do k=0,m
+      do j=0,n
+         c(j,k) = 0.
+      enddo
+      enddo
+      c(0,0) = 1.
+ 
+      do i=1,n
+         mn = min(i,m)
+         c2 = 1.
+         c5 = c4
+         c4 = x(i)-xx
+         do j=0,i-1
+            c3 = x(i)-x(j)
+            c2 = c2*c3
+            do k=mn,1,-1
+               c(i,k) = c1*(k*c(i-1,k-1)-c5*c(i-1,k))/c2
+            enddo
+            c(i,0) = -c1*c5*c(i-1,0)/c2
+            do k=mn,1,-1
+               c(j,k) = (c4*c(j,k)-k*c(j,k-1))/c3
+            enddo
+            c(j,0) = c4*c(j,0)/c3
+         enddo
+         c1 = c2
+      enddo
+      return
+      end
+c-----------------------------------------------------------------------
diff --git a/src/speclib.f b/src/speclib.f
new file mode 100644
index 0000000..6c86b64
--- /dev/null
+++ b/src/speclib.f
@@ -0,0 +1,1176 @@
+C==============================================================================
+C
+C     LIBRARY ROUTINES FOR SPECTRAL METHODS
+C
+C     March 1989
+C
+C     For questions, comments or suggestions, please contact:
+C
+C     Einar Malvin Ronquist
+C     Room 3-243
+C     Department of Mechanical Engineering
+C     Massachusetts Institute of Technology
+C     77 Massachusetts Avenue
+C     Cambridge, MA 0299
+C     U.S.A.
+C
+C------------------------------------------------------------------------------
+C
+C     ABBRIVIATIONS:
+C
+C     M   - Set of mesh points
+C     Z   - Set of collocation/quadrature points
+C     W   - Set of quadrature weights
+C     H   - Lagrangian interpolant
+C     D   - Derivative operator
+C     I   - Interpolation operator
+C     GL  - Gauss Legendre
+C     GLL - Gauss-Lobatto Legendre
+C     GJ  - Gauss Jacobi
+C     GLJ - Gauss-Lobatto Jacobi
+C
+C
+C     MAIN ROUTINES:
+C
+C     Points and weights:
+C
+C     ZWGL      Compute Gauss Legendre points and weights
+C     ZWGLL     Compute Gauss-Lobatto Legendre points and weights
+C     ZWGJ      Compute Gauss Jacobi points and weights (general)
+C     ZWGLJ     Compute Gauss-Lobatto Jacobi points and weights (general)
+C
+C     Lagrangian interpolants:
+C
+C     HGL       Compute Gauss Legendre Lagrangian interpolant
+C     HGLL      Compute Gauss-Lobatto Legendre Lagrangian interpolant
+C     HGJ       Compute Gauss Jacobi Lagrangian interpolant (general)
+C     HGLJ      Compute Gauss-Lobatto Jacobi Lagrangian interpolant (general)
+C
+C     Derivative operators:
+C
+C     DGLL      Compute Gauss-Lobatto Legendre derivative matrix
+C     DGLLGL    Compute derivative matrix for a staggered mesh (GLL->GL)
+C     DGJ       Compute Gauss Jacobi derivative matrix (general)
+C     DGLJ      Compute Gauss-Lobatto Jacobi derivative matrix (general)
+C     DGLJGJ    Compute derivative matrix for a staggered mesh (GLJ->GJ) (general)
+C
+C     Interpolation operators:
+C
+C     IGLM      Compute interpolation operator GL  -> M
+C     IGLLM     Compute interpolation operator GLL -> M
+C     IGJM      Compute interpolation operator GJ  -> M  (general)
+C     IGLJM     Compute interpolation operator GLJ -> M  (general)
+C
+C     Other:
+C
+C     PNLEG     Compute Legendre polynomial of degree N
+C     PNDLEG    Compute derivative of Legendre polynomial of degree N
+C
+C     Comments:
+C
+C     Note that many of the above routines exist in both single and
+C     double precision. If the name of the single precision routine is
+C     SUB, the double precision version is called SUBD. In most cases
+C     all the "low-level" arithmetic is done in double precision, even
+C     for the single precsion versions.
+C
+C     Useful references:
+C
+C [1] Gabor Szego: Orthogonal Polynomials, American Mathematical Society,
+C     Providence, Rhode Island, 1939.
+C [2] Abramowitz & Stegun: Handbook of Mathematical Functions,
+C     Dover, New York, 1972.
+C [3] Canuto, Hussaini, Quarteroni & Zang: Spectral Methods in Fluid
+C     Dynamics, Springer-Verlag, 1988.
+C
+C
+C==============================================================================
+C
+C--------------------------------------------------------------------
+      SUBROUTINE ZWGL (Z,W,NP)
+C--------------------------------------------------------------------
+C
+C     Generate NP Gauss Legendre points (Z) and weights (W)
+C     associated with Jacobi polynomial P(N)(alpha=0,beta=0).
+C     The polynomial degree N=NP-1.
+C     Z and W are in single precision, but all the arithmetic
+C     operations are done in double precision.
+C
+C--------------------------------------------------------------------
+      REAL Z(1),W(1)
+      ALPHA = 0.
+      BETA  = 0.
+      CALL ZWGJ (Z,W,NP,ALPHA,BETA)
+      RETURN
+      END
+C
+      SUBROUTINE ZWGLL (Z,W,NP)
+C--------------------------------------------------------------------
+C
+C     Generate NP Gauss-Lobatto Legendre points (Z) and weights (W)
+C     associated with Jacobi polynomial P(N)(alpha=0,beta=0).
+C     The polynomial degree N=NP-1.
+C     Z and W are in single precision, but all the arithmetic
+C     operations are done in double precision.
+C
+C--------------------------------------------------------------------
+      REAL Z(1),W(1)
+      ALPHA = 0.
+      BETA  = 0.
+      CALL ZWGLJ (Z,W,NP,ALPHA,BETA)
+      RETURN
+      END
+C
+      SUBROUTINE ZWGJ (Z,W,NP,ALPHA,BETA)
+C--------------------------------------------------------------------
+C
+C     Generate NP GAUSS JACOBI points (Z) and weights (W)
+C     associated with Jacobi polynomial P(N)(alpha>-1,beta>-1).
+C     The polynomial degree N=NP-1.
+C     Single precision version.
+C
+C--------------------------------------------------------------------
+      PARAMETER (NMAX=84)
+      PARAMETER (NZD = NMAX)
+      REAL*8  ZD(NZD),WD(NZD)
+      REAL Z(1),W(1),ALPHA,BETA
+C
+      NPMAX = NZD
+      IF (NP.GT.NPMAX) THEN
+         WRITE (6,*) 'Too large polynomial degree in ZWGJ'
+         WRITE (6,*) 'Maximum polynomial degree is',NMAX
+         WRITE (6,*) 'Here NP=',NP
+         call exitt
+      ENDIF
+      ALPHAD = ALPHA
+      BETAD  = BETA
+      CALL ZWGJD (ZD,WD,NP,ALPHAD,BETAD)
+      DO 100 I=1,NP
+         Z(I) = ZD(I)
+         W(I) = WD(I)
+ 100  CONTINUE
+      RETURN
+      END
+C
+      SUBROUTINE ZWGJD (Z,W,NP,ALPHA,BETA)
+C--------------------------------------------------------------------
+C
+C     Generate NP GAUSS JACOBI points (Z) and weights (W)
+C     associated with Jacobi polynomial P(N)(alpha>-1,beta>-1).
+C     The polynomial degree N=NP-1.
+C     Double precision version.
+C
+C--------------------------------------------------------------------
+      IMPLICIT REAL*8  (A-H,O-Z)
+      REAL*8  Z(1),W(1),ALPHA,BETA
+C
+      N     = NP-1
+      DN    = ((N))
+      ONE   = 1.
+      TWO   = 2.
+      APB   = ALPHA+BETA
+C
+      IF (NP.LE.0) THEN
+         WRITE (6,*) 'ZWGJD: Minimum number of Gauss points is 1',np
+         call exitt
+      ENDIF
+      IF ((ALPHA.LE.-ONE).OR.(BETA.LE.-ONE)) THEN
+         WRITE (6,*) 'ZWGJD: Alpha and Beta must be greater than -1'
+         call exitt
+      ENDIF
+C
+      IF (NP.EQ.1) THEN
+         Z(1) = (BETA-ALPHA)/(APB+TWO)
+         W(1) = GAMMAF(ALPHA+ONE)*GAMMAF(BETA+ONE)/GAMMAF(APB+TWO)
+     $          * TWO**(APB+ONE)
+         RETURN
+      ENDIF
+C
+      CALL JACG (Z,NP,ALPHA,BETA)
+C
+      NP1   = N+1
+      NP2   = N+2
+      DNP1  = ((NP1))
+      DNP2  = ((NP2))
+      FAC1  = DNP1+ALPHA+BETA+ONE
+      FAC2  = FAC1+DNP1
+      FAC3  = FAC2+ONE
+      FNORM = PNORMJ(NP1,ALPHA,BETA)
+      RCOEF = (FNORM*FAC2*FAC3)/(TWO*FAC1*DNP2)
+      DO 100 I=1,NP
+         CALL JACOBF (P,PD,PM1,PDM1,PM2,PDM2,NP2,ALPHA,BETA,Z(I))
+         W(I) = -RCOEF/(P*PDM1)
+ 100  CONTINUE
+      RETURN
+      END
+C
+      SUBROUTINE ZWGLJ (Z,W,NP,ALPHA,BETA)
+C--------------------------------------------------------------------
+C
+C     Generate NP GAUSS LOBATTO JACOBI points (Z) and weights (W)
+C     associated with Jacobi polynomial P(N)(alpha>-1,beta>-1).
+C     The polynomial degree N=NP-1.
+C     Single precision version.
+C
+C--------------------------------------------------------------------
+      PARAMETER (NMAX=84)
+      PARAMETER (NZD = NMAX)
+      REAL*8  ZD(NZD),WD(NZD)
+      REAL Z(1),W(1),ALPHA,BETA
+C
+      NPMAX = NZD
+      IF (NP.GT.NPMAX) THEN
+         WRITE (6,*) 'Too large polynomial degree in ZWGLJ'
+         WRITE (6,*) 'Maximum polynomial degree is',NMAX
+         WRITE (6,*) 'Here NP=',NP
+         call exitt
+      ENDIF
+      ALPHAD = ALPHA
+      BETAD  = BETA
+      CALL ZWGLJD (ZD,WD,NP,ALPHAD,BETAD)
+      DO 100 I=1,NP
+         Z(I) = ZD(I)
+         W(I) = WD(I)
+ 100  CONTINUE
+      RETURN
+      END
+C
+      SUBROUTINE ZWGLJD (Z,W,NP,ALPHA,BETA)
+C--------------------------------------------------------------------
+C
+C     Generate NP GAUSS LOBATTO JACOBI points (Z) and weights (W)
+C     associated with Jacobi polynomial P(N)(alpha>-1,beta>-1).
+C     The polynomial degree N=NP-1.
+C     Double precision version.
+C
+C--------------------------------------------------------------------
+      IMPLICIT REAL*8  (A-H,O-Z)
+      REAL*8  Z(NP),W(NP),ALPHA,BETA
+C
+      N     = NP-1
+      NM1   = N-1
+      ONE   = 1.
+      TWO   = 2.
+C
+      IF (NP.LE.1) THEN
+       WRITE (6,*) 'ZWGLJD: Minimum number of Gauss-Lobatto points is 2'
+       WRITE (6,*) 'ZWGLJD: alpha,beta:',alpha,beta,np
+       call exitt
+      ENDIF
+      IF ((ALPHA.LE.-ONE).OR.(BETA.LE.-ONE)) THEN
+         WRITE (6,*) 'ZWGLJD: Alpha and Beta must be greater than -1'
+         call exitt
+      ENDIF
+C
+      IF (NM1.GT.0) THEN
+         ALPG  = ALPHA+ONE
+         BETG  = BETA+ONE
+         CALL ZWGJD (Z(2),W(2),NM1,ALPG,BETG)
+      ENDIF
+      Z(1)  = -ONE
+      Z(NP) =  ONE
+      DO 100  I=2,NP-1
+         W(I) = W(I)/(ONE-Z(I)**2)
+ 100  CONTINUE
+      CALL JACOBF (P,PD,PM1,PDM1,PM2,PDM2,N,ALPHA,BETA,Z(1))
+      W(1)  = ENDW1 (N,ALPHA,BETA)/(TWO*PD)
+      CALL JACOBF (P,PD,PM1,PDM1,PM2,PDM2,N,ALPHA,BETA,Z(NP))
+      W(NP) = ENDW2 (N,ALPHA,BETA)/(TWO*PD)
+C
+      RETURN
+      END
+C
+      REAL*8  FUNCTION ENDW1 (N,ALPHA,BETA)
+      IMPLICIT REAL*8  (A-H,O-Z)
+      REAL*8  ALPHA,BETA
+      ZERO  = 0.
+      ONE   = 1.
+      TWO   = 2.
+      THREE = 3.
+      FOUR  = 4.
+      APB   = ALPHA+BETA
+      IF (N.EQ.0) THEN
+         ENDW1 = ZERO
+         RETURN
+      ENDIF
+      F1   = GAMMAF(ALPHA+TWO)*GAMMAF(BETA+ONE)/GAMMAF(APB+THREE)
+      F1   = F1*(APB+TWO)*TWO**(APB+TWO)/TWO
+      IF (N.EQ.1) THEN
+         ENDW1 = F1
+         RETURN
+      ENDIF
+      FINT1 = GAMMAF(ALPHA+TWO)*GAMMAF(BETA+ONE)/GAMMAF(APB+THREE)
+      FINT1 = FINT1*TWO**(APB+TWO)
+      FINT2 = GAMMAF(ALPHA+TWO)*GAMMAF(BETA+TWO)/GAMMAF(APB+FOUR)
+      FINT2 = FINT2*TWO**(APB+THREE)
+      F2    = (-TWO*(BETA+TWO)*FINT1 + (APB+FOUR)*FINT2)
+     $        * (APB+THREE)/FOUR
+      IF (N.EQ.2) THEN
+         ENDW1 = F2
+         RETURN
+      ENDIF
+      DO 100 I=3,N
+         DI   = ((I-1))
+         ABN  = ALPHA+BETA+DI
+         ABNN = ABN+DI
+         A1   = -(TWO*(DI+ALPHA)*(DI+BETA))/(ABN*ABNN*(ABNN+ONE))
+         A2   =  (TWO*(ALPHA-BETA))/(ABNN*(ABNN+TWO))
+         A3   =  (TWO*(ABN+ONE))/((ABNN+TWO)*(ABNN+ONE))
+         F3   =  -(A2*F2+A1*F1)/A3
+         F1   = F2
+         F2   = F3
+ 100  CONTINUE
+      ENDW1  = F3
+      RETURN
+      END
+C
+      REAL*8  FUNCTION ENDW2 (N,ALPHA,BETA)
+      IMPLICIT REAL*8  (A-H,O-Z)
+      REAL*8  ALPHA,BETA
+      ZERO  = 0.
+      ONE   = 1.
+      TWO   = 2.
+      THREE = 3.
+      FOUR  = 4.
+      APB   = ALPHA+BETA
+      IF (N.EQ.0) THEN
+         ENDW2 = ZERO
+         RETURN
+      ENDIF
+      F1   = GAMMAF(ALPHA+ONE)*GAMMAF(BETA+TWO)/GAMMAF(APB+THREE)
+      F1   = F1*(APB+TWO)*TWO**(APB+TWO)/TWO
+      IF (N.EQ.1) THEN
+         ENDW2 = F1
+         RETURN
+      ENDIF
+      FINT1 = GAMMAF(ALPHA+ONE)*GAMMAF(BETA+TWO)/GAMMAF(APB+THREE)
+      FINT1 = FINT1*TWO**(APB+TWO)
+      FINT2 = GAMMAF(ALPHA+TWO)*GAMMAF(BETA+TWO)/GAMMAF(APB+FOUR)
+      FINT2 = FINT2*TWO**(APB+THREE)
+      F2    = (TWO*(ALPHA+TWO)*FINT1 - (APB+FOUR)*FINT2)
+     $        * (APB+THREE)/FOUR
+      IF (N.EQ.2) THEN
+         ENDW2 = F2
+         RETURN
+      ENDIF
+      DO 100 I=3,N
+         DI   = ((I-1))
+         ABN  = ALPHA+BETA+DI
+         ABNN = ABN+DI
+         A1   =  -(TWO*(DI+ALPHA)*(DI+BETA))/(ABN*ABNN*(ABNN+ONE))
+         A2   =  (TWO*(ALPHA-BETA))/(ABNN*(ABNN+TWO))
+         A3   =  (TWO*(ABN+ONE))/((ABNN+TWO)*(ABNN+ONE))
+         F3   =  -(A2*F2+A1*F1)/A3
+         F1   = F2
+         F2   = F3
+ 100  CONTINUE
+      ENDW2  = F3
+      RETURN
+      END
+C
+      REAL*8  FUNCTION GAMMAF (X)
+      IMPLICIT REAL*8  (A-H,O-Z)
+      REAL*8  X
+      ZERO = 0.0
+      HALF = 0.5
+      ONE  = 1.0
+      TWO  = 2.0
+      FOUR = 4.0
+      PI   = FOUR*ATAN(ONE)
+      GAMMAF = ONE
+      IF (X.EQ.-HALF) GAMMAF = -TWO*SQRT(PI)
+      IF (X.EQ. HALF) GAMMAF =  SQRT(PI)
+      IF (X.EQ. ONE ) GAMMAF =  ONE
+      IF (X.EQ. TWO ) GAMMAF =  ONE
+      IF (X.EQ. 1.5  ) GAMMAF =  SQRT(PI)/2.
+      IF (X.EQ. 2.5) GAMMAF =  1.5*SQRT(PI)/2.
+      IF (X.EQ. 3.5) GAMMAF =  0.5*(2.5*(1.5*SQRT(PI)))
+      IF (X.EQ. 3. ) GAMMAF =  2.
+      IF (X.EQ. 4. ) GAMMAF = 6.
+      IF (X.EQ. 5. ) GAMMAF = 24.
+      IF (X.EQ. 6. ) GAMMAF = 120.
+      RETURN
+      END
+C
+      REAL*8  FUNCTION PNORMJ (N,ALPHA,BETA)
+      IMPLICIT REAL*8  (A-H,O-Z)
+      REAL*8  ALPHA,BETA
+      ONE   = 1.
+      TWO   = 2.
+      DN    = ((N))
+      CONST = ALPHA+BETA+ONE
+      IF (N.LE.1) THEN
+         PROD   = GAMMAF(DN+ALPHA)*GAMMAF(DN+BETA)
+         PROD   = PROD/(GAMMAF(DN)*GAMMAF(DN+ALPHA+BETA))
+         PNORMJ = PROD * TWO**CONST/(TWO*DN+CONST)
+         RETURN
+      ENDIF
+      PROD  = GAMMAF(ALPHA+ONE)*GAMMAF(BETA+ONE)
+      PROD  = PROD/(TWO*(ONE+CONST)*GAMMAF(CONST+ONE))
+      PROD  = PROD*(ONE+ALPHA)*(TWO+ALPHA)
+      PROD  = PROD*(ONE+BETA)*(TWO+BETA)
+      DO 100 I=3,N
+         DINDX = ((I))
+         FRAC  = (DINDX+ALPHA)*(DINDX+BETA)/(DINDX*(DINDX+ALPHA+BETA))
+         PROD  = PROD*FRAC
+ 100  CONTINUE
+      PNORMJ = PROD * TWO**CONST/(TWO*DN+CONST)
+      RETURN
+      END
+C
+      SUBROUTINE JACG (XJAC,NP,ALPHA,BETA)
+C--------------------------------------------------------------------
+C
+C     Compute NP Gauss points XJAC, which are the zeros of the
+C     Jacobi polynomial J(NP) with parameters ALPHA and BETA.
+C     ALPHA and BETA determines the specific type of Gauss points.
+C     Examples:
+C     ALPHA = BETA =  0.0  ->  Legendre points
+C     ALPHA = BETA = -0.5  ->  Chebyshev points
+C
+C--------------------------------------------------------------------
+      IMPLICIT REAL*8  (A-H,O-Z)
+      REAL*8  XJAC(1)
+      DATA KSTOP /10/
+      DATA EPS/1.0e-12/
+      N   = NP-1
+      one = 1.
+      DTH = 4.*ATAN(one)/(2.*((N))+2.)
+      DO 40 J=1,NP
+         IF (J.EQ.1) THEN
+            X = COS((2.*(((J))-1.)+1.)*DTH)
+         ELSE
+            X1 = COS((2.*(((J))-1.)+1.)*DTH)
+            X2 = XLAST
+            X  = (X1+X2)/2.
+         ENDIF
+         DO 30 K=1,KSTOP
+            CALL JACOBF (P,PD,PM1,PDM1,PM2,PDM2,NP,ALPHA,BETA,X)
+            RECSUM = 0.
+            JM = J-1
+            DO 29 I=1,JM
+               RECSUM = RECSUM+1./(X-XJAC(NP-I+1))
+ 29         CONTINUE
+            DELX = -P/(PD-RECSUM*P)
+            X    = X+DELX
+            IF (ABS(DELX) .LT. EPS) GOTO 31
+ 30      CONTINUE
+ 31      CONTINUE
+         XJAC(NP-J+1) = X
+         XLAST        = X
+ 40   CONTINUE
+      DO 200 I=1,NP
+         XMIN = 2.
+         DO 100 J=I,NP
+            IF (XJAC(J).LT.XMIN) THEN
+               XMIN = XJAC(J)
+               JMIN = J
+            ENDIF
+ 100     CONTINUE
+         IF (JMIN.NE.I) THEN
+            SWAP = XJAC(I)
+            XJAC(I) = XJAC(JMIN)
+            XJAC(JMIN) = SWAP
+         ENDIF
+ 200  CONTINUE
+      RETURN
+      END
+C
+      SUBROUTINE JACOBF (POLY,PDER,POLYM1,PDERM1,POLYM2,PDERM2,
+     $                   N,ALP,BET,X)
+C--------------------------------------------------------------------
+C
+C     Computes the Jacobi polynomial (POLY) and its derivative (PDER)
+C     of degree N at X.
+C
+C--------------------------------------------------------------------
+      IMPLICIT REAL*8  (A-H,O-Z)
+      APB  = ALP+BET
+      POLY = 1.
+      PDER = 0.
+      IF (N .EQ. 0) RETURN
+      POLYL = POLY
+      PDERL = PDER
+      POLY  = (ALP-BET+(APB+2.)*X)/2.
+      PDER  = (APB+2.)/2.
+      IF (N .EQ. 1) RETURN
+      DO 20 K=2,N
+         DK = ((K))
+         A1 = 2.*DK*(DK+APB)*(2.*DK+APB-2.)
+         A2 = (2.*DK+APB-1.)*(ALP**2-BET**2)
+         B3 = (2.*DK+APB-2.)
+         A3 = B3*(B3+1.)*(B3+2.)
+         A4 = 2.*(DK+ALP-1.)*(DK+BET-1.)*(2.*DK+APB)
+         POLYN  = ((A2+A3*X)*POLY-A4*POLYL)/A1
+         PDERN  = ((A2+A3*X)*PDER-A4*PDERL+A3*POLY)/A1
+         PSAVE  = POLYL
+         PDSAVE = PDERL
+         POLYL  = POLY
+         POLY   = POLYN
+         PDERL  = PDER
+         PDER   = PDERN
+ 20   CONTINUE
+      POLYM1 = POLYL
+      PDERM1 = PDERL
+      POLYM2 = PSAVE
+      PDERM2 = PDSAVE
+      RETURN
+      END
+C
+      REAL FUNCTION HGJ (II,Z,ZGJ,NP,ALPHA,BETA)
+C---------------------------------------------------------------------
+C
+C     Compute the value of the Lagrangian interpolant HGJ through
+C     the NP Gauss Jacobi points ZGJ at the point Z.
+C     Single precision version.
+C
+C---------------------------------------------------------------------
+      PARAMETER (NMAX=84)
+      PARAMETER (NZD = NMAX)
+      REAL*8  ZD,ZGJD(NZD)
+      REAL Z,ZGJ(1),ALPHA,BETA
+      NPMAX = NZD
+      IF (NP.GT.NPMAX) THEN
+         WRITE (6,*) 'Too large polynomial degree in HGJ'
+         WRITE (6,*) 'Maximum polynomial degree is',NMAX
+         WRITE (6,*) 'Here NP=',NP
+         call exitt
+      ENDIF
+      ZD = Z
+      DO 100 I=1,NP
+         ZGJD(I) = ZGJ(I)
+ 100  CONTINUE
+      ALPHAD = ALPHA
+      BETAD  = BETA
+      HGJ    = HGJD (II,ZD,ZGJD,NP,ALPHAD,BETAD)
+      RETURN
+      END
+C
+      REAL*8  FUNCTION HGJD (II,Z,ZGJ,NP,ALPHA,BETA)
+C---------------------------------------------------------------------
+C
+C     Compute the value of the Lagrangian interpolant HGJD through
+C     the NZ Gauss-Lobatto Jacobi points ZGJ at the point Z.
+C     Double precision version.
+C
+C---------------------------------------------------------------------
+      IMPLICIT REAL*8  (A-H,O-Z)
+      REAL*8  Z,ZGJ(1),ALPHA,BETA
+      EPS = 1.e-5
+      ONE = 1.
+      ZI  = ZGJ(II)
+      DZ  = Z-ZI
+      IF (ABS(DZ).LT.EPS) THEN
+         HGJD = ONE
+         RETURN
+      ENDIF
+      CALL JACOBF (PZI,PDZI,PM1,PDM1,PM2,PDM2,NP,ALPHA,BETA,ZI)
+      CALL JACOBF (PZ,PDZ,PM1,PDM1,PM2,PDM2,NP,ALPHA,BETA,Z)
+      HGJD  = PZ/(PDZI*(Z-ZI))
+      RETURN
+      END
+C
+      REAL FUNCTION HGLJ (II,Z,ZGLJ,NP,ALPHA,BETA)
+C---------------------------------------------------------------------
+C
+C     Compute the value of the Lagrangian interpolant HGLJ through
+C     the NZ Gauss-Lobatto Jacobi points ZGLJ at the point Z.
+C     Single precision version.
+C
+C---------------------------------------------------------------------
+      PARAMETER (NMAX=84)
+      PARAMETER (NZD = NMAX)
+      REAL*8  ZD,ZGLJD(NZD)
+      REAL Z,ZGLJ(1),ALPHA,BETA
+      NPMAX = NZD
+      IF (NP.GT.NPMAX) THEN
+         WRITE (6,*) 'Too large polynomial degree in HGLJ'
+         WRITE (6,*) 'Maximum polynomial degree is',NMAX
+         WRITE (6,*) 'Here NP=',NP
+         call exitt
+      ENDIF
+      ZD = Z
+      DO 100 I=1,NP
+         ZGLJD(I) = ZGLJ(I)
+ 100  CONTINUE
+      ALPHAD = ALPHA
+      BETAD  = BETA
+      HGLJ   = HGLJD (II,ZD,ZGLJD,NP,ALPHAD,BETAD)
+      RETURN
+      END
+C
+      REAL*8  FUNCTION HGLJD (I,Z,ZGLJ,NP,ALPHA,BETA)
+C---------------------------------------------------------------------
+C
+C     Compute the value of the Lagrangian interpolant HGLJD through
+C     the NZ Gauss-Lobatto Jacobi points ZJACL at the point Z.
+C     Double precision version.
+C
+C---------------------------------------------------------------------
+      IMPLICIT REAL*8  (A-H,O-Z)
+      REAL*8  Z,ZGLJ(1),ALPHA,BETA
+      EPS = 1.e-5
+      ONE = 1.
+      ZI  = ZGLJ(I)
+      DZ  = Z-ZI
+      IF (ABS(DZ).LT.EPS) THEN
+         HGLJD = ONE
+         RETURN
+      ENDIF
+      N      = NP-1
+      DN     = ((N))
+      EIGVAL = -DN*(DN+ALPHA+BETA+ONE)
+      CALL JACOBF (PI,PDI,PM1,PDM1,PM2,PDM2,N,ALPHA,BETA,ZI)
+      CONST  = EIGVAL*PI+ALPHA*(ONE+ZI)*PDI-BETA*(ONE-ZI)*PDI
+      CALL JACOBF (P,PD,PM1,PDM1,PM2,PDM2,N,ALPHA,BETA,Z)
+      HGLJD  = (ONE-Z**2)*PD/(CONST*(Z-ZI))
+      RETURN
+      END
+C
+      SUBROUTINE DGJ (D,DT,Z,NZ,NZD,ALPHA,BETA)
+C-----------------------------------------------------------------
+C
+C     Compute the derivative matrix D and its transpose DT
+C     associated with the Nth order Lagrangian interpolants
+C     through the NZ Gauss Jacobi points Z.
+C     Note: D and DT are square matrices.
+C     Single precision version.
+C
+C-----------------------------------------------------------------
+      PARAMETER (NMAX=84)
+      PARAMETER (NZDD = NMAX)
+      REAL*8  DD(NZDD,NZDD),DTD(NZDD,NZDD),ZD(NZDD)
+      REAL D(NZD,NZD),DT(NZD,NZD),Z(1),ALPHA,BETA
+C
+      IF (NZ.LE.0) THEN
+         WRITE (6,*) 'DGJ: Minimum number of Gauss points is 1'
+         call exitt
+      ENDIF
+      IF (NZ .GT. NMAX) THEN
+         WRITE (6,*) 'Too large polynomial degree in DGJ'
+         WRITE (6,*) 'Maximum polynomial degree is',NMAX
+         WRITE (6,*) 'Here Nz=',Nz
+         call exitt
+      ENDIF
+      IF ((ALPHA.LE.-1.).OR.(BETA.LE.-1.)) THEN
+         WRITE (6,*) 'DGJ: Alpha and Beta must be greater than -1'
+         call exitt
+      ENDIF
+      ALPHAD = ALPHA
+      BETAD  = BETA
+      DO 100 I=1,NZ
+         ZD(I) = Z(I)
+ 100  CONTINUE
+      CALL DGJD (DD,DTD,ZD,NZ,NZDD,ALPHAD,BETAD)
+      DO 200 I=1,NZ
+      DO 200 J=1,NZ
+         D(I,J)  = DD(I,J)
+         DT(I,J) = DTD(I,J)
+ 200  CONTINUE
+      RETURN
+      END
+C
+      SUBROUTINE DGJD (D,DT,Z,NZ,NZD,ALPHA,BETA)
+C-----------------------------------------------------------------
+C
+C     Compute the derivative matrix D and its transpose DT
+C     associated with the Nth order Lagrangian interpolants
+C     through the NZ Gauss Jacobi points Z.
+C     Note: D and DT are square matrices.
+C     Double precision version.
+C
+C-----------------------------------------------------------------
+      IMPLICIT REAL*8  (A-H,O-Z)
+      REAL*8  D(NZD,NZD),DT(NZD,NZD),Z(1),ALPHA,BETA
+      N    = NZ-1
+      DN   = ((N))
+      ONE  = 1.
+      TWO  = 2.
+C
+      IF (NZ.LE.1) THEN
+       WRITE (6,*) 'DGJD: Minimum number of Gauss-Lobatto points is 2'
+       call exitt
+      ENDIF
+      IF ((ALPHA.LE.-ONE).OR.(BETA.LE.-ONE)) THEN
+         WRITE (6,*) 'DGJD: Alpha and Beta must be greater than -1'
+         call exitt
+      ENDIF
+C
+      DO 200 I=1,NZ
+      DO 200 J=1,NZ
+         CALL JACOBF (PI,PDI,PM1,PDM1,PM2,PDM2,NZ,ALPHA,BETA,Z(I))
+         CALL JACOBF (PJ,PDJ,PM1,PDM1,PM2,PDM2,NZ,ALPHA,BETA,Z(J))
+         IF (I.NE.J) D(I,J) = PDI/(PDJ*(Z(I)-Z(J)))
+         IF (I.EQ.J) D(I,J) = ((ALPHA+BETA+TWO)*Z(I)+ALPHA-BETA)/
+     $                        (TWO*(ONE-Z(I)**2))
+         DT(J,I) = D(I,J)
+ 200  CONTINUE
+      RETURN
+      END
+C
+      SUBROUTINE DGLJ (D,DT,Z,NZ,NZD,ALPHA,BETA)
+C-----------------------------------------------------------------
+C
+C     Compute the derivative matrix D and its transpose DT
+C     associated with the Nth order Lagrangian interpolants
+C     through the NZ Gauss-Lobatto Jacobi points Z.
+C     Note: D and DT are square matrices.
+C     Single precision version.
+C
+C-----------------------------------------------------------------
+      PARAMETER (NMAX=84)
+      PARAMETER (NZDD = NMAX)
+      REAL*8  DD(NZDD,NZDD),DTD(NZDD,NZDD),ZD(NZDD)
+      REAL D(NZD,NZD),DT(NZD,NZD),Z(1),ALPHA,BETA
+C
+      IF (NZ.LE.1) THEN
+       WRITE (6,*) 'DGLJ: Minimum number of Gauss-Lobatto points is 2'
+       call exitt
+      ENDIF
+      IF (NZ .GT. NMAX) THEN
+         WRITE (6,*) 'Too large polynomial degree in DGLJ'
+         WRITE (6,*) 'Maximum polynomial degree is',NMAX
+         WRITE (6,*) 'Here NZ=',NZ
+         call exitt
+      ENDIF
+      IF ((ALPHA.LE.-1.).OR.(BETA.LE.-1.)) THEN
+         WRITE (6,*) 'DGLJ: Alpha and Beta must be greater than -1'
+         call exitt
+      ENDIF
+      ALPHAD = ALPHA
+      BETAD  = BETA
+      DO 100 I=1,NZ
+         ZD(I) = Z(I)
+ 100  CONTINUE
+      CALL DGLJD (DD,DTD,ZD,NZ,NZDD,ALPHAD,BETAD)
+      DO 200 I=1,NZ
+      DO 200 J=1,NZ
+         D(I,J)  = DD(I,J)
+         DT(I,J) = DTD(I,J)
+ 200  CONTINUE
+      RETURN
+      END
+C
+      SUBROUTINE DGLJD (D,DT,Z,NZ,NZD,ALPHA,BETA)
+C-----------------------------------------------------------------
+C
+C     Compute the derivative matrix D and its transpose DT
+C     associated with the Nth order Lagrangian interpolants
+C     through the NZ Gauss-Lobatto Jacobi points Z.
+C     Note: D and DT are square matrices.
+C     Double precision version.
+C
+C-----------------------------------------------------------------
+      IMPLICIT REAL*8  (A-H,O-Z)
+      REAL*8  D(NZD,NZD),DT(NZD,NZD),Z(1),ALPHA,BETA
+      N    = NZ-1
+      DN   = ((N))
+      ONE  = 1.
+      TWO  = 2.
+      EIGVAL = -DN*(DN+ALPHA+BETA+ONE)
+C
+      IF (NZ.LE.1) THEN
+       WRITE (6,*) 'DGLJD: Minimum number of Gauss-Lobatto points is 2'
+       call exitt
+      ENDIF
+      IF ((ALPHA.LE.-ONE).OR.(BETA.LE.-ONE)) THEN
+         WRITE (6,*) 'DGLJD: Alpha and Beta must be greater than -1'
+         call exitt
+      ENDIF
+C
+      DO 200 I=1,NZ
+      DO 200 J=1,NZ
+         CALL JACOBF (PI,PDI,PM1,PDM1,PM2,PDM2,N,ALPHA,BETA,Z(I))
+         CALL JACOBF (PJ,PDJ,PM1,PDM1,PM2,PDM2,N,ALPHA,BETA,Z(J))
+         CI = EIGVAL*PI-(BETA*(ONE-Z(I))-ALPHA*(ONE+Z(I)))*PDI
+         CJ = EIGVAL*PJ-(BETA*(ONE-Z(J))-ALPHA*(ONE+Z(J)))*PDJ
+         IF (I.NE.J) D(I,J) = CI/(CJ*(Z(I)-Z(J)))
+         IF ((I.EQ.J).AND.(I.NE.1).AND.(I.NE.NZ))
+     $   D(I,J) = (ALPHA*(ONE+Z(I))-BETA*(ONE-Z(I)))/
+     $            (TWO*(ONE-Z(I)**2))
+         IF ((I.EQ.J).AND.(I.EQ.1))
+     $   D(I,J) =  (EIGVAL+ALPHA)/(TWO*(BETA+TWO))
+         IF ((I.EQ.J).AND.(I.EQ.NZ))
+     $   D(I,J) = -(EIGVAL+BETA)/(TWO*(ALPHA+TWO))
+         DT(J,I) = D(I,J)
+ 200  CONTINUE
+      RETURN
+      END
+C
+      SUBROUTINE DGLL (D,DT,Z,NZ,NZD)
+C-----------------------------------------------------------------
+C
+C     Compute the derivative matrix D and its transpose DT
+C     associated with the Nth order Lagrangian interpolants
+C     through the NZ Gauss-Lobatto Legendre points Z.
+C     Note: D and DT are square matrices.
+C
+C-----------------------------------------------------------------
+      PARAMETER (NMAX=84)
+      REAL D(NZD,NZD),DT(NZD,NZD),Z(1)
+      N  = NZ-1
+      IF (NZ .GT. NMAX) THEN
+         WRITE (6,*) 'Subroutine DGLL'
+         WRITE (6,*) 'Maximum polynomial degree =',NMAX
+         WRITE (6,*) 'Polynomial degree         =',NZ
+      ENDIF
+      IF (NZ .EQ. 1) THEN
+         D(1,1) = 0.
+         RETURN
+      ENDIF
+      FN = (N)
+      d0 = FN*(FN+1.)/4.
+      DO 200 I=1,NZ
+      DO 200 J=1,NZ
+         D(I,J) = 0.
+         IF  (I.NE.J) D(I,J) = PNLEG(Z(I),N)/
+     $                        (PNLEG(Z(J),N)*(Z(I)-Z(J)))
+         IF ((I.EQ.J).AND.(I.EQ.1))  D(I,J) = -d0
+         IF ((I.EQ.J).AND.(I.EQ.NZ)) D(I,J) =  d0
+         DT(J,I) = D(I,J)
+ 200  CONTINUE
+      RETURN
+      END
+C
+      REAL FUNCTION HGLL (I,Z,ZGLL,NZ)
+C---------------------------------------------------------------------
+C
+C     Compute the value of the Lagrangian interpolant L through
+C     the NZ Gauss-Lobatto Legendre points ZGLL at the point Z.
+C
+C---------------------------------------------------------------------
+      REAL ZGLL(1)
+      EPS = 1.E-5
+      DZ = Z - ZGLL(I)
+      IF (ABS(DZ) .LT. EPS) THEN
+         HGLL = 1.
+         RETURN
+      ENDIF
+      N = NZ - 1
+      ALFAN = (N)*((N)+1.)
+      HGLL = - (1.-Z*Z)*PNDLEG(Z,N)/
+     $         (ALFAN*PNLEG(ZGLL(I),N)*(Z-ZGLL(I)))
+      RETURN
+      END
+C
+      REAL FUNCTION HGL (I,Z,ZGL,NZ)
+C---------------------------------------------------------------------
+C
+C     Compute the value of the Lagrangian interpolant HGL through
+C     the NZ Gauss Legendre points ZGL at the point Z.
+C
+C---------------------------------------------------------------------
+      REAL ZGL(1)
+      EPS = 1.E-5
+      DZ = Z - ZGL(I)
+      IF (ABS(DZ) .LT. EPS) THEN
+         HGL = 1.
+         RETURN
+      ENDIF
+      N = NZ-1
+      HGL = PNLEG(Z,NZ)/(PNDLEG(ZGL(I),NZ)*(Z-ZGL(I)))
+      RETURN
+      END
+C
+      REAL FUNCTION PNLEG (Z,N)
+C---------------------------------------------------------------------
+C
+C     Compute the value of the Nth order Legendre polynomial at Z.
+C     (Simpler than JACOBF)
+C     Based on the recursion formula for the Legendre polynomials.
+C
+C---------------------------------------------------------------------
+C
+C     This next statement is to overcome the underflow bug in the i860.  
+C     It can be removed at a later date.  11 Aug 1990   pff.
+C
+      IF(ABS(Z) .LT. 1.0E-25) Z = 0.0
+C
+      P1   = 1.
+      IF (N.EQ.0) THEN
+         PNLEG = P1
+         RETURN
+      ENDIF
+      P2   = Z
+      P3   = P2
+      DO 10 K = 1, N-1
+         FK  = (K)
+         P3  = ((2.*FK+1.)*Z*P2 - FK*P1)/(FK+1.)
+         P1  = P2
+         P2  = P3
+ 10   CONTINUE
+      PNLEG = P3
+      if (n.eq.0) pnleg = 1.
+      RETURN
+      END
+C
+      REAL FUNCTION PNDLEG (Z,N)
+C----------------------------------------------------------------------
+C
+C     Compute the derivative of the Nth order Legendre polynomial at Z.
+C     (Simpler than JACOBF)
+C     Based on the recursion formula for the Legendre polynomials.
+C
+C----------------------------------------------------------------------
+      P1   = 1.
+      P2   = Z
+      P1D  = 0.
+      P2D  = 1.
+      P3D  = 1.
+      DO 10 K = 1, N-1
+         FK  = (K)
+         P3  = ((2.*FK+1.)*Z*P2 - FK*P1)/(FK+1.)
+         P3D = ((2.*FK+1.)*P2 + (2.*FK+1.)*Z*P2D - FK*P1D)/(FK+1.)
+         P1  = P2
+         P2  = P3
+         P1D = P2D
+         P2D = P3D
+ 10   CONTINUE
+      PNDLEG = P3D
+      IF (N.eq.0) pndleg = 0.
+      RETURN
+      END
+C
+      SUBROUTINE DGLLGL (D,DT,ZM1,ZM2,IM12,NZM1,NZM2,ND1,ND2)
+C-----------------------------------------------------------------------
+C
+C     Compute the (one-dimensional) derivative matrix D and its
+C     transpose DT associated with taking the derivative of a variable
+C     expanded on a Gauss-Lobatto Legendre mesh (M1), and evaluate its
+C     derivative on a Guass Legendre mesh (M2).
+C     Need the one-dimensional interpolation operator IM12
+C     (see subroutine IGLLGL).
+C     Note: D and DT are rectangular matrices.
+C
+C-----------------------------------------------------------------------
+      REAL D(ND2,ND1), DT(ND1,ND2), ZM1(ND1), ZM2(ND2), IM12(ND2,ND1)
+      IF (NZM1.EQ.1) THEN
+        D (1,1) = 0.
+        DT(1,1) = 0.
+        RETURN
+      ENDIF
+      EPS = 1.E-6
+      NM1 = NZM1-1
+      DO 10 IP = 1, NZM2
+         DO 10 JQ = 1, NZM1
+            ZP = ZM2(IP)
+            ZQ = ZM1(JQ)
+            IF ((ABS(ZP) .LT. EPS).AND.(ABS(ZQ) .LT. EPS)) THEN
+                D(IP,JQ) = 0.
+            ELSE
+                D(IP,JQ) = (PNLEG(ZP,NM1)/PNLEG(ZQ,NM1)
+     $                     -IM12(IP,JQ))/(ZP-ZQ)
+            ENDIF
+            DT(JQ,IP) = D(IP,JQ)
+ 10   CONTINUE
+      RETURN
+      END
+C
+      SUBROUTINE DGLJGJ (D,DT,ZGL,ZG,IGLG,NPGL,NPG,ND1,ND2,ALPHA,BETA)
+C-----------------------------------------------------------------------
+C
+C     Compute the (one-dimensional) derivative matrix D and its
+C     transpose DT associated with taking the derivative of a variable
+C     expanded on a Gauss-Lobatto Jacobi mesh (M1), and evaluate its
+C     derivative on a Guass Jacobi mesh (M2).
+C     Need the one-dimensional interpolation operator IM12
+C     (see subroutine IGLJGJ).
+C     Note: D and DT are rectangular matrices.
+C     Single precision version.
+C
+C-----------------------------------------------------------------------
+      REAL D(ND2,ND1), DT(ND1,ND2), ZGL(ND1), ZG(ND2), IGLG(ND2,ND1)
+      PARAMETER (NMAX=84)
+      PARAMETER (NDD = NMAX)
+      REAL*8  DD(NDD,NDD), DTD(NDD,NDD)
+      REAL*8  ZGD(NDD), ZGLD(NDD), IGLGD(NDD,NDD)
+      REAL*8  ALPHAD, BETAD
+C
+      IF (NPGL.LE.1) THEN
+       WRITE(6,*) 'DGLJGJ: Minimum number of Gauss-Lobatto points is 2'
+       call exitt
+      ENDIF
+      IF (NPGL.GT.NMAX) THEN
+         WRITE(6,*) 'Polynomial degree too high in DGLJGJ'
+         WRITE(6,*) 'Maximum polynomial degree is',NMAX
+         WRITE(6,*) 'Here NPGL=',NPGL
+         call exitt
+      ENDIF
+      IF ((ALPHA.LE.-1.).OR.(BETA.LE.-1.)) THEN
+         WRITE(6,*) 'DGLJGJ: Alpha and Beta must be greater than -1'
+         call exitt
+      ENDIF
+C
+      ALPHAD = ALPHA
+      BETAD  = BETA
+      DO 100 I=1,NPG
+         ZGD(I) = ZG(I)
+         DO 100 J=1,NPGL
+            IGLGD(I,J) = IGLG(I,J)
+ 100  CONTINUE
+      DO 200 I=1,NPGL
+         ZGLD(I) = ZGL(I)
+ 200  CONTINUE
+      CALL DGLJGJD (DD,DTD,ZGLD,ZGD,IGLGD,NPGL,NPG,NDD,NDD,ALPHAD,BETAD)
+      DO 300 I=1,NPG
+      DO 300 J=1,NPGL
+         D(I,J)  = DD(I,J)
+         DT(J,I) = DTD(J,I)
+ 300  CONTINUE
+      RETURN
+      END
+C
+      SUBROUTINE DGLJGJD (D,DT,ZGL,ZG,IGLG,NPGL,NPG,ND1,ND2,ALPHA,BETA)
+C-----------------------------------------------------------------------
+C
+C     Compute the (one-dimensional) derivative matrix D and its
+C     transpose DT associated with taking the derivative of a variable
+C     expanded on a Gauss-Lobatto Jacobi mesh (M1), and evaluate its
+C     derivative on a Guass Jacobi mesh (M2).
+C     Need the one-dimensional interpolation operator IM12
+C     (see subroutine IGLJGJ).
+C     Note: D and DT are rectangular matrices.
+C     Double precision version.
+C
+C-----------------------------------------------------------------------
+      IMPLICIT REAL*8  (A-H,O-Z)
+      REAL*8  D(ND2,ND1), DT(ND1,ND2), ZGL(ND1), ZG(ND2)
+      REAL*8  IGLG(ND2,ND1), ALPHA, BETA
+C
+      IF (NPGL.LE.1) THEN
+       WRITE(6,*) 'DGLJGJD: Minimum number of Gauss-Lobatto points is 2'
+       call exitt
+      ENDIF
+      IF ((ALPHA.LE.-1.).OR.(BETA.LE.-1.)) THEN
+         WRITE(6,*) 'DGLJGJD: Alpha and Beta must be greater than -1'
+         call exitt
+      ENDIF
+C
+      EPS    = 1.e-6
+      ONE    = 1.
+      TWO    = 2.
+      NGL    = NPGL-1
+      DN     = ((NGL))
+      EIGVAL = -DN*(DN+ALPHA+BETA+ONE)
+C
+      DO 100 I=1,NPG
+      DO 100 J=1,NPGL
+         DZ = ABS(ZG(I)-ZGL(J))
+         IF (DZ.LT.EPS) THEN
+            D(I,J) = (ALPHA*(ONE+ZG(I))-BETA*(ONE-ZG(I)))/
+     $               (TWO*(ONE-ZG(I)**2))
+         ELSE
+            CALL JACOBF (PI,PDI,PM1,PDM1,PM2,PDM2,NGL,ALPHA,BETA,ZG(I))
+            CALL JACOBF (PJ,PDJ,PM1,PDM1,PM2,PDM2,NGL,ALPHA,BETA,ZGL(J))
+            FACI   = ALPHA*(ONE+ZG(I))-BETA*(ONE-ZG(I))
+            FACJ   = ALPHA*(ONE+ZGL(J))-BETA*(ONE-ZGL(J))
+            CONST  = EIGVAL*PJ+FACJ*PDJ
+            D(I,J) = ((EIGVAL*PI+FACI*PDI)*(ZG(I)-ZGL(J))
+     $               -(ONE-ZG(I)**2)*PDI)/(CONST*(ZG(I)-ZGL(J))**2)
+         ENDIF
+         DT(J,I) = D(I,J)
+ 100  CONTINUE
+      RETURN
+      END
+C
+      SUBROUTINE IGLM (I12,IT12,Z1,Z2,NZ1,NZ2,ND1,ND2)
+C----------------------------------------------------------------------
+C
+C     Compute the one-dimensional interpolation operator (matrix) I12
+C     ands its transpose IT12 for interpolating a variable from a
+C     Gauss Legendre mesh (1) to a another mesh M (2).
+C     Z1 : NZ1 Gauss Legendre points.
+C     Z2 : NZ2 points on mesh M.
+C
+C--------------------------------------------------------------------
+      REAL I12(ND2,ND1),IT12(ND1,ND2),Z1(ND1),Z2(ND2)
+      IF (NZ1 .EQ. 1) THEN
+         I12 (1,1) = 1.
+         IT12(1,1) = 1.
+         RETURN
+      ENDIF
+      DO 10 I=1,NZ2
+         ZI = Z2(I)
+         DO 10 J=1,NZ1
+            I12 (I,J) = HGL(J,ZI,Z1,NZ1)
+            IT12(J,I) = I12(I,J)
+ 10   CONTINUE
+      RETURN
+      END
+c
+      SUBROUTINE IGLLM (I12,IT12,Z1,Z2,NZ1,NZ2,ND1,ND2)
+C----------------------------------------------------------------------
+C
+C     Compute the one-dimensional interpolation operator (matrix) I12
+C     ands its transpose IT12 for interpolating a variable from a
+C     Gauss-Lobatto Legendre mesh (1) to a another mesh M (2).
+C     Z1 : NZ1 Gauss-Lobatto Legendre points.
+C     Z2 : NZ2 points on mesh M.
+C
+C--------------------------------------------------------------------
+      REAL I12(ND2,ND1),IT12(ND1,ND2),Z1(ND1),Z2(ND2)
+      IF (NZ1 .EQ. 1) THEN
+         I12 (1,1) = 1.
+         IT12(1,1) = 1.
+         RETURN
+      ENDIF
+      DO 10 I=1,NZ2
+         ZI = Z2(I)
+         DO 10 J=1,NZ1
+            I12 (I,J) = HGLL(J,ZI,Z1,NZ1)
+            IT12(J,I) = I12(I,J)
+ 10   CONTINUE
+      RETURN
+      END
+C
+      SUBROUTINE IGJM (I12,IT12,Z1,Z2,NZ1,NZ2,ND1,ND2,ALPHA,BETA)
+C----------------------------------------------------------------------
+C
+C     Compute the one-dimensional interpolation operator (matrix) I12
+C     ands its transpose IT12 for interpolating a variable from a
+C     Gauss Jacobi mesh (1) to a another mesh M (2).
+C     Z1 : NZ1 Gauss Jacobi points.
+C     Z2 : NZ2 points on mesh M.
+C     Single precision version.
+C
+C--------------------------------------------------------------------
+      REAL I12(ND2,ND1),IT12(ND1,ND2),Z1(ND1),Z2(ND2)
+      IF (NZ1 .EQ. 1) THEN
+         I12 (1,1) = 1.
+         IT12(1,1) = 1.
+         RETURN
+      ENDIF
+      DO 10 I=1,NZ2
+         ZI = Z2(I)
+         DO 10 J=1,NZ1
+            I12 (I,J) = HGJ(J,ZI,Z1,NZ1,ALPHA,BETA)
+            IT12(J,I) = I12(I,J)
+ 10   CONTINUE
+      RETURN
+      END
+c
+      SUBROUTINE IGLJM (I12,IT12,Z1,Z2,NZ1,NZ2,ND1,ND2,ALPHA,BETA)
+C----------------------------------------------------------------------
+C
+C     Compute the one-dimensional interpolation operator (matrix) I12
+C     ands its transpose IT12 for interpolating a variable from a
+C     Gauss-Lobatto Jacobi mesh (1) to a another mesh M (2).
+C     Z1 : NZ1 Gauss-Lobatto Jacobi points.
+C     Z2 : NZ2 points on mesh M.
+C     Single precision version.
+C
+C--------------------------------------------------------------------
+      REAL I12(ND2,ND1),IT12(ND1,ND2),Z1(ND1),Z2(ND2)
+      IF (NZ1 .EQ. 1) THEN
+         I12 (1,1) = 1.
+         IT12(1,1) = 1.
+         RETURN
+      ENDIF
+      DO 10 I=1,NZ2
+         ZI = Z2(I)
+         DO 10 J=1,NZ1
+            I12 (I,J) = HGLJ(J,ZI,Z1,NZ1,ALPHA,BETA)
+            IT12(J,I) = I12(I,J)
+ 10   CONTINUE
+      RETURN
+      END
diff --git a/src/timers.c b/src/timers.c
new file mode 100644
index 0000000..f40bf91
--- /dev/null
+++ b/src/timers.c
@@ -0,0 +1,24 @@
+
+#include <time.h>  /* for struct timespec */
+
+double fclock_gettime(void) {
+
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  double timeval = (double) (ts.tv_sec + ts.tv_nsec*1.0E-9);
+
+  return timeval;
+}
+
+double fclock_gettime_( void ) __attribute__((alias("fclock_gettime")));
+
+#ifdef BGQTIMER
+#include <hwi/include/bqc/A2_inlines.h>
+
+double ReadTimeBase_Double( void ) {
+  return (double) GetTimeBase();
+}
+
+double readtimebase_double( void ) __attribute__((alias("ReadTimeBase_Double")));
+double readtimebase_double_( void ) __attribute__((alias("ReadTimeBase_Double")));
+#endif
diff --git a/test/example1/SIZE b/test/example1/SIZE
new file mode 100644
index 0000000..941aedc
--- /dev/null
+++ b/test/example1/SIZE
@@ -0,0 +1,17 @@
+C     Dimension file to be included
+
+      parameter (ldim=3)                      ! dimension
+      parameter (lx1=12,ly1=lx1,lz1=lx1)      ! polynomial order
+
+      parameter (lp =49152)                 ! max number of processors
+      parameter (lelt= 512)                    ! max number of elements, per proc
+
+      parameter (lelg=lelt*lp)                ! max total elements in a test
+      parameter (lelx=lelg,lely=1,lelz=1)     ! max elements in each direction
+
+      parameter (ldimt=1,ldimt1=ldimt+1)      ! used in 'include' files
+
+
+      common /dimn/ nelx,nely,nelz,nelt       ! local element common block
+     $            , nx1,ny1,nz1,ndim,nfield,nid
+
diff --git a/test/example1/data.rea b/test/example1/data.rea
new file mode 100644
index 0000000..36077f4
--- /dev/null
+++ b/test/example1/data.rea
@@ -0,0 +1,5 @@
+.true. = ifbrick               ! brick or linear geometry
+512 512 1  = iel0,ielN,ielD (per processor)  ! range of number of elements per proc.
+ 9  12 3 = nx0,nxN,nxD         ! poly. order range for nx1
+ 1  1  1 = npx, npy, npz       ! processor distribution in x,y,z
+ 1  1  1 = mx, my, mz          ! local element distribution in x,y,z
diff --git a/test/example1/makefile.template b/test/example1/makefile.template
new file mode 100644
index 0000000..81d8805
--- /dev/null
+++ b/test/example1/makefile.template
@@ -0,0 +1,140 @@
+BINNAME=nekbone
+CASENAME=
+CASEDIR=
+S=
+J:=$S/jl
+OPT_INCDIR:=./
+OBJDIR=obj
+IFMPI= 
+F77=
+CC=
+P=
+PPPO=
+PPS=
+G=
+OPT_FLAGS_STD=
+USR=
+USR_LFLAGS=
+
+################################################################################
+
+lFLAGS = $(USR_LFLAGS)
+
+PPS_F = $(patsubst %,$(PPPO)-D%,$(PPS))
+PPS_C = $(patsubst %,-D%,$(PPS))
+
+#NEW #########################################################################
+EXTRA = cg.o driver.o math.o omp.o mxm_wrapper.o prox_dssum.o prox_setup.o semhat.o \
+speclib.o timers.o
+
+################################################################################
+# MXM 
+MXM = mxm_std.o
+
+# JL Routines ###################################################################
+JO  = jl_
+JL := -DPREFIX=jl_
+
+JLCORE = $(JO)gs.o $(JO)sort.o $(JO)sarray_transfer.o $(JO)sarray_sort.o \
+$(JO)gs_local.o $(JO)crystal.o $(JO)comm.o $(JO)tensor.o $(JO)fail.o \
+$(JO)fcrystal.o
+
+COMM_MPI := comm_mpi.o
+ifeq ($(IFMPI),false)
+  COMM_MPI := ${COMM_MPI} mpi_dummy.o
+endif
+
+ifeq ($(IFMPI),false)
+	DUMMY:= $(shell cp $S/mpi_dummy.h $S/mpif.h) 
+else
+	DUMMY:= $(shell rm -rf $S/mpif.h) 
+endif
+
+#####################################################################################
+TMP0 = $(EXTRA) $(COMM_MPI) $(MXM)
+NOBJS_F0 = $(patsubst %,$(OBJDIR)/%,$(TMP0))
+TMP0c = $(JLCORE)
+NOBJS_C0 = $(patsubst %,$(OBJDIR)/%,$(TMP0c))
+
+NOBJS0 = $(NOBJS_F0) $(NOBJS_C0)
+##############################################################################
+
+L0 = $(G) -O0
+L2 = $(G) $(OPT_FLAGS_STD)
+L3 = $(G) $(OPT_FLAGS_STD)
+L4 = $(L3)
+
+FL0   = $(L0) $(P) $(PPS_F) -I$(CASEDIR) -I$S -I$(OPT_INCDIR)
+FL2i4 = $(L2)      $(PPS_F) -I$(CASEDIR) -I$S -I$(OPT_INCDIR)
+FL2   = $(L2) $(P) $(PPS_F) -I$(CASEDIR) -I$S -I$(OPT_INCDIR)
+FL3   = $(L3) $(P) $(PPS_F) -I$(CASEDIR) -I$S -I$(OPT_INCDIR)
+FL4   = $(L4) $(P) $(PPS_F) -I$(CASEDIR) -I$S -I$(OPT_INCDIR)
+
+cFL0   = $(L0) $(PPS_C) 
+cFL2   = $(L2) $(PPS_C) 
+cFL3   = $(L3) $(PPS_C) 
+cFL4   = $(L4) $(PPS_C) 
+################################################################################
+all : nekbone
+
+objdir: 
+	@mkdir $(OBJDIR) 2>/dev/null; cat /dev/null 
+
+nekbone: 	objdir $(NOBJS0)
+	$(F77) -o ${BINNAME} $G $(NOBJS0) $(lFLAGS)
+	@if test -f ${BINNAME}; then \
+	echo "#############################################################"; \
+	echo "#                  Compilation successful!                  #"; \
+	echo "#############################################################"; \
+        size ${BINNAME}; \
+        echo ""; \
+	else \
+	echo -e "\033[1;31;38m" "ERROR: Compilation failed!"; \
+	echo -e "\033[0m"; \
+	fi
+ifeq ($(IFMPI),false) 
+	@rm -rf $S/mpif.h
+endif
+
+clean:
+	rm -rf ./obj ${BINNAME}
+ifeq ($(IFMPI),false) 
+	@rm -rf $S/mpif.h
+endif
+
+$(NOBJS_F0) : SIZE
+# CORE      ############################################################################
+$(OBJDIR)/cg.o		:$S/cg.f;			$(F77) -c $(FL4) $< -o $@
+$(OBJDIR)/driver.o	:$S/driver.f;			$(F77) -c $(FL2) $< -o $@
+$(OBJDIR)/math.o	:$S/math.f;			$(F77) -c $(FL4) $< -o $@
+$(OBJDIR)/omp.o		:$S/omp.f;			$(F77) -c $(FL4) $< -o $@
+$(OBJDIR)/prox_dssum.o  :$S/prox_dssum.f;		$(F77) -c $(FL2) $< -o $@
+$(OBJDIR)/prox_setup.o	:$S/prox_setup.f;		$(F77) -c $(FL4) $< -o $@
+$(OBJDIR)/semhat.o	:$S/semhat.f;			$(F77) -c $(FL4) $< -o $@
+$(OBJDIR)/speclib.o	:$S/speclib.f;			$(F77) -c $(FL2) $< -o $@
+$(OBJDIR)/blas.o        :$S/blas.f; 		        $(F77) -c $(FL2i4) $< -o $@
+$(OBJDIR)/byte_mpi.o	:$S/byte_mpi.f;			$(F77) -c $(FL2) $< -o $@
+$(OBJDIR)/comm_mpi.o	:$S/comm_mpi.f;			$(F77) -c $(FL2) $< -o $@
+$(OBJDIR)/mpi_dummy.o	:$S/mpi_dummy.f;		$(F77) -c $(FL2) $< -o $@
+# MXM       ############################################################################
+$(OBJDIR)/mxm_wrapper.o	  :$S/mxm_wrapper.f;		$(F77) -c $(FL2) $< -o $@ 
+$(OBJDIR)/mxm_std.o	  :$S/mxm_std.f;		$(F77) -c $(FL4) $< -o $@
+$(OBJDIR)/bg_aligned3.o	  :$S/bg_aligned3.s;		$(CC) -c $< -o $@
+$(OBJDIR)/bg_mxm3.o	  :$S/bg_mxm3.s;		$(CC) -c $< -o $@
+$(OBJDIR)/bg_mxm44.o	  :$S/bg_mxm44.s;		$(CC) -c $< -o $@
+$(OBJDIR)/bg_mxm44_uneven.o :$S/bg_mxm44_uneven.s;	$(CC) -c $< -o $@
+$(OBJDIR)/k10_mxm.o	  :$S/k10_mxm.c;		$(CC)  -c $(cFL2) $(JL) $< -o $@
+# C Files ##################################################################################
+$(OBJDIR)/byte.o                 :$S/byte.c;              $(CC) -c $(cFL2) $(JL) $< -o $@
+$(OBJDIR)/chelpers.o             :$S/chelpers.c;          $(CC) -c $(cFL2) $(JL) $< -o $@
+$(OBJDIR)/timers.o               :$S/timers.c;            $(CC) -c $(cFL2) $(JL) $< -o $@
+$(OBJDIR)/$(JO)fail.o            :$(J)/fail.c;            $(CC) -c $(cFL2) $(JL) $< -o $@
+$(OBJDIR)/$(JO)tensor.o          :$(J)/tensor.c;          $(CC) -c $(cFL2) $(JL) $< -o $@
+$(OBJDIR)/$(JO)sort.o            :$(J)/sort.c;            $(CC) -c $(cFL2) $(JL) $< -o $@
+$(OBJDIR)/$(JO)sarray_sort.o     :$(J)/sarray_sort.c;     $(CC) -c $(cFL2) $(JL) $< -o $@
+$(OBJDIR)/$(JO)comm.o            :$(J)/comm.c;            $(CC) -c $(cFL2) $(JL) $< -o $@
+$(OBJDIR)/$(JO)crystal.o         :$(J)/crystal.c;         $(CC) -c $(cFL2) $(JL) $< -o $@
+$(OBJDIR)/$(JO)sarray_transfer.o :$(J)/sarray_transfer.c; $(CC) -c $(cFL2) $(JL) $< -o $@
+$(OBJDIR)/$(JO)fcrystal.o        :$(J)/fcrystal.c;        $(CC) -c $(cFL2) $(JL) $< -o $@
+$(OBJDIR)/$(JO)gs.o              :$(J)/gs.c;              $(CC) -c $(cFL2) $(JL) $< -o $@
+$(OBJDIR)/$(JO)gs_local.o        :$(J)/gs_local.c;        $(CC) -c $(cFL2) $(JL) $< -o $@
diff --git a/test/example1/makenek b/test/example1/makenek
new file mode 100755
index 0000000..228bcd1
--- /dev/null
+++ b/test/example1/makenek
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Nek5000 build config file
+# (c) 2008,2009,2010 UCHICAGO ARGONNE, LLC
+
+# source path 
+SOURCE_ROOT="$HOME/Nekbone/src" 
+
+# Fortran compiler
+F77="mpif77"
+
+# C compiler
+CC="mpicc"
+
+# pre-processor symbol list 
+# (set PPLIST=? to get a list of available symbols)
+# NEKCOMM, NEKDLAY, BG, MGRID
+#PPLIST="?"
+
+
+# OPTIONAL SETTINGS
+# -----------------
+
+# enable MPI (default true)
+#IFMPI="false"
+
+# auxilliary files to compile
+# NOTE: source files have to located in the same directory as makenek
+#       a makefile_usr.inc has to be provided containing the build rules 
+#USR="foo.o"
+
+# linking flags
+#USR_LFLAGS="-L/usr/lib -lfoo"
+
+# generic compiler flags
+#G="-g"
+
+# optimization flags
+#OPT_FLAGS_STD=""
+#OPT_FLAGS_MAG=""
+
+###############################################################################
+# DONT'T TOUCH WHAT FOLLOWS !!!
+###############################################################################
+# assign version tag
+mver=1
+# overwrite source path with optional 2nd argument
+if [ -d $2 ] && [ $# -eq 2 ]; then
+  SOURCE_ROOT="$2"
+  echo "change source code directory to: ", $SOURCE_ROOT
+fi
+# do some checks and create makefile
+source $SOURCE_ROOT/makenek.inc
+# compile
+make -j4 -f makefile 2>&1 | tee compiler.out
+exit 0
diff --git a/test/example1/makenek-bgq b/test/example1/makenek-bgq
new file mode 100755
index 0000000..5210815
--- /dev/null
+++ b/test/example1/makenek-bgq
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Nek5000 build config file
+# (c) 2008,2009,2010 UCHICAGO ARGONNE, LLC
+
+# source path 
+SOURCE_ROOT="../../src"
+
+# Fortran compiler
+F77="mpixlf77_r -qsmp=omp -qnosave"
+
+# C compiler
+CC="mpixlc_r -qsmp=omp"
+
+# pre-processor symbol list 
+# (set PPLIST=? to get a list of available symbols)
+#PPLSIT="BGQ BGP K10_MXM TIMERS MPITIMER BGQTIMER CGTTIMER NITER=20 LOG MPITHREADS XSMM MXMBASIC MKL BLAS_MXM XSMM_FIXED XSMM_DISPATCH NPOLY=8"
+PPLIST="TIMERS BGQTIMERS"
+
+
+# OPTIONAL SETTINGS
+# -----------------
+
+# enable MPI (default true)
+#IFMPI="false"
+
+# auxilliary files to compile
+# NOTE: source files have to located in the same directory as makenek
+#       a makefile_usr.inc has to be provided containing the build rules 
+#USR="foo.o"
+
+# linking flags
+#USR_LFLAGS="-L/usr/lib/ -lfoo"
+
+# generic compiler flags
+#G="-g"
+
+# optimization flags
+OPT_FLAGS_STD="-O3"
+OPT_FLAGS_MAG="-O3"
+
+###############################################################################
+# DONT'T TOUCH WHAT FOLLOWS !!!
+###############################################################################
+# assign version tag
+mver=1
+# overwrite source path with optional 2nd argument
+if [ -d $2 ] && [ $# -eq 2 ]; then
+  SOURCE_ROOT="$2"
+  echo "change source code directory to: ", $SOURCE_ROOT
+fi
+# do some checks and create makefile
+source $SOURCE_ROOT/makenek.inc
+# compile
+make -j4 -f makefile 2>&1 | tee compiler.out
+exit 0
diff --git a/test/example1/makenek-cray-knl b/test/example1/makenek-cray-knl
new file mode 100755
index 0000000..c0658f2
--- /dev/null
+++ b/test/example1/makenek-cray-knl
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Nek5000 build config file
+# (c) 2008,2009,2010 UCHICAGO ARGONNE, LLC
+
+# source path 
+SOURCE_ROOT="../../src"
+
+# Fortran compiler
+F77="ftn"
+
+# C compiler
+CC="cc"
+
+# pre-processor symbol list 
+# (set PPLIST=? to get a list of available symbols)
+#PPLSIT="BGQ BGP K10_MXM TIMERS MPITIMER BGQTIMER CGTTIMER NITER=20 LOG MPITHREADS XSMM MXMBASIC MKL BLAS_MXM XSMM_FIXED XSMM_DISPATCH NPOLY=8"
+PPLIST="TIMERS CGTIMERS"
+
+
+# OPTIONAL SETTINGS
+# -----------------
+
+# enable MPI (default true)
+#IFMPI="false"
+
+# auxilliary files to compile
+# NOTE: source files have to located in the same directory as makenek
+#       a makefile_usr.inc has to be provided containing the build rules 
+#USR="foo.o"
+
+# linking flags
+USR_LFLAGS="-qopenmp -dynamic -mcmodel=medium -shared-intel"
+
+# generic compiler flags
+#G="-g"
+
+# optimization flags
+OPT_FLAGS_STD="-qopenmp -dynamic -O3 -g -xMIC-AVX512 -mcmodel=medium -shared-intel "
+OPT_FLAGS_MAG="-qopenmp -dynamic -O3 -g -xMIC-AVX512 -mcmodel=medium -shared-intel"
+
+###############################################################################
+# DONT'T TOUCH WHAT FOLLOWS !!!
+###############################################################################
+# assign version tag
+mver=1
+# overwrite source path with optional 2nd argument
+if [ -d $2 ] && [ $# -eq 2 ]; then
+  SOURCE_ROOT="$2"
+  echo "change source code directory to: ", $SOURCE_ROOT
+fi
+# do some checks and create makefile
+source $SOURCE_ROOT/makenek.inc
+# compile
+make -j4 -f makefile 2>&1 | tee compiler.out
+exit 0
diff --git a/test/example1/makenek-intel b/test/example1/makenek-intel
new file mode 100755
index 0000000..209dfc3
--- /dev/null
+++ b/test/example1/makenek-intel
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Nek5000 build config file
+# (c) 2008,2009,2010 UCHICAGO ARGONNE, LLC
+
+# source path 
+SOURCE_ROOT="../../src"
+
+# Fortran compiler
+F77="mpiifort"
+
+# C compiler
+CC="mpiicc"
+
+# pre-processor symbol list 
+# (set PPLIST=? to get a list of available symbols)
+#PPLSIT="BGQ BGP K10_MXM TIMERS MPITIMER BGQTIMER CGTTIMER NITER=20 LOG MPITHREADS XSMM MXMBASIC MKL BLAS_MXM XSMM_FIXED XSMM_DISPATCH NPOLY=8"
+PPLIST="TIMERS CGTIMERS"
+
+
+# OPTIONAL SETTINGS
+# -----------------
+
+# enable MPI (default true)
+#IFMPI="false"
+
+# auxilliary files to compile
+# NOTE: source files have to located in the same directory as makenek
+#       a makefile_usr.inc has to be provided containing the build rules 
+#USR="foo.o"
+
+# linking flags
+USR_LFLAGS="-qopenmp -mcmodel=medium -shared-intel"
+
+# generic compiler flags
+#G="-g"
+
+# optimization flags
+OPT_FLAGS_STD="-qopenmp -O3 -g -xHost -mcmodel=medium -shared-intel"
+OPT_FLAGS_MAG="-qopenmp -O3 -g -xHost -mcmodel=medium -shared-intel"
+
+###############################################################################
+# DONT'T TOUCH WHAT FOLLOWS !!!
+###############################################################################
+# assign version tag
+mver=1
+# overwrite source path with optional 2nd argument
+if [ -d $2 ] && [ $# -eq 2 ]; then
+  SOURCE_ROOT="$2"
+  echo "change source code directory to: ", $SOURCE_ROOT
+fi
+# do some checks and create makefile
+source $SOURCE_ROOT/makenek.inc
+# compile
+make -j4 -f makefile 2>&1 | tee compiler.out
+exit 0
diff --git a/test/example1/nekpmpi b/test/example1/nekpmpi
new file mode 100755
index 0000000..1c4cc8a
--- /dev/null
+++ b/test/example1/nekpmpi
@@ -0,0 +1,4 @@
+rm -f logfile
+mv $1.log.$2 $1.log1.$2
+mpiexec -np $2 ./nekbone $1 > $1.log.$2
+ln $1.log.$2 logfile