diff --git a/Android.mk b/Android.mk
new file mode 100644
index 0000000..e9f691d
--- /dev/null
+++ b/Android.mk
@@ -0,0 +1,112 @@
+LOCAL_PATH := $(call my-dir)
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := tfm
+
+LOCAL_SRC_FILES := \
+	src/addsub/fp_add.c \
+	src/addsub/fp_add_d.c \
+	src/addsub/fp_addmod.c \
+	src/addsub/fp_cmp.c \
+	src/addsub/fp_cmp_d.c \
+	src/addsub/fp_cmp_mag.c \
+	src/addsub/fp_sub.c \
+	src/addsub/fp_sub_d.c \
+	src/addsub/fp_submod.c \
+	src/addsub/s_fp_add.c \
+	src/addsub/s_fp_sub.c \
+	src/bin/fp_radix_size.c \
+	src/bin/fp_read_radix.c \
+	src/bin/fp_read_signed_bin.c \
+	src/bin/fp_read_unsigned_bin.c \
+	src/bin/fp_reverse.c \
+	src/bin/fp_signed_bin_size.c \
+	src/bin/fp_s_rmap.c \
+	src/bin/fp_toradix.c \
+	src/bin/fp_to_signed_bin.c \
+	src/bin/fp_to_unsigned_bin.c \
+	src/bin/fp_unsigned_bin_size.c \
+	src/bit/fp_cnt_lsb.c \
+	src/bit/fp_count_bits.c \
+	src/bit/fp_div_2.c \
+	src/bit/fp_div_2d.c \
+	src/bit/fp_lshd.c \
+	src/bit/fp_mod_2d.c \
+	src/bit/fp_rshd.c \
+	src/divide/fp_div.c \
+	src/divide/fp_div_d.c \
+	src/divide/fp_mod.c \
+	src/divide/fp_mod_d.c \
+	src/exptmod/fp_2expt.c \
+	src/exptmod/fp_exptmod.c \
+	src/misc/fp_ident.c \
+	src/misc/fp_set.c \
+	src/mont/fp_montgomery_calc_normalization.c \
+	src/mont/fp_montgomery_reduce.c \
+	src/mont/fp_montgomery_setup.c \
+	src/mul/fp_mul_2.c \
+	src/mul/fp_mul_2d.c \
+	src/mul/fp_mul.c \
+	src/mul/fp_mul_comba_12.c \
+	src/mul/fp_mul_comba_17.c \
+	src/mul/fp_mul_comba_20.c \
+	src/mul/fp_mul_comba_24.c \
+	src/mul/fp_mul_comba_28.c \
+	src/mul/fp_mul_comba_32.c \
+	src/mul/fp_mul_comba_3.c \
+	src/mul/fp_mul_comba_48.c \
+	src/mul/fp_mul_comba_4.c \
+	src/mul/fp_mul_comba_64.c \
+	src/mul/fp_mul_comba_6.c \
+	src/mul/fp_mul_comba_7.c \
+	src/mul/fp_mul_comba_8.c \
+	src/mul/fp_mul_comba_9.c \
+	src/mul/fp_mul_comba.c \
+	src/mul/fp_mul_comba_small_set.c \
+	src/mul/fp_mul_d.c \
+	src/mul/fp_mulmod.c \
+	src/numtheory/fp_gcd.c \
+	src/numtheory/fp_invmod.c \
+	src/numtheory/fp_isprime.c \
+	src/numtheory/fp_lcm.c \
+	src/numtheory/fp_prime_miller_rabin.c \
+	src/numtheory/fp_prime_random_ex.c \
+	src/sqr/fp_sqr.c \
+	src/sqr/fp_sqr_comba_12.c \
+	src/sqr/fp_sqr_comba_17.c \
+	src/sqr/fp_sqr_comba_20.c \
+	src/sqr/fp_sqr_comba_24.c \
+	src/sqr/fp_sqr_comba_28.c \
+	src/sqr/fp_sqr_comba_32.c \
+	src/sqr/fp_sqr_comba_3.c \
+	src/sqr/fp_sqr_comba_48.c \
+	src/sqr/fp_sqr_comba_4.c \
+	src/sqr/fp_sqr_comba_64.c \
+	src/sqr/fp_sqr_comba_6.c \
+	src/sqr/fp_sqr_comba_7.c \
+	src/sqr/fp_sqr_comba_8.c \
+	src/sqr/fp_sqr_comba_9.c \
+	src/sqr/fp_sqr_comba.c \
+	src/sqr/fp_sqr_comba_generic.c \
+	src/sqr/fp_sqr_comba_small_set.c \
+	src/sqr/fp_sqrmod.c
+
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/src/headers
+
+LOCAL_CFLAGS += -DTFM_ARM
+
+ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
+# Possible optimizations:
+#  -ftree-vectorize: have GCC attempt to automatically vectorize loops
+#  -ftree-vectorizer-verbose=2: verbose output during compile
+# Note: not all V7-a targets support NEON!
+LOCAL_ARM_NEON := true
+LOCAL_CFLAGS += -DTFM_ARM_V7A -ftree-vectorize
+else ifeq ($(TARGET_ARCH_ABI),armeabi)
+LOCAL_CFLAGS += -DTFM_ARM_V5TE
+else
+LOCAL_CFLAGS += -DTFM_ARM_V4M
+endif
+
+include $(BUILD_STATIC_LIBRARY)
diff --git a/Android_jni/Android.mk b/Android_jni/Android.mk
new file mode 100644
index 0000000..487df6d
--- /dev/null
+++ b/Android_jni/Android.mk
@@ -0,0 +1,2 @@
+# Recursively sources all Android.mk files in subdirs:
+include $(call all-subdir-makefiles)
diff --git a/Android_jni/Application.mk b/Android_jni/Application.mk
new file mode 100644
index 0000000..d1e6201
--- /dev/null
+++ b/Android_jni/Application.mk
@@ -0,0 +1,3 @@
+APP_MODULES := tfm
+APP_OPTIM := release
+APP_ABI := armeabi armeabi-v7a
diff --git a/Android_jni/README b/Android_jni/README
new file mode 100644
index 0000000..0aa3670
--- /dev/null
+++ b/Android_jni/README
@@ -0,0 +1,36 @@
+This project provides a build framework for TomsFastMath on Android.
+
+Notes / TODO:
+
+To use the faster ARM inline assembly, TFM_ARM must be defined at compile time. Unfortunately, is
+incompatible with the armeabi build target because some of the instructions are not supported. The
+Android SDK defines a few variables which may help with detection:
+
+when compiled for armeabi-v7a:
+  __thumb2__
+  __ARM_ARCH_7A__
+when compiled for armeabi:
+  __SOFTFP__
+
+For now, the armeabi target will use the unoptimized C code.  TFM_ARM is conditionally defined
+for the armeabi-v7a target.
+
+Don't forget to change FP_MAX_SIZE in src/headers/tfm.h to suit your needs, e.g.
+
+  #define FP_MAX_SIZE           (16384+(8*DIGIT_BIT))
+
+Build instructions:
+
+(1) Clone this git repository to your project's JNI directory, naming the directory "tfm".
+(2) Modify your JNI root's Android.mk and Application.mk using the files in this directory as a model.
+(3) Run ndk-build (available from the Android NDK); see Gingerbread's NDK docs for more.
+
+Known bugs / issues:
+
+* armeabi builds are known to produce binary code that is over three times larger than armeabi-v7a.
+  This is probably due to either a bug in the Android NDK or limitations of the Thumb16 instruction set.
+
+* NEON support can't really be turned on, because not all armeabi-v7a boards support it.
+  After some trials, it appears that gcc can auto-vectorize some loops when NEON support is enabled
+  and this leads to a performance increase.  But doing so will cause the code to crash on phones
+  where NEON isn't supported...
diff --git a/src/mont/fp_montgomery_reduce.c b/src/mont/fp_montgomery_reduce.c
index b8a194f..396087a 100644
--- a/src/mont/fp_montgomery_reduce.c
+++ b/src/mont/fp_montgomery_reduce.c
@@ -283,15 +283,85 @@ asm(                                        \
 : "%eax", "%cc")
 
 /******************************************************************/
-#elif defined(TFM_ARM)
-   /* ARMv4 code */
+#elif defined(TFM_ARM_V4M)
+   /* generic ARMv4 or higher with M */
 
-#define MONT_START 
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+   mu = c[x] * mp
+
+#define INNERMUL                    \
+asm(                                \
+    " LDR    r0,%1            \n\t" \
+    " ADDS   r0,r0,%0         \n\t" \
+    " MOVCS  %0,#1            \n\t" \
+    " MOVCC  %0,#0            \n\t" \
+    " UMLAL  r0,%0,%3,%4      \n\t" \
+    " STR    r0,%1            \n\t" \
+:"=r"(cy),"=g"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","%cc");
+
+#define PROPCARRY                  \
+asm(                               \
+    " LDR   r0,%1            \n\t" \
+    " ADDS  r0,r0,%0         \n\t" \
+    " STR   r0,%1            \n\t" \
+    " MOVCS %0,#1            \n\t" \
+    " MOVCC %0,#0            \n\t" \
+:"=r"(cy),"=g"(_c[0]):"0"(cy),"1"(_c[0]):"r0","%cc");
+
+/******************************************************************/
+#elif defined(TFM_ARM_V7A)
+   /* Android: armeabi-v7a target */
+
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+   mu = c[x] * mp
+
+#define INNERMUL                    \
+asm(                                \
+    " LDR    r0,%1            \n\t" \
+    " ADDS   r0,r0,%0         \n\t" \
+    " ITE    CS               \n\t" \
+    " MOVCS  %0,#1            \n\t" \
+    " MOVCC  %0,#0            \n\t" \
+    " UMLAL  r0,%0,%3,%4      \n\t" \
+    " STR    r0,%1            \n\t" \
+:"=r"(cy),"=g"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","%cc");
+
+#define PROPCARRY                  \
+asm(                               \
+    " LDR   r0,%1            \n\t" \
+    " ADDS  r0,r0,%0         \n\t" \
+    " STR   r0,%1            \n\t" \
+    " ITE   CS               \n\t" \
+    " MOVCS %0,#1            \n\t" \
+    " MOVCC %0,#0            \n\t" \
+:"=r"(cy),"=g"(_c[0]):"0"(cy),"1"(_c[0]):"r0","%cc");
+
+/******************************************************************/
+#elif defined(TFM_ARM_V5TE)
+   /* Android: armeabi target */
+
+#define MONT_START
 #define MONT_FINI
 #define LOOP_END
 #define LOOP_START \
    mu = c[x] * mp
 
+/*
+These will need a complete rewrite for armeabi:
+  * ADDS is not supported in Thumb16 mode
+  * Thumb does not support conditional execution (MOVCS/MOVCC)
+  * armv5te+xscale does not support UMLAL
+ADDS - add and set condition flags (bottom of page 16, arm_inst.pdf)
+MOVCS / MOVCC - move if carry bit is set / clear
+UMLAL RdLo, RdHi, Rn, Rm - unsigned multiply Rn and Rm, then
+                           add resulting 64-bit value to (RdHi,RdLo)
+*/
 #define INNERMUL                    \
 asm(                                \
     " LDR    r0,%1            \n\t" \
@@ -300,7 +370,7 @@ asm(                                \
     " MOVCC  %0,#0            \n\t" \
     " UMLAL  r0,%0,%3,%4      \n\t" \
     " STR    r0,%1            \n\t" \
-:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","%cc");
+:"=r"(cy),"=g"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","%cc");
 
 #define PROPCARRY                  \
 asm(                               \
@@ -309,7 +379,7 @@ asm(                               \
     " STR   r0,%1            \n\t" \
     " MOVCS %0,#1            \n\t" \
     " MOVCC %0,#0            \n\t" \
-:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","%cc");
+:"=r"(cy),"=g"(_c[0]):"0"(cy),"1"(_c[0]):"r0","%cc");
 
 /******************************************************************/
 #elif defined(TFM_PPC32)