From 4eed9fb90e61ccb7abfdd63788d724206a02c802 Mon Sep 17 00:00:00 2001 From: Elichai Turkel Date: Wed, 24 Mar 2021 16:48:34 +0200 Subject: [PATCH 1/4] Remove ECMH --- multiset.go | 131 -------------------- multiset_test.go | 305 ----------------------------------------------- secp256k1.go | 3 +- 3 files changed, 2 insertions(+), 437 deletions(-) delete mode 100644 multiset.go delete mode 100644 multiset_test.go diff --git a/multiset.go b/multiset.go deleted file mode 100644 index 5a90d18..0000000 --- a/multiset.go +++ /dev/null @@ -1,131 +0,0 @@ -package secp256k1 - -// #include "./depend/secp256k1/include/secp256k1_multiset.h" -import "C" -import ( - "encoding/hex" - "github.com/pkg/errors" -) - -// SerializedMultiSetSize defines the length in bytes of SerializedMultiSet -const SerializedMultiSetSize = 64 - -// MultiSet is a type used to create an Elliptic Curve Multiset Hash -// which is a rolling(homomorphic) hash that you can add and remove elements from -// and receiving the same resulting hash as-if you never hashed that element. -// Because of that the order of adding and removing elements doesn't matter. -// Use NewMultiset to initialize a MultiSet, or DeserializeMultiSet to parse a MultiSet. -type MultiSet struct { - set C.secp256k1_multiset -} - -// SerializedMultiSet is a is a byte array representing the storage representation of a MultiSet -type SerializedMultiSet [SerializedMultiSetSize]byte - -// String returns the SerializedMultiSet as the hexadecimal string -func (serialized *SerializedMultiSet) String() string { - return hex.EncodeToString(serialized[:]) -} - -// String returns the MultiSet as the hexadecimal string -func (multiset *MultiSet) String() string { - return multiset.Serialize().String() -} - -// NewMultiset return an empty initialized set. -// when finalized it should be equal to a finalized set with all elements removed. -func NewMultiset() *MultiSet { - multiset := MultiSet{} - ret := C.secp256k1_multiset_init(context, &multiset.set) - if ret != 1 { - panic("failed initializing a multiset. Should never happen") - } - return &multiset -} - -// Reset clears the multiset from all data. Equivalent to creating a new empty set -func (multiset *MultiSet) Reset() { - ret := C.secp256k1_multiset_init(context, &multiset.set) - if ret != 1 { - panic("failed resetting a multiset. Should never happen") - } -} - -// Add hashes the data onto the curve and adds it to the multiset. -// Supports arbitrary length data (subject to the underlying hash function(SHA256) limits) -func (multiset *MultiSet) Add(data []byte) { - cPtrData := (*C.uchar)(&data[0]) - CLenData := (C.size_t)(len(data)) - ret := C.secp256k1_multiset_add(context, &multiset.set, cPtrData, CLenData) - if ret != 1 { - panic("failed adding to the multiset. Should never happen") - } -} - -// Remove hashes the data onto the curve and removes it from the multiset. -// Supports arbitrary length data (subject to the underlying hash function(SHA256) limits) -func (multiset *MultiSet) Remove(data []byte) { - cPtrData := (*C.uchar)(&data[0]) - CLenData := (C.size_t)(len(data)) - ret := C.secp256k1_multiset_remove(context, &multiset.set, cPtrData, CLenData) - if ret != 1 { - panic("failed removing from the multiset. Should never happen") - } -} - -// Combine will add the MultiSets together. Equivalent to manually adding all the data elements -// from one set to the other. 
-func (multiset *MultiSet) Combine(input *MultiSet) { - ret := C.secp256k1_multiset_combine(context, &multiset.set, &input.set) - if ret != 1 { - panic("failed combining 2 multisets. Should never happen") - } -} - -// Finalize will return a hash(SHA256) of the multiset. -// Because the returned value is a hash of a multiset you cannot "Un-Finalize" it. -// If this is meant for storage then Serialize should be used instead. -func (multiset *MultiSet) Finalize() *Hash { - hash := Hash{} - cPtrHash := (*C.uchar)(&hash[0]) - ret := C.secp256k1_multiset_finalize(context, cPtrHash, &multiset.set) - if ret != 1 { - panic("failed finalizing the multiset. Should never happen") - } - return &hash -} - -// Serialize returns a serialized version of the MultiSet. This is the only right way to serialize a multiset for storage. -// This multiset is not finalized, this is meant for storage. -func (multiset *MultiSet) Serialize() *SerializedMultiSet { - serialized := SerializedMultiSet{} - cPtrData := (*C.uchar)(&serialized[0]) - ret := C.secp256k1_multiset_serialize(context, cPtrData, &multiset.set) - if ret != 1 { - panic("failed serializing the multiset. Should never happen") - } - return &serialized -} - -// DeserializeMultiSet will deserialize the multiset that `Serialize()` serialized. -func DeserializeMultiSet(serialized *SerializedMultiSet) (multiset *MultiSet, err error) { - multiset = &MultiSet{} - cPtrData := (*C.uchar)(&serialized[0]) - ret := C.secp256k1_multiset_parse(context, &multiset.set, cPtrData) - if ret != 1 { - return nil, errors.New("failed parsing the multiset") - } - return -} - -// DeserializeMultiSetFromSlice returns a MultiSet type from a from a a serialized multiset slice. -// will verify that it's SerializedMultiSetSize bytes long and a valid multiset. 
-func DeserializeMultiSetFromSlice(newMultiSet []byte) (multiset *MultiSet, err error) { - if len(newMultiSet) != SerializedMultiSetSize { - return nil, errors.Errorf("invalid multiset length got %d, expected %d", len(newMultiSet), - SerializedMultiSetSize) - } - serializedMultiSet := &SerializedMultiSet{} - copy(serializedMultiSet[:], newMultiSet) - return DeserializeMultiSet(serializedMultiSet) -} diff --git a/multiset_test.go b/multiset_test.go deleted file mode 100644 index 8c0ed0c..0000000 --- a/multiset_test.go +++ /dev/null @@ -1,305 +0,0 @@ -package secp256k1 - -import ( - "encoding/hex" - "fmt" - "math/rand" - "os" - "testing" - - "github.com/pkg/errors" -) - -type testVector struct { - dataElement []byte - ecmhHash Hash - cumulativeHash Hash -} - -var testVectors []testVector - -var testVectorsStrings = []struct { - dataElementHex string - point [2]string - ecmhHash string - cumulativeHash string -}{ - { - "982051fd1e4ba744bbbe680e1fee14677ba1a3c3540bf7b1cdb606e857233e0e00000000010000000100f2052a0100000043410496b538e853519c726a2c91e61ec11600ae1390813a627c66fb8be7947be63c52da7589379515d4e0a604f8141781e62294721166bf621e73a82cbf2342c858eeac", - [2]string{"4f9a5dce69067bf28603e73a7af4c3650b16539b95bad05eee95dfc94d1efe2c", "346d5b777881f2729e7f89b2de4e8e79c7f2f42d1a0b25a8f10becb66e2d0f98"}, - "f883195933a687170c34fa1adec66fe2861889279fb12c03a3fb0ca68ad87893", - "f883195933a687170c34fa1adec66fe2861889279fb12c03a3fb0ca68ad87893", - }, - { - "d5fdcc541e25de1c7a5addedf24858b8bb665c9f36ef744ee42c316022c90f9b00000000020000000100f2052a010000004341047211a824f55b505228e4c3d5194c1fcfaa15a456abdf37f9b9d97a4040afc073dee6c89064984f03385237d92167c13e236446b417ab79a0fcae412ae3316b77ac", - [2]string{"68cf91eb2388a0287c13d46011c73fb8efb6be89c0867a47feccb2d11c390d2d", "f42ba72b1079d3d941881836f88b5dcd7c207a6a4839f129272c77ebb7194d42"}, - "ef85d123a15da95d8aff92623ad1e1c9fcda3baa801bd40bc567a83a6fdcf3e2", - "fabafd38d07370982a34547daf5b57b8a4398696d6fd2294788abda07b1faaaf", - }, - { - "44f672226090d85db9a9f2fbfe5f0f9609b387af7be5b7fbb7a1767c831c9e9900000000030000000100f2052a0100000043410494b9d3e76c5b1629ecf97fff95d7a4bbdac87cc26099ada28066c6ff1eb9191223cd897194a08d0c2726c5747f1db49e8cf90e75dc3e3550ae9b30086f3cd5aaac", - [2]string{"359c6f59859d1d5af8e7081905cb6bb734c010be8680c14b5a89ee315694fc2b", "fb6ba531d4bd83b14c970ad1bec332a8ae9a05706cd5df7fd91a2f2cc32482fe"}, - "cfadf40fc017faff5e04ccc0a2fae0fd616e4226dd7c03b1334a7a610468edff", - "1cbccda23d7ce8c5a8b008008e1738e6bf9cffb1d5b86a92a4e62b5394a636e2", - }, -} - -func TestMain(m *testing.M) { - for _, vector := range testVectorsStrings { - res := testVector{} - err := errors.New("") - res.dataElement, err = hex.DecodeString(vector.dataElementHex) - if err != nil { - panic(fmt.Sprintf("failed parsing the hex: '%s', err: '%s'", vector.dataElementHex, err)) - } - data, err := hex.DecodeString(vector.ecmhHash) - if err != nil { - panic(fmt.Sprintf("failed parsing the hex: '%s', err: '%s'", vector.ecmhHash, err)) - } - err = res.ecmhHash.SetBytes(data) - if err != nil { - panic(fmt.Sprintf("failed setting the hash: '%x', err: '%s'", data, err)) - } - data, err = hex.DecodeString(vector.cumulativeHash) - if err != nil { - panic(fmt.Sprintf("failed parsing the hex: '%s', err: '%s'", vector.cumulativeHash, err)) - } - err = res.cumulativeHash.SetBytes(data) - if err != nil { - panic(fmt.Sprintf("failed setting the hash: '%x', err: '%s'", data, err)) - } - testVectors = append(testVectors, res) - } - - os.Exit(m.Run()) -} - -func TestVectorsMultiset_Hash(t 
*testing.T) { - for _, test := range testVectors { - m := NewMultiset() - m.Add(test.dataElement) - mFinal := m.Finalize() - if !m.Finalize().IsEqual(&test.ecmhHash) { - t.Fatalf("Multiset-Hash returned incorrect hash serialization, expected: '%s', found: '%s'", mFinal, test.ecmhHash) - } - } - m := NewMultiset() - if !m.Finalize().IsEqual(&Hash{}) { - t.Fatalf("Empty set did not return zero hash, got: '%s' instead", m.Finalize()) - } -} - -func TestVectorsMultiset_AddRemove(t *testing.T) { - m := NewMultiset() - for i, test := range testVectors { - m.Add(test.dataElement) - mFinal := m.Finalize() - if !mFinal.IsEqual(&test.cumulativeHash) { - t.Fatalf("Test #%d: Multiset-Add returned incorrect hash. Expected '%s' but got '%s'", i, test.cumulativeHash, mFinal) - } - } - - for i := len(testVectors) - 1; i > 0; i-- { - m.Remove(testVectors[i].dataElement) - mFinal := m.Finalize() - if !mFinal.IsEqual(&testVectors[i-1].cumulativeHash) { - t.Fatalf("Test #%d: Multiset-Remove returned incorrect hash. Expected '%s' but got '%s'", i, testVectors[i].cumulativeHash, mFinal) - } - } -} - -func TestVectorsMultiset_CombineSubtract(t *testing.T) { - m1 := NewMultiset() - zeroHash := m1.Finalize() - - for _, test := range testVectors { - m1.Add(test.dataElement) - } - - m2 := NewMultiset() - for _, test := range testVectors { - m2.Remove(test.dataElement) - } - m1.Combine(m2) - if !m1.Finalize().IsEqual(zeroHash) { - t.Fatalf("m1 was expected to have a zero hash, but was '%s' instead", m1.Finalize()) - } -} - -func TestVectorsMultiset_Commutativity(t *testing.T) { - m := NewMultiset() - zeroHash := m.Finalize() - - // Check that if we subtract values from zero and then re-add them, we return to zero. - for _, test := range testVectors { - m.Remove(test.dataElement) - } - - for _, test := range testVectors { - m.Add(test.dataElement) - } - if !m.Finalize().IsEqual(zeroHash) { - t.Fatalf("m was expected to be zero hash, but was '%s' instead", m.Finalize()) - } - - // Here we first remove an element from an empty multiset, and then add some other - // elements, and then we create a new empty multiset, then we add the same elements - // we added to the previous multiset, and then we remove the same element we remove - // the same element we removed from the previous multiset. According to commutativity - // laws, the result should be the same. 
- removeIndex := 0 - removeData := testVectors[removeIndex].dataElement - - m1 := NewMultiset() - m1.Remove(removeData) - - for i, test := range testVectors { - if i != removeIndex { - m1.Add(test.dataElement) - } - } - - m2 := NewMultiset() - for i, test := range testVectors { - if i != removeIndex { - m2.Add(test.dataElement) - } - } - m2.Remove(removeData) - - if !m1.Finalize().IsEqual(m2.Finalize()) { - t.Fatalf("m1 and m2 was exepcted to have the same hash, but got instead m1 '%s' and m2 '%s'", m1.Finalize(), m2.Finalize()) - } -} - -func TestParseMultiSetFail(t *testing.T) { - r := rand.New(rand.NewSource(1)) - data := SerializedMultiSet{} - copy(data[:], Secp256k1Order.Bytes()) - _, err := DeserializeMultiSet(&data) - if err == nil { - t.Errorf("shouldn't be able to parse a multiset bigger with x bigger than the field size: '%s'", err) - } - data = [64]byte{} - copy(data[32:], Secp256k1Order.Bytes()) - _, err = DeserializeMultiSet(&data) - if err == nil { - t.Errorf("shouldn't be able to parse a multiset bigger with y bigger than the field size: '%s'", err) - } - set := NewMultiset() - n, err := r.Read(data[:]) - if err != nil || n != len(data) { - t.Fatalf("failed generating random data '%s' '%d' ", err, n) - } - set.Add(data[:]) - -} - -func TestMultiSet_Reset(t *testing.T) { - r := rand.New(rand.NewSource(1)) - set := NewMultiset() - emptySet := NewMultiset() - data := [100]byte{} - n, err := r.Read(data[:]) - if err != nil || n != len(data) { - t.Fatalf("failed generating random data '%s' '%d' ", err, n) - } - set.Add(data[:]) - if set.Finalize().IsEqual(emptySet.Finalize()) { - t.Errorf("expected set to be empty. found: '%s'", set.Finalize()) - } - set.Reset() - if !set.Finalize().IsEqual(emptySet.Finalize()) { - t.Errorf("expected set to be empty. found: '%s'", set.Finalize()) - } -} - -func TestMultiSetAddRemove(t *testing.T) { - r := rand.New(rand.NewSource(1)) - list := [loopsN][100]byte{} - set := NewMultiset() - set2 := *set - serializedEmpty := *set.Serialize() - for i := 0; i < loopsN; i++ { - data := [100]byte{} - n, err := r.Read(data[:]) - if err != nil || n != len(data) { - t.Fatalf("Failed generating random data. read: '%d' bytes. 
.'%s'", n, err) - } - set.Add(data[:]) - list[i] = data - } - if set.Finalize().IsEqual(set2.Finalize()) { - t.Errorf("sets are the same when they should be different: set '%s'\n", set.Finalize()) - } - - for i := 0; i < loopsN; i++ { - set.Remove(list[i][:]) - } - if !set.Finalize().IsEqual(set2.Finalize()) { - t.Errorf("sets are different when they should be the same: set1: '%s', set2: '%s'\n", set.Finalize(), set2.Finalize()) - } - if *set.Serialize() != serializedEmpty { - t.Errorf("serialized sets are different when they should be the same: set1: '%s', set2: '%s'\n", set.Serialize(), serializedEmpty) - } - parsedSet, err := DeserializeMultiSet(&serializedEmpty) - if err != nil { - t.Errorf("error: '%s' happened when parsing: '%s'", err, serializedEmpty) - } else if !parsedSet.Finalize().IsEqual(set.Finalize()) { - t.Errorf("sets are different when they should be the same: set1: '%s', parsedSet: '%s'\n", set.Finalize(), parsedSet.Finalize()) - } -} - -func BenchmarkMultiSet_Add(b *testing.B) { - set := NewMultiset() - var data [100]byte - for i := range data { - data[i] = 0xFF - } - b.ReportAllocs() - b.ResetTimer() - for i := 0; i < b.N; i++ { - set.Add(data[:]) - } -} - -func BenchmarkMultiSet_Remove(b *testing.B) { - set := NewMultiset() - var data [100]byte - for i := range data { - data[i] = 0xFF - } - b.ReportAllocs() - b.ResetTimer() - for i := 0; i < b.N; i++ { - set.Remove(data[:]) - } -} - -func BenchmarkMultiSet_CombineRand(b *testing.B) { - r := rand.New(rand.NewSource(0)) - set := NewMultiset() - set2 := NewMultiset() - var element [32]byte - r.Read(element[:]) - set2.Add(element[:]) - b.ReportAllocs() - b.ResetTimer() - for i := 0; i < b.N; i++ { - set.Combine(set2) - } -} - -func BenchmarkMultiSet_Finalize(b *testing.B) { - r := rand.New(rand.NewSource(0)) - set := NewMultiset() - var element [32]byte - r.Read(element[:]) - set.Add(element[:]) - b.ReportAllocs() - b.ResetTimer() - for i := 0; i < b.N; i++ { - clonedSet := *set - clonedSet.Finalize() - } -} diff --git a/secp256k1.go b/secp256k1.go index a765f71..9185eb8 100644 --- a/secp256k1.go +++ b/secp256k1.go @@ -3,7 +3,7 @@ package secp256k1 // // **This is CGO's build system. CGO parses the following comments as build instructions.** // // Including the headers and code, and defining the default macros // #cgo CFLAGS: -I./depend/secp256k1 -I./depend/secp256k1/src/ -// #cgo CFLAGS: -DSECP256K1_BUILD=1 -DECMULT_WINDOW_SIZE=15 -DENABLE_MODULE_SCHNORRSIG=1 -DENABLE_MODULE_EXTRAKEYS=1 -DENABLE_MODULE_MULTISET=1 +// #cgo CFLAGS: -DSECP256K1_BUILD=1 -DECMULT_WINDOW_SIZE=15 -DENABLE_MODULE_SCHNORRSIG=1 -DENABLE_MODULE_EXTRAKEYS=1 // // Consider using libgmp. these macros are set to use the slower in-project implementation of nums // #cgo CFLAGS: -DUSE_NUM_NONE=1 -DUSE_FIELD_INV_BUILTIN=1 -DUSE_SCALAR_INV_BUILTIN=1 -DECMULT_GEN_PREC_BITS=4 // // x86_64 can use the Assembly implementation. 
@@ -15,6 +15,7 @@ import "C" import ( "crypto/rand" "encoding/hex" + "github.com/pkg/errors" ) From 8ccb0e2b77d1a05a05d657f85bbaf735e4d2f1c4 Mon Sep 17 00:00:00 2001 From: Elichai Turkel Date: Wed, 24 Mar 2021 16:49:00 +0200 Subject: [PATCH 2/4] Change upstream to bitcoin-core/secp256k1 --- depend/update-subtree.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/depend/update-subtree.sh b/depend/update-subtree.sh index 76d33d0..fe1fdb9 100755 --- a/depend/update-subtree.sh +++ b/depend/update-subtree.sh @@ -1,4 +1,4 @@ #!/bin/bash # This command pulls the latest updates from the master branch of the subtree and commits them in depend/secp256k1. # See man git-subtree(1) for more details. -git subtree pull --prefix depend/secp256k1 https://github.com/kaspanet/secp256k1.git master --squash +git subtree pull --prefix depend/secp256k1 https://github.com/bitcoin-core/secp256k1.git master --squash From 322ef0fc26a287e3ad0b686b709fa740aad7608a Mon Sep 17 00:00:00 2001 From: Elichai Turkel Date: Wed, 24 Mar 2021 16:49:38 +0200 Subject: [PATCH 3/4] Squashed 'depend/secp256k1/' changes from 9fd06254..26de4dfe 26de4dfe Merge #831: Safegcd inverses, drop Jacobi symbols, remove libgmp 24ad04fc Make scalar_inverse{,_var} benchmark scale with SECP256K1_BENCH_ITERS ebc1af70 Optimization: track f,g limb count and pass to new variable-time update_fg_var b306935a Optimization: use formulas instead of lookup tables for cancelling g bits 9164a1b6 Optimization: special-case zero modulus limbs in modinv64 1f233b3f Remove num/gmp support 20448b8d Remove unused Jacobi symbol support 5437e7bd Remove unused scalar_sqr aa9cc521 Improve field/scalar inverse tests 1e0e885c Make field/scalar code use the new modinv modules for inverses 436281af Move secp256k1_fe_inverse{_var} to per-impl files aa404d53 Move secp256k1_scalar_{inverse{_var},is_even} to per-impl files 08d54964 Improve bounds checks in modinv modules 151aac00 Add tests for modinv modules d8a92fcc Add extensive comments on the safegcd algorithm and implementation 8e415acb Add safegcd based modular inverse modules de0a643c Add secp256k1_ctz{32,64}_var functions REVERT: 9fd06254 Merge pull request #5 from bitcoin-core/master REVERT: 143ecc6f Fix multiset benchmarks REVERT: d66ad94e Merge pull request #3 from kaspanet/new-schnorr REVERT: 2a29b5c6 Merge remote-tracking branch 'upstream/master' into new-schnorr REVERT: f09320ed Revert "Add matching Schnorr implementation " REVERT: ee3ab072 Add matching Schnorr implementation REVERT: 087f4bba Fix missing MIT license attribute in oldschnorr impl https://github.com/bitcoin-core/secp256k1/pull/425 REVERT: 054ade68 Add multiset and oldschnorr to travis REVERT: 2b47e2cf Rename schnorr to oldschnorr and add fixes to build system REVERT: 604052f4 [secp256k1] Implement Schnorr signatures REVERT: 1f46fcb2 Add Multiset/ECMH implementation REVERT: 225587b4 Fix UB(violating alignment rules) in multiset tests REVERT: b0e16b52 Fix docs and small code problems in multiset REVERT: 582b1256 Add multiset serialize/parse functions REVERT: d6dd4762 Fix some build configurations for multiset module REVERT: 41145690 Add ECMH multiset module to libsecp256k1 git-subtree-dir: depend/secp256k1 git-subtree-split: 26de4dfeb1f1436dae1fcf17f57bdaa43540f940 --- .cirrus.yml | 17 +- .gitignore | 1 - Makefile.am | 12 +- README.md | 2 +- build-aux/m4/bitcoin_secp.m4 | 13 - ci/cirrus.sh | 2 +- ci/linux-debian.Dockerfile | 4 +- configure.ac | 74 -- doc/safegcd_implementation.md | 750 ++++++++++++++++ include/secp256k1_multiset.h | 
136 --- src/basic-config.h | 9 - src/bench_ecmult.c | 1 - src/bench_internal.c | 63 +- src/bench_multiset.c | 62 -- src/ecmult.h | 1 - src/field.h | 3 - src/field_10x26_impl.h | 89 ++ src/field_5x52_impl.h | 77 ++ src/field_impl.h | 153 ---- src/group.h | 10 - src/group_impl.h | 23 +- src/modinv32.h | 42 + src/modinv32_impl.h | 587 +++++++++++++ src/modinv64.h | 46 + src/modinv64_impl.h | 589 +++++++++++++ src/modules/multiset/Makefile.am.include | 8 - src/modules/multiset/README.md | 81 -- src/modules/multiset/main_impl.h | 241 ----- src/modules/multiset/tests_impl.h | 315 ------- src/num.h | 74 -- src/num_gmp.h | 20 - src/num_gmp_impl.h | 288 ------ src/num_impl.h | 24 - src/scalar.h | 12 - src/scalar_4x64_impl.h | 242 ++--- src/scalar_8x32_impl.h | 179 ++-- src/scalar_impl.h | 219 ----- src/scalar_low_impl.h | 19 +- src/secp256k1.c | 5 - src/tests.c | 1019 +++++++++++++--------- src/util.h | 65 ++ 41 files changed, 3049 insertions(+), 2528 deletions(-) create mode 100644 doc/safegcd_implementation.md delete mode 100644 include/secp256k1_multiset.h delete mode 100644 src/bench_multiset.c create mode 100644 src/modinv32.h create mode 100644 src/modinv32_impl.h create mode 100644 src/modinv64.h create mode 100644 src/modinv64_impl.h delete mode 100644 src/modules/multiset/Makefile.am.include delete mode 100644 src/modules/multiset/README.md delete mode 100644 src/modules/multiset/main_impl.h delete mode 100644 src/modules/multiset/tests_impl.h delete mode 100644 src/num.h delete mode 100644 src/num_gmp.h delete mode 100644 src/num_gmp_impl.h delete mode 100644 src/num_impl.h diff --git a/.cirrus.yml b/.cirrus.yml index 9399fbd..506a860 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -1,6 +1,5 @@ env: WIDEMUL: auto - BIGNUM: auto STATICPRECOMPUTATION: yes ECMULTGENPRECISION: auto ASM: no @@ -59,9 +58,8 @@ task: - env: {WIDEMUL: int128, RECOVERY: yes, EXPERIMENTAL: yes, SCHNORRSIG: yes} - env: {WIDEMUL: int128, ECDH: yes, EXPERIMENTAL: yes, SCHNORRSIG: yes} - env: {WIDEMUL: int128, ASM: x86_64} - - env: {BIGNUM: no} - - env: {BIGNUM: no, RECOVERY: yes, EXPERIMENTAL: yes, SCHNORRSIG: yes} - - env: {BIGNUM: no, STATICPRECOMPUTATION: no} + - env: { RECOVERY: yes, EXPERIMENTAL: yes, SCHNORRSIG: yes} + - env: { STATICPRECOMPUTATION: no} - env: {BUILD: distcheck, WITH_VALGRIND: no, CTIMETEST: no, BENCH: no} - env: {CPPFLAGS: -DDETERMINISTIC} - env: {CFLAGS: -O0, CTIMETEST: no} @@ -69,7 +67,6 @@ task: CFLAGS: "-fsanitize=undefined -fno-omit-frame-pointer" LDFLAGS: "-fsanitize=undefined -fno-omit-frame-pointer" UBSAN_OPTIONS: "print_stacktrace=1:halt_on_error=1" - BIGNUM: no ASM: x86_64 ECDH: yes RECOVERY: yes @@ -80,7 +77,6 @@ task: - env: { ECMULTGENPRECISION: 8 } - env: RUN_VALGRIND: yes - BIGNUM: no ASM: x86_64 ECDH: yes RECOVERY: yes @@ -115,12 +111,6 @@ task: CC: i686-linux-gnu-gcc - env: CC: clang --target=i686-pc-linux-gnu -isystem /usr/i686-linux-gnu/include - matrix: - - env: - BIGNUM: gmp - - env: - BIGNUM: no - << : *MERGE_BASE test_script: - ./ci/cirrus.sh << : *CAT_LOGS @@ -178,7 +168,7 @@ task: # If we haven't restored from cached (and just run brew install), this is a no-op. 
- brew link valgrind brew_script: - - brew install automake libtool gmp gcc@9 + - brew install automake libtool gcc@9 << : *MERGE_BASE test_script: - ./ci/cirrus.sh @@ -195,7 +185,6 @@ task: HOST: s390x-linux-gnu BUILD: WITH_VALGRIND: no - BIGNUM: no ECDH: yes RECOVERY: yes EXPERIMENTAL: yes diff --git a/.gitignore b/.gitignore index d081f73..ccdef02 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,6 @@ bench_sign bench_verify bench_recover bench_internal -bench_multiset tests exhaustive_tests gen_context diff --git a/Makefile.am b/Makefile.am index 4c7b66b..58c9635 100644 --- a/Makefile.am +++ b/Makefile.am @@ -14,8 +14,6 @@ noinst_HEADERS += src/scalar_8x32_impl.h noinst_HEADERS += src/scalar_low_impl.h noinst_HEADERS += src/group.h noinst_HEADERS += src/group_impl.h -noinst_HEADERS += src/num_gmp.h -noinst_HEADERS += src/num_gmp_impl.h noinst_HEADERS += src/ecdsa.h noinst_HEADERS += src/ecdsa_impl.h noinst_HEADERS += src/eckey.h @@ -26,14 +24,16 @@ noinst_HEADERS += src/ecmult_const.h noinst_HEADERS += src/ecmult_const_impl.h noinst_HEADERS += src/ecmult_gen.h noinst_HEADERS += src/ecmult_gen_impl.h -noinst_HEADERS += src/num.h -noinst_HEADERS += src/num_impl.h noinst_HEADERS += src/field_10x26.h noinst_HEADERS += src/field_10x26_impl.h noinst_HEADERS += src/field_5x52.h noinst_HEADERS += src/field_5x52_impl.h noinst_HEADERS += src/field_5x52_int128_impl.h noinst_HEADERS += src/field_5x52_asm_impl.h +noinst_HEADERS += src/modinv32.h +noinst_HEADERS += src/modinv32_impl.h +noinst_HEADERS += src/modinv64.h +noinst_HEADERS += src/modinv64_impl.h noinst_HEADERS += src/assumptions.h noinst_HEADERS += src/util.h noinst_HEADERS += src/scratch.h @@ -151,10 +151,6 @@ if ENABLE_MODULE_ECDH include src/modules/ecdh/Makefile.am.include endif -if ENABLE_MODULE_MULTISET -include src/modules/multiset/Makefile.am.include -endif - if ENABLE_MODULE_RECOVERY include src/modules/recovery/Makefile.am.include endif diff --git a/README.md b/README.md index 9918678..197a56f 100644 --- a/README.md +++ b/README.md @@ -34,11 +34,11 @@ Implementation details * Optimized implementation of arithmetic modulo the curve's field size (2^256 - 0x1000003D1). * Using 5 52-bit limbs (including hand-optimized assembly for x86_64, by Diederik Huys). * Using 10 26-bit limbs (including hand-optimized assembly for 32-bit ARM, by Wladimir J. van der Laan). - * Field inverses and square roots using a sliding window over blocks of 1s (by Peter Dettman). * Scalar operations * Optimized implementation without data-dependent branches of arithmetic modulo the curve's order. * Using 4 64-bit limbs (relying on __int128 support in the compiler). * Using 8 32-bit limbs. +* Modular inverses (both field elements and scalars) based on [safegcd](https://gcd.cr.yp.to/index.html) with some modifications, and a variable-time variant (by Peter Dettman). * Group operations * Point addition formula specifically simplified for the curve equation (y^2 = x^3 + 7). * Use addition between points in Jacobian and affine coordinates where possible. 
diff --git a/build-aux/m4/bitcoin_secp.m4 b/build-aux/m4/bitcoin_secp.m4 index 7b48a5e..e57888c 100644 --- a/build-aux/m4/bitcoin_secp.m4 +++ b/build-aux/m4/bitcoin_secp.m4 @@ -75,19 +75,6 @@ if test x"$has_libcrypto" = x"yes" && test x"$has_openssl_ec" = x; then fi ]) -dnl -AC_DEFUN([SECP_GMP_CHECK],[ -if test x"$has_gmp" != x"yes"; then - CPPFLAGS_TEMP="$CPPFLAGS" - CPPFLAGS="$GMP_CPPFLAGS $CPPFLAGS" - LIBS_TEMP="$LIBS" - LIBS="$GMP_LIBS $LIBS" - AC_CHECK_HEADER(gmp.h,[AC_CHECK_LIB(gmp, __gmpz_init,[has_gmp=yes; GMP_LIBS="$GMP_LIBS -lgmp"; AC_DEFINE(HAVE_LIBGMP,1,[Define this symbol if libgmp is installed])])]) - CPPFLAGS="$CPPFLAGS_TEMP" - LIBS="$LIBS_TEMP" -fi -]) - AC_DEFUN([SECP_VALGRIND_CHECK],[ if test x"$has_valgrind" != x"yes"; then CPPFLAGS_TEMP="$CPPFLAGS" diff --git a/ci/cirrus.sh b/ci/cirrus.sh index f223a91..f26ca98 100755 --- a/ci/cirrus.sh +++ b/ci/cirrus.sh @@ -14,7 +14,7 @@ valgrind --version || true ./configure \ --enable-experimental="$EXPERIMENTAL" \ - --with-test-override-wide-multiply="$WIDEMUL" --with-bignum="$BIGNUM" --with-asm="$ASM" \ + --with-test-override-wide-multiply="$WIDEMUL" --with-asm="$ASM" \ --enable-ecmult-static-precomputation="$STATICPRECOMPUTATION" --with-ecmult-gen-precision="$ECMULTGENPRECISION" \ --enable-module-ecdh="$ECDH" --enable-module-recovery="$RECOVERY" \ --enable-module-schnorrsig="$SCHNORRSIG" \ diff --git a/ci/linux-debian.Dockerfile b/ci/linux-debian.Dockerfile index 201ace4..5967cf8 100644 --- a/ci/linux-debian.Dockerfile +++ b/ci/linux-debian.Dockerfile @@ -8,6 +8,6 @@ RUN apt-get update RUN apt-get install --no-install-recommends --no-upgrade -y \ git ca-certificates \ make automake libtool pkg-config dpkg-dev valgrind qemu-user \ - gcc clang libc6-dbg libgmp-dev \ - gcc-i686-linux-gnu libc6-dev-i386-cross libc6-dbg:i386 libgmp-dev:i386 \ + gcc clang libc6-dbg \ + gcc-i686-linux-gnu libc6-dev-i386-cross libc6-dbg:i386 \ gcc-s390x-linux-gnu libc6-dev-s390x-cross libc6-dbg:s390x diff --git a/configure.ac b/configure.ac index 644f902..e84005e 100644 --- a/configure.ac +++ b/configure.ac @@ -48,17 +48,12 @@ case $host_os in # in expected paths because they may conflict with system files. Ask # Homebrew where each one is located, then adjust paths accordingly. openssl_prefix=`$BREW --prefix openssl 2>/dev/null` - gmp_prefix=`$BREW --prefix gmp 2>/dev/null` valgrind_prefix=`$BREW --prefix valgrind 2>/dev/null` if test x$openssl_prefix != x; then PKG_CONFIG_PATH="$openssl_prefix/lib/pkgconfig:$PKG_CONFIG_PATH" export PKG_CONFIG_PATH CRYPTO_CPPFLAGS="-I$openssl_prefix/include" fi - if test x$gmp_prefix != x; then - GMP_CPPFLAGS="-I$gmp_prefix/include" - GMP_LIBS="-L$gmp_prefix/lib" - fi if test x$valgrind_prefix != x; then VALGRIND_CPPFLAGS="-I$valgrind_prefix/include" fi @@ -140,11 +135,6 @@ AC_ARG_ENABLE(module_ecdh, [enable_module_ecdh=$enableval], [enable_module_ecdh=no]) -AC_ARG_ENABLE(module_multiset, - AS_HELP_STRING([--enable-module-multiset],[enable multiset operations (experimental)]), - [enable_module_multiset=$enableval], - [enable_module_multiset=no]) - AC_ARG_ENABLE(module_recovery, AS_HELP_STRING([--enable-module-recovery],[enable ECDSA pubkey recovery module [default=no]]), [enable_module_recovery=$enableval], @@ -169,9 +159,6 @@ AC_ARG_ENABLE(external_default_callbacks, # Legal values are int64 (for [u]int64_t), int128 (for [unsigned] __int128), and auto (the default). 
AC_ARG_WITH([test-override-wide-multiply], [] ,[set_widemul=$withval], [set_widemul=auto]) -AC_ARG_WITH([bignum], [AS_HELP_STRING([--with-bignum=gmp|no|auto], -[bignum implementation to use [default=auto]])],[req_bignum=$withval], [req_bignum=auto]) - AC_ARG_WITH([asm], [AS_HELP_STRING([--with-asm=x86_64|arm|no|auto], [assembly optimizations to useĀ (experimental: arm) [default=auto]])],[req_asm=$withval], [req_asm=auto]) @@ -250,32 +237,6 @@ else esac fi -if test x"$req_bignum" = x"auto"; then - SECP_GMP_CHECK - if test x"$has_gmp" = x"yes"; then - set_bignum=gmp - fi - - if test x"$set_bignum" = x; then - set_bignum=no - fi -else - set_bignum=$req_bignum - case $set_bignum in - gmp) - SECP_GMP_CHECK - if test x"$has_gmp" != x"yes"; then - AC_MSG_ERROR([gmp bignum explicitly requested but libgmp not available]) - fi - ;; - no) - ;; - *) - AC_MSG_ERROR([invalid bignum implementation selection]) - ;; - esac -fi - # Select assembly optimization use_external_asm=no @@ -313,24 +274,6 @@ auto) ;; esac -# Select bignum implementation -case $set_bignum in -gmp) - AC_DEFINE(HAVE_LIBGMP, 1, [Define this symbol if libgmp is installed]) - AC_DEFINE(USE_NUM_GMP, 1, [Define this symbol to use the gmp implementation for num]) - AC_DEFINE(USE_FIELD_INV_NUM, 1, [Define this symbol to use the num-based field inverse implementation]) - AC_DEFINE(USE_SCALAR_INV_NUM, 1, [Define this symbol to use the num-based scalar inverse implementation]) - ;; -no) - AC_DEFINE(USE_NUM_NONE, 1, [Define this symbol to use no num implementation]) - AC_DEFINE(USE_FIELD_INV_BUILTIN, 1, [Define this symbol to use the native field inverse implementation]) - AC_DEFINE(USE_SCALAR_INV_BUILTIN, 1, [Define this symbol to use the native scalar inverse implementation]) - ;; -*) - AC_MSG_ERROR([invalid bignum implementation]) - ;; -esac - # Set ecmult window size if test x"$req_ecmult_window" = x"auto"; then set_ecmult_window=15 @@ -395,11 +338,6 @@ else enable_openssl_tests=no fi -if test x"$set_bignum" = x"gmp"; then - SECP_LIBS="$SECP_LIBS $GMP_LIBS" - SECP_INCLUDES="$SECP_INCLUDES $GMP_CPPFLAGS" -fi - if test x"$enable_valgrind" = x"yes"; then SECP_INCLUDES="$SECP_INCLUDES $VALGRIND_CPPFLAGS" fi @@ -491,10 +429,6 @@ if test x"$enable_module_ecdh" = x"yes"; then AC_DEFINE(ENABLE_MODULE_ECDH, 1, [Define this symbol to enable the ECDH module]) fi -if test x"$enable_module_multiset" = x"yes"; then - AC_DEFINE(ENABLE_MODULE_MULTISET, 1, [Define this symbol to enable the multiset module]) -fi - if test x"$enable_module_recovery" = x"yes"; then AC_DEFINE(ENABLE_MODULE_RECOVERY, 1, [Define this symbol to enable the ECDSA pubkey recovery module]) fi @@ -522,8 +456,6 @@ if test x"$enable_experimental" = x"yes"; then AC_MSG_NOTICE([******]) AC_MSG_NOTICE([WARNING: experimental build]) AC_MSG_NOTICE([Experimental features do not have stable APIs or properties, and may not be safe for production use.]) - AC_MSG_NOTICE([Building ECDH module: $enable_module_ecdh]) - AC_MSG_NOTICE([Building multiset module: $enable_module_multiset]) AC_MSG_NOTICE([Building extrakeys module: $enable_module_extrakeys]) AC_MSG_NOTICE([Building schnorrsig module: $enable_module_schnorrsig]) AC_MSG_NOTICE([******]) @@ -534,9 +466,6 @@ else if test x"$enable_module_schnorrsig" = x"yes"; then AC_MSG_ERROR([schnorrsig module is experimental. Use --enable-experimental to allow.]) fi - if test x"$enable_module_multiset" = x"yes"; then - AC_MSG_ERROR([Multiset module is experimental. 
Use --enable-experimental to allow.]) - fi if test x"$set_asm" = x"arm"; then AC_MSG_ERROR([ARM assembly optimization is experimental. Use --enable-experimental to allow.]) fi @@ -558,7 +487,6 @@ AM_CONDITIONAL([USE_EXHAUSTIVE_TESTS], [test x"$use_exhaustive_tests" != x"no"]) AM_CONDITIONAL([USE_BENCHMARK], [test x"$use_benchmark" = x"yes"]) AM_CONDITIONAL([USE_ECMULT_STATIC_PRECOMPUTATION], [test x"$set_precomp" = x"yes"]) AM_CONDITIONAL([ENABLE_MODULE_ECDH], [test x"$enable_module_ecdh" = x"yes"]) -AM_CONDITIONAL([ENABLE_MODULE_MULTISET], [test x"$enable_module_multiset" = x"yes"]) AM_CONDITIONAL([ENABLE_MODULE_RECOVERY], [test x"$enable_module_recovery" = x"yes"]) AM_CONDITIONAL([ENABLE_MODULE_EXTRAKEYS], [test x"$enable_module_extrakeys" = x"yes"]) AM_CONDITIONAL([ENABLE_MODULE_SCHNORRSIG], [test x"$enable_module_schnorrsig" = x"yes"]) @@ -581,13 +509,11 @@ echo " with tests = $use_tests" echo " with openssl tests = $enable_openssl_tests" echo " with coverage = $enable_coverage" echo " module ecdh = $enable_module_ecdh" -echo " module multiset = $enable_module_multiset" echo " module recovery = $enable_module_recovery" echo " module extrakeys = $enable_module_extrakeys" echo " module schnorrsig = $enable_module_schnorrsig" echo echo " asm = $set_asm" -echo " bignum = $set_bignum" echo " ecmult window size = $set_ecmult_window" echo " ecmult gen prec. bits = $set_ecmult_gen_precision" # Hide test-only options unless they're used. diff --git a/doc/safegcd_implementation.md b/doc/safegcd_implementation.md new file mode 100644 index 0000000..8346d22 --- /dev/null +++ b/doc/safegcd_implementation.md @@ -0,0 +1,750 @@ +# The safegcd implementation in libsecp256k1 explained + +This document explains the modular inverse implementation in the `src/modinv*.h` files. It is based +on the paper +["Fast constant-time gcd computation and modular inversion"](https://gcd.cr.yp.to/papers.html#safegcd) +by Daniel J. Bernstein and Bo-Yin Yang. The references below are for the Date: 2019.04.13 version. + +The actual implementation is in C of course, but for demonstration purposes Python3 is used here. +Most implementation aspects and optimizations are explained, except those that depend on the specific +number representation used in the C code. + +## 1. Computing the Greatest Common Divisor (GCD) using divsteps + +The algorithm from the paper (section 11), at a very high level, is this: + +```python +def gcd(f, g): + """Compute the GCD of an odd integer f and another integer g.""" + assert f & 1 # require f to be odd + delta = 1 # additional state variable + while g != 0: + assert f & 1 # f will be odd in every iteration + if delta > 0 and g & 1: + delta, f, g = 1 - delta, g, (g - f) // 2 + elif g & 1: + delta, f, g = 1 + delta, f, (g + f) // 2 + else: + delta, f, g = 1 + delta, f, (g ) // 2 + return abs(f) +``` + +It computes the greatest common divisor of an odd integer *f* and any integer *g*. Its inner loop +keeps rewriting the variables *f* and *g* alongside a state variable *δ* that starts at *1*, until +*g=0* is reached. At that point, *|f|* gives the GCD. Each of the transitions in the loop is called a +"division step" (referred to as divstep in what follows). + +For example, *gcd(21, 14)* would be computed as: +- Start with *δ=1 f=21 g=14* +- Take the third branch: *δ=2 f=21 g=7* +- Take the first branch: *δ=-1 f=7 g=-7* +- Take the second branch: *δ=0 f=7 g=0* +- The answer *|f| = 7*. 
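+
+For concreteness, this trace can be reproduced by instrumenting the `gcd` function above. The small sketch below (the `gcd_trace` name is introduced here purely for illustration) prints every state transition and checks the final answer:
+
+```python
+def gcd_trace(f, g):
+    """Same divstep loop as gcd() above, but printing each intermediate state."""
+    assert f & 1  # require f to be odd
+    delta = 1
+    print(f"start: delta={delta} f={f} g={g}")
+    while g != 0:
+        if delta > 0 and g & 1:
+            delta, f, g = 1 - delta, g, (g - f) // 2
+        elif g & 1:
+            delta, f, g = 1 + delta, f, (g + f) // 2
+        else:
+            delta, f, g = 1 + delta, f, (g    ) // 2
+        print(f"step:  delta={delta} f={f} g={g}")
+    return abs(f)
+
+# Prints the four states listed above and returns 7.
+assert gcd_trace(21, 14) == 7
+```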
+ +Why it works: +- Divsteps can be decomposed into two steps (see paragraph 8.2 in the paper): + - (a) If *g* is odd, replace *(f,g)* with *(g,g-f)* or (f,g+f), resulting in an even *g*. + - (b) Replace *(f,g)* with *(f,g/2)* (where *g* is guaranteed to be even). +- Neither of those two operations change the GCD: + - For (a), assume *gcd(f,g)=c*, then it must be the case that *f=a c* and *g=b c* for some integers *a* + and *b*. As *(g,g-f)=(b c,(b-a)c)* and *(f,f+g)=(a c,(a+b)c)*, the result clearly still has + common factor *c*. Reasoning in the other direction shows that no common factor can be added by + doing so either. + - For (b), we know that *f* is odd, so *gcd(f,g)* clearly has no factor *2*, and we can remove + it from *g*. +- The algorithm will eventually converge to *g=0*. This is proven in the paper (see theorem G.3). +- It follows that eventually we find a final value *f'* for which *gcd(f,g) = gcd(f',0)*. As the + gcd of *f'* and *0* is *|f'|* by definition, that is our answer. + +Compared to more [traditional GCD algorithms](https://en.wikipedia.org/wiki/Euclidean_algorithm), this one has the property of only ever looking at +the low-order bits of the variables to decide the next steps, and being easy to make +constant-time (in more low-level languages than Python). The *δ* parameter is necessary to +guide the algorithm towards shrinking the numbers' magnitudes without explicitly needing to look +at high order bits. + +Properties that will become important later: +- Performing more divsteps than needed is not a problem, as *f* does not change anymore after *g=0*. +- Only even numbers are divided by *2*. This means that when reasoning about it algebraically we + do not need to worry about rounding. +- At every point during the algorithm's execution the next *N* steps only depend on the bottom *N* + bits of *f* and *g*, and on *δ*. + + +## 2. From GCDs to modular inverses + +We want an algorithm to compute the inverse *a* of *x* modulo *M*, i.e. the number a such that *a x=1 +mod M*. This inverse only exists if the GCD of *x* and *M* is *1*, but that is always the case if *M* is +prime and *0 < x < M*. In what follows, assume that the modular inverse exists. +It turns out this inverse can be computed as a side effect of computing the GCD by keeping track +of how the internal variables can be written as linear combinations of the inputs at every step +(see the [extended Euclidean algorithm](https://en.wikipedia.org/wiki/Extended_Euclidean_algorithm)). +Since the GCD is *1*, such an algorithm will compute numbers *a* and *b* such that a x + b M = 1*. +Taking that expression *mod M* gives *a x mod M = 1*, and we see that *a* is the modular inverse of *x +mod M*. + +A similar approach can be used to calculate modular inverses using the divsteps-based GCD +algorithm shown above, if the modulus *M* is odd. To do so, compute *gcd(f=M,g=x)*, while keeping +track of extra variables *d* and *e*, for which at every step *d = f/x (mod M)* and *e = g/x (mod M)*. +*f/x* here means the number which multiplied with *x* gives *f mod M*. As *f* and *g* are initialized to *M* +and *x* respectively, *d* and *e* just start off being *0* (*M/x mod M = 0/x mod M = 0*) and *1* (*x/x mod M += 1*). + +```python +def div2(M, x): + """Helper routine to compute x/2 mod M (where M is odd).""" + assert M & 1 + if x & 1: # If x is odd, make it even by adding M. + x += M + # x must be even now, so a clean division by 2 is possible. 
+ return x // 2 + +def modinv(M, x): + """Compute the inverse of x mod M (given that it exists, and M is odd).""" + assert M & 1 + delta, f, g, d, e = 1, M, x, 0, 1 + while g != 0: + # Note that while division by two for f and g is only ever done on even inputs, this is + # not true for d and e, so we need the div2 helper function. + if delta > 0 and g & 1: + delta, f, g, d, e = 1 - delta, g, (g - f) // 2, e, div2(M, e - d) + elif g & 1: + delta, f, g, d, e = 1 + delta, f, (g + f) // 2, d, div2(M, e + d) + else: + delta, f, g, d, e = 1 + delta, f, (g ) // 2, d, div2(M, e ) + # Verify that the invariants d=f/x mod M, e=g/x mod M are maintained. + assert f % M == (d * x) % M + assert g % M == (e * x) % M + assert f == 1 or f == -1 # |f| is the GCD, it must be 1 + # Because of invariant d = f/x (mod M), 1/x = d/f (mod M). As |f|=1, d/f = d*f. + return (d * f) % M +``` + +Also note that this approach to track *d* and *e* throughout the computation to determine the inverse +is different from the paper. There (see paragraph 12.1 in the paper) a transition matrix for the +entire computation is determined (see section 3 below) and the inverse is computed from that. +The approach here avoids the need for 2x2 matrix multiplications of various sizes, and appears to +be faster at the level of optimization we're able to do in C. + + +## 3. Batching multiple divsteps + +Every divstep can be expressed as a matrix multiplication, applying a transition matrix *(1/2 t)* +to both vectors *[f, g]* and *[d, e]* (see paragraph 8.1 in the paper): + +``` + t = [ u, v ] + [ q, r ] + + [ out_f ] = (1/2 * t) * [ in_f ] + [ out_g ] = [ in_g ] + + [ out_d ] = (1/2 * t) * [ in_d ] (mod M) + [ out_e ] [ in_e ] +``` + +where *(u, v, q, r)* is *(0, 2, -1, 1)*, *(2, 0, 1, 1)*, or *(2, 0, 0, 1)*, depending on which branch is +taken. As above, the resulting *f* and *g* are always integers. + +Performing multiple divsteps corresponds to a multiplication with the product of all the +individual divsteps' transition matrices. As each transition matrix consists of integers +divided by *2*, the product of these matrices will consist of integers divided by *2N* (see also +theorem 9.2 in the paper). These divisions are expensive when updating *d* and *e*, so we delay +them: we compute the integer coefficients of the combined transition matrix scaled by *2N*, and +do one division by *2N* as a final step: + +```python +def divsteps_n_matrix(delta, f, g): + """Compute delta and transition matrix t after N divsteps (multiplied by 2^N).""" + u, v, q, r = 1, 0, 0, 1 # start with identity matrix + for _ in range(N): + if delta > 0 and g & 1: + delta, f, g, u, v, q, r = 1 - delta, g, (g - f) // 2, 2*q, 2*r, q-u, r-v + elif g & 1: + delta, f, g, u, v, q, r = 1 + delta, f, (g + f) // 2, 2*u, 2*v, q+u, r+v + else: + delta, f, g, u, v, q, r = 1 + delta, f, (g ) // 2, 2*u, 2*v, q , r + return delta, (u, v, q, r) +``` + +As the branches in the divsteps are completely determined by the bottom *N* bits of *f* and *g*, this +function to compute the transition matrix only needs to see those bottom bits. Furthermore all +intermediate results and outputs fit in *(N+1)*-bit numbers (unsigned for *f* and *g*; signed for *u*, *v*, +*q*, and *r*) (see also paragraph 8.3 in the paper). This means that an implementation using 64-bit +integers could set *N=62* and compute the full transition matrix for 62 steps at once without any +big integer arithmetic at all. 
This is the reason why this algorithm is efficient: it only needs +to update the full-size *f*, *g*, *d*, and *e* numbers once every *N* steps. + +We still need functions to compute: + +``` + [ out_f ] = (1/2^N * [ u, v ]) * [ in_f ] + [ out_g ] ( [ q, r ]) [ in_g ] + + [ out_d ] = (1/2^N * [ u, v ]) * [ in_d ] (mod M) + [ out_e ] ( [ q, r ]) [ in_e ] +``` + +Because the divsteps transformation only ever divides even numbers by two, the result of *t [f,g]* is always even. When *t* is a composition of *N* divsteps, it follows that the resulting *f* +and *g* will be multiple of *2N*, and division by *2N* is simply shifting them down: + +```python +def update_fg(f, g, t): + """Multiply matrix t/2^N with [f, g].""" + u, v, q, r = t + cf, cg = u*f + v*g, q*f + r*g + # (t / 2^N) should cleanly apply to [f,g] so the result of t*[f,g] should have N zero + # bottom bits. + assert cf % 2**N == 0 + assert cg % 2**N == 0 + return cf >> N, cg >> N +``` + +The same is not true for *d* and *e*, and we need an equivalent of the `div2` function for division by *2N mod M*. +This is easy if we have precomputed *1/M mod 2N* (which always exists for odd *M*): + +```python +def div2n(M, Mi, x): + """Compute x/2^N mod M, given Mi = 1/M mod 2^N.""" + assert (M * Mi) % 2**N == 1 + # Find a factor m such that m*M has the same bottom N bits as x. We want: + # (m * M) mod 2^N = x mod 2^N + # <=> m mod 2^N = (x / M) mod 2^N + # <=> m mod 2^N = (x * Mi) mod 2^N + m = (Mi * x) % 2**N + # Subtract that multiple from x, cancelling its bottom N bits. + x -= m * M + # Now a clean division by 2^N is possible. + assert x % 2**N == 0 + return (x >> N) % M + +def update_de(d, e, t, M, Mi): + """Multiply matrix t/2^N with [d, e], modulo M.""" + u, v, q, r = t + cd, ce = u*d + v*e, q*d + r*e + return div2n(M, Mi, cd), div2n(M, Mi, ce) +``` + +With all of those, we can write a version of `modinv` that performs *N* divsteps at once: + +```python3 +def modinv(M, Mi, x): + """Compute the modular inverse of x mod M, given Mi=1/M mod 2^N.""" + assert M & 1 + delta, f, g, d, e = 1, M, x, 0, 1 + while g != 0: + # Compute the delta and transition matrix t for the next N divsteps (this only needs + # (N+1)-bit signed integer arithmetic). + delta, t = divsteps_n_matrix(delta, f % 2**N, g % 2**N) + # Apply the transition matrix t to [f, g]: + f, g = update_fg(f, g, t) + # Apply the transition matrix t to [d, e]: + d, e = update_de(d, e, t, M, Mi) + return (d * f) % M +``` + +This means that in practice we'll always perform a multiple of *N* divsteps. This is not a problem +because once *g=0*, further divsteps do not affect *f*, *g*, *d*, or *e* anymore (only *δ* keeps +increasing). For variable time code such excess iterations will be mostly optimized away in +section 6. + + +## 4. Avoiding modulus operations + +So far, there are two places where we compute a remainder of big numbers modulo *M*: at the end of +`div2n` in every `update_de`, and at the very end of `modinv` after potentially negating *d* due to the +sign of *f*. These are relatively expensive operations when done generically. + +To deal with the modulus operation in `div2n`, we simply stop requiring *d* and *e* to be in range +*[0,M)* all the time. Let's start by inlining `div2n` into `update_de`, and dropping the modulus +operation at the end: + +```python +def update_de(d, e, t, M, Mi): + """Multiply matrix t/2^N with [d, e] mod M, given Mi=1/M mod 2^N.""" + u, v, q, r = t + cd, ce = u*d + v*e, q*d + r*e + # Cancel out bottom N bits of cd and ce. 
+ md = -((Mi * cd) % 2**N) + me = -((Mi * ce) % 2**N) + cd += md * M + ce += me * M + # And cleanly divide by 2**N. + return cd >> N, ce >> N +``` + +Let's look at bounds on the ranges of these numbers. It can be shown that *|u|+|v|* and *|q|+|r|* +never exceed *2N* (see paragraph 8.3 in the paper), and thus a multiplication with *t* will have +outputs whose absolute values are at most *2N* times the maximum absolute input value. In case the +inputs *d* and *e* are in *(-M,M)*, which is certainly true for the initial values *d=0* and *e=1* assuming +*M > 1*, the multiplication results in numbers in range *(-2NM,2NM)*. Subtracting less than *2N* +times *M* to cancel out *N* bits brings that up to *(-2N+1M,2NM)*, and +dividing by *2N* at the end takes it to *(-2M,M)*. Another application of `update_de` would take that +to *(-3M,2M)*, and so forth. This progressive expansion of the variables' ranges can be +counteracted by incrementing *d* and *e* by *M* whenever they're negative: + +```python + ... + if d < 0: + d += M + if e < 0: + e += M + cd, ce = u*d + v*e, q*d + r*e + # Cancel out bottom N bits of cd and ce. + ... +``` + +With inputs in *(-2M,M)*, they will first be shifted into range *(-M,M)*, which means that the +output will again be in *(-2M,M)*, and this remains the case regardless of how many `update_de` +invocations there are. In what follows, we will try to make this more efficient. + +Note that increasing *d* by *M* is equal to incrementing *cd* by *u M* and *ce* by *q M*. Similarly, +increasing *e* by *M* is equal to incrementing *cd* by *v M* and *ce* by *r M*. So we could instead write: + +```python + ... + cd, ce = u*d + v*e, q*d + r*e + # Perform the equivalent of incrementing d, e by M when they're negative. + if d < 0: + cd += u*M + ce += q*M + if e < 0: + cd += v*M + ce += r*M + # Cancel out bottom N bits of cd and ce. + md = -((Mi * cd) % 2**N) + me = -((Mi * ce) % 2**N) + cd += md * M + ce += me * M + ... +``` + +Now note that we have two steps of corrections to *cd* and *ce* that add multiples of *M*: this +increment, and the decrement that cancels out bottom bits. The second one depends on the first +one, but they can still be efficiently combined by only computing the bottom bits of *cd* and *ce* +at first, and using that to compute the final *md*, *me* values: + +```python +def update_de(d, e, t, M, Mi): + """Multiply matrix t/2^N with [d, e], modulo M.""" + u, v, q, r = t + md, me = 0, 0 + # Compute what multiples of M to add to cd and ce. + if d < 0: + md += u + me += q + if e < 0: + md += v + me += r + # Compute bottom N bits of t*[d,e] + M*[md,me]. + cd, ce = (u*d + v*e + md*M) % 2**N, (q*d + r*e + me*M) % 2**N + # Correct md and me such that the bottom N bits of t*[d,e] + M*[md,me] are zero. + md -= (Mi * cd) % 2**N + me -= (Mi * ce) % 2**N + # Do the full computation. + cd, ce = u*d + v*e + md*M, q*d + r*e + me*M + # And cleanly divide by 2**N. + return cd >> N, ce >> N +``` + +One last optimization: we can avoid the *md M* and *me M* multiplications in the bottom bits of *cd* +and *ce* by moving them to the *md* and *me* correction: + +```python + ... + # Compute bottom N bits of t*[d,e]. + cd, ce = (u*d + v*e) % 2**N, (q*d + r*e) % 2**N + # Correct md and me such that the bottom N bits of t*[d,e]+M*[md,me] are zero. + # Note that this is not the same as {md = (-Mi * cd) % 2**N} etc. 
That would also result in N + # zero bottom bits, but isn't guaranteed to be a reduction of [0,2^N) compared to the + # previous md and me values, and thus would violate our bounds analysis. + md -= (Mi*cd + md) % 2**N + me -= (Mi*ce + me) % 2**N + ... +``` + +The resulting function takes *d* and *e* in range *(-2M,M)* as inputs, and outputs values in the same +range. That also means that the *d* value at the end of `modinv` will be in that range, while we want +a result in *[0,M)*. To do that, we need a normalization function. It's easy to integrate the +conditional negation of *d* (based on the sign of *f*) into it as well: + +```python +def normalize(sign, v, M): + """Compute sign*v mod M, where v is in range (-2*M,M); output in [0,M).""" + assert sign == 1 or sign == -1 + # v in (-2*M,M) + if v < 0: + v += M + # v in (-M,M). Now multiply v with sign (which can only be 1 or -1). + if sign == -1: + v = -v + # v in (-M,M) + if v < 0: + v += M + # v in [0,M) + return v +``` + +And calling it in `modinv` is simply: + +```python + ... + return normalize(f, d, M) +``` + + +## 5. Constant-time operation + +The primary selling point of the algorithm is fast constant-time operation. What code flow still +depends on the input data so far? + +- the number of iterations of the while *g ≠ 0* loop in `modinv` +- the branches inside `divsteps_n_matrix` +- the sign checks in `update_de` +- the sign checks in `normalize` + +To make the while loop in `modinv` constant time it can be replaced with a constant number of +iterations. The paper proves (Theorem 11.2) that *741* divsteps are sufficient for any *256*-bit +inputs, and [safegcd-bounds](https://github.com/sipa/safegcd-bounds) shows that the slightly better bound *724* is +sufficient even. Given that every loop iteration performs *N* divsteps, it will run a total of +*⌈724/N⌉* times. + +To deal with the branches in `divsteps_n_matrix` we will replace them with constant-time bitwise +operations (and hope the C compiler isn't smart enough to turn them back into branches; see +`valgrind_ctime_test.c` for automated tests that this isn't the case). To do so, observe that a +divstep can be written instead as (compare to the inner loop of `gcd` in section 1). + +```python + x = -f if delta > 0 else f # set x equal to (input) -f or f + if g & 1: + g += x # set g to (input) g-f or g+f + if delta > 0: + delta = -delta + f += g # set f to (input) g (note that g was set to g-f before) + delta += 1 + g >>= 1 +``` + +To convert the above to bitwise operations, we rely on a trick to negate conditionally: per the +definition of negative numbers in two's complement, (*-v == ~v + 1*) holds for every number *v*. As +*-1* in two's complement is all *1* bits, bitflipping can be expressed as xor with *-1*. It follows +that *-v == (v ^ -1) - (-1)*. Thus, if we have a variable *c* that takes on values *0* or *-1*, then +*(v ^ c) - c* is *v* if *c=0* and *-v* if *c=-1*. + +Using this we can write: + +```python + x = -f if delta > 0 else f +``` + +in constant-time form as: + +```python + c1 = (-delta) >> 63 + # Conditionally negate f based on c1: + x = (f ^ c1) - c1 +``` + +To use that trick, we need a helper mask variable *c1* that resolves the condition *δ>0* to *-1* +(if true) or *0* (if false). We compute *c1* using right shifting, which is equivalent to dividing by +the specified power of *2* and rounding down (in Python, and also in C under the assumption of a typical two's complement system; see +`assumptions.h` for tests that this is the case). 
Right shifting by *63* thus maps all +numbers in range *[-263,0)* to *-1*, and numbers in range *[0,263)* to *0*. + +Using the facts that *x&0=0* and *x&(-1)=x* (on two's complement systems again), we can write: + +```python + if g & 1: + g += x +``` + +as: + +```python + # Compute c2=0 if g is even and c2=-1 if g is odd. + c2 = -(g & 1) + # This masks out x if g is even, and leaves x be if g is odd. + g += x & c2 +``` + +Using the conditional negation trick again we can write: + +```python + if g & 1: + if delta > 0: + delta = -delta +``` + +as: + +```python + # Compute c3=-1 if g is odd and delta>0, and 0 otherwise. + c3 = c1 & c2 + # Conditionally negate delta based on c3: + delta = (delta ^ c3) - c3 +``` + +Finally: + +```python + if g & 1: + if delta > 0: + f += g +``` + +becomes: + +```python + f += g & c3 +``` + +It turns out that this can be implemented more efficiently by applying the substitution +*η=-δ*. In this representation, negating *δ* corresponds to negating *η*, and incrementing +*δ* corresponds to decrementing *η*. This allows us to remove the negation in the *c1* +computation: + +```python + # Compute a mask c1 for eta < 0, and compute the conditional negation x of f: + c1 = eta >> 63 + x = (f ^ c1) - c1 + # Compute a mask c2 for odd g, and conditionally add x to g: + c2 = -(g & 1) + g += x & c2 + # Compute a mask c for (eta < 0) and odd (input) g, and use it to conditionally negate eta, + # and add g to f: + c3 = c1 & c2 + eta = (eta ^ c3) - c3 + f += g & c3 + # Incrementing delta corresponds to decrementing eta. + eta -= 1 + g >>= 1 +``` + +By replacing the loop in `divsteps_n_matrix` with a variant of the divstep code above (extended to +also apply all *f* operations to *u*, *v* and all *g* operations to *q*, *r*), a constant-time version of +`divsteps_n_matrix` is obtained. The full code will be in section 7. + +These bit fiddling tricks can also be used to make the conditional negations and additions in +`update_de` and `normalize` constant-time. + + +## 6. Variable-time optimizations + +In section 5, we modified the `divsteps_n_matrix` function (and a few others) to be constant time. +Constant time operations are only necessary when computing modular inverses of secret data. In +other cases, it slows down calculations unnecessarily. In this section, we will construct a +faster non-constant time `divsteps_n_matrix` function. + +To do so, first consider yet another way of writing the inner loop of divstep operations in +`gcd` from section 1. This decomposition is also explained in the paper in section 8.2. + +```python +for _ in range(N): + if g & 1 and eta < 0: + eta, f, g = -eta, g, -f + if g & 1: + g += f + eta -= 1 + g >>= 1 +``` + +Whenever *g* is even, the loop only shifts *g* down and decreases *η*. When *g* ends in multiple zero +bits, these iterations can be consolidated into one step. This requires counting the bottom zero +bits efficiently, which is possible on most platforms; it is abstracted here as the function +`count_trailing_zeros`. + +```python +def count_trailing_zeros(v): + """For a non-zero value v, find z such that v=(d<>= zeros + i -= zeros + if i == 0: + break + # We know g is odd now + if eta < 0: + eta, f, g = -eta, g, -f + g += f + # g is even now, and the eta decrement and g shift will happen in the next loop. +``` + +We can now remove multiple bottom *0* bits from *g* at once, but still need a full iteration whenever +there is a bottom *1* bit. In what follows, we will get rid of multiple *1* bits simultaneously as +well. 
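+
+The `count_trailing_zeros` helper used above has so far been treated as a primitive (the C code provides its own `secp256k1_ctz{32,64}_var` helpers for this). A minimal Python sketch, relying on the fact that `v & -v` isolates the lowest set bit:
+
+```python
+def count_trailing_zeros(v):
+    """For a non-zero value v, return z such that v = d * 2**z for some odd d."""
+    assert v != 0
+    # v & -v keeps only the lowest set bit; its bit length minus one is the shift amount.
+    return (v & -v).bit_length() - 1
+
+# Sanity checks: 12 = 0b1100 has two trailing zero bits, 1 has none, 64 has six.
+assert count_trailing_zeros(12) == 2
+assert count_trailing_zeros(1) == 0
+assert count_trailing_zeros(64) == 6
+```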
+ +Observe that as long as *η ≥ 0*, the loop does not modify *f*. Instead, it cancels out bottom +bits of *g* and shifts them out, and decreases *η* and *i* accordingly - interrupting only when *η* +becomes negative, or when *i* reaches *0*. Combined, this is equivalent to adding a multiple of *f* to +*g* to cancel out multiple bottom bits, and then shifting them out. + +It is easy to find what that multiple is: we want a number *w* such that *g+w f* has a few bottom +zero bits. If that number of bits is *L*, we want *g+w f mod 2L = 0*, or *w = -g/f mod 2L*. Since *f* +is odd, such a *w* exists for any *L*. *L* cannot be more than *i* steps (as we'd finish the loop before +doing more) or more than *η+1* steps (as we'd run `eta, f, g = -eta, g, f` at that point), but +apart from that, we're only limited by the complexity of computing *w*. + +This code demonstrates how to cancel up to 4 bits per step: + +```python +NEGINV16 = [15, 5, 3, 9, 7, 13, 11, 1] # NEGINV16[n//2] = (-n)^-1 mod 16, for odd n +i = N +while True: + zeros = min(i, count_trailing_zeros(g)) + eta -= zeros + g >>= zeros + i -= zeros + if i == 0: + break + # We know g is odd now + if eta < 0: + eta, f, g = -eta, g, f + # Compute limit on number of bits to cancel + limit = min(min(eta + 1, i), 4) + # Compute w = -g/f mod 2**limit, using the table value for -1/f mod 2**4. Note that f is + # always odd, so its inverse modulo a power of two always exists. + w = (g * NEGINV16[(f & 15) // 2]) % (2**limit) + # As w = -g/f mod (2**limit), g+w*f mod 2**limit = 0 mod 2**limit. + g += w * f + assert g % (2**limit) == 0 + # The next iteration will now shift out at least limit bottom zero bits from g. +``` + +By using a bigger table more bits can be cancelled at once. The table can also be implemented +as a formula. Several formulas are known for computing modular inverses modulo powers of two; +some can be found in Hacker's Delight second edition by Henry S. Warren, Jr. pages 245-247. +Here we need the negated modular inverse, which is a simple transformation of those: + +- Instead of a 3-bit table: + - *-f* or *f ^ 6* +- Instead of a 4-bit table: + - *1 - f(f + 1)* + - *-(f + (((f + 1) & 4) << 1))* +- For larger tables the following technique can be used: if *w=-1/f mod 2L*, then *w(w f+2)* is + *-1/f mod 22L*. This allows extending the previous formulas (or tables). In particular we + have this 6-bit function (based on the 3-bit function above): + - *f(f2 - 2)* + +This loop, again extended to also handle *u*, *v*, *q*, and *r* alongside *f* and *g*, placed in +`divsteps_n_matrix`, gives a significantly faster, but non-constant time version. + + +## 7. Final Python version + +All together we need the following functions: + +- A way to compute the transition matrix in constant time, using the `divsteps_n_matrix` function + from section 2, but with its loop replaced by a variant of the constant-time divstep from + section 5, extended to handle *u*, *v*, *q*, *r*: + +```python +def divsteps_n_matrix(eta, f, g): + """Compute eta and transition matrix t after N divsteps (multiplied by 2^N).""" + u, v, q, r = 1, 0, 0, 1 # start with identity matrix + for _ in range(N): + c1 = eta >> 63 + # Compute x, y, z as conditionally-negated versions of f, u, v. + x, y, z = (f ^ c1) - c1, (u ^ c1) - c1, (v ^ c1) - c1 + c2 = -(g & 1) + # Conditionally add x, y, z to g, q, r. 
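+        # (c2 is 0 when g is even and -1, i.e. all one bits, when g is odd, so the
+        # bitwise-and below either zeroes out these additions or applies them in full.)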
+ g, q, r = g + (x & c2), q + (y & c2), r + (z & c2) + c1 &= c2 # reusing c1 here for the earlier c3 variable + eta = (eta ^ c1) - (c1 + 1) # inlining the unconditional eta decrement here + # Conditionally add g, q, r to f, u, v. + f, u, v = f + (g & c1), u + (q & c1), v + (r & c1) + # When shifting g down, don't shift q, r, as we construct a transition matrix multiplied + # by 2^N. Instead, shift f's coefficients u and v up. + g, u, v = g >> 1, u << 1, v << 1 + return eta, (u, v, q, r) +``` + +- The functions to update *f* and *g*, and *d* and *e*, from section 2 and section 4, with the constant-time + changes to `update_de` from section 5: + +```python +def update_fg(f, g, t): + """Multiply matrix t/2^N with [f, g].""" + u, v, q, r = t + cf, cg = u*f + v*g, q*f + r*g + return cf >> N, cg >> N + +def update_de(d, e, t, M, Mi): + """Multiply matrix t/2^N with [d, e], modulo M.""" + u, v, q, r = t + d_sign, e_sign = d >> 257, e >> 257 + md, me = (u & d_sign) + (v & e_sign), (q & d_sign) + (r & e_sign) + cd, ce = (u*d + v*e) % 2**N, (q*d + r*e) % 2**N + md -= (Mi*cd + md) % 2**N + me -= (Mi*ce + me) % 2**N + cd, ce = u*d + v*e + Mi*md, q*d + r*e + Mi*me + return cd >> N, ce >> N +``` + +- The `normalize` function from section 4, made constant time as well: + +```python +def normalize(sign, v, M): + """Compute sign*v mod M, where v in (-2*M,M); output in [0,M).""" + v_sign = v >> 257 + # Conditionally add M to v. + v += M & v_sign + c = (sign - 1) >> 1 + # Conditionally negate v. + v = (v ^ c) - c + v_sign = v >> 257 + # Conditionally add M to v again. + v += M & v_sign + return v +``` + +- And finally the `modinv` function too, adapted to use *η* instead of *δ*, and using the fixed + iteration count from section 5: + +```python +def modinv(M, Mi, x): + """Compute the modular inverse of x mod M, given Mi=1/M mod 2^N.""" + eta, f, g, d, e = -1, M, x, 0, 1 + for _ in range((724 + N - 1) // N): + eta, t = divsteps_n_matrix(-eta, f % 2**N, g % 2**N) + f, g = update_fg(f, g, t) + d, e = update_de(d, e, t, M, Mi) + return normalize(f, d, M) +``` + +- To get a variable time version, replace the `divsteps_n_matrix` function with one that uses the + divsteps loop from section 5, and a `modinv` version that calls it without the fixed iteration + count: + +```python +NEGINV16 = [15, 5, 3, 9, 7, 13, 11, 1] # NEGINV16[n//2] = (-n)^-1 mod 16, for odd n +def divsteps_n_matrix_var(eta, f, g): + """Compute eta and transition matrix t after N divsteps (multiplied by 2^N).""" + u, v, q, r = 1, 0, 0, 1 + i = N + while True: + zeros = min(i, count_trailing_zeros(g)) + eta, i = eta - zeros, i - zeros + g, u, v = g >> zeros, u << zeros, v << zeros + if i == 0: + break + if eta < 0: + eta, f, u, v, g, q, r = -eta, g, q, r, -f, -u, -v + limit = min(min(eta + 1, i), 4) + w = (g * NEGINV16[(f & 15) // 2]) % (2**limit) + g, q, r = g + w*f, q + w*u, r + w*v + return eta, (u, v, q, r) + +def modinv_var(M, Mi, x): + """Compute the modular inverse of x mod M, given Mi = 1/M mod 2^N.""" + eta, f, g, d, e = -1, M, x, 0, 1 + while g != 0: + eta, t = divsteps_n_matrix_var(eta, f % 2**N, g % 2**N) + f, g = update_fg(f, g, t) + d, e = update_de(d, e, t, M, Mi) + return normalize(f, d, Mi) +``` diff --git a/include/secp256k1_multiset.h b/include/secp256k1_multiset.h deleted file mode 100644 index e77c564..0000000 --- a/include/secp256k1_multiset.h +++ /dev/null @@ -1,136 +0,0 @@ -/********************************************************************** - * Copyright (c) 2017 Tomas van der Wansem * - * Distributed under the MIT 
software license, see the accompanying * - * file COPYING or http://www.opensource.org/licenses/mit-license.php.* - **********************************************************************/ - - -#ifndef _SECP256K1_MULTISET__ -# define _SECP256K1_MULTISET__ - -# include "secp256k1.h" - - -# ifdef __cplusplus -extern "C" { -# endif - - -/** Opaque multiset; this is actually a group element **/ -typedef struct { - unsigned char d[96]; -} secp256k1_multiset; - - - -/** Initialize a multiset - * The resulting multiset the multiset for no data elements - * - * Returns: 1: success - * 0: invalid parameter - * Args: ctx: pointer to a context object (cannot be NULL) - * Out: multiset: the resulting multiset (cannot be NULL) - */ -SECP256K1_API int secp256k1_multiset_init( - const secp256k1_context* ctx, - secp256k1_multiset *multiset -) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2); - -/** Serialize a multiset to bytes - * - * Returns: 1: - * Args: ctx: pointer to a context object (cannot be NULL) - * Out: out64: pointer to a 64-byte array to store the serialized multiset (cannot be NULL) - * In: multiset: pointer to the multiset (cannot be NULL) - */ -SECP256K1_API int secp256k1_multiset_serialize( - const secp256k1_context* ctx, - unsigned char *out64, - const secp256k1_multiset *multiset -) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3); - -/** parse a 64 byte multiset into a multiset object - * - * Returns: 1: suceess - * 0: Failed to parse multiset - * Args: ctx: pointer to a context object (cannot be NULL) - * Out: multiset: pointer to a multiset object (cannot be NULL) - * In: in64: pointer to the 64-byte multiset to be parsed (cannot be NULL) - */ -SECP256K1_API int secp256k1_multiset_parse( - const secp256k1_context* ctx, - secp256k1_multiset *multiset, - const unsigned char *in64 -) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3); - -/** Adds an element to a multiset from single data element - * - * Returns: 1: success - * 0: invalid parameter - * Args: ctx: pointer to a context object (cannot be NULL) - * Out: multiset: the multiset to update (cannot be NULL) - * In: input: the data to add (cannot be NULL) - * inputLen: the size of the data to add - */ -SECP256K1_API int secp256k1_multiset_add( - const secp256k1_context* ctx, - secp256k1_multiset *multiset, - const unsigned char *input, - size_t inputLen -) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3); - -/** Removes an element from a multiset - * - * Returns: 1: success - * 0: invalid parameter - * Args: ctx: pointer to a context object (cannot be NULL) - * Out: multiset: the multiset to update (cannot be NULL) - * In: input: the data to remove (cannot be NULL) - * inputLen: the size of the data to remove - */ -SECP256K1_API int secp256k1_multiset_remove( - const secp256k1_context* ctx, - secp256k1_multiset *multiset, - const unsigned char *input, - size_t inputLen -) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3); - - - -/** Combines two multisets - * - * Returns: 1: success - * 0: invalid parameter - * Args: ctx: pointer to a context object (cannot be NULL) - * In/Out: multiset: the multiset to which the input must be added (cannot be NULL) - * In: input: the multiset to add - */ -SECP256K1_API int secp256k1_multiset_combine( - const secp256k1_context* ctx, - secp256k1_multiset *multiset, - const secp256k1_multiset *input - -) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3); - - -/** Converts a 
multiset to a hash - * - * Returns: 1: success - * 0: invalid parameter - * Args: ctx: pointer to a context object (cannot be NULL) - * Out: hash: the resulting 32-byte hash - * In: multiset: the multiset to hash - */ -SECP256K1_API int secp256k1_multiset_finalize( - const secp256k1_context* ctx, - unsigned char *resultHash, - const secp256k1_multiset *multiset -) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3); - - - -# ifdef __cplusplus -} -# endif - -#endif diff --git a/src/basic-config.h b/src/basic-config.h index bb6b582..e4b1b8b 100644 --- a/src/basic-config.h +++ b/src/basic-config.h @@ -13,19 +13,10 @@ #undef USE_ECMULT_STATIC_PRECOMPUTATION #undef USE_EXTERNAL_ASM #undef USE_EXTERNAL_DEFAULT_CALLBACKS -#undef USE_FIELD_INV_BUILTIN -#undef USE_FIELD_INV_NUM -#undef USE_NUM_GMP -#undef USE_NUM_NONE -#undef USE_SCALAR_INV_BUILTIN -#undef USE_SCALAR_INV_NUM #undef USE_FORCE_WIDEMUL_INT64 #undef USE_FORCE_WIDEMUL_INT128 #undef ECMULT_WINDOW_SIZE -#define USE_NUM_NONE 1 -#define USE_FIELD_INV_BUILTIN 1 -#define USE_SCALAR_INV_BUILTIN 1 #define USE_WIDEMUL_64 1 #define ECMULT_WINDOW_SIZE 15 diff --git a/src/bench_ecmult.c b/src/bench_ecmult.c index 85b9e43..204e85a 100644 --- a/src/bench_ecmult.c +++ b/src/bench_ecmult.c @@ -9,7 +9,6 @@ #include "util.h" #include "hash_impl.h" -#include "num_impl.h" #include "field_impl.h" #include "group_impl.h" #include "scalar_impl.h" diff --git a/src/bench_internal.c b/src/bench_internal.c index 7fa6882..73b8a24 100644 --- a/src/bench_internal.c +++ b/src/bench_internal.c @@ -10,7 +10,6 @@ #include "assumptions.h" #include "util.h" #include "hash_impl.h" -#include "num_impl.h" #include "field_impl.h" #include "group_impl.h" #include "scalar_impl.h" @@ -99,15 +98,6 @@ void bench_scalar_negate(void* arg, int iters) { } } -void bench_scalar_sqr(void* arg, int iters) { - int i; - bench_inv *data = (bench_inv*)arg; - - for (i = 0; i < iters; i++) { - secp256k1_scalar_sqr(&data->scalar[0], &data->scalar[0]); - } -} - void bench_scalar_mul(void* arg, int iters) { int i; bench_inv *data = (bench_inv*)arg; @@ -255,26 +245,6 @@ void bench_group_add_affine_var(void* arg, int iters) { } } -void bench_group_jacobi_var(void* arg, int iters) { - int i, j = 0; - bench_inv *data = (bench_inv*)arg; - - for (i = 0; i < iters; i++) { - j += secp256k1_gej_has_quad_y_var(&data->gej[0]); - /* Vary the Y and Z coordinates of the input (the X coordinate doesn't matter to - secp256k1_gej_has_quad_y_var). Note that the resulting coordinates will - generally not correspond to a point on the curve, but this is not a problem - for the code being benchmarked here. Adding and normalizing have less - overhead than EC operations (which could guarantee the point remains on the - curve). */ - secp256k1_fe_add(&data->gej[0].y, &data->fe[1]); - secp256k1_fe_add(&data->gej[0].z, &data->fe[2]); - secp256k1_fe_normalize_var(&data->gej[0].y); - secp256k1_fe_normalize_var(&data->gej[0].z); - } - CHECK(j <= iters); -} - void bench_group_to_affine_var(void* arg, int iters) { int i; bench_inv *data = (bench_inv*)arg; @@ -282,8 +252,10 @@ void bench_group_to_affine_var(void* arg, int iters) { for (i = 0; i < iters; ++i) { secp256k1_ge_set_gej_var(&data->ge[1], &data->gej[0]); /* Use the output affine X/Y coordinates to vary the input X/Y/Z coordinates. - Similar to bench_group_jacobi_var, this approach does not result in - coordinates of points on the curve. 
*/ + Note that the resulting coordinates will generally not correspond to a point + on the curve, but this is not a problem for the code being benchmarked here. + Adding and normalizing have less overhead than EC operations (which could + guarantee the point remains on the curve). */ secp256k1_fe_add(&data->gej[0].x, &data->ge[1].y); secp256k1_fe_add(&data->gej[0].y, &data->fe[2]); secp256k1_fe_add(&data->gej[0].z, &data->ge[1].x); @@ -369,35 +341,16 @@ void bench_context_sign(void* arg, int iters) { } } -#ifndef USE_NUM_NONE -void bench_num_jacobi(void* arg, int iters) { - int i, j = 0; - bench_inv *data = (bench_inv*)arg; - secp256k1_num nx, na, norder; - - secp256k1_scalar_get_num(&nx, &data->scalar[0]); - secp256k1_scalar_order_get_num(&norder); - secp256k1_scalar_get_num(&na, &data->scalar[1]); - - for (i = 0; i < iters; i++) { - j += secp256k1_num_jacobi(&nx, &norder); - secp256k1_num_add(&nx, &nx, &na); - } - CHECK(j <= iters); -} -#endif - int main(int argc, char **argv) { bench_inv data; int iters = get_iters(20000); if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "add")) run_benchmark("scalar_add", bench_scalar_add, bench_setup, NULL, &data, 10, iters*100); if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "negate")) run_benchmark("scalar_negate", bench_scalar_negate, bench_setup, NULL, &data, 10, iters*100); - if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "sqr")) run_benchmark("scalar_sqr", bench_scalar_sqr, bench_setup, NULL, &data, 10, iters*10); if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "mul")) run_benchmark("scalar_mul", bench_scalar_mul, bench_setup, NULL, &data, 10, iters*10); if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "split")) run_benchmark("scalar_split", bench_scalar_split, bench_setup, NULL, &data, 10, iters); - if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse", bench_scalar_inverse, bench_setup, NULL, &data, 10, 2000); - if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse_var", bench_scalar_inverse_var, bench_setup, NULL, &data, 10, 2000); + if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse", bench_scalar_inverse, bench_setup, NULL, &data, 10, iters); + if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse_var", bench_scalar_inverse_var, bench_setup, NULL, &data, 10, iters); if (have_flag(argc, argv, "field") || have_flag(argc, argv, "normalize")) run_benchmark("field_normalize", bench_field_normalize, bench_setup, NULL, &data, 10, iters*100); if (have_flag(argc, argv, "field") || have_flag(argc, argv, "normalize")) run_benchmark("field_normalize_weak", bench_field_normalize_weak, bench_setup, NULL, &data, 10, iters*100); @@ -411,7 +364,6 @@ int main(int argc, char **argv) { if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_var", bench_group_add_var, bench_setup, NULL, &data, 10, iters*10); if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_affine", bench_group_add_affine, bench_setup, NULL, &data, 10, iters*10); if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_affine_var", bench_group_add_affine_var, bench_setup, NULL, &data, 10, iters*10); - if (have_flag(argc, argv, "group") || have_flag(argc, argv, "jacobi")) run_benchmark("group_jacobi_var", 
bench_group_jacobi_var, bench_setup, NULL, &data, 10, iters); if (have_flag(argc, argv, "group") || have_flag(argc, argv, "to_affine")) run_benchmark("group_to_affine_var", bench_group_to_affine_var, bench_setup, NULL, &data, 10, iters); if (have_flag(argc, argv, "ecmult") || have_flag(argc, argv, "wnaf")) run_benchmark("wnaf_const", bench_wnaf_const, bench_setup, NULL, &data, 10, iters); @@ -424,8 +376,5 @@ int main(int argc, char **argv) { if (have_flag(argc, argv, "context") || have_flag(argc, argv, "verify")) run_benchmark("context_verify", bench_context_verify, bench_setup, NULL, &data, 10, 1 + iters/1000); if (have_flag(argc, argv, "context") || have_flag(argc, argv, "sign")) run_benchmark("context_sign", bench_context_sign, bench_setup, NULL, &data, 10, 1 + iters/100); -#ifndef USE_NUM_NONE - if (have_flag(argc, argv, "num") || have_flag(argc, argv, "jacobi")) run_benchmark("num_jacobi", bench_num_jacobi, bench_setup, NULL, &data, 10, iters*10); -#endif return 0; } diff --git a/src/bench_multiset.c b/src/bench_multiset.c deleted file mode 100644 index 6696f3f..0000000 --- a/src/bench_multiset.c +++ /dev/null @@ -1,62 +0,0 @@ -/********************************************************************** - * Copyright (c) 2017 Tomas van der Wansem * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or http://www.opensource.org/licenses/mit-license.php.* - **********************************************************************/ - -#include "include/secp256k1.h" -#include "include/secp256k1_multiset.h" -#include "util.h" -#include "bench.h" - - -typedef struct { - secp256k1_context *ctx; -} bench_multiset_data; - - -void bench_multiset_add(void* arg, int iters) { - bench_multiset_data *data = (bench_multiset_data*)arg; - int i; - secp256k1_multiset multiset; - unsigned char buf[100] = "multiset add element benchmark example string for 100 bytes this long message to 100 bytes abcdefghi"; - - secp256k1_multiset_init(data->ctx, &multiset); - - for (i = 0; i < iters; i++) { - buf[0] = i; - buf[1] = i >> 8; - buf[2] = i << 8; - CHECK(secp256k1_multiset_add(data->ctx, &multiset, buf, sizeof(buf))); - } -} - -void bench_multiset_remove(void* arg, int iters) { - bench_multiset_data *data = (bench_multiset_data*)arg; - int i; - secp256k1_multiset multiset; - unsigned char buf[100] = "multiset add element benchmark example string for 100 bytes this long message to 100 bytes abcdefghi"; - - secp256k1_multiset_init(data->ctx, &multiset); - - for (i = 0; i < iters; i++) { - buf[99] = i; - buf[98] = i >> 8; - buf[97] = i << 8; - CHECK(secp256k1_multiset_remove(data->ctx, &multiset, buf, sizeof(buf))); - } -} - -int main(void) { - int iters = get_iters(10000); - bench_multiset_data data; - - - data.ctx = secp256k1_context_create(SECP256K1_CONTEXT_VERIFY); - - run_benchmark("multiset add", bench_multiset_add, NULL, NULL, &data, 30, iters*2); - run_benchmark("multiset remove", bench_multiset_remove, NULL, NULL, &data, 30, iters*2); - - secp256k1_context_destroy(data.ctx); - return 0; -} diff --git a/src/ecmult.h b/src/ecmult.h index 7aa394a..7ab617e 100644 --- a/src/ecmult.h +++ b/src/ecmult.h @@ -7,7 +7,6 @@ #ifndef SECP256K1_ECMULT_H #define SECP256K1_ECMULT_H -#include "num.h" #include "group.h" #include "scalar.h" #include "scratch.h" diff --git a/src/field.h b/src/field.h index ee222ee..c58554b 100644 --- a/src/field.h +++ b/src/field.h @@ -104,9 +104,6 @@ static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a); * itself. 
*/ static int secp256k1_fe_sqrt(secp256k1_fe *r, const secp256k1_fe *a); -/** Checks whether a field element is a quadratic residue. */ -static int secp256k1_fe_is_quad_var(const secp256k1_fe *a); - /** Sets a field element to be the (modular) inverse of another. Requires the input's magnitude to be * at most 8. The output magnitude is 1 (but not guaranteed to be normalized). */ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a); diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 62bffdc..c280251 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -9,6 +9,7 @@ #include "util.h" #include "field.h" +#include "modinv32_impl.h" #ifdef VERIFY static void secp256k1_fe_verify(const secp256k1_fe *a) { @@ -1164,4 +1165,92 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } +static void secp256k1_fe_from_signed30(secp256k1_fe *r, const secp256k1_modinv32_signed30 *a) { + const uint32_t M26 = UINT32_MAX >> 6; + const uint32_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4], + a5 = a->v[5], a6 = a->v[6], a7 = a->v[7], a8 = a->v[8]; + + /* The output from secp256k1_modinv32{_var} should be normalized to range [0,modulus), and + * have limbs in [0,2^30). The modulus is < 2^256, so the top limb must be below 2^(256-30*8). + */ + VERIFY_CHECK(a0 >> 30 == 0); + VERIFY_CHECK(a1 >> 30 == 0); + VERIFY_CHECK(a2 >> 30 == 0); + VERIFY_CHECK(a3 >> 30 == 0); + VERIFY_CHECK(a4 >> 30 == 0); + VERIFY_CHECK(a5 >> 30 == 0); + VERIFY_CHECK(a6 >> 30 == 0); + VERIFY_CHECK(a7 >> 30 == 0); + VERIFY_CHECK(a8 >> 16 == 0); + + r->n[0] = a0 & M26; + r->n[1] = (a0 >> 26 | a1 << 4) & M26; + r->n[2] = (a1 >> 22 | a2 << 8) & M26; + r->n[3] = (a2 >> 18 | a3 << 12) & M26; + r->n[4] = (a3 >> 14 | a4 << 16) & M26; + r->n[5] = (a4 >> 10 | a5 << 20) & M26; + r->n[6] = (a5 >> 6 | a6 << 24) & M26; + r->n[7] = (a6 >> 2 ) & M26; + r->n[8] = (a6 >> 28 | a7 << 2) & M26; + r->n[9] = (a7 >> 24 | a8 << 6); + +#ifdef VERIFY + r->magnitude = 1; + r->normalized = 1; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_to_signed30(secp256k1_modinv32_signed30 *r, const secp256k1_fe *a) { + const uint32_t M30 = UINT32_MAX >> 2; + const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4], + a5 = a->n[5], a6 = a->n[6], a7 = a->n[7], a8 = a->n[8], a9 = a->n[9]; + +#ifdef VERIFY + VERIFY_CHECK(a->normalized); +#endif + + r->v[0] = (a0 | a1 << 26) & M30; + r->v[1] = (a1 >> 4 | a2 << 22) & M30; + r->v[2] = (a2 >> 8 | a3 << 18) & M30; + r->v[3] = (a3 >> 12 | a4 << 14) & M30; + r->v[4] = (a4 >> 16 | a5 << 10) & M30; + r->v[5] = (a5 >> 20 | a6 << 6) & M30; + r->v[6] = (a6 >> 24 | a7 << 2 + | a8 << 28) & M30; + r->v[7] = (a8 >> 2 | a9 << 24) & M30; + r->v[8] = a9 >> 6; +} + +static const secp256k1_modinv32_modinfo secp256k1_const_modinfo_fe = { + {{-0x3D1, -4, 0, 0, 0, 0, 0, 0, 65536}}, + 0x2DDACACFL +}; + +static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *x) { + secp256k1_fe tmp; + secp256k1_modinv32_signed30 s; + + tmp = *x; + secp256k1_fe_normalize(&tmp); + secp256k1_fe_to_signed30(&s, &tmp); + secp256k1_modinv32(&s, &secp256k1_const_modinfo_fe); + secp256k1_fe_from_signed30(r, &s); + + VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp)); +} + +static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *x) { + secp256k1_fe tmp; + secp256k1_modinv32_signed30 s; + + tmp = *x; + secp256k1_fe_normalize_var(&tmp); + secp256k1_fe_to_signed30(&s, &tmp); + 
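+    /* Invert in the signed30 representation using the variable-time safegcd code, then
+     * convert the result back to the field representation. */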
secp256k1_modinv32_var(&s, &secp256k1_const_modinfo_fe); + secp256k1_fe_from_signed30(r, &s); + + VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp)); +} + #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 3465ea3..b73cfea 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -13,6 +13,7 @@ #include "util.h" #include "field.h" +#include "modinv64_impl.h" #if defined(USE_ASM_X86_64) #include "field_5x52_asm_impl.h" @@ -498,4 +499,80 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #endif } +static void secp256k1_fe_from_signed62(secp256k1_fe *r, const secp256k1_modinv64_signed62 *a) { + const uint64_t M52 = UINT64_MAX >> 12; + const uint64_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4]; + + /* The output from secp256k1_modinv64{_var} should be normalized to range [0,modulus), and + * have limbs in [0,2^62). The modulus is < 2^256, so the top limb must be below 2^(256-62*4). + */ + VERIFY_CHECK(a0 >> 62 == 0); + VERIFY_CHECK(a1 >> 62 == 0); + VERIFY_CHECK(a2 >> 62 == 0); + VERIFY_CHECK(a3 >> 62 == 0); + VERIFY_CHECK(a4 >> 8 == 0); + + r->n[0] = a0 & M52; + r->n[1] = (a0 >> 52 | a1 << 10) & M52; + r->n[2] = (a1 >> 42 | a2 << 20) & M52; + r->n[3] = (a2 >> 32 | a3 << 30) & M52; + r->n[4] = (a3 >> 22 | a4 << 40); + +#ifdef VERIFY + r->magnitude = 1; + r->normalized = 1; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_to_signed62(secp256k1_modinv64_signed62 *r, const secp256k1_fe *a) { + const uint64_t M62 = UINT64_MAX >> 2; + const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4]; + +#ifdef VERIFY + VERIFY_CHECK(a->normalized); +#endif + + r->v[0] = (a0 | a1 << 52) & M62; + r->v[1] = (a1 >> 10 | a2 << 42) & M62; + r->v[2] = (a2 >> 20 | a3 << 32) & M62; + r->v[3] = (a3 >> 30 | a4 << 22) & M62; + r->v[4] = a4 >> 40; +} + +static const secp256k1_modinv64_modinfo secp256k1_const_modinfo_fe = { + {{-0x1000003D1LL, 0, 0, 0, 256}}, + 0x27C7F6E22DDACACFLL +}; + +static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *x) { + secp256k1_fe tmp; + secp256k1_modinv64_signed62 s; + + tmp = *x; + secp256k1_fe_normalize(&tmp); + secp256k1_fe_to_signed62(&s, &tmp); + secp256k1_modinv64(&s, &secp256k1_const_modinfo_fe); + secp256k1_fe_from_signed62(r, &s); + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp)); +#endif +} + +static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *x) { + secp256k1_fe tmp; + secp256k1_modinv64_signed62 s; + + tmp = *x; + secp256k1_fe_normalize_var(&tmp); + secp256k1_fe_to_signed62(&s, &tmp); + secp256k1_modinv64_var(&s, &secp256k1_const_modinfo_fe); + secp256k1_fe_from_signed62(r, &s); + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp)); +#endif +} + #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_impl.h b/src/field_impl.h index f0096f6..374284a 100644 --- a/src/field_impl.h +++ b/src/field_impl.h @@ -12,7 +12,6 @@ #endif #include "util.h" -#include "num.h" #if defined(SECP256K1_WIDEMUL_INT128) #include "field_5x52_impl.h" @@ -136,158 +135,6 @@ static int secp256k1_fe_sqrt(secp256k1_fe *r, const secp256k1_fe *a) { return secp256k1_fe_equal(&t1, a); } -static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { - secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; - int j; - - /** The binary 
representation of (p - 2) has 5 blocks of 1s, with lengths in - * { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: - * [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] - */ - - secp256k1_fe_sqr(&x2, a); - secp256k1_fe_mul(&x2, &x2, a); - - secp256k1_fe_sqr(&x3, &x2); - secp256k1_fe_mul(&x3, &x3, a); - - x6 = x3; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x6, &x6); - } - secp256k1_fe_mul(&x6, &x6, &x3); - - x9 = x6; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x9, &x9); - } - secp256k1_fe_mul(&x9, &x9, &x3); - - x11 = x9; - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&x11, &x11); - } - secp256k1_fe_mul(&x11, &x11, &x2); - - x22 = x11; - for (j=0; j<11; j++) { - secp256k1_fe_sqr(&x22, &x22); - } - secp256k1_fe_mul(&x22, &x22, &x11); - - x44 = x22; - for (j=0; j<22; j++) { - secp256k1_fe_sqr(&x44, &x44); - } - secp256k1_fe_mul(&x44, &x44, &x22); - - x88 = x44; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x88, &x88); - } - secp256k1_fe_mul(&x88, &x88, &x44); - - x176 = x88; - for (j=0; j<88; j++) { - secp256k1_fe_sqr(&x176, &x176); - } - secp256k1_fe_mul(&x176, &x176, &x88); - - x220 = x176; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x220, &x220); - } - secp256k1_fe_mul(&x220, &x220, &x44); - - x223 = x220; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x223, &x223); - } - secp256k1_fe_mul(&x223, &x223, &x3); - - /* The final result is then assembled using a sliding window over the blocks. */ - - t1 = x223; - for (j=0; j<23; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x22); - for (j=0; j<5; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, a); - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x2); - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(r, a, &t1); -} - -static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { -#if defined(USE_FIELD_INV_BUILTIN) - secp256k1_fe_inv(r, a); -#elif defined(USE_FIELD_INV_NUM) - secp256k1_num n, m; - static const secp256k1_fe negone = SECP256K1_FE_CONST( - 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, - 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFEUL, 0xFFFFFC2EUL - ); - /* secp256k1 field prime, value p defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. */ - static const unsigned char prime[32] = { - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F - }; - unsigned char b[32]; - int res; - secp256k1_fe c = *a; - secp256k1_fe_normalize_var(&c); - secp256k1_fe_get_b32(b, &c); - secp256k1_num_set_bin(&n, b, 32); - secp256k1_num_set_bin(&m, prime, 32); - secp256k1_num_mod_inverse(&n, &n, &m); - secp256k1_num_get_bin(b, 32, &n); - res = secp256k1_fe_set_b32(r, b); - (void)res; - VERIFY_CHECK(res); - /* Verify the result is the (unique) valid inverse using non-GMP code. */ - secp256k1_fe_mul(&c, &c, r); - secp256k1_fe_add(&c, &negone); - CHECK(secp256k1_fe_normalizes_to_zero_var(&c)); -#else -#error "Please select field inverse implementation" -#endif -} - -static int secp256k1_fe_is_quad_var(const secp256k1_fe *a) { -#ifndef USE_NUM_NONE - unsigned char b[32]; - secp256k1_num n; - secp256k1_num m; - /* secp256k1 field prime, value p defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. 
*/ - static const unsigned char prime[32] = { - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F - }; - - secp256k1_fe c = *a; - secp256k1_fe_normalize_var(&c); - secp256k1_fe_get_b32(b, &c); - secp256k1_num_set_bin(&n, b, 32); - secp256k1_num_set_bin(&m, prime, 32); - return secp256k1_num_jacobi(&n, &m) >= 0; -#else - secp256k1_fe r; - return secp256k1_fe_sqrt(&r, a); -#endif -} - static const secp256k1_fe secp256k1_fe_one = SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 1); #endif /* SECP256K1_FIELD_IMPL_H */ diff --git a/src/group.h b/src/group.h index 40bf961..b9cd334 100644 --- a/src/group.h +++ b/src/group.h @@ -7,7 +7,6 @@ #ifndef SECP256K1_GROUP_H #define SECP256K1_GROUP_H -#include "num.h" #include "field.h" /** A group element of the secp256k1 curve, in affine coordinates. */ @@ -43,12 +42,6 @@ typedef struct { /** Set a group element equal to the point with given X and Y coordinates */ static void secp256k1_ge_set_xy(secp256k1_ge *r, const secp256k1_fe *x, const secp256k1_fe *y); -/** Set a group element (affine) equal to the point with the given X coordinate - * and a Y coordinate that is a quadratic residue modulo p. The return value - * is true iff a coordinate with the given X coordinate exists. - */ -static int secp256k1_ge_set_xquad(secp256k1_ge *r, const secp256k1_fe *x); - /** Set a group element (affine) equal to the point with the given X coordinate, and given oddness * for Y. Return value indicates whether the result is valid. */ static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int odd); @@ -96,9 +89,6 @@ static void secp256k1_gej_neg(secp256k1_gej *r, const secp256k1_gej *a); /** Check whether a group element is the point at infinity. */ static int secp256k1_gej_is_infinity(const secp256k1_gej *a); -/** Check whether a group element's y coordinate is a quadratic residue. */ -static int secp256k1_gej_has_quad_y_var(const secp256k1_gej *a); - /** Set r equal to the double of a. Constant time. */ static void secp256k1_gej_double(secp256k1_gej *r, const secp256k1_gej *a); diff --git a/src/group_impl.h b/src/group_impl.h index b7094c5..19ebd8f 100644 --- a/src/group_impl.h +++ b/src/group_impl.h @@ -7,7 +7,6 @@ #ifndef SECP256K1_GROUP_IMPL_H #define SECP256K1_GROUP_IMPL_H -#include "num.h" #include "field.h" #include "group.h" @@ -207,18 +206,14 @@ static void secp256k1_ge_clear(secp256k1_ge *r) { secp256k1_fe_clear(&r->y); } -static int secp256k1_ge_set_xquad(secp256k1_ge *r, const secp256k1_fe *x) { +static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int odd) { secp256k1_fe x2, x3; r->x = *x; secp256k1_fe_sqr(&x2, x); secp256k1_fe_mul(&x3, x, &x2); r->infinity = 0; secp256k1_fe_add(&x3, &secp256k1_fe_const_b); - return secp256k1_fe_sqrt(&r->y, &x3); -} - -static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int odd) { - if (!secp256k1_ge_set_xquad(r, x)) { + if (!secp256k1_fe_sqrt(&r->y, &x3)) { return 0; } secp256k1_fe_normalize_var(&r->y); @@ -655,20 +650,6 @@ static void secp256k1_ge_mul_lambda(secp256k1_ge *r, const secp256k1_ge *a) { secp256k1_fe_mul(&r->x, &r->x, &beta); } -static int secp256k1_gej_has_quad_y_var(const secp256k1_gej *a) { - secp256k1_fe yz; - - if (a->infinity) { - return 0; - } - - /* We rely on the fact that the Jacobi symbol of 1 / a->z^3 is the same as - * that of a->z. 
Thus a->y / a->z^3 is a quadratic residue iff a->y * a->z - is */ - secp256k1_fe_mul(&yz, &a->y, &a->z); - return secp256k1_fe_is_quad_var(&yz); -} - static int secp256k1_ge_is_in_correct_subgroup(const secp256k1_ge* ge) { #ifdef EXHAUSTIVE_TEST_ORDER secp256k1_gej out; diff --git a/src/modinv32.h b/src/modinv32.h new file mode 100644 index 0000000..0efdda9 --- /dev/null +++ b/src/modinv32.h @@ -0,0 +1,42 @@ +/*********************************************************************** + * Copyright (c) 2020 Peter Dettman * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php.* + **********************************************************************/ + +#ifndef SECP256K1_MODINV32_H +#define SECP256K1_MODINV32_H + +#if defined HAVE_CONFIG_H +#include "libsecp256k1-config.h" +#endif + +#include "util.h" + +/* A signed 30-bit limb representation of integers. + * + * Its value is sum(v[i] * 2^(30*i), i=0..8). */ +typedef struct { + int32_t v[9]; +} secp256k1_modinv32_signed30; + +typedef struct { + /* The modulus in signed30 notation, must be odd and in [3, 2^256]. */ + secp256k1_modinv32_signed30 modulus; + + /* modulus^{-1} mod 2^30 */ + uint32_t modulus_inv30; +} secp256k1_modinv32_modinfo; + +/* Replace x with its modular inverse mod modinfo->modulus. x must be in range [0, modulus). + * If x is zero, the result will be zero as well. If not, the inverse must exist (i.e., the gcd of + * x and modulus must be 1). These rules are automatically satisfied if the modulus is prime. + * + * On output, all of x's limbs will be in [0, 2^30). + */ +static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo); + +/* Same as secp256k1_modinv32_var, but constant time in x (not in the modulus). */ +static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo); + +#endif /* SECP256K1_MODINV32_H */ diff --git a/src/modinv32_impl.h b/src/modinv32_impl.h new file mode 100644 index 0000000..aa7988c --- /dev/null +++ b/src/modinv32_impl.h @@ -0,0 +1,587 @@ +/*********************************************************************** + * Copyright (c) 2020 Peter Dettman * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php.* + **********************************************************************/ + +#ifndef SECP256K1_MODINV32_IMPL_H +#define SECP256K1_MODINV32_IMPL_H + +#include "modinv32.h" + +#include "util.h" + +#include + +/* This file implements modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. + * + * For an explanation of the algorithm, see doc/safegcd_implementation.md. This file contains an + * implementation for N=30, using 30-bit signed limbs represented as int32_t. + */ + +#ifdef VERIFY +static const secp256k1_modinv32_signed30 SECP256K1_SIGNED30_ONE = {{1}}; + +/* Compute a*factor and put it in r. All but the top limb in r will be in range [0,2^30). 
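+ * (This helper is only compiled in VERIFY builds, where secp256k1_modinv32_mul_cmp_30 below
+ * uses it to check bounds on intermediate values.)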
*/ +static void secp256k1_modinv32_mul_30(secp256k1_modinv32_signed30 *r, const secp256k1_modinv32_signed30 *a, int alen, int32_t factor) { + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + int64_t c = 0; + int i; + for (i = 0; i < 8; ++i) { + if (i < alen) c += (int64_t)a->v[i] * factor; + r->v[i] = (int32_t)c & M30; c >>= 30; + } + if (8 < alen) c += (int64_t)a->v[8] * factor; + VERIFY_CHECK(c == (int32_t)c); + r->v[8] = (int32_t)c; +} + +/* Return -1 for ab*factor. A consists of alen limbs; b has 9. */ +static int secp256k1_modinv32_mul_cmp_30(const secp256k1_modinv32_signed30 *a, int alen, const secp256k1_modinv32_signed30 *b, int32_t factor) { + int i; + secp256k1_modinv32_signed30 am, bm; + secp256k1_modinv32_mul_30(&am, a, alen, 1); /* Normalize all but the top limb of a. */ + secp256k1_modinv32_mul_30(&bm, b, 9, factor); + for (i = 0; i < 8; ++i) { + /* Verify that all but the top limb of a and b are normalized. */ + VERIFY_CHECK(am.v[i] >> 30 == 0); + VERIFY_CHECK(bm.v[i] >> 30 == 0); + } + for (i = 8; i >= 0; --i) { + if (am.v[i] < bm.v[i]) return -1; + if (am.v[i] > bm.v[i]) return 1; + } + return 0; +} +#endif + +/* Take as input a signed30 number in range (-2*modulus,modulus), and add a multiple of the modulus + * to it to bring it to range [0,modulus). If sign < 0, the input will also be negated in the + * process. The input must have limbs in range (-2^30,2^30). The output will have limbs in range + * [0,2^30). */ +static void secp256k1_modinv32_normalize_30(secp256k1_modinv32_signed30 *r, int32_t sign, const secp256k1_modinv32_modinfo *modinfo) { + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + int32_t r0 = r->v[0], r1 = r->v[1], r2 = r->v[2], r3 = r->v[3], r4 = r->v[4], + r5 = r->v[5], r6 = r->v[6], r7 = r->v[7], r8 = r->v[8]; + int32_t cond_add, cond_negate; + +#ifdef VERIFY + /* Verify that all limbs are in range (-2^30,2^30). */ + int i; + for (i = 0; i < 9; ++i) { + VERIFY_CHECK(r->v[i] >= -M30); + VERIFY_CHECK(r->v[i] <= M30); + } + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, -2) > 0); /* r > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, 1) < 0); /* r < modulus */ +#endif + + /* In a first step, add the modulus if the input is negative, and then negate if requested. + * This brings r from range (-2*modulus,modulus) to range (-modulus,modulus). As all input + * limbs are in range (-2^30,2^30), this cannot overflow an int32_t. Note that the right + * shifts below are signed sign-extending shifts (see assumptions.h for tests that that is + * indeed the behavior of the right shift operator). */ + cond_add = r8 >> 31; + r0 += modinfo->modulus.v[0] & cond_add; + r1 += modinfo->modulus.v[1] & cond_add; + r2 += modinfo->modulus.v[2] & cond_add; + r3 += modinfo->modulus.v[3] & cond_add; + r4 += modinfo->modulus.v[4] & cond_add; + r5 += modinfo->modulus.v[5] & cond_add; + r6 += modinfo->modulus.v[6] & cond_add; + r7 += modinfo->modulus.v[7] & cond_add; + r8 += modinfo->modulus.v[8] & cond_add; + cond_negate = sign >> 31; + r0 = (r0 ^ cond_negate) - cond_negate; + r1 = (r1 ^ cond_negate) - cond_negate; + r2 = (r2 ^ cond_negate) - cond_negate; + r3 = (r3 ^ cond_negate) - cond_negate; + r4 = (r4 ^ cond_negate) - cond_negate; + r5 = (r5 ^ cond_negate) - cond_negate; + r6 = (r6 ^ cond_negate) - cond_negate; + r7 = (r7 ^ cond_negate) - cond_negate; + r8 = (r8 ^ cond_negate) - cond_negate; + /* Propagate the top bits, to bring limbs back to range (-2^30,2^30). 
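+ * (The conditional add and negate above can leave limbs outside that range; a single carry
+ * pass restores the limb bounds without changing the represented value.)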
*/ + r1 += r0 >> 30; r0 &= M30; + r2 += r1 >> 30; r1 &= M30; + r3 += r2 >> 30; r2 &= M30; + r4 += r3 >> 30; r3 &= M30; + r5 += r4 >> 30; r4 &= M30; + r6 += r5 >> 30; r5 &= M30; + r7 += r6 >> 30; r6 &= M30; + r8 += r7 >> 30; r7 &= M30; + + /* In a second step add the modulus again if the result is still negative, bringing r to range + * [0,modulus). */ + cond_add = r8 >> 31; + r0 += modinfo->modulus.v[0] & cond_add; + r1 += modinfo->modulus.v[1] & cond_add; + r2 += modinfo->modulus.v[2] & cond_add; + r3 += modinfo->modulus.v[3] & cond_add; + r4 += modinfo->modulus.v[4] & cond_add; + r5 += modinfo->modulus.v[5] & cond_add; + r6 += modinfo->modulus.v[6] & cond_add; + r7 += modinfo->modulus.v[7] & cond_add; + r8 += modinfo->modulus.v[8] & cond_add; + /* And propagate again. */ + r1 += r0 >> 30; r0 &= M30; + r2 += r1 >> 30; r1 &= M30; + r3 += r2 >> 30; r2 &= M30; + r4 += r3 >> 30; r3 &= M30; + r5 += r4 >> 30; r4 &= M30; + r6 += r5 >> 30; r5 &= M30; + r7 += r6 >> 30; r6 &= M30; + r8 += r7 >> 30; r7 &= M30; + + r->v[0] = r0; + r->v[1] = r1; + r->v[2] = r2; + r->v[3] = r3; + r->v[4] = r4; + r->v[5] = r5; + r->v[6] = r6; + r->v[7] = r7; + r->v[8] = r8; + +#ifdef VERIFY + VERIFY_CHECK(r0 >> 30 == 0); + VERIFY_CHECK(r1 >> 30 == 0); + VERIFY_CHECK(r2 >> 30 == 0); + VERIFY_CHECK(r3 >> 30 == 0); + VERIFY_CHECK(r4 >> 30 == 0); + VERIFY_CHECK(r5 >> 30 == 0); + VERIFY_CHECK(r6 >> 30 == 0); + VERIFY_CHECK(r7 >> 30 == 0); + VERIFY_CHECK(r8 >> 30 == 0); + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, 0) >= 0); /* r >= 0 */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, 1) < 0); /* r < modulus */ +#endif +} + +/* Data type for transition matrices (see section 3 of explanation). + * + * t = [ u v ] + * [ q r ] + */ +typedef struct { + int32_t u, v, q, r; +} secp256k1_modinv32_trans2x2; + +/* Compute the transition matrix and eta for 30 divsteps. + * + * Input: eta: initial eta + * f0: bottom limb of initial f + * g0: bottom limb of initial g + * Output: t: transition matrix + * Return: final eta + * + * Implements the divsteps_n_matrix function from the explanation. + */ +static int32_t secp256k1_modinv32_divsteps_30(int32_t eta, uint32_t f0, uint32_t g0, secp256k1_modinv32_trans2x2 *t) { + /* u,v,q,r are the elements of the transformation matrix being built up, + * starting with the identity matrix. Semantically they are signed integers + * in range [-2^30,2^30], but here represented as unsigned mod 2^32. This + * permits left shifting (which is UB for negative numbers). The range + * being inside [-2^31,2^31) means that casting to signed works correctly. + */ + uint32_t u = 1, v = 0, q = 0, r = 1; + uint32_t c1, c2, f = f0, g = g0, x, y, z; + int i; + + for (i = 0; i < 30; ++i) { + VERIFY_CHECK((f & 1) == 1); /* f must always be odd */ + VERIFY_CHECK((u * f0 + v * g0) == f << i); + VERIFY_CHECK((q * f0 + r * g0) == g << i); + /* Compute conditional masks for (eta < 0) and for (g & 1). */ + c1 = eta >> 31; + c2 = -(g & 1); + /* Compute x,y,z, conditionally negated versions of f,u,v. */ + x = (f ^ c1) - c1; + y = (u ^ c1) - c1; + z = (v ^ c1) - c1; + /* Conditionally add x,y,z to g,q,r. */ + g += x & c2; + q += y & c2; + r += z & c2; + /* In what follows, c1 is a condition mask for (eta < 0) and (g & 1). */ + c1 &= c2; + /* Conditionally negate eta, and unconditionally subtract 1. */ + eta = (eta ^ c1) - (c1 + 1); + /* Conditionally add g,q,r to f,u,v. 
*/ + f += g & c1; + u += q & c1; + v += r & c1; + /* Shifts */ + g >>= 1; + u <<= 1; + v <<= 1; + /* Bounds on eta that follow from the bounds on iteration count (max 25*30 divsteps). */ + VERIFY_CHECK(eta >= -751 && eta <= 751); + } + /* Return data in t and return value. */ + t->u = (int32_t)u; + t->v = (int32_t)v; + t->q = (int32_t)q; + t->r = (int32_t)r; + /* The determinant of t must be a power of two. This guarantees that multiplication with t + * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which + * will be divided out again). As each divstep's individual matrix has determinant 2, the + * aggregate of 30 of them will have determinant 2^30. */ + VERIFY_CHECK((int64_t)t->u * t->r - (int64_t)t->v * t->q == ((int64_t)1) << 30); + return eta; +} + +/* Compute the transition matrix and eta for 30 divsteps (variable time). + * + * Input: eta: initial eta + * f0: bottom limb of initial f + * g0: bottom limb of initial g + * Output: t: transition matrix + * Return: final eta + * + * Implements the divsteps_n_matrix_var function from the explanation. + */ +static int32_t secp256k1_modinv32_divsteps_30_var(int32_t eta, uint32_t f0, uint32_t g0, secp256k1_modinv32_trans2x2 *t) { + /* inv256[i] = -(2*i+1)^-1 (mod 256) */ + static const uint8_t inv256[128] = { + 0xFF, 0x55, 0x33, 0x49, 0xC7, 0x5D, 0x3B, 0x11, 0x0F, 0xE5, 0xC3, 0x59, + 0xD7, 0xED, 0xCB, 0x21, 0x1F, 0x75, 0x53, 0x69, 0xE7, 0x7D, 0x5B, 0x31, + 0x2F, 0x05, 0xE3, 0x79, 0xF7, 0x0D, 0xEB, 0x41, 0x3F, 0x95, 0x73, 0x89, + 0x07, 0x9D, 0x7B, 0x51, 0x4F, 0x25, 0x03, 0x99, 0x17, 0x2D, 0x0B, 0x61, + 0x5F, 0xB5, 0x93, 0xA9, 0x27, 0xBD, 0x9B, 0x71, 0x6F, 0x45, 0x23, 0xB9, + 0x37, 0x4D, 0x2B, 0x81, 0x7F, 0xD5, 0xB3, 0xC9, 0x47, 0xDD, 0xBB, 0x91, + 0x8F, 0x65, 0x43, 0xD9, 0x57, 0x6D, 0x4B, 0xA1, 0x9F, 0xF5, 0xD3, 0xE9, + 0x67, 0xFD, 0xDB, 0xB1, 0xAF, 0x85, 0x63, 0xF9, 0x77, 0x8D, 0x6B, 0xC1, + 0xBF, 0x15, 0xF3, 0x09, 0x87, 0x1D, 0xFB, 0xD1, 0xCF, 0xA5, 0x83, 0x19, + 0x97, 0xAD, 0x8B, 0xE1, 0xDF, 0x35, 0x13, 0x29, 0xA7, 0x3D, 0x1B, 0xF1, + 0xEF, 0xC5, 0xA3, 0x39, 0xB7, 0xCD, 0xAB, 0x01 + }; + + /* Transformation matrix; see comments in secp256k1_modinv32_divsteps_30. */ + uint32_t u = 1, v = 0, q = 0, r = 1; + uint32_t f = f0, g = g0, m; + uint16_t w; + int i = 30, limit, zeros; + + for (;;) { + /* Use a sentinel bit to count zeros only up to i. */ + zeros = secp256k1_ctz32_var(g | (UINT32_MAX << i)); + /* Perform zeros divsteps at once; they all just divide g by two. */ + g >>= zeros; + u <<= zeros; + v <<= zeros; + eta -= zeros; + i -= zeros; + /* We're done once we've done 30 divsteps. */ + if (i == 0) break; + VERIFY_CHECK((f & 1) == 1); + VERIFY_CHECK((g & 1) == 1); + VERIFY_CHECK((u * f0 + v * g0) == f << (30 - i)); + VERIFY_CHECK((q * f0 + r * g0) == g << (30 - i)); + /* Bounds on eta that follow from the bounds on iteration count (max 25*30 divsteps). */ + VERIFY_CHECK(eta >= -751 && eta <= 751); + /* If eta is negative, negate it and replace f,g with g,-f. */ + if (eta < 0) { + uint32_t tmp; + eta = -eta; + tmp = f; f = g; g = -tmp; + tmp = u; u = q; q = -tmp; + tmp = v; v = r; r = -tmp; + } + /* eta is now >= 0. In what follows we're going to cancel out the bottom bits of g. No more + * than i can be cancelled out (as we'd be done before that point), and no more than eta+1 + * can be done as its sign will flip once that happens. */ + limit = ((int)eta + 1) > i ? i : ((int)eta + 1); + /* m is a mask for the bottom min(limit, 8) bits (our table only supports 8 bits). 
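+ * (If limit exceeds 8, only 8 bits are cancelled in this pass; the enclosing loop simply
+ * keeps going until all 30 divsteps have been accounted for.)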
*/ + VERIFY_CHECK(limit > 0 && limit <= 30); + m = (UINT32_MAX >> (32 - limit)) & 255U; + /* Find what multiple of f must be added to g to cancel its bottom min(limit, 8) bits. */ + w = (g * inv256[(f >> 1) & 127]) & m; + /* Do so. */ + g += f * w; + q += u * w; + r += v * w; + VERIFY_CHECK((g & m) == 0); + } + /* Return data in t and return value. */ + t->u = (int32_t)u; + t->v = (int32_t)v; + t->q = (int32_t)q; + t->r = (int32_t)r; + /* The determinant of t must be a power of two. This guarantees that multiplication with t + * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which + * will be divided out again). As each divstep's individual matrix has determinant 2, the + * aggregate of 30 of them will have determinant 2^30. */ + VERIFY_CHECK((int64_t)t->u * t->r - (int64_t)t->v * t->q == ((int64_t)1) << 30); + return eta; +} + +/* Compute (t/2^30) * [d, e] mod modulus, where t is a transition matrix for 30 divsteps. + * + * On input and output, d and e are in range (-2*modulus,modulus). All output limbs will be in range + * (-2^30,2^30). + * + * This implements the update_de function from the explanation. + */ +static void secp256k1_modinv32_update_de_30(secp256k1_modinv32_signed30 *d, secp256k1_modinv32_signed30 *e, const secp256k1_modinv32_trans2x2 *t, const secp256k1_modinv32_modinfo* modinfo) { + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + const int32_t u = t->u, v = t->v, q = t->q, r = t->r; + int32_t di, ei, md, me, sd, se; + int64_t cd, ce; + int i; +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, 9, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, 9, &modinfo->modulus, 1) < 0); /* d < modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, 9, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, 9, &modinfo->modulus, 1) < 0); /* e < modulus */ + VERIFY_CHECK((labs(u) + labs(v)) >= 0); /* |u|+|v| doesn't overflow */ + VERIFY_CHECK((labs(q) + labs(r)) >= 0); /* |q|+|r| doesn't overflow */ + VERIFY_CHECK((labs(u) + labs(v)) <= M30 + 1); /* |u|+|v| <= 2^30 */ + VERIFY_CHECK((labs(q) + labs(r)) <= M30 + 1); /* |q|+|r| <= 2^30 */ +#endif + /* [md,me] start as zero; plus [u,q] if d is negative; plus [v,r] if e is negative. */ + sd = d->v[8] >> 31; + se = e->v[8] >> 31; + md = (u & sd) + (v & se); + me = (q & sd) + (r & se); + /* Begin computing t*[d,e]. */ + di = d->v[0]; + ei = e->v[0]; + cd = (int64_t)u * di + (int64_t)v * ei; + ce = (int64_t)q * di + (int64_t)r * ei; + /* Correct md,me so that t*[d,e]+modulus*[md,me] has 30 zero bottom bits. */ + md -= (modinfo->modulus_inv30 * (uint32_t)cd + md) & M30; + me -= (modinfo->modulus_inv30 * (uint32_t)ce + me) & M30; + /* Update the beginning of computation for t*[d,e]+modulus*[md,me] now md,me are known. */ + cd += (int64_t)modinfo->modulus.v[0] * md; + ce += (int64_t)modinfo->modulus.v[0] * me; + /* Verify that the low 30 bits of the computation are indeed zero, and then throw them away. */ + VERIFY_CHECK(((int32_t)cd & M30) == 0); cd >>= 30; + VERIFY_CHECK(((int32_t)ce & M30) == 0); ce >>= 30; + /* Now iteratively compute limb i=1..8 of t*[d,e]+modulus*[md,me], and store them in output + * limb i-1 (shifting down by 30 bits). 
*/ + for (i = 1; i < 9; ++i) { + di = d->v[i]; + ei = e->v[i]; + cd += (int64_t)u * di + (int64_t)v * ei; + ce += (int64_t)q * di + (int64_t)r * ei; + cd += (int64_t)modinfo->modulus.v[i] * md; + ce += (int64_t)modinfo->modulus.v[i] * me; + d->v[i - 1] = (int32_t)cd & M30; cd >>= 30; + e->v[i - 1] = (int32_t)ce & M30; ce >>= 30; + } + /* What remains is limb 9 of t*[d,e]+modulus*[md,me]; store it as output limb 8. */ + d->v[8] = (int32_t)cd; + e->v[8] = (int32_t)ce; +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, 9, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(d, 9, &modinfo->modulus, 1) < 0); /* d < modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, 9, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(e, 9, &modinfo->modulus, 1) < 0); /* e < modulus */ +#endif +} + +/* Compute (t/2^30) * [f, g], where t is a transition matrix for 30 divsteps. + * + * This implements the update_fg function from the explanation. + */ +static void secp256k1_modinv32_update_fg_30(secp256k1_modinv32_signed30 *f, secp256k1_modinv32_signed30 *g, const secp256k1_modinv32_trans2x2 *t) { + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + const int32_t u = t->u, v = t->v, q = t->q, r = t->r; + int32_t fi, gi; + int64_t cf, cg; + int i; + /* Start computing t*[f,g]. */ + fi = f->v[0]; + gi = g->v[0]; + cf = (int64_t)u * fi + (int64_t)v * gi; + cg = (int64_t)q * fi + (int64_t)r * gi; + /* Verify that the bottom 30 bits of the result are zero, and then throw them away. */ + VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30; + VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30; + /* Now iteratively compute limb i=1..8 of t*[f,g], and store them in output limb i-1 (shifting + * down by 30 bits). */ + for (i = 1; i < 9; ++i) { + fi = f->v[i]; + gi = g->v[i]; + cf += (int64_t)u * fi + (int64_t)v * gi; + cg += (int64_t)q * fi + (int64_t)r * gi; + f->v[i - 1] = (int32_t)cf & M30; cf >>= 30; + g->v[i - 1] = (int32_t)cg & M30; cg >>= 30; + } + /* What remains is limb 9 of t*[f,g]; store it as output limb 8. */ + f->v[8] = (int32_t)cf; + g->v[8] = (int32_t)cg; +} + +/* Compute (t/2^30) * [f, g], where t is a transition matrix for 30 divsteps. + * + * Version that operates on a variable number of limbs in f and g. + * + * This implements the update_fg function from the explanation in modinv64_impl.h. + */ +static void secp256k1_modinv32_update_fg_30_var(int len, secp256k1_modinv32_signed30 *f, secp256k1_modinv32_signed30 *g, const secp256k1_modinv32_trans2x2 *t) { + const int32_t M30 = (int32_t)(UINT32_MAX >> 2); + const int32_t u = t->u, v = t->v, q = t->q, r = t->r; + int32_t fi, gi; + int64_t cf, cg; + int i; + VERIFY_CHECK(len > 0); + /* Start computing t*[f,g]. */ + fi = f->v[0]; + gi = g->v[0]; + cf = (int64_t)u * fi + (int64_t)v * gi; + cg = (int64_t)q * fi + (int64_t)r * gi; + /* Verify that the bottom 62 bits of the result are zero, and then throw them away. */ + VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30; + VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30; + /* Now iteratively compute limb i=1..len of t*[f,g], and store them in output limb i-1 (shifting + * down by 30 bits). */ + for (i = 1; i < len; ++i) { + fi = f->v[i]; + gi = g->v[i]; + cf += (int64_t)u * fi + (int64_t)v * gi; + cg += (int64_t)q * fi + (int64_t)r * gi; + f->v[i - 1] = (int32_t)cf & M30; cf >>= 30; + g->v[i - 1] = (int32_t)cg & M30; cg >>= 30; + } + /* What remains is limb (len) of t*[f,g]; store it as output limb (len-1). 
*/ + f->v[len - 1] = (int32_t)cf; + g->v[len - 1] = (int32_t)cg; +} + +/* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x). */ +static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo) { + /* Start with d=0, e=1, f=modulus, g=x, eta=-1. */ + secp256k1_modinv32_signed30 d = {{0}}; + secp256k1_modinv32_signed30 e = {{1}}; + secp256k1_modinv32_signed30 f = modinfo->modulus; + secp256k1_modinv32_signed30 g = *x; + int i; + int32_t eta = -1; + + /* Do 25 iterations of 30 divsteps each = 750 divsteps. 724 suffices for 256-bit inputs. */ + for (i = 0; i < 25; ++i) { + /* Compute transition matrix and new eta after 30 divsteps. */ + secp256k1_modinv32_trans2x2 t; + eta = secp256k1_modinv32_divsteps_30(eta, f.v[0], g.v[0], &t); + /* Update d,e using that transition matrix. */ + secp256k1_modinv32_update_de_30(&d, &e, &t, modinfo); + /* Update f,g using that transition matrix. */ +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, 9, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, 9, &modinfo->modulus, 1) < 0); /* g < modulus */ +#endif + secp256k1_modinv32_update_fg_30(&f, &g, &t); +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, 9, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, 9, &modinfo->modulus, 1) < 0); /* g < modulus */ +#endif + } + + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1, and d now contains +/- the modular inverse. */ +#ifdef VERIFY + /* g == 0 */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, 9, &SECP256K1_SIGNED30_ONE, 0) == 0); + /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, 9, &SECP256K1_SIGNED30_ONE, -1) == 0 || + secp256k1_modinv32_mul_cmp_30(&f, 9, &SECP256K1_SIGNED30_ONE, 1) == 0 || + (secp256k1_modinv32_mul_cmp_30(x, 9, &SECP256K1_SIGNED30_ONE, 0) == 0 && + secp256k1_modinv32_mul_cmp_30(&d, 9, &SECP256K1_SIGNED30_ONE, 0) == 0 && + (secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, 1) == 0 || + secp256k1_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, -1) == 0))); +#endif + + /* Optionally negate d, normalize to [0,modulus), and return it. */ + secp256k1_modinv32_normalize_30(&d, f.v[8], modinfo); + *x = d; +} + +/* Compute the inverse of x modulo modinfo->modulus, and replace x with it (variable time). */ +static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo) { + /* Start with d=0, e=1, f=modulus, g=x, eta=-1. */ + secp256k1_modinv32_signed30 d = {{0, 0, 0, 0, 0, 0, 0, 0, 0}}; + secp256k1_modinv32_signed30 e = {{1, 0, 0, 0, 0, 0, 0, 0, 0}}; + secp256k1_modinv32_signed30 f = modinfo->modulus; + secp256k1_modinv32_signed30 g = *x; +#ifdef VERIFY + int i = 0; +#endif + int j, len = 9; + int32_t eta = -1; + int32_t cond, fn, gn; + + /* Do iterations of 30 divsteps each until g=0. 
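+ * (Unlike the constant-time secp256k1_modinv32 above, which always runs 25 iterations, this
+ * loop exits as soon as g reaches zero.)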
*/ + while (1) { + /* Compute transition matrix and new eta after 30 divsteps. */ + secp256k1_modinv32_trans2x2 t; + eta = secp256k1_modinv32_divsteps_30_var(eta, f.v[0], g.v[0], &t); + /* Update d,e using that transition matrix. */ + secp256k1_modinv32_update_de_30(&d, &e, &t, modinfo); + /* Update f,g using that transition matrix. */ +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, 1) < 0); /* g < modulus */ +#endif + secp256k1_modinv32_update_fg_30_var(len, &f, &g, &t); + /* If the bottom limb of g is 0, there is a chance g=0. */ + if (g.v[0] == 0) { + cond = 0; + /* Check if all other limbs are also 0. */ + for (j = 1; j < len; ++j) { + cond |= g.v[j]; + } + /* If so, we're done. */ + if (cond == 0) break; + } + + /* Determine if len>1 and limb (len-1) of both f and g is 0 or -1. */ + fn = f.v[len - 1]; + gn = g.v[len - 1]; + cond = ((int32_t)len - 2) >> 31; + cond |= fn ^ (fn >> 31); + cond |= gn ^ (gn >> 31); + /* If so, reduce length, propagating the sign of f and g's top limb into the one below. */ + if (cond == 0) { + f.v[len - 2] |= (uint32_t)fn << 30; + g.v[len - 2] |= (uint32_t)gn << 30; + --len; + } +#ifdef VERIFY + VERIFY_CHECK(++i < 25); /* We should never need more than 25*30 = 750 divsteps */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, 1) < 0); /* g < modulus */ +#endif + } + + /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of + * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ +#ifdef VERIFY + /* g == 0 */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, len, &SECP256K1_SIGNED30_ONE, 0) == 0); + /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */ + VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, len, &SECP256K1_SIGNED30_ONE, -1) == 0 || + secp256k1_modinv32_mul_cmp_30(&f, len, &SECP256K1_SIGNED30_ONE, 1) == 0 || + (secp256k1_modinv32_mul_cmp_30(x, 9, &SECP256K1_SIGNED30_ONE, 0) == 0 && + secp256k1_modinv32_mul_cmp_30(&d, 9, &SECP256K1_SIGNED30_ONE, 0) == 0 && + (secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, 1) == 0 || + secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, -1) == 0))); +#endif + + /* Optionally negate d, normalize to [0,modulus), and return it. 
*/ + secp256k1_modinv32_normalize_30(&d, f.v[len - 1], modinfo); + *x = d; +} + +#endif /* SECP256K1_MODINV32_IMPL_H */ diff --git a/src/modinv64.h b/src/modinv64.h new file mode 100644 index 0000000..da506df --- /dev/null +++ b/src/modinv64.h @@ -0,0 +1,46 @@ +/*********************************************************************** + * Copyright (c) 2020 Peter Dettman * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php.* + **********************************************************************/ + +#ifndef SECP256K1_MODINV64_H +#define SECP256K1_MODINV64_H + +#if defined HAVE_CONFIG_H +#include "libsecp256k1-config.h" +#endif + +#include "util.h" + +#ifndef SECP256K1_WIDEMUL_INT128 +#error "modinv64 requires 128-bit wide multiplication support" +#endif + +/* A signed 62-bit limb representation of integers. + * + * Its value is sum(v[i] * 2^(62*i), i=0..4). */ +typedef struct { + int64_t v[5]; +} secp256k1_modinv64_signed62; + +typedef struct { + /* The modulus in signed62 notation, must be odd and in [3, 2^256]. */ + secp256k1_modinv64_signed62 modulus; + + /* modulus^{-1} mod 2^62 */ + uint64_t modulus_inv62; +} secp256k1_modinv64_modinfo; + +/* Replace x with its modular inverse mod modinfo->modulus. x must be in range [0, modulus). + * If x is zero, the result will be zero as well. If not, the inverse must exist (i.e., the gcd of + * x and modulus must be 1). These rules are automatically satisfied if the modulus is prime. + * + * On output, all of x's limbs will be in [0, 2^62). + */ +static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo); + +/* Same as secp256k1_modinv64_var, but constant time in x (not in the modulus). */ +static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo); + +#endif /* SECP256K1_MODINV64_H */ diff --git a/src/modinv64_impl.h b/src/modinv64_impl.h new file mode 100644 index 0000000..78505fa --- /dev/null +++ b/src/modinv64_impl.h @@ -0,0 +1,589 @@ +/*********************************************************************** + * Copyright (c) 2020 Peter Dettman * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php.* + **********************************************************************/ + +#ifndef SECP256K1_MODINV64_IMPL_H +#define SECP256K1_MODINV64_IMPL_H + +#include "modinv64.h" + +#include "util.h" + +/* This file implements modular inversion based on the paper "Fast constant-time gcd computation and + * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. + * + * For an explanation of the algorithm, see doc/safegcd_implementation.md. This file contains an + * implementation for N=62, using 62-bit signed limbs represented as int64_t. + */ + +#ifdef VERIFY +/* Helper function to compute the absolute value of an int64_t. + * (we don't use abs/labs/llabs as it depends on the int sizes). */ +static int64_t secp256k1_modinv64_abs(int64_t v) { + VERIFY_CHECK(v > INT64_MIN); + if (v < 0) return -v; + return v; +} + +static const secp256k1_modinv64_signed62 SECP256K1_SIGNED62_ONE = {{1}}; + +/* Compute a*factor and put it in r. All but the top limb in r will be in range [0,2^62). 
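+ * This helper (and secp256k1_modinv64_mul_cmp_62 below) is only compiled in VERIFY builds,
+ * where it is used for the bound checks in this file.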
*/
+static void secp256k1_modinv64_mul_62(secp256k1_modinv64_signed62 *r, const secp256k1_modinv64_signed62 *a, int alen, int64_t factor) {
+ const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
+ int128_t c = 0;
+ int i;
+ for (i = 0; i < 4; ++i) {
+ if (i < alen) c += (int128_t)a->v[i] * factor;
+ r->v[i] = (int64_t)c & M62; c >>= 62;
+ }
+ if (4 < alen) c += (int128_t)a->v[4] * factor;
+ VERIFY_CHECK(c == (int64_t)c);
+ r->v[4] = (int64_t)c;
+}
+
+/* Return -1 for a<b*factor, 0 for a==b*factor, 1 for a>b*factor. A has alen limbs; b has 5. */
+static int secp256k1_modinv64_mul_cmp_62(const secp256k1_modinv64_signed62 *a, int alen, const secp256k1_modinv64_signed62 *b, int64_t factor) {
+ int i;
+ secp256k1_modinv64_signed62 am, bm;
+ secp256k1_modinv64_mul_62(&am, a, alen, 1); /* Normalize all but the top limb of a. */
+ secp256k1_modinv64_mul_62(&bm, b, 5, factor);
+ for (i = 0; i < 4; ++i) {
+ /* Verify that all but the top limb of a and b are normalized. */
+ VERIFY_CHECK(am.v[i] >> 62 == 0);
+ VERIFY_CHECK(bm.v[i] >> 62 == 0);
+ }
+ for (i = 4; i >= 0; --i) {
+ if (am.v[i] < bm.v[i]) return -1;
+ if (am.v[i] > bm.v[i]) return 1;
+ }
+ return 0;
+}
+#endif
+
+/* Take as input a signed62 number in range (-2*modulus,modulus), and add a multiple of the modulus
+ * to it to bring it to range [0,modulus). If sign < 0, the input will also be negated in the
+ * process. The input must have limbs in range (-2^62,2^62). The output will have limbs in range
+ * [0,2^62). */
+static void secp256k1_modinv64_normalize_62(secp256k1_modinv64_signed62 *r, int64_t sign, const secp256k1_modinv64_modinfo *modinfo) {
+ const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
+ int64_t r0 = r->v[0], r1 = r->v[1], r2 = r->v[2], r3 = r->v[3], r4 = r->v[4];
+ int64_t cond_add, cond_negate;
+
+#ifdef VERIFY
+ /* Verify that all limbs are in range (-2^62,2^62). */
+ int i;
+ for (i = 0; i < 5; ++i) {
+ VERIFY_CHECK(r->v[i] >= -M62);
+ VERIFY_CHECK(r->v[i] <= M62);
+ }
+ VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, 5, &modinfo->modulus, -2) > 0); /* r > -2*modulus */
+ VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, 5, &modinfo->modulus, 1) < 0); /* r < modulus */
+#endif
+
+ /* In a first step, add the modulus if the input is negative, and then negate if requested.
+ * This brings r from range (-2*modulus,modulus) to range (-modulus,modulus). As all input
+ * limbs are in range (-2^62,2^62), this cannot overflow an int64_t. Note that the right
+ * shifts below are signed sign-extending shifts (see assumptions.h for tests that that is
+ * indeed the behavior of the right shift operator). */
+ cond_add = r4 >> 63;
+ r0 += modinfo->modulus.v[0] & cond_add;
+ r1 += modinfo->modulus.v[1] & cond_add;
+ r2 += modinfo->modulus.v[2] & cond_add;
+ r3 += modinfo->modulus.v[3] & cond_add;
+ r4 += modinfo->modulus.v[4] & cond_add;
+ cond_negate = sign >> 63;
+ r0 = (r0 ^ cond_negate) - cond_negate;
+ r1 = (r1 ^ cond_negate) - cond_negate;
+ r2 = (r2 ^ cond_negate) - cond_negate;
+ r3 = (r3 ^ cond_negate) - cond_negate;
+ r4 = (r4 ^ cond_negate) - cond_negate;
+ /* Propagate the top bits, to bring limbs back to range (-2^62,2^62). */
+ r1 += r0 >> 62; r0 &= M62;
+ r2 += r1 >> 62; r1 &= M62;
+ r3 += r2 >> 62; r2 &= M62;
+ r4 += r3 >> 62; r3 &= M62;
+
+ /* In a second step add the modulus again if the result is still negative, bringing
+ * r to range [0,modulus).
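+     * As in the first step, the conditional add is branchless: r4 >> 63 yields an all-ones
+     * mask exactly when r is still negative, so ANDing each modulus limb with it adds either
+     * the full modulus or zero.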
*/ + cond_add = r4 >> 63; + r0 += modinfo->modulus.v[0] & cond_add; + r1 += modinfo->modulus.v[1] & cond_add; + r2 += modinfo->modulus.v[2] & cond_add; + r3 += modinfo->modulus.v[3] & cond_add; + r4 += modinfo->modulus.v[4] & cond_add; + /* And propagate again. */ + r1 += r0 >> 62; r0 &= M62; + r2 += r1 >> 62; r1 &= M62; + r3 += r2 >> 62; r2 &= M62; + r4 += r3 >> 62; r3 &= M62; + + r->v[0] = r0; + r->v[1] = r1; + r->v[2] = r2; + r->v[3] = r3; + r->v[4] = r4; + +#ifdef VERIFY + VERIFY_CHECK(r0 >> 62 == 0); + VERIFY_CHECK(r1 >> 62 == 0); + VERIFY_CHECK(r2 >> 62 == 0); + VERIFY_CHECK(r3 >> 62 == 0); + VERIFY_CHECK(r4 >> 62 == 0); + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, 5, &modinfo->modulus, 0) >= 0); /* r >= 0 */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, 5, &modinfo->modulus, 1) < 0); /* r < modulus */ +#endif +} + +/* Data type for transition matrices (see section 3 of explanation). + * + * t = [ u v ] + * [ q r ] + */ +typedef struct { + int64_t u, v, q, r; +} secp256k1_modinv64_trans2x2; + +/* Compute the transition matrix and eta for 62 divsteps. + * + * Input: eta: initial eta + * f0: bottom limb of initial f + * g0: bottom limb of initial g + * Output: t: transition matrix + * Return: final eta + * + * Implements the divsteps_n_matrix function from the explanation. + */ +static int64_t secp256k1_modinv64_divsteps_62(int64_t eta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) { + /* u,v,q,r are the elements of the transformation matrix being built up, + * starting with the identity matrix. Semantically they are signed integers + * in range [-2^62,2^62], but here represented as unsigned mod 2^64. This + * permits left shifting (which is UB for negative numbers). The range + * being inside [-2^63,2^63) means that casting to signed works correctly. + */ + uint64_t u = 1, v = 0, q = 0, r = 1; + uint64_t c1, c2, f = f0, g = g0, x, y, z; + int i; + + for (i = 0; i < 62; ++i) { + VERIFY_CHECK((f & 1) == 1); /* f must always be odd */ + VERIFY_CHECK((u * f0 + v * g0) == f << i); + VERIFY_CHECK((q * f0 + r * g0) == g << i); + /* Compute conditional masks for (eta < 0) and for (g & 1). */ + c1 = eta >> 63; + c2 = -(g & 1); + /* Compute x,y,z, conditionally negated versions of f,u,v. */ + x = (f ^ c1) - c1; + y = (u ^ c1) - c1; + z = (v ^ c1) - c1; + /* Conditionally add x,y,z to g,q,r. */ + g += x & c2; + q += y & c2; + r += z & c2; + /* In what follows, c1 is a condition mask for (eta < 0) and (g & 1). */ + c1 &= c2; + /* Conditionally negate eta, and unconditionally subtract 1. */ + eta = (eta ^ c1) - (c1 + 1); + /* Conditionally add g,q,r to f,u,v. */ + f += g & c1; + u += q & c1; + v += r & c1; + /* Shifts */ + g >>= 1; + u <<= 1; + v <<= 1; + /* Bounds on eta that follow from the bounds on iteration count (max 12*62 divsteps). */ + VERIFY_CHECK(eta >= -745 && eta <= 745); + } + /* Return data in t and return value. */ + t->u = (int64_t)u; + t->v = (int64_t)v; + t->q = (int64_t)q; + t->r = (int64_t)r; + /* The determinant of t must be a power of two. This guarantees that multiplication with t + * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which + * will be divided out again). As each divstep's individual matrix has determinant 2, the + * aggregate of 62 of them will have determinant 2^62. */ + VERIFY_CHECK((int128_t)t->u * t->r - (int128_t)t->v * t->q == ((int128_t)1) << 62); + return eta; +} + +/* Compute the transition matrix and eta for 62 divsteps (variable time). 
+ *
+ * Input: eta: initial eta
+ * f0: bottom limb of initial f
+ * g0: bottom limb of initial g
+ * Output: t: transition matrix
+ * Return: final eta
+ *
+ * Implements the divsteps_n_matrix_var function from the explanation.
+ */
+static int64_t secp256k1_modinv64_divsteps_62_var(int64_t eta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) {
+ /* Transformation matrix; see comments in secp256k1_modinv64_divsteps_62. */
+ uint64_t u = 1, v = 0, q = 0, r = 1;
+ uint64_t f = f0, g = g0, m;
+ uint32_t w;
+ int i = 62, limit, zeros;
+
+ for (;;) {
+ /* Use a sentinel bit to count zeros only up to i. */
+ zeros = secp256k1_ctz64_var(g | (UINT64_MAX << i));
+ /* Perform zeros divsteps at once; they all just divide g by two. */
+ g >>= zeros;
+ u <<= zeros;
+ v <<= zeros;
+ eta -= zeros;
+ i -= zeros;
+ /* We're done once we've done 62 divsteps. */
+ if (i == 0) break;
+ VERIFY_CHECK((f & 1) == 1);
+ VERIFY_CHECK((g & 1) == 1);
+ VERIFY_CHECK((u * f0 + v * g0) == f << (62 - i));
+ VERIFY_CHECK((q * f0 + r * g0) == g << (62 - i));
+ /* Bounds on eta that follow from the bounds on iteration count (max 12*62 divsteps). */
+ VERIFY_CHECK(eta >= -745 && eta <= 745);
+ /* If eta is negative, negate it and replace f,g with g,-f. */
+ if (eta < 0) {
+ uint64_t tmp;
+ eta = -eta;
+ tmp = f; f = g; g = -tmp;
+ tmp = u; u = q; q = -tmp;
+ tmp = v; v = r; r = -tmp;
+ /* Use a formula to cancel out up to 6 bits of g. Also, no more than i can be cancelled
+ * out (as we'd be done before that point), and no more than eta+1 can be done as its
+ * sign will flip again once that happens. */
+ limit = ((int)eta + 1) > i ? i : ((int)eta + 1);
+ VERIFY_CHECK(limit > 0 && limit <= 62);
+ /* m is a mask for the bottom min(limit, 6) bits. */
+ m = (UINT64_MAX >> (64 - limit)) & 63U;
+ /* Find what multiple of f must be added to g to cancel its bottom min(limit, 6)
+ * bits. */
+ w = (f * g * (f * f - 2)) & m;
+ } else {
+ /* In this branch, use a simpler formula that only lets us cancel up to 4 bits of g, as
+ * eta tends to be smaller here. */
+ limit = ((int)eta + 1) > i ? i : ((int)eta + 1);
+ VERIFY_CHECK(limit > 0 && limit <= 62);
+ /* m is a mask for the bottom min(limit, 4) bits. */
+ m = (UINT64_MAX >> (64 - limit)) & 15U;
+ /* Find what multiple of f must be added to g to cancel its bottom min(limit, 4)
+ * bits. */
+ w = f + (((f + 1) & 4) << 1);
+ w = (-w * g) & m;
+ }
+ g += f * w;
+ q += u * w;
+ r += v * w;
+ VERIFY_CHECK((g & m) == 0);
+ }
+ /* Return data in t and return value. */
+ t->u = (int64_t)u;
+ t->v = (int64_t)v;
+ t->q = (int64_t)q;
+ t->r = (int64_t)r;
+ /* The determinant of t must be a power of two. This guarantees that multiplication with t
+ * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which
+ * will be divided out again). As each divstep's individual matrix has determinant 2, the
+ * aggregate of 62 of them will have determinant 2^62. */
+ VERIFY_CHECK((int128_t)t->u * t->r - (int128_t)t->v * t->q == ((int128_t)1) << 62);
+ return eta;
+}
+
+/* Compute (t/2^62) * [d, e] mod modulus, where t is a transition matrix for 62 divsteps.
+ *
+ * On input and output, d and e are in range (-2*modulus,modulus). All output limbs will be in range
+ * (-2^62,2^62).
+ *
+ * This implements the update_de function from the explanation.
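+ *
+ * Concretely, with t = [[u,v],[q,r]], the function below computes
+ *   d' = (u*d + v*e + md*modulus) / 2^62
+ *   e' = (q*d + r*e + me*modulus) / 2^62
+ * where the multipliers md,me are chosen so that both divisions are exact. Modulo the modulus,
+ * the added multiples of the modulus drop out, so this indeed equals (t/2^62) * [d, e].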
+ */ +static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp256k1_modinv64_signed62 *e, const secp256k1_modinv64_trans2x2 *t, const secp256k1_modinv64_modinfo* modinfo) { + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + const int64_t d0 = d->v[0], d1 = d->v[1], d2 = d->v[2], d3 = d->v[3], d4 = d->v[4]; + const int64_t e0 = e->v[0], e1 = e->v[1], e2 = e->v[2], e3 = e->v[3], e4 = e->v[4]; + const int64_t u = t->u, v = t->v, q = t->q, r = t->r; + int64_t md, me, sd, se; + int128_t cd, ce; +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, 1) < 0); /* d < modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, 5, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, 5, &modinfo->modulus, 1) < 0); /* e < modulus */ + VERIFY_CHECK((secp256k1_modinv64_abs(u) + secp256k1_modinv64_abs(v)) >= 0); /* |u|+|v| doesn't overflow */ + VERIFY_CHECK((secp256k1_modinv64_abs(q) + secp256k1_modinv64_abs(r)) >= 0); /* |q|+|r| doesn't overflow */ + VERIFY_CHECK((secp256k1_modinv64_abs(u) + secp256k1_modinv64_abs(v)) <= M62 + 1); /* |u|+|v| <= 2^62 */ + VERIFY_CHECK((secp256k1_modinv64_abs(q) + secp256k1_modinv64_abs(r)) <= M62 + 1); /* |q|+|r| <= 2^62 */ +#endif + /* [md,me] start as zero; plus [u,q] if d is negative; plus [v,r] if e is negative. */ + sd = d4 >> 63; + se = e4 >> 63; + md = (u & sd) + (v & se); + me = (q & sd) + (r & se); + /* Begin computing t*[d,e]. */ + cd = (int128_t)u * d0 + (int128_t)v * e0; + ce = (int128_t)q * d0 + (int128_t)r * e0; + /* Correct md,me so that t*[d,e]+modulus*[md,me] has 62 zero bottom bits. */ + md -= (modinfo->modulus_inv62 * (uint64_t)cd + md) & M62; + me -= (modinfo->modulus_inv62 * (uint64_t)ce + me) & M62; + /* Update the beginning of computation for t*[d,e]+modulus*[md,me] now md,me are known. */ + cd += (int128_t)modinfo->modulus.v[0] * md; + ce += (int128_t)modinfo->modulus.v[0] * me; + /* Verify that the low 62 bits of the computation are indeed zero, and then throw them away. */ + VERIFY_CHECK(((int64_t)cd & M62) == 0); cd >>= 62; + VERIFY_CHECK(((int64_t)ce & M62) == 0); ce >>= 62; + /* Compute limb 1 of t*[d,e]+modulus*[md,me], and store it as output limb 0 (= down shift). */ + cd += (int128_t)u * d1 + (int128_t)v * e1; + ce += (int128_t)q * d1 + (int128_t)r * e1; + if (modinfo->modulus.v[1]) { /* Optimize for the case where limb of modulus is zero. */ + cd += (int128_t)modinfo->modulus.v[1] * md; + ce += (int128_t)modinfo->modulus.v[1] * me; + } + d->v[0] = (int64_t)cd & M62; cd >>= 62; + e->v[0] = (int64_t)ce & M62; ce >>= 62; + /* Compute limb 2 of t*[d,e]+modulus*[md,me], and store it as output limb 1. */ + cd += (int128_t)u * d2 + (int128_t)v * e2; + ce += (int128_t)q * d2 + (int128_t)r * e2; + if (modinfo->modulus.v[2]) { /* Optimize for the case where limb of modulus is zero. */ + cd += (int128_t)modinfo->modulus.v[2] * md; + ce += (int128_t)modinfo->modulus.v[2] * me; + } + d->v[1] = (int64_t)cd & M62; cd >>= 62; + e->v[1] = (int64_t)ce & M62; ce >>= 62; + /* Compute limb 3 of t*[d,e]+modulus*[md,me], and store it as output limb 2. */ + cd += (int128_t)u * d3 + (int128_t)v * e3; + ce += (int128_t)q * d3 + (int128_t)r * e3; + if (modinfo->modulus.v[3]) { /* Optimize for the case where limb of modulus is zero. 
*/ + cd += (int128_t)modinfo->modulus.v[3] * md; + ce += (int128_t)modinfo->modulus.v[3] * me; + } + d->v[2] = (int64_t)cd & M62; cd >>= 62; + e->v[2] = (int64_t)ce & M62; ce >>= 62; + /* Compute limb 4 of t*[d,e]+modulus*[md,me], and store it as output limb 3. */ + cd += (int128_t)u * d4 + (int128_t)v * e4; + ce += (int128_t)q * d4 + (int128_t)r * e4; + cd += (int128_t)modinfo->modulus.v[4] * md; + ce += (int128_t)modinfo->modulus.v[4] * me; + d->v[3] = (int64_t)cd & M62; cd >>= 62; + e->v[3] = (int64_t)ce & M62; ce >>= 62; + /* What remains is limb 5 of t*[d,e]+modulus*[md,me]; store it as output limb 4. */ + d->v[4] = (int64_t)cd; + e->v[4] = (int64_t)ce; +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, 1) < 0); /* d < modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, 5, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(e, 5, &modinfo->modulus, 1) < 0); /* e < modulus */ +#endif +} + +/* Compute (t/2^62) * [f, g], where t is a transition matrix for 62 divsteps. + * + * This implements the update_fg function from the explanation. + */ +static void secp256k1_modinv64_update_fg_62(secp256k1_modinv64_signed62 *f, secp256k1_modinv64_signed62 *g, const secp256k1_modinv64_trans2x2 *t) { + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + const int64_t f0 = f->v[0], f1 = f->v[1], f2 = f->v[2], f3 = f->v[3], f4 = f->v[4]; + const int64_t g0 = g->v[0], g1 = g->v[1], g2 = g->v[2], g3 = g->v[3], g4 = g->v[4]; + const int64_t u = t->u, v = t->v, q = t->q, r = t->r; + int128_t cf, cg; + /* Start computing t*[f,g]. */ + cf = (int128_t)u * f0 + (int128_t)v * g0; + cg = (int128_t)q * f0 + (int128_t)r * g0; + /* Verify that the bottom 62 bits of the result are zero, and then throw them away. */ + VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; + VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; + /* Compute limb 1 of t*[f,g], and store it as output limb 0 (= down shift). */ + cf += (int128_t)u * f1 + (int128_t)v * g1; + cg += (int128_t)q * f1 + (int128_t)r * g1; + f->v[0] = (int64_t)cf & M62; cf >>= 62; + g->v[0] = (int64_t)cg & M62; cg >>= 62; + /* Compute limb 2 of t*[f,g], and store it as output limb 1. */ + cf += (int128_t)u * f2 + (int128_t)v * g2; + cg += (int128_t)q * f2 + (int128_t)r * g2; + f->v[1] = (int64_t)cf & M62; cf >>= 62; + g->v[1] = (int64_t)cg & M62; cg >>= 62; + /* Compute limb 3 of t*[f,g], and store it as output limb 2. */ + cf += (int128_t)u * f3 + (int128_t)v * g3; + cg += (int128_t)q * f3 + (int128_t)r * g3; + f->v[2] = (int64_t)cf & M62; cf >>= 62; + g->v[2] = (int64_t)cg & M62; cg >>= 62; + /* Compute limb 4 of t*[f,g], and store it as output limb 3. */ + cf += (int128_t)u * f4 + (int128_t)v * g4; + cg += (int128_t)q * f4 + (int128_t)r * g4; + f->v[3] = (int64_t)cf & M62; cf >>= 62; + g->v[3] = (int64_t)cg & M62; cg >>= 62; + /* What remains is limb 5 of t*[f,g]; store it as output limb 4. */ + f->v[4] = (int64_t)cf; + g->v[4] = (int64_t)cg; +} + +/* Compute (t/2^62) * [f, g], where t is a transition matrix for 62 divsteps. + * + * Version that operates on a variable number of limbs in f and g. + * + * This implements the update_fg function from the explanation. 
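+ *
+ * Concretely, the function below computes, on len-limb inputs,
+ *   f' = (u*f + v*g) / 2^62
+ *   g' = (q*f + r*g) / 2^62
+ * Both divisions are exact because t encodes 62 divsteps computed from the bottom limbs of
+ * f and g, which forces the bottom 62 bits of both products to zero (the VERIFY_CHECKs below
+ * assert this).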
+ */ +static void secp256k1_modinv64_update_fg_62_var(int len, secp256k1_modinv64_signed62 *f, secp256k1_modinv64_signed62 *g, const secp256k1_modinv64_trans2x2 *t) { + const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + const int64_t u = t->u, v = t->v, q = t->q, r = t->r; + int64_t fi, gi; + int128_t cf, cg; + int i; + VERIFY_CHECK(len > 0); + /* Start computing t*[f,g]. */ + fi = f->v[0]; + gi = g->v[0]; + cf = (int128_t)u * fi + (int128_t)v * gi; + cg = (int128_t)q * fi + (int128_t)r * gi; + /* Verify that the bottom 62 bits of the result are zero, and then throw them away. */ + VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62; + VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62; + /* Now iteratively compute limb i=1..len of t*[f,g], and store them in output limb i-1 (shifting + * down by 62 bits). */ + for (i = 1; i < len; ++i) { + fi = f->v[i]; + gi = g->v[i]; + cf += (int128_t)u * fi + (int128_t)v * gi; + cg += (int128_t)q * fi + (int128_t)r * gi; + f->v[i - 1] = (int64_t)cf & M62; cf >>= 62; + g->v[i - 1] = (int64_t)cg & M62; cg >>= 62; + } + /* What remains is limb (len) of t*[f,g]; store it as output limb (len-1). */ + f->v[len - 1] = (int64_t)cf; + g->v[len - 1] = (int64_t)cg; +} + +/* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x). */ +static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo) { + /* Start with d=0, e=1, f=modulus, g=x, eta=-1. */ + secp256k1_modinv64_signed62 d = {{0, 0, 0, 0, 0}}; + secp256k1_modinv64_signed62 e = {{1, 0, 0, 0, 0}}; + secp256k1_modinv64_signed62 f = modinfo->modulus; + secp256k1_modinv64_signed62 g = *x; + int i; + int64_t eta = -1; + + /* Do 12 iterations of 62 divsteps each = 744 divsteps. 724 suffices for 256-bit inputs. */ + for (i = 0; i < 12; ++i) { + /* Compute transition matrix and new eta after 62 divsteps. */ + secp256k1_modinv64_trans2x2 t; + eta = secp256k1_modinv64_divsteps_62(eta, f.v[0], g.v[0], &t); + /* Update d,e using that transition matrix. */ + secp256k1_modinv64_update_de_62(&d, &e, &t, modinfo); + /* Update f,g using that transition matrix. */ +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, 5, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, 5, &modinfo->modulus, 1) < 0); /* g < modulus */ +#endif + secp256k1_modinv64_update_fg_62(&f, &g, &t); +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, 5, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, 5, &modinfo->modulus, 1) < 0); /* g < modulus */ +#endif + } + + /* At this point sufficient iterations have been performed that g must have reached 0 + * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g + * values i.e. +/- 1, and d now contains +/- the modular inverse. 
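+     * (The 12*62 = 744 divsteps performed exceed the 724-divstep bound quoted above, so this
+     * is guaranteed.)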
*/ +#ifdef VERIFY + /* g == 0 */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, 5, &SECP256K1_SIGNED62_ONE, 0) == 0); + /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, 5, &SECP256K1_SIGNED62_ONE, -1) == 0 || + secp256k1_modinv64_mul_cmp_62(&f, 5, &SECP256K1_SIGNED62_ONE, 1) == 0 || + (secp256k1_modinv64_mul_cmp_62(x, 5, &SECP256K1_SIGNED62_ONE, 0) == 0 && + secp256k1_modinv64_mul_cmp_62(&d, 5, &SECP256K1_SIGNED62_ONE, 0) == 0 && + (secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, 1) == 0 || + secp256k1_modinv64_mul_cmp_62(&f, 5, &modinfo->modulus, -1) == 0))); +#endif + + /* Optionally negate d, normalize to [0,modulus), and return it. */ + secp256k1_modinv64_normalize_62(&d, f.v[4], modinfo); + *x = d; +} + +/* Compute the inverse of x modulo modinfo->modulus, and replace x with it (variable time). */ +static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo) { + /* Start with d=0, e=1, f=modulus, g=x, eta=-1. */ + secp256k1_modinv64_signed62 d = {{0, 0, 0, 0, 0}}; + secp256k1_modinv64_signed62 e = {{1, 0, 0, 0, 0}}; + secp256k1_modinv64_signed62 f = modinfo->modulus; + secp256k1_modinv64_signed62 g = *x; +#ifdef VERIFY + int i = 0; +#endif + int j, len = 5; + int64_t eta = -1; + int64_t cond, fn, gn; + + /* Do iterations of 62 divsteps each until g=0. */ + while (1) { + /* Compute transition matrix and new eta after 62 divsteps. */ + secp256k1_modinv64_trans2x2 t; + eta = secp256k1_modinv64_divsteps_62_var(eta, f.v[0], g.v[0], &t); + /* Update d,e using that transition matrix. */ + secp256k1_modinv64_update_de_62(&d, &e, &t, modinfo); + /* Update f,g using that transition matrix. */ +#ifdef VERIFY + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, len, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, len, &modinfo->modulus, 1) < 0); /* g < modulus */ +#endif + secp256k1_modinv64_update_fg_62_var(len, &f, &g, &t); + /* If the bottom limb of g is zero, there is a chance that g=0. */ + if (g.v[0] == 0) { + cond = 0; + /* Check if the other limbs are also 0. */ + for (j = 1; j < len; ++j) { + cond |= g.v[j]; + } + /* If so, we're done. */ + if (cond == 0) break; + } + + /* Determine if len>1 and limb (len-1) of both f and g is 0 or -1. */ + fn = f.v[len - 1]; + gn = g.v[len - 1]; + cond = ((int64_t)len - 2) >> 63; + cond |= fn ^ (fn >> 63); + cond |= gn ^ (gn >> 63); + /* If so, reduce length, propagating the sign of f and g's top limb into the one below. */ + if (cond == 0) { + f.v[len - 2] |= (uint64_t)fn << 62; + g.v[len - 2] |= (uint64_t)gn << 62; + --len; + } +#ifdef VERIFY + VERIFY_CHECK(++i < 12); /* We should never need more than 12*62 = 744 divsteps */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, -1) > 0); /* f > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, len, &modinfo->modulus, -1) > 0); /* g > -modulus */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, len, &modinfo->modulus, 1) < 0); /* g < modulus */ +#endif + } + + /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of + * the initial f, g values i.e. 
+/- 1, and d now contains +/- the modular inverse. */ +#ifdef VERIFY + /* g == 0 */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, len, &SECP256K1_SIGNED62_ONE, 0) == 0); + /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */ + VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, len, &SECP256K1_SIGNED62_ONE, -1) == 0 || + secp256k1_modinv64_mul_cmp_62(&f, len, &SECP256K1_SIGNED62_ONE, 1) == 0 || + (secp256k1_modinv64_mul_cmp_62(x, 5, &SECP256K1_SIGNED62_ONE, 0) == 0 && + secp256k1_modinv64_mul_cmp_62(&d, 5, &SECP256K1_SIGNED62_ONE, 0) == 0 && + (secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, 1) == 0 || + secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, -1) == 0))); +#endif + + /* Optionally negate d, normalize to [0,modulus), and return it. */ + secp256k1_modinv64_normalize_62(&d, f.v[len - 1], modinfo); + *x = d; +} + +#endif /* SECP256K1_MODINV64_IMPL_H */ diff --git a/src/modules/multiset/Makefile.am.include b/src/modules/multiset/Makefile.am.include deleted file mode 100644 index 21a94e8..0000000 --- a/src/modules/multiset/Makefile.am.include +++ /dev/null @@ -1,8 +0,0 @@ -include_HEADERS += include/secp256k1_multiset.h -noinst_HEADERS += src/modules/multiset/main_impl.h -noinst_HEADERS += src/modules/multiset/tests_impl.h -if USE_BENCHMARK -noinst_PROGRAMS += bench_multiset -bench_multiset_SOURCES = src/bench_multiset.c -bench_multiset_LDADD = libsecp256k1.la $(SECP_LIBS) $(COMMON_LIB) -endif diff --git a/src/modules/multiset/README.md b/src/modules/multiset/README.md deleted file mode 100644 index 5d0f095..0000000 --- a/src/modules/multiset/README.md +++ /dev/null @@ -1,81 +0,0 @@ -Secp256k1 multiset module -========================= - -Abstract --------- - -This module allows calculating a cryptographically secure hash for a -set with the properties: - -* The order of the elements of the set does not effect the hash -* Elements can be added to the set without recalculating the entire set - -Or mathematically, it is: - -* Commutative: H(a,b) = H(b,a) -* Associative: H(H(a,b),c) = H(a,H(b,c)) - -Hence it behaves similar to XORing the hashes of the individual elements, -but without the cryptographic weakness of XOR. - -Motivation ----------- - -The multiset can be used by cryptocurrencies to cheaply create and -maintain a commitment to the full UTXO set as proposed by Pieter Wiulle [1] - -It can also be used with a bucketed approach to enable cheap UTXO-proofs as -proposed by Tomas van der Wansem [2] - -Usage ------ - - // Construct a multiset of (data1,data3) - - unsigned char data1[100],data2[150],data3[175]; - ... - secp256k1_multiset x,y; - secp256k1_multiset_init (context, &x); - - // add all 3 data elements - secp256k1_multiset_add(context, &y, data1, sizeof(data1)); - secp256k1_multiset_add(context, &y, data2, sizeof(data2)); - secp256k1_multiset_add(context, &y, data3, sizeof(data3)); - - // remove data2 - secp256k1_multiset_remove(context, &y, data2, sizeof(data2)); - - // convert to hash - secp256k1_multiset_finalize(context, hashBuffer, &x); - -Algorithm ---------- - -Using Elliptic Curves as multisets is described in [3]. - -This implementation uses Try and Increment [4] to convert the hash into -point on the secp256k1 curve which serves as multiset. The curve's -group operations are then used to add and remove multisets. -Associativity and Commutativity then follow. - -Security --------- -The hash is secure against collision attacks. - -The algorithm used is susceptible to timing attacks. 
Therefore it does -not securely conceal the underlying data being hashed. - -For the purpose of UTXO commitments this is not relevant. - - -References ----------- - -[1] https://lists.linuxfoundation.org/pipermail/bitcoin-dev/2017-May/014337.html - -[2] https://lists.linuxfoundation.org/pipermail/bitcoin-ml/2017-September/000240.html - -[3] https://arxiv.org/pdf/1601.06502.pdf - -[4] https://eprint.iacr.org/2009/226.pdf - diff --git a/src/modules/multiset/main_impl.h b/src/modules/multiset/main_impl.h deleted file mode 100644 index e7944ef..0000000 --- a/src/modules/multiset/main_impl.h +++ /dev/null @@ -1,241 +0,0 @@ -/********************************************************************** - * Copyright (c) 2017 Tomas van der Wansem * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or http://www.opensource.org/licenses/mit-license.php.* - **********************************************************************/ - /*For future reference: https://reviews.bitcoinabc.org/D1072 */ - -#ifndef _SECP256K1_MODULE_MULTISET_MAIN_ -#define _SECP256K1_MODULE_MULTISET_MAIN_ - -#include "include/secp256k1_multiset.h" - -/* Converts a group element (Jacobian) to a multiset. - * Requires the field elements to be normalized - * Infinite uses special value, z = 0 - * Will also normalize the input. - */ -static void multiset_from_gej_var(secp256k1_multiset *target, secp256k1_gej *input) { - if (secp256k1_gej_is_infinity(input)) { - memset(&target->d, 0, sizeof(target->d)); - } else { - secp256k1_fe_normalize(&input->x); - secp256k1_fe_normalize(&input->y); - secp256k1_fe_normalize(&input->z); - - secp256k1_fe_get_b32(target->d, &input->x); - secp256k1_fe_get_b32(target->d + 32, &input->y); - secp256k1_fe_get_b32(target->d + 64, &input->z); - } -} - -/* Converts a multiset to group element (Jacobian) - * Infinite uses special value, z = 0 */ -static void gej_from_multiset_var(secp256k1_gej *target, const secp256k1_multiset *input) { - secp256k1_fe_set_b32(&target->x, input->d); - secp256k1_fe_set_b32(&target->y, input->d + 32); - secp256k1_fe_set_b32(&target->z, input->d + 64); - - target->infinity = secp256k1_fe_is_zero(&target->z) ? 1 : 0; -} - -/* Converts a data element to a group element (affine) - * - * We use Try and Increment which is fast but non-constant time. - * Though constant time algo's exist we are not concerned with timing attacks - * as we make no attempt to hide the underlying data - * - * Pass inverse=0 to generate the group element, or inverse=1 to generate its inverse - */ -static void ge_from_data_var(secp256k1_ge *target, const unsigned char *input, size_t inputLen, int inverse) { - secp256k1_sha256 hasher; - unsigned char buffer[8+32]; - unsigned char trial[32]; - uint64_t prefix; - - /* Hash to buffer, leaving space for 8-byte prefix */ - secp256k1_sha256_initialize(&hasher); - secp256k1_sha256_write(&hasher, input, inputLen); - secp256k1_sha256_finalize(&hasher, buffer+8); - - /* Loop through trials, with 50% success per loop - * We can assume it ends within 2^64. 
*/ - for(prefix=0; 1; prefix++) - { - secp256k1_fe x; - - /* Set prefix in little-endian */ - buffer[0] = prefix & 0xFF; - buffer[1] = (prefix>>8) & 0xFF; - buffer[2] = (prefix>>16) & 0xFF; - buffer[3] = (prefix>>24) & 0xFF; - buffer[4] = (prefix>>32) & 0xFF; - buffer[5] = (prefix>>40) & 0xFF; - buffer[6] = (prefix>>48) & 0xFF; - buffer[7] = (prefix>>56) & 0xFF; - - /* Hash to trial */ - secp256k1_sha256_initialize(&hasher); - secp256k1_sha256_write(&hasher, buffer, sizeof(buffer)); - secp256k1_sha256_finalize(&hasher, trial); - - if (!secp256k1_fe_set_b32(&x, trial)) { - continue; - } - - /* We let y is even be the element and odd be its inverse */ - if (!secp256k1_ge_set_xo_var(target, &x, inverse)) { - continue; - } - - VERIFY_CHECK(secp256k1_ge_is_valid_var(target)); - break; - } -} - -/** Adds or removes a data element */ -static int multiset_add_remove(const secp256k1_context *ctx, secp256k1_multiset *multiset, const unsigned char *input, size_t inputLen, int remove) { - secp256k1_ge newelm; - secp256k1_gej source, target; - - VERIFY_CHECK(ctx != NULL); - ARG_CHECK(multiset != NULL); - ARG_CHECK(input != NULL); - - gej_from_multiset_var(&source, multiset); - ge_from_data_var(&newelm, input, inputLen, remove); - - secp256k1_gej_add_ge_var(&target, &source, &newelm, NULL); - - multiset_from_gej_var(multiset, &target); - - return 1; -} - -/** Adds a data element to the multiset */ -int secp256k1_multiset_add(const secp256k1_context* ctx, secp256k1_multiset *multiset, const unsigned char *input, size_t inputLen) { - return multiset_add_remove(ctx, multiset, input, inputLen, 0); -} - -/** Removes a data element from the multiset */ -int secp256k1_multiset_remove(const secp256k1_context* ctx, secp256k1_multiset *multiset, const unsigned char *input, size_t inputLen) { - return multiset_add_remove(ctx, multiset, input, inputLen, 1); -} - -/** Adds input multiset to multiset */ -int secp256k1_multiset_combine(const secp256k1_context* ctx, secp256k1_multiset *multiset, const secp256k1_multiset *input) { - secp256k1_gej gej_multiset, gej_input, gej_result; - - VERIFY_CHECK(ctx != NULL); - ARG_CHECK(multiset != NULL); - ARG_CHECK(input != NULL); - - gej_from_multiset_var(&gej_multiset, multiset); - gej_from_multiset_var(&gej_input, input); - - secp256k1_gej_add_var(&gej_result, &gej_multiset, &gej_input, NULL); - - multiset_from_gej_var(multiset, &gej_result); - - return 1; -} - -/** Hash the multiset into resultHash */ -int secp256k1_multiset_finalize(const secp256k1_context* ctx, unsigned char *resultHash, const secp256k1_multiset *multiset) { - secp256k1_sha256 hasher; - unsigned char buffer[64]; - secp256k1_gej gej; - secp256k1_ge ge; - - VERIFY_CHECK(ctx != NULL); - ARG_CHECK(resultHash != NULL); - ARG_CHECK(multiset != NULL); - - gej_from_multiset_var(&gej, multiset); - if (secp256k1_gej_is_infinity(&gej)) { - /* empty set is encoded as zeros */ - memset(resultHash, 0x00, 32); - return 1; - } - /* we must normalize to affine first */ - secp256k1_ge_set_gej(&ge, &gej); - secp256k1_fe_normalize(&ge.x); - secp256k1_fe_normalize(&ge.y); - secp256k1_fe_get_b32(buffer, &ge.x); - secp256k1_fe_get_b32(buffer+32, &ge.y); - - secp256k1_sha256_initialize(&hasher); - secp256k1_sha256_write(&hasher, buffer, sizeof(buffer)); - secp256k1_sha256_finalize(&hasher, resultHash); - - return 1; -} - -/* Inits the multiset with the constant for empty data, - represented by the Jacobian GE infinite */ -int secp256k1_multiset_init(const secp256k1_context *ctx, secp256k1_multiset *multiset) { - secp256k1_gej inf 
= SECP256K1_GEJ_CONST_INFINITY; - - VERIFY_CHECK(ctx != NULL); - VERIFY_CHECK(multiset != NULL); - - multiset_from_gej_var(multiset, &inf); - - return 1; -} - -int secp256k1_multiset_serialize(const secp256k1_context *ctx, unsigned char *out64, const secp256k1_multiset *multiset) { - secp256k1_gej gej; - secp256k1_ge ge; - - VERIFY_CHECK(ctx != NULL); - ARG_CHECK(out64 != NULL); - memset(out64, 0, 64); - ARG_CHECK(multiset != NULL); - - gej_from_multiset_var(&gej, multiset); - if (secp256k1_gej_is_infinity(&gej)) { - /* Return all zeros.*/ - return 1; - } - - secp256k1_ge_set_gej(&ge, &gej); - - secp256k1_fe_normalize_var(&ge.x); - secp256k1_fe_normalize_var(&ge.y); - secp256k1_fe_get_b32(&out64[0], &ge.x); - secp256k1_fe_get_b32(&out64[32], &ge.y); - return 1; -} - -int secp256k1_multiset_parse(const secp256k1_context *ctx, secp256k1_multiset *multiset, const unsigned char *in64) { - secp256k1_ge ge; - secp256k1_gej gej; - secp256k1_fe x, y; - - VERIFY_CHECK(ctx != NULL); - ARG_CHECK(multiset != NULL); - memset(multiset, 0, sizeof(*multiset)); - ARG_CHECK(in64 != NULL); - - if (!secp256k1_fe_set_b32(&x, &in64[0]) || !secp256k1_fe_set_b32(&y, &in64[32])) { - /* Fail if overflowed */ - return 0; - } - - if (secp256k1_fe_is_zero(&x) && secp256k1_fe_is_zero(&y)) { - secp256k1_gej_set_infinity(&gej); - } else { - secp256k1_ge_set_xy(&ge, &x, &y); - if (!secp256k1_ge_is_valid_var(&ge)) { - return 0; - } - secp256k1_gej_set_ge(&gej, &ge); - } - - multiset_from_gej_var(multiset, &gej); - - return 1; -} - -#endif /* _SECP256K1_MODULE_MULTISET_MAIN_ */ diff --git a/src/modules/multiset/tests_impl.h b/src/modules/multiset/tests_impl.h deleted file mode 100644 index 94ecb86..0000000 --- a/src/modules/multiset/tests_impl.h +++ /dev/null @@ -1,315 +0,0 @@ -/********************************************************************** - * Copyright (c) 2017 Tomas van der Wansem * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or http://www.opensource.org/licenses/mit-license.php.* - **********************************************************************/ - -#ifndef _SECP256K1_MODULE_MULTISET_TESTS_ -#define _SECP256K1_MODULE_MULTISET_TESTS_ - - -#define DATALEN 64*3 -#define DATACOUNT 100 - - -#define CHECK_EQUAL(a,b) { \ - unsigned char hash1[32]; \ - unsigned char hash2[32]; \ - secp256k1_multiset_finalize(ctx, hash1, (a)); \ - secp256k1_multiset_finalize(ctx, hash2, (b)); \ - CHECK(memcmp(hash1,hash2,sizeof(hash1))==0); \ -} - -#define CHECK_NOTEQUAL(a,b) { \ - unsigned char hash1[32]; \ - unsigned char hash2[32]; \ - secp256k1_multiset_finalize(ctx, hash1, (a)); \ - secp256k1_multiset_finalize(ctx, hash2, (b)); \ - CHECK(memcmp(hash1,hash2,sizeof(hash1))!=0); \ -} - -static unsigned char elements[DATACOUNT][DATALEN]; - -/* Create random data */ -static void initdata(void) { - secp256k1_testrand_bytes_test(&elements[0][0], DATACOUNT*DATALEN); -} - -void test_unordered(void) { - - /* Check if multisets are uneffected by order */ - - secp256k1_multiset empty, r1,r2,r3; - - secp256k1_multiset_init(ctx, &empty); - secp256k1_multiset_init(ctx, &r1); - secp256k1_multiset_init(ctx, &r2); - secp256k1_multiset_init(ctx, &r3); - - secp256k1_multiset_add(ctx, &r1, elements[0], DATALEN); - secp256k1_multiset_add(ctx, &r2, elements[1], DATALEN); - - CHECK_NOTEQUAL(&r1,&r2); /* M(0,1)!=M() */ - - secp256k1_multiset_add(ctx, &r1, elements[1], DATALEN); - secp256k1_multiset_add(ctx, &r2, elements[0], DATALEN); - CHECK_EQUAL(&r1,&r2); /* M(0,1)==M(1,0) */ - - 
secp256k1_multiset_init(ctx, &r1); - secp256k1_multiset_init(ctx, &r2); - secp256k1_multiset_init(ctx, &r3); - - CHECK_EQUAL(&r1,&r2); /* M()==M() */ - - secp256k1_multiset_add(ctx, &r1, elements[0], DATALEN); - secp256k1_multiset_add(ctx, &r1, elements[1], DATALEN); - secp256k1_multiset_add(ctx, &r1, elements[2], DATALEN); - - secp256k1_multiset_add(ctx, &r2, elements[2], DATALEN); - secp256k1_multiset_add(ctx, &r2, elements[0], DATALEN); - secp256k1_multiset_add(ctx, &r2, elements[1], DATALEN); - - secp256k1_multiset_add(ctx, &r3, elements[1], DATALEN); - secp256k1_multiset_add(ctx, &r3, elements[0], DATALEN); - secp256k1_multiset_add(ctx, &r3, elements[2], DATALEN); - - CHECK_EQUAL(&r1,&r2); /* M(0,1,2)==M(2,0,1) */ - CHECK_EQUAL(&r1,&r3); /* M(0,1,2)==M(1,0,2) */ - - - secp256k1_multiset_combine(ctx, &r3, &empty); - CHECK_EQUAL(&r1,&r3); /* M(1,0,2)+M()=M(0,1,2) */ - - secp256k1_multiset_combine(ctx, &r3, &r2); - CHECK_NOTEQUAL(&r1,&r3); /* M(1,0,2)+M(0,1,2)!=M(0,1,2) */ - -} - -void test_combine(void) { - - /* Testing if combining is effectively the same as adding the elements */ - - secp256k1_multiset empty, r1,r2,r3; - - secp256k1_multiset_init(ctx, &empty); - secp256k1_multiset_init(ctx, &r1); - secp256k1_multiset_init(ctx, &r2); - secp256k1_multiset_init(ctx, &r3); - - secp256k1_multiset_add(ctx, &r1, elements[0], DATALEN); - secp256k1_multiset_add(ctx, &r2, elements[1], DATALEN); - CHECK_NOTEQUAL(&r1,&r2); /* M(0) != M(1) */ - - secp256k1_multiset_add(ctx, &r1, elements[1], DATALEN); - secp256k1_multiset_add(ctx, &r2, elements[0], DATALEN); - CHECK_EQUAL(&r1,&r2); /* M(1,0) == M(0,1) */ - - secp256k1_multiset_init(ctx, &r1); - secp256k1_multiset_init(ctx, &r2); - secp256k1_multiset_init(ctx, &r3); - - CHECK_EQUAL(&r1,&r2); /* M() == M() */ - - secp256k1_multiset_add(ctx, &r1, elements[0], DATALEN); - secp256k1_multiset_add(ctx, &r1, elements[1], DATALEN); - secp256k1_multiset_add(ctx, &r1, elements[2], DATALEN); - - secp256k1_multiset_add(ctx, &r2, elements[2], DATALEN); - secp256k1_multiset_add(ctx, &r3, elements[0], DATALEN); - secp256k1_multiset_add(ctx, &r3, elements[1], DATALEN); - secp256k1_multiset_combine(ctx, &r2, &r3); - CHECK_EQUAL(&r1,&r2); /* M(0,1,2) == M(2)+M(0,1) */ - - secp256k1_multiset_init(ctx, &r2); - secp256k1_multiset_init(ctx, &r3); - secp256k1_multiset_add(ctx, &r2, elements[2], DATALEN); - secp256k1_multiset_add(ctx, &r2, elements[0], DATALEN); - secp256k1_multiset_add(ctx, &r3, elements[1], DATALEN); - secp256k1_multiset_combine(ctx, &r2, &r3); - CHECK_EQUAL(&r1,&r2); /* M(0,1,2) == M(2,0)+M(1) */ - - secp256k1_multiset_combine(ctx, &r2, &empty); - CHECK_EQUAL(&r1,&r2); /* M(0,1,2)+M() == M(0,1,2) */ - secp256k1_multiset_combine(ctx, &r2, &r1); - CHECK_NOTEQUAL(&r1,&r2); /* M(0,1,2)+M(0,1,2) != M(0,1,2) */ -} - - -void test_remove(void) { - - /* Testing removal of elements */ - secp256k1_multiset empty, r1,r2,r3; - - secp256k1_multiset_init(ctx, &empty); - secp256k1_multiset_init(ctx, &r1); - secp256k1_multiset_init(ctx, &r2); - secp256k1_multiset_init(ctx, &r3); - - CHECK_EQUAL(&r1,&r2); /* M()==M() */ - - secp256k1_multiset_add (ctx, &r1, elements[0], DATALEN); - secp256k1_multiset_add (ctx, &r1, elements[1], DATALEN); - secp256k1_multiset_add (ctx, &r1, elements[3], DATALEN); - secp256k1_multiset_add (ctx, &r1, elements[9], DATALEN); - secp256k1_multiset_add (ctx, &r1, elements[8], DATALEN); - - secp256k1_multiset_add (ctx, &r2, elements[1], DATALEN); - secp256k1_multiset_add (ctx, &r2, elements[9], DATALEN); - secp256k1_multiset_add (ctx, &r2, 
elements[11], DATALEN); - secp256k1_multiset_add (ctx, &r2, elements[10], DATALEN); - secp256k1_multiset_add (ctx, &r2, elements[0], DATALEN); - secp256k1_multiset_remove(ctx, &r2, elements[10], DATALEN); - secp256k1_multiset_add (ctx, &r2, elements[3], DATALEN); - secp256k1_multiset_add (ctx, &r2, elements[8], DATALEN); - secp256k1_multiset_remove(ctx, &r2, elements[11], DATALEN); - - secp256k1_multiset_add (ctx, &r3, elements[9], DATALEN); - secp256k1_multiset_add (ctx, &r3, elements[15], DATALEN); - secp256k1_multiset_add (ctx, &r3, elements[15], DATALEN); - secp256k1_multiset_add (ctx, &r3, elements[1], DATALEN); - secp256k1_multiset_add (ctx, &r3, elements[9], DATALEN); - secp256k1_multiset_remove(ctx, &r3, elements[15], DATALEN); - secp256k1_multiset_add (ctx, &r3, elements[0], DATALEN); - secp256k1_multiset_remove(ctx, &r3, elements[15], DATALEN); - secp256k1_multiset_remove(ctx, &r3, elements[9], DATALEN); - secp256k1_multiset_add (ctx, &r3, elements[3], DATALEN); - secp256k1_multiset_add (ctx, &r3, elements[8], DATALEN); - - CHECK_EQUAL(&r1,&r2); /* M(0,1,3,9,8)==M(1,9,11,10,9,3,8)-M(10,11) */ - CHECK_EQUAL(&r1,&r3); /* M(0,1,3,9,8)==M(9,15,15,1,9,0,3,8)-M(15,15,9) */ - CHECK_NOTEQUAL(&r1,&empty); /* M(0,1,3,9,8)!=M() */ - - secp256k1_multiset_remove(ctx, &r3, elements[8], DATALEN); - CHECK_NOTEQUAL(&r1,&r3); /* M(0,1,3,9,8)-M(8)!=M(0,1,3,9,8) */ - - secp256k1_multiset_remove(ctx, &r2, elements[0], DATALEN); - secp256k1_multiset_remove(ctx, &r2, elements[9], DATALEN); - secp256k1_multiset_remove(ctx, &r2, elements[8], DATALEN); - secp256k1_multiset_remove(ctx, &r2, elements[1], DATALEN); - secp256k1_multiset_remove(ctx, &r2, elements[3], DATALEN); - - CHECK_EQUAL(&r2,&empty); /* M(0,1,3,9,8)-M(0,1,3,9,8)==M() */ -} - -void test_duplicate(void) { - - /* Test if the multiset properly handles duplicates */ - secp256k1_multiset empty, r1,r2,r3; - - secp256k1_multiset_init(ctx, &empty); - secp256k1_multiset_init(ctx, &r1); - secp256k1_multiset_init(ctx, &r2); - secp256k1_multiset_init(ctx, &r3); - - secp256k1_multiset_add (ctx, &r1, elements[0], DATALEN); - secp256k1_multiset_add (ctx, &r1, elements[0], DATALEN); - secp256k1_multiset_add (ctx, &r1, elements[1], DATALEN); - secp256k1_multiset_add (ctx, &r1, elements[1], DATALEN); - - secp256k1_multiset_add (ctx, &r2, elements[0], DATALEN); - secp256k1_multiset_add (ctx, &r2, elements[1], DATALEN); - - CHECK_NOTEQUAL(&r1, &r2); /* M(0,0,1,1)!=M(0,1) */ - - secp256k1_multiset_add (ctx, &r2, elements[0], DATALEN); - secp256k1_multiset_add (ctx, &r2, elements[1], DATALEN); - CHECK_EQUAL(&r1, &r2); /* M(0,0,1,1)!=M(0,0,1,1) */ - - secp256k1_multiset_add (ctx, &r1, elements[0], DATALEN); - secp256k1_multiset_add (ctx, &r1, elements[1], DATALEN); - secp256k1_multiset_add (ctx, &r3, elements[0], DATALEN); - secp256k1_multiset_add (ctx, &r3, elements[1], DATALEN); - - secp256k1_multiset_combine(ctx, &r2, &r3); - CHECK_EQUAL(&r1, &r2); /* M(0,0,0,1,1,1)!=M(0,0,1,1)+M(0,1) */ -} - -void test_empty(void) { - - /* Test if empty set properties hold */ - - secp256k1_multiset empty, r1,r2; - - secp256k1_multiset_init(ctx, &empty); - secp256k1_multiset_init(ctx, &r1); - secp256k1_multiset_init(ctx, &r2); - - CHECK_EQUAL(&empty,&r1); /* M()==M() */ - - /* empty + empty = empty */ - secp256k1_multiset_combine(ctx, &r1, &r2); - CHECK_EQUAL(&empty, &r1); /* M()+M()==M() */ -} - -void test_testvector(void) { - /* Tests known values from the specification */ - - const unsigned char d1[113] = { - 
0x98,0x20,0x51,0xfd,0x1e,0x4b,0xa7,0x44,0xbb,0xbe,0x68,0x0e,0x1f,0xee,0x14,0x67,0x7b,0xa1,0xa3,0xc3,0x54,0x0b,0xf7,0xb1,0xcd,0xb6,0x06,0xe8,0x57,0x23,0x3e,0x0e, - 0x00,0x00,0x00,0x00,0x03,0x00,0xf2,0x05,0x2a,0x01,0x00,0x00,0x00,0x43,0x41,0x04,0x96,0xb5,0x38,0xe8,0x53,0x51,0x9c,0x72,0x6a,0x2c,0x91,0xe6,0x1e,0xc1,0x16,0x00, - 0xae,0x13,0x90,0x81,0x3a,0x62,0x7c,0x66,0xfb,0x8b,0xe7,0x94,0x7b,0xe6,0x3c,0x52,0xda,0x75,0x89,0x37,0x95,0x15,0xd4,0xe0,0xa6,0x04,0xf8,0x14,0x17,0x81,0xe6,0x22, - 0x94,0x72,0x11,0x66,0xbf,0x62,0x1e,0x73,0xa8,0x2c,0xbf,0x23,0x42,0xc8,0x58,0xee,0xac }; - - const unsigned char d2[113] = { - 0xd5,0xfd,0xcc,0x54,0x1e,0x25,0xde,0x1c,0x7a,0x5a,0xdd,0xed,0xf2,0x48,0x58,0xb8,0xbb,0x66,0x5c,0x9f,0x36,0xef,0x74,0x4e,0xe4,0x2c,0x31,0x60,0x22,0xc9,0x0f,0x9b, - 0x00,0x00,0x00,0x00,0x05,0x00,0xf2,0x05,0x2a,0x01,0x00,0x00,0x00,0x43,0x41,0x04,0x72,0x11,0xa8,0x24,0xf5,0x5b,0x50,0x52,0x28,0xe4,0xc3,0xd5,0x19,0x4c,0x1f,0xcf, - 0xaa,0x15,0xa4,0x56,0xab,0xdf,0x37,0xf9,0xb9,0xd9,0x7a,0x40,0x40,0xaf,0xc0,0x73,0xde,0xe6,0xc8,0x90,0x64,0x98,0x4f,0x03,0x38,0x52,0x37,0xd9,0x21,0x67,0xc1,0x3e, - 0x23,0x64,0x46,0xb4,0x17,0xab,0x79,0xa0,0xfc,0xae,0x41,0x2a,0xe3,0x31,0x6b,0x77,0xac }; - - const unsigned char d3[113] = { - 0x44,0xf6,0x72,0x22,0x60,0x90,0xd8,0x5d,0xb9,0xa9,0xf2,0xfb,0xfe,0x5f,0x0f,0x96,0x09,0xb3,0x87,0xaf,0x7b,0xe5,0xb7,0xfb,0xb7,0xa1,0x76,0x7c,0x83,0x1c,0x9e,0x99, - 0x00,0x00,0x00,0x00,0x07,0x00,0xf2,0x05,0x2a,0x01,0x00,0x00,0x00,0x43,0x41,0x04,0x94,0xb9,0xd3,0xe7,0x6c,0x5b,0x16,0x29,0xec,0xf9,0x7f,0xff,0x95,0xd7,0xa4,0xbb, - 0xda,0xc8,0x7c,0xc2,0x60,0x99,0xad,0xa2,0x80,0x66,0xc6,0xff,0x1e,0xb9,0x19,0x12,0x23,0xcd,0x89,0x71,0x94,0xa0,0x8d,0x0c,0x27,0x26,0xc5,0x74,0x7f,0x1d,0xb4,0x9e, - 0x8c,0xf9,0x0e,0x75,0xdc,0x3e,0x35,0x50,0xae,0x9b,0x30,0x08,0x6f,0x3c,0xd5,0xaa,0xac }; - - /* Expected resulting multisets */ - const unsigned char exp_empty[32] = { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 }; - const unsigned char exp_m1[32] = { 0x5e,0x29,0x49,0x84,0xc0,0xb6,0xff,0x1c,0x89,0x7b,0xdb,0xb6,0xf7,0xcf,0x3e,0xf8,0x01,0xe2,0xf1,0x3b,0xc7,0x34,0x28,0xaa,0xcd,0xf8,0xcb,0x8d,0x3b,0xd2,0xf0,0xe5 }; - const unsigned char exp_m2[32] = { 0x93,0x70,0x80,0xb6,0x6c,0x2b,0x37,0x2d,0x35,0x39,0x88,0xd6,0xc0,0x92,0x22,0x78,0x8f,0x88,0xa5,0x13,0x0a,0x13,0x32,0xeb,0xc1,0x49,0x5a,0xa3,0xa7,0xfa,0xb4,0xfb }; - const unsigned char exp_m3[32] = { 0xdd,0xbf,0x2f,0x18,0xaf,0xe0,0xaf,0xa8,0x87,0x28,0x57,0x93,0xa4,0x82,0xa0,0x7f,0xc2,0x2a,0x46,0x28,0x48,0x36,0x78,0x73,0xd7,0x9d,0x72,0xa7,0x33,0x0a,0x96,0x15 }; - const unsigned char exp_m1m2[32] = { 0x48,0x09,0x8f,0x4c,0xa9,0xbb,0x5d,0xac,0x27,0x3e,0x56,0x31,0x6d,0xb6,0x41,0x23,0x69,0xed,0x1f,0xa8,0xbe,0xb5,0x79,0x57,0x05,0x32,0xd1,0x63,0x47,0xfe,0xfc,0xcc }; - const unsigned char exp_m1m2m3[32] = { 0x47,0xfb,0xdf,0xf4,0x76,0x7c,0x32,0xa4,0xea,0x74,0xca,0x43,0x86,0x59,0x0f,0x62,0x22,0xee,0x83,0x96,0xb8,0xb4,0xf0,0x0e,0xf5,0x6e,0x9b,0x49,0x43,0xf4,0x24,0x93 }; - - unsigned char m0[32],m1[32],m2[32],m3[32],m1m2[32],m1m2m3[32]; - secp256k1_multiset r0,r1,r2,r3; - - secp256k1_multiset_init(ctx, &r0); - secp256k1_multiset_init(ctx, &r1); - secp256k1_multiset_init(ctx, &r2); - secp256k1_multiset_init(ctx, &r3); - - secp256k1_multiset_add (ctx, &r1, d1, sizeof(d1)); - secp256k1_multiset_add (ctx, &r2, d2, sizeof(d2)); - secp256k1_multiset_add (ctx, &r3, d3, sizeof(d3)); - - secp256k1_multiset_finalize(ctx, m0, &r0); - secp256k1_multiset_finalize(ctx, m1, 
&r1); - secp256k1_multiset_finalize(ctx, m2, &r2); - secp256k1_multiset_finalize(ctx, m3, &r3); - - secp256k1_multiset_combine(ctx, &r1, &r2); - secp256k1_multiset_finalize(ctx, m1m2, &r1); - - secp256k1_multiset_combine(ctx, &r1, &r3); - secp256k1_multiset_finalize(ctx, m1m2m3, &r1); - - CHECK(memcmp(m0,exp_empty,32)==0); - CHECK(memcmp(m1,exp_m1,32)==0); - CHECK(memcmp(m2,exp_m2,32)==0); - CHECK(memcmp(m3,exp_m3,32)==0); - CHECK(memcmp(m1m2,exp_m1m2,32)==0); - CHECK(memcmp(m1m2m3,exp_m1m2m3,32)==0); -} - -void run_multiset_tests(void) { - - initdata(); - test_unordered(); - test_combine(); - test_remove(); - test_empty(); - test_duplicate(); - test_testvector(); -} - -#endif /* _SECP256K1_MODULE_MULTISET_TESTS_ */ diff --git a/src/num.h b/src/num.h deleted file mode 100644 index 59a5cf2..0000000 --- a/src/num.h +++ /dev/null @@ -1,74 +0,0 @@ -/*********************************************************************** - * Copyright (c) 2013, 2014 Pieter Wuille * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* - ***********************************************************************/ - -#ifndef SECP256K1_NUM_H -#define SECP256K1_NUM_H - -#ifndef USE_NUM_NONE - -#if defined HAVE_CONFIG_H -#include "libsecp256k1-config.h" -#endif - -#if defined(USE_NUM_GMP) -#include "num_gmp.h" -#else -#error "Please select num implementation" -#endif - -/** Copy a number. */ -static void secp256k1_num_copy(secp256k1_num *r, const secp256k1_num *a); - -/** Convert a number's absolute value to a binary big-endian string. - * There must be enough place. */ -static void secp256k1_num_get_bin(unsigned char *r, unsigned int rlen, const secp256k1_num *a); - -/** Set a number to the value of a binary big-endian string. */ -static void secp256k1_num_set_bin(secp256k1_num *r, const unsigned char *a, unsigned int alen); - -/** Compute a modular inverse. The input must be less than the modulus. */ -static void secp256k1_num_mod_inverse(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *m); - -/** Compute the jacobi symbol (a|b). b must be positive and odd. */ -static int secp256k1_num_jacobi(const secp256k1_num *a, const secp256k1_num *b); - -/** Compare the absolute value of two numbers. */ -static int secp256k1_num_cmp(const secp256k1_num *a, const secp256k1_num *b); - -/** Test whether two number are equal (including sign). */ -static int secp256k1_num_eq(const secp256k1_num *a, const secp256k1_num *b); - -/** Add two (signed) numbers. */ -static void secp256k1_num_add(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b); - -/** Subtract two (signed) numbers. */ -static void secp256k1_num_sub(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b); - -/** Multiply two (signed) numbers. */ -static void secp256k1_num_mul(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b); - -/** Replace a number by its remainder modulo m. M's sign is ignored. The result is a number between 0 and m-1, - even if r was negative. */ -static void secp256k1_num_mod(secp256k1_num *r, const secp256k1_num *m); - -/** Right-shift the passed number by bits bits. */ -static void secp256k1_num_shift(secp256k1_num *r, int bits); - -/** Check whether a number is zero. */ -static int secp256k1_num_is_zero(const secp256k1_num *a); - -/** Check whether a number is one. */ -static int secp256k1_num_is_one(const secp256k1_num *a); - -/** Check whether a number is strictly negative. 
*/ -static int secp256k1_num_is_neg(const secp256k1_num *a); - -/** Change a number's sign. */ -static void secp256k1_num_negate(secp256k1_num *r); - -#endif - -#endif /* SECP256K1_NUM_H */ diff --git a/src/num_gmp.h b/src/num_gmp.h deleted file mode 100644 index cc6c51a..0000000 --- a/src/num_gmp.h +++ /dev/null @@ -1,20 +0,0 @@ -/*********************************************************************** - * Copyright (c) 2013, 2014 Pieter Wuille * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* - ***********************************************************************/ - -#ifndef SECP256K1_NUM_REPR_H -#define SECP256K1_NUM_REPR_H - -#include - -#define NUM_LIMBS ((256+GMP_NUMB_BITS-1)/GMP_NUMB_BITS) - -typedef struct { - mp_limb_t data[2*NUM_LIMBS]; - int neg; - int limbs; -} secp256k1_num; - -#endif /* SECP256K1_NUM_REPR_H */ diff --git a/src/num_gmp_impl.h b/src/num_gmp_impl.h deleted file mode 100644 index c07947d..0000000 --- a/src/num_gmp_impl.h +++ /dev/null @@ -1,288 +0,0 @@ -/*********************************************************************** - * Copyright (c) 2013, 2014 Pieter Wuille * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* - ***********************************************************************/ - -#ifndef SECP256K1_NUM_REPR_IMPL_H -#define SECP256K1_NUM_REPR_IMPL_H - -#include -#include -#include - -#include "util.h" -#include "num.h" - -#ifdef VERIFY -static void secp256k1_num_sanity(const secp256k1_num *a) { - VERIFY_CHECK(a->limbs == 1 || (a->limbs > 1 && a->data[a->limbs-1] != 0)); -} -#else -#define secp256k1_num_sanity(a) do { } while(0) -#endif - -static void secp256k1_num_copy(secp256k1_num *r, const secp256k1_num *a) { - *r = *a; -} - -static void secp256k1_num_get_bin(unsigned char *r, unsigned int rlen, const secp256k1_num *a) { - unsigned char tmp[65]; - int len = 0; - int shift = 0; - if (a->limbs>1 || a->data[0] != 0) { - len = mpn_get_str(tmp, 256, (mp_limb_t*)a->data, a->limbs); - } - while (shift < len && tmp[shift] == 0) shift++; - VERIFY_CHECK(len-shift <= (int)rlen); - memset(r, 0, rlen - len + shift); - if (len > shift) { - memcpy(r + rlen - len + shift, tmp + shift, len - shift); - } - memset(tmp, 0, sizeof(tmp)); -} - -static void secp256k1_num_set_bin(secp256k1_num *r, const unsigned char *a, unsigned int alen) { - int len; - VERIFY_CHECK(alen > 0); - VERIFY_CHECK(alen <= 64); - len = mpn_set_str(r->data, a, alen, 256); - if (len == 0) { - r->data[0] = 0; - len = 1; - } - VERIFY_CHECK(len <= NUM_LIMBS*2); - r->limbs = len; - r->neg = 0; - while (r->limbs > 1 && r->data[r->limbs-1]==0) { - r->limbs--; - } -} - -static void secp256k1_num_add_abs(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b) { - mp_limb_t c = mpn_add(r->data, a->data, a->limbs, b->data, b->limbs); - r->limbs = a->limbs; - if (c != 0) { - VERIFY_CHECK(r->limbs < 2*NUM_LIMBS); - r->data[r->limbs++] = c; - } -} - -static void secp256k1_num_sub_abs(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b) { - mp_limb_t c = mpn_sub(r->data, a->data, a->limbs, b->data, b->limbs); - (void)c; - VERIFY_CHECK(c == 0); - r->limbs = a->limbs; - while (r->limbs > 1 && r->data[r->limbs-1]==0) { - r->limbs--; - } -} - -static void secp256k1_num_mod(secp256k1_num *r, const secp256k1_num *m) { - secp256k1_num_sanity(r); - secp256k1_num_sanity(m); - - if (r->limbs >= m->limbs) { 
- mp_limb_t t[2*NUM_LIMBS]; - mpn_tdiv_qr(t, r->data, 0, r->data, r->limbs, m->data, m->limbs); - memset(t, 0, sizeof(t)); - r->limbs = m->limbs; - while (r->limbs > 1 && r->data[r->limbs-1]==0) { - r->limbs--; - } - } - - if (r->neg && (r->limbs > 1 || r->data[0] != 0)) { - secp256k1_num_sub_abs(r, m, r); - r->neg = 0; - } -} - -static void secp256k1_num_mod_inverse(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *m) { - int i; - mp_limb_t g[NUM_LIMBS+1]; - mp_limb_t u[NUM_LIMBS+1]; - mp_limb_t v[NUM_LIMBS+1]; - mp_size_t sn; - mp_size_t gn; - secp256k1_num_sanity(a); - secp256k1_num_sanity(m); - - /** mpn_gcdext computes: (G,S) = gcdext(U,V), where - * * G = gcd(U,V) - * * G = U*S + V*T - * * U has equal or more limbs than V, and V has no padding - * If we set U to be (a padded version of) a, and V = m: - * G = a*S + m*T - * G = a*S mod m - * Assuming G=1: - * S = 1/a mod m - */ - VERIFY_CHECK(m->limbs <= NUM_LIMBS); - VERIFY_CHECK(m->data[m->limbs-1] != 0); - for (i = 0; i < m->limbs; i++) { - u[i] = (i < a->limbs) ? a->data[i] : 0; - v[i] = m->data[i]; - } - sn = NUM_LIMBS+1; - gn = mpn_gcdext(g, r->data, &sn, u, m->limbs, v, m->limbs); - (void)gn; - VERIFY_CHECK(gn == 1); - VERIFY_CHECK(g[0] == 1); - r->neg = a->neg ^ m->neg; - if (sn < 0) { - mpn_sub(r->data, m->data, m->limbs, r->data, -sn); - r->limbs = m->limbs; - while (r->limbs > 1 && r->data[r->limbs-1]==0) { - r->limbs--; - } - } else { - r->limbs = sn; - } - memset(g, 0, sizeof(g)); - memset(u, 0, sizeof(u)); - memset(v, 0, sizeof(v)); -} - -static int secp256k1_num_jacobi(const secp256k1_num *a, const secp256k1_num *b) { - int ret; - mpz_t ga, gb; - secp256k1_num_sanity(a); - secp256k1_num_sanity(b); - VERIFY_CHECK(!b->neg && (b->limbs > 0) && (b->data[0] & 1)); - - mpz_inits(ga, gb, NULL); - - mpz_import(gb, b->limbs, -1, sizeof(mp_limb_t), 0, 0, b->data); - mpz_import(ga, a->limbs, -1, sizeof(mp_limb_t), 0, 0, a->data); - if (a->neg) { - mpz_neg(ga, ga); - } - - ret = mpz_jacobi(ga, gb); - - mpz_clears(ga, gb, NULL); - - return ret; -} - -static int secp256k1_num_is_one(const secp256k1_num *a) { - return (a->limbs == 1 && a->data[0] == 1); -} - -static int secp256k1_num_is_zero(const secp256k1_num *a) { - return (a->limbs == 1 && a->data[0] == 0); -} - -static int secp256k1_num_is_neg(const secp256k1_num *a) { - return (a->limbs > 1 || a->data[0] != 0) && a->neg; -} - -static int secp256k1_num_cmp(const secp256k1_num *a, const secp256k1_num *b) { - if (a->limbs > b->limbs) { - return 1; - } - if (a->limbs < b->limbs) { - return -1; - } - return mpn_cmp(a->data, b->data, a->limbs); -} - -static int secp256k1_num_eq(const secp256k1_num *a, const secp256k1_num *b) { - if (a->limbs > b->limbs) { - return 0; - } - if (a->limbs < b->limbs) { - return 0; - } - if ((a->neg && !secp256k1_num_is_zero(a)) != (b->neg && !secp256k1_num_is_zero(b))) { - return 0; - } - return mpn_cmp(a->data, b->data, a->limbs) == 0; -} - -static void secp256k1_num_subadd(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b, int bneg) { - if (!(b->neg ^ bneg ^ a->neg)) { /* a and b have the same sign */ - r->neg = a->neg; - if (a->limbs >= b->limbs) { - secp256k1_num_add_abs(r, a, b); - } else { - secp256k1_num_add_abs(r, b, a); - } - } else { - if (secp256k1_num_cmp(a, b) > 0) { - r->neg = a->neg; - secp256k1_num_sub_abs(r, a, b); - } else { - r->neg = b->neg ^ bneg; - secp256k1_num_sub_abs(r, b, a); - } - } -} - -static void secp256k1_num_add(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b) { - 
secp256k1_num_sanity(a); - secp256k1_num_sanity(b); - secp256k1_num_subadd(r, a, b, 0); -} - -static void secp256k1_num_sub(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b) { - secp256k1_num_sanity(a); - secp256k1_num_sanity(b); - secp256k1_num_subadd(r, a, b, 1); -} - -static void secp256k1_num_mul(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b) { - mp_limb_t tmp[2*NUM_LIMBS+1]; - secp256k1_num_sanity(a); - secp256k1_num_sanity(b); - - VERIFY_CHECK(a->limbs + b->limbs <= 2*NUM_LIMBS+1); - if ((a->limbs==1 && a->data[0]==0) || (b->limbs==1 && b->data[0]==0)) { - r->limbs = 1; - r->neg = 0; - r->data[0] = 0; - return; - } - if (a->limbs >= b->limbs) { - mpn_mul(tmp, a->data, a->limbs, b->data, b->limbs); - } else { - mpn_mul(tmp, b->data, b->limbs, a->data, a->limbs); - } - r->limbs = a->limbs + b->limbs; - if (r->limbs > 1 && tmp[r->limbs - 1]==0) { - r->limbs--; - } - VERIFY_CHECK(r->limbs <= 2*NUM_LIMBS); - mpn_copyi(r->data, tmp, r->limbs); - r->neg = a->neg ^ b->neg; - memset(tmp, 0, sizeof(tmp)); -} - -static void secp256k1_num_shift(secp256k1_num *r, int bits) { - if (bits % GMP_NUMB_BITS) { - /* Shift within limbs. */ - mpn_rshift(r->data, r->data, r->limbs, bits % GMP_NUMB_BITS); - } - if (bits >= GMP_NUMB_BITS) { - int i; - /* Shift full limbs. */ - for (i = 0; i < r->limbs; i++) { - int index = i + (bits / GMP_NUMB_BITS); - if (index < r->limbs && index < 2*NUM_LIMBS) { - r->data[i] = r->data[index]; - } else { - r->data[i] = 0; - } - } - } - while (r->limbs>1 && r->data[r->limbs-1]==0) { - r->limbs--; - } -} - -static void secp256k1_num_negate(secp256k1_num *r) { - r->neg ^= 1; -} - -#endif /* SECP256K1_NUM_REPR_IMPL_H */ diff --git a/src/num_impl.h b/src/num_impl.h deleted file mode 100644 index 880598e..0000000 --- a/src/num_impl.h +++ /dev/null @@ -1,24 +0,0 @@ -/*********************************************************************** - * Copyright (c) 2013, 2014 Pieter Wuille * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* - ***********************************************************************/ - -#ifndef SECP256K1_NUM_IMPL_H -#define SECP256K1_NUM_IMPL_H - -#if defined HAVE_CONFIG_H -#include "libsecp256k1-config.h" -#endif - -#include "num.h" - -#if defined(USE_NUM_GMP) -#include "num_gmp_impl.h" -#elif defined(USE_NUM_NONE) -/* Nothing. */ -#else -#error "Please select num implementation" -#endif - -#endif /* SECP256K1_NUM_IMPL_H */ diff --git a/src/scalar.h b/src/scalar.h index 0b737f9..aaaa3d8 100644 --- a/src/scalar.h +++ b/src/scalar.h @@ -7,7 +7,6 @@ #ifndef SECP256K1_SCALAR_H #define SECP256K1_SCALAR_H -#include "num.h" #include "util.h" #if defined HAVE_CONFIG_H @@ -63,9 +62,6 @@ static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a, * the low bits that were shifted off */ static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n); -/** Compute the square of a scalar (modulo the group order). */ -static void secp256k1_scalar_sqr(secp256k1_scalar *r, const secp256k1_scalar *a); - /** Compute the inverse of a scalar (modulo the group order). */ static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *a); @@ -91,14 +87,6 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar *a); * Returns -1 if the number was negated, 1 otherwise */ static int secp256k1_scalar_cond_negate(secp256k1_scalar *a, int flag); -#ifndef USE_NUM_NONE -/** Convert a scalar to a number. 
*/ -static void secp256k1_scalar_get_num(secp256k1_num *r, const secp256k1_scalar *a); - -/** Get the order of the group as a number. */ -static void secp256k1_scalar_order_get_num(secp256k1_num *r); -#endif - /** Compare two scalars. */ static int secp256k1_scalar_eq(const secp256k1_scalar *a, const secp256k1_scalar *b); diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h index 3eaa041..a1def26 100644 --- a/src/scalar_4x64_impl.h +++ b/src/scalar_4x64_impl.h @@ -7,6 +7,8 @@ #ifndef SECP256K1_SCALAR_REPR_IMPL_H #define SECP256K1_SCALAR_REPR_IMPL_H +#include "modinv64_impl.h" + /* Limbs of the secp256k1 order. */ #define SECP256K1_N_0 ((uint64_t)0xBFD25E8CD0364141ULL) #define SECP256K1_N_1 ((uint64_t)0xBAAEDCE6AF48A03BULL) @@ -212,28 +214,6 @@ static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) { VERIFY_CHECK(c1 >= th); \ } -/** Add 2*a*b to the number defined by (c0,c1,c2). c2 must never overflow. */ -#define muladd2(a,b) { \ - uint64_t tl, th, th2, tl2; \ - { \ - uint128_t t = (uint128_t)a * b; \ - th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \ - tl = t; \ - } \ - th2 = th + th; /* at most 0xFFFFFFFFFFFFFFFE (in case th was 0x7FFFFFFFFFFFFFFF) */ \ - c2 += (th2 < th); /* never overflows by contract (verified the next line) */ \ - VERIFY_CHECK((th2 >= th) || (c2 != 0)); \ - tl2 = tl + tl; /* at most 0xFFFFFFFFFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFFFFFFFFFF) */ \ - th2 += (tl2 < tl); /* at most 0xFFFFFFFFFFFFFFFF */ \ - c0 += tl2; /* overflow is handled on the next line */ \ - th2 += (c0 < tl2); /* second overflow is handled on the next line */ \ - c2 += (c0 < tl2) & (th2 == 0); /* never overflows by contract (verified the next line) */ \ - VERIFY_CHECK((c0 >= tl2) || (th2 != 0) || (c2 != 0)); \ - c1 += th2; /* overflow is handled on the next line */ \ - c2 += (c1 < th2); /* never overflows by contract (verified the next line) */ \ - VERIFY_CHECK((c1 >= th2) || (c2 != 0)); \ -} - /** Add a to the number defined by (c0,c1,c2). c2 must never overflow. 
*/ #define sumadd(a) { \ unsigned int over; \ @@ -743,148 +723,10 @@ static void secp256k1_scalar_mul_512(uint64_t l[8], const secp256k1_scalar *a, c #endif } -static void secp256k1_scalar_sqr_512(uint64_t l[8], const secp256k1_scalar *a) { -#ifdef USE_ASM_X86_64 - __asm__ __volatile__( - /* Preload */ - "movq 0(%%rdi), %%r11\n" - "movq 8(%%rdi), %%r12\n" - "movq 16(%%rdi), %%r13\n" - "movq 24(%%rdi), %%r14\n" - /* (rax,rdx) = a0 * a0 */ - "movq %%r11, %%rax\n" - "mulq %%r11\n" - /* Extract l0 */ - "movq %%rax, 0(%%rsi)\n" - /* (r8,r9,r10) = (rdx,0) */ - "movq %%rdx, %%r8\n" - "xorq %%r9, %%r9\n" - "xorq %%r10, %%r10\n" - /* (r8,r9,r10) += 2 * a0 * a1 */ - "movq %%r11, %%rax\n" - "mulq %%r12\n" - "addq %%rax, %%r8\n" - "adcq %%rdx, %%r9\n" - "adcq $0, %%r10\n" - "addq %%rax, %%r8\n" - "adcq %%rdx, %%r9\n" - "adcq $0, %%r10\n" - /* Extract l1 */ - "movq %%r8, 8(%%rsi)\n" - "xorq %%r8, %%r8\n" - /* (r9,r10,r8) += 2 * a0 * a2 */ - "movq %%r11, %%rax\n" - "mulq %%r13\n" - "addq %%rax, %%r9\n" - "adcq %%rdx, %%r10\n" - "adcq $0, %%r8\n" - "addq %%rax, %%r9\n" - "adcq %%rdx, %%r10\n" - "adcq $0, %%r8\n" - /* (r9,r10,r8) += a1 * a1 */ - "movq %%r12, %%rax\n" - "mulq %%r12\n" - "addq %%rax, %%r9\n" - "adcq %%rdx, %%r10\n" - "adcq $0, %%r8\n" - /* Extract l2 */ - "movq %%r9, 16(%%rsi)\n" - "xorq %%r9, %%r9\n" - /* (r10,r8,r9) += 2 * a0 * a3 */ - "movq %%r11, %%rax\n" - "mulq %%r14\n" - "addq %%rax, %%r10\n" - "adcq %%rdx, %%r8\n" - "adcq $0, %%r9\n" - "addq %%rax, %%r10\n" - "adcq %%rdx, %%r8\n" - "adcq $0, %%r9\n" - /* (r10,r8,r9) += 2 * a1 * a2 */ - "movq %%r12, %%rax\n" - "mulq %%r13\n" - "addq %%rax, %%r10\n" - "adcq %%rdx, %%r8\n" - "adcq $0, %%r9\n" - "addq %%rax, %%r10\n" - "adcq %%rdx, %%r8\n" - "adcq $0, %%r9\n" - /* Extract l3 */ - "movq %%r10, 24(%%rsi)\n" - "xorq %%r10, %%r10\n" - /* (r8,r9,r10) += 2 * a1 * a3 */ - "movq %%r12, %%rax\n" - "mulq %%r14\n" - "addq %%rax, %%r8\n" - "adcq %%rdx, %%r9\n" - "adcq $0, %%r10\n" - "addq %%rax, %%r8\n" - "adcq %%rdx, %%r9\n" - "adcq $0, %%r10\n" - /* (r8,r9,r10) += a2 * a2 */ - "movq %%r13, %%rax\n" - "mulq %%r13\n" - "addq %%rax, %%r8\n" - "adcq %%rdx, %%r9\n" - "adcq $0, %%r10\n" - /* Extract l4 */ - "movq %%r8, 32(%%rsi)\n" - "xorq %%r8, %%r8\n" - /* (r9,r10,r8) += 2 * a2 * a3 */ - "movq %%r13, %%rax\n" - "mulq %%r14\n" - "addq %%rax, %%r9\n" - "adcq %%rdx, %%r10\n" - "adcq $0, %%r8\n" - "addq %%rax, %%r9\n" - "adcq %%rdx, %%r10\n" - "adcq $0, %%r8\n" - /* Extract l5 */ - "movq %%r9, 40(%%rsi)\n" - /* (r10,r8) += a3 * a3 */ - "movq %%r14, %%rax\n" - "mulq %%r14\n" - "addq %%rax, %%r10\n" - "adcq %%rdx, %%r8\n" - /* Extract l6 */ - "movq %%r10, 48(%%rsi)\n" - /* Extract l7 */ - "movq %%r8, 56(%%rsi)\n" - : - : "S"(l), "D"(a->d) - : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "cc", "memory"); -#else - /* 160 bit accumulator. */ - uint64_t c0 = 0, c1 = 0; - uint32_t c2 = 0; - - /* l[0..7] = a[0..3] * b[0..3]. 
*/ - muladd_fast(a->d[0], a->d[0]); - extract_fast(l[0]); - muladd2(a->d[0], a->d[1]); - extract(l[1]); - muladd2(a->d[0], a->d[2]); - muladd(a->d[1], a->d[1]); - extract(l[2]); - muladd2(a->d[0], a->d[3]); - muladd2(a->d[1], a->d[2]); - extract(l[3]); - muladd2(a->d[1], a->d[3]); - muladd(a->d[2], a->d[2]); - extract(l[4]); - muladd2(a->d[2], a->d[3]); - extract(l[5]); - muladd_fast(a->d[3], a->d[3]); - extract_fast(l[6]); - VERIFY_CHECK(c1 == 0); - l[7] = c0; -#endif -} - #undef sumadd #undef sumadd_fast #undef muladd #undef muladd_fast -#undef muladd2 #undef extract #undef extract_fast @@ -906,12 +748,6 @@ static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) { return ret; } -static void secp256k1_scalar_sqr(secp256k1_scalar *r, const secp256k1_scalar *a) { - uint64_t l[8]; - secp256k1_scalar_sqr_512(l, a); - secp256k1_scalar_reduce_512(r, l); -} - static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) { r1->d[0] = k->d[0]; r1->d[1] = k->d[1]; @@ -955,4 +791,78 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se r->d[3] = (r->d[3] & mask0) | (a->d[3] & mask1); } +static void secp256k1_scalar_from_signed62(secp256k1_scalar *r, const secp256k1_modinv64_signed62 *a) { + const uint64_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4]; + + /* The output from secp256k1_modinv64{_var} should be normalized to range [0,modulus), and + * have limbs in [0,2^62). The modulus is < 2^256, so the top limb must be below 2^(256-62*4). + */ + VERIFY_CHECK(a0 >> 62 == 0); + VERIFY_CHECK(a1 >> 62 == 0); + VERIFY_CHECK(a2 >> 62 == 0); + VERIFY_CHECK(a3 >> 62 == 0); + VERIFY_CHECK(a4 >> 8 == 0); + + r->d[0] = a0 | a1 << 62; + r->d[1] = a1 >> 2 | a2 << 60; + r->d[2] = a2 >> 4 | a3 << 58; + r->d[3] = a3 >> 6 | a4 << 56; + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0); +#endif +} + +static void secp256k1_scalar_to_signed62(secp256k1_modinv64_signed62 *r, const secp256k1_scalar *a) { + const uint64_t M62 = UINT64_MAX >> 2; + const uint64_t a0 = a->d[0], a1 = a->d[1], a2 = a->d[2], a3 = a->d[3]; + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); +#endif + + r->v[0] = a0 & M62; + r->v[1] = (a0 >> 62 | a1 << 2) & M62; + r->v[2] = (a1 >> 60 | a2 << 4) & M62; + r->v[3] = (a2 >> 58 | a3 << 6) & M62; + r->v[4] = a3 >> 56; +} + +static const secp256k1_modinv64_modinfo secp256k1_const_modinfo_scalar = { + {{0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, -0x15LL, 0, 256}}, + 0x34F20099AA774EC1LL +}; + +static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { + secp256k1_modinv64_signed62 s; +#ifdef VERIFY + int zero_in = secp256k1_scalar_is_zero(x); +#endif + secp256k1_scalar_to_signed62(&s, x); + secp256k1_modinv64(&s, &secp256k1_const_modinfo_scalar); + secp256k1_scalar_from_signed62(r, &s); + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_is_zero(r) == zero_in); +#endif +} + +static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { + secp256k1_modinv64_signed62 s; +#ifdef VERIFY + int zero_in = secp256k1_scalar_is_zero(x); +#endif + secp256k1_scalar_to_signed62(&s, x); + secp256k1_modinv64_var(&s, &secp256k1_const_modinfo_scalar); + secp256k1_scalar_from_signed62(r, &s); + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_is_zero(r) == zero_in); +#endif +} + +SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { + return !(a->d[0] & 1); +} + #endif /* 
SECP256K1_SCALAR_REPR_IMPL_H */ diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h index bf98e01..62c7ae7 100644 --- a/src/scalar_8x32_impl.h +++ b/src/scalar_8x32_impl.h @@ -7,6 +7,8 @@ #ifndef SECP256K1_SCALAR_REPR_IMPL_H #define SECP256K1_SCALAR_REPR_IMPL_H +#include "modinv32_impl.h" + /* Limbs of the secp256k1 order. */ #define SECP256K1_N_0 ((uint32_t)0xD0364141UL) #define SECP256K1_N_1 ((uint32_t)0xBFD25E8CUL) @@ -291,28 +293,6 @@ static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) { VERIFY_CHECK(c1 >= th); \ } -/** Add 2*a*b to the number defined by (c0,c1,c2). c2 must never overflow. */ -#define muladd2(a,b) { \ - uint32_t tl, th, th2, tl2; \ - { \ - uint64_t t = (uint64_t)a * b; \ - th = t >> 32; /* at most 0xFFFFFFFE */ \ - tl = t; \ - } \ - th2 = th + th; /* at most 0xFFFFFFFE (in case th was 0x7FFFFFFF) */ \ - c2 += (th2 < th); /* never overflows by contract (verified the next line) */ \ - VERIFY_CHECK((th2 >= th) || (c2 != 0)); \ - tl2 = tl + tl; /* at most 0xFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFF) */ \ - th2 += (tl2 < tl); /* at most 0xFFFFFFFF */ \ - c0 += tl2; /* overflow is handled on the next line */ \ - th2 += (c0 < tl2); /* second overflow is handled on the next line */ \ - c2 += (c0 < tl2) & (th2 == 0); /* never overflows by contract (verified the next line) */ \ - VERIFY_CHECK((c0 >= tl2) || (th2 != 0) || (c2 != 0)); \ - c1 += th2; /* overflow is handled on the next line */ \ - c2 += (c1 < th2); /* never overflows by contract (verified the next line) */ \ - VERIFY_CHECK((c1 >= th2) || (c2 != 0)); \ -} - /** Add a to the number defined by (c0,c1,c2). c2 must never overflow. */ #define sumadd(a) { \ unsigned int over; \ @@ -576,71 +556,10 @@ static void secp256k1_scalar_mul_512(uint32_t *l, const secp256k1_scalar *a, con l[15] = c0; } -static void secp256k1_scalar_sqr_512(uint32_t *l, const secp256k1_scalar *a) { - /* 96 bit accumulator. */ - uint32_t c0 = 0, c1 = 0, c2 = 0; - - /* l[0..15] = a[0..7]^2. 
*/ - muladd_fast(a->d[0], a->d[0]); - extract_fast(l[0]); - muladd2(a->d[0], a->d[1]); - extract(l[1]); - muladd2(a->d[0], a->d[2]); - muladd(a->d[1], a->d[1]); - extract(l[2]); - muladd2(a->d[0], a->d[3]); - muladd2(a->d[1], a->d[2]); - extract(l[3]); - muladd2(a->d[0], a->d[4]); - muladd2(a->d[1], a->d[3]); - muladd(a->d[2], a->d[2]); - extract(l[4]); - muladd2(a->d[0], a->d[5]); - muladd2(a->d[1], a->d[4]); - muladd2(a->d[2], a->d[3]); - extract(l[5]); - muladd2(a->d[0], a->d[6]); - muladd2(a->d[1], a->d[5]); - muladd2(a->d[2], a->d[4]); - muladd(a->d[3], a->d[3]); - extract(l[6]); - muladd2(a->d[0], a->d[7]); - muladd2(a->d[1], a->d[6]); - muladd2(a->d[2], a->d[5]); - muladd2(a->d[3], a->d[4]); - extract(l[7]); - muladd2(a->d[1], a->d[7]); - muladd2(a->d[2], a->d[6]); - muladd2(a->d[3], a->d[5]); - muladd(a->d[4], a->d[4]); - extract(l[8]); - muladd2(a->d[2], a->d[7]); - muladd2(a->d[3], a->d[6]); - muladd2(a->d[4], a->d[5]); - extract(l[9]); - muladd2(a->d[3], a->d[7]); - muladd2(a->d[4], a->d[6]); - muladd(a->d[5], a->d[5]); - extract(l[10]); - muladd2(a->d[4], a->d[7]); - muladd2(a->d[5], a->d[6]); - extract(l[11]); - muladd2(a->d[5], a->d[7]); - muladd(a->d[6], a->d[6]); - extract(l[12]); - muladd2(a->d[6], a->d[7]); - extract(l[13]); - muladd_fast(a->d[7], a->d[7]); - extract_fast(l[14]); - VERIFY_CHECK(c1 == 0); - l[15] = c0; -} - #undef sumadd #undef sumadd_fast #undef muladd #undef muladd_fast -#undef muladd2 #undef extract #undef extract_fast @@ -666,12 +585,6 @@ static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) { return ret; } -static void secp256k1_scalar_sqr(secp256k1_scalar *r, const secp256k1_scalar *a) { - uint32_t l[16]; - secp256k1_scalar_sqr_512(l, a); - secp256k1_scalar_reduce_512(r, l); -} - static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) { r1->d[0] = k->d[0]; r1->d[1] = k->d[1]; @@ -731,4 +644,92 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se r->d[7] = (r->d[7] & mask0) | (a->d[7] & mask1); } +static void secp256k1_scalar_from_signed30(secp256k1_scalar *r, const secp256k1_modinv32_signed30 *a) { + const uint32_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4], + a5 = a->v[5], a6 = a->v[6], a7 = a->v[7], a8 = a->v[8]; + + /* The output from secp256k1_modinv32{_var} should be normalized to range [0,modulus), and + * have limbs in [0,2^30). The modulus is < 2^256, so the top limb must be below 2^(256-30*8). 
+ */ + VERIFY_CHECK(a0 >> 30 == 0); + VERIFY_CHECK(a1 >> 30 == 0); + VERIFY_CHECK(a2 >> 30 == 0); + VERIFY_CHECK(a3 >> 30 == 0); + VERIFY_CHECK(a4 >> 30 == 0); + VERIFY_CHECK(a5 >> 30 == 0); + VERIFY_CHECK(a6 >> 30 == 0); + VERIFY_CHECK(a7 >> 30 == 0); + VERIFY_CHECK(a8 >> 16 == 0); + + r->d[0] = a0 | a1 << 30; + r->d[1] = a1 >> 2 | a2 << 28; + r->d[2] = a2 >> 4 | a3 << 26; + r->d[3] = a3 >> 6 | a4 << 24; + r->d[4] = a4 >> 8 | a5 << 22; + r->d[5] = a5 >> 10 | a6 << 20; + r->d[6] = a6 >> 12 | a7 << 18; + r->d[7] = a7 >> 14 | a8 << 16; + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0); +#endif +} + +static void secp256k1_scalar_to_signed30(secp256k1_modinv32_signed30 *r, const secp256k1_scalar *a) { + const uint32_t M30 = UINT32_MAX >> 2; + const uint32_t a0 = a->d[0], a1 = a->d[1], a2 = a->d[2], a3 = a->d[3], + a4 = a->d[4], a5 = a->d[5], a6 = a->d[6], a7 = a->d[7]; + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0); +#endif + + r->v[0] = a0 & M30; + r->v[1] = (a0 >> 30 | a1 << 2) & M30; + r->v[2] = (a1 >> 28 | a2 << 4) & M30; + r->v[3] = (a2 >> 26 | a3 << 6) & M30; + r->v[4] = (a3 >> 24 | a4 << 8) & M30; + r->v[5] = (a4 >> 22 | a5 << 10) & M30; + r->v[6] = (a5 >> 20 | a6 << 12) & M30; + r->v[7] = (a6 >> 18 | a7 << 14) & M30; + r->v[8] = a7 >> 16; +} + +static const secp256k1_modinv32_modinfo secp256k1_const_modinfo_scalar = { + {{0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, -0x146L, 0, 0, 0, 65536}}, + 0x2A774EC1L +}; + +static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { + secp256k1_modinv32_signed30 s; +#ifdef VERIFY + int zero_in = secp256k1_scalar_is_zero(x); +#endif + secp256k1_scalar_to_signed30(&s, x); + secp256k1_modinv32(&s, &secp256k1_const_modinfo_scalar); + secp256k1_scalar_from_signed30(r, &s); + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_is_zero(r) == zero_in); +#endif +} + +static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { + secp256k1_modinv32_signed30 s; +#ifdef VERIFY + int zero_in = secp256k1_scalar_is_zero(x); +#endif + secp256k1_scalar_to_signed30(&s, x); + secp256k1_modinv32_var(&s, &secp256k1_const_modinfo_scalar); + secp256k1_scalar_from_signed30(r, &s); + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_scalar_is_zero(r) == zero_in); +#endif +} + +SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { + return !(a->d[0] & 1); +} + #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ diff --git a/src/scalar_impl.h b/src/scalar_impl.h index 61c1fbd..e124474 100644 --- a/src/scalar_impl.h +++ b/src/scalar_impl.h @@ -31,231 +31,12 @@ static const secp256k1_scalar secp256k1_scalar_one = SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 1); static const secp256k1_scalar secp256k1_scalar_zero = SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 0); -#ifndef USE_NUM_NONE -static void secp256k1_scalar_get_num(secp256k1_num *r, const secp256k1_scalar *a) { - unsigned char c[32]; - secp256k1_scalar_get_b32(c, a); - secp256k1_num_set_bin(r, c, 32); -} - -/** secp256k1 curve order, see secp256k1_ecdsa_const_order_as_fe in ecdsa_impl.h */ -static void secp256k1_scalar_order_get_num(secp256k1_num *r) { -#if defined(EXHAUSTIVE_TEST_ORDER) - static const unsigned char order[32] = { - 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,EXHAUSTIVE_TEST_ORDER - }; -#else - static const unsigned char order[32] = { - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE, - 0xBA,0xAE,0xDC,0xE6,0xAF,0x48,0xA0,0x3B, - 
0xBF,0xD2,0x5E,0x8C,0xD0,0x36,0x41,0x41 - }; -#endif - secp256k1_num_set_bin(r, order, 32); -} -#endif - static int secp256k1_scalar_set_b32_seckey(secp256k1_scalar *r, const unsigned char *bin) { int overflow; secp256k1_scalar_set_b32(r, bin, &overflow); return (!overflow) & (!secp256k1_scalar_is_zero(r)); } -static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { -#if defined(EXHAUSTIVE_TEST_ORDER) - int i; - *r = 0; - for (i = 0; i < EXHAUSTIVE_TEST_ORDER; i++) - if ((i * *x) % EXHAUSTIVE_TEST_ORDER == 1) - *r = i; - /* If this VERIFY_CHECK triggers we were given a noninvertible scalar (and thus - * have a composite group order; fix it in exhaustive_tests.c). */ - VERIFY_CHECK(*r != 0); -} -#else - secp256k1_scalar *t; - int i; - /* First compute xN as x ^ (2^N - 1) for some values of N, - * and uM as x ^ M for some values of M. */ - secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126; - secp256k1_scalar u2, u5, u9, u11, u13; - - secp256k1_scalar_sqr(&u2, x); - secp256k1_scalar_mul(&x2, &u2, x); - secp256k1_scalar_mul(&u5, &u2, &x2); - secp256k1_scalar_mul(&x3, &u5, &u2); - secp256k1_scalar_mul(&u9, &x3, &u2); - secp256k1_scalar_mul(&u11, &u9, &u2); - secp256k1_scalar_mul(&u13, &u11, &u2); - - secp256k1_scalar_sqr(&x6, &u13); - secp256k1_scalar_sqr(&x6, &x6); - secp256k1_scalar_mul(&x6, &x6, &u11); - - secp256k1_scalar_sqr(&x8, &x6); - secp256k1_scalar_sqr(&x8, &x8); - secp256k1_scalar_mul(&x8, &x8, &x2); - - secp256k1_scalar_sqr(&x14, &x8); - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(&x14, &x14); - } - secp256k1_scalar_mul(&x14, &x14, &x6); - - secp256k1_scalar_sqr(&x28, &x14); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x28, &x28); - } - secp256k1_scalar_mul(&x28, &x28, &x14); - - secp256k1_scalar_sqr(&x56, &x28); - for (i = 0; i < 27; i++) { - secp256k1_scalar_sqr(&x56, &x56); - } - secp256k1_scalar_mul(&x56, &x56, &x28); - - secp256k1_scalar_sqr(&x112, &x56); - for (i = 0; i < 55; i++) { - secp256k1_scalar_sqr(&x112, &x112); - } - secp256k1_scalar_mul(&x112, &x112, &x56); - - secp256k1_scalar_sqr(&x126, &x112); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x126, &x126); - } - secp256k1_scalar_mul(&x126, &x126, &x14); - - /* Then accumulate the final result (t starts at x126). 
*/ - t = &x126; - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 10; i++) { /* 0000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 9; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x8); /* 11111111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x2); /* 11 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 10; i++) { /* 000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, x); /* 1 */ - for (i = 0; i < 8; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(r, t, &x6); /* 111111 */ -} - -SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { - return !(a->d[0] & 1); -} -#endif - -static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { -#if defined(USE_SCALAR_INV_BUILTIN) - secp256k1_scalar_inverse(r, x); -#elif defined(USE_SCALAR_INV_NUM) - unsigned char b[32]; - secp256k1_num n, m; - secp256k1_scalar t = *x; - secp256k1_scalar_get_b32(b, &t); - secp256k1_num_set_bin(&n, b, 32); - secp256k1_scalar_order_get_num(&m); - secp256k1_num_mod_inverse(&n, &n, &m); - secp256k1_num_get_bin(b, 32, &n); - secp256k1_scalar_set_b32(r, b, NULL); - /* Verify that the inverse was computed correctly, without GMP code. 
*/ - secp256k1_scalar_mul(&t, &t, r); - CHECK(secp256k1_scalar_is_one(&t)); -#else -#error "Please select scalar inverse implementation" -#endif -} - /* These parameters are generated using sage/gen_exhaustive_groups.sage. */ #if defined(EXHAUSTIVE_TEST_ORDER) # if EXHAUSTIVE_TEST_ORDER == 13 diff --git a/src/scalar_low_impl.h b/src/scalar_low_impl.h index 98ffd15..7176f0b 100644 --- a/src/scalar_low_impl.h +++ b/src/scalar_low_impl.h @@ -104,10 +104,6 @@ static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) { return ret; } -static void secp256k1_scalar_sqr(secp256k1_scalar *r, const secp256k1_scalar *a) { - *r = (*a * *a) % EXHAUSTIVE_TEST_ORDER; -} - static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *a) { *r1 = *a; *r2 = 0; @@ -125,4 +121,19 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se *r = (*r & mask0) | (*a & mask1); } +static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { + int i; + *r = 0; + for (i = 0; i < EXHAUSTIVE_TEST_ORDER; i++) + if ((i * *x) % EXHAUSTIVE_TEST_ORDER == 1) + *r = i; + /* If this VERIFY_CHECK triggers we were given a noninvertible scalar (and thus + * have a composite group order; fix it in exhaustive_tests.c). */ + VERIFY_CHECK(*r != 0); +} + +static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { + secp256k1_scalar_inverse(r, x); +} + #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ diff --git a/src/secp256k1.c b/src/secp256k1.c index 4e1a507..aef3f99 100644 --- a/src/secp256k1.c +++ b/src/secp256k1.c @@ -9,7 +9,6 @@ #include "assumptions.h" #include "util.h" -#include "num_impl.h" #include "field_impl.h" #include "scalar_impl.h" #include "group_impl.h" @@ -763,10 +762,6 @@ int secp256k1_ec_pubkey_combine(const secp256k1_context* ctx, secp256k1_pubkey * # include "modules/ecdh/main_impl.h" #endif -#ifdef ENABLE_MODULE_MULTISET -# include "modules/multiset/main_impl.h" -#endif - #ifdef ENABLE_MODULE_RECOVERY # include "modules/recovery/main_impl.h" #endif diff --git a/src/tests.c b/src/tests.c index 9304962..ba645bb 100644 --- a/src/tests.c +++ b/src/tests.c @@ -18,6 +18,7 @@ #include "include/secp256k1.h" #include "include/secp256k1_preallocated.h" #include "testrand_impl.h" +#include "util.h" #ifdef ENABLE_OPENSSL_TESTS #include "openssl/bn.h" @@ -32,6 +33,11 @@ void ECDSA_SIG_get0(const ECDSA_SIG *sig, const BIGNUM **pr, const BIGNUM **ps) #include "contrib/lax_der_parsing.c" #include "contrib/lax_der_privatekey_parsing.c" +#include "modinv32_impl.h" +#ifdef SECP256K1_WIDEMUL_INT128 +#include "modinv64_impl.h" +#endif + static int count = 64; static secp256k1_context *ctx = NULL; @@ -416,6 +422,25 @@ void run_scratch_tests(void) { secp256k1_context_destroy(none); } +void run_ctz_tests(void) { + static const uint32_t b32[] = {1, 0xffffffff, 0x5e56968f, 0xe0d63129}; + static const uint64_t b64[] = {1, 0xffffffffffffffff, 0xbcd02462139b3fc3, 0x98b5f80c769693ef}; + int shift; + unsigned i; + for (i = 0; i < sizeof(b32) / sizeof(b32[0]); ++i) { + for (shift = 0; shift < 32; ++shift) { + CHECK(secp256k1_ctz32_var_debruijn(b32[i] << shift) == shift); + CHECK(secp256k1_ctz32_var(b32[i] << shift) == shift); + } + } + for (i = 0; i < sizeof(b64) / sizeof(b64[0]); ++i) { + for (shift = 0; shift < 64; ++shift) { + CHECK(secp256k1_ctz64_var_debruijn(b64[i] << shift) == shift); + CHECK(secp256k1_ctz64_var(b64[i] << shift) == shift); + } + } +} + /***** HASH TESTS *****/ void run_sha256_tests(void) { @@ 
-611,202 +636,448 @@ void run_rand_int(void) { } } -/***** NUM TESTS *****/ +/***** MODINV TESTS *****/ + +/* Compute the modular inverse of (odd) x mod 2^64. */ +uint64_t modinv2p64(uint64_t x) { + /* If w = 1/x mod 2^(2^L), then w*(2 - w*x) = 1/x mod 2^(2^(L+1)). See + * Hacker's Delight second edition, Henry S. Warren, Jr., pages 245-247 for + * why. Start with L=0, for which it is true for every odd x that + * 1/x=1 mod 2. Iterating 6 times gives us 1/x mod 2^64. */ + int l; + uint64_t w = 1; + CHECK(x & 1); + for (l = 0; l < 6; ++l) w *= (2 - w*x); + return w; +} -#ifndef USE_NUM_NONE -void random_num_negate(secp256k1_num *num) { - if (secp256k1_testrand_bits(1)) { - secp256k1_num_negate(num); +/* compute out = (a*b) mod m; if b=NULL, treat b=1. + * + * Out is a 512-bit number (represented as 32 uint16_t's in LE order). The other + * arguments are 256-bit numbers (represented as 16 uint16_t's in LE order). */ +void mulmod256(uint16_t* out, const uint16_t* a, const uint16_t* b, const uint16_t* m) { + uint16_t mul[32]; + uint64_t c = 0; + int i, j; + int m_bitlen = 0; + int mul_bitlen = 0; + + if (b != NULL) { + /* Compute the product of a and b, and put it in mul. */ + for (i = 0; i < 32; ++i) { + for (j = i <= 15 ? 0 : i - 15; j <= i && j <= 15; j++) { + c += (uint64_t)a[j] * b[i - j]; + } + mul[i] = c & 0xFFFF; + c >>= 16; + } + CHECK(c == 0); + + /* compute the highest set bit in mul */ + for (i = 511; i >= 0; --i) { + if ((mul[i >> 4] >> (i & 15)) & 1) { + mul_bitlen = i; + break; + } + } + } else { + /* if b==NULL, set mul=a. */ + memcpy(mul, a, 32); + memset(mul + 16, 0, 32); + /* compute the highest set bit in mul */ + for (i = 255; i >= 0; --i) { + if ((mul[i >> 4] >> (i & 15)) & 1) { + mul_bitlen = i; + break; + } + } } -void random_num_order_test(secp256k1_num *num) { - secp256k1_scalar sc; - random_scalar_order_test(&sc); - secp256k1_scalar_get_num(num, &sc); + /* Compute the highest set bit in m. */ + for (i = 255; i >= 0; --i) { + if ((m[i >> 4] >> (i & 15)) & 1) { + m_bitlen = i; + break; + } + } + + /* Try do mul -= m<<i, for i going down to 0, whenever the result is not negative */ + for (i = mul_bitlen - m_bitlen; i >= 0; --i) { + uint16_t mul2[32]; + int64_t cs; + + /* Compute mul2 = mul - m<<i. */ + cs = 0; /* accumulator */ + for (j = 0; j < 32; ++j) { /* j loops over the output limbs in mul2. */ + /* Compute sub: the 16 bits of m that get subtracted from limb j. */ + uint16_t sub = 0; + int p; + for (p = 0; p < 16; ++p) { /* p loops over the bit positions in mul2[j]. */ + int bitpos = j * 16 + p - i; /* bit of m that lines up with bit p of limb j */ + if (bitpos >= 0 && bitpos < 256) { + sub |= ((m[bitpos >> 4] >> (bitpos & 15)) & 1) << p; + } + } + /* Add mul[j]-sub to accumulator, and shift bottom 16 bits out to mul2[j]. */ + cs += mul[j]; + cs -= sub; + mul2[j] = (cs & 0xFFFF); + cs >>= 16; + } + /* If remainder of subtraction is 0, set mul = mul2. */ + if (cs == 0) { + memcpy(mul, mul2, sizeof(mul)); + } + } + /* Sanity check: test that all limbs higher than m's highest are zero */ + for (i = (m_bitlen >> 4) + 1; i < 32; ++i) { + CHECK(mul[i] == 0); + } + memcpy(out, mul, 32); } -void random_num_order(secp256k1_num *num) { - secp256k1_scalar sc; - random_scalar_order(&sc); - secp256k1_scalar_get_num(num, &sc); +/* Convert a 256-bit number represented as 16 uint16_t's to signed30 notation.
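As a quick standalone illustration of the Hensel-lifting identity that modinv2p64 relies on (if w is correct mod 2^(2^L), then w*(2 - w*x) is correct mod 2^(2^(L+1))), the toy program below checks it for the low 64 bits of the secp256k1 group order; main() and the assert are only for this example:

    #include <assert.h>
    #include <stdint.h>
    int main(void) {
        uint64_t x = 0xBFD25E8CD0364141ULL;       /* odd: low 64 bits of the group order */
        uint64_t w = 1;                           /* 1 inverts any odd x mod 2 */
        int l;
        for (l = 0; l < 6; ++l) w *= (2 - w * x); /* each round doubles the correct low bits */
        assert(w * x == 1);                       /* uint64_t arithmetic is mod 2^64 */
        return 0;
    }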
*/ +void uint16_to_signed30(secp256k1_modinv32_signed30* out, const uint16_t* in) { + int i; + memset(out->v, 0, sizeof(out->v)); + for (i = 0; i < 256; ++i) { + out->v[i / 30] |= (int32_t)(((in[i >> 4]) >> (i & 15)) & 1) << (i % 30); + } } -void test_num_negate(void) { - secp256k1_num n1; - secp256k1_num n2; - random_num_order_test(&n1); /* n1 = R */ - random_num_negate(&n1); - secp256k1_num_copy(&n2, &n1); /* n2 = R */ - secp256k1_num_sub(&n1, &n2, &n1); /* n1 = n2-n1 = 0 */ - CHECK(secp256k1_num_is_zero(&n1)); - secp256k1_num_copy(&n1, &n2); /* n1 = R */ - secp256k1_num_negate(&n1); /* n1 = -R */ - CHECK(!secp256k1_num_is_zero(&n1)); - secp256k1_num_add(&n1, &n2, &n1); /* n1 = n2+n1 = 0 */ - CHECK(secp256k1_num_is_zero(&n1)); - secp256k1_num_copy(&n1, &n2); /* n1 = R */ - secp256k1_num_negate(&n1); /* n1 = -R */ - CHECK(secp256k1_num_is_neg(&n1) != secp256k1_num_is_neg(&n2)); - secp256k1_num_negate(&n1); /* n1 = R */ - CHECK(secp256k1_num_eq(&n1, &n2)); +/* Convert a 256-bit number in signed30 notation to a representation as 16 uint16_t's. */ +void signed30_to_uint16(uint16_t* out, const secp256k1_modinv32_signed30* in) { + int i; + memset(out, 0, 32); + for (i = 0; i < 256; ++i) { + out[i >> 4] |= (((in->v[i / 30]) >> (i % 30)) & 1) << (i & 15); + } } -void test_num_add_sub(void) { +/* Randomly mutate the sign of limbs in signed30 representation, without changing the value. */ +void mutate_sign_signed30(secp256k1_modinv32_signed30* x) { int i; - secp256k1_scalar s; - secp256k1_num n1; - secp256k1_num n2; - secp256k1_num n1p2, n2p1, n1m2, n2m1; - random_num_order_test(&n1); /* n1 = R1 */ - if (secp256k1_testrand_bits(1)) { - random_num_negate(&n1); + for (i = 0; i < 16; ++i) { + int pos = secp256k1_testrand_int(8); + if (x->v[pos] > 0 && x->v[pos + 1] <= 0x3fffffff) { + x->v[pos] -= 0x40000000; + x->v[pos + 1] += 1; + } else if (x->v[pos] < 0 && x->v[pos + 1] >= 0x3fffffff) { + x->v[pos] += 0x40000000; + x->v[pos + 1] -= 1; + } } - random_num_order_test(&n2); /* n2 = R2 */ - if (secp256k1_testrand_bits(1)) { - random_num_negate(&n2); - } - secp256k1_num_add(&n1p2, &n1, &n2); /* n1p2 = R1 + R2 */ - secp256k1_num_add(&n2p1, &n2, &n1); /* n2p1 = R2 + R1 */ - secp256k1_num_sub(&n1m2, &n1, &n2); /* n1m2 = R1 - R2 */ - secp256k1_num_sub(&n2m1, &n2, &n1); /* n2m1 = R2 - R1 */ - CHECK(secp256k1_num_eq(&n1p2, &n2p1)); - CHECK(!secp256k1_num_eq(&n1p2, &n1m2)); - secp256k1_num_negate(&n2m1); /* n2m1 = -R2 + R1 */ - CHECK(secp256k1_num_eq(&n2m1, &n1m2)); - CHECK(!secp256k1_num_eq(&n2m1, &n1)); - secp256k1_num_add(&n2m1, &n2m1, &n2); /* n2m1 = -R2 + R1 + R2 = R1 */ - CHECK(secp256k1_num_eq(&n2m1, &n1)); - CHECK(!secp256k1_num_eq(&n2p1, &n1)); - secp256k1_num_sub(&n2p1, &n2p1, &n2); /* n2p1 = R2 + R1 - R2 = R1 */ - CHECK(secp256k1_num_eq(&n2p1, &n1)); - - /* check is_one */ - secp256k1_scalar_set_int(&s, 1); - secp256k1_scalar_get_num(&n1, &s); - CHECK(secp256k1_num_is_one(&n1)); - /* check that 2^n + 1 is never 1 */ - secp256k1_scalar_get_num(&n2, &s); - for (i = 0; i < 250; ++i) { - secp256k1_num_add(&n1, &n1, &n1); /* n1 *= 2 */ - secp256k1_num_add(&n1p2, &n1, &n2); /* n1p2 = n1 + 1 */ - CHECK(!secp256k1_num_is_one(&n1p2)); +} + +/* Test secp256k1_modinv32{_var}, using inputs in 16-bit limb format, and returning inverse. 
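The sign mutation used by mutate_sign_signed30 above only moves value between adjacent limbs: subtracting 2^30 from limb i while adding 1 to limb i+1 leaves the encoded number unchanged, because limb i+1 carries 2^30 times the weight of limb i. A two-limb toy check (standalone and illustrative only):

    #include <assert.h>
    #include <stdint.h>
    int main(void) {
        int64_t v0 = 0x12345678, v1 = 5;          /* value = v0 + v1*2^30 */
        int64_t before = v0 + (v1 << 30);
        v0 -= (int64_t)1 << 30;                   /* the mutation step: borrow from limb 0 ... */
        v1 += 1;                                  /* ... and carry into limb 1 */
        assert(before == v0 + (v1 << 30));        /* same value, different limbs */
        return 0;
    }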
*/ +void test_modinv32_uint16(uint16_t* out, const uint16_t* in, const uint16_t* mod) { + uint16_t tmp[16]; + secp256k1_modinv32_signed30 x; + secp256k1_modinv32_modinfo m; + int i, vartime, nonzero; + + uint16_to_signed30(&x, in); + nonzero = (x.v[0] | x.v[1] | x.v[2] | x.v[3] | x.v[4] | x.v[5] | x.v[6] | x.v[7] | x.v[8]) != 0; + uint16_to_signed30(&m.modulus, mod); + mutate_sign_signed30(&m.modulus); + + /* compute 1/modulus mod 2^30 */ + m.modulus_inv30 = modinv2p64(m.modulus.v[0]) & 0x3fffffff; + CHECK(((m.modulus_inv30 * m.modulus.v[0]) & 0x3fffffff) == 1); + + for (vartime = 0; vartime < 2; ++vartime) { + /* compute inverse */ + (vartime ? secp256k1_modinv32_var : secp256k1_modinv32)(&x, &m); + + /* produce output */ + signed30_to_uint16(out, &x); + + /* check if the inverse times the input is 1 (mod m), unless x is 0. */ + mulmod256(tmp, out, in, mod); + CHECK(tmp[0] == nonzero); + for (i = 1; i < 16; ++i) CHECK(tmp[i] == 0); + + /* invert again */ + (vartime ? secp256k1_modinv32_var : secp256k1_modinv32)(&x, &m); + + /* check if the result is equal to the input */ + signed30_to_uint16(tmp, &x); + for (i = 0; i < 16; ++i) CHECK(tmp[i] == in[i]); } } -void test_num_mod(void) { +#ifdef SECP256K1_WIDEMUL_INT128 +/* Convert a 256-bit number represented as 16 uint16_t's to signed62 notation. */ +void uint16_to_signed62(secp256k1_modinv64_signed62* out, const uint16_t* in) { int i; - secp256k1_scalar s; - secp256k1_num order, n; - - /* check that 0 mod anything is 0 */ - random_scalar_order_test(&s); - secp256k1_scalar_get_num(&order, &s); - secp256k1_scalar_set_int(&s, 0); - secp256k1_scalar_get_num(&n, &s); - secp256k1_num_mod(&n, &order); - CHECK(secp256k1_num_is_zero(&n)); - - /* check that anything mod 1 is 0 */ - secp256k1_scalar_set_int(&s, 1); - secp256k1_scalar_get_num(&order, &s); - secp256k1_scalar_get_num(&n, &s); - secp256k1_num_mod(&n, &order); - CHECK(secp256k1_num_is_zero(&n)); - - /* check that increasing the number past 2^256 does not break this */ - random_scalar_order_test(&s); - secp256k1_scalar_get_num(&n, &s); - /* multiply by 2^8, which'll test this case with high probability */ - for (i = 0; i < 8; ++i) { - secp256k1_num_add(&n, &n, &n); + memset(out->v, 0, sizeof(out->v)); + for (i = 0; i < 256; ++i) { + out->v[i / 62] |= (int64_t)(((in[i >> 4]) >> (i & 15)) & 1) << (i % 62); } - secp256k1_num_mod(&n, &order); - CHECK(secp256k1_num_is_zero(&n)); } -void test_num_jacobi(void) { - secp256k1_scalar sqr; - secp256k1_scalar small; - secp256k1_scalar five; /* five is not a quadratic residue */ - secp256k1_num order, n; +/* Convert a 256-bit number in signed62 notation to a representation as 16 uint16_t's. */ +void signed62_to_uint16(uint16_t* out, const secp256k1_modinv64_signed62* in) { int i; - /* squares mod 5 are 1, 4 */ - const int jacobi5[10] = { 0, 1, -1, -1, 1, 0, 1, -1, -1, 1 }; + memset(out, 0, 32); + for (i = 0; i < 256; ++i) { + out[i >> 4] |= (((in->v[i / 62]) >> (i % 62)) & 1) << (i & 15); + } +} - /* check some small values with 5 as the order */ - secp256k1_scalar_set_int(&five, 5); - secp256k1_scalar_get_num(&order, &five); - for (i = 0; i < 10; ++i) { - secp256k1_scalar_set_int(&small, i); - secp256k1_scalar_get_num(&n, &small); - CHECK(secp256k1_num_jacobi(&n, &order) == jacobi5[i]); +/* Randomly mutate the sign of limbs in signed62 representation, without changing the value. 
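test_modinv32_uint16 above leans on the mulmod256 helper to verify inverses: it multiplies the computed inverse by the input modulo the test modulus and expects 1. A tiny hand-checkable use of that helper, written as it could appear inside a test in this file (CHECK is the existing test macro; the values are toy numbers chosen for this sketch):

    uint16_t a16[16] = {3}, b16[16] = {4}, m16[16] = {11}, r16[16];
    int k;
    mulmod256(r16, a16, b16, m16);               /* 3*4 mod 11 */
    CHECK(r16[0] == 1);                          /* so 4 is the inverse of 3 mod 11 */
    for (k = 1; k < 16; ++k) CHECK(r16[k] == 0); /* all higher limbs are zero */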
*/ +void mutate_sign_signed62(secp256k1_modinv64_signed62* x) { + static const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + int i; + for (i = 0; i < 8; ++i) { + int pos = secp256k1_testrand_int(4); + if (x->v[pos] > 0 && x->v[pos + 1] <= M62) { + x->v[pos] -= (M62 + 1); + x->v[pos + 1] += 1; + } else if (x->v[pos] < 0 && x->v[pos + 1] >= -M62) { + x->v[pos] += (M62 + 1); + x->v[pos + 1] -= 1; + } } +} - /** test large values with 5 as group order */ - secp256k1_scalar_get_num(&order, &five); - /* we first need a scalar which is not a multiple of 5 */ - do { - secp256k1_num fiven; - random_scalar_order_test(&sqr); - secp256k1_scalar_get_num(&fiven, &five); - secp256k1_scalar_get_num(&n, &sqr); - secp256k1_num_mod(&n, &fiven); - } while (secp256k1_num_is_zero(&n)); - /* next force it to be a residue. 2 is a nonresidue mod 5 so we can - * just multiply by two, i.e. add the number to itself */ - if (secp256k1_num_jacobi(&n, &order) == -1) { - secp256k1_num_add(&n, &n, &n); - } - - /* test residue */ - CHECK(secp256k1_num_jacobi(&n, &order) == 1); - /* test nonresidue */ - secp256k1_num_add(&n, &n, &n); - CHECK(secp256k1_num_jacobi(&n, &order) == -1); - - /** test with secp group order as order */ - secp256k1_scalar_order_get_num(&order); - random_scalar_order_test(&sqr); - secp256k1_scalar_sqr(&sqr, &sqr); - /* test residue */ - secp256k1_scalar_get_num(&n, &sqr); - CHECK(secp256k1_num_jacobi(&n, &order) == 1); - /* test nonresidue */ - secp256k1_scalar_mul(&sqr, &sqr, &five); - secp256k1_scalar_get_num(&n, &sqr); - CHECK(secp256k1_num_jacobi(&n, &order) == -1); - /* test multiple of the order*/ - CHECK(secp256k1_num_jacobi(&order, &order) == 0); - - /* check one less than the order */ - secp256k1_scalar_set_int(&small, 1); - secp256k1_scalar_get_num(&n, &small); - secp256k1_num_sub(&n, &order, &n); - CHECK(secp256k1_num_jacobi(&n, &order) == 1); /* sage confirms this is 1 */ +/* Test secp256k1_modinv64{_var}, using inputs in 16-bit limb format, and returning inverse. */ +void test_modinv64_uint16(uint16_t* out, const uint16_t* in, const uint16_t* mod) { + static const int64_t M62 = (int64_t)(UINT64_MAX >> 2); + uint16_t tmp[16]; + secp256k1_modinv64_signed62 x; + secp256k1_modinv64_modinfo m; + int i, vartime, nonzero; + + uint16_to_signed62(&x, in); + nonzero = (x.v[0] | x.v[1] | x.v[2] | x.v[3] | x.v[4]) != 0; + uint16_to_signed62(&m.modulus, mod); + mutate_sign_signed62(&m.modulus); + + /* compute 1/modulus mod 2^62 */ + m.modulus_inv62 = modinv2p64(m.modulus.v[0]) & M62; + CHECK(((m.modulus_inv62 * m.modulus.v[0]) & M62) == 1); + + for (vartime = 0; vartime < 2; ++vartime) { + /* compute inverse */ + (vartime ? secp256k1_modinv64_var : secp256k1_modinv64)(&x, &m); + + /* produce output */ + signed62_to_uint16(out, &x); + + /* check if the inverse times the input is 1 (mod m), unless x is 0. */ + mulmod256(tmp, out, in, mod); + CHECK(tmp[0] == nonzero); + for (i = 1; i < 16; ++i) CHECK(tmp[i] == 0); + + /* invert again */ + (vartime ? 
secp256k1_modinv64_var : secp256k1_modinv64)(&x, &m); + + /* check if the result is equal to the input */ + signed62_to_uint16(tmp, &x); + for (i = 0; i < 16; ++i) CHECK(tmp[i] == in[i]); + } } +#endif -void run_num_smalltests(void) { +/* test if a and b are coprime */ +int coprime(const uint16_t* a, const uint16_t* b) { + uint16_t x[16], y[16], t[16]; int i; - for (i = 0; i < 100*count; i++) { - test_num_negate(); - test_num_add_sub(); - test_num_mod(); - test_num_jacobi(); + int iszero; + memcpy(x, a, 32); + memcpy(y, b, 32); + + /* simple gcd loop: while x!=0, (x,y)=(y%x,x) */ + while (1) { + iszero = 1; + for (i = 0; i < 16; ++i) { + if (x[i] != 0) { + iszero = 0; + break; + } + } + if (iszero) break; + mulmod256(t, y, NULL, x); + memcpy(y, x, 32); + memcpy(x, t, 32); + } + + /* return whether y=1 */ + if (y[0] != 1) return 0; + for (i = 1; i < 16; ++i) { + if (y[i] != 0) return 0; } + return 1; } + +void run_modinv_tests(void) { + /* Fixed test cases. Each tuple is (input, modulus, output), each as 16x16 bits in LE order. */ + static const uint16_t CASES[][3][16] = { + /* Test case known to need 713 divsteps */ + {{0x1513, 0x5389, 0x54e9, 0x2798, 0x1957, 0x66a0, 0x8057, 0x3477, + 0x7784, 0x1052, 0x326a, 0x9331, 0x6506, 0xa95c, 0x91f3, 0xfb5e}, + {0x2bdd, 0x8df4, 0xcc61, 0x481f, 0xdae5, 0x5ca7, 0xf43b, 0x7d54, + 0x13d6, 0x469b, 0x2294, 0x20f4, 0xb2a4, 0xa2d1, 0x3ff1, 0xfd4b}, + {0xffd8, 0xd9a0, 0x456e, 0x81bb, 0xbabd, 0x6cea, 0x6dbd, 0x73ab, + 0xbb94, 0x3d3c, 0xdf08, 0x31c4, 0x3e32, 0xc179, 0x2486, 0xb86b}}, + /* Test case known to need 589 divsteps, reaching delta=-140 and + delta=141. */ + {{0x3fb1, 0x903b, 0x4eb7, 0x4813, 0xd863, 0x26bf, 0xd89f, 0xa8a9, + 0x02fe, 0x57c6, 0x554a, 0x4eab, 0x165e, 0x3d61, 0xee1e, 0x456c}, + {0x9295, 0x823b, 0x5c1f, 0x5386, 0x48e0, 0x02ff, 0x4c2a, 0xa2da, + 0xe58f, 0x967c, 0xc97e, 0x3f5a, 0x69fb, 0x52d9, 0x0a86, 0xb4a3}, + {0x3d30, 0xb893, 0xa809, 0xa7a8, 0x26f5, 0x5b42, 0x55be, 0xf4d0, + 0x12c2, 0x7e6a, 0xe41a, 0x90c7, 0xebfa, 0xf920, 0x304e, 0x1419}}, + /* Test case known to need 650 divsteps, and doing 65 consecutive (f,g/2) steps. */ + {{0x8583, 0x5058, 0xbeae, 0xeb69, 0x48bc, 0x52bb, 0x6a9d, 0xcc94, + 0x2a21, 0x87d5, 0x5b0d, 0x42f6, 0x5b8a, 0x2214, 0xe9d6, 0xa040}, + {0x7531, 0x27cb, 0x7e53, 0xb739, 0x6a5f, 0x83f5, 0xa45c, 0xcb1d, + 0x8a87, 0x1c9c, 0x51d7, 0x851c, 0xb9d8, 0x1fbe, 0xc241, 0xd4a3}, + {0xcdb4, 0x275c, 0x7d22, 0xa906, 0x0173, 0xc054, 0x7fdf, 0x5005, + 0x7fb8, 0x9059, 0xdf51, 0x99df, 0x2654, 0x8f6e, 0x070f, 0xb347}}, + /* Test case with the group order as modulus, needing 635 divsteps. */ + {{0x95ed, 0x6c01, 0xd113, 0x5ff1, 0xd7d0, 0x29cc, 0x5817, 0x6120, + 0xca8e, 0xaad1, 0x25ae, 0x8e84, 0x9af6, 0x30bf, 0xf0ed, 0x1686}, + {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x1631, 0xbf4a, 0x286a, 0x2716, 0x469f, 0x2ac8, 0x1312, 0xe9bc, + 0x04f4, 0x304b, 0x9931, 0x113b, 0xd932, 0xc8f4, 0x0d0d, 0x01a1}}, + /* Test case with the field size as modulus, needing 637 divsteps. 
*/ + {{0x9ec3, 0x1919, 0xca84, 0x7c11, 0xf996, 0x06f3, 0x5408, 0x6688, + 0x1320, 0xdb8a, 0x632a, 0x0dcb, 0x8a84, 0x6bee, 0x9c95, 0xe34e}, + {0xfc2f, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x18e5, 0x19b6, 0xdf92, 0x1aaa, 0x09fb, 0x8a3f, 0x52b0, 0x8701, + 0xac0c, 0x2582, 0xda44, 0x9bcc, 0x6828, 0x1c53, 0xbd8f, 0xbd2c}}, + /* Test case with the field size as modulus, needing 935 divsteps with + broken eta handling. */ + {{0x1b37, 0xbdc3, 0x8bcd, 0x25e3, 0x1eae, 0x567d, 0x30b6, 0xf0d8, + 0x9277, 0x0cf8, 0x9c2e, 0xecd7, 0x631d, 0xe38f, 0xd4f8, 0x5c93}, + {0xfc2f, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x1622, 0xe05b, 0xe880, 0x7de9, 0x3e45, 0xb682, 0xee6c, 0x67ed, + 0xa179, 0x15db, 0x6b0d, 0xa656, 0x7ccb, 0x8ef7, 0xa2ff, 0xe279}}, + /* Test case with the group size as modulus, needing 981 divsteps with + broken eta handling. */ + {{0xfeb9, 0xb877, 0xee41, 0x7fa3, 0x87da, 0x94c4, 0x9d04, 0xc5ae, + 0x5708, 0x0994, 0xfc79, 0x0916, 0xbf32, 0x3ad8, 0xe11c, 0x5ca2}, + {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x0f12, 0x075e, 0xce1c, 0x6f92, 0xc80f, 0xca92, 0x9a04, 0x6126, + 0x4b6c, 0x57d6, 0xca31, 0x97f3, 0x1f99, 0xf4fd, 0xda4d, 0x42ce}}, + /* Test case with the field size as modulus, input = 0. */ + {{0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xfc2f, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}}, + /* Test case with the field size as modulus, input = 1. */ + {{0x0001, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xfc2f, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x0001, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}}, + /* Test case with the field size as modulus, input = 2. */ + {{0x0002, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xfc2f, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0xfe18, 0x7fff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x7fff}}, + /* Test case with the field size as modulus, input = field - 1. */ + {{0xfc2e, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0xfc2f, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0xfc2e, 0xffff, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}}, + /* Test case with the group size as modulus, input = 0. 
*/ + {{0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}}, + /* Test case with the group size as modulus, input = 1. */ + {{0x0001, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x0001, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}}, + /* Test case with the group size as modulus, input = 2. */ + {{0x0002, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x20a1, 0x681b, 0x2f46, 0xdfe9, 0x501d, 0x57a4, 0x6e73, 0x5d57, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x7fff}}, + /* Test case with the group size as modulus, input = group - 1. */ + {{0x4140, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x4141, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}, + {0x4140, 0xd036, 0x5e8c, 0xbfd2, 0xa03b, 0xaf48, 0xdce6, 0xbaae, + 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}} + }; + + int i, j, ok; + + /* Test known inputs/outputs */ + for (i = 0; (size_t)i < sizeof(CASES) / sizeof(CASES[0]); ++i) { + uint16_t out[16]; + test_modinv32_uint16(out, CASES[i][0], CASES[i][1]); + for (j = 0; j < 16; ++j) CHECK(out[j] == CASES[i][2][j]); +#ifdef SECP256K1_WIDEMUL_INT128 + test_modinv64_uint16(out, CASES[i][0], CASES[i][1]); + for (j = 0; j < 16; ++j) CHECK(out[j] == CASES[i][2][j]); +#endif + } + + for (i = 0; i < 100 * count; ++i) { + /* 256-bit numbers in 16-uint16_t's notation */ + static const uint16_t ZERO[16] = {0}; + uint16_t xd[16]; /* the number (in range [0,2^256)) to be inverted */ + uint16_t md[16]; /* the modulus (odd, in range [3,2^256)) */ + uint16_t id[16]; /* the inverse of xd mod md */ + + /* generate random xd and md, so that md is odd, md>1, xd 0)); - secp256k1_scalar_negate(&neg, &s); - secp256k1_num_sub(&negnum, &order, &snum); - secp256k1_num_mod(&negnum, &order); - /* Check that comparison with the half order is equal to testing for high scalar after negation. */ - CHECK(secp256k1_scalar_is_high(&neg) == (secp256k1_num_cmp(&negnum, &half_order) > 0)); - /* Negating should change the high property, unless the value was already zero. */ - CHECK((secp256k1_scalar_is_high(&s) == secp256k1_scalar_is_high(&neg)) == secp256k1_scalar_is_zero(&s)); - secp256k1_scalar_get_num(&negnum2, &neg); - /* Negating a scalar should be equal to (order - n) mod order on the number. */ - CHECK(secp256k1_num_eq(&negnum, &negnum2)); - secp256k1_scalar_add(&neg, &neg, &s); - /* Adding a number to its negation should result in zero. */ - CHECK(secp256k1_scalar_is_zero(&neg)); - secp256k1_scalar_negate(&neg, &neg); - /* Negating zero should still result in zero. 
*/ - CHECK(secp256k1_scalar_is_zero(&neg)); - } - - { - /* Test secp256k1_scalar_mul_shift_var. */ - secp256k1_scalar r; - secp256k1_num one; - secp256k1_num rnum; - secp256k1_num rnum2; - unsigned char cone[1] = {0x01}; - unsigned int shift = 256 + secp256k1_testrand_int(257); - secp256k1_scalar_mul_shift_var(&r, &s1, &s2, shift); - secp256k1_num_mul(&rnum, &s1num, &s2num); - secp256k1_num_shift(&rnum, shift - 1); - secp256k1_num_set_bin(&one, cone, 1); - secp256k1_num_add(&rnum, &rnum, &one); - secp256k1_num_shift(&rnum, 1); - secp256k1_scalar_get_num(&rnum2, &r); - CHECK(secp256k1_num_eq(&rnum, &rnum2)); - } - { /* test secp256k1_scalar_shr_int */ secp256k1_scalar r; @@ -955,34 +1142,6 @@ void scalar_test(void) { CHECK(expected == low); } } -#endif - - { - /* Test that scalar inverses are equal to the inverse of their number modulo the order. */ - if (!secp256k1_scalar_is_zero(&s)) { - secp256k1_scalar inv; -#ifndef USE_NUM_NONE - secp256k1_num invnum; - secp256k1_num invnum2; -#endif - secp256k1_scalar_inverse(&inv, &s); -#ifndef USE_NUM_NONE - secp256k1_num_mod_inverse(&invnum, &snum, &order); - secp256k1_scalar_get_num(&invnum2, &inv); - CHECK(secp256k1_num_eq(&invnum, &invnum2)); -#endif - secp256k1_scalar_mul(&inv, &inv, &s); - /* Multiplying a scalar with its inverse must result in one. */ - CHECK(secp256k1_scalar_is_one(&inv)); - secp256k1_scalar_inverse(&inv, &inv); - /* Inverting one must result in one. */ - CHECK(secp256k1_scalar_is_one(&inv)); -#ifndef USE_NUM_NONE - secp256k1_scalar_get_num(&invnum, &inv); - CHECK(secp256k1_num_is_one(&invnum)); -#endif - } - } { /* Test commutativity of add. */ @@ -1054,14 +1213,6 @@ void scalar_test(void) { CHECK(secp256k1_scalar_eq(&r1, &r2)); } - { - /* Test square. */ - secp256k1_scalar r1, r2; - secp256k1_scalar_sqr(&r1, &s1); - secp256k1_scalar_mul(&r2, &s1, &s1); - CHECK(secp256k1_scalar_eq(&r1, &r2)); - } - { /* Test multiplicative identity. */ secp256k1_scalar r1, v1; @@ -1126,48 +1277,6 @@ void run_scalar_tests(void) { CHECK(secp256k1_scalar_is_zero(&o)); } -#ifndef USE_NUM_NONE - { - /* Test secp256k1_scalar_set_b32 boundary conditions */ - secp256k1_num order; - secp256k1_scalar scalar; - unsigned char bin[32]; - unsigned char bin_tmp[32]; - int overflow = 0; - /* 2^256-1 - order */ - static const secp256k1_scalar all_ones_minus_order = SECP256K1_SCALAR_CONST( - 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000001UL, - 0x45512319UL, 0x50B75FC4UL, 0x402DA173UL, 0x2FC9BEBEUL - ); - - /* A scalar set to 0s should be 0. */ - memset(bin, 0, 32); - secp256k1_scalar_set_b32(&scalar, bin, &overflow); - CHECK(overflow == 0); - CHECK(secp256k1_scalar_is_zero(&scalar)); - - /* A scalar with value of the curve order should be 0. */ - secp256k1_scalar_order_get_num(&order); - secp256k1_num_get_bin(bin, 32, &order); - secp256k1_scalar_set_b32(&scalar, bin, &overflow); - CHECK(overflow == 1); - CHECK(secp256k1_scalar_is_zero(&scalar)); - - /* A scalar with value of the curve order minus one should not overflow. */ - bin[31] -= 1; - secp256k1_scalar_set_b32(&scalar, bin, &overflow); - CHECK(overflow == 0); - secp256k1_scalar_get_b32(bin_tmp, &scalar); - CHECK(secp256k1_memcmp_var(bin, bin_tmp, 32) == 0); - - /* A scalar set to all 1s should overflow. */ - memset(bin, 0xFF, 32); - secp256k1_scalar_set_b32(&scalar, bin, &overflow); - CHECK(overflow == 1); - CHECK(secp256k1_scalar_eq(&scalar, &all_ones_minus_order)); - } -#endif - { /* Does check_overflow check catch all ones? 
*/ static const secp256k1_scalar overflowed = SECP256K1_SCALAR_CONST( @@ -1190,9 +1299,7 @@ void run_scalar_tests(void) { secp256k1_scalar one; secp256k1_scalar r1; secp256k1_scalar r2; -#if defined(USE_SCALAR_INV_NUM) secp256k1_scalar zzv; -#endif int overflow; unsigned char chal[33][2][32] = { {{0xff, 0xff, 0x03, 0x07, 0x00, 0x00, 0x00, 0x00, @@ -1742,10 +1849,8 @@ void run_scalar_tests(void) { if (!secp256k1_scalar_is_zero(&y)) { secp256k1_scalar_inverse(&zz, &y); CHECK(!secp256k1_scalar_check_overflow(&zz)); -#if defined(USE_SCALAR_INV_NUM) secp256k1_scalar_inverse_var(&zzv, &y); CHECK(secp256k1_scalar_eq(&zzv, &zz)); -#endif secp256k1_scalar_mul(&z, &z, &zz); CHECK(!secp256k1_scalar_check_overflow(&z)); CHECK(secp256k1_scalar_eq(&x, &z)); @@ -1753,12 +1858,6 @@ void run_scalar_tests(void) { CHECK(!secp256k1_scalar_check_overflow(&zz)); CHECK(secp256k1_scalar_eq(&one, &zz)); } - secp256k1_scalar_mul(&z, &x, &x); - CHECK(!secp256k1_scalar_check_overflow(&z)); - secp256k1_scalar_sqr(&zz, &x); - CHECK(!secp256k1_scalar_check_overflow(&zz)); - CHECK(secp256k1_scalar_eq(&zz, &z)); - CHECK(secp256k1_scalar_eq(&r2, &zz)); } } } @@ -1814,13 +1913,6 @@ int check_fe_equal(const secp256k1_fe *a, const secp256k1_fe *b) { return secp256k1_fe_equal_var(&an, &bn); } -int check_fe_inverse(const secp256k1_fe *a, const secp256k1_fe *ai) { - secp256k1_fe x; - secp256k1_fe one = SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 1); - secp256k1_fe_mul(&x, a, ai); - return check_fe_equal(&x, &one); -} - void run_field_convert(void) { static const unsigned char b32[32] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, @@ -1940,30 +2032,6 @@ void run_field_misc(void) { } } -void run_field_inv(void) { - secp256k1_fe x, xi, xii; - int i; - for (i = 0; i < 10*count; i++) { - random_fe_non_zero(&x); - secp256k1_fe_inv(&xi, &x); - CHECK(check_fe_inverse(&x, &xi)); - secp256k1_fe_inv(&xii, &xi); - CHECK(check_fe_equal(&x, &xii)); - } -} - -void run_field_inv_var(void) { - secp256k1_fe x, xi, xii; - int i; - for (i = 0; i < 10*count; i++) { - random_fe_non_zero(&x); - secp256k1_fe_inv_var(&xi, &x); - CHECK(check_fe_inverse(&x, &xi)); - secp256k1_fe_inv_var(&xii, &xi); - CHECK(check_fe_equal(&x, &xii)); - } -} - void run_sqr(void) { secp256k1_fe x, s; @@ -2028,6 +2096,169 @@ void run_sqrt(void) { } } +/***** FIELD/SCALAR INVERSE TESTS *****/ + +static const secp256k1_scalar scalar_minus_one = SECP256K1_SCALAR_CONST( + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, + 0xBAAEDCE6, 0xAF48A03B, 0xBFD25E8C, 0xD0364140 +); + +static const secp256k1_fe fe_minus_one = SECP256K1_FE_CONST( + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0xFFFFFC2E +); + +/* These tests test the following identities: + * + * for x==0: 1/x == 0 + * for x!=0: x*(1/x) == 1 + * for x!=0 and x!=1: 1/(1/x - 1) + 1 == -1/(x-1) + */ + +void test_inverse_scalar(secp256k1_scalar* out, const secp256k1_scalar* x, int var) +{ + secp256k1_scalar l, r, t; + + (var ? secp256k1_scalar_inverse_var : secp256k1_scalar_inverse)(&l, x); /* l = 1/x */ + if (out) *out = l; + if (secp256k1_scalar_is_zero(x)) { + CHECK(secp256k1_scalar_is_zero(&l)); + return; + } + secp256k1_scalar_mul(&t, x, &l); /* t = x*(1/x) */ + CHECK(secp256k1_scalar_is_one(&t)); /* x*(1/x) == 1 */ + secp256k1_scalar_add(&r, x, &scalar_minus_one); /* r = x-1 */ + if (secp256k1_scalar_is_zero(&r)) return; + (var ? 
secp256k1_scalar_inverse_var : secp256k1_scalar_inverse)(&r, &r); /* r = 1/(x-1) */ + secp256k1_scalar_add(&l, &scalar_minus_one, &l); /* l = 1/x-1 */ + (var ? secp256k1_scalar_inverse_var : secp256k1_scalar_inverse)(&l, &l); /* l = 1/(1/x-1) */ + secp256k1_scalar_add(&l, &l, &secp256k1_scalar_one); /* l = 1/(1/x-1)+1 */ + secp256k1_scalar_add(&l, &r, &l); /* l = 1/(1/x-1)+1 + 1/(x-1) */ + CHECK(secp256k1_scalar_is_zero(&l)); /* l == 0 */ +} + +void test_inverse_field(secp256k1_fe* out, const secp256k1_fe* x, int var) +{ + secp256k1_fe l, r, t; + + (var ? secp256k1_fe_inv_var : secp256k1_fe_inv)(&l, x) ; /* l = 1/x */ + if (out) *out = l; + t = *x; /* t = x */ + if (secp256k1_fe_normalizes_to_zero_var(&t)) { + CHECK(secp256k1_fe_normalizes_to_zero(&l)); + return; + } + secp256k1_fe_mul(&t, x, &l); /* t = x*(1/x) */ + secp256k1_fe_add(&t, &fe_minus_one); /* t = x*(1/x)-1 */ + CHECK(secp256k1_fe_normalizes_to_zero(&t)); /* x*(1/x)-1 == 0 */ + r = *x; /* r = x */ + secp256k1_fe_add(&r, &fe_minus_one); /* r = x-1 */ + if (secp256k1_fe_normalizes_to_zero_var(&r)) return; + (var ? secp256k1_fe_inv_var : secp256k1_fe_inv)(&r, &r); /* r = 1/(x-1) */ + secp256k1_fe_add(&l, &fe_minus_one); /* l = 1/x-1 */ + (var ? secp256k1_fe_inv_var : secp256k1_fe_inv)(&l, &l); /* l = 1/(1/x-1) */ + secp256k1_fe_add(&l, &secp256k1_fe_one); /* l = 1/(1/x-1)+1 */ + secp256k1_fe_add(&l, &r); /* l = 1/(1/x-1)+1 + 1/(x-1) */ + CHECK(secp256k1_fe_normalizes_to_zero_var(&l)); /* l == 0 */ +} + +void run_inverse_tests(void) +{ + /* Fixed test cases for field inverses: pairs of (x, 1/x) mod p. */ + static const secp256k1_fe fe_cases[][2] = { + /* 0 */ + {SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 0), + SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 0)}, + /* 1 */ + {SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 1), + SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 1)}, + /* -1 */ + {SECP256K1_FE_CONST(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0xfffffc2e), + SECP256K1_FE_CONST(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0xfffffc2e)}, + /* 2 */ + {SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 2), + SECP256K1_FE_CONST(0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x7ffffe18)}, + /* 2**128 */ + {SECP256K1_FE_CONST(0, 0, 0, 1, 0, 0, 0, 0), + SECP256K1_FE_CONST(0xbcb223fe, 0xdc24a059, 0xd838091d, 0xd2253530, 0xffffffff, 0xffffffff, 0xffffffff, 0x434dd931)}, + /* Input known to need 637 divsteps */ + {SECP256K1_FE_CONST(0xe34e9c95, 0x6bee8a84, 0x0dcb632a, 0xdb8a1320, 0x66885408, 0x06f3f996, 0x7c11ca84, 0x19199ec3), + SECP256K1_FE_CONST(0xbd2cbd8f, 0x1c536828, 0x9bccda44, 0x2582ac0c, 0x870152b0, 0x8a3f09fb, 0x1aaadf92, 0x19b618e5)} + }; + /* Fixed test cases for scalar inverses: pairs of (x, 1/x) mod n. 
*/ + static const secp256k1_scalar scalar_cases[][2] = { + /* 0 */ + {SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 0), + SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 0)}, + /* 1 */ + {SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 1), + SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 1)}, + /* -1 */ + {SECP256K1_SCALAR_CONST(0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0xbaaedce6, 0xaf48a03b, 0xbfd25e8c, 0xd0364140), + SECP256K1_SCALAR_CONST(0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0xbaaedce6, 0xaf48a03b, 0xbfd25e8c, 0xd0364140)}, + /* 2 */ + {SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 2), + SECP256K1_SCALAR_CONST(0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x5d576e73, 0x57a4501d, 0xdfe92f46, 0x681b20a1)}, + /* 2**128 */ + {SECP256K1_SCALAR_CONST(0, 0, 0, 1, 0, 0, 0, 0), + SECP256K1_SCALAR_CONST(0x50a51ac8, 0x34b9ec24, 0x4b0dff66, 0x5588b13e, 0x9984d5b3, 0xcf80ef0f, 0xd6a23766, 0xa3ee9f22)}, + /* Input known to need 635 divsteps */ + {SECP256K1_SCALAR_CONST(0xcb9f1d35, 0xdd4416c2, 0xcd71bf3f, 0x6365da66, 0x3c9b3376, 0x8feb7ae9, 0x32a5ef60, 0x19199ec3), + SECP256K1_SCALAR_CONST(0x1d7c7bba, 0xf1893d53, 0xb834bd09, 0x36b411dc, 0x42c2e42f, 0xec72c428, 0x5e189791, 0x8e9bc708)} + }; + int i, var, testrand; + unsigned char b32[32]; + secp256k1_fe x_fe; + secp256k1_scalar x_scalar; + memset(b32, 0, sizeof(b32)); + /* Test fixed test cases through test_inverse_{scalar,field}, both ways. */ + for (i = 0; (size_t)i < sizeof(fe_cases)/sizeof(fe_cases[0]); ++i) { + for (var = 0; var <= 1; ++var) { + test_inverse_field(&x_fe, &fe_cases[i][0], var); + check_fe_equal(&x_fe, &fe_cases[i][1]); + test_inverse_field(&x_fe, &fe_cases[i][1], var); + check_fe_equal(&x_fe, &fe_cases[i][0]); + } + } + for (i = 0; (size_t)i < sizeof(scalar_cases)/sizeof(scalar_cases[0]); ++i) { + for (var = 0; var <= 1; ++var) { + test_inverse_scalar(&x_scalar, &scalar_cases[i][0], var); + CHECK(secp256k1_scalar_eq(&x_scalar, &scalar_cases[i][1])); + test_inverse_scalar(&x_scalar, &scalar_cases[i][1], var); + CHECK(secp256k1_scalar_eq(&x_scalar, &scalar_cases[i][0])); + } + } + /* Test inputs 0..999 and their respective negations. */ + for (i = 0; i < 1000; ++i) { + b32[31] = i & 0xff; + b32[30] = (i >> 8) & 0xff; + secp256k1_scalar_set_b32(&x_scalar, b32, NULL); + secp256k1_fe_set_b32(&x_fe, b32); + for (var = 0; var <= 1; ++var) { + test_inverse_scalar(NULL, &x_scalar, var); + test_inverse_field(NULL, &x_fe, var); + } + secp256k1_scalar_negate(&x_scalar, &x_scalar); + secp256k1_fe_negate(&x_fe, &x_fe, 1); + for (var = 0; var <= 1; ++var) { + test_inverse_scalar(NULL, &x_scalar, var); + test_inverse_field(NULL, &x_fe, var); + } + } + /* test 128*count random inputs; half with testrand256_test, half with testrand256 */ + for (testrand = 0; testrand <= 1; ++testrand) { + for (i = 0; i < 64 * count; ++i) { + (testrand ? secp256k1_testrand256_test : secp256k1_testrand256)(b32); + secp256k1_scalar_set_b32(&x_scalar, b32, NULL); + secp256k1_fe_set_b32(&x_fe, b32); + for (var = 0; var <= 1; ++var) { + test_inverse_scalar(NULL, &x_scalar, var); + test_inverse_field(NULL, &x_fe, var); + } + } + } +} + /***** GROUP TESTS *****/ void ge_equals_ge(const secp256k1_ge *a, const secp256k1_ge *b) { @@ -2407,64 +2638,35 @@ void run_ec_combine(void) { void test_group_decompress(const secp256k1_fe* x) { /* The input itself, normalized. */ secp256k1_fe fex = *x; - secp256k1_fe fez; - /* Results of set_xquad_var, set_xo_var(..., 0), set_xo_var(..., 1). 
*/ - secp256k1_ge ge_quad, ge_even, ge_odd; - secp256k1_gej gej_quad; + /* Results of set_xo_var(..., 0), set_xo_var(..., 1). */ + secp256k1_ge ge_even, ge_odd; /* Return values of the above calls. */ - int res_quad, res_even, res_odd; + int res_even, res_odd; secp256k1_fe_normalize_var(&fex); - res_quad = secp256k1_ge_set_xquad(&ge_quad, &fex); res_even = secp256k1_ge_set_xo_var(&ge_even, &fex, 0); res_odd = secp256k1_ge_set_xo_var(&ge_odd, &fex, 1); - CHECK(res_quad == res_even); - CHECK(res_quad == res_odd); + CHECK(res_even == res_odd); - if (res_quad) { - secp256k1_fe_normalize_var(&ge_quad.x); + if (res_even) { secp256k1_fe_normalize_var(&ge_odd.x); secp256k1_fe_normalize_var(&ge_even.x); - secp256k1_fe_normalize_var(&ge_quad.y); secp256k1_fe_normalize_var(&ge_odd.y); secp256k1_fe_normalize_var(&ge_even.y); /* No infinity allowed. */ - CHECK(!ge_quad.infinity); CHECK(!ge_even.infinity); CHECK(!ge_odd.infinity); /* Check that the x coordinates check out. */ - CHECK(secp256k1_fe_equal_var(&ge_quad.x, x)); CHECK(secp256k1_fe_equal_var(&ge_even.x, x)); CHECK(secp256k1_fe_equal_var(&ge_odd.x, x)); - /* Check that the Y coordinate result in ge_quad is a square. */ - CHECK(secp256k1_fe_is_quad_var(&ge_quad.y)); - /* Check odd/even Y in ge_odd, ge_even. */ CHECK(secp256k1_fe_is_odd(&ge_odd.y)); CHECK(!secp256k1_fe_is_odd(&ge_even.y)); - - /* Check secp256k1_gej_has_quad_y_var. */ - secp256k1_gej_set_ge(&gej_quad, &ge_quad); - CHECK(secp256k1_gej_has_quad_y_var(&gej_quad)); - do { - random_fe_test(&fez); - } while (secp256k1_fe_is_zero(&fez)); - secp256k1_gej_rescale(&gej_quad, &fez); - CHECK(secp256k1_gej_has_quad_y_var(&gej_quad)); - secp256k1_gej_neg(&gej_quad, &gej_quad); - CHECK(!secp256k1_gej_has_quad_y_var(&gej_quad)); - do { - random_fe_test(&fez); - } while (secp256k1_fe_is_zero(&fez)); - secp256k1_gej_rescale(&gej_quad, &fez); - CHECK(!secp256k1_gej_has_quad_y_var(&gej_quad)); - secp256k1_gej_neg(&gej_quad, &gej_quad); - CHECK(secp256k1_gej_has_quad_y_var(&gej_quad)); } } @@ -5383,10 +5585,6 @@ void run_ecdsa_openssl(void) { # include "modules/ecdh/tests_impl.h" #endif -#ifdef ENABLE_MODULE_MULTISET -# include "modules/multiset/tests_impl.h" -#endif - #ifdef ENABLE_MODULE_RECOVERY # include "modules/recovery/tests_impl.h" #endif @@ -5610,21 +5808,18 @@ int main(int argc, char **argv) { run_rand_bits(); run_rand_int(); + run_ctz_tests(); + run_modinv_tests(); + run_inverse_tests(); + run_sha256_tests(); run_hmac_sha256_tests(); run_rfc6979_hmac_sha256_tests(); -#ifndef USE_NUM_NONE - /* num tests */ - run_num_smalltests(); -#endif - /* scalar tests */ run_scalar_tests(); /* field tests */ - run_field_inv(); - run_field_inv_var(); run_field_misc(); run_field_convert(); run_sqr(); @@ -5672,12 +5867,6 @@ int main(int argc, char **argv) { run_ecdsa_openssl(); #endif -#ifdef ENABLE_MODULE_MULTISET - /* ECMH tests */ - printf("running multiset tests\n"); - run_multiset_tests(); -#endif - #ifdef ENABLE_MODULE_RECOVERY /* ECDSA pubkey recovery tests */ run_recovery_tests(); diff --git a/src/util.h b/src/util.h index b7e457c..f788468 100644 --- a/src/util.h +++ b/src/util.h @@ -276,4 +276,69 @@ SECP256K1_GNUC_EXT typedef __int128 int128_t; # endif #endif +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +/* Determine the number of trailing zero bits in a (non-zero) 32-bit x. + * This function is only intended to be used as fallback for + * secp256k1_ctz32_var, but permits it to be tested separately. 
*/ +static SECP256K1_INLINE int secp256k1_ctz32_var_debruijn(uint32_t x) { + static const uint8_t debruijn[32] = { + 0x00, 0x01, 0x02, 0x18, 0x03, 0x13, 0x06, 0x19, 0x16, 0x04, 0x14, 0x0A, + 0x10, 0x07, 0x0C, 0x1A, 0x1F, 0x17, 0x12, 0x05, 0x15, 0x09, 0x0F, 0x0B, + 0x1E, 0x11, 0x08, 0x0E, 0x1D, 0x0D, 0x1C, 0x1B + }; + return debruijn[((x & -x) * 0x04D7651F) >> 27]; +} + +/* Determine the number of trailing zero bits in a (non-zero) 64-bit x. + * This function is only intended to be used as fallback for + * secp256k1_ctz64_var, but permits it to be tested separately. */ +static SECP256K1_INLINE int secp256k1_ctz64_var_debruijn(uint64_t x) { + static const uint8_t debruijn[64] = { + 0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, + 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, + 63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, + 51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12 + }; + return debruijn[((x & -x) * 0x022FDD63CC95386D) >> 58]; +} + +/* Determine the number of trailing zero bits in a (non-zero) 32-bit x. */ +static SECP256K1_INLINE int secp256k1_ctz32_var(uint32_t x) { + VERIFY_CHECK(x != 0); +#if (__has_builtin(__builtin_ctz) || SECP256K1_GNUC_PREREQ(3,4)) + /* If the unsigned type is sufficient to represent the largest uint32_t, consider __builtin_ctz. */ + if (((unsigned)UINT32_MAX) == UINT32_MAX) { + return __builtin_ctz(x); + } +#endif +#if (__has_builtin(__builtin_ctzl) || SECP256K1_GNUC_PREREQ(3,4)) + /* Otherwise consider __builtin_ctzl (the unsigned long type is always at least 32 bits). */ + return __builtin_ctzl(x); +#else + /* If no suitable CTZ builtin is available, use a (variable time) software emulation. */ + return secp256k1_ctz32_var_debruijn(x); +#endif +} + +/* Determine the number of trailing zero bits in a (non-zero) 64-bit x. */ +static SECP256K1_INLINE int secp256k1_ctz64_var(uint64_t x) { + VERIFY_CHECK(x != 0); +#if (__has_builtin(__builtin_ctzl) || SECP256K1_GNUC_PREREQ(3,4)) + /* If the unsigned long type is sufficient to represent the largest uint64_t, consider __builtin_ctzl. */ + if (((unsigned long)UINT64_MAX) == UINT64_MAX) { + return __builtin_ctzl(x); + } +#endif +#if (__has_builtin(__builtin_ctzll) || SECP256K1_GNUC_PREREQ(3,4)) + /* Otherwise consider __builtin_ctzll (the unsigned long long type is always at least 64 bits). */ + return __builtin_ctzll(x); +#else + /* If no suitable CTZ builtin is available, use a (variable time) software emulation. */ + return secp256k1_ctz64_var_debruijn(x); +#endif +} + #endif /* SECP256K1_UTIL_H */ From 2cb2e243efe730d7111443143d1e5ad2be03dfd5 Mon Sep 17 00:00:00 2001 From: Elichai Turkel Date: Wed, 24 Mar 2021 16:52:26 +0200 Subject: [PATCH 4/4] Remove unneeded macro definitions after updating libsecp256k1 --- secp256k1.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/secp256k1.go b/secp256k1.go index 9185eb8..82edb6d 100644 --- a/secp256k1.go +++ b/secp256k1.go @@ -4,8 +4,7 @@ package secp256k1 // // Including the headers and code, and defining the default macros // #cgo CFLAGS: -I./depend/secp256k1 -I./depend/secp256k1/src/ // #cgo CFLAGS: -DSECP256K1_BUILD=1 -DECMULT_WINDOW_SIZE=15 -DENABLE_MODULE_SCHNORRSIG=1 -DENABLE_MODULE_EXTRAKEYS=1 -// // Consider using libgmp. 
these macros are set to use the slower in-project implementation of nums -// #cgo CFLAGS: -DUSE_NUM_NONE=1 -DUSE_FIELD_INV_BUILTIN=1 -DUSE_SCALAR_INV_BUILTIN=1 -DECMULT_GEN_PREC_BITS=4 +// #cgo CFLAGS: -DECMULT_GEN_PREC_BITS=4 // // x86_64 can use the Assembly implementation. // #cgo amd64 CFLAGS: -DUSE_ASM_X86_64=1 // #include "./depend/secp256k1/include/secp256k1.h"
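
A note on the identity exercised by the new test_inverse_scalar/test_inverse_field helpers above: for any x other than 0 and 1, 1/(1/x - 1) + 1 == -1/(x - 1), because 1/x - 1 = (1 - x)/x, so the left side equals x/(1 - x) + 1 = 1/(1 - x) = -1/(x - 1). The standalone sketch below is illustrative only (it is not part of the patch or of libsecp256k1; the prime P = 65521 and the mulmod/invmod helper names are stand-ins chosen here) and checks the identity exhaustively over a small prime field using Fermat inversion:

    /* idcheck.c - illustrative sketch; build with: cc -O2 -o idcheck idcheck.c */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define P 65521u /* small prime standing in for the secp256k1 field/group order */

    /* (a*b) mod P, using a 64-bit intermediate to avoid overflow. */
    static uint32_t mulmod(uint32_t a, uint32_t b) { return (uint32_t)(((uint64_t)a * b) % P); }

    /* a^(P-2) mod P by square-and-multiply (Fermat's little theorem); valid for a != 0 mod P. */
    static uint32_t invmod(uint32_t a) {
        uint32_t r = 1, base = a % P, e = P - 2;
        while (e) {
            if (e & 1) r = mulmod(r, base);
            base = mulmod(base, base);
            e >>= 1;
        }
        return r;
    }

    int main(void) {
        uint32_t x;
        for (x = 2; x < P; ++x) {
            uint32_t lhs = (invmod((invmod(x) + P - 1) % P) + 1) % P; /* 1/(1/x-1) + 1 */
            uint32_t rhs = (P - invmod((x + P - 1) % P)) % P;         /* -1/(x-1) */
            assert(lhs == rhs);
        }
        printf("identity holds for all x in [2, %u)\n", P);
        return 0;
    }

Because the identity composes two inversions with additions and a negation, an incorrect inverse for some input is very unlikely to cancel out, which is what makes it a useful self-check for both the field and the scalar inverse implementations.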