From 9be5c5cbd92803f5e7bce7cd20235500ecf9dbdf Mon Sep 17 00:00:00 2001
From: Dave Collins <davec@conformal.com>
Date: Fri, 20 Dec 2013 13:19:13 -0600
Subject: [PATCH] Significantly optimize signature verification.

This commit essentially rewrites all of the primitives needed to perform
the arithmetic for ECDSA signature verification of secp256k1 signatures to
significantly speed it up.  Benchmarking has shown signature verification
is roughly 10 times faster with this commit over the previous.

In particular, it introduces a new field value which is used to perform the
modular field arithmetic using fixed-precision operations specifically
tailored for the secp256k1 prime.  The field also takes advantage of
special properties of the prime for significantly faster modular reduction
than is available through generic methods.

In addition, the curve point addition and doubling have been optimized
minimize the number of field multiplications in favor field squarings
since they are quite a bit faster.  They routines also now look for
certain assumptions such as z values of 1 or equivalent z values which
can be used to further reduce the number of multiplicaitons needed when
possible.

Note there are still quite a few more optimizations that could be done
such as using precomputation for ScalarBaseMult, making use of the
secp256k1 endomorphism, and using windowed NAF, however this work already
offers significant performance improvements.

For example, testing 10000 random signature verifications resulted in:
New btcec took 15.9821565s
Old btcec took 2m34.1016716s

Closes conformal/btcd#26.
---
 btcec.go |  679 +++++++++++++++++++++++------
 field.go | 1270 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 1807 insertions(+), 142 deletions(-)
 create mode 100644 field.go

diff --git a/btcec.go b/btcec.go
index a2a823ba..28822728 100644
--- a/btcec.go
+++ b/btcec.go
@@ -6,6 +6,10 @@
 
 package btcec
 
+// References:
+//   [SECG]: Recommended Elliptic Curve Domain Parameters
+//     http://www.secg.org/download/aid-784/sec2-v2.pdf
+
 // This package operates, internally, on Jacobian coordinates. For a given
 // (x, y) position on the curve, the Jacobian coordinates are (x1, y1, z1)
 // where x = x1/z1² and y = y1/z1³. The greatest speedups come when the whole
@@ -22,6 +26,13 @@ import (
 //TODO: examine if we need to care about EC optimization as descibed here
 //      https://bitcointalk.org/index.php?topic=155054.0;all
 
+var (
+	// fieldOne is simple the integer 1 in field representation.  It is
+	// used to avoid needing to create it multiple times during the internal
+	// arithmetic.
+	fieldOne = new(fieldVal).SetInt(1)
+)
+
 // KoblitzCurve supports a koblitz curve implementation that fits the ECC Curve
 // interface from crypto/elliptic.
 type KoblitzCurve struct {
@@ -29,200 +40,584 @@ type KoblitzCurve struct {
 	q *big.Int
 }
 
-// Params returns the parameters fro the curve.
+// Params returns the parameters for the curve.
 func (curve *KoblitzCurve) Params() *elliptic.CurveParams {
 	return curve.CurveParams
 }
 
+// bigAffineToField takes an affine point (x, y) as big integers and converts
+// it to an affine point as field values.
+func (curve *KoblitzCurve) bigAffineToField(x, y *big.Int) (*fieldVal, *fieldVal) {
+	x3, y3 := new(fieldVal), new(fieldVal)
+	x3.SetByteSlice(x.Bytes())
+	y3.SetByteSlice(y.Bytes())
+
+	return x3, y3
+}
+
+// fieldJacobianToBigAffine takes a Jacobian point (x, y, z) as field values and
+// converts it to an affine point as big integers.
+func (curve *KoblitzCurve) fieldJacobianToBigAffine(x, y, z *fieldVal) (*big.Int, *big.Int) {
+	// Inversions are expensive and both point addition and point doubling
+	// are faster when working with points that have a z value of one.  So,
+	// if the point needs to be converted to affine, go ahead and normalize
+	// the point itself at the same time as the calculation is the same.
+	var zInv, tempZ fieldVal
+	zInv.Set(z).Inverse()   // zInv = Z^-1
+	tempZ.SquareVal(&zInv)  // tempZ = Z^-2
+	x.Mul(&tempZ)           // X = X/Z^2 (mag: 1)
+	y.Mul(tempZ.Mul(&zInv)) // Y = Y/Z^3 (mag: 1)
+	z.SetInt(1)             // Z = 1 (mag: 1)
+
+	// Normalize the x and y values.
+	x.Normalize()
+	y.Normalize()
+
+	// Convert the field values for the now affine point to big.Ints.
+	x3, y3 := new(big.Int), new(big.Int)
+	x3.SetBytes(x.Bytes()[:])
+	y3.SetBytes(y.Bytes()[:])
+	return x3, y3
+}
+
 // IsOnCurve returns boolean if the point (x,y) is on the curve.
 // Part of the elliptic.Curve interface. This function differs from the
 // crypto/elliptic algorithm since a = 0 not -3.
 func (curve *KoblitzCurve) IsOnCurve(x, y *big.Int) bool {
-	// y² = x³ + b
-	y2 := new(big.Int).Mul(y, y) //y²
-	y2.Mod(y2, curve.P)          //y²%P
+	// Convert big ints to field values for faster arithmetic.
+	fx, fy := curve.bigAffineToField(x, y)
 
-	x3 := new(big.Int).Mul(x, x) //x²
-	x3.Mul(x3, x)                //x³
-
-	x3.Add(x3, curve.B) //x³+B
-	x3.Mod(x3, curve.P) //(x³+B)%P
-
-	return x3.Cmp(y2) == 0
+	// Elliptic curve equation for secp256k1 is: y^2 = x^3 + 7
+	y2 := new(fieldVal).SquareVal(fy).Normalize()
+	result := new(fieldVal).SquareVal(fx).Mul(fx).AddInt(7).Normalize()
+	return y2.Equals(result)
 }
 
-// zForAffine returns a Jacobian Z value for the affine point (x, y). If x and
-// y are zero, it assumes that they represent the point at infinity because (0,
-// 0) is not on the any of the curves handled here.
-func zForAffine(x, y *big.Int) *big.Int {
-	z := new(big.Int)
-	if x.Sign() != 0 || y.Sign() != 0 {
-		z.SetInt64(1)
-	}
-	return z
-}
+// addZ1AndZ2EqualsOne adds two Jacobian points that are already known to have
+// z values of 1 and stores the result in (x3, y3, z3).  That is to say
+// (x1, y1, 1) + (x2, y2, 1) = (x3, y3, z3).  It performs faster addition than
+// the generic add routine since less arithmetic is needed due to the ability to
+// avoid the z value multiplications.
+func (curve *KoblitzCurve) addZ1AndZ2EqualsOne(x1, y1, x2, y2, x3, y3, z3 *fieldVal) {
+	// To compute the point addition efficiently, this implementation splits
+	// the equation into intermediate elements which are used to minimize
+	// the number of field multiplications using the method shown at:
+	// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-mmadd-2007-bl
+	//
+	// In particular it performs the calculations using the following:
+	// H = X2-X1, HH = H^2, I = 4*HH, J = H*I, r = 2*(Y2-Y1), V = X1*I
+	// X3 = r^2-J-2*V, Y3 = r*(V-X3)-2*Y1*J, Z3 = 2*H
+	//
+	// This results in a cost of 4 field multiplications, 2 field squarings,
+	// 6 field additions, and 5 integer multiplications.
 
-// affineFromJacobian reverses the Jacobian transform. See the comment at the
-// top of the file. If the point is ∞ it returns 0, 0.
-func (curve *KoblitzCurve) affineFromJacobian(x, y, z *big.Int) (xOut, yOut *big.Int) {
-	if z.Sign() == 0 {
-		return new(big.Int), new(big.Int)
+	// When the x coordinates are the same for two points on the curve, the
+	// y coordinates either must be the same, in which case it is point
+	// doubling, or they are opposite and the result is the point at
+	// infinity per the group law for elliptic curve cryptography.
+	x1.Normalize()
+	y1.Normalize()
+	x2.Normalize()
+	y2.Normalize()
+	if x1.Equals(x2) {
+		if y1.Equals(y2) {
+			// Since x1 == x2 and y1 == y2, point doubling must be
+			// done, otherwise the addition would end up dividing
+			// by zero.
+			curve.doubleJacobian(x1, y1, fieldOne, x3, y3, z3)
+			return
+		}
+
+		// Since x1 == x2 and y1 == -y2, the sum is the point at
+		// infinity per the group law.
+		x3.SetInt(0)
+		y3.SetInt(0)
+		z3.SetInt(0)
+		return
 	}
 
-	zinv := new(big.Int).ModInverse(z, curve.P)
-	zinvsq := new(big.Int).Mul(zinv, zinv)
+	// Calculate X3, Y3, and Z3 according to the intermediate elements
+	// breakdown above.
+	var h, i, j, r, v fieldVal
+	var negJ, neg2V, negX3 fieldVal
+	h.Set(x1).Negate(1).Add(x2)                // H = X2-X1 (mag: 3)
+	i.SquareVal(&h).MulInt(4)                  // I = 4*H^2 (mag: 4)
+	j.Mul2(&h, &i)                             // J = H*I (mag: 1)
+	r.Set(y1).Negate(1).Add(y2).MulInt(2)      // r = 2*(Y2-Y1) (mag: 6)
+	v.Mul2(x1, &i)                             // V = X1*I (mag: 1)
+	negJ.Set(&j).Negate(1)                     // negJ = -J (mag: 2)
+	neg2V.Set(&v).MulInt(2).Negate(2)          // neg2V = -(2*V) (mag: 3)
+	x3.Set(&r).Square().Add(&negJ).Add(&neg2V) // X3 = r^2-J-2*V (mag: 6)
+	negX3.Set(x3).Negate(6)                    // negX3 = -X3 (mag: 7)
+	j.Mul(y1).MulInt(2).Negate(2)              // J = -(2*Y1*J) (mag: 3)
+	y3.Set(&v).Add(&negX3).Mul(&r).Add(&j)     // Y3 = r*(V-X3)-2*Y1*J (mag: 4)
+	z3.Set(&h).MulInt(2)                       // Z3 = 2*H (mag: 6)
 
-	xOut = new(big.Int).Mul(x, zinvsq)
-	xOut.Mod(xOut, curve.P)
-	zinvsq.Mul(zinvsq, zinv)
-	yOut = new(big.Int).Mul(y, zinvsq)
-	yOut.Mod(yOut, curve.P)
-	return
+	// Normalize the resulting field values to a magnitude of 1 as needed.
+	x3.Normalize()
+	y3.Normalize()
+	z3.Normalize()
 }
 
-// Add returns the sum of (x1,y1 and (x2,y2). Part of the elliptic.Curve
-// interface.
-func (curve *KoblitzCurve) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
-	z1 := zForAffine(x1, y1)
-	z2 := zForAffine(x2, y2)
-	return curve.affineFromJacobian(curve.addJacobian(x1, y1, z1, x2, y2, z2))
+// addZ1EqualsZ2 adds two Jacobian points that are already known to have the
+// same z value and stores the result in (x3, y3, z3).  That is to say
+// (x1, y1, z1) + (x2, y2, z1) = (x3, y3, z3).  It performs faster addition than
+// the generic add routine since less arithmetic is needed due to the known
+// equivalence.
+func (curve *KoblitzCurve) addZ1EqualsZ2(x1, y1, z1, x2, y2, x3, y3, z3 *fieldVal) {
+	// To compute the point addition efficiently, this implementation splits
+	// the equation into intermediate elements which are used to minimize
+	// the number of field multiplications using a slightly modified version
+	// of the method shown at:
+	// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-mmadd-2007-bl
+	//
+	// In particular it performs the calculations using the following:
+	// A = X2-X1, B = A^2, C=Y2-Y1, D = C^2, E = X1*B, F = X2*B
+	// X3 = D-E-F, Y3 = C*(E-X3)-Y1*(F-E), Z3 = Z1*A
+	//
+	// This results in a cost of 5 field multiplications, 2 field squarings,
+	// 9 field additions, and 0 integer multiplications.
+
+	// When the x coordinates are the same for two points on the curve, the
+	// y coordinates either must be the same, in which case it is point
+	// doubling, or they are opposite and the result is the point at
+	// infinity per the group law for elliptic curve cryptography.
+	x1.Normalize()
+	y1.Normalize()
+	x2.Normalize()
+	y2.Normalize()
+	if x1.Equals(x2) {
+		if y1.Equals(y2) {
+			// Since x1 == x2 and y1 == y2, point doubling must be
+			// done, otherwise the addition would end up dividing
+			// by zero.
+			curve.doubleJacobian(x1, y1, z1, x3, y3, z3)
+			return
+		}
+
+		// Since x1 == x2 and y1 == -y2, the sum is the point at
+		// infinity per the group law.
+		x3.SetInt(0)
+		y3.SetInt(0)
+		z3.SetInt(0)
+		return
+	}
+
+	// Calculate X3, Y3, and Z3 according to the intermediate elements
+	// breakdown above.
+	var a, b, c, d, e, f fieldVal
+	var negX1, negY1, negE, negX3 fieldVal
+	negX1.Set(x1).Negate(1)                // negX1 = -X1 (mag: 2)
+	negY1.Set(y1).Negate(1)                // negY1 = -Y1 (mag: 2)
+	a.Set(&negX1).Add(x2)                  // A = X2-X1 (mag: 3)
+	b.SquareVal(&a)                        // B = A^2 (mag: 1)
+	c.Set(&negY1).Add(y2)                  // C = Y2-Y1 (mag: 3)
+	d.SquareVal(&c)                        // D = C^2 (mag: 1)
+	e.Mul2(x1, &b)                         // E = X1*B (mag: 1)
+	negE.Set(&e).Negate(1)                 // negE = -E (mag: 2)
+	f.Mul2(x2, &b)                         // F = X2*B (mag: 1)
+	x3.Add2(&e, &f).Negate(3).Add(&d)      // X3 = D-E-F (mag: 5)
+	negX3.Set(x3).Negate(5).Normalize()    // negX3 = -X3 (mag: 1)
+	y3.Set(y1).Mul(f.Add(&negE)).Negate(3) // Y3 = -(Y1*(F-E)) (mag: 4)
+	y3.Add(e.Add(&negX3).Mul(&c))          // Y3 = C*(E-X3)+Y3 (mag: 5)
+	z3.Mul2(z1, &a)                        // Z3 = Z1*A (mag: 1)
+
+	// Normalize the resulting field values to a magnitude of 1 as needed.
+	x3.Normalize()
+	y3.Normalize()
 }
 
-// addJacobian takes two points in Jacobian coordinates, (x1, y1, z1) and
-// (x2, y2, z2) and returns their sum, also in Jacobian form.
-func (curve *KoblitzCurve) addJacobian(x1, y1, z1, x2, y2, z2 *big.Int) (*big.Int, *big.Int, *big.Int) {
-	// See http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
-	x3, y3, z3 := new(big.Int), new(big.Int), new(big.Int)
-	if z1.Sign() == 0 {
+// addZ2EqualsOne adds two Jacobian points when the second point is already
+// known to have a z value of 1 (and the z value for the first point is not 1)
+// and stores the result in (x3, y3, z3).  That is to say (x1, y1, z1) +
+// (x2, y2, 1) = (x3, y3, z3).  It performs faster addition than the generic
+// add routine since less arithmetic is needed due to the ability to avoid
+// multiplications by the second point's z value.
+func (curve *KoblitzCurve) addZ2EqualsOne(x1, y1, z1, x2, y2, x3, y3, z3 *fieldVal) {
+	// To compute the point addition efficiently, this implementation splits
+	// the equation into intermediate elements which are used to minimize
+	// the number of field multiplications using the method shown at:
+	// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl
+	//
+	// In particular it performs the calculations using the following:
+	// Z1Z1 = Z1^2, U2 = X2*Z1Z1, S2 = Y2*Z1*Z1Z1, H = U2-X1, HH = H^2,
+	// I = 4*HH, J = H*I, r = 2*(S2-Y1), V = X1*I
+	// X3 = r^2-J-2*V, Y3 = r*(V-X3)-2*Y1*J, Z3 = (Z1+H)^2-Z1Z1-HH
+	//
+	// This results in a cost of 7 field multiplications, 4 field squarings,
+	// 9 field additions, and 4 integer multiplications.
+
+	// When the x coordinates are the same for two points on the curve, the
+	// y coordinates either must be the same, in which case it is point
+	// doubling, or they are opposite and the result is the point at
+	// infinity per the group law for elliptic curve cryptography.  Since
+	// any number of Jacobian coordinates can represent the same affine
+	// point, the x and y values need to be converted to like terms.  Due to
+	// the assumption made for this function that the second point has a z
+	// value of 1 (z2=1), the first point is already "converted".
+	var z1z1, u2, s2 fieldVal
+	x1.Normalize()
+	y1.Normalize()
+	z1z1.SquareVal(z1)                        // Z1Z1 = Z1^2 (mag: 1)
+	u2.Set(x2).Mul(&z1z1).Normalize()         // U2 = X2*Z1Z1 (mag: 1)
+	s2.Set(y2).Mul(&z1z1).Mul(z1).Normalize() // S2 = Y2*Z1*Z1Z1 (mag: 1)
+	if x1.Equals(&u2) {
+		if y1.Equals(&s2) {
+			// Since x1 == x2 and y1 == y2, point doubling must be
+			// done, otherwise the addition would end up dividing
+			// by zero.
+			curve.doubleJacobian(x1, y1, z1, x3, y3, z3)
+			return
+		}
+
+		// Since x1 == x2 and y1 == -y2, the sum is the point at
+		// infinity per the group law.
+		x3.SetInt(0)
+		y3.SetInt(0)
+		z3.SetInt(0)
+		return
+	}
+
+	// Calculate X3, Y3, and Z3 according to the intermediate elements
+	// breakdown above.
+	var h, hh, i, j, r, rr, v fieldVal
+	var negX1, negY1, negX3 fieldVal
+	negX1.Set(x1).Negate(1)                // negX1 = -X1 (mag: 2)
+	h.Add2(&u2, &negX1)                    // H = U2-X1 (mag: 3)
+	hh.SquareVal(&h)                       // HH = H^2 (mag: 1)
+	i.Set(&hh).MulInt(4)                   // I = 4 * HH (mag: 4)
+	j.Mul2(&h, &i)                         // J = H*I (mag: 1)
+	negY1.Set(y1).Negate(1)                // negY1 = -Y1 (mag: 2)
+	r.Set(&s2).Add(&negY1).MulInt(2)       // r = 2*(S2-Y1) (mag: 6)
+	rr.SquareVal(&r)                       // rr = r^2 (mag: 1)
+	v.Mul2(x1, &i)                         // V = X1*I (mag: 1)
+	x3.Set(&v).MulInt(2).Add(&j).Negate(3) // X3 = -(J+2*V) (mag: 4)
+	x3.Add(&rr)                            // X3 = r^2+X3 (mag: 5)
+	negX3.Set(x3).Negate(5)                // negX3 = -X3 (mag: 6)
+	y3.Set(y1).Mul(&j).MulInt(2).Negate(2) // Y3 = -(2*Y1*J) (mag: 3)
+	y3.Add(v.Add(&negX3).Mul(&r))          // Y3 = r*(V-X3)+Y3 (mag: 4)
+	z3.Add2(z1, &h).Square()               // Z3 = (Z1+H)^2 (mag: 1)
+	z3.Add(z1z1.Add(&hh).Negate(2))        // Z3 = Z3-(Z1Z1+HH) (mag: 4)
+
+	// Normalize the resulting field values to a magnitude of 1 as needed.
+	x3.Normalize()
+	y3.Normalize()
+	z3.Normalize()
+}
+
+// addGeneric adds two Jacobian points (x1, y1, z1) and (x2, y2, z2) without any
+// assumptions about the z values of the two points and stores the result in
+// (x3, y3, z3).  That is to say (x1, y1, z1) + (x2, y2, z2) = (x3, y3, z3).  It
+// is the slowest of the add routines due to requiring the most arithmetic.
+func (curve *KoblitzCurve) addGeneric(x1, y1, z1, x2, y2, z2, x3, y3, z3 *fieldVal) {
+	// To compute the point addition efficiently, this implementation splits
+	// the equation into intermediate elements which are used to minimize
+	// the number of field multiplications using the method shown at:
+	// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
+	//
+	// In particular it performs the calculations using the following:
+	// Z1Z1 = Z1^2, Z2Z2 = Z2^2, U1 = X1*Z2Z2, U2 = X2*Z1Z1, S1 = Y1*Z2*Z2Z2
+	// S2 = Y2*Z1*Z1Z1, H = U2-U1, I = (2*H)^2, J = H*I, r = 2*(S2-S1)
+	// V = U1*I
+	// X3 = r^2-J-2*V, Y3 = r*(V-X3)-2*S1*J, Z3 = ((Z1+Z2)^2-Z1Z1-Z2Z2)*H
+	//
+	// This results in a cost of 11 field multiplications, 5 field squarings,
+	// 9 field additions, and 4 integer multiplications.
+
+	// When the x coordinates are the same for two points on the curve, the
+	// y coordinates either must be the same, in which case it is point
+	// doubling, or they are opposite and the result is the point at
+	// infinity.  Since any number of Jacobian coordinates can represent the
+	// same affine point, the x and y values need to be converted to like
+	// terms.
+	var z1z1, z2z2, u1, u2, s1, s2 fieldVal
+	z1z1.SquareVal(z1)                        // Z1Z1 = Z1^2 (mag: 1)
+	z2z2.SquareVal(z2)                        // Z2Z2 = Z2^2 (mag: 1)
+	u1.Set(x1).Mul(&z2z2).Normalize()         // U1 = X1*Z2Z2 (mag: 1)
+	u2.Set(x2).Mul(&z1z1).Normalize()         // U2 = X2*Z1Z1 (mag: 1)
+	s1.Set(y1).Mul(&z2z2).Mul(z2).Normalize() // S1 = Y1*Z2*Z2Z2 (mag: 1)
+	s2.Set(y2).Mul(&z1z1).Mul(z1).Normalize() // S2 = Y2*Z1*Z1Z1 (mag: 1)
+	if u1.Equals(&u2) {
+		if s1.Equals(&s2) {
+			// Since x1 == x2 and y1 == y2, point doubling must be
+			// done, otherwise the addition would end up dividing
+			// by zero.
+			curve.doubleJacobian(x1, y1, z1, x3, y3, z3)
+			return
+		}
+
+		// Since x1 == x2 and y1 == -y2, the sum is the point at
+		// infinity per the group law.
+		x3.SetInt(0)
+		y3.SetInt(0)
+		z3.SetInt(0)
+		return
+	}
+
+	// Calculate X3, Y3, and Z3 according to the intermediate elements
+	// breakdown above.
+	var h, i, j, r, rr, v fieldVal
+	var negU1, negS1, negX3 fieldVal
+	negU1.Set(&u1).Negate(1)               // negU1 = -U1 (mag: 2)
+	h.Add2(&u2, &negU1)                    // H = U2-U1 (mag: 3)
+	i.Set(&h).MulInt(2).Square()           // I = (2*H)^2 (mag: 2)
+	j.Mul2(&h, &i)                         // J = H*I (mag: 1)
+	negS1.Set(&s1).Negate(1)               // negS1 = -S1 (mag: 2)
+	r.Set(&s2).Add(&negS1).MulInt(2)       // r = 2*(S2-S1) (mag: 6)
+	rr.SquareVal(&r)                       // rr = r^2 (mag: 1)
+	v.Mul2(&u1, &i)                        // V = U1*I (mag: 1)
+	x3.Set(&v).MulInt(2).Add(&j).Negate(3) // X3 = -(J+2*V) (mag: 4)
+	x3.Add(&rr)                            // X3 = r^2+X3 (mag: 5)
+	negX3.Set(x3).Negate(5)                // negX3 = -X3 (mag: 6)
+	y3.Mul2(&s1, &j).MulInt(2).Negate(2)   // Y3 = -(2*S1*J) (mag: 3)
+	y3.Add(v.Add(&negX3).Mul(&r))          // Y3 = r*(V-X3)+Y3 (mag: 4)
+	z3.Add2(z1, z2).Square()               // Z3 = (Z1+Z2)^2 (mag: 1)
+	z3.Add(z1z1.Add(&z2z2).Negate(2))      // Z3 = Z3-(Z1Z1+Z2Z2) (mag: 4)
+	z3.Mul(&h)                             // Z3 = Z3*H (mag: 1)
+
+	// Normalize the resulting field values to a magnitude of 1 as needed.
+	x3.Normalize()
+	y3.Normalize()
+}
+
+// addJacobian adds the passed Jacobian points (x1, y1, z1) and (x2, y2, z2)
+// together and stores the result in (x3, y3, z3).
+func (curve *KoblitzCurve) addJacobian(x1, y1, z1, x2, y2, z2, x3, y3, z3 *fieldVal) {
+	// A point at infinity is the identity according to the group law for
+	// elliptic curve cryptography.  Thus, ∞ + P = P and P + ∞ = P.
+	if (x1.IsZero() && y1.IsZero()) || z1.IsZero() {
 		x3.Set(x2)
 		y3.Set(y2)
 		z3.Set(z2)
-		return x3, y3, z3
+		return
 	}
-	if z2.Sign() == 0 {
+	if (x2.IsZero() && y2.IsZero()) || z2.IsZero() {
 		x3.Set(x1)
 		y3.Set(y1)
 		z3.Set(z1)
-		return x3, y3, z3
+		return
 	}
 
-	z1z1 := new(big.Int).Mul(z1, z1)
-	z1z1.Mod(z1z1, curve.P)
-	z2z2 := new(big.Int).Mul(z2, z2)
-	z2z2.Mod(z2z2, curve.P)
-
-	u1 := new(big.Int).Mul(x1, z2z2)
-	u1.Mod(u1, curve.P)
-	u2 := new(big.Int).Mul(x2, z1z1)
-	u2.Mod(u2, curve.P)
-	h := new(big.Int).Sub(u2, u1)
-	xEqual := h.Sign() == 0
-	if h.Sign() == -1 {
-		h.Add(h, curve.P)
+	// Faster point addition can be achieved when certain assumptions are
+	// met.  For example, when both points have the same z value, arithmetic
+	// on the z values can be avoided.  This section thus checks for these
+	// conditions and calls an appropriate add function which is accelerated
+	// by using those assumptions.
+	z1.Normalize()
+	z2.Normalize()
+	isZ1One := z1.Equals(fieldOne)
+	isZ2One := z2.Equals(fieldOne)
+	switch {
+	case isZ1One && isZ2One:
+		curve.addZ1AndZ2EqualsOne(x1, y1, x2, y2, x3, y3, z3)
+		return
+	case z1.Equals(z2):
+		curve.addZ1EqualsZ2(x1, y1, z1, x2, y2, x3, y3, z3)
+		return
+	case isZ2One:
+		curve.addZ2EqualsOne(x1, y1, z1, x2, y2, x3, y3, z3)
+		return
 	}
-	i := new(big.Int).Lsh(h, 1)
-	i.Mul(i, i)
-	j := new(big.Int).Mul(h, i)
 
-	s1 := new(big.Int).Mul(y1, z2)
-	s1.Mul(s1, z2z2)
-	s1.Mod(s1, curve.P)
-	s2 := new(big.Int).Mul(y2, z1)
-	s2.Mul(s2, z1z1)
-	s2.Mod(s2, curve.P)
-	r := new(big.Int).Sub(s2, s1)
-	if r.Sign() == -1 {
-		r.Add(r, curve.P)
+	// None of the above assumptions are true, so fall back to generic
+	// point addition.
+	curve.addGeneric(x1, y1, z1, x2, y2, z2, x3, y3, z3)
+}
+
+// Add returns the sum of (x1,y1) and (x2,y2). Part of the elliptic.Curve
+// interface.
+func (curve *KoblitzCurve) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
+	// A point at infinity is the identity according to the group law for
+	// elliptic curve cryptography.  Thus, ∞ + P = P and P + ∞ = P.
+	if x1.Sign() == 0 && y1.Sign() == 0 {
+		return x2, y2
 	}
-	yEqual := r.Sign() == 0
-	if xEqual && yEqual {
-		return curve.doubleJacobian(x1, y1, z1)
+	if x2.Sign() == 0 && y2.Sign() == 0 {
+		return x1, y1
 	}
-	r.Lsh(r, 1)
-	v := new(big.Int).Mul(u1, i)
 
-	x3.Set(r)
-	x3.Mul(x3, x3)
-	x3.Sub(x3, j)
-	x3.Sub(x3, v)
-	x3.Sub(x3, v)
-	x3.Mod(x3, curve.P)
+	// Convert the affine coordinates from big integers to field values
+	// and do the point addition in Jacobian projective space.
+	fx1, fy1 := curve.bigAffineToField(x1, y1)
+	fx2, fy2 := curve.bigAffineToField(x2, y2)
+	fx3, fy3, fz3 := new(fieldVal), new(fieldVal), new(fieldVal)
+	curve.addJacobian(fx1, fy1, fieldOne, fx2, fy2, fieldOne, fx3, fy3, fz3)
 
-	y3.Set(r)
-	v.Sub(v, x3)
-	y3.Mul(y3, v)
-	s1.Mul(s1, j)
-	s1.Lsh(s1, 1)
-	y3.Sub(y3, s1)
-	y3.Mod(y3, curve.P)
+	// Convert the Jacobian coordinate field values back to affine big
+	// integers.
+	return curve.fieldJacobianToBigAffine(fx3, fy3, fz3)
+}
 
-	z3.Add(z1, z2)
-	z3.Mul(z3, z3)
-	z3.Sub(z3, z1z1)
-	z3.Sub(z3, z2z2)
-	z3.Mul(z3, h)
-	z3.Mod(z3, curve.P)
+// doubleZ1EqualsOne performs point doubling on the passed Jacobian point
+// when the point is already known to have a z value of 1 and stores
+// the result in (x3, y3, z3).  That is to say (x3, y3, z3) = 2*(x1, y1, 1).  It
+// performs faster point doubling than the generic routine since less arithmetic
+// is needed due to the ability to avoid multiplication by the z value.
+func (curve *KoblitzCurve) doubleZ1EqualsOne(x1, y1, x3, y3, z3 *fieldVal) {
+	// This function uses the assumptions that z1 is 1, thus the point
+	// doubling formulas reduce to:
+	//
+	// X3 = (3*X1^2)^2 - 8*X1*Y1^2
+	// Y3 = (3*X1^2)*(4*X1*Y1^2 - X3) - 8*Y1^4
+	// Z3 = 2*Y1
+	//
+	// To compute the above efficiently, this implementation splits the
+	// equation into intermediate elements which are used to minimize the
+	// number of field multiplications in favor of field squarings which
+	// are roughly 35% faster than field multiplications with the current
+	// implementation at the time this was written.
+	//
+	// This uses a slightly modified version of the method shown at:
+	// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-mdbl-2007-bl
+	//
+	// In particular it performs the calculations using the following:
+	// A = X1^2, B = Y1^2, C = B^2, D = 2*((X1+B)^2-A-C)
+	// E = 3*A, F = E^2, X3 = F-2*D, Y3 = E*(D-X3)-8*C
+	// Z3 = 2*Y1
+	//
+	// This results in a cost of 1 field multiplication, 5 field squarings,
+	// 6 field additions, and 5 integer multiplications.
+	var a, b, c, d, e, f fieldVal
+	z3.Set(y1).MulInt(2)                     // Z3 = 2*Y1 (mag: 2)
+	a.SquareVal(x1)                          // A = X1^2 (mag: 1)
+	b.SquareVal(y1)                          // B = Y1^2 (mag: 1)
+	c.SquareVal(&b)                          // C = B^2 (mag: 1)
+	b.Add(x1).Square()                       // B = (X1+B)^2 (mag: 1)
+	d.Set(&a).Add(&c).Negate(2)              // D = -(A+C) (mag: 3)
+	d.Add(&b).MulInt(2)                      // D = 2*(B+D)(mag: 8)
+	e.Set(&a).MulInt(3)                      // E = 3*A (mag: 3)
+	f.SquareVal(&e)                          // F = E^2 (mag: 1)
+	x3.Set(&d).MulInt(2).Negate(16)          // X3 = -(2*D) (mag: 17)
+	x3.Add(&f)                               // X3 = F+X3 (mag: 18)
+	f.Set(x3).Negate(18).Add(&d).Normalize() // F = D-X3 (mag: 1)
+	y3.Set(&c).MulInt(8).Negate(8)           // Y3 = -(8*C) (mag: 9)
+	y3.Add(f.Mul(&e))                        // Y3 = E*F+Y3 (mag: 10)
 
-	return x3, y3, z3
+	// Normalize the field values back to a magnitude of 1.
+	x3.Normalize()
+	y3.Normalize()
+	z3.Normalize()
+}
+
+// doubleGeneric performs point doubling on the passed Jacobian point without
+// any assumptions about the z value and stores the result in (x3, y3, z3).
+// That is to say (x3, y3, z3) = 2*(x1, y1, z1).  It is the slowest of the point
+// doubling routines due to requiring the most arithmetic.
+func (cuve *KoblitzCurve) doubleGeneric(x1, y1, z1, x3, y3, z3 *fieldVal) {
+	// Point doubling formula for Jacobian coordinates for the secp256k1
+	// curve:
+	// X3 = (3*X1^2)^2 - 8*X1*Y1^2
+	// Y3 = (3*X1^2)*(4*X1*Y1^2 - X3) - 8*Y1^4
+	// Z3 = 2*Y1*Z1
+	//
+	// To compute the above efficiently, this implementation splits the
+	// equation into intermediate elements which are used to minimize the
+	// number of field multiplications in favor of field squarings which
+	// are roughly 35% faster than field multiplications with the current
+	// implementation at the time this was written.
+	//
+	// This uses a slightly modified version of the method shown at:
+	// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
+	//
+	// In particular it performs the calculations using the following:
+	// A = X1^2, B = Y1^2, C = B^2, D = 2*((X1+B)^2-A-C)
+	// E = 3*A, F = E^2, X3 = F-2*D, Y3 = E*(D-X3)-8*C
+	// Z3 = 2*Y1*Z1
+	//
+	// This results in a cost of 1 field multiplication, 5 field squarings,
+	// 6 field additions, and 5 integer multiplications.
+	var a, b, c, d, e, f fieldVal
+	z3.Mul2(y1, z1).MulInt(2)                // Z3 = 2*Y1*Z1 (mag: 2)
+	a.SquareVal(x1)                          // A = X1^2 (mag: 1)
+	b.SquareVal(y1)                          // B = Y1^2 (mag: 1)
+	c.SquareVal(&b)                          // C = B^2 (mag: 1)
+	b.Add(x1).Square()                       // B = (X1+B)^2 (mag: 1)
+	d.Set(&a).Add(&c).Negate(2)              // D = -(A+C) (mag: 3)
+	d.Add(&b).MulInt(2)                      // D = 2*(B+D)(mag: 8)
+	e.Set(&a).MulInt(3)                      // E = 3*A (mag: 3)
+	f.SquareVal(&e)                          // F = E^2 (mag: 1)
+	x3.Set(&d).MulInt(2).Negate(16)          // X3 = -(2*D) (mag: 17)
+	x3.Add(&f)                               // X3 = F+X3 (mag: 18)
+	f.Set(x3).Negate(18).Add(&d).Normalize() // F = D-X3 (mag: 1)
+	y3.Set(&c).MulInt(8).Negate(8)           // Y3 = -(8*C) (mag: 9)
+	y3.Add(f.Mul(&e))                        // Y3 = E*F+Y3 (mag: 10)
+
+	// Normalize the field values back to a magnitude of 1.
+	x3.Normalize()
+	y3.Normalize()
+	z3.Normalize()
+}
+
+// doubleJacobian doubles the passed Jacobian point (x1, y1, z1) and stores the
+// result in (x3, y3, z3).
+func (curve *KoblitzCurve) doubleJacobian(x1, y1, z1, x3, y3, z3 *fieldVal) {
+	// Doubling a point at infinity is still infinity.
+	if y1.IsZero() || z1.IsZero() {
+		x3.SetInt(0)
+		y3.SetInt(0)
+		z3.SetInt(0)
+		return
+	}
+
+	// Slightly faster point doubling can be achieved when the z value is 1
+	// by avoiding the multiplication on the z value.  This section calls
+	// a point doubling function which is accelerated by using that
+	// assumption when possible.
+	if z1.Normalize().Equals(fieldOne) {
+		curve.doubleZ1EqualsOne(x1, y1, x3, y3, z3)
+		return
+	}
+
+	// Fall back to generic point doubling which works with arbitrary z
+	// values.
+	curve.doubleGeneric(x1, y1, z1, x3, y3, z3)
 }
 
 // Double returns 2*(x1,y1). Part of the elliptic.Curve interface.
 func (curve *KoblitzCurve) Double(x1, y1 *big.Int) (*big.Int, *big.Int) {
-	z1 := zForAffine(x1, y1)
-	return curve.affineFromJacobian(curve.doubleJacobian(x1, y1, z1))
-}
+	if y1.Sign() == 0 {
+		return new(big.Int), new(big.Int)
+	}
 
-// doubleJacobian takes a point in Jacobian coordinates, (x, y, z), and
-// returns its double, also in Jacobian form.
-func (curve *KoblitzCurve) doubleJacobian(x, y, z *big.Int) (*big.Int, *big.Int, *big.Int) {
-	// See http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
+	// Convert the affine coordinates from big integers to field values
+	// and do the point doubling in Jacobian projective space.
+	fx1, fy1 := curve.bigAffineToField(x1, y1)
+	fx3, fy3, fz3 := new(fieldVal), new(fieldVal), new(fieldVal)
+	curve.doubleJacobian(fx1, fy1, fieldOne, fx3, fy3, fz3)
 
-	a := new(big.Int).Mul(x, x) //X1²
-	b := new(big.Int).Mul(y, y) //Y1²
-	c := new(big.Int).Mul(b, b) //B²
-
-	d := new(big.Int).Add(x, b) //X1+B
-	d.Mul(d, d)                 //(X1+B)²
-	d.Sub(d, a)                 //(X1+B)²-A
-	d.Sub(d, c)                 //(X1+B)²-A-C
-	d.Mul(d, big.NewInt(2))     //2*((X1+B)²-A-C)
-
-	e := new(big.Int).Mul(big.NewInt(3), a) //3*A
-	f := new(big.Int).Mul(e, e)             //E²
-
-	x3 := new(big.Int).Mul(big.NewInt(2), d) //2*D
-	x3.Sub(f, x3)                            //F-2*D
-	x3.Mod(x3, curve.P)
-
-	y3 := new(big.Int).Sub(d, x3)                  //D-X3
-	y3.Mul(e, y3)                                  //E*(D-X3)
-	y3.Sub(y3, new(big.Int).Mul(big.NewInt(8), c)) //E*(D-X3)-8*C
-	y3.Mod(y3, curve.P)
-
-	z3 := new(big.Int).Mul(y, z) //Y1*Z1
-	z3.Mul(big.NewInt(2), z3)    //3*Y1*Z1
-	z3.Mod(z3, curve.P)
-
-	return x3, y3, z3
+	// Convert the Jacobian coordinate field values back to affine big
+	// integers.
+	return curve.fieldJacobianToBigAffine(fx3, fy3, fz3)
 }
 
 // ScalarMult returns k*(Bx, By) where k is a big endian integer.
 // Part of the elliptic.Curve interface.
 func (curve *KoblitzCurve) ScalarMult(Bx, By *big.Int, k []byte) (*big.Int, *big.Int) {
-	Bz := new(big.Int).SetInt64(1)
-	x, y, z := new(big.Int), new(big.Int), new(big.Int)
+	// This uses the left to right binary method for point multiplication:
 
-	for _, byte := range k {
+	// Point Q = ∞ (point at infinity).
+	qx, qy, qz := new(fieldVal), new(fieldVal), new(fieldVal)
+
+	// Point P = the point to multiply the scalar with.
+	px, py := curve.bigAffineToField(Bx, By)
+	pz := fieldOne
+
+	// Double and add as necessary depending on the bits set in the scalar.
+	for _, byteVal := range k {
 		for bitNum := 0; bitNum < 8; bitNum++ {
-			x, y, z = curve.doubleJacobian(x, y, z)
-			if byte&0x80 == 0x80 {
-				x, y, z = curve.addJacobian(Bx, By, Bz, x, y, z)
+			// Q = 2*Q
+			curve.doubleJacobian(qx, qy, qz, qx, qy, qz)
+			if byteVal&0x80 == 0x80 {
+				// Q = Q + P
+				curve.addJacobian(qx, qy, qz, px, py, pz, qx,
+					qy, qz)
 			}
-			byte <<= 1
+			byteVal <<= 1
 		}
 	}
 
-	return curve.affineFromJacobian(x, y, z)
+	// Convert the Jacobian coordinate field values back to affine big.Ints.
+	return curve.fieldJacobianToBigAffine(qx, qy, qz)
 }
 
 // ScalarBaseMult returns k*G where G is the base point of the group and k is a
@@ -247,7 +642,7 @@ func initAll() {
 }
 
 func initS256() {
-	// See SEC 2 section 2.7.1
+	// See [SECG] section 2.7.1
 	secp256k1.CurveParams = new(elliptic.CurveParams)
 	secp256k1.P, _ = new(big.Int).SetString("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F", 16)
 	secp256k1.N, _ = new(big.Int).SetString("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141", 16)
diff --git a/field.go b/field.go
new file mode 100644
index 00000000..ac123e9e
--- /dev/null
+++ b/field.go
@@ -0,0 +1,1270 @@
+// Copyright (c) 2013 Conformal Systems LLC.
+// Copyright (c) 2013 Dave Collins
+// Use of this source code is governed by an ISC
+// license that can be found in the LICENSE file.
+
+package btcec
+
+// References:
+//   [HAC]: Handbook of Applied Cryptography Menezes, van Oorschot, Vanstone.
+//     http://cacr.uwaterloo.ca/hac/
+
+// All elliptic curve operations for secp256k1 are done in a finite field
+// characterized by a 256-bit prime.  Given this precision is larger than the
+// biggest available native type, obviously some form of bignum math is needed.
+// This package implements specialized fixed-precision field arithmetic rather
+// than relying on an arbitrary-precision arithmetic package such as math/big
+// for dealing with the field math since the size is known.  As a result, rather
+// large performance gains are achieved by taking advantage of many
+// optimizations not available to arbitrary-precision arithmetic and generic
+// modular arithmetic algorithms.
+//
+// There are various ways to internally represent each finite field element.
+// For example, the most obvious representation would be to use an array of 4
+// uint64s (64 bits * 4 = 256 bits).  However, that representation suffers from
+// a couple of issues.  First, there is no native Go type large enough to handle
+// the intermediate results while adding or multiplying two 64-bit numbers, and
+// second there is no space left for overflows when performing the intermediate
+// arithmetic between each array element which would lead to expensive carry
+// propagation.
+//
+// Given the above, this implementation represents the the field elements as
+// 10 uint32s with each word (array entry) treated as base 2^26.  This was
+// chosen for the following reasons:
+// 1) Most systems at the current time are 64-bit (or at least have 64-bit
+//    registers available for specialized purposes such as MMX) so the
+//    intermediate results can typically be done using a native register (and
+//    using uint64s to avoid the need for additional half-word arithmetic)
+// 2) In order to allow addition of the internal words without having to
+//    propagate the the carry, the max normalized value for each register must
+//    be less than the number of bits available in the register
+// 3) Since we're dealing with 32-bit values, 64-bits of overflow is a
+//    reasonable choice for #2
+// 4) Given the need for 256-bits of precision and the properties stated in #1,
+//    #2, and #3, the representation which best accomodates this is 10 uint32s
+//    with base 2^26 (26 bits * 10 = 260 bits, so the final word only needs 22
+//    bits) which leaves the desired 64 bits (32 * 10 = 320, 320 - 256 = 64) for
+//    overflow
+//
+// Since it is so important that the field arithmetic is extremely fast for
+// high performance crypto, this package does not perform any validation where
+// it ordinarily would.  For example, some functions only give the correct
+// result is the field is normalized and there is no checking to ensure it is.
+// While I typically prefer to ensure all state and input is valid for most
+// packages, this code is really only used internally and every extra check
+// counts.
+
+import (
+	"encoding/hex"
+)
+
+// Constants used to make the code more readable.
+const (
+	twoBitsMask   = 0x3
+	fourBitsMask  = 0xf
+	sixBitsMask   = 0x3f
+	eightBitsMask = 0xff
+)
+
+// Constants related to the field representation.
+const (
+	// fieldWords is the number of words used to internally represent the
+	// 256-bit value.
+	fieldWords = 10
+
+	// fieldBase is the exponent used to form the numeric base of each word.
+	// 2^(fieldBase*i) where i is the word position.
+	fieldBase = 26
+
+	// fieldOverflowBits is the minimum number of "overflow" bits for each
+	// word in the field value.
+	fieldOverflowBits = 32 - fieldBase
+
+	// fieldBaseMask is the mask for the bits in each word needed to
+	// represent the numeric base of each word (except the most significant
+	// word).
+	fieldBaseMask = (1 << fieldBase) - 1
+
+	// fieldMSBBits is the number of bits in the most significant word used
+	// to represent the value.
+	fieldMSBBits = 256 - (fieldBase * (fieldWords - 1))
+
+	// fieldMSBMask is the mask for the bits in the most significant word
+	// needed to represent the value.
+	fieldMSBMask = (1 << fieldMSBBits) - 1
+
+	// fieldPrimeWordZero is word zero of the secp256k1 prime in the
+	// internal field representation.  It is used during modular reduction
+	// and negation.
+	fieldPrimeWordZero = 0x3fffc2f
+
+	// fieldPrimeWordOne is word one of the secp256k1 prime in the
+	// internal field representation.  It is used during modular reduction
+	// and negation.
+	fieldPrimeWordOne = 0x3ffffbf
+)
+
+// fieldVal implements optimized fixed-precision arithmetic over the
+// secp256k1 finite field.  This means all arithmetic is performed modulo
+// 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f.  It
+// represents each 256-bit value as 10 32-bit integers in base 2^26.  This
+// provides 6 bits of overflow in each word (10 bits in the most significant
+// word) for a total of 64 bits of overflow (9*6 + 10 = 64).  It only implements
+// the arithmetic needed for elliptic curve operations.
+//
+// The following depicts the internal representation:
+// 	 -----------------------------------------------------------------
+// 	|        n[9]       |        n[8]       | ... |        n[0]       |
+// 	| 32 bits available | 32 bits available | ... | 32 bits available |
+// 	| 22 bits for value | 26 bits for value | ... | 26 bits for value |
+// 	| 10 bits overflow  |  6 bits overflow  | ... |  6 bits overflow  |
+// 	| Mult: 2^(26*9)    | Mult: 2^(26*8)    | ... | Mult: 2^(26*0)    |
+// 	 -----------------------------------------------------------------
+//
+// For example, consider the number 2^49 + 1.  It would be represented as:
+// 	n[0] = 1
+// 	n[1] = 2^23
+// 	n[2..9] = 0
+//
+// The full 256-bit value is then calculated by looping i from 9..0 and
+// doing sum(n[i] * 2^(26i)) like so:
+// 	n[9] * 2^(26*9) = 0    * 2^234 = 0
+// 	n[8] * 2^(26*8) = 0    * 2^208 = 0
+// 	...
+// 	n[1] * 2^(26*1) = 2^23 * 2^26  = 2^49
+// 	n[0] * 2^(26*0) = 1    * 2^0   = 1
+// 	Sum: 0 + 0 + ... + 2^49 + 1 = 2^49 + 1
+type fieldVal struct {
+	n [10]uint32
+}
+
+// String returns the field value as a human-readable hex string.
+func (f fieldVal) String() string {
+	t := new(fieldVal).Set(&f).Normalize()
+	return hex.EncodeToString(t.Bytes()[:])
+}
+
+// Zero sets the field value to zero.  A newly created field value is already
+// set to zero.  This function can be useful to clear an existing field value
+// for reuse.
+func (f *fieldVal) Zero() {
+	f.n[0] = 0
+	f.n[1] = 0
+	f.n[2] = 0
+	f.n[3] = 0
+	f.n[4] = 0
+	f.n[5] = 0
+	f.n[6] = 0
+	f.n[7] = 0
+	f.n[8] = 0
+	f.n[9] = 0
+}
+
+// Set sets the field value equal to the passed value.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f := new(fieldVal).Set(f2).Add(1) so that f = f2 + 1 where f2 is not
+// modified.
+func (f *fieldVal) Set(val *fieldVal) *fieldVal {
+	*f = *val
+	return f
+}
+
+// SetInt sets the field value to the passed integer.  This is a convenience
+// function since it is fairly common to perform some arithemetic with small
+// native integers.
+//
+// The field value is returned to support chaining.  This enables syntax such
+// as f := new(fieldVal).SetInt(2).Mul(f2) so that f = 2 * f2.
+func (f *fieldVal) SetInt(ui uint) *fieldVal {
+	f.Zero()
+	f.n[0] = uint32(ui)
+	return f
+}
+
+// SetBytes packs the passed 32-byte big-endian value into the internal field
+// value representation.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f := new(fieldVal).SetBytes(byteArray).Mul(f2) so that f = ba * f2.
+func (f *fieldVal) SetBytes(b *[32]byte) *fieldVal {
+	// Pack the 256 total bits across the 10 uint32 words with a max of
+	// 26-bits per word.  This could be done with a couple of for loops,
+	// but this unrolled version is significantly faster.  Benchmarks show
+	// this is about 34 times faster than the variant which uses loops.
+	f.n[0] = uint32(b[31]) | uint32(b[30])<<8 | uint32(b[29])<<16 |
+		(uint32(b[28])&twoBitsMask)<<24
+	f.n[1] = uint32(b[28])>>2 | uint32(b[27])<<6 | uint32(b[26])<<14 |
+		(uint32(b[25])&fourBitsMask)<<22
+	f.n[2] = uint32(b[25])>>4 | uint32(b[24])<<4 | uint32(b[23])<<12 |
+		(uint32(b[22])&sixBitsMask)<<20
+	f.n[3] = uint32(b[22])>>6 | uint32(b[21])<<2 | uint32(b[20])<<10 |
+		uint32(b[19])<<18
+	f.n[4] = uint32(b[18]) | uint32(b[17])<<8 | uint32(b[16])<<16 |
+		(uint32(b[15])&twoBitsMask)<<24
+	f.n[5] = uint32(b[15])>>2 | uint32(b[14])<<6 | uint32(b[13])<<14 |
+		(uint32(b[12])&fourBitsMask)<<22
+	f.n[6] = uint32(b[12])>>4 | uint32(b[11])<<4 | uint32(b[10])<<12 |
+		(uint32(b[9])&sixBitsMask)<<20
+	f.n[7] = uint32(b[9])>>6 | uint32(b[8])<<2 | uint32(b[7])<<10 |
+		uint32(b[6])<<18
+	f.n[8] = uint32(b[5]) | uint32(b[4])<<8 | uint32(b[3])<<16 |
+		(uint32(b[2])&twoBitsMask)<<24
+	f.n[9] = uint32(b[2])>>2 | uint32(b[1])<<6 | uint32(b[0])<<14
+	return f
+}
+
+// SetByteSlice packs the passed big-endian value into the internal field value
+// representation.  Only the first 32-bytes are used.  As a result, it is up to
+// the caller to ensure numbers of the appropriate size are used or the value
+// will be truncated.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f := new(fieldVal).SetByteSlice(byteSlice)
+func (f *fieldVal) SetByteSlice(b []byte) *fieldVal {
+	var b32 [32]byte
+	for i := 0; i < len(b); i++ {
+		if i < 32 {
+			b32[i+(32-len(b))] = b[i]
+		}
+	}
+	return f.SetBytes(&b32)
+}
+
+// SetHex decodes the passed big-endian hex string into the internal field value
+// representation.  Only the first 32-bytes are used.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f := new(fieldVal).SetHex("0abc").Add(1) so that f = 0x0abc + 1
+func (f *fieldVal) SetHex(hexString string) *fieldVal {
+	if len(hexString)%2 != 0 {
+		hexString = "0" + hexString
+	}
+	bytes, _ := hex.DecodeString(hexString)
+	return f.SetByteSlice(bytes)
+}
+
+// Normalize normalizes the internal field words into the desired range and
+// performs fast modular reduction over the secp256k1 prime by making use of the
+// special form of the prime.
+func (f *fieldVal) Normalize() *fieldVal {
+	// The field representation leaves 6 bits of overflow in each
+	// word so intermediate calculations can be performed without needing
+	// to propagate the carry to each higher word during the calculations.
+	// In order to normalize, first we need to "compact" the full 256-bit
+	// value to the right and treat the additional 64 leftmost bits as
+	// the magnitude.
+	m := f.n[0]
+	t0 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[1]
+	t1 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[2]
+	t2 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[3]
+	t3 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[4]
+	t4 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[5]
+	t5 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[6]
+	t6 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[7]
+	t7 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[8]
+	t8 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[9]
+	t9 := m & fieldMSBMask
+	m = m >> fieldMSBBits
+
+	// At this point, if the magnitude is greater than 0, the overall value
+	// is greater than the max possible 256-bit value.  In particular, it is
+	// "how many times larger" than the max value it is.  Since this field
+	// is doing arithmetic modulo the secp256k1 prime, we need to perform
+	// modular reduction over the prime.
+	//
+	// Per [HAC] section 14.3.4: Reduction method of moduli of special form,
+	// when the modulus is of the special form m = b^t - c, highly efficient
+	// reduction can be achieved.
+	//
+	// The secp256k1 prime is equivalent to 2^256 - 4294968273, so it fits
+	// this criteria.
+	//
+	// 4294968273 in field representation (base 2^26) is:
+	// n[0] = 977
+	// n[1] = 64
+	// That is to say (2^26 * 64) + 977 = 4294968273
+	//
+	// The algorithm presented in the referenced section typically repeats
+	// until the quotient is zero.  However, due to our field representation
+	// we already know at least how many times we would need to repeat as
+	// it's the value currently in m.  Thus we can simply multiply the
+	// magnitude by the field representation of the prime and do a single
+	// iteration.  Notice that nothing will be changed when the magnitude is
+	// zero, so we could skip this in that case, however always running
+	// regardless allows it to run in constant time.
+	r := t0 + m*977
+	t0 = r & fieldBaseMask
+	r = (r >> fieldBase) + t1 + m*64
+	t1 = r & fieldBaseMask
+	r = (r >> fieldBase) + t2
+	t2 = r & fieldBaseMask
+	r = (r >> fieldBase) + t3
+	t3 = r & fieldBaseMask
+	r = (r >> fieldBase) + t4
+	t4 = r & fieldBaseMask
+	r = (r >> fieldBase) + t5
+	t5 = r & fieldBaseMask
+	r = (r >> fieldBase) + t6
+	t6 = r & fieldBaseMask
+	r = (r >> fieldBase) + t7
+	t7 = r & fieldBaseMask
+	r = (r >> fieldBase) + t8
+	t8 = r & fieldBaseMask
+	r = (r >> fieldBase) + t9
+	t9 = r & fieldMSBMask
+
+	// At this point, the result will be in the range 0 <= result <=
+	// prime + (2^64 - c).  Therefore, one more subtraction of the prime
+	// might be needed if the current result is greater than or equal to the
+	// prime.  The following does the final reduction in constant time.
+	// Note that the if/else here intentionally does the bitwise OR with
+	// zero even though it won't change the value to ensure constant time
+	// between the branches.
+	var mask int32
+	if t0 < fieldPrimeWordZero {
+		mask |= -1
+	} else {
+		mask |= 0
+	}
+	if t1 < fieldPrimeWordOne {
+		mask |= -1
+	} else {
+		mask |= 0
+	}
+	if t2 < fieldBaseMask {
+		mask |= -1
+	} else {
+		mask |= 0
+	}
+	if t3 < fieldBaseMask {
+		mask |= -1
+	} else {
+		mask |= 0
+	}
+	if t4 < fieldBaseMask {
+		mask |= -1
+	} else {
+		mask |= 0
+	}
+	if t5 < fieldBaseMask {
+		mask |= -1
+	} else {
+		mask |= 0
+	}
+	if t6 < fieldBaseMask {
+		mask |= -1
+	} else {
+		mask |= 0
+	}
+	if t7 < fieldBaseMask {
+		mask |= -1
+	} else {
+		mask |= 0
+	}
+	if t8 < fieldBaseMask {
+		mask |= -1
+	} else {
+		mask |= 0
+	}
+	if t9 < fieldMSBMask {
+		mask |= -1
+	} else {
+		mask |= 0
+	}
+	t0 = t0 - uint32(^mask&fieldPrimeWordZero)
+	t1 = t1 - uint32(^mask&fieldPrimeWordOne)
+	t2 = t2 & uint32(mask)
+	t3 = t3 & uint32(mask)
+	t4 = t4 & uint32(mask)
+	t5 = t5 & uint32(mask)
+	t6 = t6 & uint32(mask)
+	t7 = t7 & uint32(mask)
+	t8 = t8 & uint32(mask)
+	t9 = t9 & uint32(mask)
+
+	// Finally, set the normalized and reduced words.
+	f.n[0] = t0
+	f.n[1] = t1
+	f.n[2] = t2
+	f.n[3] = t3
+	f.n[4] = t4
+	f.n[5] = t5
+	f.n[6] = t6
+	f.n[7] = t7
+	f.n[8] = t8
+	f.n[9] = t9
+	return f
+}
+
+// PutBytes unpacks the field value to a 32-byte big-endian value using the
+// passed byte array.  There is a similar function, Bytes, which unpacks the
+// field value into a new array and returns that.  This version is provided
+// since it can be useful to cut down on the number of allocations by allowing
+// the caller to reuse a buffer.
+//
+// The field value must be normalized for this function to return the correct
+// result.
+func (f *fieldVal) PutBytes(b *[32]byte) {
+	// Unpack the 256 total bits from the 10 uint32 words with a max of
+	// 26-bits per word.  This could be done with a couple of for loops,
+	// but this unrolled version is a bit faster.  Benchmarks show this is
+	// about 10 times faster than the variant which uses loops.
+	b[31] = byte(f.n[0] & eightBitsMask)
+	b[30] = byte((f.n[0] >> 8) & eightBitsMask)
+	b[29] = byte((f.n[0] >> 16) & eightBitsMask)
+	b[28] = byte((f.n[0]>>24)&twoBitsMask | (f.n[1]&sixBitsMask)<<2)
+	b[27] = byte((f.n[1] >> 6) & eightBitsMask)
+	b[26] = byte((f.n[1] >> 14) & eightBitsMask)
+	b[25] = byte((f.n[1]>>22)&fourBitsMask | (f.n[2]&fourBitsMask)<<4)
+	b[24] = byte((f.n[2] >> 4) & eightBitsMask)
+	b[23] = byte((f.n[2] >> 12) & eightBitsMask)
+	b[22] = byte((f.n[2]>>20)&sixBitsMask | (f.n[3]&twoBitsMask)<<6)
+	b[21] = byte((f.n[3] >> 2) & eightBitsMask)
+	b[20] = byte((f.n[3] >> 10) & eightBitsMask)
+	b[19] = byte((f.n[3] >> 18) & eightBitsMask)
+	b[18] = byte(f.n[4] & eightBitsMask)
+	b[17] = byte((f.n[4] >> 8) & eightBitsMask)
+	b[16] = byte((f.n[4] >> 16) & eightBitsMask)
+	b[15] = byte((f.n[4]>>24)&twoBitsMask | (f.n[5]&sixBitsMask)<<2)
+	b[14] = byte((f.n[5] >> 6) & eightBitsMask)
+	b[13] = byte((f.n[5] >> 14) & eightBitsMask)
+	b[12] = byte((f.n[5]>>22)&fourBitsMask | (f.n[6]&fourBitsMask)<<4)
+	b[11] = byte((f.n[6] >> 4) & eightBitsMask)
+	b[10] = byte((f.n[6] >> 12) & eightBitsMask)
+	b[9] = byte((f.n[6]>>20)&sixBitsMask | (f.n[7]&twoBitsMask)<<6)
+	b[8] = byte((f.n[7] >> 2) & eightBitsMask)
+	b[7] = byte((f.n[7] >> 10) & eightBitsMask)
+	b[6] = byte((f.n[7] >> 18) & eightBitsMask)
+	b[5] = byte(f.n[8] & eightBitsMask)
+	b[4] = byte((f.n[8] >> 8) & eightBitsMask)
+	b[3] = byte((f.n[8] >> 16) & eightBitsMask)
+	b[2] = byte((f.n[8]>>24)&twoBitsMask | (f.n[9]&sixBitsMask)<<2)
+	b[1] = byte((f.n[9] >> 6) & eightBitsMask)
+	b[0] = byte((f.n[9] >> 14) & eightBitsMask)
+}
+
+// Bytes unpacks the field value to a 32-byte big-endian value.  See PutBytes
+// for a variant that allows the a buffer to be passed which can be useful to
+// to cut down on the number of allocations by allowing the caller to reuse a
+// buffer.
+//
+// The field value must be normalized for this function to return correct
+// result.
+func (f *fieldVal) Bytes() *[32]byte {
+	b := new([32]byte)
+	f.PutBytes(b)
+	return b
+}
+
+// IsZero returns whether or not the field value is equal to zero.
+func (f *fieldVal) IsZero() bool {
+	// The value can only be zero if no bits are set in any of the words.
+	// This is a constant time implementation.
+	bits := f.n[0] | f.n[1] | f.n[2] | f.n[3] | f.n[4] |
+		f.n[5] | f.n[6] | f.n[7] | f.n[8] | f.n[9]
+
+	return bits == 0
+}
+
+// IsOdd returns whether or not the field value is an odd number.
+//
+// The field value must be normalized for this function to return correct
+// result.
+func (f *fieldVal) IsOdd() bool {
+	// Only odd numbers have the bottom bit set.
+	return f.n[0]&1 == 1
+}
+
+// Equals returns whether or not the two field values are the same.  Both
+// field values being compared must be normalized for this function to return
+// the correct result.
+func (f *fieldVal) Equals(val *fieldVal) bool {
+	// Xor only sets bits when they are different, so the two field values
+	// can only be the same if no bits are set after xoring each word.
+	// This is a constant time implementation.
+	bits := (f.n[0] ^ val.n[0]) | (f.n[1] ^ val.n[1]) | (f.n[2] ^ val.n[2]) |
+		(f.n[3] ^ val.n[3]) | (f.n[4] ^ val.n[4]) | (f.n[5] ^ val.n[5]) |
+		(f.n[6] ^ val.n[6]) | (f.n[7] ^ val.n[7]) | (f.n[8] ^ val.n[8]) |
+		(f.n[9] ^ val.n[9])
+
+	return bits == 0
+}
+
+// NegateVal negates the passed value and stores the result in f.  The caller
+// must provide the magnitude of the passed value for a correct result.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f.NegateVal(f2).AddInt(1) so that f = -f2 + 1.
+func (f *fieldVal) NegateVal(val *fieldVal, magnitude uint32) *fieldVal {
+	// Negation in the field is just the prime minus the value.  However,
+	// in order to allow negation against a field value without having to
+	// normalize/reduce it first, multiply by the magnitude (that is how
+	// "far" away it is from the normalized value) to adjust.  Also, since
+	// negating a value pushes it one more order of magnitude away from the
+	// normalized range, add 1 to compensate.
+	//
+	// For some intuition here, imagine you're performing mod 12 arithmetic
+	// (picture a clock) and you are negating the number 7.  So you start at
+	// 12 (which is of course 0 under mod 12) and count backwards (left on
+	// the clock) 7 times to arrive at 5.  Notice this is just 12-7 = 5.
+	// Now, assume you're starting with 19, which is a number that is
+	// already larger than the modulus and congruent to 7 (mod 12).  When a
+	// value is already in the desired range, its magnitude is 1.  Since 19
+	// is an additional "step", its magnitude (mod 12) is 2.  Since any
+	// multiple of the modulus is conguent to zero (mod m), the answer can
+	// be shortcut by simply mulplying the magnitude by the modulus and
+	// subtracting.  Keeping with the example, this would be (2*12)-19 = 5.
+	f.n[0] = (magnitude+1)*fieldPrimeWordZero - val.n[0]
+	f.n[1] = (magnitude+1)*fieldPrimeWordOne - val.n[1]
+	f.n[2] = (magnitude+1)*fieldBaseMask - val.n[2]
+	f.n[3] = (magnitude+1)*fieldBaseMask - val.n[3]
+	f.n[4] = (magnitude+1)*fieldBaseMask - val.n[4]
+	f.n[5] = (magnitude+1)*fieldBaseMask - val.n[5]
+	f.n[6] = (magnitude+1)*fieldBaseMask - val.n[6]
+	f.n[7] = (magnitude+1)*fieldBaseMask - val.n[7]
+	f.n[8] = (magnitude+1)*fieldBaseMask - val.n[8]
+	f.n[9] = (magnitude+1)*fieldMSBMask - val.n[9]
+
+	return f
+}
+
+// Negate negates the field value.  The existing field value is modified.  The
+// caller must provide the magnitude of the field value for a correct result.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f.Negate().AddInt(1) so that f = -f + 1.
+func (f *fieldVal) Negate(magnitude uint32) *fieldVal {
+	return f.NegateVal(f, magnitude)
+}
+
+// AddInt adds the passed integer to the existing field value and stores the
+// result in f.  This is a convenience function since it is fairly common to
+// perform some arithemetic with small native integers.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f.AddInt(1).Add(f2) so that f = f + 1 + f2.
+func (f *fieldVal) AddInt(ui uint) *fieldVal {
+	// Since the field representation intentionally provides overflow bits,
+	// it's ok to use carryless addition as the carry bit is safely part of
+	// the word and will be normalized out.
+	f.n[0] += uint32(ui)
+
+	return f
+}
+
+// Add adds the passed value to the existing field value and stores the result
+// in f.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f.Add(f2).AddInt(1) so that f = f + f2 + 1.
+func (f *fieldVal) Add(val *fieldVal) *fieldVal {
+	// Since the field representation intentionally provides overflow bits,
+	// it's ok to use carryless addition as the carry bit is safely part of
+	// each word and will be normalized out.  This could obviously be done
+	// in a loop, but the unrolled version is faster.
+	f.n[0] += val.n[0]
+	f.n[1] += val.n[1]
+	f.n[2] += val.n[2]
+	f.n[3] += val.n[3]
+	f.n[4] += val.n[4]
+	f.n[5] += val.n[5]
+	f.n[6] += val.n[6]
+	f.n[7] += val.n[7]
+	f.n[8] += val.n[8]
+	f.n[9] += val.n[9]
+
+	return f
+}
+
+// Add2 adds the passed two field values together and stores the result in f.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f3.Add2(f, f2).AddInt(1) so that f3 = f + f2 + 1.
+func (f *fieldVal) Add2(val *fieldVal, val2 *fieldVal) *fieldVal {
+	// Since the field representation intentionally provides overflow bits,
+	// it's ok to use carryless addition as the carry bit is safely part of
+	// each word and will be normalized out.  This could obviously be done
+	// in a loop, but the unrolled version is faster.
+	f.n[0] = val.n[0] + val2.n[0]
+	f.n[1] = val.n[1] + val2.n[1]
+	f.n[2] = val.n[2] + val2.n[2]
+	f.n[3] = val.n[3] + val2.n[3]
+	f.n[4] = val.n[4] + val2.n[4]
+	f.n[5] = val.n[5] + val2.n[5]
+	f.n[6] = val.n[6] + val2.n[6]
+	f.n[7] = val.n[7] + val2.n[7]
+	f.n[8] = val.n[8] + val2.n[8]
+	f.n[9] = val.n[9] + val2.n[9]
+
+	return f
+}
+
+// MulInt multiplies the field value by the passed int and stores the result in
+// f.  Note that this function can overflow if multiplying the value by any of
+// the individual words exceeds a max uint32.  Therefore it is important that
+// the caller ensures no overflows will occur before using this function.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f.MulInt(2).Add(f2) so that f = 2 * f + f2.
+func (f *fieldVal) MulInt(val uint) *fieldVal {
+	// Since each word of the field representation can hold up to
+	// fieldOverflowBits extra bits which will be normalized out, it's safe
+	// to multiply each word without using a larger type or carry
+	// propagation so long as the values won't overflow a uint32.  This
+	// could obviously be done in a loop, but the unrolled version is
+	// faster.
+	ui := uint32(val)
+	f.n[0] *= ui
+	f.n[1] *= ui
+	f.n[2] *= ui
+	f.n[3] *= ui
+	f.n[4] *= ui
+	f.n[5] *= ui
+	f.n[6] *= ui
+	f.n[7] *= ui
+	f.n[8] *= ui
+	f.n[9] *= ui
+
+	return f
+}
+
+// Mul multiplies the passed value to the existing field value and stores the
+// result in f.  Note that this function can overflow if multiplying any
+// of the individual words exceeds a max uint32.  In practice, this means the
+// magnitude of either value invovled in the multiplication must be a max of
+// 8.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f.Mul(f2).AddInt(1) so that f = (f * f2) + 1.
+func (f *fieldVal) Mul(val *fieldVal) *fieldVal {
+	return f.Mul2(f, val)
+}
+
+// Mul2 multiplies the passed two field values together and stores the result
+// result in f.  Note that this function can overflow if multiplying any of
+// the individual words exceeds a max uint32.  In practice, this means the
+// magnitude of either value invovled in the multiplication must be a max of
+// 8.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f3.Mul2(f, f2).AddInt(1) so that f3 = (f * f2) + 1.
+func (f *fieldVal) Mul2(val *fieldVal, val2 *fieldVal) *fieldVal {
+	// This could be done with a couple of for loops and an array to store
+	// the intermediate terms, but this unrolled version is significantly
+	// faster.
+
+	// Terms for 2^(fieldBase*0).
+	m := uint64(val.n[0]) * uint64(val2.n[0])
+	t0 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*1).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[1]) +
+		uint64(val.n[1])*uint64(val2.n[0])
+	t1 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*2).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[2]) +
+		uint64(val.n[1])*uint64(val2.n[1]) +
+		uint64(val.n[2])*uint64(val2.n[0])
+	t2 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*3).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[3]) +
+		uint64(val.n[1])*uint64(val2.n[2]) +
+		uint64(val.n[2])*uint64(val2.n[1]) +
+		uint64(val.n[3])*uint64(val2.n[0])
+	t3 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*4).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[4]) +
+		uint64(val.n[1])*uint64(val2.n[3]) +
+		uint64(val.n[2])*uint64(val2.n[2]) +
+		uint64(val.n[3])*uint64(val2.n[1]) +
+		uint64(val.n[4])*uint64(val2.n[0])
+	t4 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*5).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[5]) +
+		uint64(val.n[1])*uint64(val2.n[4]) +
+		uint64(val.n[2])*uint64(val2.n[3]) +
+		uint64(val.n[3])*uint64(val2.n[2]) +
+		uint64(val.n[4])*uint64(val2.n[1]) +
+		uint64(val.n[5])*uint64(val2.n[0])
+	t5 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*6).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[6]) +
+		uint64(val.n[1])*uint64(val2.n[5]) +
+		uint64(val.n[2])*uint64(val2.n[4]) +
+		uint64(val.n[3])*uint64(val2.n[3]) +
+		uint64(val.n[4])*uint64(val2.n[2]) +
+		uint64(val.n[5])*uint64(val2.n[1]) +
+		uint64(val.n[6])*uint64(val2.n[0])
+	t6 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*7).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[7]) +
+		uint64(val.n[1])*uint64(val2.n[6]) +
+		uint64(val.n[2])*uint64(val2.n[5]) +
+		uint64(val.n[3])*uint64(val2.n[4]) +
+		uint64(val.n[4])*uint64(val2.n[3]) +
+		uint64(val.n[5])*uint64(val2.n[2]) +
+		uint64(val.n[6])*uint64(val2.n[1]) +
+		uint64(val.n[7])*uint64(val2.n[0])
+	t7 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*8).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[8]) +
+		uint64(val.n[1])*uint64(val2.n[7]) +
+		uint64(val.n[2])*uint64(val2.n[6]) +
+		uint64(val.n[3])*uint64(val2.n[5]) +
+		uint64(val.n[4])*uint64(val2.n[4]) +
+		uint64(val.n[5])*uint64(val2.n[3]) +
+		uint64(val.n[6])*uint64(val2.n[2]) +
+		uint64(val.n[7])*uint64(val2.n[1]) +
+		uint64(val.n[8])*uint64(val2.n[0])
+	t8 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*9).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[9]) +
+		uint64(val.n[1])*uint64(val2.n[8]) +
+		uint64(val.n[2])*uint64(val2.n[7]) +
+		uint64(val.n[3])*uint64(val2.n[6]) +
+		uint64(val.n[4])*uint64(val2.n[5]) +
+		uint64(val.n[5])*uint64(val2.n[4]) +
+		uint64(val.n[6])*uint64(val2.n[3]) +
+		uint64(val.n[7])*uint64(val2.n[2]) +
+		uint64(val.n[8])*uint64(val2.n[1]) +
+		uint64(val.n[9])*uint64(val2.n[0])
+	t9 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*10).
+	m = (m >> fieldBase) +
+		uint64(val.n[1])*uint64(val2.n[9]) +
+		uint64(val.n[2])*uint64(val2.n[8]) +
+		uint64(val.n[3])*uint64(val2.n[7]) +
+		uint64(val.n[4])*uint64(val2.n[6]) +
+		uint64(val.n[5])*uint64(val2.n[5]) +
+		uint64(val.n[6])*uint64(val2.n[4]) +
+		uint64(val.n[7])*uint64(val2.n[3]) +
+		uint64(val.n[8])*uint64(val2.n[2]) +
+		uint64(val.n[9])*uint64(val2.n[1])
+	t10 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*11).
+	m = (m >> fieldBase) +
+		uint64(val.n[2])*uint64(val2.n[9]) +
+		uint64(val.n[3])*uint64(val2.n[8]) +
+		uint64(val.n[4])*uint64(val2.n[7]) +
+		uint64(val.n[5])*uint64(val2.n[6]) +
+		uint64(val.n[6])*uint64(val2.n[5]) +
+		uint64(val.n[7])*uint64(val2.n[4]) +
+		uint64(val.n[8])*uint64(val2.n[3]) +
+		uint64(val.n[9])*uint64(val2.n[2])
+	t11 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*12).
+	m = (m >> fieldBase) +
+		uint64(val.n[3])*uint64(val2.n[9]) +
+		uint64(val.n[4])*uint64(val2.n[8]) +
+		uint64(val.n[5])*uint64(val2.n[7]) +
+		uint64(val.n[6])*uint64(val2.n[6]) +
+		uint64(val.n[7])*uint64(val2.n[5]) +
+		uint64(val.n[8])*uint64(val2.n[4]) +
+		uint64(val.n[9])*uint64(val2.n[3])
+	t12 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*13).
+	m = (m >> fieldBase) +
+		uint64(val.n[4])*uint64(val2.n[9]) +
+		uint64(val.n[5])*uint64(val2.n[8]) +
+		uint64(val.n[6])*uint64(val2.n[7]) +
+		uint64(val.n[7])*uint64(val2.n[6]) +
+		uint64(val.n[8])*uint64(val2.n[5]) +
+		uint64(val.n[9])*uint64(val2.n[4])
+	t13 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*14).
+	m = (m >> fieldBase) +
+		uint64(val.n[5])*uint64(val2.n[9]) +
+		uint64(val.n[6])*uint64(val2.n[8]) +
+		uint64(val.n[7])*uint64(val2.n[7]) +
+		uint64(val.n[8])*uint64(val2.n[6]) +
+		uint64(val.n[9])*uint64(val2.n[5])
+	t14 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*15).
+	m = (m >> fieldBase) +
+		uint64(val.n[6])*uint64(val2.n[9]) +
+		uint64(val.n[7])*uint64(val2.n[8]) +
+		uint64(val.n[8])*uint64(val2.n[7]) +
+		uint64(val.n[9])*uint64(val2.n[6])
+	t15 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*16).
+	m = (m >> fieldBase) +
+		uint64(val.n[7])*uint64(val2.n[9]) +
+		uint64(val.n[8])*uint64(val2.n[8]) +
+		uint64(val.n[9])*uint64(val2.n[7])
+	t16 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*17).
+	m = (m >> fieldBase) +
+		uint64(val.n[8])*uint64(val2.n[9]) +
+		uint64(val.n[9])*uint64(val2.n[8])
+	t17 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*18).
+	m = (m >> fieldBase) + uint64(val.n[9])*uint64(val2.n[9])
+	t18 := m & fieldBaseMask
+
+	// What's left is for 2^(fieldBase*19).
+	t19 := m >> fieldBase
+
+	// At this point, all of the terms are grouped into their respective
+	// base.
+	//
+	// Per [HAC] section 14.3.4: Reduction method of moduli of special form,
+	// when the modulus is of the special form m = b^t - c, highly efficient
+	// reduction can be achieved per the provided algorithm.
+	//
+	// The secp256k1 prime is equivalent to 2^256 - 4294968273, so it fits
+	// this criteria.
+	//
+	// 4294968273 in field representation (base 2^26) is:
+	// n[0] = 977
+	// n[1] = 64
+	// That is to say (2^26 * 64) + 977 = 4294968273
+	//
+	// Since each word is in base 26, the upper terms (t10 and up) start
+	// at 260 bits (versus the final desired range of 256 bits), so the
+	// field representation of 'c' from above needs to be adjusted for the
+	// extra 4 bits by multiplying it by 2^4 = 16.  4294968273 * 16 =
+	// 68719492368.  Thus, the adjusted field representation of 'c' is:
+	// n[0] = 977 * 16 = 15632
+	// n[1] = 64 * 16 = 1024
+	// That is to say (2^26 * 1024) + 15632 = 68719492368
+	//
+	// To reduce the final term, t19, the entire 'c' value is needed instead
+	// of only n[0] because there are no more terms left to handle n[1].
+	// This means there might be some magnitude left in the upper bits that
+	// is handled below.
+	m = t0 + t10*15632
+	t0 = m & fieldBaseMask
+	m = (m >> fieldBase) + t1 + t10*1024 + t11*15632
+	t1 = m & fieldBaseMask
+	m = (m >> fieldBase) + t2 + t11*1024 + t12*15632
+	t2 = m & fieldBaseMask
+	m = (m >> fieldBase) + t3 + t12*1024 + t13*15632
+	t3 = m & fieldBaseMask
+	m = (m >> fieldBase) + t4 + t13*1024 + t14*15632
+	t4 = m & fieldBaseMask
+	m = (m >> fieldBase) + t5 + t14*1024 + t15*15632
+	t5 = m & fieldBaseMask
+	m = (m >> fieldBase) + t6 + t15*1024 + t16*15632
+	t6 = m & fieldBaseMask
+	m = (m >> fieldBase) + t7 + t16*1024 + t17*15632
+	t7 = m & fieldBaseMask
+	m = (m >> fieldBase) + t8 + t17*1024 + t18*15632
+	t8 = m & fieldBaseMask
+	m = (m >> fieldBase) + t9 + t18*1024 + t19*68719492368
+	t9 = m & fieldMSBMask
+	m = m >> fieldMSBBits
+
+	// At this point, if the magnitude is greater than 0, the overall value
+	// is greater than the max possible 256-bit value.  In particular, it is
+	// "how many times larger" than the max value it is.
+	//
+	// The algorithm presented in [HAC] section 14.3.4 repeats until the
+	// quotient is zero.  However, due to the above, we already know at
+	// least how many times we would need to repeat as it's the value
+	// currently in m.  Thus we can simply multiply the magnitude by the
+	// field representation of the prime and do a single iteration.  Notice
+	// that nothing will be changed when the magnitude is zero, so we could
+	// skip this in that case, however always running regardless allows it
+	// to run in constant time.  The final result will be in the range
+	// 0 <= result <= prime + (2^64 - c), so it is guaranteed to have a
+	// magnitude of 1, but it is denormalized.
+	d := t0 + m*977
+	f.n[0] = uint32(d & fieldBaseMask)
+	d = (d >> fieldBase) + t1 + m*64
+	f.n[1] = uint32(d & fieldBaseMask)
+	f.n[2] = uint32((d >> fieldBase) + t2)
+	f.n[3] = uint32(t3)
+	f.n[4] = uint32(t4)
+	f.n[5] = uint32(t5)
+	f.n[6] = uint32(t6)
+	f.n[7] = uint32(t7)
+	f.n[8] = uint32(t8)
+	f.n[9] = uint32(t9)
+
+	return f
+}
+
+// Square squares the field value.  The existing field value is modified.  Note
+// that this function can overflow if multiplying any of the individual words
+// exceeds a max uint32.  In practice, this means the magnitude of the field
+// must be a max of 8 to prevent overflow.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f.Square().Mul(f2) so that f = f^2 * f2.
+func (f *fieldVal) Square() *fieldVal {
+	return f.SquareVal(f)
+}
+
+// SquareVal squares the passed value and stores the result in f.  Note that
+// this function can overflow if multiplying any of the individual words
+// exceeds a max uint32.  In practice, this means the magnitude of the field
+// being squred must be a max of 8 to prevent overflow.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f3.SquareVal(f).Mul(f) so that f3 = f^2 * f = f^3.
+func (f *fieldVal) SquareVal(val *fieldVal) *fieldVal {
+	// This could be done with a couple of for loops and an array to store
+	// the intermediate terms, but this unrolled version is significantly
+	// faster.
+
+	// Terms for 2^(fieldBase*0).
+	m := uint64(val.n[0]) * uint64(val.n[0])
+	t0 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*1).
+	m = (m >> fieldBase) + 2*uint64(val.n[0])*uint64(val.n[1])
+	t1 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*2).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[0])*uint64(val.n[2]) +
+		uint64(val.n[1])*uint64(val.n[1])
+	t2 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*3).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[0])*uint64(val.n[3]) +
+		2*uint64(val.n[1])*uint64(val.n[2])
+	t3 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*4).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[0])*uint64(val.n[4]) +
+		2*uint64(val.n[1])*uint64(val.n[3]) +
+		uint64(val.n[2])*uint64(val.n[2])
+	t4 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*5).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[0])*uint64(val.n[5]) +
+		2*uint64(val.n[1])*uint64(val.n[4]) +
+		2*uint64(val.n[2])*uint64(val.n[3])
+	t5 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*6).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[0])*uint64(val.n[6]) +
+		2*uint64(val.n[1])*uint64(val.n[5]) +
+		2*uint64(val.n[2])*uint64(val.n[4]) +
+		uint64(val.n[3])*uint64(val.n[3])
+	t6 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*7).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[0])*uint64(val.n[7]) +
+		2*uint64(val.n[1])*uint64(val.n[6]) +
+		2*uint64(val.n[2])*uint64(val.n[5]) +
+		2*uint64(val.n[3])*uint64(val.n[4])
+	t7 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*8).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[0])*uint64(val.n[8]) +
+		2*uint64(val.n[1])*uint64(val.n[7]) +
+		2*uint64(val.n[2])*uint64(val.n[6]) +
+		2*uint64(val.n[3])*uint64(val.n[5]) +
+		uint64(val.n[4])*uint64(val.n[4])
+	t8 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*9).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[0])*uint64(val.n[9]) +
+		2*uint64(val.n[1])*uint64(val.n[8]) +
+		2*uint64(val.n[2])*uint64(val.n[7]) +
+		2*uint64(val.n[3])*uint64(val.n[6]) +
+		2*uint64(val.n[4])*uint64(val.n[5])
+	t9 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*10).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[1])*uint64(val.n[9]) +
+		2*uint64(val.n[2])*uint64(val.n[8]) +
+		2*uint64(val.n[3])*uint64(val.n[7]) +
+		2*uint64(val.n[4])*uint64(val.n[6]) +
+		uint64(val.n[5])*uint64(val.n[5])
+	t10 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*11).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[2])*uint64(val.n[9]) +
+		2*uint64(val.n[3])*uint64(val.n[8]) +
+		2*uint64(val.n[4])*uint64(val.n[7]) +
+		2*uint64(val.n[5])*uint64(val.n[6])
+	t11 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*12).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[3])*uint64(val.n[9]) +
+		2*uint64(val.n[4])*uint64(val.n[8]) +
+		2*uint64(val.n[5])*uint64(val.n[7]) +
+		uint64(val.n[6])*uint64(val.n[6])
+	t12 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*13).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[4])*uint64(val.n[9]) +
+		2*uint64(val.n[5])*uint64(val.n[8]) +
+		2*uint64(val.n[6])*uint64(val.n[7])
+	t13 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*14).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[5])*uint64(val.n[9]) +
+		2*uint64(val.n[6])*uint64(val.n[8]) +
+		uint64(val.n[7])*uint64(val.n[7])
+	t14 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*15).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[6])*uint64(val.n[9]) +
+		2*uint64(val.n[7])*uint64(val.n[8])
+	t15 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*16).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[7])*uint64(val.n[9]) +
+		uint64(val.n[8])*uint64(val.n[8])
+	t16 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*17).
+	m = (m >> fieldBase) + 2*uint64(val.n[8])*uint64(val.n[9])
+	t17 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*18).
+	m = (m >> fieldBase) + uint64(val.n[9])*uint64(val.n[9])
+	t18 := m & fieldBaseMask
+
+	// What's left is for 2^(fieldBase*19).
+	t19 := m >> fieldBase
+
+	// At this point, all of the terms are grouped into their respective
+	// base.
+	//
+	// Per [HAC] section 14.3.4: Reduction method of moduli of special form,
+	// when the modulus is of the special form m = b^t - c, highly efficient
+	// reduction can be achieved per the provided algorithm.
+	//
+	// The secp256k1 prime is equivalent to 2^256 - 4294968273, so it fits
+	// this criteria.
+	//
+	// 4294968273 in field representation (base 2^26) is:
+	// n[0] = 977
+	// n[1] = 64
+	// That is to say (2^26 * 64) + 977 = 4294968273
+	//
+	// Since each word is in base 26, the upper terms (t10 and up) start
+	// at 260 bits (versus the final desired range of 256 bits), so the
+	// field representation of 'c' from above needs to be adjusted for the
+	// extra 4 bits by multiplying it by 2^4 = 16.  4294968273 * 16 =
+	// 68719492368.  Thus, the adjusted field representation of 'c' is:
+	// n[0] = 977 * 16 = 15632
+	// n[1] = 64 * 16 = 1024
+	// That is to say (2^26 * 1024) + 15632 = 68719492368
+	//
+	// To reduce the final term, t19, the entire 'c' value is needed instead
+	// of only n[0] because there are no more terms left to handle n[1].
+	// This means there might be some magnitude left in the upper bits that
+	// is handled below.
+	m = t0 + t10*15632
+	t0 = m & fieldBaseMask
+	m = (m >> fieldBase) + t1 + t10*1024 + t11*15632
+	t1 = m & fieldBaseMask
+	m = (m >> fieldBase) + t2 + t11*1024 + t12*15632
+	t2 = m & fieldBaseMask
+	m = (m >> fieldBase) + t3 + t12*1024 + t13*15632
+	t3 = m & fieldBaseMask
+	m = (m >> fieldBase) + t4 + t13*1024 + t14*15632
+	t4 = m & fieldBaseMask
+	m = (m >> fieldBase) + t5 + t14*1024 + t15*15632
+	t5 = m & fieldBaseMask
+	m = (m >> fieldBase) + t6 + t15*1024 + t16*15632
+	t6 = m & fieldBaseMask
+	m = (m >> fieldBase) + t7 + t16*1024 + t17*15632
+	t7 = m & fieldBaseMask
+	m = (m >> fieldBase) + t8 + t17*1024 + t18*15632
+	t8 = m & fieldBaseMask
+	m = (m >> fieldBase) + t9 + t18*1024 + t19*68719492368
+	t9 = m & fieldMSBMask
+	m = m >> fieldMSBBits
+
+	// At this point, if the magnitude is greater than 0, the overall value
+	// is greater than the max possible 256-bit value.  In particular, it is
+	// "how many times larger" than the max value it is.
+	//
+	// The algorithm presented in [HAC] section 14.3.4 repeats until the
+	// quotient is zero.  However, due to the above, we already know at
+	// least how many times we would need to repeat as it's the value
+	// currently in m.  Thus we can simply multiply the magnitude by the
+	// field representation of the prime and do a single iteration.  Notice
+	// that nothing will be changed when the magnitude is zero, so we could
+	// skip this in that case, however always running regardless allows it
+	// to run in constant time.  The final result will be in the range
+	// 0 <= result <= prime + (2^64 - c), so it is guaranteed to have a
+	// magnitude of 1, but it is denormalized.
+	n := t0 + m*977
+	f.n[0] = uint32(n & fieldBaseMask)
+	n = (n >> fieldBase) + t1 + m*64
+	f.n[1] = uint32(n & fieldBaseMask)
+	f.n[2] = uint32((n >> fieldBase) + t2)
+	f.n[3] = uint32(t3)
+	f.n[4] = uint32(t4)
+	f.n[5] = uint32(t5)
+	f.n[6] = uint32(t6)
+	f.n[7] = uint32(t7)
+	f.n[8] = uint32(t8)
+	f.n[9] = uint32(t9)
+
+	return f
+}
+
+// Inverse finds the modular multiplicative inverse of the field value.  The
+// existing field value is modified.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f.Inverse().Mul(f2) so that f = f^-1 * f2.
+func (f *fieldVal) Inverse() *fieldVal {
+	// Fermat's little theorem states that for a nonzero number a and prime
+	// prime p, a^(p-1) = 1 (mod p).  Since the multipliciative inverse is
+	// a*b = 1 (mod p), it follows that b = a*a^(p-2) = a^(p-1) = 1 (mod p).
+	// Thus, a^(p-2) is the multiplicative inverse.
+	//
+	// In order to efficiently compute a^(p-2), p-2 needs to be split into
+	// a sequence of squares and multipications that minimizes the number of
+	// multiplications needed (since they are more costly than squarings).
+	// Intermediate results are saved and reused as well.
+	//
+	// The secp256k1 prime - 2 is 2^256 - 4294968275.
+	//
+	// This has a cost of 258 field squarings and 33 field multiplications.
+	var a2, a3, a4, a10, a11, a21, a42, a45, a63, a1019, a1023 fieldVal
+	a2.SquareVal(f)
+	a3.Mul2(&a2, f)
+	a4.SquareVal(&a2)
+	a10.SquareVal(&a4).Mul(&a2)
+	a11.Mul2(&a10, f)
+	a21.Mul2(&a10, &a11)
+	a42.SquareVal(&a21)
+	a45.Mul2(&a42, &a3)
+	a63.Mul2(&a42, &a21)
+	a1019.SquareVal(&a63).Square().Square().Square().Mul(&a11)
+	a1023.Mul2(&a1019, &a4)
+	f.Set(&a63)                                    // f = a^(2^6 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^11 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^16 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^16 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^21 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^26 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^26 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^31 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^36 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^36 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^41 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^46 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^46 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^51 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^56 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^56 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^61 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^66 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^66 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^71 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^76 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^76 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^81 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^86 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^86 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^91 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^96 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^96 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^101 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^106 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^106 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^111 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^116 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^116 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^121 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^126 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^126 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^131 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^136 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^136 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^141 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^146 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^146 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^151 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^156 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^156 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^161 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^166 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^166 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^171 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^176 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^176 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^181 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^186 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^186 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^191 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^196 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^196 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^201 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^206 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^206 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^211 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^216 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^216 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^221 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^226 - 1024)
+	f.Mul(&a1019)                                  // f = a^(2^226 - 5)
+	f.Square().Square().Square().Square().Square() // f = a^(2^231 - 160)
+	f.Square().Square().Square().Square().Square() // f = a^(2^236 - 5120)
+	f.Mul(&a1023)                                  // f = a^(2^236 - 4097)
+	f.Square().Square().Square().Square().Square() // f = a^(2^241 - 131104)
+	f.Square().Square().Square().Square().Square() // f = a^(2^246 - 4195328)
+	f.Mul(&a1023)                                  // f = a^(2^246 - 4194305)
+	f.Square().Square().Square().Square().Square() // f = a^(2^251 - 134217760)
+	f.Square().Square().Square().Square().Square() // f = a^(2^256 - 4294968320)
+	return f.Mul(&a45)                             // f = a^(2^256 - 4294968275) = a^(p-2)
+}
+
+// NewFieldVal returns a new field value set to 0.  Callers of this package
+// don't need to work with field values directly.  This is provided for testing
+// purposes.
+func NewFieldVal() *fieldVal {
+	return new(fieldVal)
+}