// Copyright (c) 2013-2016 The btcsuite developers
// Copyright (c) 2013-2016 Dave Collins
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.

package btcec

// References:
//   [HAC]: Handbook of Applied Cryptography Menezes, van Oorschot, Vanstone.
//     http://cacr.uwaterloo.ca/hac/

// All elliptic curve operations for secp256k1 are done in a finite field
// characterized by a 256-bit prime. Given this precision is larger than the
// biggest available native type, obviously some form of bignum math is needed.
// This package implements specialized fixed-precision field arithmetic rather
// than relying on an arbitrary-precision arithmetic package such as math/big
// for dealing with the field math since the size is known. As a result, rather
// large performance gains are achieved by taking advantage of many
// optimizations not available to arbitrary-precision arithmetic and generic
// modular arithmetic algorithms.
//
// There are various ways to internally represent each finite field element.
// For example, the most obvious representation would be to use an array of 4
// uint64s (64 bits * 4 = 256 bits). However, that representation suffers from
// a couple of issues. First, there is no native Go type large enough to handle
// the intermediate results while adding or multiplying two 64-bit numbers, and
// second there is no space left for overflows when performing the intermediate
// arithmetic between each array element which would lead to expensive carry
// propagation.
//
// Given the above, this implementation represents the field elements as
// 10 uint32s with each word (array entry) treated as base 2^26.
// This was
// chosen for the following reasons:
// 1) Most systems at the current time are 64-bit (or at least have 64-bit
//    registers available for specialized purposes such as MMX) so the
//    intermediate results can typically be done using a native register (and
//    using uint64s to avoid the need for additional half-word arithmetic)
// 2) In order to allow addition of the internal words without having to
//    propagate the carry, the max normalized value for each register must
//    be less than the number of bits available in the register
// 3) Since we're dealing with 32-bit values, 64-bits of overflow is a
//    reasonable choice for #2
// 4) Given the need for 256-bits of precision and the properties stated in #1,
//    #2, and #3, the representation which best accommodates this is 10 uint32s
//    with base 2^26 (26 bits * 10 = 260 bits, so the final word only needs 22
//    bits) which leaves the desired 64 bits (32 * 10 = 320, 320 - 256 = 64) for
//    overflow
//
// Since it is so important that the field arithmetic is extremely fast for
// high performance crypto, this package does not perform any validation where
// it ordinarily would. For example, some functions only give the correct
// result if the field is normalized and there is no checking to ensure it is.
// While I typically prefer to ensure all state and input is valid for most
// packages, this code is really only used internally and every extra check
// counts.

import (
	"encoding/hex"
)

// Constants used to make the code more readable. Each one is a mask selecting
// the named number of low-order bits.
const (
	twoBitsMask   = 0x3
	fourBitsMask  = 0xf
	sixBitsMask   = 0x3f
	eightBitsMask = 0xff
)

// Constants related to the field representation.
const (
	// fieldWords is the number of words used to internally represent the
	// 256-bit value.
	fieldWords = 10

	// fieldBase is the exponent used to form the numeric base of each word.
	// 2^(fieldBase*i) where i is the word position.
	fieldBase = 26

	// fieldOverflowBits is the minimum number of "overflow" bits for each
	// word in the field value (32 - 26 = 6).
	fieldOverflowBits = 32 - fieldBase

	// fieldBaseMask is the mask for the bits in each word needed to
	// represent the numeric base of each word (except the most significant
	// word). It evaluates to 0x3ffffff (the low 26 bits).
	fieldBaseMask = (1 << fieldBase) - 1

	// fieldMSBBits is the number of bits in the most significant word used
	// to represent the value (256 - 26*9 = 22).
	fieldMSBBits = 256 - (fieldBase * (fieldWords - 1))

	// fieldMSBMask is the mask for the bits in the most significant word
	// needed to represent the value. It evaluates to 0x3fffff (the low 22
	// bits).
	fieldMSBMask = (1 << fieldMSBBits) - 1

	// fieldPrimeWordZero is word zero of the secp256k1 prime in the
	// internal field representation. It is used during negation.
	fieldPrimeWordZero = 0x3fffc2f

	// fieldPrimeWordOne is word one of the secp256k1 prime in the
	// internal field representation. It is used during negation.
	fieldPrimeWordOne = 0x3ffffbf
)

var (
	// fieldQBytes is the value Q = (P+1)/4 for the secp256k1 prime P. This
	// value is used to efficiently compute the square root of values in the
	// field via exponentiation. The value of Q in hex is:
	//
	//   Q = 3fffffffffffffffffffffffffffffffffffffffffffffffffffffffbfffff0c
	//
	// The bytes are stored in big-endian order (most significant byte first).
	fieldQBytes = []byte{
		0x3f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
		0xff, 0xff, 0xff, 0xff, 0xbf, 0xff, 0xff, 0x0c,
	}
)

// fieldVal implements optimized fixed-precision arithmetic over the
// secp256k1 finite field. This means all arithmetic is performed modulo
// 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f. It
// represents each 256-bit value as 10 32-bit integers in base 2^26. This
// provides 6 bits of overflow in each word (10 bits in the most significant
// word) for a total of 64 bits of overflow (9*6 + 10 = 64). It only implements
// the arithmetic needed for elliptic curve operations.
// // The following depicts the internal representation: // ----------------------------------------------------------------- // | n[9] | n[8] | ... | n[0] | // | 32 bits available | 32 bits available | ... | 32 bits available | // | 22 bits for value | 26 bits for value | ... | 26 bits for value | // | 10 bits overflow | 6 bits overflow | ... | 6 bits overflow | // | Mult: 2^(26*9) | Mult: 2^(26*8) | ... | Mult: 2^(26*0) | // ----------------------------------------------------------------- // // For example, consider the number 2^49 + 1. It would be represented as: // n[0] = 1 // n[1] = 2^23 // n[2..9] = 0 // // The full 256-bit value is then calculated by looping i from 9..0 and // doing sum(n[i] * 2^(26i)) like so: // n[9] * 2^(26*9) = 0 * 2^234 = 0 // n[8] * 2^(26*8) = 0 * 2^208 = 0 // ... // n[1] * 2^(26*1) = 2^23 * 2^26 = 2^49 // n[0] * 2^(26*0) = 1 * 2^0 = 1 // Sum: 0 + 0 + ... + 2^49 + 1 = 2^49 + 1 type fieldVal struct { n [10]uint32 } // String returns the field value as a human-readable hex string. func (f fieldVal) String() string { t := new(fieldVal).Set(&f).Normalize() return hex.EncodeToString(t.Bytes()[:]) } // Zero sets the field value to zero. A newly created field value is already // set to zero. This function can be useful to clear an existing field value // for reuse. func (f *fieldVal) Zero() { f.n[0] = 0 f.n[1] = 0 f.n[2] = 0 f.n[3] = 0 f.n[4] = 0 f.n[5] = 0 f.n[6] = 0 f.n[7] = 0 f.n[8] = 0 f.n[9] = 0 } // Set sets the field value equal to the passed value. // // The field value is returned to support chaining. This enables syntax like: // f := new(fieldVal).Set(f2).Add(1) so that f = f2 + 1 where f2 is not // modified. func (f *fieldVal) Set(val *fieldVal) *fieldVal { *f = *val return f } // SetInt sets the field value to the passed integer. This is a convenience // function since it is fairly common to perform some arithemetic with small // native integers. // // The field value is returned to support chaining. 
This enables syntax such // as f := new(fieldVal).SetInt(2).Mul(f2) so that f = 2 * f2. func (f *fieldVal) SetInt(ui uint) *fieldVal { f.Zero() f.n[0] = uint32(ui) return f } // SetBytes packs the passed 32-byte big-endian value into the internal field // value representation. // // The field value is returned to support chaining. This enables syntax like: // f := new(fieldVal).SetBytes(byteArray).Mul(f2) so that f = ba * f2. func (f *fieldVal) SetBytes(b *[32]byte) *fieldVal { // Pack the 256 total bits across the 10 uint32 words with a max of // 26-bits per word. This could be done with a couple of for loops, // but this unrolled version is significantly faster. Benchmarks show // this is about 34 times faster than the variant which uses loops. f.n[0] = uint32(b[31]) | uint32(b[30])<<8 | uint32(b[29])<<16 | (uint32(b[28])&twoBitsMask)<<24 f.n[1] = uint32(b[28])>>2 | uint32(b[27])<<6 | uint32(b[26])<<14 | (uint32(b[25])&fourBitsMask)<<22 f.n[2] = uint32(b[25])>>4 | uint32(b[24])<<4 | uint32(b[23])<<12 | (uint32(b[22])&sixBitsMask)<<20 f.n[3] = uint32(b[22])>>6 | uint32(b[21])<<2 | uint32(b[20])<<10 | uint32(b[19])<<18 f.n[4] = uint32(b[18]) | uint32(b[17])<<8 | uint32(b[16])<<16 | (uint32(b[15])&twoBitsMask)<<24 f.n[5] = uint32(b[15])>>2 | uint32(b[14])<<6 | uint32(b[13])<<14 | (uint32(b[12])&fourBitsMask)<<22 f.n[6] = uint32(b[12])>>4 | uint32(b[11])<<4 | uint32(b[10])<<12 | (uint32(b[9])&sixBitsMask)<<20 f.n[7] = uint32(b[9])>>6 | uint32(b[8])<<2 | uint32(b[7])<<10 | uint32(b[6])<<18 f.n[8] = uint32(b[5]) | uint32(b[4])<<8 | uint32(b[3])<<16 | (uint32(b[2])&twoBitsMask)<<24 f.n[9] = uint32(b[2])>>2 | uint32(b[1])<<6 | uint32(b[0])<<14 return f } // SetByteSlice packs the passed big-endian value into the internal field value // representation. Only the first 32-bytes are used. As a result, it is up to // the caller to ensure numbers of the appropriate size are used or the value // will be truncated. // // The field value is returned to support chaining. 
This enables syntax like: // f := new(fieldVal).SetByteSlice(byteSlice) func (f *fieldVal) SetByteSlice(b []byte) *fieldVal { var b32 [32]byte for i := 0; i < len(b); i++ { if i < 32 { b32[i+(32-len(b))] = b[i] } } return f.SetBytes(&b32) } // SetHex decodes the passed big-endian hex string into the internal field value // representation. Only the first 32-bytes are used. // // The field value is returned to support chaining. This enables syntax like: // f := new(fieldVal).SetHex("0abc").Add(1) so that f = 0x0abc + 1 func (f *fieldVal) SetHex(hexString string) *fieldVal { if len(hexString)%2 != 0 { hexString = "0" + hexString } bytes, _ := hex.DecodeString(hexString) return f.SetByteSlice(bytes) } // Normalize normalizes the internal field words into the desired range and // performs fast modular reduction over the secp256k1 prime by making use of the // special form of the prime. func (f *fieldVal) Normalize() *fieldVal { // The field representation leaves 6 bits of overflow in each word so // intermediate calculations can be performed without needing to // propagate the carry to each higher word during the calculations. In // order to normalize, we need to "compact" the full 256-bit value to // the right while propagating any carries through to the high order // word. // // Since this field is doing arithmetic modulo the secp256k1 prime, we // also need to perform modular reduction over the prime. // // Per [HAC] section 14.3.4: Reduction method of moduli of special form, // when the modulus is of the special form m = b^t - c, highly efficient // reduction can be achieved. // // The secp256k1 prime is equivalent to 2^256 - 4294968273, so it fits // this criteria. // // 4294968273 in field representation (base 2^26) is: // n[0] = 977 // n[1] = 64 // That is to say (2^26 * 64) + 977 = 4294968273 // // The algorithm presented in the referenced section typically repeats // until the quotient is zero. 
However, due to our field representation // we already know to within one reduction how many times we would need // to repeat as it's the uppermost bits of the high order word. Thus we // can simply multiply the magnitude by the field representation of the // prime and do a single iteration. After this step there might be an // additional carry to bit 256 (bit 22 of the high order word). t9 := f.n[9] m := t9 >> fieldMSBBits t9 = t9 & fieldMSBMask t0 := f.n[0] + m*977 t1 := (t0 >> fieldBase) + f.n[1] + (m << 6) t0 = t0 & fieldBaseMask t2 := (t1 >> fieldBase) + f.n[2] t1 = t1 & fieldBaseMask t3 := (t2 >> fieldBase) + f.n[3] t2 = t2 & fieldBaseMask t4 := (t3 >> fieldBase) + f.n[4] t3 = t3 & fieldBaseMask t5 := (t4 >> fieldBase) + f.n[5] t4 = t4 & fieldBaseMask t6 := (t5 >> fieldBase) + f.n[6] t5 = t5 & fieldBaseMask t7 := (t6 >> fieldBase) + f.n[7] t6 = t6 & fieldBaseMask t8 := (t7 >> fieldBase) + f.n[8] t7 = t7 & fieldBaseMask t9 = (t8 >> fieldBase) + t9 t8 = t8 & fieldBaseMask // At this point, the magnitude is guaranteed to be one, however, the // value could still be greater than the prime if there was either a // carry through to bit 256 (bit 22 of the higher order word) or the // value is greater than or equal to the field characteristic. The // following determines if either or these conditions are true and does // the final reduction in constant time. // // Note that the if/else statements here intentionally do the bitwise // operators even when it won't change the value to ensure constant time // between the branches. Also note that 'm' will be zero when neither // of the aforementioned conditions are true and the value will not be // changed when 'm' is zero. 
m = 1 if t9 == fieldMSBMask { m &= 1 } else { m &= 0 } if t2&t3&t4&t5&t6&t7&t8 == fieldBaseMask { m &= 1 } else { m &= 0 } if ((t0+977)>>fieldBase + t1 + 64) > fieldBaseMask { m &= 1 } else { m &= 0 } if t9>>fieldMSBBits != 0 { m |= 1 } else { m |= 0 } t0 = t0 + m*977 t1 = (t0 >> fieldBase) + t1 + (m << 6) t0 = t0 & fieldBaseMask t2 = (t1 >> fieldBase) + t2 t1 = t1 & fieldBaseMask t3 = (t2 >> fieldBase) + t3 t2 = t2 & fieldBaseMask t4 = (t3 >> fieldBase) + t4 t3 = t3 & fieldBaseMask t5 = (t4 >> fieldBase) + t5 t4 = t4 & fieldBaseMask t6 = (t5 >> fieldBase) + t6 t5 = t5 & fieldBaseMask t7 = (t6 >> fieldBase) + t7 t6 = t6 & fieldBaseMask t8 = (t7 >> fieldBase) + t8 t7 = t7 & fieldBaseMask t9 = (t8 >> fieldBase) + t9 t8 = t8 & fieldBaseMask t9 = t9 & fieldMSBMask // Remove potential multiple of 2^256. // Finally, set the normalized and reduced words. f.n[0] = t0 f.n[1] = t1 f.n[2] = t2 f.n[3] = t3 f.n[4] = t4 f.n[5] = t5 f.n[6] = t6 f.n[7] = t7 f.n[8] = t8 f.n[9] = t9 return f } // PutBytes unpacks the field value to a 32-byte big-endian value using the // passed byte array. There is a similar function, Bytes, which unpacks the // field value into a new array and returns that. This version is provided // since it can be useful to cut down on the number of allocations by allowing // the caller to reuse a buffer. // // The field value must be normalized for this function to return the correct // result. func (f *fieldVal) PutBytes(b *[32]byte) { // Unpack the 256 total bits from the 10 uint32 words with a max of // 26-bits per word. This could be done with a couple of for loops, // but this unrolled version is a bit faster. Benchmarks show this is // about 10 times faster than the variant which uses loops. 
b[31] = byte(f.n[0] & eightBitsMask) b[30] = byte((f.n[0] >> 8) & eightBitsMask) b[29] = byte((f.n[0] >> 16) & eightBitsMask) b[28] = byte((f.n[0]>>24)&twoBitsMask | (f.n[1]&sixBitsMask)<<2) b[27] = byte((f.n[1] >> 6) & eightBitsMask) b[26] = byte((f.n[1] >> 14) & eightBitsMask) b[25] = byte((f.n[1]>>22)&fourBitsMask | (f.n[2]&fourBitsMask)<<4) b[24] = byte((f.n[2] >> 4) & eightBitsMask) b[23] = byte((f.n[2] >> 12) & eightBitsMask) b[22] = byte((f.n[2]>>20)&sixBitsMask | (f.n[3]&twoBitsMask)<<6) b[21] = byte((f.n[3] >> 2) & eightBitsMask) b[20] = byte((f.n[3] >> 10) & eightBitsMask) b[19] = byte((f.n[3] >> 18) & eightBitsMask) b[18] = byte(f.n[4] & eightBitsMask) b[17] = byte((f.n[4] >> 8) & eightBitsMask) b[16] = byte((f.n[4] >> 16) & eightBitsMask) b[15] = byte((f.n[4]>>24)&twoBitsMask | (f.n[5]&sixBitsMask)<<2) b[14] = byte((f.n[5] >> 6) & eightBitsMask) b[13] = byte((f.n[5] >> 14) & eightBitsMask) b[12] = byte((f.n[5]>>22)&fourBitsMask | (f.n[6]&fourBitsMask)<<4) b[11] = byte((f.n[6] >> 4) & eightBitsMask) b[10] = byte((f.n[6] >> 12) & eightBitsMask) b[9] = byte((f.n[6]>>20)&sixBitsMask | (f.n[7]&twoBitsMask)<<6) b[8] = byte((f.n[7] >> 2) & eightBitsMask) b[7] = byte((f.n[7] >> 10) & eightBitsMask) b[6] = byte((f.n[7] >> 18) & eightBitsMask) b[5] = byte(f.n[8] & eightBitsMask) b[4] = byte((f.n[8] >> 8) & eightBitsMask) b[3] = byte((f.n[8] >> 16) & eightBitsMask) b[2] = byte((f.n[8]>>24)&twoBitsMask | (f.n[9]&sixBitsMask)<<2) b[1] = byte((f.n[9] >> 6) & eightBitsMask) b[0] = byte((f.n[9] >> 14) & eightBitsMask) } // Bytes unpacks the field value to a 32-byte big-endian value. See PutBytes // for a variant that allows the a buffer to be passed which can be useful to // to cut down on the number of allocations by allowing the caller to reuse a // buffer. // // The field value must be normalized for this function to return correct // result. 
func (f *fieldVal) Bytes() *[32]byte { b := new([32]byte) f.PutBytes(b) return b } // IsZero returns whether or not the field value is equal to zero. func (f *fieldVal) IsZero() bool { // The value can only be zero if no bits are set in any of the words. // This is a constant time implementation. bits := f.n[0] | f.n[1] | f.n[2] | f.n[3] | f.n[4] | f.n[5] | f.n[6] | f.n[7] | f.n[8] | f.n[9] return bits == 0 } // IsOdd returns whether or not the field value is an odd number. // // The field value must be normalized for this function to return correct // result. func (f *fieldVal) IsOdd() bool { // Only odd numbers have the bottom bit set. return f.n[0]&1 == 1 } // Equals returns whether or not the two field values are the same. Both // field values being compared must be normalized for this function to return // the correct result. func (f *fieldVal) Equals(val *fieldVal) bool { // Xor only sets bits when they are different, so the two field values // can only be the same if no bits are set after xoring each word. // This is a constant time implementation. bits := (f.n[0] ^ val.n[0]) | (f.n[1] ^ val.n[1]) | (f.n[2] ^ val.n[2]) | (f.n[3] ^ val.n[3]) | (f.n[4] ^ val.n[4]) | (f.n[5] ^ val.n[5]) | (f.n[6] ^ val.n[6]) | (f.n[7] ^ val.n[7]) | (f.n[8] ^ val.n[8]) | (f.n[9] ^ val.n[9]) return bits == 0 } // NegateVal negates the passed value and stores the result in f. The caller // must provide the magnitude of the passed value for a correct result. // // The field value is returned to support chaining. This enables syntax like: // f.NegateVal(f2).AddInt(1) so that f = -f2 + 1. func (f *fieldVal) NegateVal(val *fieldVal, magnitude uint32) *fieldVal { // Negation in the field is just the prime minus the value. However, // in order to allow negation against a field value without having to // normalize/reduce it first, multiply by the magnitude (that is how // "far" away it is from the normalized value) to adjust. 
Also, since // negating a value pushes it one more order of magnitude away from the // normalized range, add 1 to compensate. // // For some intuition here, imagine you're performing mod 12 arithmetic // (picture a clock) and you are negating the number 7. So you start at // 12 (which is of course 0 under mod 12) and count backwards (left on // the clock) 7 times to arrive at 5. Notice this is just 12-7 = 5. // Now, assume you're starting with 19, which is a number that is // already larger than the modulus and congruent to 7 (mod 12). When a // value is already in the desired range, its magnitude is 1. Since 19 // is an additional "step", its magnitude (mod 12) is 2. Since any // multiple of the modulus is conguent to zero (mod m), the answer can // be shortcut by simply mulplying the magnitude by the modulus and // subtracting. Keeping with the example, this would be (2*12)-19 = 5. f.n[0] = (magnitude+1)*fieldPrimeWordZero - val.n[0] f.n[1] = (magnitude+1)*fieldPrimeWordOne - val.n[1] f.n[2] = (magnitude+1)*fieldBaseMask - val.n[2] f.n[3] = (magnitude+1)*fieldBaseMask - val.n[3] f.n[4] = (magnitude+1)*fieldBaseMask - val.n[4] f.n[5] = (magnitude+1)*fieldBaseMask - val.n[5] f.n[6] = (magnitude+1)*fieldBaseMask - val.n[6] f.n[7] = (magnitude+1)*fieldBaseMask - val.n[7] f.n[8] = (magnitude+1)*fieldBaseMask - val.n[8] f.n[9] = (magnitude+1)*fieldMSBMask - val.n[9] return f } // Negate negates the field value. The existing field value is modified. The // caller must provide the magnitude of the field value for a correct result. // // The field value is returned to support chaining. This enables syntax like: // f.Negate().AddInt(1) so that f = -f + 1. func (f *fieldVal) Negate(magnitude uint32) *fieldVal { return f.NegateVal(f, magnitude) } // AddInt adds the passed integer to the existing field value and stores the // result in f. This is a convenience function since it is fairly common to // perform some arithemetic with small native integers. 
// // The field value is returned to support chaining. This enables syntax like: // f.AddInt(1).Add(f2) so that f = f + 1 + f2. func (f *fieldVal) AddInt(ui uint) *fieldVal { // Since the field representation intentionally provides overflow bits, // it's ok to use carryless addition as the carry bit is safely part of // the word and will be normalized out. f.n[0] += uint32(ui) return f } // Add adds the passed value to the existing field value and stores the result // in f. // // The field value is returned to support chaining. This enables syntax like: // f.Add(f2).AddInt(1) so that f = f + f2 + 1. func (f *fieldVal) Add(val *fieldVal) *fieldVal { // Since the field representation intentionally provides overflow bits, // it's ok to use carryless addition as the carry bit is safely part of // each word and will be normalized out. This could obviously be done // in a loop, but the unrolled version is faster. f.n[0] += val.n[0] f.n[1] += val.n[1] f.n[2] += val.n[2] f.n[3] += val.n[3] f.n[4] += val.n[4] f.n[5] += val.n[5] f.n[6] += val.n[6] f.n[7] += val.n[7] f.n[8] += val.n[8] f.n[9] += val.n[9] return f } // Add2 adds the passed two field values together and stores the result in f. // // The field value is returned to support chaining. This enables syntax like: // f3.Add2(f, f2).AddInt(1) so that f3 = f + f2 + 1. func (f *fieldVal) Add2(val *fieldVal, val2 *fieldVal) *fieldVal { // Since the field representation intentionally provides overflow bits, // it's ok to use carryless addition as the carry bit is safely part of // each word and will be normalized out. This could obviously be done // in a loop, but the unrolled version is faster. 
f.n[0] = val.n[0] + val2.n[0] f.n[1] = val.n[1] + val2.n[1] f.n[2] = val.n[2] + val2.n[2] f.n[3] = val.n[3] + val2.n[3] f.n[4] = val.n[4] + val2.n[4] f.n[5] = val.n[5] + val2.n[5] f.n[6] = val.n[6] + val2.n[6] f.n[7] = val.n[7] + val2.n[7] f.n[8] = val.n[8] + val2.n[8] f.n[9] = val.n[9] + val2.n[9] return f } // MulInt multiplies the field value by the passed int and stores the result in // f. Note that this function can overflow if multiplying the value by any of // the individual words exceeds a max uint32. Therefore it is important that // the caller ensures no overflows will occur before using this function. // // The field value is returned to support chaining. This enables syntax like: // f.MulInt(2).Add(f2) so that f = 2 * f + f2. func (f *fieldVal) MulInt(val uint) *fieldVal { // Since each word of the field representation can hold up to // fieldOverflowBits extra bits which will be normalized out, it's safe // to multiply each word without using a larger type or carry // propagation so long as the values won't overflow a uint32. This // could obviously be done in a loop, but the unrolled version is // faster. ui := uint32(val) f.n[0] *= ui f.n[1] *= ui f.n[2] *= ui f.n[3] *= ui f.n[4] *= ui f.n[5] *= ui f.n[6] *= ui f.n[7] *= ui f.n[8] *= ui f.n[9] *= ui return f } // Mul multiplies the passed value to the existing field value and stores the // result in f. Note that this function can overflow if multiplying any // of the individual words exceeds a max uint32. In practice, this means the // magnitude of either value involved in the multiplication must be a max of // 8. // // The field value is returned to support chaining. This enables syntax like: // f.Mul(f2).AddInt(1) so that f = (f * f2) + 1. func (f *fieldVal) Mul(val *fieldVal) *fieldVal { return f.Mul2(f, val) } // Mul2 multiplies the passed two field values together and stores the result // result in f. 
// Note that this function can overflow if multiplying any of
// the individual words exceeds a max uint32. In practice, this means the
// magnitude of either value involved in the multiplication must be a max of
// 8.
//
// The field value is returned to support chaining. This enables syntax like:
// f3.Mul2(f, f2).AddInt(1) so that f3 = (f * f2) + 1.
func (f *fieldVal) Mul2(val *fieldVal, val2 *fieldVal) *fieldVal {
	// This could be done with a couple of for loops and an array to store
	// the intermediate terms, but this unrolled version is significantly
	// faster.
	//
	// Each step below accumulates, in the 64-bit value m, every word
	// product val.n[i]*val2.n[j] with i+j equal to the current position,
	// plus the carry (m >> fieldBase) from the previous position. The low
	// 26 bits become the term for that position.

	// Terms for 2^(fieldBase*0).
	m := uint64(val.n[0]) * uint64(val2.n[0])
	t0 := m & fieldBaseMask

	// Terms for 2^(fieldBase*1).
	m = (m >> fieldBase) +
		uint64(val.n[0])*uint64(val2.n[1]) +
		uint64(val.n[1])*uint64(val2.n[0])
	t1 := m & fieldBaseMask

	// Terms for 2^(fieldBase*2).
	m = (m >> fieldBase) +
		uint64(val.n[0])*uint64(val2.n[2]) +
		uint64(val.n[1])*uint64(val2.n[1]) +
		uint64(val.n[2])*uint64(val2.n[0])
	t2 := m & fieldBaseMask

	// Terms for 2^(fieldBase*3).
	m = (m >> fieldBase) +
		uint64(val.n[0])*uint64(val2.n[3]) +
		uint64(val.n[1])*uint64(val2.n[2]) +
		uint64(val.n[2])*uint64(val2.n[1]) +
		uint64(val.n[3])*uint64(val2.n[0])
	t3 := m & fieldBaseMask

	// Terms for 2^(fieldBase*4).
	m = (m >> fieldBase) +
		uint64(val.n[0])*uint64(val2.n[4]) +
		uint64(val.n[1])*uint64(val2.n[3]) +
		uint64(val.n[2])*uint64(val2.n[2]) +
		uint64(val.n[3])*uint64(val2.n[1]) +
		uint64(val.n[4])*uint64(val2.n[0])
	t4 := m & fieldBaseMask

	// Terms for 2^(fieldBase*5).
	m = (m >> fieldBase) +
		uint64(val.n[0])*uint64(val2.n[5]) +
		uint64(val.n[1])*uint64(val2.n[4]) +
		uint64(val.n[2])*uint64(val2.n[3]) +
		uint64(val.n[3])*uint64(val2.n[2]) +
		uint64(val.n[4])*uint64(val2.n[1]) +
		uint64(val.n[5])*uint64(val2.n[0])
	t5 := m & fieldBaseMask

	// Terms for 2^(fieldBase*6).
	m = (m >> fieldBase) +
		uint64(val.n[0])*uint64(val2.n[6]) +
		uint64(val.n[1])*uint64(val2.n[5]) +
		uint64(val.n[2])*uint64(val2.n[4]) +
		uint64(val.n[3])*uint64(val2.n[3]) +
		uint64(val.n[4])*uint64(val2.n[2]) +
		uint64(val.n[5])*uint64(val2.n[1]) +
		uint64(val.n[6])*uint64(val2.n[0])
	t6 := m & fieldBaseMask

	// Terms for 2^(fieldBase*7).
	m = (m >> fieldBase) +
		uint64(val.n[0])*uint64(val2.n[7]) +
		uint64(val.n[1])*uint64(val2.n[6]) +
		uint64(val.n[2])*uint64(val2.n[5]) +
		uint64(val.n[3])*uint64(val2.n[4]) +
		uint64(val.n[4])*uint64(val2.n[3]) +
		uint64(val.n[5])*uint64(val2.n[2]) +
		uint64(val.n[6])*uint64(val2.n[1]) +
		uint64(val.n[7])*uint64(val2.n[0])
	t7 := m & fieldBaseMask

	// Terms for 2^(fieldBase*8).
	m = (m >> fieldBase) +
		uint64(val.n[0])*uint64(val2.n[8]) +
		uint64(val.n[1])*uint64(val2.n[7]) +
		uint64(val.n[2])*uint64(val2.n[6]) +
		uint64(val.n[3])*uint64(val2.n[5]) +
		uint64(val.n[4])*uint64(val2.n[4]) +
		uint64(val.n[5])*uint64(val2.n[3]) +
		uint64(val.n[6])*uint64(val2.n[2]) +
		uint64(val.n[7])*uint64(val2.n[1]) +
		uint64(val.n[8])*uint64(val2.n[0])
	t8 := m & fieldBaseMask

	// Terms for 2^(fieldBase*9).
	m = (m >> fieldBase) +
		uint64(val.n[0])*uint64(val2.n[9]) +
		uint64(val.n[1])*uint64(val2.n[8]) +
		uint64(val.n[2])*uint64(val2.n[7]) +
		uint64(val.n[3])*uint64(val2.n[6]) +
		uint64(val.n[4])*uint64(val2.n[5]) +
		uint64(val.n[5])*uint64(val2.n[4]) +
		uint64(val.n[6])*uint64(val2.n[3]) +
		uint64(val.n[7])*uint64(val2.n[2]) +
		uint64(val.n[8])*uint64(val2.n[1]) +
		uint64(val.n[9])*uint64(val2.n[0])
	t9 := m & fieldBaseMask

	// Terms for 2^(fieldBase*10).
	m = (m >> fieldBase) +
		uint64(val.n[1])*uint64(val2.n[9]) +
		uint64(val.n[2])*uint64(val2.n[8]) +
		uint64(val.n[3])*uint64(val2.n[7]) +
		uint64(val.n[4])*uint64(val2.n[6]) +
		uint64(val.n[5])*uint64(val2.n[5]) +
		uint64(val.n[6])*uint64(val2.n[4]) +
		uint64(val.n[7])*uint64(val2.n[3]) +
		uint64(val.n[8])*uint64(val2.n[2]) +
		uint64(val.n[9])*uint64(val2.n[1])
	t10 := m & fieldBaseMask

	// Terms for 2^(fieldBase*11).
	m = (m >> fieldBase) +
		uint64(val.n[2])*uint64(val2.n[9]) +
		uint64(val.n[3])*uint64(val2.n[8]) +
		uint64(val.n[4])*uint64(val2.n[7]) +
		uint64(val.n[5])*uint64(val2.n[6]) +
		uint64(val.n[6])*uint64(val2.n[5]) +
		uint64(val.n[7])*uint64(val2.n[4]) +
		uint64(val.n[8])*uint64(val2.n[3]) +
		uint64(val.n[9])*uint64(val2.n[2])
	t11 := m & fieldBaseMask

	// Terms for 2^(fieldBase*12).
	m = (m >> fieldBase) +
		uint64(val.n[3])*uint64(val2.n[9]) +
		uint64(val.n[4])*uint64(val2.n[8]) +
		uint64(val.n[5])*uint64(val2.n[7]) +
		uint64(val.n[6])*uint64(val2.n[6]) +
		uint64(val.n[7])*uint64(val2.n[5]) +
		uint64(val.n[8])*uint64(val2.n[4]) +
		uint64(val.n[9])*uint64(val2.n[3])
	t12 := m & fieldBaseMask

	// Terms for 2^(fieldBase*13).
	m = (m >> fieldBase) +
		uint64(val.n[4])*uint64(val2.n[9]) +
		uint64(val.n[5])*uint64(val2.n[8]) +
		uint64(val.n[6])*uint64(val2.n[7]) +
		uint64(val.n[7])*uint64(val2.n[6]) +
		uint64(val.n[8])*uint64(val2.n[5]) +
		uint64(val.n[9])*uint64(val2.n[4])
	t13 := m & fieldBaseMask

	// Terms for 2^(fieldBase*14).
	m = (m >> fieldBase) +
		uint64(val.n[5])*uint64(val2.n[9]) +
		uint64(val.n[6])*uint64(val2.n[8]) +
		uint64(val.n[7])*uint64(val2.n[7]) +
		uint64(val.n[8])*uint64(val2.n[6]) +
		uint64(val.n[9])*uint64(val2.n[5])
	t14 := m & fieldBaseMask

	// Terms for 2^(fieldBase*15).
	m = (m >> fieldBase) +
		uint64(val.n[6])*uint64(val2.n[9]) +
		uint64(val.n[7])*uint64(val2.n[8]) +
		uint64(val.n[8])*uint64(val2.n[7]) +
		uint64(val.n[9])*uint64(val2.n[6])
	t15 := m & fieldBaseMask

	// Terms for 2^(fieldBase*16).
	m = (m >> fieldBase) +
		uint64(val.n[7])*uint64(val2.n[9]) +
		uint64(val.n[8])*uint64(val2.n[8]) +
		uint64(val.n[9])*uint64(val2.n[7])
	t16 := m & fieldBaseMask

	// Terms for 2^(fieldBase*17).
	m = (m >> fieldBase) +
		uint64(val.n[8])*uint64(val2.n[9]) +
		uint64(val.n[9])*uint64(val2.n[8])
	t17 := m & fieldBaseMask

	// Terms for 2^(fieldBase*18).
	m = (m >> fieldBase) + uint64(val.n[9])*uint64(val2.n[9])
	t18 := m & fieldBaseMask

	// What's left is for 2^(fieldBase*19).
	t19 := m >> fieldBase

	// At this point, all of the terms are grouped into their respective
	// base.
	//
	// Per [HAC] section 14.3.4: Reduction method of moduli of special form,
	// when the modulus is of the special form m = b^t - c, highly efficient
	// reduction can be achieved per the provided algorithm.
	//
	// The secp256k1 prime is equivalent to 2^256 - 4294968273, so it fits
	// this criteria.
	//
	// 4294968273 in field representation (base 2^26) is:
	// n[0] = 977
	// n[1] = 64
	// That is to say (2^26 * 64) + 977 = 4294968273
	//
	// Since each word is in base 2^26, the upper terms (t10 and up) start
	// at 260 bits (versus the final desired range of 256 bits), so the
	// field representation of 'c' from above needs to be adjusted for the
	// extra 4 bits by multiplying it by 2^4 = 16. 4294968273 * 16 =
	// 68719492368. Thus, the adjusted field representation of 'c' is:
	// n[0] = 977 * 16 = 15632
	// n[1] = 64 * 16 = 1024
	// That is to say (2^26 * 1024) + 15632 = 68719492368
	//
	// To reduce the final term, t19, the entire 'c' value is needed instead
	// of only n[0] because there are no more terms left to handle n[1].
	// This means there might be some magnitude left in the upper bits that
	// is handled below.
	m = t0 + t10*15632
	t0 = m & fieldBaseMask
	m = (m >> fieldBase) + t1 + t10*1024 + t11*15632
	t1 = m & fieldBaseMask
	m = (m >> fieldBase) + t2 + t11*1024 + t12*15632
	t2 = m & fieldBaseMask
	m = (m >> fieldBase) + t3 + t12*1024 + t13*15632
	t3 = m & fieldBaseMask
	m = (m >> fieldBase) + t4 + t13*1024 + t14*15632
	t4 = m & fieldBaseMask
	m = (m >> fieldBase) + t5 + t14*1024 + t15*15632
	t5 = m & fieldBaseMask
	m = (m >> fieldBase) + t6 + t15*1024 + t16*15632
	t6 = m & fieldBaseMask
	m = (m >> fieldBase) + t7 + t16*1024 + t17*15632
	t7 = m & fieldBaseMask
	m = (m >> fieldBase) + t8 + t17*1024 + t18*15632
	t8 = m & fieldBaseMask
	m = (m >> fieldBase) + t9 + t18*1024 + t19*68719492368
	// The most significant word only holds fieldMSBBits bits of the value;
	// anything above that is the remaining magnitude.
	t9 = m & fieldMSBMask
	m = m >> fieldMSBBits

	// At this point, if the magnitude is greater than 0, the overall value
	// is greater than the max possible 256-bit value. In particular, it is
	// "how many times larger" than the max value it is.
	//
	// The algorithm presented in [HAC] section 14.3.4 repeats until the
	// quotient is zero. However, due to the above, we already know at
	// least how many times we would need to repeat as it's the value
	// currently in m. Thus we can simply multiply the magnitude by the
	// field representation of the prime and do a single iteration. Notice
	// that nothing will be changed when the magnitude is zero, so we could
	// skip this in that case, however always running regardless allows it
	// to run in constant time. The final result will be in the range
	// 0 <= result <= prime + (2^64 - c), so it is guaranteed to have a
	// magnitude of 1, but it is denormalized.
	d := t0 + m*977
	f.n[0] = uint32(d & fieldBaseMask)
	d = (d >> fieldBase) + t1 + m*64
	f.n[1] = uint32(d & fieldBaseMask)
	// The carry out of word one is folded into word two without masking,
	// which is why the result may be denormalized.
	f.n[2] = uint32((d >> fieldBase) + t2)
	f.n[3] = uint32(t3)
	f.n[4] = uint32(t4)
	f.n[5] = uint32(t5)
	f.n[6] = uint32(t6)
	f.n[7] = uint32(t7)
	f.n[8] = uint32(t8)
	f.n[9] = uint32(t9)

	return f
}

// Square squares the field value. The existing field value is modified.
Note // that this function can overflow if multiplying any of the individual words // exceeds a max uint32. In practice, this means the magnitude of the field // must be a max of 8 to prevent overflow. // // The field value is returned to support chaining. This enables syntax like: // f.Square().Mul(f2) so that f = f^2 * f2. func (f *fieldVal) Square() *fieldVal { return f.SquareVal(f) } // SquareVal squares the passed value and stores the result in f. Note that // this function can overflow if multiplying any of the individual words // exceeds a max uint32. In practice, this means the magnitude of the field // being squred must be a max of 8 to prevent overflow. // // The field value is returned to support chaining. This enables syntax like: // f3.SquareVal(f).Mul(f) so that f3 = f^2 * f = f^3. func (f *fieldVal) SquareVal(val *fieldVal) *fieldVal { // This could be done with a couple of for loops and an array to store // the intermediate terms, but this unrolled version is significantly // faster. // Terms for 2^(fieldBase*0). m := uint64(val.n[0]) * uint64(val.n[0]) t0 := m & fieldBaseMask // Terms for 2^(fieldBase*1). m = (m >> fieldBase) + 2*uint64(val.n[0])*uint64(val.n[1]) t1 := m & fieldBaseMask // Terms for 2^(fieldBase*2). m = (m >> fieldBase) + 2*uint64(val.n[0])*uint64(val.n[2]) + uint64(val.n[1])*uint64(val.n[1]) t2 := m & fieldBaseMask // Terms for 2^(fieldBase*3). m = (m >> fieldBase) + 2*uint64(val.n[0])*uint64(val.n[3]) + 2*uint64(val.n[1])*uint64(val.n[2]) t3 := m & fieldBaseMask // Terms for 2^(fieldBase*4). m = (m >> fieldBase) + 2*uint64(val.n[0])*uint64(val.n[4]) + 2*uint64(val.n[1])*uint64(val.n[3]) + uint64(val.n[2])*uint64(val.n[2]) t4 := m & fieldBaseMask // Terms for 2^(fieldBase*5). m = (m >> fieldBase) + 2*uint64(val.n[0])*uint64(val.n[5]) + 2*uint64(val.n[1])*uint64(val.n[4]) + 2*uint64(val.n[2])*uint64(val.n[3]) t5 := m & fieldBaseMask // Terms for 2^(fieldBase*6). 
m = (m >> fieldBase) + 2*uint64(val.n[0])*uint64(val.n[6]) + 2*uint64(val.n[1])*uint64(val.n[5]) + 2*uint64(val.n[2])*uint64(val.n[4]) + uint64(val.n[3])*uint64(val.n[3]) t6 := m & fieldBaseMask // Terms for 2^(fieldBase*7). m = (m >> fieldBase) + 2*uint64(val.n[0])*uint64(val.n[7]) + 2*uint64(val.n[1])*uint64(val.n[6]) + 2*uint64(val.n[2])*uint64(val.n[5]) + 2*uint64(val.n[3])*uint64(val.n[4]) t7 := m & fieldBaseMask // Terms for 2^(fieldBase*8). m = (m >> fieldBase) + 2*uint64(val.n[0])*uint64(val.n[8]) + 2*uint64(val.n[1])*uint64(val.n[7]) + 2*uint64(val.n[2])*uint64(val.n[6]) + 2*uint64(val.n[3])*uint64(val.n[5]) + uint64(val.n[4])*uint64(val.n[4]) t8 := m & fieldBaseMask // Terms for 2^(fieldBase*9). m = (m >> fieldBase) + 2*uint64(val.n[0])*uint64(val.n[9]) + 2*uint64(val.n[1])*uint64(val.n[8]) + 2*uint64(val.n[2])*uint64(val.n[7]) + 2*uint64(val.n[3])*uint64(val.n[6]) + 2*uint64(val.n[4])*uint64(val.n[5]) t9 := m & fieldBaseMask // Terms for 2^(fieldBase*10). m = (m >> fieldBase) + 2*uint64(val.n[1])*uint64(val.n[9]) + 2*uint64(val.n[2])*uint64(val.n[8]) + 2*uint64(val.n[3])*uint64(val.n[7]) + 2*uint64(val.n[4])*uint64(val.n[6]) + uint64(val.n[5])*uint64(val.n[5]) t10 := m & fieldBaseMask // Terms for 2^(fieldBase*11). m = (m >> fieldBase) + 2*uint64(val.n[2])*uint64(val.n[9]) + 2*uint64(val.n[3])*uint64(val.n[8]) + 2*uint64(val.n[4])*uint64(val.n[7]) + 2*uint64(val.n[5])*uint64(val.n[6]) t11 := m & fieldBaseMask // Terms for 2^(fieldBase*12). m = (m >> fieldBase) + 2*uint64(val.n[3])*uint64(val.n[9]) + 2*uint64(val.n[4])*uint64(val.n[8]) + 2*uint64(val.n[5])*uint64(val.n[7]) + uint64(val.n[6])*uint64(val.n[6]) t12 := m & fieldBaseMask // Terms for 2^(fieldBase*13). m = (m >> fieldBase) + 2*uint64(val.n[4])*uint64(val.n[9]) + 2*uint64(val.n[5])*uint64(val.n[8]) + 2*uint64(val.n[6])*uint64(val.n[7]) t13 := m & fieldBaseMask // Terms for 2^(fieldBase*14). 
m = (m >> fieldBase) + 2*uint64(val.n[5])*uint64(val.n[9]) + 2*uint64(val.n[6])*uint64(val.n[8]) + uint64(val.n[7])*uint64(val.n[7]) t14 := m & fieldBaseMask // Terms for 2^(fieldBase*15). m = (m >> fieldBase) + 2*uint64(val.n[6])*uint64(val.n[9]) + 2*uint64(val.n[7])*uint64(val.n[8]) t15 := m & fieldBaseMask // Terms for 2^(fieldBase*16). m = (m >> fieldBase) + 2*uint64(val.n[7])*uint64(val.n[9]) + uint64(val.n[8])*uint64(val.n[8]) t16 := m & fieldBaseMask // Terms for 2^(fieldBase*17). m = (m >> fieldBase) + 2*uint64(val.n[8])*uint64(val.n[9]) t17 := m & fieldBaseMask // Terms for 2^(fieldBase*18). m = (m >> fieldBase) + uint64(val.n[9])*uint64(val.n[9]) t18 := m & fieldBaseMask // What's left is for 2^(fieldBase*19). t19 := m >> fieldBase // At this point, all of the terms are grouped into their respective // base. // // Per [HAC] section 14.3.4: Reduction method of moduli of special form, // when the modulus is of the special form m = b^t - c, highly efficient // reduction can be achieved per the provided algorithm. // // The secp256k1 prime is equivalent to 2^256 - 4294968273, so it fits // this criteria. // // 4294968273 in field representation (base 2^26) is: // n[0] = 977 // n[1] = 64 // That is to say (2^26 * 64) + 977 = 4294968273 // // Since each word is in base 26, the upper terms (t10 and up) start // at 260 bits (versus the final desired range of 256 bits), so the // field representation of 'c' from above needs to be adjusted for the // extra 4 bits by multiplying it by 2^4 = 16. 4294968273 * 16 = // 68719492368. Thus, the adjusted field representation of 'c' is: // n[0] = 977 * 16 = 15632 // n[1] = 64 * 16 = 1024 // That is to say (2^26 * 1024) + 15632 = 68719492368 // // To reduce the final term, t19, the entire 'c' value is needed instead // of only n[0] because there are no more terms left to handle n[1]. // This means there might be some magnitude left in the upper bits that // is handled below. 
m = t0 + t10*15632 t0 = m & fieldBaseMask m = (m >> fieldBase) + t1 + t10*1024 + t11*15632 t1 = m & fieldBaseMask m = (m >> fieldBase) + t2 + t11*1024 + t12*15632 t2 = m & fieldBaseMask m = (m >> fieldBase) + t3 + t12*1024 + t13*15632 t3 = m & fieldBaseMask m = (m >> fieldBase) + t4 + t13*1024 + t14*15632 t4 = m & fieldBaseMask m = (m >> fieldBase) + t5 + t14*1024 + t15*15632 t5 = m & fieldBaseMask m = (m >> fieldBase) + t6 + t15*1024 + t16*15632 t6 = m & fieldBaseMask m = (m >> fieldBase) + t7 + t16*1024 + t17*15632 t7 = m & fieldBaseMask m = (m >> fieldBase) + t8 + t17*1024 + t18*15632 t8 = m & fieldBaseMask m = (m >> fieldBase) + t9 + t18*1024 + t19*68719492368 t9 = m & fieldMSBMask m = m >> fieldMSBBits // At this point, if the magnitude is greater than 0, the overall value // is greater than the max possible 256-bit value. In particular, it is // "how many times larger" than the max value it is. // // The algorithm presented in [HAC] section 14.3.4 repeats until the // quotient is zero. However, due to the above, we already know at // least how many times we would need to repeat as it's the value // currently in m. Thus we can simply multiply the magnitude by the // field representation of the prime and do a single iteration. Notice // that nothing will be changed when the magnitude is zero, so we could // skip this in that case, however always running regardless allows it // to run in constant time. The final result will be in the range // 0 <= result <= prime + (2^64 - c), so it is guaranteed to have a // magnitude of 1, but it is denormalized. n := t0 + m*977 f.n[0] = uint32(n & fieldBaseMask) n = (n >> fieldBase) + t1 + m*64 f.n[1] = uint32(n & fieldBaseMask) f.n[2] = uint32((n >> fieldBase) + t2) f.n[3] = uint32(t3) f.n[4] = uint32(t4) f.n[5] = uint32(t5) f.n[6] = uint32(t6) f.n[7] = uint32(t7) f.n[8] = uint32(t8) f.n[9] = uint32(t9) return f } // Inverse finds the modular multiplicative inverse of the field value. The // existing field value is modified. 
// // The field value is returned to support chaining. This enables syntax like: // f.Inverse().Mul(f2) so that f = f^-1 * f2. func (f *fieldVal) Inverse() *fieldVal { // Fermat's little theorem states that for a nonzero number a and prime // prime p, a^(p-1) = 1 (mod p). Since the multipliciative inverse is // a*b = 1 (mod p), it follows that b = a*a^(p-2) = a^(p-1) = 1 (mod p). // Thus, a^(p-2) is the multiplicative inverse. // // In order to efficiently compute a^(p-2), p-2 needs to be split into // a sequence of squares and multipications that minimizes the number of // multiplications needed (since they are more costly than squarings). // Intermediate results are saved and reused as well. // // The secp256k1 prime - 2 is 2^256 - 4294968275. // // This has a cost of 258 field squarings and 33 field multiplications. var a2, a3, a4, a10, a11, a21, a42, a45, a63, a1019, a1023 fieldVal a2.SquareVal(f) a3.Mul2(&a2, f) a4.SquareVal(&a2) a10.SquareVal(&a4).Mul(&a2) a11.Mul2(&a10, f) a21.Mul2(&a10, &a11) a42.SquareVal(&a21) a45.Mul2(&a42, &a3) a63.Mul2(&a42, &a21) a1019.SquareVal(&a63).Square().Square().Square().Mul(&a11) a1023.Mul2(&a1019, &a4) f.Set(&a63) // f = a^(2^6 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^11 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^16 - 1024) f.Mul(&a1023) // f = a^(2^16 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^21 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^26 - 1024) f.Mul(&a1023) // f = a^(2^26 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^31 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^36 - 1024) f.Mul(&a1023) // f = a^(2^36 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^41 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^46 - 1024) f.Mul(&a1023) // f = a^(2^46 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^51 - 32) f.Square().Square().Square().Square().Square() // f 
= a^(2^56 - 1024) f.Mul(&a1023) // f = a^(2^56 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^61 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^66 - 1024) f.Mul(&a1023) // f = a^(2^66 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^71 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^76 - 1024) f.Mul(&a1023) // f = a^(2^76 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^81 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^86 - 1024) f.Mul(&a1023) // f = a^(2^86 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^91 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^96 - 1024) f.Mul(&a1023) // f = a^(2^96 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^101 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^106 - 1024) f.Mul(&a1023) // f = a^(2^106 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^111 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^116 - 1024) f.Mul(&a1023) // f = a^(2^116 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^121 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^126 - 1024) f.Mul(&a1023) // f = a^(2^126 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^131 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^136 - 1024) f.Mul(&a1023) // f = a^(2^136 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^141 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^146 - 1024) f.Mul(&a1023) // f = a^(2^146 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^151 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^156 - 1024) f.Mul(&a1023) // f = a^(2^156 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^161 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^166 - 1024) f.Mul(&a1023) // f = a^(2^166 - 1) 
f.Square().Square().Square().Square().Square() // f = a^(2^171 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^176 - 1024) f.Mul(&a1023) // f = a^(2^176 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^181 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^186 - 1024) f.Mul(&a1023) // f = a^(2^186 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^191 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^196 - 1024) f.Mul(&a1023) // f = a^(2^196 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^201 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^206 - 1024) f.Mul(&a1023) // f = a^(2^206 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^211 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^216 - 1024) f.Mul(&a1023) // f = a^(2^216 - 1) f.Square().Square().Square().Square().Square() // f = a^(2^221 - 32) f.Square().Square().Square().Square().Square() // f = a^(2^226 - 1024) f.Mul(&a1019) // f = a^(2^226 - 5) f.Square().Square().Square().Square().Square() // f = a^(2^231 - 160) f.Square().Square().Square().Square().Square() // f = a^(2^236 - 5120) f.Mul(&a1023) // f = a^(2^236 - 4097) f.Square().Square().Square().Square().Square() // f = a^(2^241 - 131104) f.Square().Square().Square().Square().Square() // f = a^(2^246 - 4195328) f.Mul(&a1023) // f = a^(2^246 - 4194305) f.Square().Square().Square().Square().Square() // f = a^(2^251 - 134217760) f.Square().Square().Square().Square().Square() // f = a^(2^256 - 4294968320) return f.Mul(&a45) // f = a^(2^256 - 4294968275) = a^(p-2) } // SqrtVal computes the square root of x modulo the curve's prime, and stores // the result in f. The square root is computed via exponentiation of x by the // value Q = (P+1)/4 using the curve's precomputed big-endian representation of // the Q. 
This method uses a modified version of square-and-multiply // exponentiation over secp256k1 fieldVals to operate on bytes instead of bits, // which offers better performance over both big.Int exponentiation and bit-wise // square-and-multiply. // // NOTE: This method only works when P is intended to be the secp256k1 prime and // is not constant time. The returned value is of magnitude 1, but is // denormalized. func (f *fieldVal) SqrtVal(x *fieldVal) *fieldVal { // The following computation iteratively computes x^((P+1)/4) = x^Q // using the recursive, piece-wise definition: // // x^n = (x^2)^(n/2) mod P if n is even // x^n = x(x^2)^(n-1/2) mod P if n is odd // // Given n in its big-endian representation b_k, ..., b_0, x^n can be // computed by defining the sequence r_k+1, ..., r_0, where: // // r_k+1 = 1 // r_i = (r_i+1)^2 * x^b_i for i = k, ..., 0 // // The final value r_0 = x^n. // // See https://en.wikipedia.org/wiki/Exponentiation_by_squaring for more // details. // // This can be further optimized, by observing that the value of Q in // secp256k1 has the value: // // Q = 3fffffffffffffffffffffffffffffffffffffffffffffffffffffffbfffff0c // // We can unroll the typical bit-wise interpretation of the // exponentiation algorithm above to instead operate on bytes. // This reduces the number of comparisons by an order of magnitude, // reducing the overhead of failed branch predictions and additional // comparisons in this method. // // Since there there are only 4 unique bytes of Q, this keeps the jump // table small without the need to handle all possible 8-bit values. // Further, we observe that 29 of the 32 bytes are 0xff; making the // first case handle 0xff therefore optimizes the hot path. f.SetInt(1) for _, b := range fieldQBytes { switch b { // Most common case, where all 8 bits are set. 
case 0xff: f.Square().Mul(x) f.Square().Mul(x) f.Square().Mul(x) f.Square().Mul(x) f.Square().Mul(x) f.Square().Mul(x) f.Square().Mul(x) f.Square().Mul(x) // First byte of Q (0x3f), where all but the top two bits are // set. Note that this case only applies six operations, since // the highest bit of Q resides in bit six of the first byte. We // ignore the first two bits, since squaring for these bits will // result in an invalid result. We forgo squaring f before the // first multiply, since 1^2 = 1. case 0x3f: f.Mul(x) f.Square().Mul(x) f.Square().Mul(x) f.Square().Mul(x) f.Square().Mul(x) f.Square().Mul(x) // Byte 28 of Q (0xbf), where only bit 7 is unset. case 0xbf: f.Square().Mul(x) f.Square() f.Square().Mul(x) f.Square().Mul(x) f.Square().Mul(x) f.Square().Mul(x) f.Square().Mul(x) f.Square().Mul(x) // Byte 31 of Q (0x0c), where only bits 3 and 4 are set. default: f.Square() f.Square() f.Square() f.Square() f.Square().Mul(x) f.Square().Mul(x) f.Square() f.Square() } } return f } // Sqrt computes the square root of f modulo the curve's prime, and stores the // result in f. The square root is computed via exponentiation of x by the value // Q = (P+1)/4 using the curve's precomputed big-endian representation of the Q. // This method uses a modified version of square-and-multiply exponentiation // over secp256k1 fieldVals to operate on bytes instead of bits, which offers // better performance over both big.Int exponentiation and bit-wise // square-and-multiply. // // NOTE: This method only works when P is intended to be the secp256k1 prime and // is not constant time. The returned value is of magnitude 1, but is // denormalized. func (f *fieldVal) Sqrt() *fieldVal { return f.SqrtVal(f) }