txscript: Introduce zero-alloc script tokenizer.

This implements an efficient and zero-allocation script tokenizer that
is exported to both provide a new capability to tokenize scripts to
external consumers of the API as well as to serve as a base for
refactoring the existing highly inefficient internal code.

It is important to note that this tokenizer is intended to be used in
consensus critical code in the future, so it must exactly follow the
existing semantics.

The current script parsing mechanism used throughout the txscript module
is to fully tokenize the scripts into an array of internal parsed
opcodes which are then examined and passed around in order to implement
virtually everything related to scripts.

While that approach does simplify the analysis of certain scripts and
thus provide some nice properties in that regard, it is both extremely
inefficient in many cases, and makes it impossible for external
consumers of the API to implement any form of custom script analysis
without manually implementing a bunch of error prone tokenizing code or,
alternatively, the script engine exposing internal structures.

For example, as shown by profiling the total memory allocations of an
initial sync, the existing script parsing code allocates a total of
around 295.12GB, which equates to around 50% of all allocations
performed.  The zero-alloc tokenizer this introduces will allow that to
be reduced to virtually zero.

The following is a before and after comparison of tokenizing a large
script with a high opcode count using the existing code versus the
tokenizer this introduces for both speed and memory allocations:

benchmark                    old ns/op     new ns/op     delta
BenchmarkScriptParsing-8     63464         677           -98.93%

benchmark                    old allocs     new allocs     delta
BenchmarkScriptParsing-8     1              0              -100.00%

benchmark                    old bytes     new bytes     delta
BenchmarkScriptParsing-8     311299        0             -100.00%

The following is an overview of the changes:

- Introduce new error code ErrUnsupportedScriptVersion
- Implement zero-allocation script tokenizer
- Add a full suite of tests to ensure the tokenizer works as intended
  and follows the required consensus semantics
- Add an example of using the new tokenizer to count the number of
  opcodes in a script
- Update README.md to include the new example
- Update script parsing benchmark to use the new tokenizer
This commit is contained in:
Dave Collins 2019-03-13 01:11:03 -05:00 committed by Roy Lee
parent fc5b1a817c
commit ac002d6422
7 changed files with 497 additions and 7 deletions

View file

@ -37,6 +37,10 @@ $ go get -u github.com/btcsuite/btcd/txscript
* [Manually Signing a Transaction Output](https://pkg.go.dev/github.com/btcsuite/btcd/txscript#example-SignTxOutput)
Demonstrates manually creating and signing a redeem transaction.
* [Counting Opcodes in Scripts](http://godoc.org/github.com/decred/dcrd/txscript#example-ScriptTokenizer)
Demonstrates creating a script tokenizer instance and using it to count the
number of opcodes a script contains.
## GPG Verification Key
All official release tags are signed by Conformal so users can ensure the code

View file

@ -96,17 +96,18 @@ func BenchmarkScriptParsing(b *testing.B) {
b.Fatalf("failed to create benchmark script: %v", err)
}
const scriptVersion = 0
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
pops, err := parseScript(script)
if err != nil {
b.Fatalf("failed to parse script: %v", err)
tokenizer := MakeScriptTokenizer(scriptVersion, script)
for tokenizer.Next() {
_ = tokenizer.Opcode()
_ = tokenizer.Data()
_ = tokenizer.ByteIndex()
}
for _, pop := range pops {
_ = pop.opcode
_ = pop.data
if err := tokenizer.Err(); err != nil {
b.Fatalf("failed to parse script: %v", err)
}
}
}

View file

@ -1,4 +1,5 @@
// Copyright (c) 2013-2017 The btcsuite developers
// Copyright (c) 2015-2019 The Decred developers
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.
@ -47,6 +48,10 @@ const (
// the provided data exceeds MaxDataCarrierSize.
ErrTooMuchNullData
// ErrUnsupportedScriptVersion is returned when an unsupported script
// version is passed to a function which deals with script analysis.
ErrUnsupportedScriptVersion
// ------------------------------------------
// Failures related to final execution state.
// ------------------------------------------
@ -352,6 +357,7 @@ var errorCodeStrings = map[ErrorCode]string{
ErrNotMultisigScript: "ErrNotMultisigScript",
ErrTooManyRequiredSigs: "ErrTooManyRequiredSigs",
ErrTooMuchNullData: "ErrTooMuchNullData",
ErrUnsupportedScriptVersion: "ErrUnsupportedScriptVersion",
ErrEarlyReturn: "ErrEarlyReturn",
ErrEmptyStack: "ErrEmptyStack",
ErrEvalFalse: "ErrEvalFalse",

View file

@ -1,4 +1,5 @@
// Copyright (c) 2017 The btcsuite developers
// Copyright (c) 2015-2019 The Decred developers
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.
@ -22,6 +23,7 @@ func TestErrorCodeStringer(t *testing.T) {
{ErrUnsupportedAddress, "ErrUnsupportedAddress"},
{ErrTooManyRequiredSigs, "ErrTooManyRequiredSigs"},
{ErrTooMuchNullData, "ErrTooMuchNullData"},
{ErrUnsupportedScriptVersion, "ErrUnsupportedScriptVersion"},
{ErrNotMultisigScript, "ErrNotMultisigScript"},
{ErrEarlyReturn, "ErrEarlyReturn"},
{ErrEmptyStack, "ErrEmptyStack"},

View file

@ -1,4 +1,5 @@
// Copyright (c) 2014-2016 The btcsuite developers
// Copyright (c) 2015-2019 The Decred developers
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.
@ -180,3 +181,34 @@ func ExampleSignTxOutput() {
// Output:
// Transaction successfully signed
}
// This example demonstrates creating a script tokenizer instance and using it
// to count the number of opcodes a script contains.
func ExampleScriptTokenizer() {
// Create a script to use in the example. Ordinarily this would come from
// some other source.
hash160 := btcutil.Hash160([]byte("example"))
script, err := txscript.NewScriptBuilder().AddOp(txscript.OP_DUP).
AddOp(txscript.OP_HASH160).AddData(hash160).
AddOp(txscript.OP_EQUALVERIFY).AddOp(txscript.OP_CHECKSIG).Script()
if err != nil {
fmt.Printf("failed to build script: %v\n", err)
return
}
// Create a tokenizer to iterate the script and count the number of opcodes.
const scriptVersion = 0
var numOpcodes int
tokenizer := txscript.MakeScriptTokenizer(scriptVersion, script)
for tokenizer.Next() {
numOpcodes++
}
if tokenizer.Err() != nil {
fmt.Printf("script failed to parse: %v\n", err)
} else {
fmt.Printf("script contains %d opcode(s)\n", numOpcodes)
}
// Output:
// script contains 5 opcode(s)
}

186
txscript/tokenizer.go Normal file
View file

@ -0,0 +1,186 @@
// Copyright (c) 2019 The Decred developers
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.
package txscript
import (
"encoding/binary"
"fmt"
)
// opcodeArrayRef is used to break initialization cycles.
var opcodeArrayRef *[256]opcode
func init() {
opcodeArrayRef = &opcodeArray
}
// ScriptTokenizer provides a facility for easily and efficiently tokenizing
// transaction scripts without creating allocations. Each successive opcode is
// parsed with the Next function, which returns false when iteration is
// complete, either due to successfully tokenizing the entire script or
// encountering a parse error. In the case of failure, the Err function may be
// used to obtain the specific parse error.
//
// Upon successfully parsing an opcode, the opcode and data associated with it
// may be obtained via the Opcode and Data functions, respectively.
//
// The ByteIndex function may be used to obtain the tokenizer's current offset
// into the raw script.
type ScriptTokenizer struct {
script []byte
version uint16
offset int32
op *opcode
data []byte
err error
}
// Done returns true when either all opcodes have been exhausted or a parse
// failure was encountered and therefore the state has an associated error.
func (t *ScriptTokenizer) Done() bool {
return t.err != nil || t.offset >= int32(len(t.script))
}
// Next attempts to parse the next opcode and returns whether or not it was
// successful. It will not be successful if invoked when already at the end of
// the script, a parse failure is encountered, or an associated error already
// exists due to a previous parse failure.
//
// In the case of a true return, the parsed opcode and data can be obtained with
// the associated functions and the offset into the script will either point to
// the next opcode or the end of the script if the final opcode was parsed.
//
// In the case of a false return, the parsed opcode and data will be the last
// successfully parsed values (if any) and the offset into the script will
// either point to the failing opcode or the end of the script if the function
// was invoked when already at the end of the script.
//
// Invoking this function when already at the end of the script is not
// considered an error and will simply return false.
func (t *ScriptTokenizer) Next() bool {
if t.Done() {
return false
}
op := &opcodeArrayRef[t.script[t.offset]]
switch {
// No additional data. Note that some of the opcodes, notably OP_1NEGATE,
// OP_0, and OP_[1-16] represent the data themselves.
case op.length == 1:
t.offset++
t.op = op
t.data = nil
return true
// Data pushes of specific lengths -- OP_DATA_[1-75].
case op.length > 1:
script := t.script[t.offset:]
if len(script) < op.length {
str := fmt.Sprintf("opcode %s requires %d bytes, but script only "+
"has %d remaining", op.name, op.length, len(script))
t.err = scriptError(ErrMalformedPush, str)
return false
}
// Move the offset forward and set the opcode and data accordingly.
t.offset += int32(op.length)
t.op = op
t.data = script[1:op.length]
return true
// Data pushes with parsed lengths -- OP_PUSHDATA{1,2,4}.
case op.length < 0:
script := t.script[t.offset+1:]
if len(script) < -op.length {
str := fmt.Sprintf("opcode %s requires %d bytes, but script only "+
"has %d remaining", op.name, -op.length, len(script))
t.err = scriptError(ErrMalformedPush, str)
return false
}
// Next -length bytes are little endian length of data.
var dataLen int32
switch op.length {
case -1:
dataLen = int32(script[0])
case -2:
dataLen = int32(binary.LittleEndian.Uint16(script[:2]))
case -4:
dataLen = int32(binary.LittleEndian.Uint32(script[:4]))
default:
// In practice it should be impossible to hit this
// check as each op code is predefined, and only uses
// the specified lengths.
str := fmt.Sprintf("invalid opcode length %d", op.length)
t.err = scriptError(ErrMalformedPush, str)
return false
}
// Move to the beginning of the data.
script = script[-op.length:]
// Disallow entries that do not fit script or were sign extended.
if dataLen > int32(len(script)) || dataLen < 0 {
str := fmt.Sprintf("opcode %s pushes %d bytes, but script only "+
"has %d remaining", op.name, dataLen, len(script))
t.err = scriptError(ErrMalformedPush, str)
return false
}
// Move the offset forward and set the opcode and data accordingly.
t.offset += 1 + int32(-op.length) + dataLen
t.op = op
t.data = script[:dataLen]
return true
}
// The only remaining case is an opcode with length zero which is
// impossible.
panic("unreachable")
}
// Script returns the full script associated with the tokenizer.
func (t *ScriptTokenizer) Script() []byte {
return t.script
}
// ByteIndex returns the current offset into the full script that will be parsed
// next and therefore also implies everything before it has already been parsed.
func (t *ScriptTokenizer) ByteIndex() int32 {
return t.offset
}
// Opcode returns the current opcode associated with the tokenizer.
func (t *ScriptTokenizer) Opcode() byte {
return t.op.value
}
// Data returns the data associated with the most recently successfully parsed
// opcode.
func (t *ScriptTokenizer) Data() []byte {
return t.data
}
// Err returns any errors currently associated with the tokenizer. This will
// only be non-nil in the case a parsing error was encountered.
func (t *ScriptTokenizer) Err() error {
return t.err
}
// MakeScriptTokenizer returns a new instance of a script tokenizer. Passing
// an unsupported script version will result in the returned tokenizer
// immediately having an err set accordingly.
//
// See the docs for ScriptTokenizer for more details.
func MakeScriptTokenizer(scriptVersion uint16, script []byte) ScriptTokenizer {
// Only version 0 scripts are currently supported.
var err error
if scriptVersion != 0 {
str := fmt.Sprintf("script version %d is not supported", scriptVersion)
err = scriptError(ErrUnsupportedScriptVersion, str)
}
return ScriptTokenizer{version: scriptVersion, script: script, err: err}
}

259
txscript/tokenizer_test.go Normal file
View file

@ -0,0 +1,259 @@
// Copyright (c) 2019 The Decred developers
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.
package txscript
import (
"bytes"
"fmt"
"testing"
)
// TestScriptTokenizer ensures a wide variety of behavior provided by the script
// tokenizer performs as expected.
func TestScriptTokenizer(t *testing.T) {
t.Skip()
type expectedResult struct {
op byte // expected parsed opcode
data []byte // expected parsed data
index int32 // expected index into raw script after parsing token
}
type tokenizerTest struct {
name string // test description
script []byte // the script to tokenize
expected []expectedResult // the expected info after parsing each token
finalIdx int32 // the expected final byte index
err error // expected error
}
// Add both positive and negative tests for OP_DATA_1 through OP_DATA_75.
const numTestsHint = 100 // Make prealloc linter happy.
tests := make([]tokenizerTest, 0, numTestsHint)
for op := byte(OP_DATA_1); op < OP_DATA_75; op++ {
data := bytes.Repeat([]byte{0x01}, int(op))
tests = append(tests, tokenizerTest{
name: fmt.Sprintf("OP_DATA_%d", op),
script: append([]byte{op}, data...),
expected: []expectedResult{{op, data, 1 + int32(op)}},
finalIdx: 1 + int32(op),
err: nil,
})
// Create test that provides one less byte than the data push requires.
tests = append(tests, tokenizerTest{
name: fmt.Sprintf("short OP_DATA_%d", op),
script: append([]byte{op}, data[1:]...),
expected: nil,
finalIdx: 0,
err: scriptError(ErrMalformedPush, ""),
})
}
// Add both positive and negative tests for OP_PUSHDATA{1,2,4}.
data := mustParseShortForm("0x01{76}")
tests = append(tests, []tokenizerTest{{
name: "OP_PUSHDATA1",
script: mustParseShortForm("OP_PUSHDATA1 0x4c 0x01{76}"),
expected: []expectedResult{{OP_PUSHDATA1, data, 2 + int32(len(data))}},
finalIdx: 2 + int32(len(data)),
err: nil,
}, {
name: "OP_PUSHDATA1 no data length",
script: mustParseShortForm("OP_PUSHDATA1"),
expected: nil,
finalIdx: 0,
err: scriptError(ErrMalformedPush, ""),
}, {
name: "OP_PUSHDATA1 short data by 1 byte",
script: mustParseShortForm("OP_PUSHDATA1 0x4c 0x01{75}"),
expected: nil,
finalIdx: 0,
err: scriptError(ErrMalformedPush, ""),
}, {
name: "OP_PUSHDATA2",
script: mustParseShortForm("OP_PUSHDATA2 0x4c00 0x01{76}"),
expected: []expectedResult{{OP_PUSHDATA2, data, 3 + int32(len(data))}},
finalIdx: 3 + int32(len(data)),
err: nil,
}, {
name: "OP_PUSHDATA2 no data length",
script: mustParseShortForm("OP_PUSHDATA2"),
expected: nil,
finalIdx: 0,
err: scriptError(ErrMalformedPush, ""),
}, {
name: "OP_PUSHDATA2 short data by 1 byte",
script: mustParseShortForm("OP_PUSHDATA2 0x4c00 0x01{75}"),
expected: nil,
finalIdx: 0,
err: scriptError(ErrMalformedPush, ""),
}, {
name: "OP_PUSHDATA4",
script: mustParseShortForm("OP_PUSHDATA4 0x4c000000 0x01{76}"),
expected: []expectedResult{{OP_PUSHDATA4, data, 5 + int32(len(data))}},
finalIdx: 5 + int32(len(data)),
err: nil,
}, {
name: "OP_PUSHDATA4 no data length",
script: mustParseShortForm("OP_PUSHDATA4"),
expected: nil,
finalIdx: 0,
err: scriptError(ErrMalformedPush, ""),
}, {
name: "OP_PUSHDATA4 short data by 1 byte",
script: mustParseShortForm("OP_PUSHDATA4 0x4c000000 0x01{75}"),
expected: nil,
finalIdx: 0,
err: scriptError(ErrMalformedPush, ""),
}}...)
// Add tests for OP_0, and OP_1 through OP_16 (small integers/true/false).
opcodes := []byte{OP_0}
for op := byte(OP_1); op < OP_16; op++ {
opcodes = append(opcodes, op)
}
for _, op := range opcodes {
tests = append(tests, tokenizerTest{
name: fmt.Sprintf("OP_%d", op),
script: []byte{op},
expected: []expectedResult{{op, nil, 1}},
finalIdx: 1,
err: nil,
})
}
// Add various positive and negative tests for multi-opcode scripts.
tests = append(tests, []tokenizerTest{{
name: "pay-to-pubkey-hash",
script: mustParseShortForm("DUP HASH160 DATA_20 0x01{20} EQUAL CHECKSIG"),
expected: []expectedResult{
{OP_DUP, nil, 1}, {OP_HASH160, nil, 2},
{OP_DATA_20, mustParseShortForm("0x01{20}"), 23},
{OP_EQUAL, nil, 24}, {OP_CHECKSIG, nil, 25},
},
finalIdx: 25,
err: nil,
}, {
name: "almost pay-to-pubkey-hash (short data)",
script: mustParseShortForm("DUP HASH160 DATA_20 0x01{17} EQUAL CHECKSIG"),
expected: []expectedResult{
{OP_DUP, nil, 1}, {OP_HASH160, nil, 2},
},
finalIdx: 2,
err: scriptError(ErrMalformedPush, ""),
}, {
name: "almost pay-to-pubkey-hash (overlapped data)",
script: mustParseShortForm("DUP HASH160 DATA_20 0x01{19} EQUAL CHECKSIG"),
expected: []expectedResult{
{OP_DUP, nil, 1}, {OP_HASH160, nil, 2},
{OP_DATA_20, mustParseShortForm("0x01{19} EQUAL"), 23},
{OP_CHECKSIG, nil, 24},
},
finalIdx: 24,
err: nil,
}, {
name: "pay-to-script-hash",
script: mustParseShortForm("HASH160 DATA_20 0x01{20} EQUAL"),
expected: []expectedResult{
{OP_HASH160, nil, 1},
{OP_DATA_20, mustParseShortForm("0x01{20}"), 22},
{OP_EQUAL, nil, 23},
},
finalIdx: 23,
err: nil,
}, {
name: "almost pay-to-script-hash (short data)",
script: mustParseShortForm("HASH160 DATA_20 0x01{18} EQUAL"),
expected: []expectedResult{
{OP_HASH160, nil, 1},
},
finalIdx: 1,
err: scriptError(ErrMalformedPush, ""),
}, {
name: "almost pay-to-script-hash (overlapped data)",
script: mustParseShortForm("HASH160 DATA_20 0x01{19} EQUAL"),
expected: []expectedResult{
{OP_HASH160, nil, 1},
{OP_DATA_20, mustParseShortForm("0x01{19} EQUAL"), 22},
},
finalIdx: 22,
err: nil,
}}...)
const scriptVersion = 0
for _, test := range tests {
tokenizer := MakeScriptTokenizer(scriptVersion, test.script)
var opcodeNum int
for tokenizer.Next() {
// Ensure Next never returns true when there is an error set.
if err := tokenizer.Err(); err != nil {
t.Fatalf("%q: Next returned true when tokenizer has err: %v",
test.name, err)
}
// Ensure the test data expects a token to be parsed.
op := tokenizer.Opcode()
data := tokenizer.Data()
if opcodeNum >= len(test.expected) {
t.Fatalf("%q: unexpected token '%d' (data: '%x')", test.name,
op, data)
}
expected := &test.expected[opcodeNum]
// Ensure the opcode and data are the expected values.
if op != expected.op {
t.Fatalf("%q: unexpected opcode -- got %v, want %v", test.name,
op, expected.op)
}
if !bytes.Equal(data, expected.data) {
t.Fatalf("%q: unexpected data -- got %x, want %x", test.name,
data, expected.data)
}
tokenizerIdx := tokenizer.ByteIndex()
if tokenizerIdx != expected.index {
t.Fatalf("%q: unexpected byte index -- got %d, want %d",
test.name, tokenizerIdx, expected.index)
}
opcodeNum++
}
// Ensure the tokenizer claims it is done. This should be the case
// regardless of whether or not there was a parse error.
if !tokenizer.Done() {
t.Fatalf("%q: tokenizer claims it is not done", test.name)
}
// Ensure the error is as expected.
if test.err == nil && tokenizer.Err() != nil {
t.Fatalf("%q: unexpected tokenizer err -- got %v, want nil",
test.name, tokenizer.Err())
} else if test.err != nil {
if !IsErrorCode(tokenizer.Err(), test.err.(Error).ErrorCode) {
t.Fatalf("%q: unexpected tokenizer err -- got %v, want %v",
test.name, tokenizer.Err(), test.err.(Error).ErrorCode)
}
}
// Ensure the final index is the expected value.
tokenizerIdx := tokenizer.ByteIndex()
if tokenizerIdx != test.finalIdx {
t.Fatalf("%q: unexpected final byte index -- got %d, want %d",
test.name, tokenizerIdx, test.finalIdx)
}
}
}
// TestScriptTokenizerUnsupportedVersion ensures the tokenizer fails immediately
// with an unsupported script version.
func TestScriptTokenizerUnsupportedVersion(t *testing.T) {
const scriptVersion = 65535
tokenizer := MakeScriptTokenizer(scriptVersion, nil)
if !IsErrorCode(tokenizer.Err(), ErrUnsupportedScriptVersion) {
t.Fatalf("script tokenizer did not error with unsupported version")
}
}