From 3055897c1c5b74bc4c4ec5f702c22ed90020f410 Mon Sep 17 00:00:00 2001 From: Kevin Gillette Date: Mon, 4 Dec 2023 15:00:43 -0700 Subject: [PATCH] json: configurable numeric decoding (#137) * json: stylistic improvements, better code reuse Initial benchmark results show this change to be approximately performance-neutral. * bump tested Go versions to just 1.20 and 1.21 * json: add ParseFlags values UseInt64, UseUint64, UseBigInt * json: use atomic.Pointer --- .github/workflows/benchmark.yml | 88 ++++++++++---------- .github/workflows/test.yml | 29 ++++--- Makefile | 7 +- go.mod | 4 +- json/codec.go | 31 ++++--- json/decode.go | 112 ++++++++++++++++++++----- json/golang_bench_test.go | 6 +- json/int_test.go | 3 +- json/json.go | 13 +++ json/json_test.go | 142 +++++++++++++++++++++++++++++++- proto/fixtures/generate/main.go | 3 +- proto/proto_test.go | 4 +- thrift/decode.go | 3 +- 13 files changed, 334 insertions(+), 111 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 3d2383c..805d1c4 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -1,61 +1,63 @@ +--- name: Benchmark -on: -- pull_request +"on": + - pull_request jobs: benchmark: strategy: matrix: ref: - - master - - ${{ github.sha }} + - master + - ${{ github.sha }} runs-on: ubuntu-latest steps: - - name: Steup Go - uses: actions/setup-go@v2 - with: - go-version: 1.17 - - - name: Checkout - uses: actions/checkout@v2 - with: - ref: ${{ matrix.ref }} - - - name: Run Benchmarks - run: go test -v -run '^$' -bench '(Marshal|Unmarshal)$/codeResponse' -benchmem -benchtime 3s -cpu 1 -count 5 ./json | tee bench.txt - - - name: Upload Benchmarks - uses: actions/upload-artifact@v2 - with: - name: ${{ matrix.ref }} - path: bench.txt + - name: Setup Go + uses: actions/setup-go@v2 + with: + go-version: "1.21" + + - name: Checkout + uses: actions/checkout@v2 + with: + ref: ${{ matrix.ref }} + + - name: Run Benchmarks + # Without 6 iterations, benchstat will claim statistical insignificance. + run: go test -v -run '^$' -bench '(Marshal|Unmarshal)$/codeResponse' -benchmem -benchtime 3s -cpu 1 -count 6 ./json | tee bench.txt + + - name: Upload Benchmarks + uses: actions/upload-artifact@v2 + with: + name: ${{ matrix.ref }} + path: bench.txt benchstat: needs: [benchmark] runs-on: ubuntu-latest steps: - - name: Steup Go - uses: actions/setup-go@v2 - with: - go-version: 1.17 - - - name: Setup Benchstat - run: go install golang.org/x/perf/cmd/benchstat@latest - - - name: Download Benchmark Results - uses: actions/download-artifact@v2 - with: - path: . - - - name: Run Benchstat - run: benchstat ./master/bench.txt ./${{ github.sha }}/bench.txt | tee benchstat.txt - - - name: Upload Benchstat Results - uses: actions/upload-artifact@v2 - with: - name: benchstat - path: benchstat.txt + - name: Steup Go + uses: actions/setup-go@v2 + with: + go-version: "1.21" + + - name: Setup Benchstat + run: go install golang.org/x/perf/cmd/benchstat@latest + + - name: Download Benchmark Results + uses: actions/download-artifact@v2 + with: + path: . + + - name: Run Benchstat + run: benchstat ./master/bench.txt ./${{ github.sha }}/bench.txt | tee benchstat.txt + + - name: Upload Benchstat Results + uses: actions/upload-artifact@v2 + with: + name: benchstat + path: benchstat.txt diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 16acc54..7e4bc41 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,30 +1,29 @@ +--- name: Test -on: -- pull_request +"on": + - pull_request jobs: test: strategy: matrix: go: - - 1.14 - - 1.15 - - 1.16 - - 1.17 + - "1.20" + - "1.21" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v2 - - name: Setup Go ${{ matrix.go }} - uses: actions/setup-go@v2 - with: - go-version: ${{ matrix.go }} + - name: Setup Go ${{ matrix.go }} + uses: actions/setup-go@v2 + with: + go-version: ${{ matrix.go }} - - name: Download Dependencies - run: go mod download + - name: Download Dependencies + run: go mod download - - name: Run Tests - run: make test + - name: Run Tests + run: make test diff --git a/Makefile b/Makefile index ffb1396..4640625 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ .PHONY: test bench-simple clean update-golang-test fuzz fuzz-json -golang.version ?= 1.15.2 +golang.version ?= 1.21 golang.tmp.root := /tmp/golang$(golang.version) golang.tmp.json.root := $(golang.tmp.root)/go-go$(golang.version)/src/encoding/json golang.test.files := $(wildcard json/golang_*_test.go) @@ -10,7 +10,7 @@ go-fuzz-build := ${GOPATH}/bin/go-fuzz-build go-fuzz-corpus := ${GOPATH}/src/github.com/dvyukov/go-fuzz-corpus go-fuzz-dep := ${GOPATH}/src/github.com/dvyukov/go-fuzz/go-fuzz-dep -test: test-ascii test-json test-json-bugs test-json-1.17 test-proto test-iso8601 test-thrift test-purego +test: test-ascii test-json test-json-bugs test-proto test-iso8601 test-thrift test-purego test-ascii: go test -cover -race ./ascii @@ -21,9 +21,6 @@ test-json: test-json-bugs: go test -race ./json/bugs/... -test-json-1.17: - go test -cover -race -tags go1.17 ./json - test-proto: go test -cover -race ./proto diff --git a/go.mod b/go.mod index 5f43ab4..0096cd2 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,7 @@ module github.com/segmentio/encoding -go 1.14 +go 1.18 require github.com/segmentio/asm v1.1.3 + +require golang.org/x/sys v0.0.0-20211110154304-99a53858aa08 // indirect diff --git a/json/codec.go b/json/codec.go index 908c3f6..bd3c1d4 100644 --- a/json/codec.go +++ b/json/codec.go @@ -4,6 +4,7 @@ import ( "encoding" "encoding/json" "fmt" + "math/big" "reflect" "sort" "strconv" @@ -49,19 +50,21 @@ type decodeFunc func(decoder, []byte, unsafe.Pointer) ([]byte, error) type emptyFunc func(unsafe.Pointer) bool type sortFunc func([]reflect.Value) -var ( - // Eventually consistent cache mapping go types to dynamically generated - // codecs. - // - // Note: using a uintptr as key instead of reflect.Type shaved ~15ns off of - // the ~30ns Marhsal/Unmarshal functions which were dominated by the map - // lookup time for simple types like bool, int, etc.. - cache unsafe.Pointer // map[unsafe.Pointer]codec -) +// Eventually consistent cache mapping go types to dynamically generated +// codecs. +// +// Note: using a uintptr as key instead of reflect.Type shaved ~15ns off of +// the ~30ns Marhsal/Unmarshal functions which were dominated by the map +// lookup time for simple types like bool, int, etc.. +var cache atomic.Pointer[map[unsafe.Pointer]codec] func cacheLoad() map[unsafe.Pointer]codec { - p := atomic.LoadPointer(&cache) - return *(*map[unsafe.Pointer]codec)(unsafe.Pointer(&p)) + p := cache.Load() + if p == nil { + return nil + } + + return *p } func cacheStore(typ reflect.Type, cod codec, oldCodecs map[unsafe.Pointer]codec) { @@ -72,7 +75,7 @@ func cacheStore(typ reflect.Type, cod codec, oldCodecs map[unsafe.Pointer]codec) newCodecs[t] = c } - atomic.StorePointer(&cache, *(*unsafe.Pointer)(unsafe.Pointer(&newCodecs))) + cache.Store(&newCodecs) } func typeid(t reflect.Type) unsafe.Pointer { @@ -838,6 +841,7 @@ func constructInlineValueEncodeFunc(encode encodeFunc) encodeFunc { // compiles down to zero instructions. // USE CAREFULLY! // This was copied from the runtime; see issues 23382 and 7921. +// //go:nosplit func noescape(p unsafe.Pointer) unsafe.Pointer { x := uintptr(p) @@ -1078,6 +1082,7 @@ var ( float32Type = reflect.TypeOf(float32(0)) float64Type = reflect.TypeOf(float64(0)) + bigIntType = reflect.TypeOf(new(big.Int)) numberType = reflect.TypeOf(json.Number("")) stringType = reflect.TypeOf("") stringsType = reflect.TypeOf([]string(nil)) @@ -1104,6 +1109,8 @@ var ( jsonUnmarshalerType = reflect.TypeOf((*Unmarshaler)(nil)).Elem() textMarshalerType = reflect.TypeOf((*encoding.TextMarshaler)(nil)).Elem() textUnmarshalerType = reflect.TypeOf((*encoding.TextUnmarshaler)(nil)).Elem() + + bigIntDecoder = constructJSONUnmarshalerDecodeFunc(bigIntType, false) ) // ============================================================================= diff --git a/json/decode.go b/json/decode.go index b1723c2..9792af0 100644 --- a/json/decode.go +++ b/json/decode.go @@ -6,6 +6,7 @@ import ( "encoding/json" "fmt" "math" + "math/big" "reflect" "strconv" "time" @@ -16,6 +17,10 @@ import ( "github.com/segmentio/encoding/iso8601" ) +func (d decoder) anyFlagsSet(flags ParseFlags) bool { + return d.flags&flags != 0 +} + func (d decoder) decodeNull(b []byte, p unsafe.Pointer) ([]byte, error) { if hasNullPrefix(b) { return b[4:], nil @@ -738,10 +743,12 @@ func (d decoder) decodeMapStringInterface(b []byte, p unsafe.Pointer) ([]byte, e m = make(map[string]interface{}, 64) } - var err error - var key string - var val interface{} - var input = b + var ( + input = b + key string + val any + err error + ) b = b[1:] for { @@ -1276,6 +1283,7 @@ func (d decoder) decodeInterface(b []byte, p unsafe.Pointer) ([]byte, error) { if err == nil { *(*interface{})(p) = val } + return b, err } @@ -1286,19 +1294,15 @@ func (d decoder) decodeInterface(b []byte, p unsafe.Pointer) ([]byte, error) { switch k.Class() { case Object: - m := make(map[string]interface{}) - v, err = d.decodeMapStringInterface(v, unsafe.Pointer(&m)) - val = m + v, err = decodeInto[map[string]any](&val, v, d, decoder.decodeMapStringInterface) case Array: - a := make([]interface{}, 0, 10) - v, err = d.decodeSlice(v, unsafe.Pointer(&a), unsafe.Sizeof(a[0]), sliceInterfaceType, decoder.decodeInterface) - val = a + size := alignedSize(interfaceType) + fn := constructSliceDecodeFunc(size, sliceInterfaceType, decoder.decodeInterface) + v, err = decodeInto[[]any](&val, v, d, fn) case String: - s := "" - v, err = d.decodeString(v, unsafe.Pointer(&s)) - val = s + v, err = decodeInto[string](&val, v, d, decoder.decodeString) case Null: v, val = nil, nil @@ -1307,15 +1311,7 @@ func (d decoder) decodeInterface(b []byte, p unsafe.Pointer) ([]byte, error) { v, val = nil, k == True case Num: - if (d.flags & UseNumber) != 0 { - n := Number("") - v, err = d.decodeNumber(v, unsafe.Pointer(&n)) - val = n - } else { - f := 0.0 - v, err = d.decodeFloat64(v, unsafe.Pointer(&f)) - val = f - } + v, err = d.decodeDynamicNumber(v, unsafe.Pointer(&val)) default: return b, syntaxError(v, "expected token but found '%c'", v[0]) @@ -1333,6 +1329,68 @@ func (d decoder) decodeInterface(b []byte, p unsafe.Pointer) ([]byte, error) { return b, nil } +func (d decoder) decodeDynamicNumber(b []byte, p unsafe.Pointer) ([]byte, error) { + kind := Float + var err error + + // Only pre-parse for numeric kind if a conditional decode + // has been requested. + if d.anyFlagsSet(UseBigInt | UseInt64 | UseUint64) { + _, _, kind, err = d.parseNumber(b) + if err != nil { + return b, err + } + } + + var rem []byte + anyPtr := (*any)(p) + + // Mutually exclusive integer handling cases. + switch { + // If requested, attempt decode of positive integers as uint64. + case kind == Uint && d.anyFlagsSet(UseUint64): + rem, err = decodeInto[uint64](anyPtr, b, d, decoder.decodeUint64) + if err == nil { + return rem, err + } + + // If uint64 decode was not requested but int64 decode was requested, + // then attempt decode of positive integers as int64. + case kind == Uint && d.anyFlagsSet(UseInt64): + fallthrough + + // If int64 decode was requested, + // attempt decode of negative integers as int64. + case kind == Int && d.anyFlagsSet(UseInt64): + rem, err = decodeInto[int64](anyPtr, b, d, decoder.decodeInt64) + if err == nil { + return rem, err + } + } + + // Fallback numeric handling cases: + // these cannot be combined into the above switch, + // since these cases also handle overflow + // from the above cases, if decode was already attempted. + switch { + // If *big.Int decode was requested, handle that case for any integer. + case kind == Uint && d.anyFlagsSet(UseBigInt): + fallthrough + case kind == Int && d.anyFlagsSet(UseBigInt): + rem, err = decodeInto[*big.Int](anyPtr, b, d, bigIntDecoder) + + // If json.Number decode was requested, handle that for any number. + case d.anyFlagsSet(UseNumber): + rem, err = decodeInto[Number](anyPtr, b, d, decoder.decodeNumber) + + // Fall back to float64 decode when no special decoding has been requested. + default: + rem, err = decodeInto[float64](anyPtr, b, d, decoder.decodeFloat64) + } + + return rem, err +} + func (d decoder) decodeMaybeEmptyInterface(b []byte, p unsafe.Pointer, t reflect.Type) ([]byte, error) { if hasNullPrefix(b) { *(*interface{})(p) = nil @@ -1460,3 +1518,13 @@ func (d decoder) inputError(b []byte, t reflect.Type) ([]byte, error) { } return skipSpaces(r), unmarshalTypeError(b, t) } + +func decodeInto[T any](dest *any, b []byte, d decoder, fn decodeFunc) ([]byte, error) { + var v T + rem, err := fn(d, b, unsafe.Pointer(&v)) + if err == nil { + *dest = v + } + + return rem, err +} diff --git a/json/golang_bench_test.go b/json/golang_bench_test.go index 07cc378..04c9176 100644 --- a/json/golang_bench_test.go +++ b/json/golang_bench_test.go @@ -14,7 +14,7 @@ import ( "bytes" "compress/gzip" "fmt" - "io/ioutil" + "io" "os" "reflect" "runtime" @@ -51,7 +51,7 @@ func codeInit() { if err != nil { panic(err) } - data, err := ioutil.ReadAll(gz) + data, err := io.ReadAll(gz) if err != nil { panic(err) } @@ -88,7 +88,7 @@ func BenchmarkCodeEncoder(b *testing.B) { b.StartTimer() } b.RunParallel(func(pb *testing.PB) { - enc := NewEncoder(ioutil.Discard) + enc := NewEncoder(io.Discard) for pb.Next() { if err := enc.Encode(&codeStruct); err != nil { b.Fatal("Encode:", err) diff --git a/json/int_test.go b/json/int_test.go index 4611375..cca5af1 100644 --- a/json/int_test.go +++ b/json/int_test.go @@ -13,8 +13,7 @@ func TestAppendInt(t *testing.T) { ints = append(ints, int64(u-1), int64(u), int64(u+1), -int64(u)) } - var std [20]byte - var our [20]byte + var std, our [20]byte for _, i := range ints { expected := strconv.AppendInt(std[:], i, 10) diff --git a/json/json.go b/json/json.go index 47f3ba1..d5f6f9d 100644 --- a/json/json.go +++ b/json/json.go @@ -128,6 +128,19 @@ const ( // mode. DontMatchCaseInsensitiveStructFields + // Decode integers into *big.Int. + // Takes precedence over UseNumber for integers. + UseBigInt + + // Decode in-range integers to int64. + // Takes precedence over UseNumber and UseBigInt for in-range integers. + UseInt64 + + // Decode in-range positive integers to uint64. + // Takes precedence over UseNumber, UseBigInt, and UseInt64 + // for positive, in-range integers. + UseUint64 + // ZeroCopy is a parsing flag that combines all the copy optimizations // available in the package. // diff --git a/json/json_test.go b/json/json_test.go index 6840b85..fb77868 100644 --- a/json/json_test.go +++ b/json/json_test.go @@ -9,8 +9,8 @@ import ( "flag" "fmt" "io" - "io/ioutil" "math" + "math/big" "os" "path/filepath" "reflect" @@ -86,6 +86,13 @@ type tree struct { Right *tree } +var ( + // bigPos128 and bigNeg128 are 1<<128 and -1<<128 + // certainly neither is representable using a uint64/int64. + bigPos128 = new(big.Int).Lsh(big.NewInt(1), 128) + bigNeg128 = new(big.Int).Neg(bigPos128) +) + var testValues = [...]interface{}{ // constants nil, @@ -127,6 +134,9 @@ var testValues = [...]interface{}{ float64(math.SmallestNonzeroFloat64), float64(math.MaxFloat64), + bigPos128, + bigNeg128, + // number Number("0"), Number("1234567890"), @@ -402,7 +412,7 @@ func TestCodec(t *testing.T) { t.Logf("found: %#v", x2) } - if b, err := ioutil.ReadAll(dec.Buffered()); err != nil { + if b, err := io.ReadAll(dec.Buffered()); err != nil { t.Error(err) } else if len(b) != 0 { t.Errorf("leftover trailing bytes in the decoder: %q", b) @@ -485,6 +495,134 @@ func TestCodecDuration(t *testing.T) { } } +var numericParseTests = [...]struct { + name string + input string + flags ParseFlags + want any +}{ + { + name: "zero_flags_default", + input: `0`, + flags: 0, + want: float64(0), + }, + { + name: "zero_flags_int_uint_bigint_number", + input: `0`, + flags: UseInt64 | UseUint64 | UseBigInt | UseNumber, + want: uint64(0), + }, + { + name: "zero_flags_int_bigint_number", + input: `0`, + flags: UseInt64 | UseBigInt | UseNumber, + want: int64(0), + }, + { + name: "zero_flags_bigint_number", + input: `0`, + flags: UseBigInt | UseNumber, + want: big.NewInt(0), + }, + { + name: "zero_flags_number", + input: `0`, + flags: UseNumber, + want: json.Number(`0`), + }, + { + name: "max_uint64_flags_default", + input: fmt.Sprint(uint64(math.MaxUint64)), + flags: 0, + want: float64(math.MaxUint64), + }, + { + name: "max_uint64_flags_int_uint_bigint_number", + input: fmt.Sprint(uint64(math.MaxUint64)), + flags: UseInt64 | UseUint64 | UseBigInt | UseNumber, + want: uint64(math.MaxUint64), + }, + { + name: "min_int64_flags_uint_int_bigint_number", + input: fmt.Sprint(int64(math.MinInt64)), + flags: UseInt64 | UseBigInt | UseNumber, + want: int64(math.MinInt64), + }, + { + name: "max_uint64_flags_int_bigint_number", + input: fmt.Sprint(uint64(math.MaxUint64)), + flags: UseInt64 | UseBigInt | UseNumber, + want: new(big.Int).SetUint64(math.MaxUint64), + }, + { + name: "overflow_uint64_flags_uint_int_bigint_number", + input: bigPos128.String(), + flags: UseUint64 | UseInt64 | UseBigInt | UseNumber, + want: bigPos128, + }, + { + name: "underflow_uint64_flags_uint_int_bigint_number", + input: bigNeg128.String(), + flags: UseUint64 | UseInt64 | UseBigInt | UseNumber, + want: bigNeg128, + }, + { + name: "overflow_uint64_flags_uint_int_number", + input: bigPos128.String(), + flags: UseUint64 | UseInt64 | UseNumber, + want: json.Number(bigPos128.String()), + }, + { + name: "underflow_uint64_flags_uint_int_number", + input: bigNeg128.String(), + flags: UseUint64 | UseInt64 | UseNumber, + want: json.Number(bigNeg128.String()), + }, + { + name: "overflow_uint64_flags_uint_int", + input: bigPos128.String(), + flags: UseUint64 | UseInt64, + want: float64(1 << 128), + }, + { + name: "underflow_uint64_flags_uint_int", + input: bigNeg128.String(), + flags: UseUint64 | UseInt64, + want: float64(-1 << 128), + }, +} + +func TestParse_numeric(t *testing.T) { + t.Parallel() + + for _, test := range numericParseTests { + test := test + + t.Run(test.name, func(t *testing.T) { + t.Parallel() + + var got any + + rem, err := Parse([]byte(test.input), &got, test.flags) + if err != nil { + format := "Parse(%#q, ..., %#b) = %q [error], want nil" + t.Errorf(format, test.input, test.flags, err) + } + + if len(rem) != 0 { + format := "Parse(%#q, ..., %#b) = %#q, want zero length" + t.Errorf(format, test.input, test.flags, rem) + } + + if !reflect.DeepEqual(got, test.want) { + format := "Parse(%#q, %#b) -> %T(%#[3]v), want %T(%#[4]v)" + t.Errorf(format, test.input, test.flags, got, test.want) + } + }) + } +} + func newValue(model interface{}) reflect.Value { if model == nil { return reflect.New(reflect.TypeOf(&model).Elem()) diff --git a/proto/fixtures/generate/main.go b/proto/fixtures/generate/main.go index 3016afd..0614934 100644 --- a/proto/fixtures/generate/main.go +++ b/proto/fixtures/generate/main.go @@ -1,7 +1,6 @@ package main import ( - "io/ioutil" "os" "github.com/golang/protobuf/proto" @@ -28,6 +27,6 @@ func main() { for _, test := range tests { b, _ := proto.Marshal(&test.value) - ioutil.WriteFile("protobuf/"+test.name, b, 0644) + os.WriteFile("protobuf/"+test.name, b, 0644) } } diff --git a/proto/proto_test.go b/proto/proto_test.go index fd7573a..d978aaf 100644 --- a/proto/proto_test.go +++ b/proto/proto_test.go @@ -3,8 +3,8 @@ package proto import ( "encoding/binary" "fmt" - "io/ioutil" "math" + "os" "reflect" "testing" ) @@ -366,7 +366,7 @@ func TestMarshalUnmarshal(t *testing.T) { } func loadProtobuf(t *testing.T, fileName string) RawMessage { - b, err := ioutil.ReadFile("fixtures/protobuf/" + fileName) + b, err := os.ReadFile("fixtures/protobuf/" + fileName) if err != nil { t.Fatal(err) } diff --git a/thrift/decode.go b/thrift/decode.go index 341d10a..4cc394b 100644 --- a/thrift/decode.go +++ b/thrift/decode.go @@ -5,7 +5,6 @@ import ( "bytes" "fmt" "io" - "io/ioutil" "reflect" "sync/atomic" ) @@ -656,7 +655,7 @@ func skipBinary(r Reader) error { case *bufio.Reader: _, err = x.Discard(int(n)) default: - _, err = io.CopyN(ioutil.Discard, x, int64(n)) + _, err = io.CopyN(io.Discard, x, int64(n)) } return dontExpectEOF(err) }