vendor: update buildkit to master@ae9d0f5

Signed-off-by: Justin Chadwell <me@jedevc.com>
This commit is contained in:
Justin Chadwell
2022-11-22 14:39:36 +00:00
parent 6e9b743296
commit 36e663edda
375 changed files with 14834 additions and 13552 deletions

View File

@ -17,6 +17,49 @@ This package provides various compression algorithms.
# changelog
* Sept 26, 2022 (v1.15.11)
* flate: Improve level 1-3 compression https://github.com/klauspost/compress/pull/678
* zstd: Improve "best" compression by @nightwolfz in https://github.com/klauspost/compress/pull/677
* zstd: Fix+reduce decompression allocations https://github.com/klauspost/compress/pull/668
* zstd: Fix non-effective noescape tag https://github.com/klauspost/compress/pull/667
* Sept 16, 2022 (v1.15.10)
* zstd: Add [WithDecodeAllCapLimit](https://pkg.go.dev/github.com/klauspost/compress@v1.15.10/zstd#WithDecodeAllCapLimit) https://github.com/klauspost/compress/pull/649
* Add Go 1.19 - deprecate Go 1.16 https://github.com/klauspost/compress/pull/651
* flate: Improve level 5+6 compression https://github.com/klauspost/compress/pull/656
* zstd: Improve "better" compresssion https://github.com/klauspost/compress/pull/657
* s2: Improve "best" compression https://github.com/klauspost/compress/pull/658
* s2: Improve "better" compression. https://github.com/klauspost/compress/pull/635
* s2: Slightly faster non-assembly decompression https://github.com/klauspost/compress/pull/646
* Use arrays for constant size copies https://github.com/klauspost/compress/pull/659
* July 21, 2022 (v1.15.9)
* zstd: Fix decoder crash on amd64 (no BMI) on invalid input https://github.com/klauspost/compress/pull/645
* zstd: Disable decoder extended memory copies (amd64) due to possible crashes https://github.com/klauspost/compress/pull/644
* zstd: Allow single segments up to "max decoded size" by @klauspost in https://github.com/klauspost/compress/pull/643
* July 13, 2022 (v1.15.8)
* gzip: fix stack exhaustion bug in Reader.Read https://github.com/klauspost/compress/pull/641
* s2: Add Index header trim/restore https://github.com/klauspost/compress/pull/638
* zstd: Optimize seqdeq amd64 asm by @greatroar in https://github.com/klauspost/compress/pull/636
* zstd: Improve decoder memcopy https://github.com/klauspost/compress/pull/637
* huff0: Pass a single bitReader pointer to asm by @greatroar in https://github.com/klauspost/compress/pull/634
* zstd: Branchless getBits for amd64 w/o BMI2 by @greatroar in https://github.com/klauspost/compress/pull/640
* gzhttp: Remove header before writing https://github.com/klauspost/compress/pull/639
* June 29, 2022 (v1.15.7)
* s2: Fix absolute forward seeks https://github.com/klauspost/compress/pull/633
* zip: Merge upstream https://github.com/klauspost/compress/pull/631
* zip: Re-add zip64 fix https://github.com/klauspost/compress/pull/624
* zstd: translate fseDecoder.buildDtable into asm by @WojciechMula in https://github.com/klauspost/compress/pull/598
* flate: Faster histograms https://github.com/klauspost/compress/pull/620
* deflate: Use compound hcode https://github.com/klauspost/compress/pull/622
* June 3, 2022 (v1.15.6)
* s2: Improve coding for long, close matches https://github.com/klauspost/compress/pull/613
* s2c: Add Snappy/S2 stream recompression https://github.com/klauspost/compress/pull/611
@ -72,15 +115,15 @@ This package provides various compression algorithms.
* gzhttp: Add zstd to transport by @klauspost in [#400](https://github.com/klauspost/compress/pull/400)
* gzhttp: Make content-type optional by @klauspost in [#510](https://github.com/klauspost/compress/pull/510)
<details>
<summary>See Details</summary>
Both compression and decompression now supports "synchronous" stream operations. This means that whenever "concurrency" is set to 1, they will operate without spawning goroutines.
Stream decompression is now faster on asynchronous, since the goroutine allocation much more effectively splits the workload. On typical streams this will typically use 2 cores fully for decompression. When a stream has finished decoding no goroutines will be left over, so decoders can now safely be pooled and still be garbage collected.
While the release has been extensively tested, it is recommended to testing when upgrading.
</details>
<details>
<summary>See changes to v1.14.x</summary>
* Feb 22, 2022 (v1.14.4)
* flate: Fix rare huffman only (-2) corruption. [#503](https://github.com/klauspost/compress/pull/503)
* zip: Update deprecated CreateHeaderRaw to correctly call CreateRaw by @saracen in [#502](https://github.com/klauspost/compress/pull/502)
@ -106,6 +149,7 @@ While the release has been extensively tested, it is recommended to testing when
* zstd: Performance improvement in [#420]( https://github.com/klauspost/compress/pull/420) [#456](https://github.com/klauspost/compress/pull/456) [#437](https://github.com/klauspost/compress/pull/437) [#467](https://github.com/klauspost/compress/pull/467) [#468](https://github.com/klauspost/compress/pull/468)
* zstd: add arm64 xxhash assembly in [#464](https://github.com/klauspost/compress/pull/464)
* Add garbled for binaries for s2 in [#445](https://github.com/klauspost/compress/pull/445)
</details>
<details>
<summary>See changes to v1.13.x</summary>

View File

@ -763,17 +763,20 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
copy(out, buf[0][:])
copy(out[dstEvery:], buf[1][:])
copy(out[dstEvery*2:], buf[2][:])
copy(out[dstEvery*3:], buf[3][:])
out = out[bufoff:]
decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
if len(out)-bufoff < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
//copy(out, buf[0][:])
//copy(out[dstEvery:], buf[1][:])
//copy(out[dstEvery*2:], buf[2][:])
*(*[bufoff]byte)(out) = buf[0]
*(*[bufoff]byte)(out[dstEvery:]) = buf[1]
*(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
*(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
out = out[bufoff:]
decoded += bufoff * 4
}
}
if off > 0 {
@ -997,17 +1000,22 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
copy(out, buf[0][:])
copy(out[dstEvery:], buf[1][:])
copy(out[dstEvery*2:], buf[2][:])
copy(out[dstEvery*3:], buf[3][:])
out = out[bufoff:]
decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
if len(out)-bufoff < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
//copy(out, buf[0][:])
//copy(out[dstEvery:], buf[1][:])
//copy(out[dstEvery*2:], buf[2][:])
// copy(out[dstEvery*3:], buf[3][:])
*(*[bufoff]byte)(out) = buf[0]
*(*[bufoff]byte)(out[dstEvery:]) = buf[1]
*(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
*(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
out = out[bufoff:]
decoded += bufoff * 4
}
}
if off > 0 {

View File

@ -14,12 +14,14 @@ import (
// decompress4x_main_loop_x86 is an x86 assembler implementation
// of Decompress4X when tablelog > 8.
//
//go:noescape
func decompress4x_main_loop_amd64(ctx *decompress4xContext)
// decompress4x_8b_loop_x86 is an x86 assembler implementation
// of Decompress4X when tablelog <= 8 which decodes 4 entries
// per loop.
//
//go:noescape
func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
@ -27,10 +29,7 @@ func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
const fallback8BitSize = 800
type decompress4xContext struct {
pbr0 *bitReaderShifted
pbr1 *bitReaderShifted
pbr2 *bitReaderShifted
pbr3 *bitReaderShifted
pbr *[4]bitReaderShifted
peekBits uint8
out *byte
dstEvery int
@ -89,10 +88,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
ctx := decompress4xContext{
pbr0: &br[0],
pbr1: &br[1],
pbr2: &br[2],
pbr3: &br[3],
pbr: &br,
peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
out: &out[0],
dstEvery: dstEvery,
@ -151,11 +147,13 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
// decompress4x_main_loop_x86 is an x86 assembler implementation
// of Decompress1X when tablelog > 8.
//
//go:noescape
func decompress1x_main_loop_amd64(ctx *decompress1xContext)
// decompress4x_main_loop_x86 is an x86 with BMI2 assembler implementation
// of Decompress1X when tablelog > 8.
//
//go:noescape
func decompress1x_main_loop_bmi2(ctx *decompress1xContext)

View File

@ -1,48 +1,42 @@
// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
//go:build amd64 && !appengine && !noasm && gc
// +build amd64,!appengine,!noasm,gc
// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
TEXT ·decompress4x_main_loop_amd64(SB), $8-8
TEXT ·decompress4x_main_loop_amd64(SB), $0-8
XORQ DX, DX
// Preload values
MOVQ ctx+0(FP), AX
MOVBQZX 32(AX), SI
MOVQ 40(AX), DI
MOVQ DI, BX
MOVQ 72(AX), CX
MOVQ CX, (SP)
MOVQ 48(AX), R8
MOVQ 56(AX), R9
MOVQ (AX), R10
MOVQ 8(AX), R11
MOVQ 16(AX), R12
MOVQ 24(AX), R13
MOVBQZX 8(AX), DI
MOVQ 16(AX), SI
MOVQ 48(AX), BX
MOVQ 24(AX), R9
MOVQ 32(AX), R10
MOVQ (AX), R11
// Main loop
main_loop:
MOVQ BX, DI
CMPQ DI, (SP)
MOVQ SI, R8
CMPQ R8, BX
SETGE DL
// br0.fillFast32()
MOVQ 32(R10), R14
MOVBQZX 40(R10), R15
CMPQ R15, $0x20
MOVQ 32(R11), R12
MOVBQZX 40(R11), R13
CMPQ R13, $0x20
JBE skip_fill0
MOVQ 24(R10), AX
SUBQ $0x20, R15
MOVQ 24(R11), AX
SUBQ $0x20, R13
SUBQ $0x04, AX
MOVQ (R10), BP
MOVQ (R11), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(BP*1), BP
MOVQ R15, CX
SHLQ CL, BP
MOVQ AX, 24(R10)
ORQ BP, R14
MOVL (AX)(R14*1), R14
MOVQ R13, CX
SHLQ CL, R14
MOVQ AX, 24(R11)
ORQ R14, R12
// exhausted = exhausted || (br0.off < 4)
CMPQ AX, $0x04
@ -51,57 +45,57 @@ main_loop:
skip_fill0:
// val0 := br0.peekTopBits(peekBits)
MOVQ R14, BP
MOVQ SI, CX
SHRQ CL, BP
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v0 := table[val0&mask]
MOVW (R9)(BP*2), CX
MOVW (R10)(R14*2), CX
// br0.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R14
ADDB CL, R15
SHLQ CL, R12
ADDB CL, R13
// val1 := br0.peekTopBits(peekBits)
MOVQ SI, CX
MOVQ R14, BP
SHRQ CL, BP
MOVQ DI, CX
MOVQ R12, R14
SHRQ CL, R14
// v1 := table[val1&mask]
MOVW (R9)(BP*2), CX
MOVW (R10)(R14*2), CX
// br0.advance(uint8(v1.entry))
MOVB CH, AH
SHLQ CL, R14
ADDB CL, R15
SHLQ CL, R12
ADDB CL, R13
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (DI)
MOVW AX, (R8)
// update the bitrader reader structure
MOVQ R14, 32(R10)
MOVB R15, 40(R10)
ADDQ R8, DI
// update the bitreader structure
MOVQ R12, 32(R11)
MOVB R13, 40(R11)
ADDQ R9, R8
// br1.fillFast32()
MOVQ 32(R11), R14
MOVBQZX 40(R11), R15
CMPQ R15, $0x20
MOVQ 80(R11), R12
MOVBQZX 88(R11), R13
CMPQ R13, $0x20
JBE skip_fill1
MOVQ 24(R11), AX
SUBQ $0x20, R15
MOVQ 72(R11), AX
SUBQ $0x20, R13
SUBQ $0x04, AX
MOVQ (R11), BP
MOVQ 48(R11), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(BP*1), BP
MOVQ R15, CX
SHLQ CL, BP
MOVQ AX, 24(R11)
ORQ BP, R14
MOVL (AX)(R14*1), R14
MOVQ R13, CX
SHLQ CL, R14
MOVQ AX, 72(R11)
ORQ R14, R12
// exhausted = exhausted || (br1.off < 4)
CMPQ AX, $0x04
@ -110,57 +104,57 @@ skip_fill0:
skip_fill1:
// val0 := br1.peekTopBits(peekBits)
MOVQ R14, BP
MOVQ SI, CX
SHRQ CL, BP
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v0 := table[val0&mask]
MOVW (R9)(BP*2), CX
MOVW (R10)(R14*2), CX
// br1.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R14
ADDB CL, R15
SHLQ CL, R12
ADDB CL, R13
// val1 := br1.peekTopBits(peekBits)
MOVQ SI, CX
MOVQ R14, BP
SHRQ CL, BP
MOVQ DI, CX
MOVQ R12, R14
SHRQ CL, R14
// v1 := table[val1&mask]
MOVW (R9)(BP*2), CX
MOVW (R10)(R14*2), CX
// br1.advance(uint8(v1.entry))
MOVB CH, AH
SHLQ CL, R14
ADDB CL, R15
SHLQ CL, R12
ADDB CL, R13
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (DI)
MOVW AX, (R8)
// update the bitrader reader structure
MOVQ R14, 32(R11)
MOVB R15, 40(R11)
ADDQ R8, DI
// update the bitreader structure
MOVQ R12, 80(R11)
MOVB R13, 88(R11)
ADDQ R9, R8
// br2.fillFast32()
MOVQ 32(R12), R14
MOVBQZX 40(R12), R15
CMPQ R15, $0x20
MOVQ 128(R11), R12
MOVBQZX 136(R11), R13
CMPQ R13, $0x20
JBE skip_fill2
MOVQ 24(R12), AX
SUBQ $0x20, R15
MOVQ 120(R11), AX
SUBQ $0x20, R13
SUBQ $0x04, AX
MOVQ (R12), BP
MOVQ 96(R11), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(BP*1), BP
MOVQ R15, CX
SHLQ CL, BP
MOVQ AX, 24(R12)
ORQ BP, R14
MOVL (AX)(R14*1), R14
MOVQ R13, CX
SHLQ CL, R14
MOVQ AX, 120(R11)
ORQ R14, R12
// exhausted = exhausted || (br2.off < 4)
CMPQ AX, $0x04
@ -169,57 +163,57 @@ skip_fill1:
skip_fill2:
// val0 := br2.peekTopBits(peekBits)
MOVQ R14, BP
MOVQ SI, CX
SHRQ CL, BP
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v0 := table[val0&mask]
MOVW (R9)(BP*2), CX
MOVW (R10)(R14*2), CX
// br2.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R14
ADDB CL, R15
SHLQ CL, R12
ADDB CL, R13
// val1 := br2.peekTopBits(peekBits)
MOVQ SI, CX
MOVQ R14, BP
SHRQ CL, BP
MOVQ DI, CX
MOVQ R12, R14
SHRQ CL, R14
// v1 := table[val1&mask]
MOVW (R9)(BP*2), CX
MOVW (R10)(R14*2), CX
// br2.advance(uint8(v1.entry))
MOVB CH, AH
SHLQ CL, R14
ADDB CL, R15
SHLQ CL, R12
ADDB CL, R13
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (DI)
MOVW AX, (R8)
// update the bitrader reader structure
MOVQ R14, 32(R12)
MOVB R15, 40(R12)
ADDQ R8, DI
// update the bitreader structure
MOVQ R12, 128(R11)
MOVB R13, 136(R11)
ADDQ R9, R8
// br3.fillFast32()
MOVQ 32(R13), R14
MOVBQZX 40(R13), R15
CMPQ R15, $0x20
MOVQ 176(R11), R12
MOVBQZX 184(R11), R13
CMPQ R13, $0x20
JBE skip_fill3
MOVQ 24(R13), AX
SUBQ $0x20, R15
MOVQ 168(R11), AX
SUBQ $0x20, R13
SUBQ $0x04, AX
MOVQ (R13), BP
MOVQ 144(R11), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(BP*1), BP
MOVQ R15, CX
SHLQ CL, BP
MOVQ AX, 24(R13)
ORQ BP, R14
MOVL (AX)(R14*1), R14
MOVQ R13, CX
SHLQ CL, R14
MOVQ AX, 168(R11)
ORQ R14, R12
// exhausted = exhausted || (br3.off < 4)
CMPQ AX, $0x04
@ -228,149 +222,142 @@ skip_fill2:
skip_fill3:
// val0 := br3.peekTopBits(peekBits)
MOVQ R14, BP
MOVQ SI, CX
SHRQ CL, BP
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v0 := table[val0&mask]
MOVW (R9)(BP*2), CX
MOVW (R10)(R14*2), CX
// br3.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R14
ADDB CL, R15
SHLQ CL, R12
ADDB CL, R13
// val1 := br3.peekTopBits(peekBits)
MOVQ SI, CX
MOVQ R14, BP
SHRQ CL, BP
MOVQ DI, CX
MOVQ R12, R14
SHRQ CL, R14
// v1 := table[val1&mask]
MOVW (R9)(BP*2), CX
MOVW (R10)(R14*2), CX
// br3.advance(uint8(v1.entry))
MOVB CH, AH
SHLQ CL, R14
ADDB CL, R15
SHLQ CL, R12
ADDB CL, R13
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (DI)
MOVW AX, (R8)
// update the bitrader reader structure
MOVQ R14, 32(R13)
MOVB R15, 40(R13)
ADDQ $0x02, BX
// update the bitreader structure
MOVQ R12, 176(R11)
MOVB R13, 184(R11)
ADDQ $0x02, SI
TESTB DL, DL
JZ main_loop
MOVQ ctx+0(FP), AX
MOVQ 40(AX), CX
MOVQ BX, DX
SUBQ CX, DX
SHLQ $0x02, DX
MOVQ DX, 64(AX)
SUBQ 16(AX), SI
SHLQ $0x02, SI
MOVQ SI, 40(AX)
RET
// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
TEXT ·decompress4x_8b_main_loop_amd64(SB), $16-8
TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
XORQ DX, DX
// Preload values
MOVQ ctx+0(FP), CX
MOVBQZX 32(CX), BX
MOVQ 40(CX), SI
MOVQ SI, (SP)
MOVQ 72(CX), DX
MOVQ DX, 8(SP)
MOVQ 48(CX), DI
MOVQ 56(CX), R8
MOVQ (CX), R9
MOVQ 8(CX), R10
MOVQ 16(CX), R11
MOVQ 24(CX), R12
MOVBQZX 8(CX), DI
MOVQ 16(CX), BX
MOVQ 48(CX), SI
MOVQ 24(CX), R9
MOVQ 32(CX), R10
MOVQ (CX), R11
// Main loop
main_loop:
MOVQ (SP), SI
CMPQ SI, 8(SP)
MOVQ BX, R8
CMPQ R8, SI
SETGE DL
// br1000.fillFast32()
MOVQ 32(R9), R13
MOVBQZX 40(R9), R14
CMPQ R14, $0x20
JBE skip_fill1000
MOVQ 24(R9), R15
SUBQ $0x20, R14
SUBQ $0x04, R15
MOVQ (R9), BP
// br0.fillFast32()
MOVQ 32(R11), R12
MOVBQZX 40(R11), R13
CMPQ R13, $0x20
JBE skip_fill0
MOVQ 24(R11), R14
SUBQ $0x20, R13
SUBQ $0x04, R14
MOVQ (R11), R15
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R15)(BP*1), BP
MOVQ R14, CX
SHLQ CL, BP
MOVQ R15, 24(R9)
ORQ BP, R13
MOVL (R14)(R15*1), R15
MOVQ R13, CX
SHLQ CL, R15
MOVQ R14, 24(R11)
ORQ R15, R12
// exhausted = exhausted || (br1000.off < 4)
CMPQ R15, $0x04
// exhausted = exhausted || (br0.off < 4)
CMPQ R14, $0x04
SETLT AL
ORB AL, DL
skip_fill1000:
skip_fill0:
// val0 := br0.peekTopBits(peekBits)
MOVQ R13, R15
MOVQ BX, CX
SHRQ CL, R15
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v0 := table[val0&mask]
MOVW (R8)(R15*2), CX
MOVW (R10)(R14*2), CX
// br0.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R13
ADDB CL, R14
SHLQ CL, R12
ADDB CL, R13
// val1 := br0.peekTopBits(peekBits)
MOVQ R13, R15
MOVQ BX, CX
SHRQ CL, R15
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v1 := table[val0&mask]
MOVW (R8)(R15*2), CX
MOVW (R10)(R14*2), CX
// br0.advance(uint8(v1.entry)
MOVB CH, AH
SHLQ CL, R13
ADDB CL, R14
SHLQ CL, R12
ADDB CL, R13
BSWAPL AX
// val2 := br0.peekTopBits(peekBits)
MOVQ R13, R15
MOVQ BX, CX
SHRQ CL, R15
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v2 := table[val0&mask]
MOVW (R8)(R15*2), CX
MOVW (R10)(R14*2), CX
// br0.advance(uint8(v2.entry)
MOVB CH, AH
SHLQ CL, R13
ADDB CL, R14
SHLQ CL, R12
ADDB CL, R13
// val3 := br0.peekTopBits(peekBits)
MOVQ R13, R15
MOVQ BX, CX
SHRQ CL, R15
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v3 := table[val0&mask]
MOVW (R8)(R15*2), CX
MOVW (R10)(R14*2), CX
// br0.advance(uint8(v3.entry)
MOVB CH, AL
SHLQ CL, R13
ADDB CL, R14
SHLQ CL, R12
ADDB CL, R13
BSWAPL AX
// these four writes get coalesced
@ -378,88 +365,88 @@ skip_fill1000:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (SI)
MOVL AX, (R8)
// update the bitreader reader structure
MOVQ R13, 32(R9)
MOVB R14, 40(R9)
ADDQ DI, SI
// update the bitreader structure
MOVQ R12, 32(R11)
MOVB R13, 40(R11)
ADDQ R9, R8
// br1001.fillFast32()
MOVQ 32(R10), R13
MOVBQZX 40(R10), R14
CMPQ R14, $0x20
JBE skip_fill1001
MOVQ 24(R10), R15
SUBQ $0x20, R14
SUBQ $0x04, R15
MOVQ (R10), BP
// br1.fillFast32()
MOVQ 80(R11), R12
MOVBQZX 88(R11), R13
CMPQ R13, $0x20
JBE skip_fill1
MOVQ 72(R11), R14
SUBQ $0x20, R13
SUBQ $0x04, R14
MOVQ 48(R11), R15
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R15)(BP*1), BP
MOVQ R14, CX
SHLQ CL, BP
MOVQ R15, 24(R10)
ORQ BP, R13
MOVL (R14)(R15*1), R15
MOVQ R13, CX
SHLQ CL, R15
MOVQ R14, 72(R11)
ORQ R15, R12
// exhausted = exhausted || (br1001.off < 4)
CMPQ R15, $0x04
// exhausted = exhausted || (br1.off < 4)
CMPQ R14, $0x04
SETLT AL
ORB AL, DL
skip_fill1001:
skip_fill1:
// val0 := br1.peekTopBits(peekBits)
MOVQ R13, R15
MOVQ BX, CX
SHRQ CL, R15
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v0 := table[val0&mask]
MOVW (R8)(R15*2), CX
MOVW (R10)(R14*2), CX
// br1.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R13
ADDB CL, R14
SHLQ CL, R12
ADDB CL, R13
// val1 := br1.peekTopBits(peekBits)
MOVQ R13, R15
MOVQ BX, CX
SHRQ CL, R15
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v1 := table[val0&mask]
MOVW (R8)(R15*2), CX
MOVW (R10)(R14*2), CX
// br1.advance(uint8(v1.entry)
MOVB CH, AH
SHLQ CL, R13
ADDB CL, R14
SHLQ CL, R12
ADDB CL, R13
BSWAPL AX
// val2 := br1.peekTopBits(peekBits)
MOVQ R13, R15
MOVQ BX, CX
SHRQ CL, R15
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v2 := table[val0&mask]
MOVW (R8)(R15*2), CX
MOVW (R10)(R14*2), CX
// br1.advance(uint8(v2.entry)
MOVB CH, AH
SHLQ CL, R13
ADDB CL, R14
SHLQ CL, R12
ADDB CL, R13
// val3 := br1.peekTopBits(peekBits)
MOVQ R13, R15
MOVQ BX, CX
SHRQ CL, R15
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v3 := table[val0&mask]
MOVW (R8)(R15*2), CX
MOVW (R10)(R14*2), CX
// br1.advance(uint8(v3.entry)
MOVB CH, AL
SHLQ CL, R13
ADDB CL, R14
SHLQ CL, R12
ADDB CL, R13
BSWAPL AX
// these four writes get coalesced
@ -467,88 +454,88 @@ skip_fill1001:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (SI)
MOVL AX, (R8)
// update the bitreader reader structure
MOVQ R13, 32(R10)
MOVB R14, 40(R10)
ADDQ DI, SI
// update the bitreader structure
MOVQ R12, 80(R11)
MOVB R13, 88(R11)
ADDQ R9, R8
// br1002.fillFast32()
MOVQ 32(R11), R13
MOVBQZX 40(R11), R14
CMPQ R14, $0x20
JBE skip_fill1002
MOVQ 24(R11), R15
SUBQ $0x20, R14
SUBQ $0x04, R15
MOVQ (R11), BP
// br2.fillFast32()
MOVQ 128(R11), R12
MOVBQZX 136(R11), R13
CMPQ R13, $0x20
JBE skip_fill2
MOVQ 120(R11), R14
SUBQ $0x20, R13
SUBQ $0x04, R14
MOVQ 96(R11), R15
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R15)(BP*1), BP
MOVQ R14, CX
SHLQ CL, BP
MOVQ R15, 24(R11)
ORQ BP, R13
MOVL (R14)(R15*1), R15
MOVQ R13, CX
SHLQ CL, R15
MOVQ R14, 120(R11)
ORQ R15, R12
// exhausted = exhausted || (br1002.off < 4)
CMPQ R15, $0x04
// exhausted = exhausted || (br2.off < 4)
CMPQ R14, $0x04
SETLT AL
ORB AL, DL
skip_fill1002:
skip_fill2:
// val0 := br2.peekTopBits(peekBits)
MOVQ R13, R15
MOVQ BX, CX
SHRQ CL, R15
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v0 := table[val0&mask]
MOVW (R8)(R15*2), CX
MOVW (R10)(R14*2), CX
// br2.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R13
ADDB CL, R14
SHLQ CL, R12
ADDB CL, R13
// val1 := br2.peekTopBits(peekBits)
MOVQ R13, R15
MOVQ BX, CX
SHRQ CL, R15
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v1 := table[val0&mask]
MOVW (R8)(R15*2), CX
MOVW (R10)(R14*2), CX
// br2.advance(uint8(v1.entry)
MOVB CH, AH
SHLQ CL, R13
ADDB CL, R14
SHLQ CL, R12
ADDB CL, R13
BSWAPL AX
// val2 := br2.peekTopBits(peekBits)
MOVQ R13, R15
MOVQ BX, CX
SHRQ CL, R15
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v2 := table[val0&mask]
MOVW (R8)(R15*2), CX
MOVW (R10)(R14*2), CX
// br2.advance(uint8(v2.entry)
MOVB CH, AH
SHLQ CL, R13
ADDB CL, R14
SHLQ CL, R12
ADDB CL, R13
// val3 := br2.peekTopBits(peekBits)
MOVQ R13, R15
MOVQ BX, CX
SHRQ CL, R15
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v3 := table[val0&mask]
MOVW (R8)(R15*2), CX
MOVW (R10)(R14*2), CX
// br2.advance(uint8(v3.entry)
MOVB CH, AL
SHLQ CL, R13
ADDB CL, R14
SHLQ CL, R12
ADDB CL, R13
BSWAPL AX
// these four writes get coalesced
@ -556,88 +543,88 @@ skip_fill1002:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (SI)
MOVL AX, (R8)
// update the bitreader reader structure
MOVQ R13, 32(R11)
MOVB R14, 40(R11)
ADDQ DI, SI
// update the bitreader structure
MOVQ R12, 128(R11)
MOVB R13, 136(R11)
ADDQ R9, R8
// br1003.fillFast32()
MOVQ 32(R12), R13
MOVBQZX 40(R12), R14
CMPQ R14, $0x20
JBE skip_fill1003
MOVQ 24(R12), R15
SUBQ $0x20, R14
SUBQ $0x04, R15
MOVQ (R12), BP
// br3.fillFast32()
MOVQ 176(R11), R12
MOVBQZX 184(R11), R13
CMPQ R13, $0x20
JBE skip_fill3
MOVQ 168(R11), R14
SUBQ $0x20, R13
SUBQ $0x04, R14
MOVQ 144(R11), R15
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R15)(BP*1), BP
MOVQ R14, CX
SHLQ CL, BP
MOVQ R15, 24(R12)
ORQ BP, R13
MOVL (R14)(R15*1), R15
MOVQ R13, CX
SHLQ CL, R15
MOVQ R14, 168(R11)
ORQ R15, R12
// exhausted = exhausted || (br1003.off < 4)
CMPQ R15, $0x04
// exhausted = exhausted || (br3.off < 4)
CMPQ R14, $0x04
SETLT AL
ORB AL, DL
skip_fill1003:
skip_fill3:
// val0 := br3.peekTopBits(peekBits)
MOVQ R13, R15
MOVQ BX, CX
SHRQ CL, R15
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v0 := table[val0&mask]
MOVW (R8)(R15*2), CX
MOVW (R10)(R14*2), CX
// br3.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R13
ADDB CL, R14
SHLQ CL, R12
ADDB CL, R13
// val1 := br3.peekTopBits(peekBits)
MOVQ R13, R15
MOVQ BX, CX
SHRQ CL, R15
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v1 := table[val0&mask]
MOVW (R8)(R15*2), CX
MOVW (R10)(R14*2), CX
// br3.advance(uint8(v1.entry)
MOVB CH, AH
SHLQ CL, R13
ADDB CL, R14
SHLQ CL, R12
ADDB CL, R13
BSWAPL AX
// val2 := br3.peekTopBits(peekBits)
MOVQ R13, R15
MOVQ BX, CX
SHRQ CL, R15
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v2 := table[val0&mask]
MOVW (R8)(R15*2), CX
MOVW (R10)(R14*2), CX
// br3.advance(uint8(v2.entry)
MOVB CH, AH
SHLQ CL, R13
ADDB CL, R14
SHLQ CL, R12
ADDB CL, R13
// val3 := br3.peekTopBits(peekBits)
MOVQ R13, R15
MOVQ BX, CX
SHRQ CL, R15
MOVQ R12, R14
MOVQ DI, CX
SHRQ CL, R14
// v3 := table[val0&mask]
MOVW (R8)(R15*2), CX
MOVW (R10)(R14*2), CX
// br3.advance(uint8(v3.entry)
MOVB CH, AL
SHLQ CL, R13
ADDB CL, R14
SHLQ CL, R12
ADDB CL, R13
BSWAPL AX
// these four writes get coalesced
@ -645,20 +632,18 @@ skip_fill1003:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (SI)
MOVL AX, (R8)
// update the bitreader reader structure
MOVQ R13, 32(R12)
MOVB R14, 40(R12)
ADDQ $0x04, (SP)
// update the bitreader structure
MOVQ R12, 176(R11)
MOVB R13, 184(R11)
ADDQ $0x04, BX
TESTB DL, DL
JZ main_loop
MOVQ ctx+0(FP), AX
MOVQ 40(AX), CX
MOVQ (SP), DX
SUBQ CX, DX
SHLQ $0x02, DX
MOVQ DX, 64(AX)
SUBQ 16(AX), BX
SHLQ $0x02, BX
MOVQ BX, 40(AX)
RET
// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
@ -750,10 +735,8 @@ loop_condition:
// Update ctx structure
MOVQ ctx+0(FP), AX
MOVQ DX, CX
MOVQ 16(AX), DX
SUBQ DX, CX
MOVQ CX, 40(AX)
SUBQ 16(AX), DX
MOVQ DX, 40(AX)
MOVQ (AX), AX
MOVQ R9, 24(AX)
MOVQ R10, 32(AX)
@ -847,10 +830,8 @@ loop_condition:
// Update ctx structure
MOVQ ctx+0(FP), AX
MOVQ DX, CX
MOVQ 16(AX), DX
SUBQ DX, CX
MOVQ CX, 40(AX)
SUBQ 16(AX), DX
MOVQ DX, 40(AX)
MOVQ (AX), AX
MOVQ R9, 24(AX)
MOVQ R10, 32(AX)

View File

@ -122,17 +122,21 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
copy(out, buf[0][:])
copy(out[dstEvery:], buf[1][:])
copy(out[dstEvery*2:], buf[2][:])
copy(out[dstEvery*3:], buf[3][:])
out = out[bufoff:]
decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
if len(out)-bufoff < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
//copy(out, buf[0][:])
//copy(out[dstEvery:], buf[1][:])
//copy(out[dstEvery*2:], buf[2][:])
//copy(out[dstEvery*3:], buf[3][:])
*(*[bufoff]byte)(out) = buf[0]
*(*[bufoff]byte)(out[dstEvery:]) = buf[1]
*(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
*(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
out = out[bufoff:]
decoded += bufoff * 4
}
}
if off > 0 {

View File

@ -18,6 +18,7 @@ func load64(b []byte, i int) uint64 {
// emitLiteral writes a literal chunk and returns the number of bytes written.
//
// It assumes that:
//
// dst is long enough to hold the encoded bytes
// 1 <= len(lit) && len(lit) <= 65536
func emitLiteral(dst, lit []byte) int {
@ -42,6 +43,7 @@ func emitLiteral(dst, lit []byte) int {
// emitCopy writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
//
// dst is long enough to hold the encoded bytes
// 1 <= offset && offset <= 65535
// 4 <= length && length <= 65535
@ -89,6 +91,7 @@ func emitCopy(dst []byte, offset, length int) int {
// src[i:i+k-j] and src[j:k] have the same contents.
//
// It assumes that:
//
// 0 <= i && i < j && j <= len(src)
func extendMatch(src []byte, i, j int) int {
for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
@ -105,8 +108,9 @@ func hash(u, shift uint32) uint32 {
// been written.
//
// It also assumes that:
//
// len(dst) >= MaxEncodedLen(len(src)) &&
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlock(dst, src []byte) (d int) {
// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
// The table element type is uint16, as s < sLimit and sLimit < len(src)

View File

@ -12,6 +12,8 @@ The `zstd` package is provided as open source software using a Go standard licen
Currently the package is heavily optimized for 64 bit processors and will be significantly slower on 32 bit processors.
For seekable zstd streams, see [this excellent package](https://github.com/SaveTheRbtz/zstd-seekable-format-go).
## Installation
Install using `go get -u github.com/klauspost/compress`. The package is located in `github.com/klauspost/compress/zstd`.

View File

@ -10,7 +10,6 @@ import (
"errors"
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
"sync"
@ -233,7 +232,7 @@ func (b *blockDec) decodeBuf(hist *history) error {
if b.lowMem {
b.dst = make([]byte, b.RLESize)
} else {
b.dst = make([]byte, maxBlockSize)
b.dst = make([]byte, maxCompressedBlockSize)
}
}
b.dst = b.dst[:b.RLESize]
@ -651,7 +650,7 @@ func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse))
fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse))
buf.Write(in)
ioutil.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
os.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
}
return nil

View File

@ -7,7 +7,6 @@ package zstd
import (
"fmt"
"io"
"io/ioutil"
)
type byteBuffer interface {
@ -23,7 +22,7 @@ type byteBuffer interface {
readByte() (byte, error)
// Skip n bytes.
skipN(n int) error
skipN(n int64) error
}
// in-memory buffer
@ -62,9 +61,12 @@ func (b *byteBuf) readByte() (byte, error) {
return r, nil
}
func (b *byteBuf) skipN(n int) error {
func (b *byteBuf) skipN(n int64) error {
bb := *b
if len(bb) < n {
if n < 0 {
return fmt.Errorf("negative skip (%d) requested", n)
}
if int64(len(bb)) < n {
return io.ErrUnexpectedEOF
}
*b = bb[n:]
@ -120,9 +122,9 @@ func (r *readerWrapper) readByte() (byte, error) {
return r.tmp[0], nil
}
func (r *readerWrapper) skipN(n int) error {
n2, err := io.CopyN(ioutil.Discard, r.r, int64(n))
if n2 != int64(n) {
func (r *readerWrapper) skipN(n int64) error {
n2, err := io.CopyN(io.Discard, r.r, n)
if n2 != n {
err = io.ErrUnexpectedEOF
}
return err

View File

@ -35,6 +35,7 @@ type Decoder struct {
br readerWrapper
enabled bool
inFrame bool
dstBuf []byte
}
frame *frameDec
@ -187,21 +188,23 @@ func (d *Decoder) Reset(r io.Reader) error {
}
// If bytes buffer and < 5MB, do sync decoding anyway.
if bb, ok := r.(byter); ok && bb.Len() < 5<<20 {
if bb, ok := r.(byter); ok && bb.Len() < d.o.decodeBufsBelow && !d.o.limitToCap {
bb2 := bb
if debugDecoder {
println("*bytes.Buffer detected, doing sync decode, len:", bb.Len())
}
b := bb2.Bytes()
var dst []byte
if cap(d.current.b) > 0 {
dst = d.current.b
if cap(d.syncStream.dstBuf) > 0 {
dst = d.syncStream.dstBuf[:0]
}
dst, err := d.DecodeAll(b, dst[:0])
dst, err := d.DecodeAll(b, dst)
if err == nil {
err = io.EOF
}
// Save output buffer
d.syncStream.dstBuf = dst
d.current.b = dst
d.current.err = err
d.current.flushed = true
@ -216,6 +219,7 @@ func (d *Decoder) Reset(r io.Reader) error {
d.current.err = nil
d.current.flushed = false
d.current.d = nil
d.syncStream.dstBuf = nil
// Ensure no-one else is still running...
d.streamWg.Wait()
@ -312,6 +316,7 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
// Grab a block decoder and frame decoder.
block := <-d.decoders
frame := block.localFrame
initialSize := len(dst)
defer func() {
if debugDecoder {
printf("re-adding decoder: %p", block)
@ -348,10 +353,22 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
frame.history.setDict(&dict)
}
if frame.WindowSize > d.o.maxWindowSize {
if debugDecoder {
println("window size exceeded:", frame.WindowSize, ">", d.o.maxWindowSize)
}
return dst, ErrWindowSizeExceeded
}
if frame.FrameContentSize != fcsUnknown {
if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)-initialSize) {
if debugDecoder {
println("decoder size exceeded; fcs:", frame.FrameContentSize, "> mcs:", d.o.maxDecodedSize-uint64(len(dst)-initialSize), "len:", len(dst))
}
return dst, ErrDecoderSizeExceeded
}
if d.o.limitToCap && frame.FrameContentSize > uint64(cap(dst)-len(dst)) {
if debugDecoder {
println("decoder size exceeded; fcs:", frame.FrameContentSize, "> (cap-len)", cap(dst)-len(dst))
}
return dst, ErrDecoderSizeExceeded
}
if cap(dst)-len(dst) < int(frame.FrameContentSize) {
@ -361,7 +378,7 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
}
}
if cap(dst) == 0 {
if cap(dst) == 0 && !d.o.limitToCap {
// Allocate len(input) * 2 by default if nothing is provided
// and we didn't get frame content size.
size := len(input) * 2
@ -379,6 +396,9 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
if err != nil {
return dst, err
}
if uint64(len(dst)-initialSize) > d.o.maxDecodedSize {
return dst, ErrDecoderSizeExceeded
}
if len(frame.bBuf) == 0 {
if debugDecoder {
println("frame dbuf empty")
@ -664,6 +684,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
if debugDecoder {
println("Async 1: new history, recent:", block.async.newHist.recentOffsets)
}
hist.reset()
hist.decoders = block.async.newHist.decoders
hist.recentOffsets = block.async.newHist.recentOffsets
hist.windowSize = block.async.newHist.windowSize
@ -695,6 +716,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
seqExecute <- block
}
close(seqExecute)
hist.reset()
}()
var wg sync.WaitGroup
@ -718,6 +740,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
if debugDecoder {
println("Async 2: new history")
}
hist.reset()
hist.windowSize = block.async.newHist.windowSize
hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer
if block.async.newHist.dict != nil {
@ -747,7 +770,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
if block.lowMem {
block.dst = make([]byte, block.RLESize)
} else {
block.dst = make([]byte, maxBlockSize)
block.dst = make([]byte, maxCompressedBlockSize)
}
}
block.dst = block.dst[:block.RLESize]
@ -799,13 +822,14 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
if debugDecoder {
println("decoder goroutines finished")
}
hist.reset()
}()
var hist history
decodeStream:
for {
var hist history
var hasErr bool
hist.reset()
decodeBlock := func(block *blockDec) {
if hasErr {
if block != nil {
@ -849,6 +873,10 @@ decodeStream:
}
}
if err == nil && d.frame.WindowSize > d.o.maxWindowSize {
if debugDecoder {
println("decoder size exceeded, fws:", d.frame.WindowSize, "> mws:", d.o.maxWindowSize)
}
err = ErrDecoderSizeExceeded
}
if err != nil {
@ -917,5 +945,6 @@ decodeStream:
}
close(seqDecode)
wg.Wait()
hist.reset()
d.frame.history.b = frameHistCache
}

View File

@ -14,20 +14,23 @@ type DOption func(*decoderOptions) error
// options retains accumulated state of multiple options.
type decoderOptions struct {
lowMem bool
concurrent int
maxDecodedSize uint64
maxWindowSize uint64
dicts []dict
ignoreChecksum bool
lowMem bool
concurrent int
maxDecodedSize uint64
maxWindowSize uint64
dicts []dict
ignoreChecksum bool
limitToCap bool
decodeBufsBelow int
}
func (o *decoderOptions) setDefault() {
*o = decoderOptions{
// use less ram: true for now, but may change.
lowMem: true,
concurrent: runtime.GOMAXPROCS(0),
maxWindowSize: MaxWindowSize,
lowMem: true,
concurrent: runtime.GOMAXPROCS(0),
maxWindowSize: MaxWindowSize,
decodeBufsBelow: 128 << 10,
}
if o.concurrent > 4 {
o.concurrent = 4
@ -114,6 +117,29 @@ func WithDecoderMaxWindow(size uint64) DOption {
}
}
// WithDecodeAllCapLimit will limit DecodeAll to decoding cap(dst)-len(dst) bytes,
// or any size set in WithDecoderMaxMemory.
// This can be used to limit decoding to a specific maximum output size.
// Disabled by default.
func WithDecodeAllCapLimit(b bool) DOption {
return func(o *decoderOptions) error {
o.limitToCap = b
return nil
}
}
// WithDecodeBuffersBelow will fully decode readers that have a
// `Bytes() []byte` and `Len() int` interface similar to bytes.Buffer.
// This typically uses less allocations but will have the full decompressed object in memory.
// Note that DecodeAllCapLimit will disable this, as well as giving a size of 0 or less.
// Default is 128KiB.
func WithDecodeBuffersBelow(size int) DOption {
return func(o *decoderOptions) error {
o.decodeBufsBelow = size
return nil
}
}
// IgnoreChecksum allows to forcibly ignore checksum checking.
func IgnoreChecksum(b bool) DOption {
return func(o *decoderOptions) error {

View File

@ -32,6 +32,7 @@ type match struct {
length int32
rep int32
est int32
_ [12]byte // Aligned size to cache line: 4+4+4+4+4 bytes + 12 bytes padding = 32 bytes
}
const highScore = 25000

View File

@ -416,15 +416,23 @@ encodeLoop:
// Try to find a better match by searching for a long match at the end of the current best match
if s+matched < sLimit {
// Allow some bytes at the beginning to mismatch.
// Sweet spot is around 3 bytes, but depends on input.
// The skipped bytes are tested in Extend backwards,
// and still picked up as part of the match if they do.
const skipBeginning = 3
nextHashL := hashLen(load6432(src, s+matched), betterLongTableBits, betterLongLen)
cv := load3232(src, s)
s2 := s + skipBeginning
cv := load3232(src, s2)
candidateL := e.longTable[nextHashL]
coffsetL := candidateL.offset - e.cur - matched
if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
coffsetL := candidateL.offset - e.cur - matched + skipBeginning
if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
// Found a long match, at least 4 bytes.
matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4
if matchedNext > matched {
t = coffsetL
s = s2
matched = matchedNext
if debugMatches {
println("long match at end-of-match")
@ -434,12 +442,13 @@ encodeLoop:
// Check prev long...
if true {
coffsetL = candidateL.prev - e.cur - matched
if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
coffsetL = candidateL.prev - e.cur - matched + skipBeginning
if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
// Found a long match, at least 4 bytes.
matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4
if matchedNext > matched {
t = coffsetL
s = s2
matched = matchedNext
if debugMatches {
println("prev long match at end-of-match")

View File

@ -1103,7 +1103,8 @@ func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) {
}
if allDirty || dirtyShardCnt > dLongTableShardCnt/2 {
copy(e.longTable[:], e.dictLongTable)
//copy(e.longTable[:], e.dictLongTable)
e.longTable = *(*[dFastLongTableSize]tableEntry)(e.dictLongTable)
for i := range e.longTableShardDirty {
e.longTableShardDirty[i] = false
}
@ -1114,7 +1115,9 @@ func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) {
continue
}
copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize])
// copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize])
*(*[dLongTableShardSize]tableEntry)(e.longTable[i*dLongTableShardSize:]) = *(*[dLongTableShardSize]tableEntry)(e.dictLongTable[i*dLongTableShardSize:])
e.longTableShardDirty[i] = false
}
}

View File

@ -304,7 +304,7 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
minNonLiteralBlockSize = 1 + 1 + inputMargin
)
if debugEncoder {
if len(src) > maxBlockSize {
if len(src) > maxCompressedBlockSize {
panic("src too big")
}
}
@ -871,7 +871,8 @@ func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) {
const shardCnt = tableShardCnt
const shardSize = tableShardSize
if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
copy(e.table[:], e.dictTable)
//copy(e.table[:], e.dictTable)
e.table = *(*[tableSize]tableEntry)(e.dictTable)
for i := range e.tableShardDirty {
e.tableShardDirty[i] = false
}
@ -883,7 +884,8 @@ func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) {
continue
}
copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
//copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
*(*[shardSize]tableEntry)(e.table[i*shardSize:]) = *(*[shardSize]tableEntry)(e.dictTable[i*shardSize:])
e.tableShardDirty[i] = false
}
e.allDirty = false

View File

@ -528,8 +528,8 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
// If a non-single block is needed the encoder will reset again.
e.encoders <- enc
}()
// Use single segments when above minimum window and below 1MB.
single := len(src) < 1<<20 && len(src) > MinWindowSize
// Use single segments when above minimum window and below window size.
single := len(src) <= e.o.windowSize && len(src) > MinWindowSize
if e.o.single != nil {
single = *e.o.single
}

View File

@ -283,7 +283,7 @@ func WithNoEntropyCompression(b bool) EOption {
// a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range.
// For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB.
// This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations.
// If this is not specified, block encodes will automatically choose this based on the input size.
// If this is not specified, block encodes will automatically choose this based on the input size and the window size.
// This setting has no effect on streamed encodes.
func WithSingleSegment(b bool) EOption {
return func(o *encoderOptions) error {

View File

@ -106,7 +106,7 @@ func (d *frameDec) reset(br byteBuffer) error {
}
n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
println("Skipping frame with", n, "bytes.")
err = br.skipN(int(n))
err = br.skipN(int64(n))
if err != nil {
if debugDecoder {
println("Reading discarded frame", err)
@ -231,20 +231,27 @@ func (d *frameDec) reset(br byteBuffer) error {
d.crc.Reset()
}
if d.WindowSize > d.o.maxWindowSize {
if debugDecoder {
printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
}
return ErrWindowSizeExceeded
}
if d.WindowSize == 0 && d.SingleSegment {
// We may not need window in this case.
d.WindowSize = d.FrameContentSize
if d.WindowSize < MinWindowSize {
d.WindowSize = MinWindowSize
}
if d.WindowSize > d.o.maxDecodedSize {
if debugDecoder {
printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
}
return ErrDecoderSizeExceeded
}
}
if d.WindowSize > uint64(d.o.maxWindowSize) {
if debugDecoder {
printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
}
return ErrWindowSizeExceeded
}
// The minimum Window_Size is 1 KB.
if d.WindowSize < MinWindowSize {
if debugDecoder {
@ -254,11 +261,16 @@ func (d *frameDec) reset(br byteBuffer) error {
}
d.history.windowSize = int(d.WindowSize)
if !d.o.lowMem || d.history.windowSize < maxBlockSize {
// Alloc 2x window size if not low-mem, or very small window size.
// Alloc 2x window size if not low-mem, or window size below 2MB.
d.history.allocFrameBuffer = d.history.windowSize * 2
} else {
// Alloc with one additional block
d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
if d.o.lowMem {
// Alloc with 1MB extra.
d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize/2
} else {
// Alloc with 2MB extra.
d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
}
}
if debugDecoder {
@ -336,7 +348,7 @@ func (d *frameDec) consumeCRC() error {
return nil
}
// runDecoder will create a sync decoder that will decode a block of data.
// runDecoder will run the decoder for the remainder of the frame.
func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
saved := d.history.b
@ -346,12 +358,23 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
// Store input length, so we only check new data.
crcStart := len(dst)
d.history.decoders.maxSyncLen = 0
if d.o.limitToCap {
d.history.decoders.maxSyncLen = uint64(cap(dst) - len(dst))
}
if d.FrameContentSize != fcsUnknown {
d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
if !d.o.limitToCap || d.FrameContentSize+uint64(len(dst)) < d.history.decoders.maxSyncLen {
d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
}
if d.history.decoders.maxSyncLen > d.o.maxDecodedSize {
if debugDecoder {
println("maxSyncLen:", d.history.decoders.maxSyncLen, "> maxDecodedSize:", d.o.maxDecodedSize)
}
return dst, ErrDecoderSizeExceeded
}
if uint64(cap(dst)) < d.history.decoders.maxSyncLen {
if debugDecoder {
println("maxSyncLen:", d.history.decoders.maxSyncLen)
}
if !d.o.limitToCap && uint64(cap(dst)) < d.history.decoders.maxSyncLen {
// Alloc for output
dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc)
copy(dst2, dst)
@ -371,7 +394,13 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
if err != nil {
break
}
if uint64(len(d.history.b)) > d.o.maxDecodedSize {
if uint64(len(d.history.b)-crcStart) > d.o.maxDecodedSize {
println("runDecoder: maxDecodedSize exceeded", uint64(len(d.history.b)-crcStart), ">", d.o.maxDecodedSize)
err = ErrDecoderSizeExceeded
break
}
if d.o.limitToCap && len(d.history.b) > cap(dst) {
println("runDecoder: cap exceeded", uint64(len(d.history.b)), ">", cap(dst))
err = ErrDecoderSizeExceeded
break
}

View File

@ -21,7 +21,8 @@ type buildDtableAsmContext struct {
// buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable.
// Function returns non-zero exit code on error.
// go:noescape
//
//go:noescape
func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
// please keep in sync with _generate/gen_fse.go
@ -34,8 +35,8 @@ const (
// buildDtable will build the decoding table.
func (s *fseDecoder) buildDtable() error {
ctx := buildDtableAsmContext{
stateTable: (*uint16)(&s.stateTable[0]),
norm: (*int16)(&s.norm[0]),
stateTable: &s.stateTable[0],
norm: &s.norm[0],
dt: (*uint64)(&s.dt[0]),
}
code := buildDtable_asm(s, &ctx)

View File

@ -1,7 +1,6 @@
// Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT.
//go:build !appengine && !noasm && gc && !noasm
// +build !appengine,!noasm,gc,!noasm
// func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
TEXT ·buildDtable_asm(SB), $0-24

View File

@ -37,26 +37,23 @@ func (h *history) reset() {
h.ignoreBuffer = 0
h.error = false
h.recentOffsets = [3]int{1, 4, 8}
if f := h.decoders.litLengths.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
}
if f := h.decoders.offsets.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
}
if f := h.decoders.matchLengths.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
}
h.decoders.freeDecoders()
h.decoders = sequenceDecs{br: h.decoders.br}
if h.huffTree != nil {
if h.dict == nil || h.dict.litEnc != h.huffTree {
huffDecoderPool.Put(h.huffTree)
}
}
h.freeHuffDecoder()
h.huffTree = nil
h.dict = nil
//printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b))
}
func (h *history) freeHuffDecoder() {
if h.huffTree != nil {
if h.dict == nil || h.dict.litEnc != h.huffTree {
huffDecoderPool.Put(h.huffTree)
h.huffTree = nil
}
}
}
func (h *history) setDict(dict *dict) {
if dict == nil {
return

View File

@ -99,6 +99,21 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) erro
return nil
}
func (s *sequenceDecs) freeDecoders() {
if f := s.litLengths.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
s.litLengths.fse = nil
}
if f := s.offsets.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
s.offsets.fse = nil
}
if f := s.matchLengths.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
s.matchLengths.fse = nil
}
}
// execute will execute the decoded sequence with the provided history.
// The sequence must be evaluated before being sent.
func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
@ -299,7 +314,10 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
}
size := ll + ml + len(out)
if size-startSize > maxBlockSize {
return fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
if size-startSize == 424242 {
panic("here")
}
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
if size > cap(out) {
// Not enough size, which can happen under high volume block streaming conditions
@ -411,7 +429,7 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
// Check if space for literals
if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize {
return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
// Add final literals

View File

@ -32,18 +32,22 @@ type decodeSyncAsmContext struct {
// sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm.
//
// Please refer to seqdec_generic.go for the reference implementation.
//
//go:noescape
func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions.
//
//go:noescape
func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// sequenceDecs_decodeSync_safe_amd64 does the same as above, but does not write more than output buffer.
//
//go:noescape
func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// sequenceDecs_decodeSync_safe_bmi2 does the same as above, but does not write more than output buffer.
//
//go:noescape
func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
@ -55,16 +59,22 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize {
return false, nil
}
useSafe := false
if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
useSafe = true
}
if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
useSafe = true
}
if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
useSafe = true
}
// FIXME: Using unsafe memory copies leads to rare, random crashes
// with fuzz testing. It is therefore disabled for now.
const useSafe = true
/*
useSafe := false
if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
useSafe = true
}
if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
useSafe = true
}
if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
useSafe = true
}
*/
br := s.br
@ -129,7 +139,7 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
if debugDecoder {
println("msl:", s.maxSyncLen, "cap", cap(s.out), "bef:", startSize, "sz:", size-startSize, "mbs:", maxBlockSize, "outsz:", cap(s.out)-startSize)
}
return true, fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
default:
return true, fmt.Errorf("sequenceDecs_decode returned erronous code %d", errCode)
@ -137,7 +147,8 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
s.seqSize += ctx.litRemain
if s.seqSize > maxBlockSize {
return true, fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
err := br.close()
if err != nil {
@ -195,20 +206,24 @@ const errorNotEnoughSpace = 5
// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
//
// Please refer to seqdec_generic.go for the reference implementation.
//
//go:noescape
func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
//
// Please refer to seqdec_generic.go for the reference implementation.
//
//go:noescape
func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
//
//go:noescape
func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
//
//go:noescape
func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
@ -275,7 +290,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
s.seqSize += ctx.litRemain
if s.seqSize > maxBlockSize {
return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
err := br.close()
if err != nil {
@ -302,10 +317,12 @@ type executeAsmContext struct {
// Returns false if a match offset is too big.
//
// Please refer to seqdec_generic.go for the reference implementation.
//
//go:noescape
func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
// Same as above, but with safe memcopies
//
//go:noescape
func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool

File diff suppressed because it is too large Load Diff

View File

@ -111,7 +111,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
}
s.seqSize += ll + ml
if s.seqSize > maxBlockSize {
return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
litRemain -= ll
if litRemain < 0 {
@ -149,7 +149,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
}
s.seqSize += litRemain
if s.seqSize > maxBlockSize {
return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
err := br.close()
if err != nil {