vendor: update buildkit to 3e38a2d

Signed-off-by: CrazyMax <crazy-max@users.noreply.github.com>
This commit is contained in:
CrazyMax
2022-04-03 20:40:33 +02:00
parent c3db06cda0
commit a49ad031a5
87 changed files with 2964 additions and 4427 deletions

View File

@ -0,0 +1,5 @@
package huff0
//go:generate go run generate.go
//go:generate asmfmt -w decompress_amd64.s
//go:generate asmfmt -w decompress_8b_amd64.s

View File

@ -165,6 +165,11 @@ func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
return uint16(b.value >> ((64 - n) & 63))
}
// peekTopBits(n) is equvialent to peekBitFast(64 - n)
func (b *bitReaderShifted) peekTopBits(n uint8) uint16 {
return uint16(b.value >> n)
}
func (b *bitReaderShifted) advance(n uint8) {
b.bitsRead += n
b.value <<= n & 63

View File

@ -725,189 +725,6 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
return dst, br.close()
}
// Decompress4X will decompress a 4X encoded stream.
// The length of the supplied input must match the end of a block exactly.
// The *capacity* of the dst slice must match the destination size of
// the uncompressed data exactly.
func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
if len(d.dt.single) == 0 {
return nil, errors.New("no table loaded")
}
if len(src) < 6+(4*1) {
return nil, errors.New("input too small")
}
if use8BitTables && d.actualTableLog <= 8 {
return d.decompress4X8bit(dst, src)
}
var br [4]bitReaderShifted
// Decode "jump table"
start := 6
for i := 0; i < 3; i++ {
length := int(src[i*2]) | (int(src[i*2+1]) << 8)
if start+length >= len(src) {
return nil, errors.New("truncated input (or invalid offset)")
}
err := br[i].init(src[start : start+length])
if err != nil {
return nil, err
}
start += length
}
err := br[3].init(src[start:])
if err != nil {
return nil, err
}
// destination, offset to match first output
dstSize := cap(dst)
dst = dst[:dstSize]
out := dst
dstEvery := (dstSize + 3) / 4
const tlSize = 1 << tableLogMax
const tlMask = tlSize - 1
single := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.
buf := d.buffer()
var off uint8
var decoded int
// Decode 2 values from each decoder/loop.
const bufoff = 256
for {
if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
break
}
{
const stream = 0
const stream2 = 1
br[stream].fillFast()
br[stream2].fillFast()
val := br[stream].peekBitsFast(d.actualTableLog)
val2 := br[stream2].peekBitsFast(d.actualTableLog)
v := single[val&tlMask]
v2 := single[val2&tlMask]
br[stream].advance(uint8(v.entry))
br[stream2].advance(uint8(v2.entry))
buf[stream][off] = uint8(v.entry >> 8)
buf[stream2][off] = uint8(v2.entry >> 8)
val = br[stream].peekBitsFast(d.actualTableLog)
val2 = br[stream2].peekBitsFast(d.actualTableLog)
v = single[val&tlMask]
v2 = single[val2&tlMask]
br[stream].advance(uint8(v.entry))
br[stream2].advance(uint8(v2.entry))
buf[stream][off+1] = uint8(v.entry >> 8)
buf[stream2][off+1] = uint8(v2.entry >> 8)
}
{
const stream = 2
const stream2 = 3
br[stream].fillFast()
br[stream2].fillFast()
val := br[stream].peekBitsFast(d.actualTableLog)
val2 := br[stream2].peekBitsFast(d.actualTableLog)
v := single[val&tlMask]
v2 := single[val2&tlMask]
br[stream].advance(uint8(v.entry))
br[stream2].advance(uint8(v2.entry))
buf[stream][off] = uint8(v.entry >> 8)
buf[stream2][off] = uint8(v2.entry >> 8)
val = br[stream].peekBitsFast(d.actualTableLog)
val2 = br[stream2].peekBitsFast(d.actualTableLog)
v = single[val&tlMask]
v2 = single[val2&tlMask]
br[stream].advance(uint8(v.entry))
br[stream2].advance(uint8(v2.entry))
buf[stream][off+1] = uint8(v.entry >> 8)
buf[stream2][off+1] = uint8(v2.entry >> 8)
}
off += 2
if off == 0 {
if bufoff > dstEvery {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
copy(out, buf[0][:])
copy(out[dstEvery:], buf[1][:])
copy(out[dstEvery*2:], buf[2][:])
copy(out[dstEvery*3:], buf[3][:])
out = out[bufoff:]
decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
}
}
if off > 0 {
ioff := int(off)
if len(out) < dstEvery*3+ioff {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 3")
}
copy(out, buf[0][:off])
copy(out[dstEvery:], buf[1][:off])
copy(out[dstEvery*2:], buf[2][:off])
copy(out[dstEvery*3:], buf[3][:off])
decoded += int(off) * 4
out = out[off:]
}
// Decode remaining.
remainBytes := dstEvery - (decoded / 4)
for i := range br {
offset := dstEvery * i
endsAt := offset + remainBytes
if endsAt > len(out) {
endsAt = len(out)
}
br := &br[i]
bitsLeft := br.remaining()
for bitsLeft > 0 {
br.fill()
if offset >= endsAt {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 4")
}
// Read value and increment offset.
val := br.peekBitsFast(d.actualTableLog)
v := single[val&tlMask].entry
nBits := uint8(v)
br.advance(nBits)
bitsLeft -= uint(nBits)
out[offset] = uint8(v >> 8)
offset++
}
if offset != endsAt {
d.bufs.Put(buf)
return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
}
decoded += offset - dstEvery*i
err = br.close()
if err != nil {
return nil, err
}
}
d.bufs.Put(buf)
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
return dst, nil
}
// Decompress4X will decompress a 4X encoded stream.
// The length of the supplied input must match the end of a block exactly.
// The *capacity* of the dst slice must match the destination size of

View File

@ -0,0 +1,488 @@
// +build !appengine
// +build gc
// +build !noasm
#include "textflag.h"
#include "funcdata.h"
#include "go_asm.h"
#define bufoff 256 // see decompress.go, we're using [4][256]byte table
// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
#define off R8
#define buffer DI
#define table SI
#define br_bits_read R9
#define br_value R10
#define br_offset R11
#define peek_bits R12
#define exhausted DX
#define br0 R13
#define br1 R14
#define br2 R15
#define br3 BP
MOVQ BP, 0(SP)
XORQ exhausted, exhausted // exhausted = false
XORQ off, off // off = 0
MOVBQZX peekBits+32(FP), peek_bits
MOVQ buf+40(FP), buffer
MOVQ tbl+48(FP), table
MOVQ pbr0+0(FP), br0
MOVQ pbr1+8(FP), br1
MOVQ pbr2+16(FP), br2
MOVQ pbr3+24(FP), br3
main_loop:
// const stream = 0
// br0.fillFast()
MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
MOVQ bitReaderShifted_value(br0), br_value
MOVQ bitReaderShifted_off(br0), br_offset
// if b.bitsRead >= 32 {
CMPQ br_bits_read, $32
JB skip_fill0
SUBQ $32, br_bits_read // b.bitsRead -= 32
SUBQ $4, br_offset // b.off -= 4
// v := b.in[b.off-4 : b.off]
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
MOVQ bitReaderShifted_in(br0), AX
MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVQ br_bits_read, CX
SHLQ CL, AX
ORQ AX, br_value
// exhausted = exhausted || (br0.off < 4)
CMPQ br_offset, $4
SETLT DL
ORB DL, DH
// }
skip_fill0:
// val0 := br0.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v0 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br0.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val1 := br0.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v1 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br0.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off] = uint8(v0.entry >> 8)
// buf[stream][off+1] = uint8(v1.entry >> 8)
MOVW BX, 0(buffer)(off*1)
// SECOND PART:
// val2 := br0.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v2 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br0.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val3 := br0.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v3 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br0.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off+2] = uint8(v2.entry >> 8)
// buf[stream][off+3] = uint8(v3.entry >> 8)
MOVW BX, 0+2(buffer)(off*1)
// update the bitrader reader structure
MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
MOVQ br_value, bitReaderShifted_value(br0)
MOVQ br_offset, bitReaderShifted_off(br0)
// const stream = 1
// br1.fillFast()
MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
MOVQ bitReaderShifted_value(br1), br_value
MOVQ bitReaderShifted_off(br1), br_offset
// if b.bitsRead >= 32 {
CMPQ br_bits_read, $32
JB skip_fill1
SUBQ $32, br_bits_read // b.bitsRead -= 32
SUBQ $4, br_offset // b.off -= 4
// v := b.in[b.off-4 : b.off]
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
MOVQ bitReaderShifted_in(br1), AX
MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVQ br_bits_read, CX
SHLQ CL, AX
ORQ AX, br_value
// exhausted = exhausted || (br1.off < 4)
CMPQ br_offset, $4
SETLT DL
ORB DL, DH
// }
skip_fill1:
// val0 := br1.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v0 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br1.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val1 := br1.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v1 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br1.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off] = uint8(v0.entry >> 8)
// buf[stream][off+1] = uint8(v1.entry >> 8)
MOVW BX, 256(buffer)(off*1)
// SECOND PART:
// val2 := br1.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v2 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br1.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val3 := br1.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v3 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br1.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off+2] = uint8(v2.entry >> 8)
// buf[stream][off+3] = uint8(v3.entry >> 8)
MOVW BX, 256+2(buffer)(off*1)
// update the bitrader reader structure
MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
MOVQ br_value, bitReaderShifted_value(br1)
MOVQ br_offset, bitReaderShifted_off(br1)
// const stream = 2
// br2.fillFast()
MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
MOVQ bitReaderShifted_value(br2), br_value
MOVQ bitReaderShifted_off(br2), br_offset
// if b.bitsRead >= 32 {
CMPQ br_bits_read, $32
JB skip_fill2
SUBQ $32, br_bits_read // b.bitsRead -= 32
SUBQ $4, br_offset // b.off -= 4
// v := b.in[b.off-4 : b.off]
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
MOVQ bitReaderShifted_in(br2), AX
MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVQ br_bits_read, CX
SHLQ CL, AX
ORQ AX, br_value
// exhausted = exhausted || (br2.off < 4)
CMPQ br_offset, $4
SETLT DL
ORB DL, DH
// }
skip_fill2:
// val0 := br2.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v0 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br2.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val1 := br2.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v1 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br2.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off] = uint8(v0.entry >> 8)
// buf[stream][off+1] = uint8(v1.entry >> 8)
MOVW BX, 512(buffer)(off*1)
// SECOND PART:
// val2 := br2.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v2 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br2.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val3 := br2.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v3 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br2.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off+2] = uint8(v2.entry >> 8)
// buf[stream][off+3] = uint8(v3.entry >> 8)
MOVW BX, 512+2(buffer)(off*1)
// update the bitrader reader structure
MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
MOVQ br_value, bitReaderShifted_value(br2)
MOVQ br_offset, bitReaderShifted_off(br2)
// const stream = 3
// br3.fillFast()
MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
MOVQ bitReaderShifted_value(br3), br_value
MOVQ bitReaderShifted_off(br3), br_offset
// if b.bitsRead >= 32 {
CMPQ br_bits_read, $32
JB skip_fill3
SUBQ $32, br_bits_read // b.bitsRead -= 32
SUBQ $4, br_offset // b.off -= 4
// v := b.in[b.off-4 : b.off]
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
MOVQ bitReaderShifted_in(br3), AX
MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVQ br_bits_read, CX
SHLQ CL, AX
ORQ AX, br_value
// exhausted = exhausted || (br3.off < 4)
CMPQ br_offset, $4
SETLT DL
ORB DL, DH
// }
skip_fill3:
// val0 := br3.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v0 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br3.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val1 := br3.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v1 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br3.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off] = uint8(v0.entry >> 8)
// buf[stream][off+1] = uint8(v1.entry >> 8)
MOVW BX, 768(buffer)(off*1)
// SECOND PART:
// val2 := br3.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v2 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br3.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val3 := br3.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v3 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br3.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off+2] = uint8(v2.entry >> 8)
// buf[stream][off+3] = uint8(v3.entry >> 8)
MOVW BX, 768+2(buffer)(off*1)
// update the bitrader reader structure
MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
MOVQ br_value, bitReaderShifted_value(br3)
MOVQ br_offset, bitReaderShifted_off(br3)
ADDQ $4, off // off += 2
TESTB DH, DH // any br[i].ofs < 4?
JNZ end
CMPQ off, $bufoff
JL main_loop
end:
MOVQ 0(SP), BP
MOVB off, ret+56(FP)
RET
#undef off
#undef buffer
#undef table
#undef br_bits_read
#undef br_value
#undef br_offset
#undef peek_bits
#undef exhausted
#undef br0
#undef br1
#undef br2
#undef br3

View File

@ -0,0 +1,197 @@
// +build !appengine
// +build gc
// +build !noasm
#include "textflag.h"
#include "funcdata.h"
#include "go_asm.h"
#define bufoff 256 // see decompress.go, we're using [4][256]byte table
//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
#define off R8
#define buffer DI
#define table SI
#define br_bits_read R9
#define br_value R10
#define br_offset R11
#define peek_bits R12
#define exhausted DX
#define br0 R13
#define br1 R14
#define br2 R15
#define br3 BP
MOVQ BP, 0(SP)
XORQ exhausted, exhausted // exhausted = false
XORQ off, off // off = 0
MOVBQZX peekBits+32(FP), peek_bits
MOVQ buf+40(FP), buffer
MOVQ tbl+48(FP), table
MOVQ pbr0+0(FP), br0
MOVQ pbr1+8(FP), br1
MOVQ pbr2+16(FP), br2
MOVQ pbr3+24(FP), br3
main_loop:
{{ define "decode_2_values_x86" }}
// const stream = {{ var "id" }}
// br{{ var "id"}}.fillFast()
MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value
MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset
// if b.bitsRead >= 32 {
CMPQ br_bits_read, $32
JB skip_fill{{ var "id" }}
SUBQ $32, br_bits_read // b.bitsRead -= 32
SUBQ $4, br_offset // b.off -= 4
// v := b.in[b.off-4 : b.off]
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
MOVQ bitReaderShifted_in(br{{ var "id" }}), AX
MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVQ br_bits_read, CX
SHLQ CL, AX
ORQ AX, br_value
// exhausted = exhausted || (br{{ var "id"}}.off < 4)
CMPQ br_offset, $4
SETLT DL
ORB DL, DH
// }
skip_fill{{ var "id" }}:
// val0 := br{{ var "id"}}.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v0 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br{{ var "id"}}.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val1 := br{{ var "id"}}.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v1 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br{{ var "id"}}.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off] = uint8(v0.entry >> 8)
// buf[stream][off+1] = uint8(v1.entry >> 8)
MOVW BX, {{ var "bufofs" }}(buffer)(off*1)
// SECOND PART:
// val2 := br{{ var "id"}}.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v2 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br{{ var "id"}}.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val3 := br{{ var "id"}}.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v3 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br{{ var "id"}}.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off+2] = uint8(v2.entry >> 8)
// buf[stream][off+3] = uint8(v3.entry >> 8)
MOVW BX, {{ var "bufofs" }}+2(buffer)(off*1)
// update the bitrader reader structure
MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
MOVQ br_value, bitReaderShifted_value(br{{ var "id" }})
MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }})
{{ end }}
{{ set "id" "0" }}
{{ set "ofs" "0" }}
{{ set "bufofs" "0" }} {{/* id * bufoff */}}
{{ template "decode_2_values_x86" . }}
{{ set "id" "1" }}
{{ set "ofs" "8" }}
{{ set "bufofs" "256" }}
{{ template "decode_2_values_x86" . }}
{{ set "id" "2" }}
{{ set "ofs" "16" }}
{{ set "bufofs" "512" }}
{{ template "decode_2_values_x86" . }}
{{ set "id" "3" }}
{{ set "ofs" "24" }}
{{ set "bufofs" "768" }}
{{ template "decode_2_values_x86" . }}
ADDQ $4, off // off += 2
TESTB DH, DH // any br[i].ofs < 4?
JNZ end
CMPQ off, $bufoff
JL main_loop
end:
MOVQ 0(SP), BP
MOVB off, ret+56(FP)
RET
#undef off
#undef buffer
#undef table
#undef br_bits_read
#undef br_value
#undef br_offset
#undef peek_bits
#undef exhausted
#undef br0
#undef br1
#undef br2
#undef br3

View File

@ -0,0 +1,181 @@
//go:build amd64 && !appengine && !noasm && gc
// +build amd64,!appengine,!noasm,gc
// This file contains the specialisation of Decoder.Decompress4X
// that uses an asm implementation of its main loop.
package huff0
import (
"errors"
"fmt"
)
// decompress4x_main_loop_x86 is an x86 assembler implementation
// of Decompress4X when tablelog > 8.
// go:noescape
func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
// decompress4x_8b_loop_x86 is an x86 assembler implementation
// of Decompress4X when tablelog <= 8 which decodes 4 entries
// per loop.
// go:noescape
func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
// fallback8BitSize is the size where using Go version is faster.
const fallback8BitSize = 800
// Decompress4X will decompress a 4X encoded stream.
// The length of the supplied input must match the end of a block exactly.
// The *capacity* of the dst slice must match the destination size of
// the uncompressed data exactly.
func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
if len(d.dt.single) == 0 {
return nil, errors.New("no table loaded")
}
if len(src) < 6+(4*1) {
return nil, errors.New("input too small")
}
use8BitTables := d.actualTableLog <= 8
if cap(dst) < fallback8BitSize && use8BitTables {
return d.decompress4X8bit(dst, src)
}
var br [4]bitReaderShifted
// Decode "jump table"
start := 6
for i := 0; i < 3; i++ {
length := int(src[i*2]) | (int(src[i*2+1]) << 8)
if start+length >= len(src) {
return nil, errors.New("truncated input (or invalid offset)")
}
err := br[i].init(src[start : start+length])
if err != nil {
return nil, err
}
start += length
}
err := br[3].init(src[start:])
if err != nil {
return nil, err
}
// destination, offset to match first output
dstSize := cap(dst)
dst = dst[:dstSize]
out := dst
dstEvery := (dstSize + 3) / 4
const tlSize = 1 << tableLogMax
const tlMask = tlSize - 1
single := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.
buf := d.buffer()
var off uint8
var decoded int
const debug = false
// see: bitReaderShifted.peekBitsFast()
peekBits := uint8((64 - d.actualTableLog) & 63)
// Decode 2 values from each decoder/loop.
const bufoff = 256
for {
if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
break
}
if use8BitTables {
off = decompress4x_8b_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
} else {
off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
}
if debug {
fmt.Print("DEBUG: ")
fmt.Printf("off=%d,", off)
for i := 0; i < 4; i++ {
fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}",
i, br[i].bitsRead, br[i].value, br[i].off)
}
fmt.Println("")
}
if off != 0 {
break
}
if bufoff > dstEvery {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
copy(out, buf[0][:])
copy(out[dstEvery:], buf[1][:])
copy(out[dstEvery*2:], buf[2][:])
copy(out[dstEvery*3:], buf[3][:])
out = out[bufoff:]
decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
}
if off > 0 {
ioff := int(off)
if len(out) < dstEvery*3+ioff {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 3")
}
copy(out, buf[0][:off])
copy(out[dstEvery:], buf[1][:off])
copy(out[dstEvery*2:], buf[2][:off])
copy(out[dstEvery*3:], buf[3][:off])
decoded += int(off) * 4
out = out[off:]
}
// Decode remaining.
remainBytes := dstEvery - (decoded / 4)
for i := range br {
offset := dstEvery * i
endsAt := offset + remainBytes
if endsAt > len(out) {
endsAt = len(out)
}
br := &br[i]
bitsLeft := br.remaining()
for bitsLeft > 0 {
br.fill()
if offset >= endsAt {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 4")
}
// Read value and increment offset.
val := br.peekBitsFast(d.actualTableLog)
v := single[val&tlMask].entry
nBits := uint8(v)
br.advance(nBits)
bitsLeft -= uint(nBits)
out[offset] = uint8(v >> 8)
offset++
}
if offset != endsAt {
d.bufs.Put(buf)
return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
}
decoded += offset - dstEvery*i
err = br.close()
if err != nil {
return nil, err
}
}
d.bufs.Put(buf)
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
return dst, nil
}

View File

@ -0,0 +1,506 @@
// +build !appengine
// +build gc
// +build !noasm
#include "textflag.h"
#include "funcdata.h"
#include "go_asm.h"
#ifdef GOAMD64_v4
#ifndef GOAMD64_v3
#define GOAMD64_v3
#endif
#endif
#define bufoff 256 // see decompress.go, we're using [4][256]byte table
// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
#define off R8
#define buffer DI
#define table SI
#define br_bits_read R9
#define br_value R10
#define br_offset R11
#define peek_bits R12
#define exhausted DX
#define br0 R13
#define br1 R14
#define br2 R15
#define br3 BP
MOVQ BP, 0(SP)
XORQ exhausted, exhausted // exhausted = false
XORQ off, off // off = 0
MOVBQZX peekBits+32(FP), peek_bits
MOVQ buf+40(FP), buffer
MOVQ tbl+48(FP), table
MOVQ pbr0+0(FP), br0
MOVQ pbr1+8(FP), br1
MOVQ pbr2+16(FP), br2
MOVQ pbr3+24(FP), br3
main_loop:
// const stream = 0
// br0.fillFast()
MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
MOVQ bitReaderShifted_value(br0), br_value
MOVQ bitReaderShifted_off(br0), br_offset
// We must have at least 2 * max tablelog left
CMPQ br_bits_read, $64-22
JBE skip_fill0
SUBQ $32, br_bits_read // b.bitsRead -= 32
SUBQ $4, br_offset // b.off -= 4
// v := b.in[b.off-4 : b.off]
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
MOVQ bitReaderShifted_in(br0), AX
// b.value |= uint64(low) << (b.bitsRead & 63)
#ifdef GOAMD64_v3
SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
#else
MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
MOVQ br_bits_read, CX
SHLQ CL, AX
#endif
ORQ AX, br_value
// exhausted = exhausted || (br0.off < 4)
CMPQ br_offset, $4
SETLT DL
ORB DL, DH
// }
skip_fill0:
// val0 := br0.peekTopBits(peekBits)
#ifdef GOAMD64_v3
SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
#else
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
#endif
// v0 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br0.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
#ifdef GOAMD64_v3
MOVBQZX AL, CX
SHLXQ AX, br_value, br_value // value <<= n
#else
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
#endif
ADDQ CX, br_bits_read // bits_read += n
#ifdef GOAMD64_v3
SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
#else
// val1 := br0.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
#endif
// v1 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br0.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
#ifdef GOAMD64_v3
MOVBQZX AL, CX
SHLXQ AX, br_value, br_value // value <<= n
#else
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
#endif
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off] = uint8(v0.entry >> 8)
// buf[stream][off+1] = uint8(v1.entry >> 8)
MOVW BX, 0(buffer)(off*1)
// update the bitrader reader structure
MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
MOVQ br_value, bitReaderShifted_value(br0)
MOVQ br_offset, bitReaderShifted_off(br0)
// const stream = 1
// br1.fillFast()
MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
MOVQ bitReaderShifted_value(br1), br_value
MOVQ bitReaderShifted_off(br1), br_offset
// We must have at least 2 * max tablelog left
CMPQ br_bits_read, $64-22
JBE skip_fill1
SUBQ $32, br_bits_read // b.bitsRead -= 32
SUBQ $4, br_offset // b.off -= 4
// v := b.in[b.off-4 : b.off]
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
MOVQ bitReaderShifted_in(br1), AX
// b.value |= uint64(low) << (b.bitsRead & 63)
#ifdef GOAMD64_v3
SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
#else
MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
MOVQ br_bits_read, CX
SHLQ CL, AX
#endif
ORQ AX, br_value
// exhausted = exhausted || (br1.off < 4)
CMPQ br_offset, $4
SETLT DL
ORB DL, DH
// }
skip_fill1:
// val0 := br1.peekTopBits(peekBits)
#ifdef GOAMD64_v3
SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
#else
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
#endif
// v0 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br1.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
#ifdef GOAMD64_v3
MOVBQZX AL, CX
SHLXQ AX, br_value, br_value // value <<= n
#else
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
#endif
ADDQ CX, br_bits_read // bits_read += n
#ifdef GOAMD64_v3
SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
#else
// val1 := br1.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
#endif
// v1 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br1.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
#ifdef GOAMD64_v3
MOVBQZX AL, CX
SHLXQ AX, br_value, br_value // value <<= n
#else
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
#endif
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off] = uint8(v0.entry >> 8)
// buf[stream][off+1] = uint8(v1.entry >> 8)
MOVW BX, 256(buffer)(off*1)
// update the bitrader reader structure
MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
MOVQ br_value, bitReaderShifted_value(br1)
MOVQ br_offset, bitReaderShifted_off(br1)
// const stream = 2
// br2.fillFast()
MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
MOVQ bitReaderShifted_value(br2), br_value
MOVQ bitReaderShifted_off(br2), br_offset
// We must have at least 2 * max tablelog left
CMPQ br_bits_read, $64-22
JBE skip_fill2
SUBQ $32, br_bits_read // b.bitsRead -= 32
SUBQ $4, br_offset // b.off -= 4
// v := b.in[b.off-4 : b.off]
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
MOVQ bitReaderShifted_in(br2), AX
// b.value |= uint64(low) << (b.bitsRead & 63)
#ifdef GOAMD64_v3
SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
#else
MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
MOVQ br_bits_read, CX
SHLQ CL, AX
#endif
ORQ AX, br_value
// exhausted = exhausted || (br2.off < 4)
CMPQ br_offset, $4
SETLT DL
ORB DL, DH
// }
skip_fill2:
// val0 := br2.peekTopBits(peekBits)
#ifdef GOAMD64_v3
SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
#else
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
#endif
// v0 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br2.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
#ifdef GOAMD64_v3
MOVBQZX AL, CX
SHLXQ AX, br_value, br_value // value <<= n
#else
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
#endif
ADDQ CX, br_bits_read // bits_read += n
#ifdef GOAMD64_v3
SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
#else
// val1 := br2.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
#endif
// v1 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br2.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
#ifdef GOAMD64_v3
MOVBQZX AL, CX
SHLXQ AX, br_value, br_value // value <<= n
#else
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
#endif
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off] = uint8(v0.entry >> 8)
// buf[stream][off+1] = uint8(v1.entry >> 8)
MOVW BX, 512(buffer)(off*1)
// update the bitrader reader structure
MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
MOVQ br_value, bitReaderShifted_value(br2)
MOVQ br_offset, bitReaderShifted_off(br2)
// const stream = 3
// br3.fillFast()
MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
MOVQ bitReaderShifted_value(br3), br_value
MOVQ bitReaderShifted_off(br3), br_offset
// We must have at least 2 * max tablelog left
CMPQ br_bits_read, $64-22
JBE skip_fill3
SUBQ $32, br_bits_read // b.bitsRead -= 32
SUBQ $4, br_offset // b.off -= 4
// v := b.in[b.off-4 : b.off]
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
MOVQ bitReaderShifted_in(br3), AX
// b.value |= uint64(low) << (b.bitsRead & 63)
#ifdef GOAMD64_v3
SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
#else
MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
MOVQ br_bits_read, CX
SHLQ CL, AX
#endif
ORQ AX, br_value
// exhausted = exhausted || (br3.off < 4)
CMPQ br_offset, $4
SETLT DL
ORB DL, DH
// }
skip_fill3:
// val0 := br3.peekTopBits(peekBits)
#ifdef GOAMD64_v3
SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
#else
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
#endif
// v0 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br3.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
#ifdef GOAMD64_v3
MOVBQZX AL, CX
SHLXQ AX, br_value, br_value // value <<= n
#else
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
#endif
ADDQ CX, br_bits_read // bits_read += n
#ifdef GOAMD64_v3
SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
#else
// val1 := br3.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
#endif
// v1 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br3.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
#ifdef GOAMD64_v3
MOVBQZX AL, CX
SHLXQ AX, br_value, br_value // value <<= n
#else
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
#endif
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off] = uint8(v0.entry >> 8)
// buf[stream][off+1] = uint8(v1.entry >> 8)
MOVW BX, 768(buffer)(off*1)
// update the bitrader reader structure
MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
MOVQ br_value, bitReaderShifted_value(br3)
MOVQ br_offset, bitReaderShifted_off(br3)
ADDQ $2, off // off += 2
TESTB DH, DH // any br[i].ofs < 4?
JNZ end
CMPQ off, $bufoff
JL main_loop
end:
MOVQ 0(SP), BP
MOVB off, ret+56(FP)
RET
#undef off
#undef buffer
#undef table
#undef br_bits_read
#undef br_value
#undef br_offset
#undef peek_bits
#undef exhausted
#undef br0
#undef br1
#undef br2
#undef br3

View File

@ -0,0 +1,195 @@
// +build !appengine
// +build gc
// +build !noasm
#include "textflag.h"
#include "funcdata.h"
#include "go_asm.h"
#ifdef GOAMD64_v4
#ifndef GOAMD64_v3
#define GOAMD64_v3
#endif
#endif
#define bufoff 256 // see decompress.go, we're using [4][256]byte table
//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
#define off R8
#define buffer DI
#define table SI
#define br_bits_read R9
#define br_value R10
#define br_offset R11
#define peek_bits R12
#define exhausted DX
#define br0 R13
#define br1 R14
#define br2 R15
#define br3 BP
MOVQ BP, 0(SP)
XORQ exhausted, exhausted // exhausted = false
XORQ off, off // off = 0
MOVBQZX peekBits+32(FP), peek_bits
MOVQ buf+40(FP), buffer
MOVQ tbl+48(FP), table
MOVQ pbr0+0(FP), br0
MOVQ pbr1+8(FP), br1
MOVQ pbr2+16(FP), br2
MOVQ pbr3+24(FP), br3
main_loop:
{{ define "decode_2_values_x86" }}
// const stream = {{ var "id" }}
// br{{ var "id"}}.fillFast()
MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value
MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset
// We must have at least 2 * max tablelog left
CMPQ br_bits_read, $64-22
JBE skip_fill{{ var "id" }}
SUBQ $32, br_bits_read // b.bitsRead -= 32
SUBQ $4, br_offset // b.off -= 4
// v := b.in[b.off-4 : b.off]
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
MOVQ bitReaderShifted_in(br{{ var "id" }}), AX
// b.value |= uint64(low) << (b.bitsRead & 63)
#ifdef GOAMD64_v3
SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
#else
MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
MOVQ br_bits_read, CX
SHLQ CL, AX
#endif
ORQ AX, br_value
// exhausted = exhausted || (br{{ var "id"}}.off < 4)
CMPQ br_offset, $4
SETLT DL
ORB DL, DH
// }
skip_fill{{ var "id" }}:
// val0 := br{{ var "id"}}.peekTopBits(peekBits)
#ifdef GOAMD64_v3
SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
#else
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
#endif
// v0 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br{{ var "id"}}.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
#ifdef GOAMD64_v3
MOVBQZX AL, CX
SHLXQ AX, br_value, br_value // value <<= n
#else
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
#endif
ADDQ CX, br_bits_read // bits_read += n
#ifdef GOAMD64_v3
SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
#else
// val1 := br{{ var "id"}}.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
#endif
// v1 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br{{ var "id"}}.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
#ifdef GOAMD64_v3
MOVBQZX AL, CX
SHLXQ AX, br_value, br_value // value <<= n
#else
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
#endif
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off] = uint8(v0.entry >> 8)
// buf[stream][off+1] = uint8(v1.entry >> 8)
MOVW BX, {{ var "bufofs" }}(buffer)(off*1)
// update the bitrader reader structure
MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
MOVQ br_value, bitReaderShifted_value(br{{ var "id" }})
MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }})
{{ end }}
{{ set "id" "0" }}
{{ set "ofs" "0" }}
{{ set "bufofs" "0" }} {{/* id * bufoff */}}
{{ template "decode_2_values_x86" . }}
{{ set "id" "1" }}
{{ set "ofs" "8" }}
{{ set "bufofs" "256" }}
{{ template "decode_2_values_x86" . }}
{{ set "id" "2" }}
{{ set "ofs" "16" }}
{{ set "bufofs" "512" }}
{{ template "decode_2_values_x86" . }}
{{ set "id" "3" }}
{{ set "ofs" "24" }}
{{ set "bufofs" "768" }}
{{ template "decode_2_values_x86" . }}
ADDQ $2, off // off += 2
TESTB DH, DH // any br[i].ofs < 4?
JNZ end
CMPQ off, $bufoff
JL main_loop
end:
MOVQ 0(SP), BP
MOVB off, ret+56(FP)
RET
#undef off
#undef buffer
#undef table
#undef br_bits_read
#undef br_value
#undef br_offset
#undef peek_bits
#undef exhausted
#undef br0
#undef br1
#undef br2
#undef br3

View File

@ -0,0 +1,193 @@
//go:build !amd64 || appengine || !gc || noasm
// +build !amd64 appengine !gc noasm
// This file contains a generic implementation of Decoder.Decompress4X.
package huff0
import (
"errors"
"fmt"
)
// Decompress4X will decompress a 4X encoded stream.
// The length of the supplied input must match the end of a block exactly.
// The *capacity* of the dst slice must match the destination size of
// the uncompressed data exactly.
func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
if len(d.dt.single) == 0 {
return nil, errors.New("no table loaded")
}
if len(src) < 6+(4*1) {
return nil, errors.New("input too small")
}
if use8BitTables && d.actualTableLog <= 8 {
return d.decompress4X8bit(dst, src)
}
var br [4]bitReaderShifted
// Decode "jump table"
start := 6
for i := 0; i < 3; i++ {
length := int(src[i*2]) | (int(src[i*2+1]) << 8)
if start+length >= len(src) {
return nil, errors.New("truncated input (or invalid offset)")
}
err := br[i].init(src[start : start+length])
if err != nil {
return nil, err
}
start += length
}
err := br[3].init(src[start:])
if err != nil {
return nil, err
}
// destination, offset to match first output
dstSize := cap(dst)
dst = dst[:dstSize]
out := dst
dstEvery := (dstSize + 3) / 4
const tlSize = 1 << tableLogMax
const tlMask = tlSize - 1
single := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.
buf := d.buffer()
var off uint8
var decoded int
// Decode 2 values from each decoder/loop.
const bufoff = 256
for {
if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
break
}
{
const stream = 0
const stream2 = 1
br[stream].fillFast()
br[stream2].fillFast()
val := br[stream].peekBitsFast(d.actualTableLog)
val2 := br[stream2].peekBitsFast(d.actualTableLog)
v := single[val&tlMask]
v2 := single[val2&tlMask]
br[stream].advance(uint8(v.entry))
br[stream2].advance(uint8(v2.entry))
buf[stream][off] = uint8(v.entry >> 8)
buf[stream2][off] = uint8(v2.entry >> 8)
val = br[stream].peekBitsFast(d.actualTableLog)
val2 = br[stream2].peekBitsFast(d.actualTableLog)
v = single[val&tlMask]
v2 = single[val2&tlMask]
br[stream].advance(uint8(v.entry))
br[stream2].advance(uint8(v2.entry))
buf[stream][off+1] = uint8(v.entry >> 8)
buf[stream2][off+1] = uint8(v2.entry >> 8)
}
{
const stream = 2
const stream2 = 3
br[stream].fillFast()
br[stream2].fillFast()
val := br[stream].peekBitsFast(d.actualTableLog)
val2 := br[stream2].peekBitsFast(d.actualTableLog)
v := single[val&tlMask]
v2 := single[val2&tlMask]
br[stream].advance(uint8(v.entry))
br[stream2].advance(uint8(v2.entry))
buf[stream][off] = uint8(v.entry >> 8)
buf[stream2][off] = uint8(v2.entry >> 8)
val = br[stream].peekBitsFast(d.actualTableLog)
val2 = br[stream2].peekBitsFast(d.actualTableLog)
v = single[val&tlMask]
v2 = single[val2&tlMask]
br[stream].advance(uint8(v.entry))
br[stream2].advance(uint8(v2.entry))
buf[stream][off+1] = uint8(v.entry >> 8)
buf[stream2][off+1] = uint8(v2.entry >> 8)
}
off += 2
if off == 0 {
if bufoff > dstEvery {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
copy(out, buf[0][:])
copy(out[dstEvery:], buf[1][:])
copy(out[dstEvery*2:], buf[2][:])
copy(out[dstEvery*3:], buf[3][:])
out = out[bufoff:]
decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
}
}
if off > 0 {
ioff := int(off)
if len(out) < dstEvery*3+ioff {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 3")
}
copy(out, buf[0][:off])
copy(out[dstEvery:], buf[1][:off])
copy(out[dstEvery*2:], buf[2][:off])
copy(out[dstEvery*3:], buf[3][:off])
decoded += int(off) * 4
out = out[off:]
}
// Decode remaining.
remainBytes := dstEvery - (decoded / 4)
for i := range br {
offset := dstEvery * i
endsAt := offset + remainBytes
if endsAt > len(out) {
endsAt = len(out)
}
br := &br[i]
bitsLeft := br.remaining()
for bitsLeft > 0 {
br.fill()
if offset >= endsAt {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 4")
}
// Read value and increment offset.
val := br.peekBitsFast(d.actualTableLog)
v := single[val&tlMask].entry
nBits := uint8(v)
br.advance(nBits)
bitsLeft -= uint(nBits)
out[offset] = uint8(v >> 8)
offset++
}
if offset != endsAt {
d.bufs.Put(buf)
return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
}
decoded += offset - dstEvery*i
err = br.close()
if err != nil {
return nil, err
}
}
d.bufs.Put(buf)
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
return dst, nil
}