// Mirror of https://gitea.com/Lydanne/buildx.git (synced 2025-11-03 09:33:43 +08:00).
// Viewer metadata: 210 lines, 3.5 KiB, ArmAsm.
	
//go:build !appengine && gc && !purego
// +build !appengine
// +build gc
// +build !purego

// textflag.h provides the NOSPLIT and NOFRAME flags used by the TEXT
// directives in this file.
#include "textflag.h"

// Registers:
//
// Preprocessor aliases for the amd64 registers used by the macros and
// functions in this file. h and d both name AX: h is used by Sum64 and d by
// writeBlocks, so the two are never live in the same function.
#define h      AX  // running hash value (Sum64)
#define d      AX  // pointer loaded from the first argument (writeBlocks)
#define p      SI  // pointer to advance through b
#define n      DX  // length of b in bytes
#define end    BX  // loop end
#define v1     R8  // accumulator 1, updated by blockLoop
#define v2     R9  // accumulator 2, updated by blockLoop
#define v3     R10 // accumulator 3, updated by blockLoop
#define v4     R11 // accumulator 4, updated by blockLoop
#define x      R12 // scratch: loaded input word / temporary
#define prime1 R13 // loaded from ·primes+0(SB)
#define prime2 R14 // loaded from ·primes+8(SB)
#define prime4 DI  // loaded from ·primes+24(SB)

// round mixes the input word x into the accumulator acc:
//
//	x   *= prime2
//	acc += x
//	acc  = rotate-left(acc, 31)
//	acc *= prime1
//
// x is clobbered. It assumes that prime1 and prime2 have been loaded.
#define round(acc, x) \
	IMULQ prime2, x   \
	ADDQ  x, acc      \
	ROLQ  $31, acc    \
	IMULQ prime1, acc

// round0 performs the operation x = round(0, x), i.e.
//
//	x *= prime2
//	x  = rotate-left(x, 31)
//	x *= prime1
//
// The result is left in x. It assumes that prime1 and prime2 have been
// loaded.
#define round0(x) \
	IMULQ prime2, x \
	ROLQ  $31, x    \
	IMULQ prime1, x

// mergeRound applies a merge round on the two registers acc and x:
//
//	x    = round0(x)
//	acc ^= x
//	acc  = acc*prime1 + prime4
//
// x is clobbered. It assumes that prime1, prime2, and prime4 have been loaded.
#define mergeRound(acc, x) \
	round0(x)         \
	XORQ  x, acc      \
	IMULQ prime1, acc \
	ADDQ  prime4, acc

// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that there is at least one block
// to process.
//
// Each iteration loads the four 8-byte words at p+0, p+8, p+16 and p+24,
// feeds them through round into v1..v4 respectively, and advances p by 32.
// The loop repeats while p <= end (signed comparison), so the caller must
// have set end to the address of the last position at which a full block can
// start (both callers in this file use base+len-32). x is clobbered.
//
// The expansion defines the label "loop", so a function can contain this
// macro only once.
#define blockLoop() \
loop:  \
	MOVQ +0(p), x  \
	round(v1, x)   \
	MOVQ +8(p), x  \
	round(v2, x)   \
	MOVQ +16(p), x \
	round(v3, x)   \
	MOVQ +24(p), x \
	round(v4, x)   \
	ADDQ $32, p    \
	CMPQ p, end    \
	JLE  loop

// func Sum64(b []byte) uint64
//
// Arguments are read from the frame pointer: the slice base at b_base+0(FP),
// its length at b_len+8(FP); the 64-bit result is stored to ret+24(FP).
//
// The input is consumed in four phases, each guarded by a signed pointer
// comparison against end, which is re-based before every phase:
//
//	1. 32-byte blocks (only if len(b) >= 32), via blockLoop;
//	2. remaining 8-byte words (loop8);
//	3. at most one remaining 4-byte word (try4);
//	4. remaining single bytes (loop1).
//
// A final shift/xor/multiply sequence (finalize) produces the result.
// For inputs shorter than a phase's unit, end lies below p, which is why the
// comparisons are signed.
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
	// Load fixed primes.
	MOVQ ·primes+0(SB), prime1
	MOVQ ·primes+8(SB), prime2
	MOVQ ·primes+24(SB), prime4

	// Load slice.
	MOVQ b_base+0(FP), p
	MOVQ b_len+8(FP), n
	LEAQ (p)(n*1), end

	// The first loop limit will be len(b)-32.
	SUBQ $32, end

	// Check whether we have at least one block.
	CMPQ n, $32
	JLT  noBlocks

	// Set up initial state (v1, v2, v3, v4):
	// v1 = prime1+prime2, v2 = prime2, v3 = 0, v4 = -prime1.
	MOVQ prime1, v1
	ADDQ prime2, v1
	MOVQ prime2, v2
	XORQ v3, v3
	XORQ v4, v4
	SUBQ prime1, v4

	blockLoop()

	// Combine the accumulators:
	// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18).
	MOVQ v1, h
	ROLQ $1, h
	MOVQ v2, x
	ROLQ $7, x
	ADDQ x, h
	MOVQ v3, x
	ROLQ $12, x
	ADDQ x, h
	MOVQ v4, x
	ROLQ $18, x
	ADDQ x, h

	// Merge each accumulator into h. mergeRound clobbers v1..v4, which are
	// not read again.
	mergeRound(h, v1)
	mergeRound(h, v2)
	mergeRound(h, v3)
	mergeRound(h, v4)

	JMP afterBlocks

noBlocks:
	// Fewer than 32 bytes: start from the 8-byte entry at offset 32 of
	// ·primes instead of the merged accumulators.
	MOVQ ·primes+32(SB), h

afterBlocks:
	// Mix in the total input length.
	ADDQ n, h

	// end was base+len-32; make it base+len-8, the last position at which
	// an 8-byte word can start. Skip the loop if p is already past it.
	ADDQ $24, end
	CMPQ p, end
	JG   try4

loop8:
	// h = rotl(h ^ round0(word), 27)*prime1 + prime4, for each 8-byte word.
	MOVQ  (p), x
	ADDQ  $8, p
	round0(x)
	XORQ  x, h
	ROLQ  $27, h
	IMULQ prime1, h
	ADDQ  prime4, h

	CMPQ p, end
	JLE  loop8

try4:
	// end becomes base+len-4; handle one 4-byte word if it fits.
	ADDQ $4, end
	CMPQ p, end
	JG   try1

	// MOVL loads 32 bits; the value is used as a 64-bit operand below.
	// h = rotl(h ^ (word*prime1), 23)*prime2 + (entry at ·primes+16).
	MOVL  (p), x
	ADDQ  $4, p
	IMULQ prime1, x
	XORQ  x, h

	ROLQ  $23, h
	IMULQ prime2, h
	ADDQ  ·primes+16(SB), h

try1:
	// end becomes base+len, one past the last byte. Nothing left if p has
	// reached it.
	ADDQ $4, end
	CMPQ p, end
	JGE  finalize

loop1:
	// h = rotl(h ^ (byte * entry at ·primes+32), 11)*prime1, per byte.
	MOVBQZX (p), x
	ADDQ    $1, p
	IMULQ   ·primes+32(SB), x
	XORQ    x, h
	ROLQ    $11, h
	IMULQ   prime1, h

	CMPQ p, end
	JL   loop1

finalize:
	// Final mixing:
	//	h ^= h >> 33; h *= prime2
	//	h ^= h >> 29; h *= (entry at ·primes+16)
	//	h ^= h >> 32
	MOVQ  h, x
	SHRQ  $33, x
	XORQ  x, h
	IMULQ prime2, h
	MOVQ  h, x
	SHRQ  $29, x
	XORQ  x, h
	IMULQ ·primes+16(SB), h
	MOVQ  h, x
	SHRQ  $32, x
	XORQ  x, h

	MOVQ h, ret+24(FP)
	RET

// func writeBlocks(d *Digest, b []byte) int
//
// Runs blockLoop over b using the four 64-bit values stored at offsets 0, 8,
// 16 and 24 of the first argument as v1..v4, writes the updated values back
// to the same offsets, and returns the number of bytes consumed (the final p
// minus the slice base).
//
// Arguments are read from the frame pointer: the pointer at +0(FP), the slice
// base at b_base+8(FP) and its length at b_len+16(FP); the result is stored
// to ret+32(FP).
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
	// Load fixed primes needed for round.
	MOVQ ·primes+0(SB), prime1
	MOVQ ·primes+8(SB), prime2

	// Load slice. end = base+len-32, the loop limit blockLoop expects.
	MOVQ b_base+8(FP), p
	MOVQ b_len+16(FP), n
	LEAQ (p)(n*1), end
	SUBQ $32, end

	// Load vN from d.
	// The FP symbol is spelled "s" rather than "d": d is #defined as AX in
	// this file, so the identifier d cannot be used as a symbol name here.
	// NOTE(review): go vet's asmdecl check may report the mismatch with the
	// Go parameter name - confirm before renaming.
	MOVQ s+0(FP), d
	MOVQ 0(d), v1
	MOVQ 8(d), v2
	MOVQ 16(d), v3
	MOVQ 24(d), v4

	// We don't need to check the loop condition here; this function is
	// always called with at least one block of data to process.
	blockLoop()

	// Copy vN back to d.
	MOVQ v1, 0(d)
	MOVQ v2, 8(d)
	MOVQ v3, 16(d)
	MOVQ v4, 24(d)

	// The number of bytes written is p minus the old base pointer.
	SUBQ b_base+8(FP), p
	MOVQ p, ret+32(FP)

	RET