mirror of
				https://gitea.com/Lydanne/buildx.git
				synced 2025-11-03 09:33:43 +08:00 
			
		
		
		
	
		
			
				
	
	
		
			184 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			184 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
//go:build !appengine && gc && !purego
 | 
						|
// +build !appengine
 | 
						|
// +build gc
 | 
						|
// +build !purego
 | 
						|
 | 
						|
#include "textflag.h"
 | 
						|
 | 
						|
// Registers:
 | 
						|
#define digest	R1
 | 
						|
#define h	R2 // return value
 | 
						|
#define p	R3 // input pointer
 | 
						|
#define n	R4 // input length
 | 
						|
#define nblocks	R5 // n / 32
 | 
						|
#define prime1	R7
 | 
						|
#define prime2	R8
 | 
						|
#define prime3	R9
 | 
						|
#define prime4	R10
 | 
						|
#define prime5	R11
 | 
						|
#define v1	R12
 | 
						|
#define v2	R13
 | 
						|
#define v3	R14
 | 
						|
#define v4	R15
 | 
						|
#define x1	R20
 | 
						|
#define x2	R21
 | 
						|
#define x3	R22
 | 
						|
#define x4	R23
 | 
						|
 | 
						|
#define round(acc, x) \
 | 
						|
	MADD prime2, acc, x, acc \
 | 
						|
	ROR  $64-31, acc         \
 | 
						|
	MUL  prime1, acc
 | 
						|
 | 
						|
// round0 performs the operation x = round(0, x).
 | 
						|
#define round0(x) \
 | 
						|
	MUL prime2, x \
 | 
						|
	ROR $64-31, x \
 | 
						|
	MUL prime1, x
 | 
						|
 | 
						|
#define mergeRound(acc, x) \
 | 
						|
	round0(x)                     \
 | 
						|
	EOR  x, acc                   \
 | 
						|
	MADD acc, prime4, prime1, acc
 | 
						|
 | 
						|
// blockLoop processes as many 32-byte blocks as possible,
 | 
						|
// updating v1, v2, v3, and v4. It assumes that n >= 32.
 | 
						|
#define blockLoop() \
 | 
						|
	LSR     $5, n, nblocks  \
 | 
						|
	PCALIGN $16             \
 | 
						|
	loop:                   \
 | 
						|
	LDP.P   16(p), (x1, x2) \
 | 
						|
	LDP.P   16(p), (x3, x4) \
 | 
						|
	round(v1, x1)           \
 | 
						|
	round(v2, x2)           \
 | 
						|
	round(v3, x3)           \
 | 
						|
	round(v4, x4)           \
 | 
						|
	SUB     $1, nblocks     \
 | 
						|
	CBNZ    nblocks, loop
 | 
						|
 | 
						|
// func Sum64(b []byte) uint64
 | 
						|
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
 | 
						|
	LDP b_base+0(FP), (p, n)
 | 
						|
 | 
						|
	LDP  ·primes+0(SB), (prime1, prime2)
 | 
						|
	LDP  ·primes+16(SB), (prime3, prime4)
 | 
						|
	MOVD ·primes+32(SB), prime5
 | 
						|
 | 
						|
	CMP  $32, n
 | 
						|
	CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }
 | 
						|
	BLT  afterLoop
 | 
						|
 | 
						|
	ADD  prime1, prime2, v1
 | 
						|
	MOVD prime2, v2
 | 
						|
	MOVD $0, v3
 | 
						|
	NEG  prime1, v4
 | 
						|
 | 
						|
	blockLoop()
 | 
						|
 | 
						|
	ROR $64-1, v1, x1
 | 
						|
	ROR $64-7, v2, x2
 | 
						|
	ADD x1, x2
 | 
						|
	ROR $64-12, v3, x3
 | 
						|
	ROR $64-18, v4, x4
 | 
						|
	ADD x3, x4
 | 
						|
	ADD x2, x4, h
 | 
						|
 | 
						|
	mergeRound(h, v1)
 | 
						|
	mergeRound(h, v2)
 | 
						|
	mergeRound(h, v3)
 | 
						|
	mergeRound(h, v4)
 | 
						|
 | 
						|
afterLoop:
 | 
						|
	ADD n, h
 | 
						|
 | 
						|
	TBZ   $4, n, try8
 | 
						|
	LDP.P 16(p), (x1, x2)
 | 
						|
 | 
						|
	round0(x1)
 | 
						|
 | 
						|
	// NOTE: here and below, sequencing the EOR after the ROR (using a
 | 
						|
	// rotated register) is worth a small but measurable speedup for small
 | 
						|
	// inputs.
 | 
						|
	ROR  $64-27, h
 | 
						|
	EOR  x1 @> 64-27, h, h
 | 
						|
	MADD h, prime4, prime1, h
 | 
						|
 | 
						|
	round0(x2)
 | 
						|
	ROR  $64-27, h
 | 
						|
	EOR  x2 @> 64-27, h, h
 | 
						|
	MADD h, prime4, prime1, h
 | 
						|
 | 
						|
try8:
 | 
						|
	TBZ    $3, n, try4
 | 
						|
	MOVD.P 8(p), x1
 | 
						|
 | 
						|
	round0(x1)
 | 
						|
	ROR  $64-27, h
 | 
						|
	EOR  x1 @> 64-27, h, h
 | 
						|
	MADD h, prime4, prime1, h
 | 
						|
 | 
						|
try4:
 | 
						|
	TBZ     $2, n, try2
 | 
						|
	MOVWU.P 4(p), x2
 | 
						|
 | 
						|
	MUL  prime1, x2
 | 
						|
	ROR  $64-23, h
 | 
						|
	EOR  x2 @> 64-23, h, h
 | 
						|
	MADD h, prime3, prime2, h
 | 
						|
 | 
						|
try2:
 | 
						|
	TBZ     $1, n, try1
 | 
						|
	MOVHU.P 2(p), x3
 | 
						|
	AND     $255, x3, x1
 | 
						|
	LSR     $8, x3, x2
 | 
						|
 | 
						|
	MUL prime5, x1
 | 
						|
	ROR $64-11, h
 | 
						|
	EOR x1 @> 64-11, h, h
 | 
						|
	MUL prime1, h
 | 
						|
 | 
						|
	MUL prime5, x2
 | 
						|
	ROR $64-11, h
 | 
						|
	EOR x2 @> 64-11, h, h
 | 
						|
	MUL prime1, h
 | 
						|
 | 
						|
try1:
 | 
						|
	TBZ   $0, n, finalize
 | 
						|
	MOVBU (p), x4
 | 
						|
 | 
						|
	MUL prime5, x4
 | 
						|
	ROR $64-11, h
 | 
						|
	EOR x4 @> 64-11, h, h
 | 
						|
	MUL prime1, h
 | 
						|
 | 
						|
finalize:
 | 
						|
	EOR h >> 33, h
 | 
						|
	MUL prime2, h
 | 
						|
	EOR h >> 29, h
 | 
						|
	MUL prime3, h
 | 
						|
	EOR h >> 32, h
 | 
						|
 | 
						|
	MOVD h, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
// func writeBlocks(d *Digest, b []byte) int
 | 
						|
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
 | 
						|
	LDP ·primes+0(SB), (prime1, prime2)
 | 
						|
 | 
						|
	// Load state. Assume v[1-4] are stored contiguously.
 | 
						|
	MOVD d+0(FP), digest
 | 
						|
	LDP  0(digest), (v1, v2)
 | 
						|
	LDP  16(digest), (v3, v4)
 | 
						|
 | 
						|
	LDP b_base+8(FP), (p, n)
 | 
						|
 | 
						|
	blockLoop()
 | 
						|
 | 
						|
	// Store updated state.
 | 
						|
	STP (v1, v2), 0(digest)
 | 
						|
	STP (v3, v4), 16(digest)
 | 
						|
 | 
						|
	BIC  $31, n
 | 
						|
	MOVD n, ret+32(FP)
 | 
						|
	RET
 |