// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// The code for the perl script that generates the ppc64 assembly
// can be found in the cryptogams repository at the link below. It is
// based on the original from OpenSSL.

// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91

// The differences between this and the original implementation are
// due to the calling conventions and the initialization of constants.

//go:build gc && !purego
// +build gc,!purego

#include "textflag.h"

#define OUT  R3
#define INP  R4
#define LEN  R5
#define KEY  R6
#define CNT  R7
#define TMP  R15

#define CONSTBASE  R16
#define BLOCKS R17

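// Layout of consts<> below:
//   0x00-0x0f: the ChaCha20 constant "expand 32-byte k" ("sigma"),
//              stored little-endian (0x61707865 == "expa", ...)
//   0x50-0x8f: each of the four sigma words splatted across all four
//              vector lanes
//   0x90-0x9f: the word vector {0, 1, 2, 3}, added to the splatted
//              counter so each interleaved block gets a distinct one
// The remaining entries (counter increments and byte-permutation
// masks) appear to carry over from the CRYPTOGAMS original.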
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
DATA consts<>+0x10(SB)/8, $0x0000000000000001
DATA consts<>+0x18(SB)/8, $0x0000000000000000
DATA consts<>+0x20(SB)/8, $0x0000000000000004
DATA consts<>+0x28(SB)/8, $0x0000000000000000
DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA consts<>+0x38(SB)/8, $0x0203000106070405
DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA consts<>+0x48(SB)/8, $0x0102030005060704
DATA consts<>+0x50(SB)/8, $0x6170786561707865
DATA consts<>+0x58(SB)/8, $0x6170786561707865
DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002
GLOBL consts<>(SB), RODATA, $0xa0

//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
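// NOSPLIT,$64-40 declares a 64-byte local frame (scratch space used
// by tail_vsx) and 40 bytes of arguments: five 8-byte words for out,
// inp, len, key and counter.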
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT

	// Addressing for constants
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
	SRD $6, LEN, BLOCKS
	// V16
	LXVW4X (CONSTBASE)(R0), VS48
	ADD $80, CONSTBASE

	// Load key into V17,V18
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load CNT, NONCE into V19
	LXVW4X (CNT)(R0), VS51

	// Clear V27
	VXOR V27, V27, V27

	// V28
	LXVW4X (CONSTBASE)(R11), VS60

	// splat slot from V19 -> V26
	VSPLTW $0, V19, V26

	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19

	VADDUWM V26, V28, V26

	// 10 iterations of the double-round loop = 20 ChaCha rounds
	MOVD $10, R14
	MOVD R14, CTR

loop_outer_vsx:
	// V0, V1, V2, V3
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// splat values from V17, V18 into V4-V11
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// VOR
	VOR V26, V26, V12

	// splat values from V19 -> V13, V14, V15
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// splat const values
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30

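// Each pass through loop_vsx performs one ChaCha double round
// (a column round followed by a diagonal round) on four interleaved
// blocks, with vector Vi holding state word i of all four blocks.
// The scalar quarter round being vectorized is:
//
//	a += b; d ^= a; d = d <<< 16
//	c += d; b ^= c; b = b <<< 12
//	a += b; d ^= a; d = d <<< 8
//	c += d; b ^= c; b = b <<< 7
//
// V27-V30 hold the splatted rotate amounts 16, 12, 8 and 7; $-16 is
// used above because the VSPLTISW immediate is a 5-bit signed value
// and VRLW only uses the rotate count modulo 32.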
loop_vsx:
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14
	VRLW V15, V27, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V28, V4
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7

	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14
	VRLW V15, V29, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V30, V4
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V27, V15
	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7
	VRLW V4, V28, V4

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V29, V15
	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
	VRLW V4, V30, V4
	// bdnz: decrement CTR, branch while CTR != 0
	BC   16, LT, loop_vsx

	VADDUWM V12, V26, V12

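// The WORD directives below emit VMRGEW/VMRGOW (vector merge even/odd
// word) instructions by raw opcode; the annotated mnemonics were
// presumably not yet supported by the Go assembler when this was
// written. Together with the XXPERMDI doubleword permutes, they
// transpose the lane-sliced state in V0-V15 back into four
// consecutive 64-byte keystream blocks.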
	WORD $0x13600F8C		// VMRGEW V0, V1, V27
	WORD $0x13821F8C		// VMRGEW V2, V3, V28

	WORD $0x10000E8C		// VMRGOW V0, V1, V0
	WORD $0x10421E8C		// VMRGOW V2, V3, V2

	WORD $0x13A42F8C		// VMRGEW V4, V5, V29
	WORD $0x13C63F8C		// VMRGEW V6, V7, V30

	XXPERMDI VS32, VS34, $0, VS33
	XXPERMDI VS32, VS34, $3, VS35
	XXPERMDI VS59, VS60, $0, VS32
	XXPERMDI VS59, VS60, $3, VS34

	WORD $0x10842E8C		// VMRGOW V4, V5, V4
	WORD $0x10C63E8C		// VMRGOW V6, V7, V6

	WORD $0x13684F8C		// VMRGEW V8, V9, V27
	WORD $0x138A5F8C		// VMRGEW V10, V11, V28

	XXPERMDI VS36, VS38, $0, VS37
	XXPERMDI VS36, VS38, $3, VS39
	XXPERMDI VS61, VS62, $0, VS36
	XXPERMDI VS61, VS62, $3, VS38

	WORD $0x11084E8C		// VMRGOW V8, V9, V8
	WORD $0x114A5E8C		// VMRGOW V10, V11, V10

	WORD $0x13AC6F8C		// VMRGEW V12, V13, V29
	WORD $0x13CE7F8C		// VMRGEW V14, V15, V30

	XXPERMDI VS40, VS42, $0, VS41
	XXPERMDI VS40, VS42, $3, VS43
	XXPERMDI VS59, VS60, $0, VS40
	XXPERMDI VS59, VS60, $3, VS42

	WORD $0x118C6E8C		// VMRGOW V12, V13, V12
	WORD $0x11CE7E8C		// VMRGOW V14, V15, V14

	VSPLTISW $4, V27
	VADDUWM V26, V27, V26

	XXPERMDI VS44, VS46, $0, VS45
	XXPERMDI VS44, VS46, $3, VS47
	XXPERMDI VS61, VS62, $0, VS44
	XXPERMDI VS61, VS62, $3, VS46

	VADDUWM V0, V16, V0
	VADDUWM V4, V17, V4
	VADDUWM V8, V18, V8
	VADDUWM V12, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	// Bottom of loop
	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V1, V16, V0
	VADDUWM V5, V17, V4
	VADDUWM V9, V18, V8
	VADDUWM V13, V19, V12

	CMPU  LEN, $64
	BLT   tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62
	VXOR   V27, V0, V27

	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V2, V16, V0
	VADDUWM V6, V17, V4
	VADDUWM V10, V18, V8
	VADDUWM V14, V19, V12

	CMPU LEN, $64
	BLT  tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V3, V16, V0
	VADDUWM V7, V17, V4
	VADDUWM V11, V18, V8
	VADDUWM V15, V19, V12

	CMPU  LEN, $64
	BLT   tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT

	MOVD $10, R14
	MOVD R14, CTR
	BNE  loop_outer_vsx

done_vsx:
	// Increment the counter by the number of 64-byte blocks
	MOVD (CNT), R14
	ADD  BLOCKS, R14
	MOVD R14, (CNT)
	RET

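// tail_vsx handles the final, partial block: the four keystream
// vectors are spilled to the local stack frame, then XORed into the
// output one byte at a time. The $-1 adjustments below compensate
// for MOVBZU/MOVBU pre-incrementing their address register.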
tail_vsx:
	ADD  $32, R1, R11
	MOVD LEN, CTR

	// Save values on stack to copy from
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)
	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT

looptail_vsx:
	// XOR each input byte with the keystream and copy
	// the result to OUT, one byte at a time.
	MOVBZU 1(R12), KEY
	MOVBZU 1(INP), TMP
	XOR    KEY, TMP, KEY
	MOVBU  KEY, 1(OUT)
	BC     16, LT, looptail_vsx

	// Clear the stack values
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)
	BR      done_vsx
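
// For reference, the Go-side declaration matching the signature above
// is kept in the package's .go file, together with something like the
// hypothetical wrapper sketched here (the wrapper name and bounds
// handling are assumptions, not part of this file):
//
//	//go:noescape
//	func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
//
//	// xorKeyStreamVSX XORs src with the ChaCha20 keystream and writes
//	// the result to dst, advancing *counter by the number of full
//	// 64-byte blocks processed.
//	func xorKeyStreamVSX(dst, src []byte, key *[8]uint32, counter *uint32) {
//		if len(src) == 0 {
//			return
//		}
//		chaCha20_ctr32_vsx(&dst[0], &src[0], len(src), key, counter)
//	}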