Files
golang.go/src/math/big/arith_arm.s
Russ Cox 7f516a31b0 math/big: replace assembly with mini-compiler output
Step 4 of the mini-compiler: switch to the new generated assembly.
No systematic performance regressions, and many, many improvements.

In the benchmarks, the systems are:

	c3h88     GOARCH=amd64     c3h88 perf gomote (newer Intel, Google Cloud)
	c2s16     GOARCH=amd64     c2s16 perf gomote (Intel, Google Cloud)
	s7        GOARCH=amd64     rsc basement server (AMD Ryzen 9 7950X)
	386       GOARCH=386       gotip-linux-386 gomote (Intel, Google Cloud)
	s7-386    GOARCH=386       rsc basement server (AMD Ryzen 9 7950X)
	c4as16    GOARCH=arm64     c4as16 perf gomote (Google Cloud)
	mac       GOARCH=arm64     Apple M3 Pro in MacBook Pro
	arm       GOARCH=arm       gotip-linux-arm gomote
	loong64   GOARCH=loong64   gotip-linux-loong64 gomote
	ppc64le   GOARCH=ppc64le   gotip-linux-ppc64le gomote
	riscv64   GOARCH=riscv64   gotip-linux-riscv64 gomote
	s390x     GOARCH=s390x     linux-s390x-ibm old gomote

benchmark \ system           c3h88    c2s16       s7      386   s7-386   c4as16      mac      arm  loong64  ppc64le  riscv64    s390x
AddVV/words=1               -4.03%   +5.21%   -4.04%   +4.94%        ~        ~        ~        ~  -19.51%        ~        ~        ~
AddVV/words=10             -10.20%   +0.34%   -3.46%  -11.50%   -7.46%   +7.66%   +5.97%        ~  -17.90%        ~        ~        ~
AddVV/words=16             -10.91%   -6.45%   -8.45%  -21.86%  -17.90%   +2.73%   -1.61%        ~  -22.47%   -3.54%        ~        ~
AddVV/words=100             -3.77%   -4.30%   -3.17%  -47.27%  -45.34%   -0.78%        ~   -8.74%  -27.19%        ~        ~        ~
AddVV/words=1000            -0.08%   -0.71%        ~  -49.21%  -48.07%        ~        ~  -16.80%  -24.74%        ~        ~        ~
AddVV/words=10000                ~        ~        ~  -48.73%  -48.56%   -0.06%        ~  -17.08%        ~        ~   -4.81%        ~
AddVV/words=100000               ~        ~        ~  -47.80%  -48.38%        ~        ~  -15.10%  -25.06%        ~   -5.34%        ~
SubVV/words=1               -0.84%   +3.43%   -3.62%   +1.34%        ~   -0.76%        ~        ~  -18.18%   +5.58%        ~        ~
SubVV/words=10              -9.99%   +0.34%        ~  -11.23%   -8.24%   +7.53%   +6.15%        ~  -17.55%   +2.77%   -2.08%        ~
SubVV/words=16             -11.94%   -6.45%   -6.81%  -21.82%  -18.11%   +1.58%   -1.21%        ~  -20.36%        ~        ~        ~
SubVV/words=100             -3.38%   -4.32%   -1.80%  -46.14%  -46.43%   +0.41%        ~   -7.20%  -26.17%        ~   -0.42%        ~
SubVV/words=1000            -0.38%   -0.80%        ~  -49.22%  -48.90%        ~        ~  -15.86%  -24.73%        ~        ~        ~
SubVV/words=10000                ~        ~        ~  -49.57%  -49.64%   -0.03%        ~  -15.85%  -26.52%        ~   -5.05%        ~
SubVV/words=100000               ~        ~        ~  -46.88%  -49.66%        ~        ~  -15.45%  -16.11%        ~   -4.99%        ~
LshVU/words=1                    ~   +5.78%        ~        ~   -2.48%   +1.61%   +2.18%   +2.70%  -18.16%  -34.16%  -21.29%        ~
LshVU/words=10             -18.34%   -3.78%   +2.21%        ~        ~   -2.81%  -12.54%        ~  -25.02%  -24.78%  -38.11%  -66.98%
LshVU/words=16             -23.15%   +1.03%   +7.74%   +0.73%        ~   +8.88%   +1.56%        ~  -25.37%  -28.46%  -41.27%        ~
LshVU/words=100            -32.85%   -8.86%   -2.58%        ~   +2.69%   +1.24%        ~  -20.63%  -44.14%  -42.68%  -53.09%        ~
LshVU/words=1000           -37.30%   -0.20%   +5.67%        ~        ~   +1.44%        ~  -27.83%  -45.01%  -37.07%  -57.02%  -46.57%
LshVU/words=10000          -36.84%   -2.30%   +3.82%        ~   +1.86%   +1.57%  -66.81%  -28.00%  -13.15%  -35.40%  -41.97%        ~
LshVU/words=100000         -40.30%        ~   +3.96%        ~        ~        ~        ~  -24.91%  -19.06%  -36.14%  -40.99%  -66.03%
RshVU/words=1               -3.17%   +4.76%   -4.06%   +4.31%   +4.55%        ~        ~        ~  -20.61%        ~  -26.20%  -51.33%
RshVU/words=10             -22.08%   -4.41%  -17.99%   +3.64%  -11.87%        ~  -16.30%        ~  -30.01%        ~  -40.37%  -63.05%
RshVU/words=16             -26.03%   -8.50%  -18.09%        ~  -17.52%   +6.50%        ~   -2.85%  -30.24%        ~  -42.93%  -63.13%
RshVU/words=100            -20.87%  -28.83%  -29.45%        ~  -26.25%   +1.46%   -1.14%  -16.20%  -45.65%  -16.20%  -53.66%  -77.27%
RshVU/words=1000           -24.03%  -21.37%  -26.71%        ~  -28.95%   +0.98%        ~  -18.82%  -45.21%  -23.55%  -57.09%  -71.18%
RshVU/words=10000          -24.56%  -22.44%  -27.01%        ~  -28.88%   +0.78%   -5.35%  -17.47%  -16.87%  -20.67%  -41.97%        ~
RshVU/words=100000         -23.36%  -15.65%  -27.54%        ~  -29.26%   +1.73%   -6.67%  -13.68%  -21.40%  -23.02%  -40.37%  -66.31%
MulAddVWW/words=1           +2.37%   +8.14%        ~   +4.10%   +3.71%        ~        ~        ~  -21.62%        ~   +1.12%        ~
MulAddVWW/words=10               ~   -2.72%  -15.15%   +8.04%        ~        ~        ~   -2.52%  -19.48%        ~   -6.18%        ~
MulAddVWW/words=16               ~   +1.49%        ~   +4.49%   +6.58%   -8.70%   -7.16%  -12.08%  -21.43%   -6.59%   -9.05%        ~
MulAddVWW/words=100         +0.37%   +1.11%   -4.51%  -13.59%        ~  -11.10%   -3.63%  -21.40%  -22.27%   -2.92%  -14.41%        ~
MulAddVWW/words=1000             ~   +0.90%   -7.13%  -18.94%        ~  -14.02%   -9.97%  -28.31%  -18.72%   -2.32%  -15.80%        ~
MulAddVWW/words=10000            ~   +1.08%   -6.75%  -19.10%        ~  -14.61%   -9.04%  -28.48%  -14.29%   -2.25%   -9.40%        ~
MulAddVWW/words=100000           ~        ~   -6.93%  -18.09%        ~  -14.33%   -9.66%  -28.92%  -16.63%   -2.43%   -8.23%        ~
AddMulVVWW/words=1          +2.30%   +4.83%  -11.37%   +4.58%        ~   -3.14%        ~        ~  -10.58%  +30.35%        ~        ~
AddMulVVWW/words=10         -3.27%        ~   +8.96%   +5.74%        ~   +2.67%   -1.44%   -7.64%  -13.41%        ~        ~        ~
AddMulVVWW/words=16         -6.12%        ~        ~        ~   +1.91%   -7.90%  -16.22%  -14.07%  -14.26%   -4.15%   -7.30%        ~
AddMulVVWW/words=100        -5.48%   -2.14%        ~   -9.40%   +9.98%   -1.43%  -12.35%  -18.56%  -21.94%        ~   -9.84%        ~
AddMulVVWW/words=1000      -11.35%   -3.40%   -3.64%  -11.04%  +12.82%   -1.33%  -15.63%  -20.50%  -20.95%        ~  -11.06%  -51.97%
AddMulVVWW/words=10000     -10.31%   -1.61%   -8.41%  -12.15%  +13.10%   -1.03%  -16.34%  -22.46%   -1.00%        ~  -10.33%  -49.80%
AddMulVVWW/words=100000    -13.71%        ~   -8.31%  -12.18%  +12.98%   -1.35%  -15.20%  -21.89%        ~        ~   -9.38%  -48.30%

Change-Id: I0a33c33602c0d053c84d9946e662500cfa048e2d
Reviewed-on: https://go-review.googlesource.com/c/go/+/664938
Reviewed-by: Alan Donovan <adonovan@google.com>
Auto-Submit: Russ Cox <rsc@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2025-04-19 08:19:23 -07:00

356 lines
6.4 KiB
ArmAsm

// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
//go:build !math_big_pure_go
#include "textflag.h"
// func addVV(z, x, y []Word) (c Word)
//
// Adds x and y word by word with carry propagation, stores each sum through
// z, and returns the final carry (0 or 1) in c. Only len(z) is read; x and y
// are walked for that same number of words.
//
// Register use:
//   R0       len(z), then the number of 4-word iterations (len(z) >> 2)
//   R1       cursor into x; reused for the carry result after the loops
//   R2       cursor into y
//   R3       cursor into z
//   R4       number of single-word iterations (len(z) & 3); operand in the 4X loop
//   R5-R9, R11, R12  loaded operands and sums
//
// The CPU carry flag holds the running carry across both loops. Loop control
// uses only TEQ against $0 and SUB without the .S suffix, and the code relies
// on those leaving the carry from the preceding ADC.S intact.
TEXT ·addVV(SB), NOSPLIT, $0
MOVW z_len+4(FP), R0
MOVW x_base+12(FP), R1
MOVW y_base+24(FP), R2
MOVW z_base+0(FP), R3
// compute unrolled loop lengths
// R4 = len(z) & 3 words handled one at a time, R0 = len(z) >> 2 groups of four.
AND $3, R0, R4
MOVW R0>>2, R0
ADD.S $0, R0 // clear carry
// Leftover words (len(z) & 3) are processed first, one per iteration.
loop1:
TEQ $0, R4; BEQ loop1done
loop1cont:
// unroll 1X
// Post-increment loads/stores advance the x, y and z cursors by one word.
MOVW.P 4(R1), R5
MOVW.P 4(R2), R6
ADC.S R6, R5
MOVW.P R5, 4(R3)
SUB $1, R4
TEQ $0, R4; BNE loop1cont
loop1done:
// Remaining words are processed four per iteration.
loop4:
TEQ $0, R0; BEQ loop4done
loop4cont:
// unroll 4X
// Load four words of x, then four words of y, add them as one carry chain,
// then store the four sums.
MOVW.P 4(R1), R4
MOVW.P 4(R1), R5
MOVW.P 4(R1), R6
MOVW.P 4(R1), R7
MOVW.P 4(R2), R8
MOVW.P 4(R2), R9
MOVW.P 4(R2), R11
MOVW.P 4(R2), R12
ADC.S R8, R4
ADC.S R9, R5
ADC.S R11, R6
ADC.S R12, R7
MOVW.P R4, 4(R3)
MOVW.P R5, 4(R3)
MOVW.P R6, 4(R3)
MOVW.P R7, 4(R3)
SUB $1, R0
TEQ $0, R0; BNE loop4cont
loop4done:
// Materialize the carry flag as a 0/1 word.
SBC R1, R1 // save carry: R1 = 0 if carry set, -1 if carry clear
ADD $1, R1 // convert add carry: R1 = 1 if carry set, 0 if carry clear
MOVW R1, c+36(FP)
RET
// func subVV(z, x, y []Word) (c Word)
//
// Subtracts y from x word by word with borrow propagation, stores each
// difference through z, and returns the final borrow (0 or 1) in c. Only
// len(z) is read; x and y are walked for that same number of words.
//
// Register use:
//   R0       len(z), then the number of 4-word iterations (len(z) >> 2)
//   R1       cursor into x; reused for the borrow result after the loops
//   R2       cursor into y
//   R3       cursor into z
//   R4       number of single-word iterations (len(z) & 3); operand in the 4X loop
//   R5-R9, R11, R12  loaded operands and differences
//
// The CPU carry flag holds the running borrow across both loops. On ARM the
// flag is inverted for subtraction: carry set means "no borrow". Loop control
// uses only TEQ against $0 and SUB without the .S suffix, and the code relies
// on those leaving the flag from the preceding SBC.S intact.
TEXT ·subVV(SB), NOSPLIT, $0
MOVW z_len+4(FP), R0
MOVW x_base+12(FP), R1
MOVW y_base+24(FP), R2
MOVW z_base+0(FP), R3
// compute unrolled loop lengths
// R4 = len(z) & 3 words handled one at a time, R0 = len(z) >> 2 groups of four.
AND $3, R0, R4
MOVW R0>>2, R0
SUB.S $0, R0 // clear carry (start with no pending borrow)
// Leftover words (len(z) & 3) are processed first, one per iteration.
loop1:
TEQ $0, R4; BEQ loop1done
loop1cont:
// unroll 1X
// Post-increment loads/stores advance the x, y and z cursors by one word.
MOVW.P 4(R1), R5
MOVW.P 4(R2), R6
SBC.S R6, R5
MOVW.P R5, 4(R3)
SUB $1, R4
TEQ $0, R4; BNE loop1cont
loop1done:
// Remaining words are processed four per iteration.
loop4:
TEQ $0, R0; BEQ loop4done
loop4cont:
// unroll 4X
// Load four words of x, then four words of y, subtract them as one borrow
// chain, then store the four differences.
MOVW.P 4(R1), R4
MOVW.P 4(R1), R5
MOVW.P 4(R1), R6
MOVW.P 4(R1), R7
MOVW.P 4(R2), R8
MOVW.P 4(R2), R9
MOVW.P 4(R2), R11
MOVW.P 4(R2), R12
SBC.S R8, R4
SBC.S R9, R5
SBC.S R11, R6
SBC.S R12, R7
MOVW.P R4, 4(R3)
MOVW.P R5, 4(R3)
MOVW.P R6, 4(R3)
MOVW.P R7, 4(R3)
SUB $1, R0
TEQ $0, R0; BNE loop4cont
loop4done:
// Materialize the borrow as a 0/1 word.
SBC R1, R1 // save carry: R1 = 0 if no borrow, -1 if a borrow is pending
RSB $0, R1, R1 // convert sub carry: R1 = 0 - R1, i.e. 1 on borrow, else 0
MOVW R1, c+36(FP)
RET
// func lshVU(z, x []Word, s uint) (c Word)
//
// Shifts the word vector x left by s bits into z, walking from the highest
// word down to the lowest, and returns in c the bits shifted out of the top
// word (x[len(z)-1] >> (32 - s)). Only len(z) is read for the word count.
// When len(z) is 0 nothing is stored and c is 0.
//
// Register use:
//   R0   len(z), then remaining words, then number of 4-word iterations
//   R1   s
//   R2   cursor into x, starting one past the last word and moving down
//   R3   cursor into z, starting one past the last word and moving down
//   R4   pending output word: already-shifted bits waiting for the low bits
//        that come from the next (lower) input word
//   R5   32 - s
//   R6   carry-out value, then number of single-word iterations
//   R6-R9  input words inside the loops
TEXT ·lshVU(SB), NOSPLIT, $0
MOVW z_len+4(FP), R0
TEQ $0, R0; BEQ ret0
MOVW s+24(FP), R1
MOVW x_base+12(FP), R2
MOVW z_base+0(FP), R3
// run loop backward
// Advance both cursors by len(z) words so they point just past the end.
ADD R0<<2, R2, R2
ADD R0<<2, R3, R3
// shift first word into carry
// Pre-decrement load fetches the top word; its high bits become c and its
// low bits, shifted up, become the start of the pending output word.
MOVW.W -4(R2), R4
MOVW $32, R5
SUB R1, R5
MOVW R4>>R5, R6
MOVW R4<<R1, R4
MOVW R6, c+28(FP)
// shift remaining words
SUB $1, R0
// compute unrolled loop lengths
// R6 = remaining & 3 words handled one at a time, R0 = remaining >> 2 groups.
AND $3, R0, R6
MOVW R0>>2, R0
loop1:
TEQ $0, R6; BEQ loop1done
loop1cont:
// unroll 1X
// Merge the next lower word's top bits into the pending word, store it,
// then start a new pending word from that input word's remaining bits.
MOVW.W -4(R2), R7
ORR R7>>R5, R4
MOVW.W R4, -4(R3)
MOVW R7<<R1, R4
SUB $1, R6
TEQ $0, R6; BNE loop1cont
loop1done:
loop4:
TEQ $0, R0; BEQ loop4done
loop4cont:
// unroll 4X
// All four input words are loaded before the first store of the group.
// NOTE(review): this load/store order may matter if z and x overlap;
// confirm before rescheduling.
MOVW.W -4(R2), R6
MOVW.W -4(R2), R7
MOVW.W -4(R2), R8
MOVW.W -4(R2), R9
ORR R6>>R5, R4
MOVW.W R4, -4(R3)
MOVW R6<<R1, R4
ORR R7>>R5, R4
MOVW.W R4, -4(R3)
MOVW R7<<R1, R4
ORR R8>>R5, R4
MOVW.W R4, -4(R3)
MOVW R8<<R1, R4
ORR R9>>R5, R4
MOVW.W R4, -4(R3)
MOVW R9<<R1, R4
SUB $1, R0
TEQ $0, R0; BNE loop4cont
loop4done:
// store final shifted bits
// The lowest output word has no lower neighbour to merge in.
MOVW.W R4, -4(R3)
RET
// Empty vector: nothing to shift, so the carry-out is zero.
ret0:
MOVW $0, R1
MOVW R1, c+28(FP)
RET
// func rshVU(z, x []Word, s uint) (c Word)
//
// Shifts the word vector x right by s bits into z, walking from the lowest
// word up to the highest, and returns in c the bits shifted out of the
// bottom word (x[0] << (32 - s)). Only len(z) is read for the word count.
// When len(z) is 0 nothing is stored and c is 0.
//
// Register use:
//   R0   len(z), then remaining words, then number of 4-word iterations
//   R1   s
//   R2   cursor into x, moving up
//   R3   cursor into z, moving up
//   R4   pending output word: already-shifted bits waiting for the high bits
//        that come from the next (higher) input word
//   R5   32 - s
//   R6   carry-out value, then number of single-word iterations
//   R6-R9  input words inside the loops
TEXT ·rshVU(SB), NOSPLIT, $0
MOVW z_len+4(FP), R0
TEQ $0, R0; BEQ ret0
MOVW s+24(FP), R1
MOVW x_base+12(FP), R2
MOVW z_base+0(FP), R3
// shift first word into carry
// Post-increment load fetches the bottom word; its low bits, moved to the top
// of the word, become c and the rest becomes the start of the pending word.
MOVW.P 4(R2), R4
MOVW $32, R5
SUB R1, R5
MOVW R4<<R5, R6
MOVW R4>>R1, R4
MOVW R6, c+28(FP)
// shift remaining words
SUB $1, R0
// compute unrolled loop lengths
// R6 = remaining & 3 words handled one at a time, R0 = remaining >> 2 groups.
AND $3, R0, R6
MOVW R0>>2, R0
loop1:
TEQ $0, R6; BEQ loop1done
loop1cont:
// unroll 1X
// Merge the next higher word's low bits into the pending word, store it,
// then start a new pending word from that input word's remaining bits.
MOVW.P 4(R2), R7
ORR R7<<R5, R4
MOVW.P R4, 4(R3)
MOVW R7>>R1, R4
SUB $1, R6
TEQ $0, R6; BNE loop1cont
loop1done:
loop4:
TEQ $0, R0; BEQ loop4done
loop4cont:
// unroll 4X
// All four input words are loaded before the first store of the group.
// NOTE(review): this load/store order may matter if z and x overlap;
// confirm before rescheduling.
MOVW.P 4(R2), R6
MOVW.P 4(R2), R7
MOVW.P 4(R2), R8
MOVW.P 4(R2), R9
ORR R6<<R5, R4
MOVW.P R4, 4(R3)
MOVW R6>>R1, R4
ORR R7<<R5, R4
MOVW.P R4, 4(R3)
MOVW R7>>R1, R4
ORR R8<<R5, R4
MOVW.P R4, 4(R3)
MOVW R8>>R1, R4
ORR R9<<R5, R4
MOVW.P R4, 4(R3)
MOVW R9>>R1, R4
SUB $1, R0
TEQ $0, R0; BNE loop4cont
loop4done:
// store final shifted bits
// The highest output word has no higher neighbour to merge in.
MOVW.P R4, 4(R3)
RET
// Empty vector: nothing to shift, so the carry-out is zero.
ret0:
MOVW $0, R1
MOVW R1, c+28(FP)
RET
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
//
// Computes z = x*m + a over len(z) words: each word of x is multiplied by m,
// the running carry word (initially a) is added to the low half of the
// product and stored through z, and the high half plus any carry-out becomes
// the next running carry. The final running carry is returned in c.
// Only len(z) is read for the word count.
//
// Register use:
//   R0   m
//   R1   running carry word (starts as a, ends as c)
//   R2   len(z), then the number of 4-word iterations (len(z) >> 2)
//   R3   cursor into x
//   R4   cursor into z
//   R5   number of single-word iterations (len(z) & 3); operand in the 4X loop
//   R5, R6   input words / low product halves
//   R7, R8   high product halves
TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVW m+24(FP), R0
MOVW a+28(FP), R1
MOVW z_len+4(FP), R2
MOVW x_base+12(FP), R3
MOVW z_base+0(FP), R4
// compute unrolled loop lengths
// R5 = len(z) & 3 words handled one at a time, R2 = len(z) >> 2 groups of four.
AND $3, R2, R5
MOVW R2>>2, R2
loop1:
TEQ $0, R5; BEQ loop1done
loop1cont:
// unroll 1X
MOVW.P 4(R3), R6
// multiply
// (R7:R6) = m * x word; add the running carry to the low half, then fold the
// carry flag into the high half to form the next running carry.
MULLU R0, R6, (R7, R6)
ADD.S R1, R6
ADC $0, R7, R1
MOVW.P R6, 4(R4)
SUB $1, R5
TEQ $0, R5; BNE loop1cont
loop1done:
loop4:
TEQ $0, R2; BEQ loop4done
loop4cont:
// unroll 4X in batches of 2
// Each batch handles two words as one carry chain: the first word's high half
// (R7) is added into the second word's low half, and the second word's high
// half (R8) plus the carry flag becomes the running carry. The second MULLU
// sits between ADD.S and ADC.S; the code relies on it (no .S suffix) leaving
// the carry flag untouched.
MOVW.P 4(R3), R5
MOVW.P 4(R3), R6
// multiply
MULLU R0, R5, (R7, R5)
ADD.S R1, R5
MULLU R0, R6, (R8, R6)
ADC.S R7, R6
ADC $0, R8, R1
MOVW.P R5, 4(R4)
MOVW.P R6, 4(R4)
// second batch of two words, same pattern
MOVW.P 4(R3), R5
MOVW.P 4(R3), R6
// multiply
MULLU R0, R5, (R7, R5)
ADD.S R1, R5
MULLU R0, R6, (R8, R6)
ADC.S R7, R6
ADC $0, R8, R1
MOVW.P R5, 4(R4)
MOVW.P R6, 4(R4)
SUB $1, R2
TEQ $0, R2; BNE loop4cont
loop4done:
// The running carry word left in R1 is the result.
MOVW R1, c+32(FP)
RET
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
//
// Computes z = x + y*m + a over len(z) words: each word of y is multiplied by
// m, the running carry word (initially a) and the matching word of x are
// added to the low half of the product and stored through z, and the high
// half plus the carry-outs of both additions becomes the next running carry.
// The final running carry is returned in c. Only len(z) is read for the word
// count.
//
// Register use:
//   R0   m
//   R1   running carry word (starts as a, ends as c)
//   R2   len(z), then the number of 4-word iterations (len(z) >> 2)
//   R3   cursor into x
//   R4   cursor into y
//   R5   cursor into z
//   R6   number of single-word iterations (len(z) & 3); x word in the 4X loop
//   R6, R7    x words
//   R8, R9    y words / low product halves / results
//   R9 (1X loop), R11, R12 (4X loop)  high product halves
TEXT ·addMulVVWW(SB), NOSPLIT, $0
MOVW m+36(FP), R0
MOVW a+40(FP), R1
MOVW z_len+4(FP), R2
MOVW x_base+12(FP), R3
MOVW y_base+24(FP), R4
MOVW z_base+0(FP), R5
// compute unrolled loop lengths
// R6 = len(z) & 3 words handled one at a time, R2 = len(z) >> 2 groups of four.
AND $3, R2, R6
MOVW R2>>2, R2
loop1:
TEQ $0, R6; BEQ loop1done
loop1cont:
// unroll 1X
MOVW.P 4(R3), R7
MOVW.P 4(R4), R8
// multiply
// (R9:R8) = m * y word; add the running carry to the low half and fold the
// carry flag into the high half to form the next running carry.
MULLU R0, R8, (R9, R8)
ADD.S R1, R8
ADC $0, R9, R1
// add
// Add the x word to the result and fold that carry-out into the running carry.
ADD.S R7, R8
ADC $0, R1
MOVW.P R8, 4(R5)
SUB $1, R6
TEQ $0, R6; BNE loop1cont
loop1done:
loop4:
TEQ $0, R2; BEQ loop4done
loop4cont:
// unroll 4X in batches of 2
// Each batch runs two carry chains over a pair of words: first the products
// plus the running carry (leaving the new running carry in R1), then the
// addition of the two x words (whose carry-out is folded into R1). The
// second MULLU sits between ADD.S and ADC.S; the code relies on it (no .S
// suffix) leaving the carry flag untouched.
MOVW.P 4(R3), R6
MOVW.P 4(R3), R7
MOVW.P 4(R4), R8
MOVW.P 4(R4), R9
// multiply
MULLU R0, R8, (R11, R8)
ADD.S R1, R8
MULLU R0, R9, (R12, R9)
ADC.S R11, R9
ADC $0, R12, R1
// add
ADD.S R6, R8
ADC.S R7, R9
ADC $0, R1
MOVW.P R8, 4(R5)
MOVW.P R9, 4(R5)
// second batch of two words, same pattern
MOVW.P 4(R3), R6
MOVW.P 4(R3), R7
MOVW.P 4(R4), R8
MOVW.P 4(R4), R9
// multiply
MULLU R0, R8, (R11, R8)
ADD.S R1, R8
MULLU R0, R9, (R12, R9)
ADC.S R11, R9
ADC $0, R12, R1
// add
ADD.S R6, R8
ADC.S R7, R9
ADC $0, R1
MOVW.P R8, 4(R5)
MOVW.P R9, 4(R5)
SUB $1, R2
TEQ $0, R2; BNE loop4cont
loop4done:
// The running carry word left in R1 is the result.
MOVW R1, c+44(FP)
RET