mirror of
https://github.com/golang/go.git
synced 2026-04-02 17:30:01 +09:00
Step 4 of the mini-compiler: switch to the new generated assembly.
No systematic performance regressions, and many many improvements.

In the benchmarks, the systems are:

    c3h88     GOARCH=amd64     c3h88 perf gomote (newer Intel, Google Cloud)
    c2s16     GOARCH=amd64     c2s16 perf gomote (Intel, Google Cloud)
    s7        GOARCH=amd64     rsc basement server (AMD Ryzen 9 7950X)
    386       GOARCH=386       gotip-linux-386 gomote (Intel, Google Cloud)
    s7-386    GOARCH=386       rsc basement server (AMD Ryzen 9 7950X)
    c4as16    GOARCH=arm64     c4as16 perf gomote (Google Cloud)
    mac       GOARCH=arm64     Apple M3 Pro in MacBook Pro
    arm       GOARCH=arm       gotip-linux-arm gomote
    loong64   GOARCH=loong64   gotip-linux-loong64 gomote
    ppc64le   GOARCH=ppc64le   gotip-linux-ppc64le gomote
    riscv64   GOARCH=riscv64   gotip-linux-riscv64 gomote
    s390x     GOARCH=s390x     linux-s390x-ibm old gomote

benchmark \ system      c3h88    c2s16    s7       386      s7-386   c4as16   mac      arm      loong64  ppc64le  riscv64  s390x
AddVV/words=1           -4.03%   +5.21%   -4.04%   +4.94%   ~        ~        ~        ~        -19.51%  ~        ~        ~
AddVV/words=10          -10.20%  +0.34%   -3.46%   -11.50%  -7.46%   +7.66%   +5.97%   ~        -17.90%  ~        ~        ~
AddVV/words=16          -10.91%  -6.45%   -8.45%   -21.86%  -17.90%  +2.73%   -1.61%   ~        -22.47%  -3.54%   ~        ~
AddVV/words=100         -3.77%   -4.30%   -3.17%   -47.27%  -45.34%  -0.78%   ~        -8.74%   -27.19%  ~        ~        ~
AddVV/words=1000        -0.08%   -0.71%   ~        -49.21%  -48.07%  ~        ~        -16.80%  -24.74%  ~        ~        ~
AddVV/words=10000       ~        ~        ~        -48.73%  -48.56%  -0.06%   ~        -17.08%  ~        ~        -4.81%   ~
AddVV/words=100000      ~        ~        ~        -47.80%  -48.38%  ~        ~        -15.10%  -25.06%  ~        -5.34%   ~
SubVV/words=1           -0.84%   +3.43%   -3.62%   +1.34%   ~        -0.76%   ~        ~        -18.18%  +5.58%   ~        ~
SubVV/words=10          -9.99%   +0.34%   ~        -11.23%  -8.24%   +7.53%   +6.15%   ~        -17.55%  +2.77%   -2.08%   ~
SubVV/words=16          -11.94%  -6.45%   -6.81%   -21.82%  -18.11%  +1.58%   -1.21%   ~        -20.36%  ~        ~        ~
SubVV/words=100         -3.38%   -4.32%   -1.80%   -46.14%  -46.43%  +0.41%   ~        -7.20%   -26.17%  ~        -0.42%   ~
SubVV/words=1000        -0.38%   -0.80%   ~        -49.22%  -48.90%  ~        ~        -15.86%  -24.73%  ~        ~        ~
SubVV/words=10000       ~        ~        ~        -49.57%  -49.64%  -0.03%   ~        -15.85%  -26.52%  ~        -5.05%   ~
SubVV/words=100000      ~        ~        ~        -46.88%  -49.66%  ~        ~        -15.45%  -16.11%  ~        -4.99%   ~
LshVU/words=1           ~        +5.78%   ~        ~        -2.48%   +1.61%   +2.18%   +2.70%   -18.16%  -34.16%  -21.29%  ~
LshVU/words=10          -18.34%  -3.78%   +2.21%   ~        ~        -2.81%   -12.54%  ~        -25.02%  -24.78%  -38.11%  -66.98%
LshVU/words=16          -23.15%  +1.03%   +7.74%   +0.73%   ~        +8.88%   +1.56%   ~        -25.37%  -28.46%  -41.27%  ~
LshVU/words=100         -32.85%  -8.86%   -2.58%   ~        +2.69%   +1.24%   ~        -20.63%  -44.14%  -42.68%  -53.09%  ~
LshVU/words=1000        -37.30%  -0.20%   +5.67%   ~        ~        +1.44%   ~        -27.83%  -45.01%  -37.07%  -57.02%  -46.57%
LshVU/words=10000       -36.84%  -2.30%   +3.82%   ~        +1.86%   +1.57%   -66.81%  -28.00%  -13.15%  -35.40%  -41.97%  ~
LshVU/words=100000      -40.30%  ~        +3.96%   ~        ~        ~        ~        -24.91%  -19.06%  -36.14%  -40.99%  -66.03%
RshVU/words=1           -3.17%   +4.76%   -4.06%   +4.31%   +4.55%   ~        ~        ~        -20.61%  ~        -26.20%  -51.33%
RshVU/words=10          -22.08%  -4.41%   -17.99%  +3.64%   -11.87%  ~        -16.30%  ~        -30.01%  ~        -40.37%  -63.05%
RshVU/words=16          -26.03%  -8.50%   -18.09%  ~        -17.52%  +6.50%   ~        -2.85%   -30.24%  ~        -42.93%  -63.13%
RshVU/words=100         -20.87%  -28.83%  -29.45%  ~        -26.25%  +1.46%   -1.14%   -16.20%  -45.65%  -16.20%  -53.66%  -77.27%
RshVU/words=1000        -24.03%  -21.37%  -26.71%  ~        -28.95%  +0.98%   ~        -18.82%  -45.21%  -23.55%  -57.09%  -71.18%
RshVU/words=10000       -24.56%  -22.44%  -27.01%  ~        -28.88%  +0.78%   -5.35%   -17.47%  -16.87%  -20.67%  -41.97%  ~
RshVU/words=100000      -23.36%  -15.65%  -27.54%  ~        -29.26%  +1.73%   -6.67%   -13.68%  -21.40%  -23.02%  -40.37%  -66.31%
MulAddVWW/words=1       +2.37%   +8.14%   ~        +4.10%   +3.71%   ~        ~        ~        -21.62%  ~        +1.12%   ~
MulAddVWW/words=10      ~        -2.72%   -15.15%  +8.04%   ~        ~        ~        -2.52%   -19.48%  ~        -6.18%   ~
MulAddVWW/words=16      ~        +1.49%   ~        +4.49%   +6.58%   -8.70%   -7.16%   -12.08%  -21.43%  -6.59%   -9.05%   ~
MulAddVWW/words=100     +0.37%   +1.11%   -4.51%   -13.59%  ~        -11.10%  -3.63%   -21.40%  -22.27%  -2.92%   -14.41%  ~
MulAddVWW/words=1000    ~        +0.90%   -7.13%   -18.94%  ~        -14.02%  -9.97%   -28.31%  -18.72%  -2.32%   -15.80%  ~
MulAddVWW/words=10000   ~        +1.08%   -6.75%   -19.10%  ~        -14.61%  -9.04%   -28.48%  -14.29%  -2.25%   -9.40%   ~
MulAddVWW/words=100000  ~        ~        -6.93%   -18.09%  ~        -14.33%  -9.66%   -28.92%  -16.63%  -2.43%   -8.23%   ~
AddMulVVWW/words=1      +2.30%   +4.83%   -11.37%  +4.58%   ~        -3.14%   ~        ~        -10.58%  +30.35%  ~        ~
AddMulVVWW/words=10     -3.27%   ~        +8.96%   +5.74%   ~        +2.67%   -1.44%   -7.64%   -13.41%  ~        ~        ~
AddMulVVWW/words=16     -6.12%   ~        ~        ~        +1.91%   -7.90%   -16.22%  -14.07%  -14.26%  -4.15%   -7.30%   ~
AddMulVVWW/words=100    -5.48%   -2.14%   ~        -9.40%   +9.98%   -1.43%   -12.35%  -18.56%  -21.94%  ~        -9.84%   ~
AddMulVVWW/words=1000   -11.35%  -3.40%   -3.64%   -11.04%  +12.82%  -1.33%   -15.63%  -20.50%  -20.95%  ~        -11.06%  -51.97%
AddMulVVWW/words=10000  -10.31%  -1.61%   -8.41%   -12.15%  +13.10%  -1.03%   -16.34%  -22.46%  -1.00%   ~        -10.33%  -49.80%
AddMulVVWW/words=100000 -13.71%  ~        -8.31%   -12.18%  +12.98%  -1.35%   -15.20%  -21.89%  ~        ~        -9.38%   -48.30%

Change-Id: I0a33c33602c0d053c84d9946e662500cfa048e2d
Reviewed-on: https://go-review.googlesource.com/c/go/+/664938
Reviewed-by: Alan Donovan <adonovan@google.com>
Auto-Submit: Russ Cox <rsc@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
356 lines
6.4 KiB
ArmAsm
356 lines
6.4 KiB
ArmAsm
// Copyright 2025 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
|
|
|
|
//go:build !math_big_pure_go
|
|
|
|
#include "textflag.h"
|
|
|
|
// func addVV(z, x, y []Word) (c Word)
//
// Word-by-word addition with carry: every z word receives the sum of the
// matching x and y words plus the incoming carry, and the carry left over
// after the last word is returned in c (0 or 1). The word count is read
// from z's length.
//
// Register use:
//	R0               number of 4-word groups (z_len >> 2)
//	R1, R2, R3       cursors into x, y and z, advanced by post-increment
//	R4               number of leftover single words (z_len & 3)
//	R4-R9, R11, R12  temporaries inside the loops
//
// The carry lives in the ARM C flag for the whole routine. Loop control uses
// SUB without .S and TEQ against an immediate zero, neither of which
// changes C, so the chain of ADC.S instructions is never interrupted.
TEXT ·addVV(SB), NOSPLIT, $0
	MOVW	z_len+4(FP), R0
	MOVW	x_base+12(FP), R1
	MOVW	y_base+24(FP), R2
	MOVW	z_base+0(FP), R3

	// Split the length into singles (R4) and groups of four (R0).
	AND	$3, R0, R4
	MOVW	R0>>2, R0
	ADD.S	$0, R0, R0	// adding zero cannot carry: start with C = 0

	// Leftover words, one per iteration.
	TEQ	$0, R4
	BEQ	singlesDone
singles:
	MOVW.P	4(R1), R5
	MOVW.P	4(R2), R6
	ADC.S	R6, R5, R5
	MOVW.P	R5, 4(R3)
	SUB	$1, R4, R4
	TEQ	$0, R4
	BNE	singles
singlesDone:

	// Remaining words, four per iteration.
	TEQ	$0, R0
	BEQ	groupsDone
groups:
	MOVW.P	4(R1), R4
	MOVW.P	4(R1), R5
	MOVW.P	4(R1), R6
	MOVW.P	4(R1), R7
	MOVW.P	4(R2), R8
	MOVW.P	4(R2), R9
	MOVW.P	4(R2), R11
	MOVW.P	4(R2), R12
	ADC.S	R8, R4, R4
	ADC.S	R9, R5, R5
	ADC.S	R11, R6, R6
	ADC.S	R12, R7, R7
	MOVW.P	R4, 4(R3)
	MOVW.P	R5, 4(R3)
	MOVW.P	R6, 4(R3)
	MOVW.P	R7, 4(R3)
	SUB	$1, R0, R0
	TEQ	$0, R0
	BNE	groups
groupsDone:

	// Turn the C flag into a 0/1 word.
	SBC	R1, R1, R1	// R1 = C - 1: 0 when the carry is set, -1 otherwise
	ADD	$1, R1, R1	// R1 = C
	MOVW	R1, c+36(FP)
	RET
|
|
|
|
// func subVV(z, x, y []Word) (c Word)
//
// Word-by-word subtraction with borrow: every z word receives the matching
// x word minus the matching y word minus the incoming borrow, and the borrow
// left over after the last word is returned in c (0 or 1). The word count is
// read from z's length.
//
// Register use:
//	R0               number of 4-word groups (z_len >> 2)
//	R1, R2, R3       cursors into x, y and z, advanced by post-increment
//	R4               number of leftover single words (z_len & 3)
//	R4-R9, R11, R12  temporaries inside the loops
//
// The borrow lives in the ARM C flag, where C = 1 means "no borrow". Loop
// control uses SUB without .S and TEQ against an immediate zero, neither of
// which changes C, so the chain of SBC.S instructions is never interrupted.
TEXT ·subVV(SB), NOSPLIT, $0
	MOVW	z_len+4(FP), R0
	MOVW	x_base+12(FP), R1
	MOVW	y_base+24(FP), R2
	MOVW	z_base+0(FP), R3

	// Split the length into singles (R4) and groups of four (R0).
	AND	$3, R0, R4
	MOVW	R0>>2, R0
	SUB.S	$0, R0, R0	// subtracting zero cannot borrow: start with C = 1

	// Leftover words, one per iteration.
	TEQ	$0, R4
	BEQ	singlesDone
singles:
	MOVW.P	4(R1), R5
	MOVW.P	4(R2), R6
	SBC.S	R6, R5, R5
	MOVW.P	R5, 4(R3)
	SUB	$1, R4, R4
	TEQ	$0, R4
	BNE	singles
singlesDone:

	// Remaining words, four per iteration.
	TEQ	$0, R0
	BEQ	groupsDone
groups:
	MOVW.P	4(R1), R4
	MOVW.P	4(R1), R5
	MOVW.P	4(R1), R6
	MOVW.P	4(R1), R7
	MOVW.P	4(R2), R8
	MOVW.P	4(R2), R9
	MOVW.P	4(R2), R11
	MOVW.P	4(R2), R12
	SBC.S	R8, R4, R4
	SBC.S	R9, R5, R5
	SBC.S	R11, R6, R6
	SBC.S	R12, R7, R7
	MOVW.P	R4, 4(R3)
	MOVW.P	R5, 4(R3)
	MOVW.P	R6, 4(R3)
	MOVW.P	R7, 4(R3)
	SUB	$1, R0, R0
	TEQ	$0, R0
	BNE	groups
groupsDone:

	// Turn the C flag into a 0/1 borrow word.
	SBC	R1, R1, R1	// R1 = C - 1: 0 when nothing was borrowed, -1 otherwise
	RSB	$0, R1, R1	// R1 = 0 - R1: 1 on borrow, 0 otherwise
	MOVW	R1, c+36(FP)
	RET
|
|
|
|
// func lshVU(z, x []Word, s uint) (c Word)
//
// Shifts the word vector x left by s bits into z, walking from the highest
// word down to the lowest. The bits pushed out of the top word
// (top word >> (32 - s)) are returned in c. When z's length is zero nothing
// is read or written and c is 0.
// NOTE(review): presumably callers keep s within 1..31 - confirm at call sites.
//
// Register use:
//	R0      words still to process, then number of 4-word groups
//	R1      s
//	R2, R3  cursors into x and z, moved by pre-decrement
//	R4      pending output: previous word << s, waiting for the high bits
//	        of the next lower word
//	R5      32 - s
//	R6      number of leftover single words, then a temporary
//	R7-R9   temporaries
TEXT ·lshVU(SB), NOSPLIT, $0
	MOVW	z_len+4(FP), R0
	TEQ	$0, R0
	BEQ	empty
	MOVW	s+24(FP), R1
	MOVW	x_base+12(FP), R2
	MOVW	z_base+0(FP), R3

	// Start both cursors one past the last word; the walk is downward.
	ADD	R0<<2, R2, R2
	ADD	R0<<2, R3, R3

	// Top word: its overflow bits are the result c, the remainder is kept
	// in R4 as the pending output word.
	MOVW.W	-4(R2), R4
	MOVW	$32, R5
	SUB	R1, R5, R5	// R5 = 32 - s
	MOVW	R4>>R5, R6
	MOVW	R4<<R1, R4
	MOVW	R6, c+28(FP)

	// One input word is consumed; split the rest into singles (R6) and
	// groups of four (R0).
	SUB	$1, R0, R0
	AND	$3, R0, R6
	MOVW	R0>>2, R0

	// Leftover words, one per iteration.
	TEQ	$0, R6
	BEQ	singlesDone
singles:
	MOVW.W	-4(R2), R7
	ORR	R7>>R5, R4, R4	// complete the pending word with the new high bits
	MOVW.W	R4, -4(R3)
	MOVW	R7<<R1, R4	// the rest of this word becomes the new pending word
	SUB	$1, R6, R6
	TEQ	$0, R6
	BNE	singles
singlesDone:

	// Remaining words, four per iteration.
	TEQ	$0, R0
	BEQ	groupsDone
groups:
	MOVW.W	-4(R2), R6
	MOVW.W	-4(R2), R7
	MOVW.W	-4(R2), R8
	MOVW.W	-4(R2), R9
	ORR	R6>>R5, R4, R4
	MOVW.W	R4, -4(R3)
	MOVW	R6<<R1, R4
	ORR	R7>>R5, R4, R4
	MOVW.W	R4, -4(R3)
	MOVW	R7<<R1, R4
	ORR	R8>>R5, R4, R4
	MOVW.W	R4, -4(R3)
	MOVW	R8<<R1, R4
	ORR	R9>>R5, R4, R4
	MOVW.W	R4, -4(R3)
	MOVW	R9<<R1, R4
	SUB	$1, R0, R0
	TEQ	$0, R0
	BNE	groups
groupsDone:

	// The lowest output word has no lower neighbour: store the pending word as is.
	MOVW.W	R4, -4(R3)
	RET

empty:
	MOVW	$0, R1
	MOVW	R1, c+28(FP)
	RET
|
|
|
|
// func rshVU(z, x []Word, s uint) (c Word)
//
// Shifts the word vector x right by s bits into z, walking from the lowest
// word up to the highest. The bits pushed out of the lowest word
// (lowest word << (32 - s)) are returned in c. When z's length is zero
// nothing is read or written and c is 0.
// NOTE(review): presumably callers keep s within 1..31 - confirm at call sites.
//
// Register use:
//	R0      words still to process, then number of 4-word groups
//	R1      s
//	R2, R3  cursors into x and z, advanced by post-increment
//	R4      pending output: previous word >> s, waiting for the low bits
//	        of the next higher word
//	R5      32 - s
//	R6      number of leftover single words, then a temporary
//	R7-R9   temporaries
TEXT ·rshVU(SB), NOSPLIT, $0
	MOVW	z_len+4(FP), R0
	TEQ	$0, R0
	BEQ	empty
	MOVW	s+24(FP), R1
	MOVW	x_base+12(FP), R2
	MOVW	z_base+0(FP), R3

	// Lowest word: its underflow bits are the result c, the remainder is
	// kept in R4 as the pending output word.
	MOVW.P	4(R2), R4
	MOVW	$32, R5
	SUB	R1, R5, R5	// R5 = 32 - s
	MOVW	R4<<R5, R6
	MOVW	R4>>R1, R4
	MOVW	R6, c+28(FP)

	// One input word is consumed; split the rest into singles (R6) and
	// groups of four (R0).
	SUB	$1, R0, R0
	AND	$3, R0, R6
	MOVW	R0>>2, R0

	// Leftover words, one per iteration.
	TEQ	$0, R6
	BEQ	singlesDone
singles:
	MOVW.P	4(R2), R7
	ORR	R7<<R5, R4, R4	// complete the pending word with the new low bits
	MOVW.P	R4, 4(R3)
	MOVW	R7>>R1, R4	// the rest of this word becomes the new pending word
	SUB	$1, R6, R6
	TEQ	$0, R6
	BNE	singles
singlesDone:

	// Remaining words, four per iteration.
	TEQ	$0, R0
	BEQ	groupsDone
groups:
	MOVW.P	4(R2), R6
	MOVW.P	4(R2), R7
	MOVW.P	4(R2), R8
	MOVW.P	4(R2), R9
	ORR	R6<<R5, R4, R4
	MOVW.P	R4, 4(R3)
	MOVW	R6>>R1, R4
	ORR	R7<<R5, R4, R4
	MOVW.P	R4, 4(R3)
	MOVW	R7>>R1, R4
	ORR	R8<<R5, R4, R4
	MOVW.P	R4, 4(R3)
	MOVW	R8>>R1, R4
	ORR	R9<<R5, R4, R4
	MOVW.P	R4, 4(R3)
	MOVW	R9>>R1, R4
	SUB	$1, R0, R0
	TEQ	$0, R0
	BNE	groups
groupsDone:

	// The highest output word has no higher neighbour: store the pending word as is.
	MOVW.P	R4, 4(R3)
	RET

empty:
	MOVW	$0, R1
	MOVW	R1, c+28(FP)
	RET
|
|
|
|
// func mulAddVWW(z, x []Word, m, a Word) (c Word)
//
// Multiplies every word of x by m and adds a running carry word that starts
// out as a: the low half of each result is stored into z, the high half
// (plus any carry out of the addition) becomes the carry word for the next
// position. The final carry word is returned in c. The word count is read
// from z's length.
//
// Register use:
//	R0      m
//	R1      running carry word (initially a)
//	R2      number of 4-word groups (z_len >> 2)
//	R3, R4  cursors into x and z, advanced by post-increment
//	R5      number of leftover single words (z_len & 3), then a temporary
//	R6-R8   temporaries
//
// MULLU is used without .S, so the C flag produced by ADD.S is still valid
// when the following ADC / ADC.S consumes it.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVW	m+24(FP), R0
	MOVW	a+28(FP), R1
	MOVW	z_len+4(FP), R2
	MOVW	x_base+12(FP), R3
	MOVW	z_base+0(FP), R4

	// Split the length into singles (R5) and groups of four (R2).
	AND	$3, R2, R5
	MOVW	R2>>2, R2

	// Leftover words, one per iteration.
	TEQ	$0, R5
	BEQ	singlesDone
singles:
	MOVW.P	4(R3), R6
	MULLU	R0, R6, (R7, R6)	// R7:R6 = m * x word
	ADD.S	R1, R6, R6	// low half + carry word
	ADC	$0, R7, R1	// new carry word = high half + C
	MOVW.P	R6, 4(R4)
	SUB	$1, R5, R5
	TEQ	$0, R5
	BNE	singles
singlesDone:

	// Remaining words, four per iteration, handled as two pairs.
	TEQ	$0, R2
	BEQ	groupsDone
groups:
	// First pair.
	MOVW.P	4(R3), R5
	MOVW.P	4(R3), R6
	MULLU	R0, R5, (R7, R5)
	ADD.S	R1, R5, R5
	MULLU	R0, R6, (R8, R6)
	ADC.S	R7, R6, R6	// second low half + first high half + C
	ADC	$0, R8, R1
	MOVW.P	R5, 4(R4)
	MOVW.P	R6, 4(R4)

	// Second pair.
	MOVW.P	4(R3), R5
	MOVW.P	4(R3), R6
	MULLU	R0, R5, (R7, R5)
	ADD.S	R1, R5, R5
	MULLU	R0, R6, (R8, R6)
	ADC.S	R7, R6, R6
	ADC	$0, R8, R1
	MOVW.P	R5, 4(R4)
	MOVW.P	R6, 4(R4)

	SUB	$1, R2, R2
	TEQ	$0, R2
	BNE	groups
groupsDone:

	MOVW	R1, c+32(FP)
	RET
|
|
|
|
// func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
//
// For every position, multiplies the y word by m, adds a running carry word
// that starts out as a, then adds the x word; the low half of the result is
// stored into z and everything above it becomes the carry word for the next
// position. The final carry word is returned in c. The word count is read
// from z's length.
//
// Register use:
//	R0          m
//	R1          running carry word (initially a)
//	R2          number of 4-word groups (z_len >> 2)
//	R3, R4, R5  cursors into x, y and z, advanced by post-increment
//	R6          number of leftover single words (z_len & 3), then a temporary
//	R7-R9, R11, R12  temporaries
//
// MULLU is used without .S, so the C flag produced by ADD.S is still valid
// when the following ADC / ADC.S consumes it.
TEXT ·addMulVVWW(SB), NOSPLIT, $0
	MOVW	m+36(FP), R0
	MOVW	a+40(FP), R1
	MOVW	z_len+4(FP), R2
	MOVW	x_base+12(FP), R3
	MOVW	y_base+24(FP), R4
	MOVW	z_base+0(FP), R5

	// Split the length into singles (R6) and groups of four (R2).
	AND	$3, R2, R6
	MOVW	R2>>2, R2

	// Leftover words, one per iteration.
	TEQ	$0, R6
	BEQ	singlesDone
singles:
	MOVW.P	4(R3), R7
	MOVW.P	4(R4), R8
	// Multiply step: R1:R8 = m * y word + carry word.
	MULLU	R0, R8, (R9, R8)
	ADD.S	R1, R8, R8
	ADC	$0, R9, R1
	// Add step: fold in the x word, pushing any carry into R1.
	ADD.S	R7, R8, R8
	ADC	$0, R1, R1
	MOVW.P	R8, 4(R5)
	SUB	$1, R6, R6
	TEQ	$0, R6
	BNE	singles
singlesDone:

	// Remaining words, four per iteration, handled as two pairs.
	TEQ	$0, R2
	BEQ	groupsDone
groups:
	// First pair: multiply step.
	MOVW.P	4(R3), R6
	MOVW.P	4(R3), R7
	MOVW.P	4(R4), R8
	MOVW.P	4(R4), R9
	MULLU	R0, R8, (R11, R8)
	ADD.S	R1, R8, R8
	MULLU	R0, R9, (R12, R9)
	ADC.S	R11, R9, R9
	ADC	$0, R12, R1
	// First pair: add step.
	ADD.S	R6, R8, R8
	ADC.S	R7, R9, R9
	ADC	$0, R1, R1
	MOVW.P	R8, 4(R5)
	MOVW.P	R9, 4(R5)

	// Second pair: multiply step.
	MOVW.P	4(R3), R6
	MOVW.P	4(R3), R7
	MOVW.P	4(R4), R8
	MOVW.P	4(R4), R9
	MULLU	R0, R8, (R11, R8)
	ADD.S	R1, R8, R8
	MULLU	R0, R9, (R12, R9)
	ADC.S	R11, R9, R9
	ADC	$0, R12, R1
	// Second pair: add step.
	ADD.S	R6, R8, R8
	ADC.S	R7, R9, R9
	ADC	$0, R1, R1
	MOVW.P	R8, 4(R5)
	MOVW.P	R9, 4(R5)

	SUB	$1, R2, R2
	TEQ	$0, R2
	BNE	groups
groupsDone:

	MOVW	R1, c+44(FP)
	RET
|