cmd/compile: use 128-bit arm64 vector ops for Move expansion

Update the Move rewrite rules to use FMOVQload/FMOVQstore and
FLDPQ/FSTPQ for medium-sized copies (16-64 bytes). This generates
fewer, wider instructions than the previous expansion, which used
64-bit LDP/STP pairs.

Executable           Base .text         go1     Change
----------------------------------------------------
asm                     2112308     2105732     -0.31%
cgo                     1826132     1823172     -0.16%
compile                10474868    10460644     -0.14%
cover                   1990036     1985748     -0.22%
fix                     3234116     3226340     -0.24%
link                    2702628     2695316     -0.27%
preprofile               947652      947028     -0.07%
vet                     3140964     3133524     -0.24%

Performance effect on Orange Pi 6 Plus:

          │   orig.out   │               movq.out               │
          │    sec/op    │    sec/op     vs base                │
CopyFat16   0.4711n ± 0%   0.3852n ± 0%  -18.23% (p=0.000 n=10)
CopyFat17   0.7705n ± 0%   0.7705n ± 0%        ~ (p=0.984 n=10)
CopyFat18   0.7703n ± 0%   0.7703n ± 0%        ~ (p=0.771 n=10)
CopyFat19   0.7703n ± 0%   0.7703n ± 0%        ~ (p=0.637 n=10)
CopyFat20   0.7703n ± 0%   0.7704n ± 0%        ~ (p=0.103 n=10)
CopyFat21   0.7703n ± 0%   0.7708n ± 0%        ~ (p=0.505 n=10)
CopyFat22   0.7704n ± 0%   0.7705n ± 0%        ~ (p=0.589 n=10)
CopyFat23   0.7703n ± 0%   0.7703n ± 0%        ~ (p=0.347 n=10)
CopyFat24   0.7704n ± 0%   0.7703n ± 0%        ~ (p=0.383 n=10)
CopyFat25   0.8385n ± 0%   0.6589n ± 0%  -21.41% (p=0.000 n=10)
CopyFat26   0.8386n ± 0%   0.6590n ± 0%  -21.42% (p=0.000 n=10)
CopyFat27   0.8385n ± 0%   0.6590n ± 0%  -21.41% (p=0.000 n=10)
CopyFat28   0.8386n ± 0%   0.6571n ± 0%  -21.65% (p=0.000 n=10)
CopyFat29   0.8385n ± 0%   0.6590n ± 0%  -21.41% (p=0.000 n=10)
CopyFat30   0.8387n ± 0%   0.6591n ± 0%  -21.42% (p=0.000 n=10)
CopyFat31   0.8385n ± 0%   0.6589n ± 0%  -21.42% (p=0.000 n=10)
CopyFat32   0.8318n ± 0%   0.4969n ± 0%  -40.26% (p=0.000 n=10)
CopyFat33   1.1550n ± 0%   0.7705n ± 0%  -33.29% (p=0.000 n=10)
CopyFat34   1.1560n ± 0%   0.7703n ± 0%  -33.37% (p=0.000 n=10)
CopyFat35   1.1550n ± 0%   0.7705n ± 0%  -33.29% (p=0.000 n=10)
CopyFat36   1.1550n ± 0%   0.7704n ± 0%  -33.30% (p=0.000 n=10)
CopyFat37   1.1555n ± 0%   0.7704n ± 0%  -33.33% (p=0.000 n=10)
CopyFat38   1.1550n ± 0%   0.7704n ± 0%  -33.30% (p=0.000 n=10)
CopyFat39   1.1560n ± 0%   0.7703n ± 0%  -33.36% (p=0.000 n=10)
CopyFat40   1.0020n ± 0%   0.7705n ± 0%  -23.10% (p=0.000 n=10)
CopyFat41   1.2060n ± 0%   0.7703n ± 0%  -36.12% (p=0.000 n=10)
CopyFat42   1.2060n ± 0%   0.7704n ± 0%  -36.12% (p=0.000 n=10)
CopyFat43   1.2060n ± 0%   0.7705n ± 0%  -36.11% (p=0.000 n=10)
CopyFat44   1.2060n ± 0%   0.7704n ± 0%  -36.12% (p=0.000 n=10)
CopyFat45   1.2060n ± 0%   0.7704n ± 0%  -36.12% (p=0.000 n=10)
CopyFat46   1.2060n ± 0%   0.7703n ± 0%  -36.13% (p=0.000 n=10)
CopyFat47   1.2060n ± 0%   0.7703n ± 0%  -36.12% (p=0.000 n=10)
CopyFat48   1.2060n ± 0%   0.7703n ± 0%  -36.13% (p=0.000 n=10)
CopyFat49   1.3620n ± 0%   0.8622n ± 0%  -36.70% (p=0.000 n=10)
CopyFat50   1.3620n ± 0%   0.8621n ± 0%  -36.70% (p=0.000 n=10)
CopyFat51   1.3620n ± 0%   0.8622n ± 0%  -36.70% (p=0.000 n=10)
CopyFat52   1.3620n ± 0%   0.8623n ± 0%  -36.69% (p=0.000 n=10)
CopyFat53   1.3620n ± 0%   0.8621n ± 0%  -36.70% (p=0.000 n=10)
CopyFat54   1.3620n ± 0%   0.8622n ± 0%  -36.70% (p=0.000 n=10)
CopyFat55   1.3620n ± 0%   0.8620n ± 0%  -36.71% (p=0.000 n=10)
CopyFat56   1.3120n ± 0%   0.8622n ± 0%  -34.28% (p=0.000 n=10)
CopyFat57   1.5905n ± 0%   0.8621n ± 0%  -45.80% (p=0.000 n=10)
CopyFat58   1.5830n ± 1%   0.8622n ± 0%  -45.53% (p=0.000 n=10)
CopyFat59   1.5865n ± 1%   0.8621n ± 0%  -45.66% (p=0.000 n=10)
CopyFat60   1.5720n ± 1%   0.8622n ± 0%  -45.15% (p=0.000 n=10)
CopyFat61   1.5900n ± 1%   0.8621n ± 0%  -45.78% (p=0.000 n=10)
CopyFat62   1.5890n ± 0%   0.8622n ± 0%  -45.74% (p=0.000 n=10)
CopyFat63   1.5900n ± 1%   0.8620n ± 0%  -45.78% (p=0.000 n=10)
CopyFat64   1.5440n ± 0%   0.8568n ± 0%  -44.51% (p=0.000 n=10)
geomean      1.093n        0.7636n       -30.13%

Performance effect on Kunpeng 920C:
goos: linux
goarch: arm64
pkg: runtime
          │   orig.out   │               movq.out               │
          │    sec/op    │    sec/op     vs base                │
CopyFat16   0.4892n ± 1%   0.5072n ± 0%   +3.68% (p=0.000 n=10)
CopyFat17   0.6394n ± 0%   0.4638n ± 0%  -27.47% (p=0.000 n=10)
CopyFat18   0.6394n ± 0%   0.4638n ± 0%  -27.46% (p=0.000 n=10)
CopyFat19   0.6395n ± 0%   0.4638n ± 0%  -27.48% (p=0.000 n=10)
CopyFat20   0.6393n ± 0%   0.4638n ± 0%  -27.45% (p=0.000 n=10)
CopyFat21   0.6394n ± 0%   0.4637n ± 0%  -27.48% (p=0.000 n=10)
CopyFat22   0.6395n ± 0%   0.4638n ± 0%  -27.47% (p=0.000 n=10)
CopyFat23   0.6395n ± 0%   0.4638n ± 0%  -27.47% (p=0.000 n=10)
CopyFat24   0.6091n ± 0%   0.4639n ± 0%  -23.84% (p=0.000 n=10)
CopyFat25   0.9109n ± 0%   0.4674n ± 0%  -48.69% (p=0.000 n=10)
CopyFat26   0.9107n ± 0%   0.4674n ± 0%  -48.68% (p=0.000 n=10)
CopyFat27   0.9108n ± 0%   0.4674n ± 0%  -48.69% (p=0.000 n=10)
CopyFat28   0.9109n ± 0%   0.4674n ± 0%  -48.69% (p=0.000 n=10)
CopyFat29   0.9110n ± 0%   0.4673n ± 0%  -48.70% (p=0.000 n=10)
CopyFat30   0.9109n ± 0%   0.4673n ± 0%  -48.70% (p=0.000 n=10)
CopyFat31   0.9110n ± 0%   0.4674n ± 0%  -48.69% (p=0.000 n=10)
CopyFat32   0.6845n ± 0%   0.4845n ± 1%  -29.21% (p=0.000 n=10)
CopyFat33   0.9130n ± 0%   0.9117n ± 0%   -0.14% (p=0.000 n=10)
CopyFat34   0.9131n ± 0%   0.9118n ± 0%   -0.14% (p=0.001 n=10)
CopyFat35   0.9131n ± 0%   0.9117n ± 0%   -0.15% (p=0.001 n=10)
CopyFat36   0.9129n ± 0%   0.9117n ± 0%   -0.14% (p=0.003 n=10)
CopyFat37   0.9129n ± 0%   0.9117n ± 0%   -0.14% (p=0.000 n=10)
CopyFat38   0.9130n ± 0%   0.9118n ± 0%   -0.14% (p=0.000 n=10)
CopyFat39   0.9131n ± 0%   0.9118n ± 0%   -0.15% (p=0.000 n=10)
CopyFat40   0.9112n ± 0%   0.9118n ± 0%   +0.07% (p=0.027 n=10)
CopyFat41   1.1390n ± 0%   0.9118n ± 0%  -19.95% (p=0.000 n=10)
CopyFat42   1.1390n ± 0%   0.9118n ± 0%  -19.95% (p=0.000 n=10)
CopyFat43   1.1390n ± 0%   0.9116n ± 0%  -19.96% (p=0.000 n=10)
CopyFat44   1.1390n ± 0%   0.9119n ± 0%  -19.94% (p=0.000 n=10)
CopyFat45   1.1390n ± 0%   0.9118n ± 0%  -19.95% (p=0.000 n=10)
CopyFat46   1.1390n ± 0%   0.9118n ± 0%  -19.95% (p=0.000 n=10)
CopyFat47   1.1390n ± 0%   0.9117n ± 0%  -19.96% (p=0.000 n=10)
CopyFat48   0.9111n ± 0%   0.9116n ± 0%   +0.06% (p=0.002 n=10)
CopyFat49   1.2160n ± 0%   0.9292n ± 0%  -23.59% (p=0.000 n=10)
CopyFat50   1.2160n ± 0%   0.9302n ± 0%  -23.50% (p=0.000 n=10)
CopyFat51   1.2160n ± 0%   0.9292n ± 0%  -23.59% (p=0.000 n=10)
CopyFat52   1.2160n ± 0%   0.9302n ± 0%  -23.50% (p=0.000 n=10)
CopyFat53   1.2160n ± 0%   0.9293n ± 0%  -23.58% (p=0.000 n=10)
CopyFat54   1.2160n ± 0%   0.9302n ± 0%  -23.50% (p=0.000 n=10)
CopyFat55   1.2160n ± 0%   0.9292n ± 0%  -23.59% (p=0.000 n=10)
CopyFat56   1.1480n ± 0%   0.9303n ± 0%  -18.96% (p=0.000 n=10)
CopyFat57   1.3690n ± 0%   0.9293n ± 0%  -32.12% (p=0.000 n=10)
CopyFat58   1.3690n ± 0%   0.9303n ± 0%  -32.05% (p=0.000 n=10)
CopyFat59   1.3690n ± 0%   0.9293n ± 0%  -32.12% (p=0.000 n=10)
CopyFat60   1.3690n ± 0%   0.9303n ± 0%  -32.05% (p=0.000 n=10)
CopyFat61   1.3690n ± 0%   0.9293n ± 0%  -32.12% (p=0.000 n=10)
CopyFat62   1.3690n ± 0%   0.9303n ± 0%  -32.05% (p=0.000 n=10)
CopyFat63   1.3690n ± 0%   0.9293n ± 0%  -32.12% (p=0.000 n=10)
CopyFat64   1.1470n ± 0%   0.5742n ± 0%  -49.94% (p=0.000 n=10)
geomean     0.9710n        0.7214n       -25.70%

Change-Id: Iecfe52fde1d431a1e4503cd848813a67f3896512
Reviewed-on: https://go-review.googlesource.com/c/go/+/738261
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
Alexander Musman
2026-01-21 23:29:59 +03:00
committed by Keith Randall
parent ab1a2c8874
commit e0ebb4c646
2 changed files with 80 additions and 176 deletions

View File

@@ -434,33 +434,25 @@
(Move [15] dst src mem) =>
(MOVDstore [7] dst (MOVDload [7] src mem)
(MOVDstore dst (MOVDload src mem) mem))
(Move [16] dst src mem) =>
(STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem)
(Move [16] dst src mem) => (FMOVQstore dst (FMOVQload src mem) mem)
(Move [s] dst src mem) && s > 16 && s <= 24 =>
(MOVDstore [int32(s-8)] dst (MOVDload [int32(s-8)] src mem)
(STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem))
(Move [s] dst src mem) && s > 24 && s <= 32 =>
(STP [int32(s-16)] dst (Select0 <typ.UInt64> (LDP [int32(s-16)] src mem)) (Select1 <typ.UInt64> (LDP [int32(s-16)] src mem))
(STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem))
(FMOVQstore dst (FMOVQload src mem) mem))
(Move [s] dst src mem) && s > 24 && s < 32 =>
(FMOVQstore [int32(s-16)] dst (FMOVQload [int32(s-16)] src mem)
(FMOVQstore dst (FMOVQload src mem) mem))
(Move [32] dst src mem) =>
(FSTPQ dst (Select0 <typ.Vec128> (FLDPQ src mem)) (Select1 <typ.Vec128> (FLDPQ src mem)) mem)
(Move [s] dst src mem) && s > 32 && s <= 40 =>
(MOVDstore [int32(s-8)] dst (MOVDload [int32(s-8)] src mem)
(STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem))
(STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem)))
(FSTPQ dst (Select0 <typ.Vec128> (FLDPQ src mem)) (Select1 <typ.Vec128> (FLDPQ src mem)) mem))
(Move [s] dst src mem) && s > 40 && s <= 48 =>
(STP [int32(s-16)] dst (Select0 <typ.UInt64> (LDP [int32(s-16)] src mem)) (Select1 <typ.UInt64> (LDP [int32(s-16)] src mem))
(STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem))
(STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem)))
(Move [s] dst src mem) && s > 48 && s <= 56 =>
(MOVDstore [int32(s-8)] dst (MOVDload [int32(s-8)] src mem)
(STP [32] dst (Select0 <typ.UInt64> (LDP [32] src mem)) (Select1 <typ.UInt64> (LDP [32] src mem))
(STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem))
(STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem))))
(Move [s] dst src mem) && s > 56 && s <= 64 =>
(STP [int32(s-16)] dst (Select0 <typ.UInt64> (LDP [int32(s-16)] src mem)) (Select1 <typ.UInt64> (LDP [int32(s-16)] src mem))
(STP [32] dst (Select0 <typ.UInt64> (LDP [32] src mem)) (Select1 <typ.UInt64> (LDP [32] src mem))
(STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem))
(STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem))))
(FMOVQstore [int32(s-16)] dst (FMOVQload [int32(s-16)] src mem)
(FSTPQ dst (Select0 <typ.Vec128> (FLDPQ src mem)) (Select1 <typ.Vec128> (FLDPQ src mem)) mem))
(Move [s] dst src mem) && s > 48 && s <= 64 =>
(FSTPQ [int32(s-32)] dst (Select0 <typ.Vec128> (FLDPQ [int32(s-32)] src mem)) (Select1 <typ.Vec128> (FLDPQ [int32(s-32)] src mem))
(FSTPQ dst (Select0 <typ.Vec128> (FLDPQ src mem)) (Select1 <typ.Vec128> (FLDPQ src mem)) mem))
(Move [s] dst src mem) && s > 64 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem)
(Move [s] dst src mem) && s >= 192 && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem)

View File

@@ -19817,7 +19817,7 @@ func rewriteValueARM64_OpMove(v *Value) bool {
return true
}
// match: (Move [16] dst src mem)
// result: (STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem)
// result: (FMOVQstore dst (FMOVQload src mem) mem)
for {
if auxIntToInt64(v.AuxInt) != 16 {
break
@@ -19825,19 +19825,15 @@ func rewriteValueARM64_OpMove(v *Value) bool {
dst := v_0
src := v_1
mem := v_2
v.reset(OpARM64STP)
v0 := b.NewValue0(v.Pos, OpSelect0, typ.UInt64)
v1 := b.NewValue0(v.Pos, OpARM64LDP, types.NewTuple(typ.UInt64, typ.UInt64))
v1.AddArg2(src, mem)
v0.AddArg(v1)
v2 := b.NewValue0(v.Pos, OpSelect1, typ.UInt64)
v2.AddArg(v1)
v.AddArg4(dst, v0, v2, mem)
v.reset(OpARM64FMOVQstore)
v0 := b.NewValue0(v.Pos, OpARM64FMOVQload, typ.Vec128)
v0.AddArg2(src, mem)
v.AddArg3(dst, v0, mem)
return true
}
// match: (Move [s] dst src mem)
// cond: s > 16 && s <= 24
// result: (MOVDstore [int32(s-8)] dst (MOVDload [int32(s-8)] src mem) (STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem))
// result: (MOVDstore [int32(s-8)] dst (MOVDload [int32(s-8)] src mem) (FMOVQstore dst (FMOVQload src mem) mem))
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
@@ -19851,51 +19847,58 @@ func rewriteValueARM64_OpMove(v *Value) bool {
v0 := b.NewValue0(v.Pos, OpARM64MOVDload, typ.UInt64)
v0.AuxInt = int32ToAuxInt(int32(s - 8))
v0.AddArg2(src, mem)
v1 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
v2 := b.NewValue0(v.Pos, OpSelect0, typ.UInt64)
v3 := b.NewValue0(v.Pos, OpARM64LDP, types.NewTuple(typ.UInt64, typ.UInt64))
v3.AddArg2(src, mem)
v2.AddArg(v3)
v4 := b.NewValue0(v.Pos, OpSelect1, typ.UInt64)
v4.AddArg(v3)
v1.AddArg4(dst, v2, v4, mem)
v1 := b.NewValue0(v.Pos, OpARM64FMOVQstore, types.TypeMem)
v2 := b.NewValue0(v.Pos, OpARM64FMOVQload, typ.Vec128)
v2.AddArg2(src, mem)
v1.AddArg3(dst, v2, mem)
v.AddArg3(dst, v0, v1)
return true
}
// match: (Move [s] dst src mem)
// cond: s > 24 && s <= 32
// result: (STP [int32(s-16)] dst (Select0 <typ.UInt64> (LDP [int32(s-16)] src mem)) (Select1 <typ.UInt64> (LDP [int32(s-16)] src mem)) (STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem))
// cond: s > 24 && s < 32
// result: (FMOVQstore [int32(s-16)] dst (FMOVQload [int32(s-16)] src mem) (FMOVQstore dst (FMOVQload src mem) mem))
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
src := v_1
mem := v_2
if !(s > 24 && s <= 32) {
if !(s > 24 && s < 32) {
break
}
v.reset(OpARM64STP)
v.reset(OpARM64FMOVQstore)
v.AuxInt = int32ToAuxInt(int32(s - 16))
v0 := b.NewValue0(v.Pos, OpSelect0, typ.UInt64)
v1 := b.NewValue0(v.Pos, OpARM64LDP, types.NewTuple(typ.UInt64, typ.UInt64))
v1.AuxInt = int32ToAuxInt(int32(s - 16))
v0 := b.NewValue0(v.Pos, OpARM64FMOVQload, typ.Vec128)
v0.AuxInt = int32ToAuxInt(int32(s - 16))
v0.AddArg2(src, mem)
v1 := b.NewValue0(v.Pos, OpARM64FMOVQstore, types.TypeMem)
v2 := b.NewValue0(v.Pos, OpARM64FMOVQload, typ.Vec128)
v2.AddArg2(src, mem)
v1.AddArg3(dst, v2, mem)
v.AddArg3(dst, v0, v1)
return true
}
// match: (Move [32] dst src mem)
// result: (FSTPQ dst (Select0 <typ.Vec128> (FLDPQ src mem)) (Select1 <typ.Vec128> (FLDPQ src mem)) mem)
for {
if auxIntToInt64(v.AuxInt) != 32 {
break
}
dst := v_0
src := v_1
mem := v_2
v.reset(OpARM64FSTPQ)
v0 := b.NewValue0(v.Pos, OpSelect0, typ.Vec128)
v1 := b.NewValue0(v.Pos, OpARM64FLDPQ, types.NewTuple(typ.Vec128, typ.Vec128))
v1.AddArg2(src, mem)
v0.AddArg(v1)
v2 := b.NewValue0(v.Pos, OpSelect1, typ.UInt64)
v2 := b.NewValue0(v.Pos, OpSelect1, typ.Vec128)
v2.AddArg(v1)
v3 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
v4 := b.NewValue0(v.Pos, OpSelect0, typ.UInt64)
v5 := b.NewValue0(v.Pos, OpARM64LDP, types.NewTuple(typ.UInt64, typ.UInt64))
v5.AddArg2(src, mem)
v4.AddArg(v5)
v6 := b.NewValue0(v.Pos, OpSelect1, typ.UInt64)
v6.AddArg(v5)
v3.AddArg4(dst, v4, v6, mem)
v.AddArg4(dst, v0, v2, v3)
v.AddArg4(dst, v0, v2, mem)
return true
}
// match: (Move [s] dst src mem)
// cond: s > 32 && s <= 40
// result: (MOVDstore [int32(s-8)] dst (MOVDload [int32(s-8)] src mem) (STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem)) (STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem)))
// result: (MOVDstore [int32(s-8)] dst (MOVDload [int32(s-8)] src mem) (FSTPQ dst (Select0 <typ.Vec128> (FLDPQ src mem)) (Select1 <typ.Vec128> (FLDPQ src mem)) mem))
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
@@ -19909,30 +19912,20 @@ func rewriteValueARM64_OpMove(v *Value) bool {
v0 := b.NewValue0(v.Pos, OpARM64MOVDload, typ.UInt64)
v0.AuxInt = int32ToAuxInt(int32(s - 8))
v0.AddArg2(src, mem)
v1 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
v1.AuxInt = int32ToAuxInt(16)
v2 := b.NewValue0(v.Pos, OpSelect0, typ.UInt64)
v3 := b.NewValue0(v.Pos, OpARM64LDP, types.NewTuple(typ.UInt64, typ.UInt64))
v3.AuxInt = int32ToAuxInt(16)
v1 := b.NewValue0(v.Pos, OpARM64FSTPQ, types.TypeMem)
v2 := b.NewValue0(v.Pos, OpSelect0, typ.Vec128)
v3 := b.NewValue0(v.Pos, OpARM64FLDPQ, types.NewTuple(typ.Vec128, typ.Vec128))
v3.AddArg2(src, mem)
v2.AddArg(v3)
v4 := b.NewValue0(v.Pos, OpSelect1, typ.UInt64)
v4 := b.NewValue0(v.Pos, OpSelect1, typ.Vec128)
v4.AddArg(v3)
v5 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
v6 := b.NewValue0(v.Pos, OpSelect0, typ.UInt64)
v7 := b.NewValue0(v.Pos, OpARM64LDP, types.NewTuple(typ.UInt64, typ.UInt64))
v7.AddArg2(src, mem)
v6.AddArg(v7)
v8 := b.NewValue0(v.Pos, OpSelect1, typ.UInt64)
v8.AddArg(v7)
v5.AddArg4(dst, v6, v8, mem)
v1.AddArg4(dst, v2, v4, v5)
v1.AddArg4(dst, v2, v4, mem)
v.AddArg3(dst, v0, v1)
return true
}
// match: (Move [s] dst src mem)
// cond: s > 40 && s <= 48
// result: (STP [int32(s-16)] dst (Select0 <typ.UInt64> (LDP [int32(s-16)] src mem)) (Select1 <typ.UInt64> (LDP [int32(s-16)] src mem)) (STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem)) (STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem)))
// result: (FMOVQstore [int32(s-16)] dst (FMOVQload [int32(s-16)] src mem) (FSTPQ dst (Select0 <typ.Vec128> (FLDPQ src mem)) (Select1 <typ.Vec128> (FLDPQ src mem)) mem))
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
@@ -19941,131 +19934,50 @@ func rewriteValueARM64_OpMove(v *Value) bool {
if !(s > 40 && s <= 48) {
break
}
v.reset(OpARM64STP)
v.reset(OpARM64FMOVQstore)
v.AuxInt = int32ToAuxInt(int32(s - 16))
v0 := b.NewValue0(v.Pos, OpSelect0, typ.UInt64)
v1 := b.NewValue0(v.Pos, OpARM64LDP, types.NewTuple(typ.UInt64, typ.UInt64))
v1.AuxInt = int32ToAuxInt(int32(s - 16))
v1.AddArg2(src, mem)
v0.AddArg(v1)
v2 := b.NewValue0(v.Pos, OpSelect1, typ.UInt64)
v2.AddArg(v1)
v3 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
v3.AuxInt = int32ToAuxInt(16)
v4 := b.NewValue0(v.Pos, OpSelect0, typ.UInt64)
v5 := b.NewValue0(v.Pos, OpARM64LDP, types.NewTuple(typ.UInt64, typ.UInt64))
v5.AuxInt = int32ToAuxInt(16)
v5.AddArg2(src, mem)
v4.AddArg(v5)
v6 := b.NewValue0(v.Pos, OpSelect1, typ.UInt64)
v6.AddArg(v5)
v7 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
v8 := b.NewValue0(v.Pos, OpSelect0, typ.UInt64)
v9 := b.NewValue0(v.Pos, OpARM64LDP, types.NewTuple(typ.UInt64, typ.UInt64))
v9.AddArg2(src, mem)
v8.AddArg(v9)
v10 := b.NewValue0(v.Pos, OpSelect1, typ.UInt64)
v10.AddArg(v9)
v7.AddArg4(dst, v8, v10, mem)
v3.AddArg4(dst, v4, v6, v7)
v.AddArg4(dst, v0, v2, v3)
return true
}
// match: (Move [s] dst src mem)
// cond: s > 48 && s <= 56
// result: (MOVDstore [int32(s-8)] dst (MOVDload [int32(s-8)] src mem) (STP [32] dst (Select0 <typ.UInt64> (LDP [32] src mem)) (Select1 <typ.UInt64> (LDP [32] src mem)) (STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem)) (STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem))))
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
src := v_1
mem := v_2
if !(s > 48 && s <= 56) {
break
}
v.reset(OpARM64MOVDstore)
v.AuxInt = int32ToAuxInt(int32(s - 8))
v0 := b.NewValue0(v.Pos, OpARM64MOVDload, typ.UInt64)
v0.AuxInt = int32ToAuxInt(int32(s - 8))
v0 := b.NewValue0(v.Pos, OpARM64FMOVQload, typ.Vec128)
v0.AuxInt = int32ToAuxInt(int32(s - 16))
v0.AddArg2(src, mem)
v1 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
v1.AuxInt = int32ToAuxInt(32)
v2 := b.NewValue0(v.Pos, OpSelect0, typ.UInt64)
v3 := b.NewValue0(v.Pos, OpARM64LDP, types.NewTuple(typ.UInt64, typ.UInt64))
v3.AuxInt = int32ToAuxInt(32)
v1 := b.NewValue0(v.Pos, OpARM64FSTPQ, types.TypeMem)
v2 := b.NewValue0(v.Pos, OpSelect0, typ.Vec128)
v3 := b.NewValue0(v.Pos, OpARM64FLDPQ, types.NewTuple(typ.Vec128, typ.Vec128))
v3.AddArg2(src, mem)
v2.AddArg(v3)
v4 := b.NewValue0(v.Pos, OpSelect1, typ.UInt64)
v4 := b.NewValue0(v.Pos, OpSelect1, typ.Vec128)
v4.AddArg(v3)
v5 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
v5.AuxInt = int32ToAuxInt(16)
v6 := b.NewValue0(v.Pos, OpSelect0, typ.UInt64)
v7 := b.NewValue0(v.Pos, OpARM64LDP, types.NewTuple(typ.UInt64, typ.UInt64))
v7.AuxInt = int32ToAuxInt(16)
v7.AddArg2(src, mem)
v6.AddArg(v7)
v8 := b.NewValue0(v.Pos, OpSelect1, typ.UInt64)
v8.AddArg(v7)
v9 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
v10 := b.NewValue0(v.Pos, OpSelect0, typ.UInt64)
v11 := b.NewValue0(v.Pos, OpARM64LDP, types.NewTuple(typ.UInt64, typ.UInt64))
v11.AddArg2(src, mem)
v10.AddArg(v11)
v12 := b.NewValue0(v.Pos, OpSelect1, typ.UInt64)
v12.AddArg(v11)
v9.AddArg4(dst, v10, v12, mem)
v5.AddArg4(dst, v6, v8, v9)
v1.AddArg4(dst, v2, v4, v5)
v1.AddArg4(dst, v2, v4, mem)
v.AddArg3(dst, v0, v1)
return true
}
// match: (Move [s] dst src mem)
// cond: s > 56 && s <= 64
// result: (STP [int32(s-16)] dst (Select0 <typ.UInt64> (LDP [int32(s-16)] src mem)) (Select1 <typ.UInt64> (LDP [int32(s-16)] src mem)) (STP [32] dst (Select0 <typ.UInt64> (LDP [32] src mem)) (Select1 <typ.UInt64> (LDP [32] src mem)) (STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem)) (STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem))))
// cond: s > 48 && s <= 64
// result: (FSTPQ [int32(s-32)] dst (Select0 <typ.Vec128> (FLDPQ [int32(s-32)] src mem)) (Select1 <typ.Vec128> (FLDPQ [int32(s-32)] src mem)) (FSTPQ dst (Select0 <typ.Vec128> (FLDPQ src mem)) (Select1 <typ.Vec128> (FLDPQ src mem)) mem))
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
src := v_1
mem := v_2
if !(s > 56 && s <= 64) {
if !(s > 48 && s <= 64) {
break
}
v.reset(OpARM64STP)
v.AuxInt = int32ToAuxInt(int32(s - 16))
v0 := b.NewValue0(v.Pos, OpSelect0, typ.UInt64)
v1 := b.NewValue0(v.Pos, OpARM64LDP, types.NewTuple(typ.UInt64, typ.UInt64))
v1.AuxInt = int32ToAuxInt(int32(s - 16))
v.reset(OpARM64FSTPQ)
v.AuxInt = int32ToAuxInt(int32(s - 32))
v0 := b.NewValue0(v.Pos, OpSelect0, typ.Vec128)
v1 := b.NewValue0(v.Pos, OpARM64FLDPQ, types.NewTuple(typ.Vec128, typ.Vec128))
v1.AuxInt = int32ToAuxInt(int32(s - 32))
v1.AddArg2(src, mem)
v0.AddArg(v1)
v2 := b.NewValue0(v.Pos, OpSelect1, typ.UInt64)
v2 := b.NewValue0(v.Pos, OpSelect1, typ.Vec128)
v2.AddArg(v1)
v3 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
v3.AuxInt = int32ToAuxInt(32)
v4 := b.NewValue0(v.Pos, OpSelect0, typ.UInt64)
v5 := b.NewValue0(v.Pos, OpARM64LDP, types.NewTuple(typ.UInt64, typ.UInt64))
v5.AuxInt = int32ToAuxInt(32)
v3 := b.NewValue0(v.Pos, OpARM64FSTPQ, types.TypeMem)
v4 := b.NewValue0(v.Pos, OpSelect0, typ.Vec128)
v5 := b.NewValue0(v.Pos, OpARM64FLDPQ, types.NewTuple(typ.Vec128, typ.Vec128))
v5.AddArg2(src, mem)
v4.AddArg(v5)
v6 := b.NewValue0(v.Pos, OpSelect1, typ.UInt64)
v6 := b.NewValue0(v.Pos, OpSelect1, typ.Vec128)
v6.AddArg(v5)
v7 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
v7.AuxInt = int32ToAuxInt(16)
v8 := b.NewValue0(v.Pos, OpSelect0, typ.UInt64)
v9 := b.NewValue0(v.Pos, OpARM64LDP, types.NewTuple(typ.UInt64, typ.UInt64))
v9.AuxInt = int32ToAuxInt(16)
v9.AddArg2(src, mem)
v8.AddArg(v9)
v10 := b.NewValue0(v.Pos, OpSelect1, typ.UInt64)
v10.AddArg(v9)
v11 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
v12 := b.NewValue0(v.Pos, OpSelect0, typ.UInt64)
v13 := b.NewValue0(v.Pos, OpARM64LDP, types.NewTuple(typ.UInt64, typ.UInt64))
v13.AddArg2(src, mem)
v12.AddArg(v13)
v14 := b.NewValue0(v.Pos, OpSelect1, typ.UInt64)
v14.AddArg(v13)
v11.AddArg4(dst, v12, v14, mem)
v7.AddArg4(dst, v8, v10, v11)
v3.AddArg4(dst, v4, v6, v7)
v3.AddArg4(dst, v4, v6, mem)
v.AddArg4(dst, v0, v2, v3)
return true
}