cmd/compile: on arm64 pair a load with a load in a subsequent block

Look into the following block(s) for a load that can be paired with
the load we're trying to pair up.

This particularly helps the generated equality functions. Instead of doing

MOVD x(R0), R2
MOVD x(R1), R3
CMP R2, R3
BNE noteq
MOVD x+8(R0), R2
MOVD x+8(R1), R3
CMP R2, R3
BNE noteq

we do

LDP x(R0), (R2, R4)
LDP x(R1), (R3, R5)
CMP R2, R3
BNE noteq
CMP R4, R5
BNE noteq

Removes 5296 bytes of code from cmd/go.

Change-Id: I6368686892ac944783c8b07ed7252126d1ef4031
Reviewed-on: https://go-review.googlesource.com/c/go/+/740741
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Keith Randall <khr@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
Keith Randall
2026-01-30 14:16:40 -08:00
parent 51abbb12c4
commit c8df1410d5
2 changed files with 121 additions and 0 deletions

View File

@@ -7,6 +7,7 @@ package ssa
import (
"cmd/compile/internal/ir"
"cmd/compile/internal/types"
"cmd/internal/obj"
"slices"
)
@@ -206,6 +207,117 @@ func pairLoads(f *Func) {
i++ // Skip y next time around the loop.
}
}
// Try to pair a load with a load from a subsequent block.
// Note that this is always safe to do if the memory arguments match.
// (But see the write barrier test case below.)
type nextBlockKey struct {
op Op
ptr ID
mem ID
auxInt int64
aux any
}
nextBlock := map[nextBlockKey]*Value{}
for _, b := range f.Blocks {
if memoryBarrierTest(b) {
// TODO: Do we really need to skip write barrier test blocks?
// type T struct {
// a *byte
// b int
// }
// func f(t *T) int {
// r := t.b
// t.a = nil
// return r
// }
// This would issue a single LDP for both the t.a and t.b fields,
// *before* we check the write barrier flag. (We load the t.a field
// to put it in the write barrier buffer.) Not sure if that is ok.
continue
}
// Find loads in the next block(s) that we can move to this one.
// TODO: could maybe look further than just one successor hop.
clear(nextBlock)
for _, e := range b.Succs {
if len(e.b.Preds) > 1 {
continue
}
for _, v := range e.b.Values {
info := pairableLoads[v.Op]
if info.width == 0 {
continue
}
if !offsetOk(v.Aux, v.AuxInt, info.width) {
continue // not advisable
}
nextBlock[nextBlockKey{op: v.Op, ptr: v.Args[0].ID, mem: v.Args[1].ID, auxInt: v.AuxInt, aux: v.Aux}] = v
}
}
if len(nextBlock) == 0 {
continue
}
// don't move too many loads. Each requires a register across a basic block boundary.
const maxMoved = 4
nMoved := 0
for i := len(b.Values) - 1; i >= 0 && nMoved < maxMoved; i-- {
x := b.Values[i]
info := pairableLoads[x.Op]
if info.width == 0 {
continue
}
if !offsetOk(x.Aux, x.AuxInt, info.width) {
continue // not advisable
}
key := nextBlockKey{op: x.Op, ptr: x.Args[0].ID, mem: x.Args[1].ID, auxInt: x.AuxInt + info.width, aux: x.Aux}
if y := nextBlock[key]; y != nil {
delete(nextBlock, key)
// Make the 2-register load.
load := b.NewValue2IA(x.Pos, info.pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1])
// Modify x to be (Select0 load).
x.reset(OpSelect0)
x.SetArgs1(load)
// Modify y to be (Copy (Select1 load)).
// Note: the Select* needs to live in the load's block, not y's block.
y.reset(OpCopy)
y.SetArgs1(b.NewValue1(y.Pos, OpSelect1, y.Type, load))
nMoved++
continue
}
key.auxInt = x.AuxInt - info.width
if y := nextBlock[key]; y != nil {
delete(nextBlock, key)
// Make the 2-register load.
load := b.NewValue2IA(x.Pos, info.pair, types.NewTuple(y.Type, x.Type), y.AuxInt, x.Aux, x.Args[0], x.Args[1])
// Modify x to be (Select1 load).
x.reset(OpSelect1)
x.SetArgs1(load)
// Modify y to be (Copy (Select0 load)).
y.reset(OpCopy)
y.SetArgs1(b.NewValue1(y.Pos, OpSelect0, y.Type, load))
nMoved++
continue
}
}
}
}
// memoryBarrierTest reports whether b is an ARM64 NZW block whose control
// value is a MOVWUload with the symbol "runtime.writeBarrier" as its Aux.
// Any other block kind, control op, or Aux yields false.
func memoryBarrierTest(b *Block) bool {
	if b.Kind == BlockARM64NZW {
		// Only a 32-bit unsigned load used as the branch condition qualifies.
		if ctl := b.Controls[0]; ctl.Op == OpARM64MOVWUload {
			sym, isSym := ctl.Aux.(*obj.LSym)
			return isSym && sym.Name == "runtime.writeBarrier"
		}
	}
	return false
}
func pairStores(f *Func) {

View File

@@ -1027,6 +1027,15 @@ func dwloadResult2(p *[2]int64) (int64, int64) {
return p[1], p[0]
}
// dwloadConditional reads p[0] unconditionally and p[1] only on the path
// where x != 0. The arm64 directive below expects a single LDP from (R0)
// into the register pair (R0, R1), i.e. one paired load covering both
// elements even though the second read sits behind the branch.
func dwloadConditional(p *[2]int64) (int64, int64) {
// arm64:"LDP \\(R0\\), \\(R0, R1\\)"
x := p[0]
if x == 0 {
return x, 0
}
return x, p[1]
}
// ---------------------------------- //
// Arm64 double-register stores //
// ---------------------------------- //