cmd/compile: (amd64) optimize float32(round64(float64(x)))

This is not a complete fix, because other architectures
still remain to be done.

Updates #75463.

Change-Id: I3d7754ce4a26af0f5c4ef0be1254d164e68f8442
Reviewed-on: https://go-review.googlesource.com/c/go/+/729160
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
This commit is contained in:
David Chase
2025-12-10 16:05:55 -05:00
parent 831c489f9c
commit 7336381cd1
7 changed files with 126 additions and 1 deletions

View File

@@ -1491,7 +1491,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
}
case ssa.OpAMD64LoweredRound32F, ssa.OpAMD64LoweredRound64F:
// input is already rounded
case ssa.OpAMD64ROUNDSD:
case ssa.OpAMD64ROUNDSD, ssa.OpAMD64ROUNDSS:
p := s.Prog(v.Op.Asm())
val := v.AuxInt
// 0 means math.RoundToEven, 1 Floor, 2 Ceil, 3 Trunc

View File

@@ -124,6 +124,8 @@
(Ceil x) => (ROUNDSD [2] x)
(Trunc x) => (ROUNDSD [3] x)
(CVTSD2SS (ROUNDSD [c] (CVTSS2SD x))) => (ROUNDSS [c] x)
(FMA x y z) => (VFMADD231SD z x y)
// Lowering extension

View File

@@ -798,6 +798,7 @@ func init() {
// ROUNDSD instruction is only guaranteed to be available if GOAMD64>=v2.
// For GOAMD64<v2, any use must be preceded by a successful check of runtime.x86HasSSE41.
{name: "ROUNDSD", argLength: 1, reg: fp11, aux: "Int8", asm: "ROUNDSD"},
{name: "ROUNDSS", argLength: 1, reg: fp11, aux: "Int8", asm: "ROUNDSS"},
// See issue #71204 for why we need these.
{name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true},
{name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true},

View File

@@ -929,6 +929,7 @@ const (
OpAMD64SQRTSD
OpAMD64SQRTSS
OpAMD64ROUNDSD
OpAMD64ROUNDSS
OpAMD64LoweredRound32F
OpAMD64LoweredRound64F
OpAMD64VFMADD231SS
@@ -16231,6 +16232,20 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "ROUNDSS",
auxType: auxInt8,
argLen: 1,
asm: x86.AROUNDSS,
reg: regInfo{
inputs: []inputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "LoweredRound32F",
argLen: 1,

View File

@@ -222,6 +222,8 @@ func rewriteValueAMD64(v *Value) bool {
return rewriteValueAMD64_OpAMD64CMPXCHGLlock(v)
case OpAMD64CMPXCHGQlock:
return rewriteValueAMD64_OpAMD64CMPXCHGQlock(v)
case OpAMD64CVTSD2SS:
return rewriteValueAMD64_OpAMD64CVTSD2SS(v)
case OpAMD64DIVSD:
return rewriteValueAMD64_OpAMD64DIVSD(v)
case OpAMD64DIVSDload:
@@ -13461,6 +13463,27 @@ func rewriteValueAMD64_OpAMD64CMPXCHGQlock(v *Value) bool {
}
return false
}
// rewriteValueAMD64_OpAMD64CVTSD2SS applies the rewrite rule
//
//	(CVTSD2SS (ROUNDSD [c] (CVTSS2SD x))) => (ROUNDSS [c] x)
//
// i.e. when a float32 is widened, rounded in float64, and narrowed back,
// perform the rounding directly in 32 bits with ROUNDSS, preserving the
// rounding-mode immediate c. Machine-generated from the AMD64 rules file;
// edits here should normally be made via the generator instead.
func rewriteValueAMD64_OpAMD64CVTSD2SS(v *Value) bool {
	v_0 := v.Args[0]
	// match: (CVTSD2SS (ROUNDSD [c] (CVTSS2SD x)))
	// result: (ROUNDSS [c] x)
	for {
		if v_0.Op != OpAMD64ROUNDSD {
			break
		}
		c := auxIntToInt8(v_0.AuxInt)
		v_0_0 := v_0.Args[0]
		if v_0_0.Op != OpAMD64CVTSS2SD {
			break
		}
		x := v_0_0.Args[0]
		v.reset(OpAMD64ROUNDSS)
		v.AuxInt = int8ToAuxInt(c)
		v.AddArg(x)
		return true
	}
	return false
}
func rewriteValueAMD64_OpAMD64DIVSD(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]

View File

@@ -6,6 +6,8 @@ package ssa
import (
"cmd/compile/internal/rttype"
"math"
"math/rand"
"reflect"
"testing"
"unsafe"
@@ -42,6 +44,78 @@ func TestSubFlags(t *testing.T) {
}
}
// unopt applies f to x by explicitly widening to float64 and narrowing
// the result back to float32. The //go:noinline directive and the
// indirect call keep the compiler from optimizing this into a single
// 32-bit operation, so it serves as the unoptimized reference value to
// compare against the directly-written (optimizable) form.
//
//go:noinline
func unopt(f func(float64) float64, x float32) float32 {
	return float32(f(float64(x)))
}
func differ(x, y float32) bool {
if x != x && y != y {
// if both are NaN, exact bit pattern of the NaN is uninteresting
return false
}
return math.Float32bits(x) != math.Float32bits(y)
}
// test32bitUnary checks, for a single input x, that the directly-written
// float32(math.F(float64(x))) forms (which the compiler may optimize into
// single 32-bit instructions) produce bit-identical results to the
// unoptimized reference computed via unopt. Sqrt is only checked for
// x >= 0, since negative inputs produce NaN (whose payload differ ignores
// anyway). The exact expression shapes below are load-bearing: they are
// the patterns the optimizer matches, so do not refactor them.
func test32bitUnary(t *testing.T, x float32) {
	if want, got := unopt(math.Round, x), float32(math.Round(float64(x))); differ(want, got) {
		t.Errorf("Optimized 32-bit Round did not match, x=%f, want=%f, got=%f", x, want, got)
	}
	if want, got := unopt(math.RoundToEven, x), float32(math.RoundToEven(float64(x))); differ(want, got) {
		t.Errorf("Optimized 32-bit RoundToEven did not match, x=%f, want=%f, got=%f", x, want, got)
	}
	if want, got := unopt(math.Trunc, x), float32(math.Trunc(float64(x))); differ(want, got) {
		t.Errorf("Optimized 32-bit Trunc did not match, x=%f, want=%f, got=%f", x, want, got)
	}
	if want, got := unopt(math.Ceil, x), float32(math.Ceil(float64(x))); differ(want, got) {
		t.Errorf("Optimized 32-bit Ceil did not match, x=%f, want=%f, got=%f", x, want, got)
	}
	if want, got := unopt(math.Floor, x), float32(math.Floor(float64(x))); differ(want, got) {
		t.Errorf("Optimized 32-bit Floor did not match, x=%f, want=%f, got=%f", x, want, got)
	}
	if x >= 0 {
		if want, got := unopt(math.Sqrt, x), float32(math.Sqrt(float64(x))); differ(want, got) {
			t.Errorf("Optimized 32-bit Sqrt did not match, x=%f, want=%f, got=%f", x, want, got)
		}
	}
	if want, got := unopt(math.Abs, x), float32(math.Abs(float64(x))); differ(want, got) {
		t.Errorf("Optimized 32-bit Abs did not match, x=%f, want=%f, got=%f", x, want, got)
	}
}
var zero float32
// Test32bitUnary exercises the 32-bit unary float optimizations.
// This is mostly for testing rounding behavior: it covers hand-picked
// values around rounding boundaries, negative zero, values near
// MaxFloat32, large half-integers, and a deterministic sample of
// arbitrary bit patterns.
func Test32bitUnary(t *testing.T) {
	boundaryCases := []float32{
		-1.5, -0.5, 0.5, 1.5,
		-1.4, -0.4, 0.4, 1.4,
		-1.6, -0.6, 0.6, 1.6,
	}
	for _, v := range boundaryCases {
		test32bitUnary(t, v)
	}
	// Negative zero, built from a variable so it cannot be folded away.
	test32bitUnary(t, 1/(-1/zero))
	rnd := rand.New(rand.NewSource(0))
	for i := uint32(0); i <= 1<<20; i++ {
		test32bitUnary(t, math.Float32frombits(math.Float32bits(math.MaxFloat32)-i))
		test32bitUnary(t, float32(i)+1.5)
		test32bitUnary(t, math.Float32frombits(rnd.Uint32()))
	}
}
func TestIsPPC64WordRotateMask(t *testing.T) {
tests := []struct {
input int64

View File

@@ -280,6 +280,16 @@ func Float64ConstantStore(p *float64) {
*p = 5.432
}
// WideCeilNarrow checks that ceiling a float32 via a float64 round trip
// is compiled down to a single 32-bit rounding instruction on amd64/v3.
func WideCeilNarrow(x float32) float32 {
	// amd64/v3:"ROUNDSS"
	return float32(math.Ceil(float64(x)))
}
// WideTruncNarrow checks that truncating a float32 via a float64 round
// trip is compiled down to a single 32-bit rounding instruction on
// amd64/v3.
func WideTruncNarrow(x float32) float32 {
	// amd64/v3:"ROUNDSS"
	return float32(math.Trunc(float64(x)))
}
// ------------------------ //
// Subnormal tests //
// ------------------------ //