encoding/{base32,base64}: speed up Encode

This CL simplifies (*Encoding).Encode by advancing the src and dst
slices instead of tracking separate source and destination indices,
and speeds it up by reducing the number of bounds checks in its loop.

Here are some benchmark results (no change to allocations):

goos: darwin
goarch: arm64
pkg: encoding/base32
cpu: Apple M4
                  │     old     │             new              │
                  │   sec/op    │   sec/op     vs base         │
EncodeToString-10   7.310µ ± 0%   5.308µ ± 0%  -27.39% (n=180)
Encode-10           5.651µ ± 0%   3.603µ ± 0%  -36.25% (n=180)
geomean             6.427µ        4.373µ       -31.96%

                  │     old      │                  new                  │
                  │     B/s      │     B/s       vs base                 │
EncodeToString-10   1.044Gi ± 0%   1.437Gi ± 0%  +37.71% (p=0.000 n=180)
Encode-10           1.350Gi ± 0%   2.118Gi ± 0%  +56.88% (p=0.000 n=180)
geomean             1.187Gi        1.745Gi       +46.98%

pkg: encoding/base64
                  │     old     │             new              │
                  │   sec/op    │   sec/op     vs base         │
EncodeToString-10   7.058µ ± 0%   6.034µ ± 0%  -14.51% (n=180)

                  │     old      │                  new                  │
                  │     B/s      │     B/s       vs base                 │
EncodeToString-10   1.081Gi ± 0%   1.264Gi ± 0%  +16.97% (p=0.000 n=180)

Updates #20206

Change-Id: I7d46891ddb4371df004bfd612a8efc6638715b94
GitHub-Last-Rev: 1caac3d655
GitHub-Pull-Request: golang/go#78344
Reviewed-on: https://go-review.googlesource.com/c/go/+/759100
Reviewed-by: Keith Randall <khr@golang.org>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Keith Randall <khr@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
Julien Cretel
2026-03-25 22:32:18 +00:00
committed by Gopher Robot
parent 286a79658e
commit e4fcdc6c55
2 changed files with 54 additions and 61 deletions

View File

@@ -127,61 +127,59 @@ func (enc *Encoding) Encode(dst, src []byte) {
// outside of the loop to speed up the encoder.
_ = enc.encode
di, si := 0, 0
n := (len(src) / 5) * 5
for si < n {
for len(src) >= 5 {
// Combining two 32-bit loads allows the same code to be used
// for 32-bit and 64-bit platforms.
hi := uint32(src[si+0])<<24 | uint32(src[si+1])<<16 | uint32(src[si+2])<<8 | uint32(src[si+3])
lo := hi<<8 | uint32(src[si+4])
hi := uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3])
lo := hi<<8 | uint32(src[4])
dst[di+0] = enc.encode[(hi>>27)&0x1F]
dst[di+1] = enc.encode[(hi>>22)&0x1F]
dst[di+2] = enc.encode[(hi>>17)&0x1F]
dst[di+3] = enc.encode[(hi>>12)&0x1F]
dst[di+4] = enc.encode[(hi>>7)&0x1F]
dst[di+5] = enc.encode[(hi>>2)&0x1F]
dst[di+6] = enc.encode[(lo>>5)&0x1F]
dst[di+7] = enc.encode[(lo)&0x1F]
_ = dst[7] // Eliminate bounds checks below.
dst[0] = enc.encode[(hi>>27)&0x1F]
dst[1] = enc.encode[(hi>>22)&0x1F]
dst[2] = enc.encode[(hi>>17)&0x1F]
dst[3] = enc.encode[(hi>>12)&0x1F]
dst[4] = enc.encode[(hi>>7)&0x1F]
dst[5] = enc.encode[(hi>>2)&0x1F]
dst[6] = enc.encode[(lo>>5)&0x1F]
dst[7] = enc.encode[(lo)&0x1F]
si += 5
di += 8
src = src[5:]
dst = dst[8:]
}
// Add the remaining small block
remain := len(src) - si
if remain == 0 {
if len(src) == 0 {
return
}
// Encode the remaining bytes in reverse order.
val := uint32(0)
switch remain {
switch len(src) {
case 4:
val |= uint32(src[si+3])
dst[di+6] = enc.encode[val<<3&0x1F]
dst[di+5] = enc.encode[val>>2&0x1F]
val |= uint32(src[3])
dst[6] = enc.encode[val<<3&0x1F]
dst[5] = enc.encode[val>>2&0x1F]
fallthrough
case 3:
val |= uint32(src[si+2]) << 8
dst[di+4] = enc.encode[val>>7&0x1F]
val |= uint32(src[2]) << 8
dst[4] = enc.encode[val>>7&0x1F]
fallthrough
case 2:
val |= uint32(src[si+1]) << 16
dst[di+3] = enc.encode[val>>12&0x1F]
dst[di+2] = enc.encode[val>>17&0x1F]
val |= uint32(src[1]) << 16
dst[3] = enc.encode[val>>12&0x1F]
dst[2] = enc.encode[val>>17&0x1F]
fallthrough
case 1:
val |= uint32(src[si+0]) << 24
dst[di+1] = enc.encode[val>>22&0x1F]
dst[di+0] = enc.encode[val>>27&0x1F]
val |= uint32(src[0]) << 24
dst[1] = enc.encode[val>>22&0x1F]
dst[0] = enc.encode[val>>27&0x1F]
}
// Pad the final quantum
if enc.padChar != NoPadding {
nPad := (remain * 8 / 5) + 1
nPad := (len(src) * 8 / 5) + 1
for i := nPad; i < 8; i++ {
dst[di+i] = byte(enc.padChar)
dst[i] = byte(enc.padChar)
}
}
}

View File

@@ -151,44 +151,39 @@ func (enc *Encoding) Encode(dst, src []byte) {
// outside of the loop to speed up the encoder.
_ = enc.encode
di, si := 0, 0
n := (len(src) / 3) * 3
for si < n {
for len(src) >= 3 {
// Convert three 8-bit source bytes into four output bytes.
val := uint(src[si+0])<<16 | uint(src[si+1])<<8 | uint(src[si+2])
val := uint(src[0])<<16 | uint(src[1])<<8 | uint(src[2])
dst[di+0] = enc.encode[val>>18&0x3F]
dst[di+1] = enc.encode[val>>12&0x3F]
dst[di+2] = enc.encode[val>>6&0x3F]
dst[di+3] = enc.encode[val&0x3F]
_ = dst[3] // Eliminate bounds checks below.
dst[0] = enc.encode[val>>18&0x3F]
dst[1] = enc.encode[val>>12&0x3F]
dst[2] = enc.encode[val>>6&0x3F]
dst[3] = enc.encode[val&0x3F]
si += 3
di += 4
src = src[3:]
dst = dst[4:]
}
remain := len(src) - si
if remain == 0 {
// Add the remaining small block (if any).
switch len(src) {
case 0:
return
}
// Add the remaining small block
val := uint(src[si+0]) << 16
if remain == 2 {
val |= uint(src[si+1]) << 8
}
dst[di+0] = enc.encode[val>>18&0x3F]
dst[di+1] = enc.encode[val>>12&0x3F]
switch remain {
case 2:
dst[di+2] = enc.encode[val>>6&0x3F]
if enc.padChar != NoPadding {
dst[di+3] = byte(enc.padChar)
}
case 1:
val := uint(src[0]) << 16
dst[0] = enc.encode[val>>18&0x3F]
dst[1] = enc.encode[val>>12&0x3F]
if enc.padChar != NoPadding {
dst[di+2] = byte(enc.padChar)
dst[di+3] = byte(enc.padChar)
dst[2] = byte(enc.padChar)
dst[3] = byte(enc.padChar)
}
case 2:
val := uint(src[0])<<16 | uint(src[1])<<8
dst[0] = enc.encode[val>>18&0x3F]
dst[1] = enc.encode[val>>12&0x3F]
dst[2] = enc.encode[val>>6&0x3F]
if enc.padChar != NoPadding {
dst[3] = byte(enc.padChar)
}
}
}