From e4fcdc6c55bfdc90f4f42eefc4e97998de3ef4e7 Mon Sep 17 00:00:00 2001 From: Julien Cretel Date: Wed, 25 Mar 2026 22:32:18 +0000 Subject: [PATCH] encoding/{base32,base64}: speed up Encode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This CL clarifies (*Encoding).Encode and speeds it up by reducing the number of bounds checks in its loop. Here are some benchmark results (no change to allocations): goos: darwin goarch: arm64 pkg: encoding/base32 cpu: Apple M4 │ old │ new │ │ sec/op │ sec/op vs base │ EncodeToString-10 7.310µ ± 0% 5.308µ ± 0% -27.39% (n=180) Encode-10 5.651µ ± 0% 3.603µ ± 0% -36.25% (n=180) geomean 6.427µ 4.373µ -31.96% │ old │ new │ │ B/s │ B/s vs base │ EncodeToString-10 1.044Gi ± 0% 1.437Gi ± 0% +37.71% (p=0.000 n=180) Encode-10 1.350Gi ± 0% 2.118Gi ± 0% +56.88% (p=0.000 n=180) geomean 1.187Gi 1.745Gi +46.98% pkg: encoding/base64 │ old │ new │ │ sec/op │ sec/op vs base │ EncodeToString-10 7.058µ ± 0% 6.034µ ± 0% -14.51% (n=180) │ old │ new │ │ B/s │ B/s vs base │ EncodeToString-10 1.081Gi ± 0% 1.264Gi ± 0% +16.97% (p=0.000 n=180) Updates #20206 Change-Id: I7d46891ddb4371df004bfd612a8efc6638715b94 GitHub-Last-Rev: 1caac3d65532fefacbbed57f11a4a49273f173e2 GitHub-Pull-Request: golang/go#78344 Reviewed-on: https://go-review.googlesource.com/c/go/+/759100 Reviewed-by: Keith Randall Auto-Submit: Keith Randall Reviewed-by: Dmitri Shuralyov Reviewed-by: Keith Randall LUCI-TryBot-Result: Go LUCI --- src/encoding/base32/base32.go | 60 +++++++++++++++++------------------ src/encoding/base64/base64.go | 55 +++++++++++++++----------------- 2 files changed, 54 insertions(+), 61 deletions(-) diff --git a/src/encoding/base32/base32.go b/src/encoding/base32/base32.go index 8bda6c6799..7a3221aea2 100644 --- a/src/encoding/base32/base32.go +++ b/src/encoding/base32/base32.go @@ -127,61 +127,59 @@ func (enc *Encoding) Encode(dst, src []byte) { // outside of the loop to speed up the encoder. _ = enc.encode - di, si := 0, 0 - n := (len(src) / 5) * 5 - for si < n { + for len(src) >= 5 { // Combining two 32 bit loads allows the same code to be used // for 32 and 64 bit platforms. - hi := uint32(src[si+0])<<24 | uint32(src[si+1])<<16 | uint32(src[si+2])<<8 | uint32(src[si+3]) - lo := hi<<8 | uint32(src[si+4]) + hi := uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3]) + lo := hi<<8 | uint32(src[4]) - dst[di+0] = enc.encode[(hi>>27)&0x1F] - dst[di+1] = enc.encode[(hi>>22)&0x1F] - dst[di+2] = enc.encode[(hi>>17)&0x1F] - dst[di+3] = enc.encode[(hi>>12)&0x1F] - dst[di+4] = enc.encode[(hi>>7)&0x1F] - dst[di+5] = enc.encode[(hi>>2)&0x1F] - dst[di+6] = enc.encode[(lo>>5)&0x1F] - dst[di+7] = enc.encode[(lo)&0x1F] + _ = dst[7] // Eliminate bounds checks below. + dst[0] = enc.encode[(hi>>27)&0x1F] + dst[1] = enc.encode[(hi>>22)&0x1F] + dst[2] = enc.encode[(hi>>17)&0x1F] + dst[3] = enc.encode[(hi>>12)&0x1F] + dst[4] = enc.encode[(hi>>7)&0x1F] + dst[5] = enc.encode[(hi>>2)&0x1F] + dst[6] = enc.encode[(lo>>5)&0x1F] + dst[7] = enc.encode[(lo)&0x1F] - si += 5 - di += 8 + src = src[5:] + dst = dst[8:] } // Add the remaining small block - remain := len(src) - si - if remain == 0 { + if len(src) == 0 { return } // Encode the remaining bytes in reverse order. val := uint32(0) - switch remain { + switch len(src) { case 4: - val |= uint32(src[si+3]) - dst[di+6] = enc.encode[val<<3&0x1F] - dst[di+5] = enc.encode[val>>2&0x1F] + val |= uint32(src[3]) + dst[6] = enc.encode[val<<3&0x1F] + dst[5] = enc.encode[val>>2&0x1F] fallthrough case 3: - val |= uint32(src[si+2]) << 8 - dst[di+4] = enc.encode[val>>7&0x1F] + val |= uint32(src[2]) << 8 + dst[4] = enc.encode[val>>7&0x1F] fallthrough case 2: - val |= uint32(src[si+1]) << 16 - dst[di+3] = enc.encode[val>>12&0x1F] - dst[di+2] = enc.encode[val>>17&0x1F] + val |= uint32(src[1]) << 16 + dst[3] = enc.encode[val>>12&0x1F] + dst[2] = enc.encode[val>>17&0x1F] fallthrough case 1: - val |= uint32(src[si+0]) << 24 - dst[di+1] = enc.encode[val>>22&0x1F] - dst[di+0] = enc.encode[val>>27&0x1F] + val |= uint32(src[0]) << 24 + dst[1] = enc.encode[val>>22&0x1F] + dst[0] = enc.encode[val>>27&0x1F] } // Pad the final quantum if enc.padChar != NoPadding { - nPad := (remain * 8 / 5) + 1 + nPad := (len(src) * 8 / 5) + 1 for i := nPad; i < 8; i++ { - dst[di+i] = byte(enc.padChar) + dst[i] = byte(enc.padChar) } } } diff --git a/src/encoding/base64/base64.go b/src/encoding/base64/base64.go index 57aa1a697f..32014f45bb 100644 --- a/src/encoding/base64/base64.go +++ b/src/encoding/base64/base64.go @@ -151,44 +151,39 @@ func (enc *Encoding) Encode(dst, src []byte) { // outside of the loop to speed up the encoder. _ = enc.encode - di, si := 0, 0 - n := (len(src) / 3) * 3 - for si < n { + for len(src) >= 3 { // Convert 3x 8bit source bytes into 4 bytes - val := uint(src[si+0])<<16 | uint(src[si+1])<<8 | uint(src[si+2]) + val := uint(src[0])<<16 | uint(src[1])<<8 | uint(src[2]) - dst[di+0] = enc.encode[val>>18&0x3F] - dst[di+1] = enc.encode[val>>12&0x3F] - dst[di+2] = enc.encode[val>>6&0x3F] - dst[di+3] = enc.encode[val&0x3F] + _ = dst[3] // Eliminate bounds checks below. + dst[0] = enc.encode[val>>18&0x3F] + dst[1] = enc.encode[val>>12&0x3F] + dst[2] = enc.encode[val>>6&0x3F] + dst[3] = enc.encode[val&0x3F] - si += 3 - di += 4 + src = src[3:] + dst = dst[4:] } - remain := len(src) - si - if remain == 0 { + // Add the remaining small block (if any). + switch len(src) { + case 0: return - } - // Add the remaining small block - val := uint(src[si+0]) << 16 - if remain == 2 { - val |= uint(src[si+1]) << 8 - } - - dst[di+0] = enc.encode[val>>18&0x3F] - dst[di+1] = enc.encode[val>>12&0x3F] - - switch remain { - case 2: - dst[di+2] = enc.encode[val>>6&0x3F] - if enc.padChar != NoPadding { - dst[di+3] = byte(enc.padChar) - } case 1: + val := uint(src[0]) << 16 + dst[0] = enc.encode[val>>18&0x3F] + dst[1] = enc.encode[val>>12&0x3F] if enc.padChar != NoPadding { - dst[di+2] = byte(enc.padChar) - dst[di+3] = byte(enc.padChar) + dst[2] = byte(enc.padChar) + dst[3] = byte(enc.padChar) + } + case 2: + val := uint(src[0])<<16 | uint(src[1])<<8 + dst[0] = enc.encode[val>>18&0x3F] + dst[1] = enc.encode[val>>12&0x3F] + dst[2] = enc.encode[val>>6&0x3F] + if enc.padChar != NoPadding { + dst[3] = byte(enc.padChar) } } }