From cdf8951060a3e86736adc8a8b3f702a8cb22a3fa Mon Sep 17 00:00:00 2001 From: David Finkel Date: Thu, 5 Feb 2026 18:10:43 -0500 Subject: [PATCH] runtime: printquoted: preserve bytes of invalid UTF-8 encodings Previously, printquoted would render each byte of an invalid UTF-8 sequence as "\uFFFD", which was lossy. This CL adjusts printquoted to distinguish valid encodings of U+FFFD from invalid encodings; each byte of the latter is now printed losslessly as "\xXX". Updates #76349 Change-Id: If9a877f01b497763425d9d11a58eb2a6e2c816b2 Reviewed-on: https://go-review.googlesource.com/c/go/+/742305 Reviewed-by: Alan Donovan Auto-Submit: Michael Pratt Reviewed-by: Michael Pratt LUCI-TryBot-Result: Go LUCI --- src/runtime/print.go | 10 +++++++++- src/runtime/print_quoted_test.go | 6 ++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/runtime/print.go b/src/runtime/print.go index 5d1bc22809..f39df39d79 100644 --- a/src/runtime/print.go +++ b/src/runtime/print.go @@ -200,7 +200,7 @@ func printhex(v uint64) { func printquoted(s string) { printlock() gwrite([]byte(`"`)) - for _, r := range s { + for i, r := range s { switch r { case '\n': gwrite([]byte(`\n`)) @@ -215,6 +215,14 @@ func printquoted(s string) { case '\\', '"': gwrite([]byte{byte('\\'), byte(r)}) continue + case runeError: + // Distinguish errors from a valid encoding of U+FFFD. + if _, j := decoderune(s, i); j == i+1 { + gwrite(bytes(`\x`)) + printhexopts(false, 2, uint64(s[i])) + continue + } + // Fall through to quoting. 
} // For now, only allow basic printable ascii through unescaped if r >= ' ' && r <= '~' { diff --git a/src/runtime/print_quoted_test.go b/src/runtime/print_quoted_test.go index f9e947b569..a3a87a07c5 100644 --- a/src/runtime/print_quoted_test.go +++ b/src/runtime/print_quoted_test.go @@ -20,6 +20,12 @@ func TestPrintQuoted(t *testing.T) { // make sure null and escape bytes are properly escaped {in: "b\033it", expected: `"b\x1bit"`}, {in: "b\000ar", expected: `"b\x00ar"`}, + // Make sure invalid UTF8 bytes make it through as expected + {in: "b\xfdar", expected: `"b\xfdar"`}, + {in: "b\xfda", expected: `"b\xfda"`}, + {in: "b\xfd\xffar", expected: `"b\xfd\xffar"`}, + // make sure the unicode replacement character gets correctly escaped + {in: "\ufffd!!!!", expected: `"\ufffd!!!!"`}, // verify that simple 16-bit unicode runes are escaped with \u, including a greek upper-case sigma and an arbitrary unicode character. {in: "\u1234Σ", expected: `"\u1234\u03a3"`}, // verify that 32-bit unicode runes are escaped with \U along with tabs