diff --git a/html/charset/charset.go b/html/charset/charset.go
index f91c5ea4..13bed159 100644
--- a/html/charset/charset.go
+++ b/html/charset/charset.go
@@ -26,14 +26,23 @@ import (
 // Lookup returns the encoding with the specified label, and its canonical
 // name. It returns nil and the empty string if label is not one of the
 // standard encodings for HTML. Matching is case-insensitive and ignores
-// leading and trailing whitespace.
+// leading and trailing whitespace. Encoders will use HTML escape sequences for
+// runes that are not supported by the character set.
 func Lookup(label string) (e encoding.Encoding, name string) {
 	e, err := htmlindex.Get(label)
 	if err != nil {
 		return nil, ""
 	}
 	name, _ = htmlindex.Name(e)
-	return e, name
+	return &htmlEncoding{e}, name
+}
+
+type htmlEncoding struct{ encoding.Encoding }
+
+func (h *htmlEncoding) NewEncoder() *encoding.Encoder {
+	// HTML requires a non-terminating legacy encoder. We use HTML escapes to
+	// substitute unsupported code points.
+	return encoding.HTMLEscapeUnsupported(h.Encoding.NewEncoder())
 }
 
 // DetermineEncoding determines the encoding of an HTML document by examining
diff --git a/html/charset/charset_test.go b/html/charset/charset_test.go
index 31941ca8..f509448c 100644
--- a/html/charset/charset_test.go
+++ b/html/charset/charset_test.go
@@ -21,9 +21,12 @@ func transformString(t transform.Transformer, s string) (string, error) {
 	return string(b), err
 }
 
-var testCases = []struct {
+type testCase struct {
 	utf8, other, otherEncoding string
-}{
+}
+
+// testCases for encoding and decoding.
+var testCases = []testCase{
 	{"Résumé", "Résumé", "utf8"},
 	{"Résumé", "R\xe9sum\xe9", "latin1"},
 	{"これは漢字です。", "S0\x8c0o0\"oW[g0Y0\x020", "UTF-16LE"},
@@ -86,6 +89,11 @@ func TestDecode(t *testing.T) {
 }
 
 func TestEncode(t *testing.T) {
+	testCases := append(testCases, []testCase{
+		// U+0144 LATIN SMALL LETTER N WITH ACUTE not supported by encoding.
+		{"Gdańsk", "Gda&#324;sk", "ISO-8859-11"},
+		{"\ufffd", "&#65533;", "ISO-8859-11"},
+	}...)
 	for _, tc := range testCases {
 		e, _ := Lookup(tc.otherEncoding)
 		if e == nil {