diff --git a/html/charset/charset.go b/html/charset/charset.go
index 603a58f1..39dc2681 100644
--- a/html/charset/charset.go
+++ b/html/charset/charset.go
@@ -6,6 +6,7 @@ package charset
 
 import (
 	"bytes"
+	"io"
 	"mime"
 	"strings"
 	"unicode/utf8"
@@ -13,6 +14,7 @@ import (
 	"code.google.com/p/go.net/html"
 	"code.google.com/p/go.text/encoding"
 	"code.google.com/p/go.text/encoding/charmap"
+	"code.google.com/p/go.text/transform"
 )
 
 // Lookup returns the encoding with the specified label, and its canonical
@@ -83,6 +85,37 @@ func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding,
 	return charmap.Windows1252, "windows-1252", false
 }
 
+// NewReader returns an io.Reader that converts the content of r to UTF-8.
+// It calls DetermineEncoding to find out what r's encoding is.
+//
+// Up to 1024 bytes are read from r up front and passed to DetermineEncoding;
+// the returned reader replays those bytes ahead of the rest of r. An empty r
+// yields an empty reader rather than an io.EOF error. Any other read error
+// is returned as is, together with a nil reader.
+func NewReader(r io.Reader, contentType string) (io.Reader, error) {
+	preview := make([]byte, 1024)
+	n, err := io.ReadFull(r, preview)
+	switch {
+	case err == io.EOF || err == io.ErrUnexpectedEOF:
+		// r held fewer than len(preview) bytes (io.EOF means it held none),
+		// so the preview is the whole content and r is already drained.
+		preview = preview[:n]
+		r = bytes.NewReader(preview)
+	case err != nil:
+		return nil, err
+	default:
+		// The preview buffer was filled completely; replay it before the
+		// unread remainder of r.
+		r = io.MultiReader(bytes.NewReader(preview), r)
+	}
+
+	// When the detected encoding is encoding.Nop, r is returned undecoded.
+	if e, _, _ := DetermineEncoding(preview, contentType); e != encoding.Nop {
+		r = transform.NewReader(r, e.NewDecoder())
+	}
+	return r, nil
+}
+
 func prescan(content []byte) (e encoding.Encoding, name string) {
 	z := html.NewTokenizer(bytes.NewReader(content))
 	for {
diff --git a/html/charset/charset_test.go b/html/charset/charset_test.go
index 60a96c2d..a656dd90 100644
--- a/html/charset/charset_test.go
+++ b/html/charset/charset_test.go
@@ -1,6 +1,7 @@
 package charset
 
 import (
+	"bytes"
 	"io/ioutil"
 	"strings"
 	"testing"
@@ -143,6 +144,63 @@ func TestSniff(t *testing.T) {
 	}
 }
 
+// TestReader checks that NewReader decodes every sniffTestCases file to the
+// same bytes as a decoder looked up directly from the expected charset name.
+func TestReader(t *testing.T) {
+	for _, tc := range sniffTestCases {
+		content, err := ioutil.ReadFile("testdata/" + tc.filename)
+		if err != nil {
+			t.Errorf("%s: error reading file: %v", tc.filename, err)
+			continue
+		}
+
+		r, err := NewReader(bytes.NewReader(content), tc.declared)
+		if err != nil {
+			t.Errorf("%s: error creating reader: %v", tc.filename, err)
+			continue
+		}
+
+		got, err := ioutil.ReadAll(r)
+		if err != nil {
+			t.Errorf("%s: error reading from charset.NewReader: %v", tc.filename, err)
+			continue
+		}
+
+		e, _ := Lookup(tc.want)
+		if e == nil {
+			// Fail the case here so that a bad table entry does not
+			// panic on e.NewDecoder below.
+			t.Errorf("%s: no encoding found for %q", tc.filename, tc.want)
+			continue
+		}
+		want, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader(content), e.NewDecoder()))
+		if err != nil {
+			t.Errorf("%s: error decoding with hard-coded charset name: %v", tc.filename, err)
+			continue
+		}
+
+		if !bytes.Equal(got, want) {
+			t.Errorf("%s: got %q, want %q", tc.filename, got, want)
+		}
+	}
+}
+
+// TestReaderEmpty checks that NewReader accepts an empty input and yields an
+// empty reader rather than an error.
+func TestReaderEmpty(t *testing.T) {
+	r, err := NewReader(bytes.NewReader(nil), "")
+	if err != nil {
+		t.Fatalf("error creating reader: %v", err)
+	}
+	got, err := ioutil.ReadAll(r)
+	if err != nil {
+		t.Fatalf("error reading from charset.NewReader: %v", err)
+	}
+	if len(got) != 0 {
+		t.Errorf("got %q, want empty output", got)
+	}
+}
+
 var metaTestCases = []struct {
 	meta, want string
 }{