Files
golang.net/html/token_test.go
Michael Piatek 480e7b06ec go.net/html: Tokenizer.Raw returns the original input when tokenizer errors occur.
Two tweaks enable this:
1) Updating the raw and data span pointers when Tokenizer.Next is called, even
if an error has occurred. This prevents duplicate data from being returned by
Raw in the common case of an EOF.

2) Treating '</>' as an empty comment token to expose the raw text as a
tokenization event. (This matches the semantics of other non-token events,
e.g., '</ >' is treated as '<!-- -->'.)

Fixes golang/go#7029.

R=golang-codereviews, r, bradfitz
CC=golang-codereviews
https://golang.org/cl/46370043
2014-01-02 10:51:00 -08:00

689 lines
12 KiB
Go

// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"bytes"
"io"
"io/ioutil"
"reflect"
"runtime"
"strings"
"testing"
)
// tokenTest is one table-driven tokenizer case: an HTML input paired with the
// expected string rendering of the tokens it yields.
type tokenTest struct {
// A short description of the test case.
desc string
// The HTML to parse.
html string
// The string representations of the expected tokens, joined by '$'.
// When golden is empty, TestTokenizer expects no tokens at all and only
// checks that the tokenizer ends at io.EOF.
golden string
}
// tokenTests is the table of tokenizer cases shared by the tests in this file.
// TestTokenizer compares the tokens produced for each html input against the
// '$'-separated golden string, and TestPassthrough checks that the raw bytes
// reported for each input concatenate back to html.
var tokenTests = []tokenTest{
{
"empty",
"",
"",
},
// A single text node. The tokenizer should not break text nodes on whitespace,
// nor should it normalize whitespace within a text node.
{
"text",
"foo bar",
"foo bar",
},
// An entity.
{
"entity",
"one &lt; two",
"one &lt; two",
},
// A start, self-closing and end tag. The tokenizer does not care if the start
// and end tokens don't match; that is the job of the parser.
{
"tags",
"<a>b<c/>d</e>",
"<a>$b$<c/>$d$</e>",
},
// Angle brackets that aren't a tag.
{
"not a tag #0",
"<",
"&lt;",
},
{
"not a tag #1",
"</",
"&lt;/",
},
// In cases #2 through #6 the input is expected to come back as a comment token.
{
"not a tag #2",
"</>",
"<!---->",
},
{
"not a tag #3",
"a</>b",
"a$<!---->$b",
},
{
"not a tag #4",
"</ >",
"<!-- -->",
},
{
"not a tag #5",
"</.",
"<!--.-->",
},
{
"not a tag #6",
"</.>",
"<!--.-->",
},
{
"not a tag #7",
"a < b",
"a &lt; b",
},
{
"not a tag #8",
"<.>",
"&lt;.&gt;",
},
{
"not a tag #9",
"a<<<b>>>c",
"a&lt;&lt;$<b>$&gt;&gt;c",
},
{
"not a tag #10",
"if x<0 and y < 0 then x*y>0",
"if x&lt;0 and y &lt; 0 then x*y&gt;0",
},
// EOF in a tag name.
{
"tag name eof #0",
"<a",
"",
},
{
"tag name eof #1",
"<a ",
"",
},
{
"tag name eof #2",
"a<b",
"a",
},
{
"tag name eof #3",
"<a><b",
"<a>",
},
{
"tag name eof #4",
`<a x`,
``,
},
// Some malformed tags that are missing a '>'.
{
"malformed tag #0",
`<p</p>`,
`<p< p="">`,
},
{
"malformed tag #1",
`<p </p>`,
`<p <="" p="">`,
},
{
"malformed tag #2",
`<p id`,
``,
},
{
"malformed tag #3",
`<p id=`,
``,
},
{
"malformed tag #4",
`<p id=>`,
`<p id="">`,
},
{
"malformed tag #5",
`<p id=0`,
``,
},
{
"malformed tag #6",
`<p id=0</p>`,
`<p id="0&lt;/p">`,
},
{
"malformed tag #7",
`<p id="0</p>`,
``,
},
{
"malformed tag #8",
`<p id="0"</p>`,
`<p id="0" <="" p="">`,
},
{
"malformed tag #9",
`<p></p id`,
`<p>`,
},
// Raw text and RCDATA.
{
"basic raw text",
"<script><a></b></script>",
"<script>$&lt;a&gt;&lt;/b&gt;$</script>",
},
{
"unfinished script end tag",
"<SCRIPT>a</SCR",
"<script>$a&lt;/SCR",
},
{
"broken script end tag",
"<SCRIPT>a</SCR ipt>",
"<script>$a&lt;/SCR ipt&gt;",
},
{
"EOF in script end tag",
"<SCRIPT>a</SCRipt",
"<script>$a&lt;/SCRipt",
},
{
"scriptx end tag",
"<SCRIPT>a</SCRiptx",
"<script>$a&lt;/SCRiptx",
},
{
"' ' completes script end tag",
"<SCRIPT>a</SCRipt ",
"<script>$a",
},
{
"'>' completes script end tag",
"<SCRIPT>a</SCRipt>",
"<script>$a$</script>",
},
{
"self-closing script end tag",
"<SCRIPT>a</SCRipt/>",
"<script>$a$</script>",
},
{
"nested script tag",
"<SCRIPT>a</SCRipt<script>",
"<script>$a&lt;/SCRipt&lt;script&gt;",
},
{
"script end tag after unfinished",
"<SCRIPT>a</SCRipt</script>",
"<script>$a&lt;/SCRipt$</script>",
},
{
"script/style mismatched tags",
"<script>a</style>",
"<script>$a&lt;/style&gt;",
},
{
"style element with entity",
"<style>&apos;",
"<style>$&amp;apos;",
},
{
"textarea with tag",
"<textarea><div></textarea>",
"<textarea>$&lt;div&gt;$</textarea>",
},
{
"title with tag and entity",
"<title><b>K&amp;R C</b></title>",
"<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
},
// DOCTYPE tests.
{
"Proper DOCTYPE",
"<!DOCTYPE html>",
"<!DOCTYPE html>",
},
{
"DOCTYPE with no space",
"<!doctypehtml>",
"<!DOCTYPE html>",
},
{
"DOCTYPE with two spaces",
"<!doctype html>",
"<!DOCTYPE html>",
},
{
"looks like DOCTYPE but isn't",
"<!DOCUMENT html>",
"<!--DOCUMENT html-->",
},
{
"DOCTYPE at EOF",
"<!DOCtype",
"<!DOCTYPE >",
},
// XML processing instructions.
{
"XML processing instruction",
"<?xml?>",
"<!--?xml?-->",
},
// Comments.
{
"comment0",
"abc<b><!-- skipme --></b>def",
"abc$<b>$<!-- skipme -->$</b>$def",
},
{
"comment1",
"a<!-->z",
"a$<!---->$z",
},
{
"comment2",
"a<!--->z",
"a$<!---->$z",
},
{
"comment3",
"a<!--x>-->z",
"a$<!--x>-->$z",
},
{
"comment4",
"a<!--x->-->z",
"a$<!--x->-->$z",
},
{
"comment5",
"a<!>z",
"a$<!---->$z",
},
{
"comment6",
"a<!->z",
"a$<!----->$z",
},
{
"comment7",
"a<!---<>z",
"a$<!---<>z-->",
},
{
"comment8",
"a<!--z",
"a$<!--z-->",
},
{
"comment9",
"a<!--z-",
"a$<!--z-->",
},
{
"comment10",
"a<!--z--",
"a$<!--z-->",
},
{
"comment11",
"a<!--z---",
"a$<!--z--->",
},
{
"comment12",
"a<!--z----",
"a$<!--z---->",
},
{
"comment13",
"a<!--x--!>z",
"a$<!--x-->$z",
},
// An attribute with a backslash.
{
"backslash",
`<p id="a\"b">`,
`<p id="a\" b"="">`,
},
// Entities, tag name and attribute key lower-casing, and whitespace
// normalization within a tag.
{
"tricky",
"<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
`<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
},
// A nonexistent entity. Tokenizing and converting back to a string should
// escape the "&" to become "&amp;".
{
"noSuchEntity",
`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
},
// Entity names that are not terminated by a semicolon, in text and in an
// attribute value.
{
"entity without semicolon",
`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
},
{
"entity with digits",
"&frac12;",
"½",
},
// Attribute tests:
// http://dev.w3.org/html5/spec/Overview.html#attributes-0
{
"Empty attribute",
`<input disabled FOO>`,
`<input disabled="" foo="">`,
},
{
"Empty attribute, whitespace",
`<input disabled FOO >`,
`<input disabled="" foo="">`,
},
{
"Unquoted attribute value",
`<input value=yes FOO=BAR>`,
`<input value="yes" foo="BAR">`,
},
{
"Unquoted attribute value, spaces",
`<input value = yes FOO = BAR>`,
`<input value="yes" foo="BAR">`,
},
{
"Unquoted attribute value, trailing space",
`<input value=yes FOO=BAR >`,
`<input value="yes" foo="BAR">`,
},
{
"Single-quoted attribute value",
`<input value='yes' FOO='BAR'>`,
`<input value="yes" foo="BAR">`,
},
{
"Single-quoted attribute value, trailing space",
`<input value='yes' FOO='BAR' >`,
`<input value="yes" foo="BAR">`,
},
{
"Double-quoted attribute value",
`<input value="I'm an attribute" FOO="BAR">`,
`<input value="I&#39;m an attribute" foo="BAR">`,
},
{
"Attribute name characters",
`<meta http-equiv="content-type">`,
`<meta http-equiv="content-type">`,
},
{
"Mixed attributes",
`a<P V="0 1" w='2' X=3 y>z`,
`a$<p v="0 1" w="2" x="3" y="">$z`,
},
{
"Attributes with a solitary single quote",
`<p id=can't><p id=won't>`,
`<p id="can&#39;t">$<p id="won&#39;t">`,
},
}
// TestTokenizer feeds every case in tokenTests through the tokenizer and
// compares the string form of each token with the matching '$'-separated
// piece of the golden string. After the expected tokens are consumed, one
// further call to Next must leave the tokenizer's error at io.EOF.
func TestTokenizer(t *testing.T) {
	for _, tc := range tokenTests {
		z := NewTokenizer(strings.NewReader(tc.html))
		matched := true
		if tc.golden != "" {
			for i, want := range strings.Split(tc.golden, "$") {
				if z.Next() == ErrorToken {
					t.Errorf("%s token %d: want %q got error %v", tc.desc, i, want, z.Err())
					matched = false
					break
				}
				if got := z.Token().String(); got != want {
					t.Errorf("%s token %d: want %q got %q", tc.desc, i, want, got)
					matched = false
					break
				}
			}
		}
		if !matched {
			// The failure is already reported; skip the EOF check for this case.
			continue
		}
		z.Next()
		if err := z.Err(); err != io.EOF {
			t.Errorf("%s: want EOF got %q", tc.desc, err)
		}
	}
}
// TestPassthrough checks that the raw view of the token stream is lossless:
// appending z.Raw() after every call to Next, including the final call that
// returns ErrorToken, must rebuild the original input for each tokenTests case.
func TestPassthrough(t *testing.T) {
	for _, tc := range tokenTests {
		z := NewTokenizer(strings.NewReader(tc.html))
		var raw bytes.Buffer
		for done := false; !done; {
			kind := z.Next()
			// Raw is collected before the termination check so that the bytes
			// attached to the error event are included too.
			raw.Write(z.Raw())
			done = kind == ErrorToken
		}
		got, want := raw.String(), tc.html
		if got == want {
			continue
		}
		t.Errorf("%s: parsed output:\n got: %q\nwant: %q", tc.desc, got, want)
	}
}
// TestBufAPI exercises the []byte-returning accessors (Text and TagName). It
// tracks how deeply the tokenizer is nested inside <a> elements and collects
// the text seen while that depth is positive; for the fixed input below the
// collected text must be "14567".
func TestBufAPI(t *testing.T) {
	const input = "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
	z := NewTokenizer(bytes.NewBufferString(input))
	var inside bytes.Buffer
	nesting := 0
	for tt := z.Next(); tt != ErrorToken; tt = z.Next() {
		switch tt {
		case TextToken:
			if nesting > 0 {
				inside.Write(z.Text())
			}
		case StartTagToken, EndTagToken:
			name, _ := z.TagName()
			if string(name) != "a" {
				break
			}
			if tt == StartTagToken {
				nesting++
			} else {
				nesting--
			}
		}
	}
	// The loop only stops on ErrorToken; anything other than EOF is a failure.
	if err := z.Err(); err != io.EOF {
		t.Error(err)
	}
	want, got := "14567", inside.String()
	if want != got {
		t.Errorf("TestBufAPI: want %q got %q", want, got)
	}
}
// TestConvertNewlines checks convertNewlines against a fixed list of inputs
// that mix "\r", "\n" and "\r\n" sequences, each paired with the output the
// function is expected to produce.
func TestConvertNewlines(t *testing.T) {
	cases := []struct {
		in, want string
	}{
		{"Mac\rDOS\r\nUnix\n", "Mac\nDOS\nUnix\n"},
		{"Unix\nMac\rDOS\r\n", "Unix\nMac\nDOS\n"},
		{"DOS\r\nDOS\r\nDOS\r\n", "DOS\nDOS\nDOS\n"},
		{"", ""},
		{"\n", "\n"},
		{"\n\r", "\n\n"},
		{"\r", "\n"},
		{"\r\n", "\n"},
		{"\r\n\n", "\n\n"},
		{"\r\n\r", "\n\n"},
		{"\r\n\r\n", "\n\n"},
		{"\r\r", "\n\n"},
		{"\r\r\n", "\n\n"},
		{"\r\r\n\n", "\n\n\n"},
		{"\r\r\r\n", "\n\n\n"},
		{"\r \n", "\n \n"},
		{"xyz", "xyz"},
	}
	for _, c := range cases {
		got := string(convertNewlines([]byte(c.in)))
		if got == c.want {
			continue
		}
		t.Errorf("input %q: got %q, want %q", c.in, got, c.want)
	}
}
// TestReaderEdgeCases runs the tokenizer over readers with unusual but legal
// Read behaviour: one that alternates between 0 and 1 byte per call, one that
// returns data together with io.EOF, and one that never makes progress. A
// reader that ends in io.ErrNoProgress is tolerated; otherwise the token types
// must be start tag, text, end tag.
func TestReaderEdgeCases(t *testing.T) {
	const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
	readers := []io.Reader{
		&zeroOneByteReader{s: s},
		&eofStringsReader{s: s},
		&stuckReader{},
	}
	want := []TokenType{
		StartTagToken,
		TextToken,
		EndTagToken,
	}
	for i, r := range readers {
		var got []TokenType
		z := NewTokenizer(r)
		for tt := z.Next(); tt != ErrorToken; tt = z.Next() {
			got = append(got, tt)
		}
		switch err := z.Err(); err {
		case nil, io.EOF:
			// Normal termination: go on to compare the token types.
		case io.ErrNoProgress:
			// Accepted outcome; there is nothing to compare.
			continue
		default:
			t.Errorf("i=%d: %v", i, err)
			continue
		}
		if !reflect.DeepEqual(got, want) {
			t.Errorf("i=%d: got %v, want %v", i, got, want)
		}
	}
}
// zeroOneByteReader serves the string s one byte at a time, and reports a
// zero-byte, nil-error read before each byte it hands out.
type zeroOneByteReader struct {
	s string // data not yet delivered
	n int    // number of Read calls that reached the counting step
}

// Read returns (0, nil) for an empty p and (0, io.EOF) once s is exhausted.
// Otherwise it counts the call: odd-numbered calls deliver nothing, and the
// remaining calls copy exactly one byte of s into p[0].
func (r *zeroOneByteReader) Read(p []byte) (int, error) {
	switch {
	case len(p) == 0:
		return 0, nil
	case len(r.s) == 0:
		return 0, io.EOF
	}
	r.n++
	if r.n%2 == 0 {
		p[0] = r.s[0]
		r.s = r.s[1:]
		return 1, nil
	}
	return 0, nil
}
// eofStringsReader reads from the string s and, unlike strings.Reader, may
// report io.EOF on the same call that delivers the final bytes (n > 0 with a
// non-nil error).
type eofStringsReader struct {
	s string // data not yet delivered
}

// Read copies as much of s as fits into p. The error is io.EOF whenever
// nothing is left after the copy, and nil otherwise.
func (r *eofStringsReader) Read(p []byte) (int, error) {
	n := copy(p, r.s)
	r.s = r.s[n:]
	var err error
	if len(r.s) == 0 {
		err = io.EOF
	}
	return n, err
}
// stuckReader is an io.Reader that never makes progress: every Read reports
// zero bytes and a nil error.
type stuckReader struct{}

// Read ignores its buffer and always returns (0, nil).
func (*stuckReader) Read([]byte) (n int, err error) {
	return 0, nil
}
// Tokenizer API levels exercised by benchmarkTokenizer; the level selects which
// accessor(s) are called for every token.
const (
// rawLevel: only z.Raw is called.
rawLevel = iota
// lowLevel: the []byte accessors z.Text, z.TagName and z.TagAttr are called,
// depending on the token type.
lowLevel
// highLevel: z.Token is called.
highLevel
)
// benchmarkTokenizer tokenizes testdata/go1.html b.N times, calling the
// accessor(s) selected by level (rawLevel, lowLevel or highLevel) on every
// token. Reading the file and a forced GC happen before the timer is reset,
// so they are not part of the measurement.
func benchmarkTokenizer(b *testing.B, level int) {
	data, err := ioutil.ReadFile("testdata/go1.html")
	if err != nil {
		b.Fatalf("could not read testdata/go1.html: %v", err)
	}
	b.SetBytes(int64(len(data)))
	runtime.GC()
	b.ReportAllocs()
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		z := NewTokenizer(bytes.NewBuffer(data))
		for tt := z.Next(); tt != ErrorToken; tt = z.Next() {
			if level == rawLevel {
				// z.Raw hands back the token's bytes untouched: no unescaping
				// of &lt; to <, no lower-casing of tag names or attribute keys.
				z.Raw()
				continue
			}
			if level == highLevel {
				// z.Token copies the []byte data into strings, which stay
				// valid after the next call to z.Next.
				z.Token()
				continue
			}
			if level != lowLevel {
				continue
			}
			// z.Text, z.TagName and z.TagAttr return []byte values whose
			// contents may change on the next call to z.Next.
			switch tt {
			case TextToken, CommentToken, DoctypeToken:
				z.Text()
			case StartTagToken, SelfClosingTagToken:
				for _, more := z.TagName(); more; _, _, more = z.TagAttr() {
				}
			case EndTagToken:
				z.TagName()
			}
		}
		// The inner loop ends only on ErrorToken; io.EOF is the normal way out.
		if err := z.Err(); err != nil && err != io.EOF {
			b.Fatalf("tokenizer error: %v", err)
		}
	}
}
// BenchmarkRawLevelTokenizer runs benchmarkTokenizer at rawLevel.
func BenchmarkRawLevelTokenizer(b *testing.B) {
	benchmarkTokenizer(b, rawLevel)
}
// BenchmarkLowLevelTokenizer runs benchmarkTokenizer at lowLevel.
func BenchmarkLowLevelTokenizer(b *testing.B) {
	benchmarkTokenizer(b, lowLevel)
}
// BenchmarkHighLevelTokenizer runs benchmarkTokenizer at highLevel.
func BenchmarkHighLevelTokenizer(b *testing.B) {
	benchmarkTokenizer(b, highLevel)
}