Files
golang.net/html/parse_test.go
Kunpei Sakai 574d568418 html: add "in head noscript" im support
In the spec 12.2.6.4.5, the "in head noscript" insertion mode is defined.
However, this package and its parser doesn't have the insertion mode,
because the scripting=false case is not considered currently.

This commit adds a test and a support for the "in head noscript"
insertion mode. This change has no effect on the actual behavior.

Updates golang/go#16318

Change-Id: I9314c3342bea27fa2acf2fa7d980a127ee0fbf91
Reviewed-on: https://go-review.googlesource.com/c/net/+/172557
Reviewed-by: Nigel Tao <nigeltao@golang.org>
2019-04-24 02:42:50 +00:00

439 lines
11 KiB
Go

// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
"runtime"
"sort"
"strings"
"testing"
"golang.org/x/net/html/atom"
)
// readParseTest reads a single test case from r.
func readParseTest(r *bufio.Reader) (text, want, context string, err error) {
line, err := r.ReadSlice('\n')
if err != nil {
return "", "", "", err
}
var b []byte
// Read the HTML.
if string(line) != "#data\n" {
return "", "", "", fmt.Errorf(`got %q want "#data\n"`, line)
}
for {
line, err = r.ReadSlice('\n')
if err != nil {
return "", "", "", err
}
if line[0] == '#' {
break
}
b = append(b, line...)
}
text = strings.TrimSuffix(string(b), "\n")
b = b[:0]
// Skip the error list.
if string(line) != "#errors\n" {
return "", "", "", fmt.Errorf(`got %q want "#errors\n"`, line)
}
for {
line, err = r.ReadSlice('\n')
if err != nil {
return "", "", "", err
}
if line[0] == '#' {
break
}
}
if string(line) == "#document-fragment\n" {
line, err = r.ReadSlice('\n')
if err != nil {
return "", "", "", err
}
context = strings.TrimSpace(string(line))
line, err = r.ReadSlice('\n')
if err != nil {
return "", "", "", err
}
}
// Read the dump of what the parse tree should be.
if string(line) != "#document\n" {
return "", "", "", fmt.Errorf(`got %q want "#document\n"`, line)
}
inQuote := false
for {
line, err = r.ReadSlice('\n')
if err != nil && err != io.EOF {
return "", "", "", err
}
trimmed := bytes.Trim(line, "| \n")
if len(trimmed) > 0 {
if line[0] == '|' && trimmed[0] == '"' {
inQuote = true
}
if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) {
inQuote = false
}
}
if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote {
break
}
b = append(b, line...)
}
return text, string(b), context, nil
}
func dumpIndent(w io.Writer, level int) {
io.WriteString(w, "| ")
for i := 0; i < level; i++ {
io.WriteString(w, " ")
}
}
type sortedAttributes []Attribute
func (a sortedAttributes) Len() int {
return len(a)
}
func (a sortedAttributes) Less(i, j int) bool {
if a[i].Namespace != a[j].Namespace {
return a[i].Namespace < a[j].Namespace
}
return a[i].Key < a[j].Key
}
func (a sortedAttributes) Swap(i, j int) {
a[i], a[j] = a[j], a[i]
}
func dumpLevel(w io.Writer, n *Node, level int) error {
dumpIndent(w, level)
level++
switch n.Type {
case ErrorNode:
return errors.New("unexpected ErrorNode")
case DocumentNode:
return errors.New("unexpected DocumentNode")
case ElementNode:
if n.Namespace != "" {
fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data)
} else {
fmt.Fprintf(w, "<%s>", n.Data)
}
attr := sortedAttributes(n.Attr)
sort.Sort(attr)
for _, a := range attr {
io.WriteString(w, "\n")
dumpIndent(w, level)
if a.Namespace != "" {
fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val)
} else {
fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val)
}
}
if n.Namespace == "" && n.DataAtom == atom.Template {
io.WriteString(w, "\n")
dumpIndent(w, level)
level++
io.WriteString(w, "content")
}
case TextNode:
fmt.Fprintf(w, `"%s"`, n.Data)
case CommentNode:
fmt.Fprintf(w, "<!-- %s -->", n.Data)
case DoctypeNode:
fmt.Fprintf(w, "<!DOCTYPE %s", n.Data)
if n.Attr != nil {
var p, s string
for _, a := range n.Attr {
switch a.Key {
case "public":
p = a.Val
case "system":
s = a.Val
}
}
if p != "" || s != "" {
fmt.Fprintf(w, ` "%s"`, p)
fmt.Fprintf(w, ` "%s"`, s)
}
}
io.WriteString(w, ">")
case scopeMarkerNode:
return errors.New("unexpected scopeMarkerNode")
default:
return errors.New("unknown node type")
}
io.WriteString(w, "\n")
for c := n.FirstChild; c != nil; c = c.NextSibling {
if err := dumpLevel(w, c, level); err != nil {
return err
}
}
return nil
}
func dump(n *Node) (string, error) {
if n == nil || n.FirstChild == nil {
return "", nil
}
var b bytes.Buffer
for c := n.FirstChild; c != nil; c = c.NextSibling {
if err := dumpLevel(&b, c, 0); err != nil {
return "", err
}
}
return b.String(), nil
}
var testDataDirs = []string{"testdata/webkit/", "testdata/go/"}
func TestParser(t *testing.T) {
for _, testDataDir := range testDataDirs {
testFiles, err := filepath.Glob(testDataDir + "*.dat")
if err != nil {
t.Fatal(err)
}
for _, tf := range testFiles {
f, err := os.Open(tf)
if err != nil {
t.Fatal(err)
}
defer f.Close()
r := bufio.NewReader(f)
for i := 0; ; i++ {
text, want, context, err := readParseTest(r)
if err == io.EOF {
break
}
if err != nil {
t.Fatal(err)
}
err = testParseCase(text, want, context, Parse)
if err != nil {
t.Errorf("%s test #%d %q, %s", tf, i, text, err)
}
}
}
}
}
// Issue 16318
func TestParserWithoutScripting(t *testing.T) {
text := `<noscript><img src='https://golang.org/doc/gopher/frontpage.png' /></noscript><p><img src='https://golang.org/doc/gopher/doc.png' /></p>`
want := `| <html>
| <head>
| <noscript>
| <body>
| "<img src='https://golang.org/doc/gopher/frontpage.png' />"
| <p>
| <img>
| src="https://golang.org/doc/gopher/doc.png"
`
err := testParseCase(text, want, "", func(r io.Reader) (*Node, error) {
p := &parser{
tokenizer: NewTokenizer(r),
doc: &Node{
Type: DocumentNode,
},
scripting: false,
framesetOK: true,
im: initialIM,
}
err := p.parse()
if err != nil {
return nil, err
}
return p.doc, nil
})
if err != nil {
t.Errorf("test with scripting is disabled, %q, %s", text, err)
}
}
// testParseCase tests one test case from the test files. If the test does not
// pass, it returns an error that explains the failure.
// text is the HTML to be parsed, want is a dump of the correct parse tree,
// and context is the name of the context node, if any.
func testParseCase(text, want, context string, parseFunc func(r io.Reader) (*Node, error)) (err error) {
defer func() {
if x := recover(); x != nil {
switch e := x.(type) {
case error:
err = e
default:
err = fmt.Errorf("%v", e)
}
}
}()
var doc *Node
if context == "" {
doc, err = parseFunc(strings.NewReader(text))
if err != nil {
return err
}
} else {
contextNode := &Node{
Type: ElementNode,
DataAtom: atom.Lookup([]byte(context)),
Data: context,
}
nodes, err := ParseFragment(strings.NewReader(text), contextNode)
if err != nil {
return err
}
doc = &Node{
Type: DocumentNode,
}
for _, n := range nodes {
doc.AppendChild(n)
}
}
if err := checkTreeConsistency(doc); err != nil {
return err
}
got, err := dump(doc)
if err != nil {
return err
}
// Compare the parsed tree to the #document section.
if got != want {
return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want)
}
if renderTestBlacklist[text] || context != "" {
return nil
}
// Check that rendering and re-parsing results in an identical tree.
pr, pw := io.Pipe()
go func() {
pw.CloseWithError(Render(pw, doc))
}()
doc1, err := Parse(pr)
if err != nil {
return err
}
got1, err := dump(doc1)
if err != nil {
return err
}
if got != got1 {
return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1)
}
return nil
}
// Some test input result in parse trees are not 'well-formed' despite
// following the HTML5 recovery algorithms. Rendering and re-parsing such a
// tree will not result in an exact clone of that tree. We blacklist such
// inputs from the render test.
var renderTestBlacklist = map[string]bool{
// The second <a> will be reparented to the first <table>'s parent. This
// results in an <a> whose parent is an <a>, which is not 'well-formed'.
`<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true,
// The same thing with a <p>:
`<p><table></p>`: true,
// More cases of <a> being reparented:
`<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true,
`<a><table><a></table><p><a><div><a>`: true,
`<a><table><td><a><table></table><a></tr><a></table><a>`: true,
`<template><a><table><a>`: true,
// A similar reparenting situation involving <nobr>:
`<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true,
// A <plaintext> element is reparented, putting it before a table.
// A <plaintext> element can't have anything after it in HTML.
`<table><plaintext><td>`: true,
`<!doctype html><table><plaintext></plaintext>`: true,
`<!doctype html><table><tbody><plaintext></plaintext>`: true,
`<!doctype html><table><tbody><tr><plaintext></plaintext>`: true,
// A form inside a table inside a form doesn't work either.
`<!doctype html><form><table></form><form></table></form>`: true,
// A script that ends at EOF may escape its own closing tag when rendered.
`<!doctype html><script><!--<script `: true,
`<!doctype html><script><!--<script <`: true,
`<!doctype html><script><!--<script <a`: true,
`<!doctype html><script><!--<script </`: true,
`<!doctype html><script><!--<script </s`: true,
`<!doctype html><script><!--<script </script`: true,
`<!doctype html><script><!--<script </scripta`: true,
`<!doctype html><script><!--<script -`: true,
`<!doctype html><script><!--<script -a`: true,
`<!doctype html><script><!--<script -<`: true,
`<!doctype html><script><!--<script --`: true,
`<!doctype html><script><!--<script --a`: true,
`<!doctype html><script><!--<script --<`: true,
`<script><!--<script `: true,
`<script><!--<script <a`: true,
`<script><!--<script </script`: true,
`<script><!--<script </scripta`: true,
`<script><!--<script -`: true,
`<script><!--<script -a`: true,
`<script><!--<script --`: true,
`<script><!--<script --a`: true,
`<script><!--<script <`: true,
`<script><!--<script </`: true,
`<script><!--<script </s`: true,
// Reconstructing the active formatting elements results in a <plaintext>
// element that contains an <a> element.
`<!doctype html><p><a><plaintext>b`: true,
`<table><math><select><mi><select></table>`: true,
}
func TestNodeConsistency(t *testing.T) {
// inconsistentNode is a Node whose DataAtom and Data do not agree.
inconsistentNode := &Node{
Type: ElementNode,
DataAtom: atom.Frameset,
Data: "table",
}
_, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode)
if err == nil {
t.Errorf("got nil error, want non-nil")
}
}
func TestParseFragmentWithNilContext(t *testing.T) {
// This shouldn't panic.
ParseFragment(strings.NewReader("<p>hello</p>"), nil)
}
func BenchmarkParser(b *testing.B) {
buf, err := ioutil.ReadFile("testdata/go1.html")
if err != nil {
b.Fatalf("could not read testdata/go1.html: %v", err)
}
b.SetBytes(int64(len(buf)))
runtime.GC()
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
Parse(bytes.NewBuffer(buf))
}
}