mirror of
https://github.com/golang/net.git
synced 2026-03-31 18:37:08 +09:00
html: parse comments per HTML spec
Updates golang/go#58246 Change-Id: Iaba5ed65f5d244fd47372ef0c08fc4cdb5ed90f9 Reviewed-on: https://go-review.googlesource.com/c/net/+/466776 TryBot-Result: Gopher Robot <gobot@golang.org> Auto-Submit: Nigel Tao <nigeltao@golang.org> Reviewed-by: Damien Neil <dneil@google.com> Run-TryBot: Nigel Tao <nigeltao@golang.org> Reviewed-by: Nigel Tao (INACTIVE; USE @golang.org INSTEAD) <nigeltao@google.com>
This commit is contained in:
270
html/comment_test.go
Normal file
270
html/comment_test.go
Normal file
@@ -0,0 +1,270 @@
|
||||
// Copyright 2023 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package html
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestComments exhaustively tests every 'interesting' N-byte string is
|
||||
// correctly parsed as a comment. N ranges from 4+1 to 4+suffixLen inclusive,
|
||||
// where 4 is the length of the "<!--" prefix that starts an HTML comment.
|
||||
//
|
||||
// 'Interesting' means that the N-4 byte suffix consists entirely of bytes
|
||||
// sampled from the interestingCommentBytes const string, below. These cover
|
||||
// all of the possible state transitions from comment-related parser states, as
|
||||
// listed in the HTML spec (https://html.spec.whatwg.org/#comment-start-state
|
||||
// and subsequent sections).
|
||||
//
|
||||
// The spec is written as an explicit state machine that, as a side effect,
|
||||
// accumulates "the comment token's data" to a separate buffer.
|
||||
// Tokenizer.readComment in this package does not have an explicit state
|
||||
// machine and usually returns the comment text as a sub-slice of the input,
|
||||
// between the opening '<' and closing '>' or EOF. This test confirms that the
|
||||
// two algorithms match.
|
||||
func TestComments(t *testing.T) {
|
||||
const prefix = "<!--"
|
||||
const suffixLen = 6
|
||||
buffer := make([]byte, 0, len(prefix)+suffixLen)
|
||||
testAllComments(t, append(buffer, prefix...))
|
||||
}
|
||||
|
||||
// NUL isn't in this list, even though the HTML spec sections 13.2.5.43 -
|
||||
// 13.2.5.52 mentions it. It's not interesting in terms of state transitions.
|
||||
// It's equivalent to any other non-interesting byte (other than being replaced
|
||||
// by U+FFFD REPLACEMENT CHARACTER).
|
||||
//
|
||||
// EOF isn't in this list. The HTML spec treats EOF as "an input character" but
|
||||
// testOneComment below breaks the loop instead.
|
||||
//
|
||||
// 'x' represents all other "non-interesting" comment bytes.
|
||||
var interestingCommentBytes = [...]byte{
|
||||
'!', '-', '<', '>', 'x',
|
||||
}
|
||||
|
||||
// testAllComments recursively fills in buffer[len(buffer):cap(buffer)] with
|
||||
// interesting bytes and then tests that this package's tokenization matches
|
||||
// the HTML spec.
|
||||
//
|
||||
// Precondition: len(buffer) < cap(buffer)
|
||||
// Precondition: string(buffer[:4]) == "<!--"
|
||||
func testAllComments(t *testing.T, buffer []byte) {
|
||||
for _, interesting := range interestingCommentBytes {
|
||||
b := append(buffer, interesting)
|
||||
testOneComment(t, b)
|
||||
if len(b) < cap(b) {
|
||||
testAllComments(t, b)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func testOneComment(t *testing.T, b []byte) {
|
||||
z := NewTokenizer(bytes.NewReader(b))
|
||||
if next := z.Next(); next != CommentToken {
|
||||
t.Fatalf("Next(%q): got %v, want %v", b, next, CommentToken)
|
||||
}
|
||||
gotRemainder := string(b[len(z.Raw()):])
|
||||
gotComment := string(z.Text())
|
||||
|
||||
i := len("<!--")
|
||||
wantBuffer := []byte(nil)
|
||||
loop:
|
||||
for state := 43; ; {
|
||||
// Consume the next input character, handling EOF.
|
||||
if i >= len(b) {
|
||||
break
|
||||
}
|
||||
nextInputCharacter := b[i]
|
||||
i++
|
||||
|
||||
switch state {
|
||||
case 43: // 13.2.5.43 Comment start state.
|
||||
switch nextInputCharacter {
|
||||
case '-':
|
||||
state = 44
|
||||
case '>':
|
||||
break loop
|
||||
default:
|
||||
i-- // Reconsume.
|
||||
state = 45
|
||||
}
|
||||
|
||||
case 44: // 13.2.5.44 Comment start dash state.
|
||||
switch nextInputCharacter {
|
||||
case '-':
|
||||
state = 51
|
||||
case '>':
|
||||
break loop
|
||||
default:
|
||||
wantBuffer = append(wantBuffer, '-')
|
||||
i-- // Reconsume.
|
||||
state = 45
|
||||
}
|
||||
|
||||
case 45: // 13.2.5.45 Comment state.
|
||||
switch nextInputCharacter {
|
||||
case '-':
|
||||
state = 50
|
||||
case '<':
|
||||
wantBuffer = append(wantBuffer, '<')
|
||||
state = 46
|
||||
default:
|
||||
wantBuffer = append(wantBuffer, nextInputCharacter)
|
||||
}
|
||||
|
||||
case 46: // 13.2.5.46 Comment less-than sign state.
|
||||
switch nextInputCharacter {
|
||||
case '!':
|
||||
wantBuffer = append(wantBuffer, '!')
|
||||
state = 47
|
||||
case '<':
|
||||
wantBuffer = append(wantBuffer, '<')
|
||||
state = 46
|
||||
default:
|
||||
i-- // Reconsume.
|
||||
state = 45
|
||||
}
|
||||
|
||||
case 47: // 13.2.5.47 Comment less-than sign bang state.
|
||||
switch nextInputCharacter {
|
||||
case '-':
|
||||
state = 48
|
||||
default:
|
||||
i-- // Reconsume.
|
||||
state = 45
|
||||
}
|
||||
|
||||
case 48: // 13.2.5.48 Comment less-than sign bang dash state.
|
||||
switch nextInputCharacter {
|
||||
case '-':
|
||||
state = 49
|
||||
default:
|
||||
i-- // Reconsume.
|
||||
state = 50
|
||||
}
|
||||
|
||||
case 49: // 13.2.5.49 Comment less-than sign bang dash dash state.
|
||||
switch nextInputCharacter {
|
||||
case '>':
|
||||
break loop
|
||||
default:
|
||||
i-- // Reconsume.
|
||||
state = 51
|
||||
}
|
||||
|
||||
case 50: // 13.2.5.50 Comment end dash state.
|
||||
switch nextInputCharacter {
|
||||
case '-':
|
||||
state = 51
|
||||
default:
|
||||
wantBuffer = append(wantBuffer, '-')
|
||||
i-- // Reconsume.
|
||||
state = 45
|
||||
}
|
||||
|
||||
case 51: // 13.2.5.51 Comment end state.
|
||||
switch nextInputCharacter {
|
||||
case '!':
|
||||
state = 52
|
||||
case '-':
|
||||
wantBuffer = append(wantBuffer, '-')
|
||||
case '>':
|
||||
break loop
|
||||
default:
|
||||
wantBuffer = append(wantBuffer, "--"...)
|
||||
i-- // Reconsume.
|
||||
state = 45
|
||||
}
|
||||
|
||||
case 52: // 13.2.5.52 Comment end bang state.
|
||||
switch nextInputCharacter {
|
||||
case '-':
|
||||
wantBuffer = append(wantBuffer, "--!"...)
|
||||
state = 50
|
||||
case '>':
|
||||
break loop
|
||||
default:
|
||||
wantBuffer = append(wantBuffer, "--!"...)
|
||||
i-- // Reconsume.
|
||||
state = 45
|
||||
}
|
||||
|
||||
default:
|
||||
t.Fatalf("input=%q: unexpected state %d", b, state)
|
||||
}
|
||||
}
|
||||
|
||||
wantRemainder := ""
|
||||
if i < len(b) {
|
||||
wantRemainder = string(b[i:])
|
||||
}
|
||||
wantComment := string(wantBuffer)
|
||||
if (gotComment != wantComment) || (gotRemainder != wantRemainder) {
|
||||
t.Errorf("input=%q\ngot: %q + %q\nwant: %q + %q",
|
||||
b, gotComment, gotRemainder, wantComment, wantRemainder)
|
||||
}
|
||||
}
|
||||
|
||||
// This table below summarizes the HTML-comment-related state machine from
|
||||
// 13.2.5.43 "Comment start state" and subsequent sections.
|
||||
// https://html.spec.whatwg.org/#comment-start-state
|
||||
//
|
||||
// Get to state 13.2.5.43 after seeing "<!--". Specifically, starting from the
|
||||
// initial 13.2.5.1 "Data state":
|
||||
// - "<" moves to 13.2.5.6 "Tag open state",
|
||||
// - "!" moves to 13.2.5.42 "Markup declaration open state",
|
||||
// - "--" moves to 13.2.5.43 "Comment start state".
|
||||
// Each of these transitions are the only way to get to the 6/42/43 states.
|
||||
//
|
||||
// State ! - < > NUL EOF default HTML spec section
|
||||
// 43 ... s44 ... s01.T.E0 ... ... r45 13.2.5.43 Comment start state
|
||||
// 44 ... s51 ... s01.T.E0 ... T.Z.E1 r45.A- 13.2.5.44 Comment start dash state
|
||||
// 45 ... s50 s46.A< ... t45.A?.E2 T.Z.E1 t45.Ax 13.2.5.45 Comment state
|
||||
// 46 s47.A! ... t46.A< ... ... ... r45 13.2.5.46 Comment less-than sign state
|
||||
// 47 ... s48 ... ... ... ... r45 13.2.5.47 Comment less-than sign bang state
|
||||
// 48 ... s49 ... ... ... ... r50 13.2.5.48 Comment less-than sign bang dash state
|
||||
// 49 ... ... ... s01.T ... T.Z.E1 r51.E3 13.2.5.49 Comment less-than sign bang dash dash state
|
||||
// 50 ... s51 ... ... ... T.Z.E1 r45.A- 13.2.5.50 Comment end dash state
|
||||
// 51 s52 t51.A- ... s01.T ... T.Z.E1 r45.A-- 13.2.5.51 Comment end state
|
||||
// 52 ... s50.A--! ... s01.T.E4 ... T.Z.E1 r45.A--! 13.2.5.52 Comment end bang state
|
||||
//
|
||||
// State 43 is the "Comment start state" meaning that we've only seen "<!--"
|
||||
// and nothing else. Similarly, state 44 means that we've only seen "<!---",
|
||||
// with three dashes, and nothing else. For the other states, we deduce
|
||||
// (working backwards) that the immediate prior input must be:
|
||||
// - 45 something that's not '-'
|
||||
// - 46 "<"
|
||||
// - 47 "<!"
|
||||
// - 48 "<!-"
|
||||
// - 49 "<!--" not including the opening "<!--"
|
||||
// - 50 "-" not including the opening "<!--" and also not "--"
|
||||
// - 51 "--" not including the opening "<!--"
|
||||
// - 52 "--!"
|
||||
//
|
||||
// The table cell actions:
|
||||
// - ... do the default action
|
||||
// - A! append "!" to the comment token's data.
|
||||
// - A- append "-" to the comment token's data.
|
||||
// - A-- append "--" to the comment token's data.
|
||||
// - A--! append "--!" to the comment token's data.
|
||||
// - A< append "<" to the comment token's data.
|
||||
// - A? append "\uFFFD" to the comment token's data.
|
||||
// - Ax append the current input character to the comment token's data.
|
||||
// - E0 parse error (abrupt-closing-of-empty-comment).
|
||||
// - E1 parse error (eof-in-comment).
|
||||
// - E2 parse error (unexpected-null-character).
|
||||
// - E3 parse error (nested-comment).
|
||||
// - E4 parse error (incorrectly-closed-comment).
|
||||
// - T emit the current comment token.
|
||||
// - Z emit an end-of-file token.
|
||||
// - rNN reconsume in the 13.2.5.NN state (after any A* or E* operations).
|
||||
// - s01 switch to the 13.2.5.1 Data state (after any A* or E* operations).
|
||||
// - sNN switch to the 13.2.5.NN state (after any A* or E* operations).
|
||||
// - tNN stay in the 13.2.5.NN state (after any A* or E* operations).
|
||||
//
|
||||
// The E* actions are called errors in the HTML spec but they are not fatal
|
||||
// (https://html.spec.whatwg.org/#parse-errors says "may [but not must] abort
|
||||
// the parser"). They are warnings that, in practice, browsers simply ignore.
|
||||
@@ -598,6 +598,11 @@ scriptDataDoubleEscapeEnd:
|
||||
// readComment reads the next comment token starting with "<!--". The opening
|
||||
// "<!--" has already been consumed.
|
||||
func (z *Tokenizer) readComment() {
|
||||
// When modifying this function, consider manually increasing the suffixLen
|
||||
// constant in func TestComments, from 6 to e.g. 9 or more. That increase
|
||||
// should only be temporary, not committed, as it exponentially affects the
|
||||
// test running time.
|
||||
|
||||
z.data.start = z.raw.end
|
||||
defer func() {
|
||||
if z.data.end < z.data.start {
|
||||
@@ -611,11 +616,7 @@ func (z *Tokenizer) readComment() {
|
||||
for {
|
||||
c := z.readByte()
|
||||
if z.err != nil {
|
||||
// Ignore up to two dashes at EOF.
|
||||
if dashCount > 2 {
|
||||
dashCount = 2
|
||||
}
|
||||
z.data.end = z.raw.end - dashCount
|
||||
z.data.end = z.calculateAbruptCommentDataEnd()
|
||||
return
|
||||
}
|
||||
switch c {
|
||||
@@ -631,12 +632,15 @@ func (z *Tokenizer) readComment() {
|
||||
if dashCount >= 2 {
|
||||
c = z.readByte()
|
||||
if z.err != nil {
|
||||
z.data.end = z.raw.end
|
||||
z.data.end = z.calculateAbruptCommentDataEnd()
|
||||
return
|
||||
}
|
||||
if c == '>' {
|
||||
} else if c == '>' {
|
||||
z.data.end = z.raw.end - len("--!>")
|
||||
return
|
||||
} else if c == '-' {
|
||||
dashCount = 1
|
||||
beginning = false
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -645,6 +649,35 @@ func (z *Tokenizer) readComment() {
|
||||
}
|
||||
}
|
||||
|
||||
func (z *Tokenizer) calculateAbruptCommentDataEnd() int {
|
||||
raw := z.Raw()
|
||||
const prefixLen = len("<!--")
|
||||
if len(raw) >= prefixLen {
|
||||
raw = raw[prefixLen:]
|
||||
if hasSuffix(raw, "--!") {
|
||||
return z.raw.end - 3
|
||||
} else if hasSuffix(raw, "--") {
|
||||
return z.raw.end - 2
|
||||
} else if hasSuffix(raw, "-") {
|
||||
return z.raw.end - 1
|
||||
}
|
||||
}
|
||||
return z.raw.end
|
||||
}
|
||||
|
||||
func hasSuffix(b []byte, suffix string) bool {
|
||||
if len(b) < len(suffix) {
|
||||
return false
|
||||
}
|
||||
b = b[len(b)-len(suffix):]
|
||||
for i := range b {
|
||||
if b[i] != suffix[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// readUntilCloseAngle reads until the next ">".
|
||||
func (z *Tokenizer) readUntilCloseAngle() {
|
||||
z.data.start = z.raw.end
|
||||
|
||||
@@ -295,7 +295,7 @@ var tokenTests = []tokenTest{
|
||||
"<?xml?>",
|
||||
"<!--?xml?-->",
|
||||
},
|
||||
// Comments.
|
||||
// Comments. See also func TestComments.
|
||||
{
|
||||
"comment0",
|
||||
"abc<b><!-- skipme --></b>def",
|
||||
@@ -376,6 +376,41 @@ var tokenTests = []tokenTest{
|
||||
"a<!-- !-->z",
|
||||
"a$<!-- !-->$z",
|
||||
},
|
||||
{
|
||||
"comment16",
|
||||
"a<!--i\x00j-->z",
|
||||
"a$<!--i\uFFFDj-->$z",
|
||||
},
|
||||
{
|
||||
"comment17",
|
||||
"a<!--\x00",
|
||||
"a$<!--\uFFFD-->",
|
||||
},
|
||||
{
|
||||
"comment18",
|
||||
"a<!--<!-->z",
|
||||
"a$<!--<!-->$z",
|
||||
},
|
||||
{
|
||||
"comment19",
|
||||
"a<!--<!--",
|
||||
"a$<!--<!-->",
|
||||
},
|
||||
{
|
||||
"comment20",
|
||||
"a<!--ij--kl-->z",
|
||||
"a$<!--ij--kl-->$z",
|
||||
},
|
||||
{
|
||||
"comment21",
|
||||
"a<!--ij--kl--!>z",
|
||||
"a$<!--ij--kl-->$z",
|
||||
},
|
||||
{
|
||||
"comment22",
|
||||
"a<!--!--!<--!-->z",
|
||||
"a$<!--!--!<--!-->$z",
|
||||
},
|
||||
// An attribute with a backslash.
|
||||
{
|
||||
"backslash",
|
||||
|
||||
Reference in New Issue
Block a user