mirror of
https://github.com/golang/net.git
synced 2026-03-31 10:27:08 +09:00
html: limit buffering during tokenization.
This is optional. By default, buffering is unlimited. Fixes golang/go#7053 R=bradfitz CC=golang-codereviews https://golang.org/cl/43190044
This commit is contained in:
committed by
Brad Fitzpatrick
parent
480e7b06ec
commit
384e4d292e
@@ -6,6 +6,7 @@ package html
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"io"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -33,6 +34,9 @@ const (
|
||||
DoctypeToken
|
||||
)
|
||||
|
||||
// ErrBufferExceeded means that the buffering limit was exceeded. It is
// returned from the Tokenizer's Err method after Next reports an ErrorToken
// caused by the limit set via SetMaxBuf.
var ErrBufferExceeded = errors.New("max buffer exceeded")
|
||||
|
||||
// String returns a string representation of the TokenType.
|
||||
func (t TokenType) String() string {
|
||||
switch t {
|
||||
@@ -142,6 +146,8 @@ type Tokenizer struct {
|
||||
// buf[raw.end:] is buffered input that will yield future tokens.
|
||||
raw span
|
||||
buf []byte
|
||||
// maxBuf limits the data buffered in buf. A value of 0 means unlimited.
|
||||
maxBuf int
|
||||
// buf[data.start:data.end] holds the raw bytes of the current token's data:
|
||||
// a text token's text, a tag token's tag name, etc.
|
||||
data span
|
||||
@@ -273,6 +279,10 @@ func (z *Tokenizer) readByte() byte {
|
||||
}
|
||||
x := z.buf[z.raw.end]
|
||||
z.raw.end++
|
||||
if z.maxBuf > 0 && z.raw.end-z.raw.start >= z.maxBuf {
|
||||
z.err = ErrBufferExceeded
|
||||
return 0
|
||||
}
|
||||
return x
|
||||
}
|
||||
|
||||
@@ -1167,6 +1177,12 @@ func (z *Tokenizer) Token() Token {
|
||||
return t
|
||||
}
|
||||
|
||||
// SetMaxBuf sets a limit on the amount of data buffered during tokenization.
|
||||
// A value of 0 means unlimited.
|
||||
func (z *Tokenizer) SetMaxBuf(n int) {
|
||||
z.maxBuf = n
|
||||
}
|
||||
|
||||
// NewTokenizer returns a new HTML Tokenizer for the given Reader.
|
||||
// The input is assumed to be UTF-8 encoded.
|
||||
func NewTokenizer(r io.Reader) *Tokenizer {
|
||||
|
||||
@@ -469,6 +469,63 @@ loop:
|
||||
}
|
||||
}
|
||||
|
||||
func TestMaxBuffer(t *testing.T) {
|
||||
// Exceeding the maximum buffer size generates ErrBufferExceeded.
|
||||
z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
|
||||
z.SetMaxBuf(5)
|
||||
tt := z.Next()
|
||||
if got, want := tt, ErrorToken; got != want {
|
||||
t.Fatalf("token type: got: %v want: %v", got, want)
|
||||
}
|
||||
if got, want := z.Err(), ErrBufferExceeded; got != want {
|
||||
t.Errorf("error type: got: %v want: %v", got, want)
|
||||
}
|
||||
if got, want := string(z.Raw()), "<tttt"; got != want {
|
||||
t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMaxBufferReconstruction(t *testing.T) {
|
||||
// Exceeding the maximum buffer size at any point while tokenizing permits
|
||||
// reconstructing the original input.
|
||||
tests:
|
||||
for _, test := range tokenTests {
|
||||
buffer:
|
||||
for maxBuf := 1; ; maxBuf++ {
|
||||
r := strings.NewReader(test.html)
|
||||
z := NewTokenizer(r)
|
||||
z.SetMaxBuf(maxBuf)
|
||||
var tokenized bytes.Buffer
|
||||
for {
|
||||
tt := z.Next()
|
||||
tokenized.Write(z.Raw())
|
||||
if tt == ErrorToken {
|
||||
if z.Err() == ErrBufferExceeded {
|
||||
continue buffer
|
||||
}
|
||||
// EOF is expected, and indicates that we found the max maxBuf that
|
||||
// generates ErrBufferExceeded, so continue to the next test.
|
||||
if err := z.Err(); err != io.EOF {
|
||||
t.Errorf("%s: unexpected error: %v", test.desc, err)
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
// Anything tokenizing along with input left in the reader.
|
||||
assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, r))
|
||||
if err != nil {
|
||||
t.Errorf("%s: ReadAll: %v", test.desc, err)
|
||||
continue tests
|
||||
}
|
||||
if got, want := string(assembled), test.html; got != want {
|
||||
t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
|
||||
continue tests
|
||||
}
|
||||
break
|
||||
} // buffer sizes
|
||||
} // tests
|
||||
}
|
||||
|
||||
func TestPassthrough(t *testing.T) {
|
||||
// Accumulating the raw output for each parse event should reconstruct the
|
||||
// original input.
|
||||
|
||||
Reference in New Issue
Block a user