diff --git a/html/token.go b/html/token.go index 3c57880d..6598c1f7 100644 --- a/html/token.go +++ b/html/token.go @@ -839,8 +839,22 @@ func (z *Tokenizer) readStartTag() TokenType { if raw { z.rawTag = strings.ToLower(string(z.buf[z.data.start:z.data.end])) } - // Look for a self-closing token like "<br/>". - if z.err == nil && z.buf[z.raw.end-2] == '/' { + // Look for a self-closing token (e.g. <br/>). + // + // Originally, we did this by just checking that the last character of the + // tag (ignoring the closing bracket) was a solidus (/) character, but this + // is not always accurate. + // + // We need to be careful that we don't misinterpret a non-self-closing tag + // as self-closing, as can happen if the tag contains unquoted attribute + // values (e.g. <p a=/>). + // + // To avoid this, we check that the last non-bracket character of the tag + // (z.raw.end-2) isn't the same character as the last non-quote character of + // the last attribute of the tag (z.attr[nAttrs-1][1].end-1), if the tag has + // attributes. + nAttrs := len(z.attr) + if z.err == nil && z.buf[z.raw.end-2] == '/' && (nAttrs == 0 || z.raw.end-2 != z.attr[nAttrs-1][1].end-1) { return SelfClosingTagToken } return StartTagToken diff --git a/html/token_test.go b/html/token_test.go index a36d112d..44773f17 100644 --- a/html/token_test.go +++ b/html/token_test.go @@ -616,6 +616,16 @@ var tokenTests = []tokenTest{ `

`, `

`, }, + { + "slash at end of unquoted attribute value", + `

`, + `

`, + }, + { + "self-closing tag with attribute", + `

`, + `

`, + }, } func TestTokenizer(t *testing.T) { @@ -815,6 +825,14 @@ func TestReaderEdgeCases(t *testing.T) { } } +func TestSelfClosingTagValueConfusion(t *testing.T) { + z := NewTokenizer(strings.NewReader(`

`)) + tok := z.Next() + if tok != StartTagToken { + t.Fatalf("unexpected token type: got %s, want %s", tok, StartTagToken) + } +} + // zeroOneByteReader is like a strings.Reader that alternates between // returning 0 bytes and 1 byte at a time. type zeroOneByteReader struct {