Skip to content

Commit 384e4d2

Browse files
Michael Piatek authored and bradfitz committed
html: limit buffering during tokenization.
This is optional. By default, buffering is unlimited. Fixes golang/go#7053 R=bradfitz CC=golang-codereviews https://golang.org/cl/43190044
1 parent 480e7b0 commit 384e4d2

File tree

2 files changed

+73
-0
lines changed

2 files changed

+73
-0
lines changed

html/token.go

+16
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package html
66

77
import (
88
"bytes"
9+
"errors"
910
"io"
1011
"strconv"
1112
"strings"
@@ -33,6 +34,9 @@ const (
3334
DoctypeToken
3435
)
3536

37+
// ErrBufferExceeded is the sentinel error recorded on a Tokenizer when the
// data buffered for the current token reaches the limit configured with
// SetMaxBuf.
var ErrBufferExceeded = errors.New("max buffer exceeded")
39+
3640
// String returns a string representation of the TokenType.
3741
func (t TokenType) String() string {
3842
switch t {
@@ -142,6 +146,8 @@ type Tokenizer struct {
142146
// buf[raw.end:] is buffered input that will yield future tokens.
143147
raw span
144148
buf []byte
149+
// maxBuf limits the data buffered in buf. A value of 0 means unlimited.
150+
maxBuf int
145151
// buf[data.start:data.end] holds the raw bytes of the current token's data:
146152
// a text token's text, a tag token's tag name, etc.
147153
data span
@@ -273,6 +279,10 @@ func (z *Tokenizer) readByte() byte {
273279
}
274280
x := z.buf[z.raw.end]
275281
z.raw.end++
282+
if z.maxBuf > 0 && z.raw.end-z.raw.start >= z.maxBuf {
283+
z.err = ErrBufferExceeded
284+
return 0
285+
}
276286
return x
277287
}
278288

@@ -1167,6 +1177,12 @@ func (z *Tokenizer) Token() Token {
11671177
return t
11681178
}
11691179

1180+
// SetMaxBuf sets a limit on the amount of data buffered during tokenization.
1181+
// A value of 0 means unlimited.
1182+
func (z *Tokenizer) SetMaxBuf(n int) {
1183+
z.maxBuf = n
1184+
}
1185+
11701186
// NewTokenizer returns a new HTML Tokenizer for the given Reader.
11711187
// The input is assumed to be UTF-8 encoded.
11721188
func NewTokenizer(r io.Reader) *Tokenizer {

html/token_test.go

+57
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,63 @@ loop:
469469
}
470470
}
471471

472+
func TestMaxBuffer(t *testing.T) {
473+
// Exceeding the maximum buffer size generates ErrBufferExceeded.
474+
z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
475+
z.SetMaxBuf(5)
476+
tt := z.Next()
477+
if got, want := tt, ErrorToken; got != want {
478+
t.Fatalf("token type: got: %v want: %v", got, want)
479+
}
480+
if got, want := z.Err(), ErrBufferExceeded; got != want {
481+
t.Errorf("error type: got: %v want: %v", got, want)
482+
}
483+
if got, want := string(z.Raw()), "<tttt"; got != want {
484+
t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
485+
}
486+
}
487+
488+
func TestMaxBufferReconstruction(t *testing.T) {
489+
// Exceeding the maximum buffer size at any point while tokenizing permits
490+
// reconstructing the original input.
491+
tests:
492+
for _, test := range tokenTests {
493+
buffer:
494+
for maxBuf := 1; ; maxBuf++ {
495+
r := strings.NewReader(test.html)
496+
z := NewTokenizer(r)
497+
z.SetMaxBuf(maxBuf)
498+
var tokenized bytes.Buffer
499+
for {
500+
tt := z.Next()
501+
tokenized.Write(z.Raw())
502+
if tt == ErrorToken {
503+
if z.Err() == ErrBufferExceeded {
504+
continue buffer
505+
}
506+
// EOF is expected, and indicates that we found the max maxBuf that
507+
// generates ErrBufferExceeded, so continue to the next test.
508+
if err := z.Err(); err != io.EOF {
509+
t.Errorf("%s: unexpected error: %v", test.desc, err)
510+
}
511+
break
512+
}
513+
}
514+
// Anything tokenizing along with input left in the reader.
515+
assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, r))
516+
if err != nil {
517+
t.Errorf("%s: ReadAll: %v", test.desc, err)
518+
continue tests
519+
}
520+
if got, want := string(assembled), test.html; got != want {
521+
t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
522+
continue tests
523+
}
524+
break
525+
} // buffer sizes
526+
} // tests
527+
}
528+
472529
func TestPassthrough(t *testing.T) {
473530
// Accumulating the raw output for each parse event should reconstruct the
474531
// original input.

0 commit comments

Comments
 (0)