From 10d4674b4d1db7f7359ee8fb4d21e0021166d99c Mon Sep 17 00:00:00 2001 From: Josh Bleecher Snyder Date: Fri, 19 Sep 2014 18:08:31 -0700 Subject: [PATCH 1/5] pdf: add String method for ValueKind Generated by stringer. --- valuekind_string.go | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 valuekind_string.go diff --git a/valuekind_string.go b/valuekind_string.go new file mode 100644 index 0000000..006be9c --- /dev/null +++ b/valuekind_string.go @@ -0,0 +1,21 @@ +// generated by stringer -type=ValueKind; DO NOT EDIT + +package pdf + +import "fmt" + +const _ValueKind_name = "NullBoolIntegerRealStringNameDictArrayStream" + +var _ValueKind_index = [...]uint8{4, 8, 15, 19, 25, 29, 33, 38, 44} + +func (i ValueKind) String() string { + if i < 0 || i >= ValueKind(len(_ValueKind_index)) { + return fmt.Sprintf("ValueKind(%d)", i) + } + hi := _ValueKind_index[i] + lo := uint8(0) + if i > 0 { + lo = _ValueKind_index[i-1] + } + return _ValueKind_name[lo:hi] +} From e1e130ce877eafe870a6c752c05f25c0fadeabf8 Mon Sep 17 00:00:00 2001 From: Josh Bleecher Snyder Date: Fri, 19 Sep 2014 18:06:54 -0700 Subject: [PATCH 2/5] pdf: handle Array "Contents" Value Encountered parsing NYT crosswords. --- page.go | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/page.go b/page.go index 9c7d688..2ba876a 100644 --- a/page.go +++ b/page.go @@ -403,7 +403,23 @@ type gstate struct { // Content returns the page's content. func (p Page) Content() Content { - strm := p.V.Key("Contents") + switch v := p.V.Key("Contents"); v.Kind() { + case Stream: + return p.contentForStream(v) + case Array: + var c Content + for i := 0; i < v.Len(); i++ { + cfs := p.contentForStream(v.Index(i)) + c.Text = append(c.Text, cfs.Text...) + c.Rect = append(c.Rect, cfs.Rect...) + } + return c + default: + panic("bad content kind") + } +} + +func (p Page) contentForStream(strm Value) Content { var enc TextEncoding = &nopEncoder{} var g = gstate{ From e8eef97af8741c776645e1764eb8d3d360fdc21e Mon Sep 17 00:00:00 2001 From: Josh Bleecher Snyder Date: Fri, 19 Sep 2014 18:08:03 -0700 Subject: [PATCH 3/5] pdf: don't panic when popping from empty gstack Restoring from an empty graphics stack looks like a mistake in the PDF, but other PDF readers handle it fine, and ignoring it appears to work correctly. Encountered parsing NYT crosswords. --- page.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/page.go b/page.go index 2ba876a..424b9ef 100644 --- a/page.go +++ b/page.go @@ -500,9 +500,12 @@ func (p Page) contentForStream(strm Value) Content { gstack = append(gstack, g) case "Q": // restore graphics state - n := len(gstack) - 1 - g = gstack[n] - gstack = gstack[:n] + // gstack should not be empty...but sometimes it is + if len(gstack) > 0 { + n := len(gstack) - 1 + g = gstack[n] + gstack = gstack[:n] + } case "BT": // begin text (reset text matrix and line matrix) g.Tm = ident From cf21b8da348bb0731a913b21455721063a97464a Mon Sep 17 00:00:00 2001 From: Josh Bleecher Snyder Date: Fri, 19 Sep 2014 18:33:01 -0700 Subject: [PATCH 4/5] pdf: remove unnecessary string escaping --- read.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/read.go b/read.go index 6fe41b4..8636d25 100644 --- a/read.go +++ b/read.go @@ -67,6 +67,7 @@ import ( "crypto/cipher" "crypto/md5" "crypto/rc4" + "errors" "fmt" "io" "io/ioutil" @@ -128,7 +129,7 @@ func NewReader(f io.ReaderAt, size int64) (*Reader, error) { } buf = bytes.TrimRight(buf, "\r\n\t ") if !bytes.HasSuffix(buf, []byte("%%EOF")) { - return nil, fmt.Errorf("not a PDF file: missing %%%%EOF") + return nil, errors.New("not a PDF file: missing %%EOF") } i := findLastLine(buf, "startxref") if i < 0 { From 181e8f43db838853d1f55a7fccadb4ea2a9463bf Mon Sep 17 00:00:00 2001 From: Josh Bleecher Snyder Date: Mon, 22 Sep 2014 12:07:24 -0700 Subject: [PATCH 5/5] pdf: allow junk after %%EOF Some PDFs contain non-whitespace junk after the %%EOF marker. Other PDF parsers handle it fine. Encountered parsing NYT crosswords. --- read.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/read.go b/read.go index 8636d25..bfd5401 100644 --- a/read.go +++ b/read.go @@ -124,13 +124,11 @@ func NewReader(f io.ReaderAt, size int64) (*Reader, error) { const endChunk = 100 buf = make([]byte, endChunk) f.ReadAt(buf, end-endChunk) - for len(buf) > 0 && buf[len(buf)-1] == '\n' || buf[len(buf)-1] == '\r' { - buf = buf[:len(buf)-1] - } - buf = bytes.TrimRight(buf, "\r\n\t ") - if !bytes.HasSuffix(buf, []byte("%%EOF")) { + eof := bytes.LastIndex(buf, []byte("%%EOF")) + if eof == -1 { return nil, errors.New("not a PDF file: missing %%EOF") } + buf = buf[:eof] i := findLastLine(buf, "startxref") if i < 0 { return nil, fmt.Errorf("malformed PDF file: missing final startxref")