ref: 474117ed563f8f84f11d3dcf90635c584be29ec1
parent: 17128cefa8384e9433de8a725686b4e544a83308
author: Sigrid Haflínudóttir <ftrvxmtrx@gmail.com>
date: Mon Aug 31 12:27:10 EDT 2020
move xref logic into a separate file
--- a/misc.c
+++ b/misc.c
@@ -72,3 +72,15 @@
c == '[' || c == ']' || c == '{' || c == '}' ||
c == '/' || c == '%';
}
+
+/* read a number with Bgetd and store it in *i; returns 1 on success, -1 if none was read or it does not fit an int */
+int
+Bgetint(Biobuf *b, int *i)
+{
+	double d;
+
+	if(Bgetd(b, &d) != 1 || isNaN(d) || d < -2147483648.0 || d > 2147483647.0) /* bounds assume a 32-bit int */
+		return -1;
+	*i = d; /* in range now, so the conversion is defined; any fraction is dropped */
+	return 1;
+}
--- a/mkfile
+++ b/mkfile
@@ -18,6 +18,7 @@
pdffs.$O\
stream.$O\
string.$O\
+ xref.$O\
HFILES=\
pdf.h\
--- a/pdf.c
+++ b/pdf.c
@@ -4,80 +4,7 @@
#include <ctype.h>
#include "pdf.h"
-int Tfmt(Fmt *f);
-int ⊗fmt(Fmt *f);
-
-/*
- * pre-1.5 xref section reader
- * PDF>=1.5 may have BOTH (or either) old xref format and xref streams
- */
static int
-xrefread(Pdf *pdf, int xref0, int nxref)
-{
- int i, j, sz, n, newnxref;
- Xref xref;
- char *s, *e;
- Xref *x;
-
- s = nil;
- if((x = realloc(pdf->xref, (pdf->nxref + nxref)*sizeof(Xref))) == nil)
- goto err;
- pdf->xref = x;
-
- /* read the entire thing at once */
- sz = nxref*20;
- if((s = malloc(sz)) == nil)
- goto err;
- for(i = 0; i < sz; i += n){
- if((n = Bread(pdf->bio, s+i, sz-i)) < 1)
- goto err;
- }
-
- /* store non-free objects only */
- newnxref = pdf->nxref;
- for(e = s, i = 0; i < nxref; i++, e += 20){
- if(!isspace(e[10]) || !isspace(e[18]) || !isspace(e[19])){
- werrstr("invalid xref line (%d/%d)", i, nxref);
- goto err;
- }
- xref.id = xref0 + i;
- xref.off = strtoul(e, nil, 10);
- /* FIXME xref.gen */
- xref.type = Xusual;
-
- /* search in already existing xrefs, update if found */
- for(j = 0; j < pdf->nxref; j++){
- if(pdf->xref[j].id != xref.id)
- continue;
- if(e[17] == 'f') /* it was freed */
- pdf->xref[j].id = 0;
- else if(e[17] == 'n')
- pdf->xref[j].off = xref.off;
- break;
- }
- if(j >= pdf->nxref && e[17] == 'n') /* that's a new one, insert unless it's free */
- pdf->xref[newnxref++] = xref;
- }
- free(s);
- s = nil;
-
- /* scale down */
- for(i = j = 0; i < newnxref; i++){
- if(pdf->xref[i].id != 0)
- pdf->xref[j++] = pdf->xref[i];
- }
- if((x = realloc(pdf->xref, j*sizeof(Xref))) == nil)
- goto err;
- pdf->xref = x;
- pdf->nxref = j;
-
- return 0;
-err:
- free(s);
- return -1;
-}
-
-static int
trailerread(Pdf *pdf)
{
Object *o;
@@ -100,104 +27,6 @@
return -1;
}
-static int
-getint(uchar *b, int sz, int dflt)
-{
- int x, i;
-
- if(sz == 0)
- return dflt;
- x = 0;
- for(i = 0; i < sz; i++)
- x = x<<8 | b[i];
-
- return x;
-}
-
-/* 7.5.8.3 */
-static int
-xrefstreamread(Pdf *pdf)
-{
- Object *o;
- Stream *s;
- Xref *x;
- uchar buf[32];
- int w[8], nw, i, c, n, nxref, newnxref, prev, extra;
-
- s = nil;
- if((o = pdfobj(pdf, pdf->bio)) == nil){
- werrstr("xref stream obj: %r");
- goto err;
- }
- if((prev = dictint(o, "Prev")) > 0){
- if(Bseek(pdf->bio, prev, 0) != prev){
- werrstr("xref stream prev seek failed");
- goto err;
- }
- if(xrefstreamread(pdf) != 0){
- pdfobjfree(o);
- return -1;
- }
- }
- if((s = streamopen(o)) == nil){
- werrstr("failed to stream xref: %r");
- goto err;
- }
- if((nw = dictints(o, "W", w, nelem(w))) < 3 || nw >= nelem(w)){
- werrstr("nW=%d", nw);
- goto err;
- }
-
- for(n = i = 0; i < nw; i++)
- n += w[i]; /* size of each element. w[i] MAY be 0 */
- if(n > sizeof(buf)){
- werrstr("W is beyond imaginable: %d bytes", n);
- goto err;
- }
- if((nxref = streamsize(s)/n) < 1){
- werrstr("no xref elements in the stream");
- goto err;
- }
- extra = streamsize(s) % (nxref*n);
- if(extra != 0)
- fprint(2, "extra %d bytes in xref stream", extra);
-
- newnxref = pdf->nxref + nxref;
- if((x = realloc(pdf->xref, newnxref*sizeof(Xref))) == nil)
- goto err;
- pdf->xref = x;
- x += pdf->nxref;
- while(Bread(s->bio, buf, n) == n){ /* stop on short read or error */
- c = getint(buf, w[0], 1); /* default type is 1 */
- if(c == 1){ /* not compressed */
- x->off = getint(buf+w[0], w[1], 0);
- x->gen = getint(buf+w[0]+w[1], w[2], 0);
- x->type = Xuncompressed;
- pdf->nxref++;
- fprint(2, "xref %⊗\n", *x);
- x++;
- }else if(c == 2){ /* compressed */
- x->objnum = getint(buf+w[0], w[1], 0);
- x->id = getint(buf+w[0]+w[1], w[2], 0);
- x->type = Xcompressed;
- pdf->nxref++;
- fprint(2, "xref %⊗\n", *x);
- x++;
- }
- }
-
- streamclose(s);
- pdf->root = pdfref(dictget(o, "Root"));
- pdf->info = pdfref(dictget(o, "Info"));
- pdfobjfree(o);
-
- return 0;
-err:
- streamclose(s);
- pdfobjfree(o);
- return -1;
-}
-
Pdf *
pdfopen(Biobuf *b)
{
@@ -204,8 +33,6 @@
Pdf *pdf;
Object *o;
char tmp[64], *s, *x;
- int xref0; /* 7.5.4 xref subsection first object number */
- int nxref; /* 7.5.4 xref subsection number of objects */
int xreftb; /* 7.5.4 xref table offset from the beginning of the file */
int i, n, off;
@@ -252,40 +79,33 @@
werrstr("xref position out of range");
goto err;
}
-morexref:
- off = Boffset(b);
- n = sizeof(tmp)-1;
- if((n = Bread(b, tmp, n)) < 16){
+ for(;;){
+ off = Boffset(b);
+ if(Bread(b, tmp, sizeof(tmp)) < 8){
badxref:
- werrstr("invalid xref: %r");
- goto err;
- }
- tmp[n] = 0;
- if(memcmp(tmp, "xref", 4) == 0){
- /* 7.5.4 xref */
- x = tmp+4;
- xref0 = strtol(x, &x, 10);
- nxref = strtol(x, &x, 10);
- /* skip whitespace and move to the first subsection */
- for(; isws(*x) && x < tmp+n; x++);
- n = x-tmp+off;
- if(Bseek(b, n, 0) != n)
- goto badxref;
- if(xref0 >= 0 && nxref > 0 && xrefread(pdf, xref0, nxref) != 0)
- goto badxref;
- goto morexref; /* there could be more updates, try it */
- }else if(memcmp(tmp, "trailer", 7) == 0){ /* 7.5.5 file trailer */
- /* move to the trailer dictionary */
- n = off + 8;
- if(Bseek(b, n, 0) != n || trailerread(pdf) != 0){
- werrstr("invalid trailer: %r");
+ werrstr("invalid xref: %r");
goto err;
}
- }else if(isdigit(tmp[0])){ /* could be 7.5.8 xref stream (since PDF 1.5) */
- if(Bseek(b, xreftb, 0) != xreftb)
- goto badxref;
- if(xrefstreamread(pdf) != 0)
- goto err;
+ if(memcmp(tmp, "xref", 4) == 0){
+ if(Bseek(b, -sizeof(tmp)+5, 1) < 0 || xrefreadold(pdf) != 0)
+ goto err;
+ /* there could be more updates, try it */
+ }else if(memcmp(tmp, "trailer", 7) == 0){ /* 7.5.5 file trailer */
+ /* move to the trailer dictionary */
+ n = off + 8;
+ if(Bseek(b, n, 0) != n || trailerread(pdf) != 0){
+ werrstr("invalid trailer: %r");
+ goto err;
+ }
+ /* trailer is supposed to be the last thing */
+ break;
+ }else if(isdigit(tmp[0])){ /* could be 7.5.8 xref stream (since PDF 1.5) */
+ if(Bseek(b, xreftb, 0) != xreftb)
+ goto badxref;
+ if(xrefreadstream(pdf) != 0)
+ goto err;
+ break;
+ }
}
/* root is required, info is optional */
--- a/pdf.h
+++ b/pdf.h
@@ -21,11 +21,7 @@
typedef struct Pdf Pdf;
typedef struct Stream Stream;
typedef struct Xref Xref;
-#pragma incomplete Filter
-#pragma varargck type "T" Object*
-#pragma varargck type "⊗" Xref
-
struct Buffer {
uchar *b;
int ro;
@@ -177,3 +173,12 @@
int bufput(Buffer *b, uchar *d, int sz);
int bufget(Buffer *b, uchar *d, int sz);
void bufdump(Buffer *b);
+
+#pragma varargck type "T" Object*
+#pragma varargck type "⊗" Xref
+int Tfmt(Fmt *f); /* presumably the formatter behind the "T" verb (Object*) checked above -- confirm */
+int ⊗fmt(Fmt *f); /* presumably the formatter behind the "⊗" verb (Xref) checked above -- confirm */
+int Bgetint(Biobuf *b, int *i); /* misc.c: reads a number into *i; 1 on success, -1 on failure */
+
+int xrefreadold(Pdf *pdf); /* xref.c: 7.5.4 xref table subsection; 0 on success, -1 on error */
+int xrefreadstream(Pdf *pdf); /* xref.c: 7.5.8.3 xref stream; 0 on success, -1 on error */
--- /dev/null
+++ b/xref.c
@@ -1,0 +1,189 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <ctype.h>
+#include "pdf.h"
+
+/*
+ * 7.5.4 pre-1.5 xref section reader
+ * PDF>=1.5 may have BOTH (or either) old xref format and xref streams
+ * Reads one subsection from pdf->bio and merges it into pdf->xref; returns 0 on success, -1 on error.
+ */
+int
+xrefreadold(Pdf *pdf)
+{
+	int xref0; /* 7.5.4 xref subsection first object number */
+	int nxref; /* 7.5.4 xref subsection number of objects */
+	int i, j, sz, n, newnxref;
+	Xref xref;
+	char *s, *e;
+	Xref *x;
+
+	if(Bgetint(pdf->bio, &xref0) != 1 || xref0 < 0){
+		werrstr("invalid xref0");
+		return -1;
+	}
+	if(Bgetint(pdf->bio, &nxref) != 1 || nxref < 0 || nxref > 0x7fffffff/20){ /* bound keeps nxref*20 within an int */
+		werrstr("invalid nxref");
+		return -1;
+	}
+
+	/* skip whitespace and move to the first subsection */
+	while(isspace(Bgetc(pdf->bio)));
+	Bungetc(pdf->bio);
+	if(nxref == 0) /* empty subsection: nothing to read, and no zero-sized allocations below */
+		return 0;
+
+	s = nil;
+	if((x = realloc(pdf->xref, (pdf->nxref + nxref)*sizeof(Xref))) == nil) /* NOTE(review): product is not overflow-checked */
+		goto err;
+	pdf->xref = x;
+
+	/* read the entire thing at once */
+	sz = nxref*20;
+	if((s = malloc(sz)) == nil)
+		goto err;
+	for(i = 0; i < sz; i += n){
+		if((n = Bread(pdf->bio, s+i, sz-i)) < 1)
+			goto err;
+	}
+
+	/* store non-free objects only */
+	newnxref = pdf->nxref;
+	for(e = s, i = 0; i < nxref; i++, e += 20){
+		if(!isspace(e[10]) || !isspace(e[18]) || !isspace(e[19])){
+			werrstr("invalid xref line (%d/%d)", i, nxref);
+			goto err;
+		}
+		xref.id = xref0 + i;
+		xref.off = strtoul(e, nil, 10);
+		/* FIXME xref.gen */
+		xref.type = Xusual;
+
+		/* search in already existing xrefs, update if found */
+		for(j = 0; j < pdf->nxref; j++){
+			if(pdf->xref[j].id != xref.id)
+				continue;
+			if(e[17] == 'f') /* it was freed */
+				pdf->xref[j].id = 0;
+			else if(e[17] == 'n')
+				pdf->xref[j].off = xref.off;
+			break;
+		}
+		if(j >= pdf->nxref && e[17] == 'n') /* that's a new one, insert unless it's free */
+			pdf->xref[newnxref++] = xref;
+	}
+	free(s);
+
+	/* scale down: drop the entries whose id is zero */
+	for(i = j = 0; i < newnxref; i++){
+		if(pdf->xref[i].id != 0)
+			pdf->xref[j++] = pdf->xref[i];
+	}
+	if(j > 0 && (x = realloc(pdf->xref, j*sizeof(Xref))) != nil) /* shrinking is optional; otherwise keep the larger array */
+		pdf->xref = x;
+	pdf->nxref = j;
+	return 0;
+err:
+	free(s);
+	return -1;
+}
+
+/* decode sz big-endian bytes at b; a zero-width field (sz == 0) yields dflt */
+static int
+getint(uchar *b, int sz, int dflt)
+{
+	unsigned int x; /* unsigned: shifting bits into or past the sign of an int is undefined */
+	int i;
+
+	if(sz == 0)
+		return dflt;
+	for(x = 0, i = 0; i < sz; i++)
+		x = x<<8 | b[i];
+	return x;
+}
+
+/*
+ * 7.5.8.3 xref stream reader: parses the object at the current offset of pdf->bio ("Prev" is followed first),
+ * appends its type 1 and 2 entries to pdf->xref, sets pdf->root and pdf->info; returns 0 on success, -1 on error.
+ */
+int
+xrefreadstream(Pdf *pdf)
+{
+	Object *o;
+	Stream *s;
+	Xref *x;
+	uchar buf[32];
+	int w[8], nw, i, c, n, nxref, prev, extra;
+
+	s = nil;
+	if((o = pdfobj(pdf, pdf->bio)) == nil){
+		werrstr("xref stream obj: %r");
+		goto err;
+	}
+	if((prev = dictint(o, "Prev")) > 0){ /* NOTE(review): nothing here stops a "Prev" cycle from recursing forever */
+		if(Bseek(pdf->bio, prev, 0) != prev){
+			werrstr("xref stream prev seek failed");
+			goto err;
+		}
+		if(xrefreadstream(pdf) != 0){
+			pdfobjfree(o);
+			return -1;
+		}
+	}
+	if((s = streamopen(o)) == nil){
+		werrstr("failed to stream xref: %r");
+		goto err;
+	}
+	if((nw = dictints(o, "W", w, nelem(w))) < 3 || nw >= nelem(w)){
+		werrstr("nW=%d", nw);
+		goto err;
+	}
+
+	for(n = i = 0; i < nw && w[i] >= 0 && w[i] <= sizeof(buf); i++) /* stops early on a negative or oversized width */
+		n += w[i]; /* size of each element. w[i] MAY be 0 */
+	if(i < nw || n < 1 || n > sizeof(buf)){ /* bad width, or n == 0 which would divide by zero below */
+		werrstr("W is beyond imaginable: %d bytes", n);
+		goto err;
+	}
+	if((nxref = streamsize(s)/n) < 1){
+		werrstr("no xref elements in the stream");
+		goto err;
+	}
+	extra = streamsize(s) % (nxref*n);
+	if(extra != 0)
+		fprint(2, "extra %d bytes in xref stream\n", extra);
+
+	if((x = realloc(pdf->xref, (pdf->nxref + nxref)*sizeof(Xref))) == nil)
+		goto err;
+	pdf->xref = x;
+	x += pdf->nxref; /* new entries go after the existing ones */
+	while(Bread(s->bio, buf, n) == n){ /* stop on short read or error */
+		c = getint(buf, w[0], 1); /* default type is 1 */
+		if(c == 1){ /* not compressed */
+			x->off = getint(buf+w[0], w[1], 0);
+			x->gen = getint(buf+w[0]+w[1], w[2], 0);
+			x->type = Xuncompressed; /* NOTE(review): x->id is not assigned for this entry type */
+			pdf->nxref++;
+			fprint(2, "xref %⊗\n", *x);
+			x++;
+		}else if(c == 2){ /* compressed */
+			x->objnum = getint(buf+w[0], w[1], 0);
+			x->id = getint(buf+w[0]+w[1], w[2], 0);
+			x->type = Xcompressed;
+			pdf->nxref++;
+			fprint(2, "xref %⊗\n", *x);
+			x++;
+		} /* any other type is skipped */
+	}
+	streamclose(s);
+	pdf->root = pdfref(dictget(o, "Root"));
+	pdf->info = pdfref(dictget(o, "Info"));
+	pdfobjfree(o);
+	return 0;
+err:
+	streamclose(s);
+	pdfobjfree(o);
+	return -1;
+}
+