shithub: neatpost

Download patch

ref: fdd132556d3330e9557c36c7e5631844d7cddb37
parent: 68d9ac00bfee6a2667d8f981c24c9779aee5a012
author: Ali Gholami Rudi <ali@rudi.ir>
date: Sat Apr 14 20:42:53 EDT 2018

pdf: basic support for \X'pdf pic.pdf'

--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@
 CFLAGS = -Wall -O2 "-DTROFFFDIR=\"$(FDIR)\""
 LDFLAGS =
 OBJS = post.o ps.o font.o dev.o clr.o dict.o iset.o
-OBJSPDF = post.o pdf.o font.o dev.o clr.o dict.o iset.o sbuf.o
+OBJSPDF = post.o pdf.o pdfext.o font.o dev.o clr.o dict.o iset.o sbuf.o
 
 all: post pdf
 %.o: %.c post.h
--- a/pdf.c
+++ b/pdf.c
@@ -1,3 +1,4 @@
+/* PDF post processor functions */
 #include <fcntl.h>
 #include <stdarg.h>
 #include <stdio.h>
@@ -25,6 +26,8 @@
 static int p_f, p_s, p_m;	/* output font */
 static int o_queued;		/* queued character type */
 static char o_iset[1024];	/* fonts accesssed in this page */
+static int *xobj;		/* page xobject object ids  */
+static int xobj_sz, xobj_n;	/* number of xobjects */
 
 /* loaded PDF fonts */
 struct pfont {
@@ -58,6 +61,13 @@
 	pdf_pos += strlen(s);
 }
 
+/* write len raw bytes of s to stdout and advance pdf_pos by len */
+static void pdfmem(char *s, int len)
+{
+	fwrite(s, len, 1, stdout);
+	pdf_pos += len;
+}
+
 /* allocate an object number */
 static int obj_map(void)
 {
@@ -164,7 +174,7 @@
 	for (i = 0; i < n; i++) {
 		sbuf_chr(d, hex[((unsigned char) s[i]) >> 4]);
 		sbuf_chr(d, hex[((unsigned char) s[i]) & 0x0f]);
-		if (i % 80 == 79 && i + 1 < n)
+		if (i % 40 == 39 && i + 1 < n)
 			sbuf_chr(d, '\n');
 	}
 	sbuf_str(d, ">\n");
@@ -481,6 +491,130 @@
 {
 }
 
+static char *strcut(char *dst, char *src)	/* copy the first word of src into dst */
+{
+	while (*src == ' ' || *src == '\n')	/* skip leading blanks */
+		src++;
+	if (src[0] == '"') {			/* a double-quoted word */
+		src++;
+		while (*src && (src[0] != '"' || src[1] == '"')) {
+			if (*src == '"')	/* "" stands for a single quote */
+				src++;
+			*dst++ = *src++;
+		}
+		if (*src == '"')		/* skip the closing quote */
+			src++;
+	} else {
+		while (*src && *src != ' ' && *src != '\n')
+			*dst++ = *src++;
+	}
+	*dst = '\0';		/* NOTE(review): the size of dst is never checked */
+	return src;		/* the unread rest of src */
+}
+
+/* copy the pdf object at pos into a static buffer, cut to 255 bytes */
+static char *pdf_copy(char *pdf, int len, int pos)
+{
+	static char buf[256];	/* overwritten by the next call */
+	int datlen;
+	pos += pdf_ws(pdf, len, pos);
+	datlen = pdf_len(pdf, len, pos);
+	if (datlen > sizeof(buf) - 1)	/* leave room for the terminator */
+		datlen = sizeof(buf) - 1;
+	memcpy(buf, pdf + pos, datlen);
+	buf[datlen] = '\0';
+	return buf;
+}
+
+/* the length from pos through "endstream" for a stream of slen data bytes */
+static int pdf_slen(char *pdf, int len, int pos, int slen)
+{
+	int old = pos;
+	pos += pdf_ws(pdf, len, pos);
+	pos += strlen("stream");	/* assumes the stream keyword is at pos */
+	if (pdf[pos] == '\r')		/* optional CR after the keyword */
+		pos++;
+	pos += 1 + slen;		/* one end-of-line byte plus the data */
+	if (pdf[pos] == '\n')		/* optional newline before endstream */
+		pos++;
+	pos += strlen("endstream");	/* NOTE(review): pos is never checked against len */
+	return pos - old;
+}
+
+static int pdfext(char *pdf, int len)	/* write a page of pdf as a form XObject; returns its index in xobj[] */
+{
+	char *cont_fields[] = {"/Filter", "/DecodeParms"};	/* stream entries copied as is */
+	int trailer = pdf_trailer(pdf, len);	/* NOTE(review): no lookup below is checked for -1 */
+	int root, cont, pages, page1, stream;
+	int kids_val, page1_val, val;
+	int xobj_id, length;
+	int bbox;
+	int i;
+	root = pdf_dval_obj(pdf, len, trailer, "/Root");
+	pages = pdf_dval_obj(pdf, len, root, "/Pages");
+	kids_val = pdf_dval_val(pdf, len, pages, "/Kids");
+	page1_val = pdf_lval(pdf, len, kids_val, 0);	/* the first /Kids entry */
+	page1 = pdf_ref(pdf, len, page1_val);
+	cont = pdf_dval_obj(pdf, len, page1, "/Contents");
+	val = pdf_dval_val(pdf, len, cont, "/Length");
+	length = atoi(pdf + val);
+	bbox = pdf_dval_val(pdf, len, page1, "/MediaBox");
+	if (bbox < 0)	/* fall back to the /MediaBox of /Pages */
+		bbox = pdf_dval_val(pdf, len, pages, "/MediaBox");
+	xobj_id = obj_beg(0);
+	pdfout("<<\n");
+	pdfout("  /Type /XObject\n");
+	pdfout("  /Subtype /Form\n");
+	pdfout("  /FormType 1\n");
+	if (bbox >= 0)
+		pdfout("  /BBox %s\n", pdf_copy(pdf, len, bbox));
+	pdfout("  /Matrix [1 0 0 1 %s]\n", pdfpos(o_h, o_v));	/* presumably a translation to the current position; confirm pdfpos() */
+	pdfout("  /Resources << /ProcSet [/PDF] >>\n");
+	pdfout("  /Length %d\n", length);
+	for (i = 0; i < LEN(cont_fields); i++)
+		if ((val = pdf_dval_val(pdf, len, cont, cont_fields[i])) >= 0)
+			pdfout("  %s %s\n", cont_fields[i],
+				pdf_copy(pdf, len, val));
+	pdfout(">>\n");
+	stream = cont + pdf_len(pdf, len, cont);	/* just past the contents dictionary */
+	stream += pdf_ws(pdf, len, stream);
+	pdfmem(pdf + stream, pdf_slen(pdf, len, stream, length));	/* stream ... endstream, bytes unchanged */
+	pdfout("\n");
+	obj_end();
+	if (xobj_n == xobj_sz) {	/* xobj[] is full: make room for 8 more ids */
+		xobj_sz += 8;
+		xobj = mextend(xobj, xobj_n, xobj_sz, sizeof(xobj[0]));
+	}
+	xobj[xobj_n++] = xobj_id;
+	return xobj_n - 1;
+}
+
+void outpdf(char *spec)	/* handle \X'pdf file': place an external pdf file on the page */
+{
+	char pdf[1 << 12];	/* path of the pdf file, the first word of spec */
+	char buf[1 << 12];	/* read buffer */
+	struct sbuf *sb;
+	int xobj_id;
+	int fd, nr;
+	spec = strcut(pdf, spec);
+	/* nothing to place if the path is empty or the file cannot be opened */
+	if (!pdf[0] || (fd = open(pdf, O_RDONLY)) < 0)
+		return;
+	/* reading the pdf file */
+	sb = sbuf_make();
+	while ((nr = read(fd, buf, sizeof(buf))) > 0)
+		sbuf_mem(sb, buf, nr);
+	close(fd);
+	/* the XObject */
+	xobj_id = pdfext(sbuf_buf(sb), sbuf_len(sb));
+	sbuf_free(sb);
+	o_flush();
+	out_fontup();
+	sbuf_printf(pg, "ET /FO%d Do BT\n", xobj_id);	/* leave the text object to draw it */
+	p_h = -1;	/* the output position is no longer known */
+	p_v = -1;
+}
+
 void outlink(char *spec)
 {
 }
@@ -687,10 +821,10 @@
 	/* page contents */
 	cont_id = obj_beg(0);
 	pdfout("<<\n");
-	pdfout("  /Length %d\n", sbuf_len(pg));
+	pdfout("  /Length %d\n", sbuf_len(pg) - 1);
 	pdfout(">>\n");
 	pdfout("stream\n");
-	pdfouts(sbuf_buf(pg));
+	pdfmem(sbuf_buf(pg), sbuf_len(pg));
 	pdfout("endstream\n");
 	obj_end();
 	/* the page object */
@@ -714,6 +848,12 @@
 		}
 	}
 	pdfout(" >>\n");
+	if (xobj_n) {				/* XObjects */
+		pdfout("    /XObject <<");
+		for (i = 0; i < xobj_n; i++)
+			pdfout(" /FO%d %d 0 R", i, xobj[i]);
+		pdfout(" >>\n");
+	}
 	pdfout("  >>\n");
 	pdfout("  /Contents %d 0 R\n", cont_id);
 	pdfout(">>\n");
@@ -720,4 +860,8 @@
 	obj_end();
 	sbuf_free(pg);
 	memset(o_iset, 0, pfonts_n * sizeof(o_iset[0]));
+	free(xobj);
+	xobj = NULL;
+	xobj_n = 0;
+	xobj_sz = 0;
 }
--- /dev/null
+++ b/pdfext.c
@@ -1,0 +1,281 @@
+/* Parse and extract PDF objects */
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "post.h"
+
+/* the number of white space characters starting at pos */
+int pdf_ws(char *pdf, int len, int pos)
+{
+	int i = pos;
+	while (i < len && isspace((unsigned char) pdf[i]))
+		i++;
+	return i - pos;
+}
+
+/* type of the object at pos; s: string, d: dictionary, l: list, n: number, /: name, r: reference, -1: other */
+int pdf_type(char *pdf, int len, int pos)
+{
+	pos += pdf_ws(pdf, len, pos);
+	if (pdf[pos] == '/')
+		return '/';
+	if (pdf[pos] == '(')
+		return 's';
+	if (pdf[pos] == '<' && pdf[pos + 1] != '<')	/* a string in <...> */
+		return 's';
+	if (pdf[pos] == '<' && pdf[pos + 1] == '<')
+		return 'd';
+	if (pdf[pos] == '[')
+		return 'l';
+	if (strchr("0123456789+-.", (unsigned char) pdf[pos])) {
+		if (!isdigit((unsigned char) pdf[pos]))	/* a sign or dot: a plain number */
+			return 'n';
+		while (pos < len && isdigit((unsigned char) pdf[pos]))	/* first run of digits */
+			pos++;
+		pos += pdf_ws(pdf, len, pos);
+		if (!isdigit((unsigned char) pdf[pos]))
+			return 'n';
+		while (pos < len && isdigit((unsigned char) pdf[pos]))	/* second run of digits */
+			pos++;
+		pos += pdf_ws(pdf, len, pos);
+		return pos < len && pdf[pos] == 'R' ? 'r' : 'n';	/* digits, digits, R: a reference */
+	}
+	return -1;
+}
+
+/* the length of the pdf object at pos, including leading white space */
+int pdf_len(char *pdf, int len, int pos)
+{
+	int c, old = pos;
+	if (pos >= len)
+		return 0;
+	pos += pdf_ws(pdf, len, pos);
+	c = (unsigned char) pdf[pos];
+	if (strchr("0123456789+-.", c)) {		/* number or reference */
+		if (pdf_type(pdf, len, pos) == 'r')	/* a reference ends at its R */
+			return (char *) memchr(pdf + pos, 'R', len - pos) - (pdf + old) + 1;
+		pos++;
+		while (pos < len && strchr("0123456789.", (unsigned char) pdf[pos]))
+			pos++;
+	}
+	/* true, false, null: skip the keyword so that the loops over entries advance */
+	while (isalpha(c) && pos < len && isalpha((unsigned char) pdf[pos]))
+		pos++;
+	if (c == '(') {
+		int depth = 1;
+		pos++;
+		while (pos < len && depth > 0) {
+			if (pdf[pos] == '(')
+				depth++;
+			if (pdf[pos] == ')')
+				depth--;
+			if (pdf[pos] == '\\')
+				pos++;
+			pos++;
+		}
+	}
+	if (c == '<' && pos + 1 < len && pdf[pos + 1] == '<') {
+		pos += 2;
+		while (pos + 2 < len && (pdf[pos] != '>' || pdf[pos + 1] != '>')) {
+			pos += pdf_len(pdf, len, pos);
+			pos += pdf_len(pdf, len, pos);
+			pos += pdf_ws(pdf, len, pos);
+		}
+		if (pos + 2 < len)
+			pos += 2;
+	} else if (c == '<') {
+		while (pos < len && pdf[pos] != '>')
+			pos++;
+		if (pos < len)
+			pos++;
+	}
+	if (c == '/') {
+		pos++;
+		while (pos < len && !strchr(" \t\r\n\f()<>[]{}/%",
+					(unsigned char) pdf[pos]))
+			pos++;
+	}
+	if (c == '[') {
+		pos++;
+		while (pos < len && pdf[pos] != ']') {
+			pos += pdf_len(pdf, len, pos);
+			pos += pdf_ws(pdf, len, pos);
+		}
+		pos++;
+	}
+	return pos - old;
+}
+
+static int startswith(char *s, char *t)	/* 1 if s and t agree up to the end of the shorter one */
+{
+	while (*s && *t)	/* stops at the end of either string */
+		if (*s++ != *t++)
+			return 0;
+	return 1;
+}
+
+/* read the indirect reference "obj rev R" at pos; returns -1 if it is not one */
+int pdf_obj(char *pdf, int len, int pos, int *obj, int *rev)
+{
+	if (pdf_type(pdf, len, pos) != 'r')
+		return -1;
+	/* pdf_len() spans the whole reference, so read both numbers from pos */
+	if (sscanf(pdf + pos, "%d %d", obj, rev) != 2)
+		return -1;
+	return 0;
+}
+
+/* the position of the value of key in the dictionary at pos; -1 if absent */
+int pdf_dval(char *pdf, int len, int pos, char *key)
+{
+	pos += 2;	/* skip the opening << */
+	while (pos + 2 < len && (pdf[pos] != '>' || pdf[pos + 1] != '>')) {
+		pos += pdf_ws(pdf, len, pos);
+		if (startswith(key, pdf + pos)) {	/* NOTE(review): a longer name beginning with key matches too */
+			pos += pdf_len(pdf, len, pos);	/* skip the key itself */
+			pos += pdf_ws(pdf, len, pos);
+			return pos;
+		}
+		pos += pdf_len(pdf, len, pos);	/* skip this key */
+		pos += pdf_len(pdf, len, pos);	/* and its value */
+		pos += pdf_ws(pdf, len, pos);
+	}
+	return -1;
+}
+
+/* the position of the key-th key (from 0) of the dictionary at pos; -1 if none */
+int pdf_dkey(char *pdf, int len, int pos, int key)
+{
+	int i = 0;
+	pos += 2;	/* skip the opening << */
+	while (pos + 2 < len && (pdf[pos] != '>' || pdf[pos + 1] != '>')) {
+		pos += pdf_ws(pdf, len, pos);
+		if (i++ == key)
+			return pos;
+		pos += pdf_len(pdf, len, pos);	/* skip this key */
+		pos += pdf_len(pdf, len, pos);	/* and its value */
+		pos += pdf_ws(pdf, len, pos);
+	}
+	return -1;
+}
+
+/* the position of entry idx (from 0) of the list at pos; -1 if none */
+int pdf_lval(char *pdf, int len, int pos, int idx)
+{
+	int i = 0;
+	pos++;		/* skip the opening [ */
+	while (pos < len && pdf[pos] != ']') {
+		if (i++ == idx)
+			return pos;	/* may still point at white space before the entry */
+		pos += pdf_len(pdf, len, pos);
+		pos += pdf_ws(pdf, len, pos);
+	}
+	return -1;
+}
+
+void *memrchr(void *m, int c, long n);
+
+static int prevline(char *pdf, int len, int off)	/* start of the line ending at the last newline before off; -1 if none */
+{
+	char *nl = memrchr(pdf, '\n', off);	/* the last newline before off */
+	if (nl && nl > pdf) {
+		char *nl2 = memrchr(pdf, '\n', nl - pdf -1);	/* the newline before that line */
+		if (nl2)
+			return nl2 - pdf + 1;
+	}
+	return -1;
+}
+
+static int nextline(char *pdf, int len, int off)	/* position after the first newline at or after off; -1 if none */
+{
+	char *nl = memchr(pdf + off, '\n', len - off);
+	if (nl)
+		return nl - pdf + 1;
+	return -1;
+}
+
+/* the position just after the last "trailer" line; -1 if there is none */
+int pdf_trailer(char *pdf, int len)
+{
+	int pos;	/* start of the line examined, going backwards from %%EOF */
+	for (pos = prevline(pdf, len, len); pos >= 0; pos = prevline(pdf, len, pos))
+		if (startswith(pdf + pos, "trailer"))
+			return nextline(pdf, len, pos);	/* skip trailer\n */
+	return -1;	/* ran out of lines; pos is never used as an index when negative */
+}
+
+/* the position of the first line after "xref" in the last xref table; -1 on failure */
+static int pdf_xref(char *pdf, int len)
+{
+	int pos = prevline(pdf, len, len);		/* %%EOF */
+	if (pos < 0 || (pos = prevline(pdf, len, pos)) < 0)	/* too few lines */
+		return -1;
+	/* read startxref offset */
+	if (sscanf(pdf + pos, "%d", &pos) != 1 || pos >= len || pos < 0)
+		return -1;
+	return nextline(pdf, len, pos);			/* skip xref\n */
+}
+
+/* find object obj with revision rev; returns the position after its "obj" or -1 */
+int pdf_find(char *pdf, int len, int obj, int rev)
+{
+	int obj_beg, obj_cnt, cur_rev, cur_pos, i;
+	char *beg;
+	int pos = pdf_xref(pdf, len);
+	if (pos < 0)
+		return -1;
+	/* the numbers after xref */
+	while (pos < len && sscanf(pdf + pos, "%d %d", &obj_beg, &obj_cnt) == 2) {
+		for (i = 0; i < obj_cnt; i++) {
+			if ((pos = nextline(pdf, len, pos)) < 0)
+				return -1;
+			if (sscanf(pdf + pos, "%d %d", &cur_pos, &cur_rev) != 2)
+				return -1;
+			if (obj_beg + i == obj && cur_rev == rev) {
+				if (cur_pos < 0 || cur_pos >= len)
+					return -1;
+				if (!(beg = strstr(pdf + cur_pos, "obj")))
+					return -1;
+				pos = beg - pdf + 3;
+				pos += pdf_ws(pdf, len, pos);
+				return pos;
+			}
+		}
+		if ((pos = nextline(pdf, len, pos)) < 0)	/* leave the last entry so it is not re-read as a header */
+			return -1;
+	}
+	return -1;
+}
+
+/* the position of the object named by the indirect reference at pos; -1 on failure */
+int pdf_ref(char *pdf, int len, int pos)
+{
+	int obj, rev;
+	if (pdf_obj(pdf, len, pos, &obj, &rev))	/* not a reference */
+		return -1;
+	return pdf_find(pdf, len, obj, rev);
+}
+
+/* the position of the value of key, following it if it is a reference; -1 on failure */
+int pdf_dval_val(char *pdf, int len, int pos, char *key)
+{
+	int val = pdf_dval(pdf, len, pos, key);
+	int val_obj, val_rev;
+	if (val < 0)
+		return -1;
+	if (pdf_type(pdf, len, val) == 'r') {	/* an indirect value: look the object up */
+		pdf_obj(pdf, len, val, &val_obj, &val_rev);
+		return pdf_find(pdf, len, val_obj, val_rev);
+	}
+	return val;	/* a direct value */
+}
+
+/* the position of the object that key refers to; -1 if absent or not a reference */
+int pdf_dval_obj(char *pdf, int len, int pos, char *key)
+{
+	int val = pdf_dval(pdf, len, pos, key);
+	if (val < 0)
+		return -1;
+	return pdf_ref(pdf, len, val);
+}
--- a/post.c
+++ b/post.c
@@ -260,6 +260,8 @@
 		outrotate(atoi(arg));
 	if (!strcmp("eps", cmd))
 		outeps(arg);
+	if (!strcmp("pdf", cmd))
+		outpdf(arg);
 	if (!strcmp("link", cmd))
 		outlink(arg);
 	if (!strcmp("BeginObject", cmd))
--- a/post.h
+++ b/post.h
@@ -57,7 +57,8 @@
 void outsize(int s);
 void outcolor(int c);
 void outrotate(int deg);
-void outeps(char *eps);
+void outeps(char *spec);
+void outpdf(char *spec);
 void outlink(char *spec);
 void outpage(void);
 void outmnt(int f);
@@ -119,3 +120,17 @@
 void sbuf_chr(struct sbuf *sbuf, int c);
 void sbuf_mem(struct sbuf *sbuf, char *s, int len);
 void sbuf_cut(struct sbuf *sb, int len);
+
+/* reading PDF files */
+int pdf_ws(char *pdf, int len, int pos);
+int pdf_len(char *pdf, int len, int pos);
+int pdf_type(char *pdf, int len, int pos);
+int pdf_dval(char *pdf, int len, int pos, char *key);
+int pdf_dkey(char *pdf, int len, int pos, int key);
+int pdf_lval(char *pdf, int len, int pos, int idx);
+int pdf_trailer(char *pdf, int len);
+int pdf_obj(char *pdf, int len, int pos, int *obj, int *rev);
+int pdf_find(char *pdf, int len, int obj, int rev);
+int pdf_ref(char *pdf, int len, int pos);
+int pdf_dval_val(char *pdf, int len, int pos, char *key);
+int pdf_dval_obj(char *pdf, int len, int pos, char *key);
--- a/ps.c
+++ b/ps.c
@@ -330,6 +330,10 @@
 	outf("EPSFEND\n");
 }
 
+void outpdf(char *spec)	/* does nothing in this file; spec is ignored */
+{
+}
+
 void outlink(char *spec)
 {
 	char lnk[1 << 12];