shithub: neatpost

Download patch

ref: ad5819b2b326fca58c1eacdc433186b09a51d2aa
parent: 3a912712527c7c749cdc090d0c66917ad116ddbc
author: Ali Gholami Rudi <ali@rudi.ir>
date: Wed Mar 11 12:52:59 EDT 2020

post: support Unicode bookmarks and titles

Suggested and tested by Dirk-Wilhelm Peters <peters@schwertfisch.de>.

--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@
 CC = cc
 CFLAGS = -Wall -O2 "-DTROFFFDIR=\"$(FDIR)\""
 LDFLAGS =
-OBJS = post.o ps.o font.o dev.o clr.o dict.o iset.o
+OBJS = post.o ps.o font.o dev.o clr.o dict.o iset.o sbuf.o
 OBJSPDF = post.o pdf.o pdfext.o font.o dev.o clr.o dict.o iset.o sbuf.o
 
 all: post pdf
--- a/pdf.c
+++ b/pdf.c
@@ -733,7 +733,7 @@
 	if (lnk[0] == '#') {	/* internal links */
 		pdfout("  /A << /S /GoTo /D (%s) >>\n", lnk + 1);
 	} else {		/* external links */
-		pdfout("  /A << /S /URI /URI (%s) >>\n", lnk);
+		pdfout("  /A << /S /URI /URI %s >>\n", pdftext_static(lnk));
 	}
 	pdfout(">>\n");
 	obj_end();
@@ -786,7 +786,7 @@
 				cnt++;
 		obj_beg(objs[i]);
 		pdfout("<<\n");
-		pdfout("  /Title (%s)\n", desc[i]);
+		pdfout("  /Title %s\n", pdftext_static(desc[i]));
 		/* the parent field */
 		for (j = i - 1; j >= 0 && level[j] >= level[i]; j--)
 			;
@@ -996,9 +996,9 @@
 	info_id = obj_beg(0);
 	pdfout("<<\n");
 	if (pdf_title[0])
-		pdfout("  /Title (%s)\n", pdf_title);
+		pdfout("  /Title %s\n", pdftext_static(pdf_title));
 	if (pdf_author[0])
-		pdfout("  /Author (%s)\n", pdf_author);
+		pdfout("  /Author %s\n", pdftext_static(pdf_author));
 	pdfout("  /Creator (Neatroff)\n");
 	pdfout("  /Producer (Neatpost)\n");
 	pdfout(">>\n");
--- a/post.c
+++ b/post.c
@@ -54,18 +54,16 @@
 
 static int utf8len(int c)
 {
-	if (c <= 0x7f)
+	if (~c & 0x80)		/* ASCII */
+		return c > 0;	/* 0 when c is not positive (the NUL byte), otherwise 1 */
+	if (~c & 0x40)		/* invalid UTF-8: 10xxxxxx is a continuation byte, not a lead byte */
 		return 1;
-	if (c >= 0xfc)
-		return 6;
-	if (c >= 0xf8)
-		return 5;
-	if (c >= 0xf0)
-		return 4;
-	if (c >= 0xe0)
-		return 3;
-	if (c >= 0xc0)
+	if (~c & 0x20)		/* 110xxxxx */
 		return 2;
+	if (~c & 0x10)		/* 1110xxxx */
+		return 3;
+	if (~c & 0x08)		/* 11110xxx */
+		return 4;
 	return 1;
 }
 
@@ -546,6 +544,69 @@
 	memset(new + oldsz * memsz, 0, (newsz - oldsz) * memsz);
 	free(old);
 	return new;
+}
+
+/* the unicode codepoint of the utf-8 character at s; the lead byte itself for a stray continuation byte, an invalid lead byte or a sequence cut short by the end of the string */
+static int utf8code(char *s)
+{
+	int c = (unsigned char) s[0];
+	if (!(c & 0x80) || !(c & 0x40))		/* ASCII, or a stray 10xxxxxx continuation byte */
+		return c;
+	if (!(c & 0x20) && s[1])		/* 110xxxxx: two bytes */
+		return ((c & 0x1f) << 6) | (s[1] & 0x3f);
+	if (!(c & 0x10) && s[1] && s[2])	/* 1110xxxx: three bytes; never read past the NUL */
+		return ((c & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
+	if (!(c & 0x08) && s[1] && s[2] && s[3])	/* 11110xxx: four bytes; never read past the NUL */
+		return ((c & 0x07) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
+	return c;
+}
+
+static int pdftext_ascii(char *s)	/* 1 if s can be written as a (literal) pdf string, 0 if it needs hex */
+{
+	unsigned char *p = (unsigned char *) s;
+	while (*p && *p < 0x80 && *p != '(' && *p != ')')	/* stop at a non-ASCII byte or a parenthesis */
+		p++;	/* NOTE(review): '\\' is let through, so pdf escapes in s stay active -- confirm intended */
+	return *p == '\0';
+}
+
+/* encode s as pdf text string: (literal) for plain ASCII, otherwise <hex> UTF-16BE; the caller releases the result with free() */
+static char *pdftext(char *s)
+{
+	struct sbuf *sb = sbuf_make();
+	if (pdftext_ascii(s)) {
+		sbuf_chr(sb, '(');
+		sbuf_str(sb, s);
+		sbuf_chr(sb, ')');
+		return sbuf_done(sb);
+	}
+	/* read utf-8 and write utf-16 */
+	sbuf_str(sb, "<FEFF");		/* unicode byte order marker */
+	while (*s) {
+		int l = utf8len((unsigned char) *s);
+		int c = utf8code(s);
+		if ((c >= 0 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xffff))
+			sbuf_printf(sb, "%02X%02X", c >> 8, c & 0xff);
+		if (c >= 0x010000 && c <= 0x10ffff) {	/* beyond the BMP: write a surrogate pair */
+			int c1 = 0xd800 + ((c - 0x10000) >> 10);
+			int c2 = 0xdc00 + ((c - 0x10000) & 0x3ff);
+			sbuf_printf(sb, "%02X%02X", c1 >> 8, c1 & 0xff);
+			sbuf_printf(sb, "%02X%02X", c2 >> 8, c2 & 0xff);
+		}
+		while (l-- > 0 && *s)	/* a sequence cut short must not carry s past the NUL */
+			s++;
+	}
+	sbuf_chr(sb, '>');
+	return sbuf_done(sb);
+}
+
+/* encode s as pdf text string; the result stays valid only until the next call */
+char *pdftext_static(char *s)
+{
+	static char *buf;	/* the previous result; kept whole instead of being cut to a fixed size */
+	char *r = pdftext(s);	/* encode first: s may point into the previous result */
+	free(buf);		/* free(NULL) is a no-op on the first call */
+	buf = r;
+	return buf;
+}
 
 static char *usage =
--- a/post.h
+++ b/post.h
@@ -110,6 +110,8 @@
 
 /* memory allocation */
 void *mextend(void *old, long oldsz, long newsz, int memsz);
+/* helper functions */
+char *pdftext_static(char *s);
 
 /* string buffers */
 struct sbuf *sbuf_make(void);
--- a/ps.c
+++ b/ps.c
@@ -317,9 +317,9 @@
 			lnk[0] == '#' ? lnk + 1 : lnk);
 	} else {
 		outf("[ /Rect [ %d %d t %d %d t ] "
-			"/Action << /Subtype /URI /URI (%s) >> /Open true "
+			"/Action << /Subtype /URI /URI %s >> /Open true "
 			"/Subtype /Link /LNK pdfmark\n",
-			o_h, o_v, o_h + hwid, o_v + vwid, lnk);
+			o_h, o_v, o_h + hwid, o_v + vwid, pdftext_static(lnk));
 	}
 }
 
@@ -346,7 +346,7 @@
 		for (j = i + 1; j < n && level[j] > level[i]; j++)
 			if (level[j] == level[i] + 1)
 				cnt++;
-		outf("[ /Title (%s)", desc[i]);
+		outf("[ /Title %s", pdftext_static(desc[i]));
 		if (page[i] > 0)
 			outf(" /Page %d", page[i]);
 		if (cnt > 0)
@@ -385,9 +385,9 @@
 {
 	out("[");
 	if (ps_title[0])
-		out(" /Title (%s)", ps_title);
+		out(" /Title %s", pdftext_static(ps_title));
 	if (ps_author[0])
-		out(" /Author (%s)", ps_author);
+		out(" /Author %s", pdftext_static(ps_author));
 	out(" /Creator (Neatroff) /DOCINFO pdfmark\n");
 	out("%%%%Trailer\n");
 	out("done\n");