ref: 05cf7ced4f8ffc7dc0ab2fbc601d5d33e6e5ca42
author: sirjofri <sirjofri@sirjofri.de>
date: Mon Jul 8 07:59:39 EDT 2024
adds files from archive, adds rudimentary mkfile for easy installation
--- /dev/null
+++ b/README
@@ -1,0 +1,23 @@
+xml tools from 9atom as archived from
+https://github.com/Plan9-Archive/9atom
+
+plug-in repository for the following xml tools:
+
+- libxml:
+ /sys/include/xml.h
+ /$objtype/lib/libxml.a
+ /sys/man/2/xml
+- xb:
+ /$objtype/bin/xb
+ /sys/man/1/xb
+
+Installation:
+
+mk install
+mk clean (optional)
+
+Deinstallation:
+
+mk nuke
+
+This will remove all the installed files.
--- /dev/null
+++ b/libxml/doc.c
@@ -1,0 +1,32 @@
+/*
+ * Generate a state transition diagram from the state tables,
+ * we skip error transitions as they just confuse the drawing.
+ */
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include "state-machine.h"
+
+void
+main(void)
+{
+ int os, s, a, t;
+
+ print("digraph xml_parser {\n");
+ print(" rankdir=LR;\n");
+ print(" size=\"11,9\"\n");
+ print(" node [shape = circle];\n");
+
+ for(t = 0; t < NumToks; t++)
+ for(os = 0; os < NumStates; os++){
+ s = statab[os][t];
+ a = acttab[os][t];
+ if(a != Aerr)
+ print(" %s -> %s [ label = \"%s / %s\" ];\n",
+ stastr[os], stastr[s], tokstr[t], actstr[a]);
+ }
+
+ print("}\n");
+ exits(nil);
+}
+
--- /dev/null
+++ b/libxml/heap.c
@@ -1,0 +1,134 @@
+#include <u.h>
+#include <libc.h>
+#include "xml.h"
+
+#define Roundup(x, g) (((x) + (unsigned)(g-1)) & ~((unsigned)(g-1)))
+
+struct Xtree {
+ Xtree *left;
+ Xtree *right;
+ char *str;
+ int hits;
+};
+
+struct Xblock {
+ Xblock *next;
+ char *free;
+ char *end;
+};
+
+static int Strdups, Commons, Unique, Memblocks;
+
+static void *
+getmem(Xml *xp, int len)
+{
+ int sz;
+ Xblock *b;
+ char *ret;
+
+ len = Roundup(len, sizeof(long long));
+
+ sz = xp->alloc.blksiz; /* shorthand */
+ b = xp->alloc.active;
+
+ if(len > sz)
+ sysfatal("store: object too big (%d > %d)\n", len, sz);
+
+ if(xp->alloc.active == nil || b->free + len >= b->end){
+ Memblocks++;
+ b = mallocz(sizeof(Xblock) + sz, 0);
+ b->free = (char *)&b[1];
+ b->end = (char *)&b->free[sz];
+
+ b->next = xp->alloc.active;
+ xp->alloc.active = b;
+ }
+
+ ret = b->free;
+ b->free += len;
+ return ret;
+}
+
+static Xtree *
+lookadd(Xml *xp, Xtree *t, char *str, Xtree **match)
+{
+ int n;
+
+ if(t == nil){
+ Unique++;
+ t = getmem(xp, sizeof(Xtree) + strlen(str)+1);
+ t->left = nil;
+ t->right = nil;
+ t->str = (char *)&t[1];
+ strcpy(t->str, str);
+ *match = t;
+ t->hits = 1;
+ return t;
+ }
+
+ if((n = strcmp(str, t->str)) == 0){
+ *match = t;
+ t->hits++;
+ }
+ if(n < 0)
+ t->left = lookadd(xp, t->left, str, match);
+ if(n > 0)
+ t->right = lookadd(xp, t->right, str, match);
+ return t;
+}
+
+char *
+xmlstrdup(Xml *xp, char *str, int iscommon)
+{
+ char *s;
+ Xtree *t;
+
+ Strdups++;
+ if(iscommon){
+ Commons++;
+ xp->alloc.root = lookadd(xp, xp->alloc.root, str, &t);
+ return t->str;
+ }
+
+ s = getmem(xp, strlen(str)+1);
+ return strcpy(s, str);
+}
+
+void *
+xmlcalloc(Xml *xp, int n, int m)
+{
+ void *v;
+
+ v = getmem(xp, n * m);
+ memset(v, 0, n * m);
+ return v;
+}
+
+void *
+xmlmalloc(Xml *xp, int n)
+{
+ return getmem(xp, n);
+}
+
+
+void
+_Xheapstats(void)
+{
+ fprint(2, "total=%d common=%d -> unique=%d rare=%d memblocks=%d\n",
+ Strdups, Commons, Unique, Strdups - Commons, Memblocks);
+}
+
+void
+_Xheapfree(Xml *xp)
+{
+ Xblock *b, *n;
+
+ for(b = xp->alloc.active; b; b = n){
+ n = b->next;
+ if(xmldebug)
+ memset(b, 0x7e, xp->alloc.blksiz);
+ free(b);
+ }
+
+}
+
--- /dev/null
+++ b/libxml/mkfile
@@ -1,0 +1,34 @@
+</$objtype/mkfile
+
+LIB=/$objtype/lib/libxml.a
+
+OFILES=\
+ xmlattr.$O\
+ xmlelem.$O\
+ xmlfind.$O\
+ xmlfree.$O\
+ xmlparse.$O\
+ xmlprint.$O\
+ xmlnew.$O\
+ xmllook.$O\
+ xmlvalue.$O\
+ heap.$O\
+
+HFILES=\
+ /sys/include/xml.h
+
+CLEANFILES=doc.ps $O.doc
+
+UPDATE=\
+ mkfile\
+ $HFILES\
+ ${OFILES:%.$O=%.c}\
+ ${LIB:/$objtype/%=/386/%}
+
+</sys/src/cmd/mksyslib
+
+$O.doc: doc.$O
+ $LD -o $target $prereq
+
+doc.ps: 8.doc
+ 8.doc | dot '-Grotate=90' '-Gsize=10,8' -Tps > doc.ps
--- /dev/null
+++ b/libxml/state-machine.h
@@ -1,0 +1,83 @@
+enum { /* Lexer Tokens */
+ Twhite = 0,
+ Topen,
+ Tname,
+ Tclose,
+ Tequal,
+ Tendblk,
+ Tnulblk,
+ NumToks
+};
+
+enum { /* Parser states */
+ Slost = 0,
+ Sopened = 1,
+ Snamed = 2,
+ Sattred = 3,
+ Sequed = 4,
+ Sendblk = 5,
+ Sclosed = 6,
+ NumStates
+
+};
+
+enum { /* Parser Actions */
+ Aerr = 0,
+ Anop = 1,
+ Aelem = 2,
+ Apcdata = 3,
+ Aattr = 4,
+ Avalue = 5,
+ Aup = 6,
+ Adown = 7,
+ Acheck = 8,
+ NumActions
+};
+
+static char *
+tokstr[] = { /* lexer token names for debug */
+ [Twhite] "white", [Topen] "open",
+ [Tname] "name", [Tclose] "close",
+ [Tequal] "equal", [Tendblk] "endblk",
+ [Tnulblk] "nulblk"
+};
+
+static char *
+stastr[] = { /* parser state names for debug */
+ [Slost] "lost", [Sopened] "opened",
+ [Snamed] "named", [Sattred] "attred",
+ [Sequed] "equed", [Sendblk] "endblk",
+ [Sclosed] "closed",
+};
+
+static char *
+actstr[] = { /* parser action names for debug */
+ [Aerr] "error", [Anop] "nop",
+ [Apcdata] "pcdata", [Aattr] "attr",
+ [Avalue] "value", [Aelem] "elem",
+ [Aup] "up", [Adown] "down",
+ [Acheck] "check"
+};
+
+
+static int statab[7][7] = { /* Parser state transition table */
+/* Twhite Topen Tname Tclose Tequal Tendblk Tnulblk */
+ [Slost] { Slost, Sopened,Slost, Slost, Slost, Slost, Slost },
+ [Sopened] { 0, 0, Snamed, 0, 0, 0, 0 },
+ [Snamed] { Snamed, 0, Sattred,Sendblk,0, Slost, Slost },
+ [Sattred] { Sattred, 0, 0, 0, Sequed, 0, 0 },
+ [Sequed] { Sequed, 0, Snamed, 0, 0, 0, 0 },
+ [Sendblk] { 0, 0, Sclosed,0, 0, 0, 0 },
+ [Sclosed] { 0, 0, 0, Slost, 0, 0, 0 },
+};
+
+static int acttab[7][7] = { /* Parser action table */
+/* Twhite Topen Tname Tclose Tequal Tendblk Tnulblk */
+ [Slost] { Apcdata, Anop, Apcdata, Apcdata, Apcdata, Aup, Apcdata },
+ [Sopened] { 0, 0, Aelem, 0, 0, 0, 0 },
+ [Snamed] { Anop, 0, Aattr, Adown, 0, Anop, Anop },
+ [Sattred] { Anop, 0, 0, 0, Anop, 0, 0 },
+ [Sequed] { Anop, 0, Avalue, 0, 0, 0, 0 },
+ [Sendblk] { 0, 0, Acheck, 0, 0, 0, 0 },
+ [Sclosed] { 0, 0, 0, Anop, 0, 0, 0 },
+};
--- /dev/null
+++ b/libxml/xmlattr.c
@@ -1,0 +1,31 @@
+#include <u.h>
+#include <libc.h>
+#include "xml.h"
+
+Attr *
+xmlattr(Xml *xp, Attr **root, Elem *parent, char *name, char *value)
+{
+ Attr *ap, *t;
+
+ USED(xp);
+ if((ap = xmlcalloc(xp, sizeof(Attr), 1)) == nil)
+ sysfatal("no memory - %r\n");
+ if(*root == nil){
+ *root = ap;
+ }
+ else{
+ for (t = *root; t->next; t = t->next)
+ continue;
+ t->next = ap;
+ }
+ ap->parent = parent;
+
+ if(name)
+ if((ap->name = xmlstrdup(xp, name, 1)) == nil)
+ sysfatal("no memory - %r\n");
+
+ if(value)
+ if((ap->value = xmlstrdup(xp, value, 0)) == nil)
+ sysfatal("no memory - %r\n");
+ return ap;
+}
--- /dev/null
+++ b/libxml/xmlelem.c
@@ -1,0 +1,27 @@
+#include <u.h>
+#include <libc.h>
+#include "xml.h"
+
+Elem *
+xmlelem(Xml *xp, Elem **root, Elem *parent, char *name)
+{
+ Elem *ep, *t;
+
+ USED(xp);
+ if((ep = xmlcalloc(xp, sizeof(Elem), 1)) == nil)
+ sysfatal("no memory - %r\n");
+ if(! *root){
+ *root = ep;
+ }
+ else{
+ for (t = *root; t->next; t = t->next)
+ continue;
+ t->next = ep;
+ }
+ ep->parent = parent;
+ if(name)
+ if((ep->name = xmlstrdup(xp, name, 1)) == nil)
+ sysfatal("no memory - %r\n");
+ return ep;
+}
+
--- /dev/null
+++ b/libxml/xmlfind.c
@@ -1,0 +1,35 @@
+#include <u.h>
+#include <libc.h>
+#include "xml.h"
+
+/*
+ * search for element, starting at ep.
+ */
+
+Elem *
+xmlfind(Xml *xp, Elem *ep, char *path)
+{
+ char *p;
+ Elem *t;
+
+ USED(xp);
+
+ if (path == nil)
+ return nil;
+ if (*path == '/')
+ path++;
+ if ((p = strchr(path, '/')) == nil)
+ if((p = strchr(path, 0)) == nil)
+ return nil; // shut up lint !
+
+ for(; ep; ep = ep->next)
+ if (strncmp(ep->name, path, p-path) == 0){
+ if (*p == 0)
+ return ep;
+ if (! ep->child)
+ continue;
+ if ((t = xmlfind(xp, ep->child, p)) != nil)
+ return t;
+ }
+ return nil;
+}
--- /dev/null
+++ b/libxml/xmlfree.c
@@ -1,0 +1,10 @@
+#include <u.h>
+#include <libc.h>
+#include "xml.h"
+
+void
+xmlfree(Xml *xp)
+{
+ _Xheapfree(xp);
+ free(xp);
+}
--- /dev/null
+++ b/libxml/xmllook.c
@@ -1,0 +1,49 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include "xml.h"
+
+/*
+ * search for element, starting at ep.
+ * if attr!=nil the elem must have an attribute attr
+ * if value!=nil then the elem must have attr=value
+ */
+
+Elem *
+xmllook(Elem *ep, char *path, char *attr, char *value)
+{
+ char *p;
+ Elem *t;
+ Attr *ap;
+
+ if (path == nil)
+ return nil;
+ if (*path == '/')
+ path++;
+ if ((p = strchr(path, '/')) == nil)
+ if((p = strchr(path, 0)) == nil)
+ return nil; // shut up lint !
+
+ for(; ep; ep = ep->next)
+ if (strncmp(ep->name, path, p-path) == 0){
+ if (*p == '/'){
+ if (ep->child)
+ if ((t = xmllook(ep->child, p, attr, value)) != nil)
+ return t;
+ continue;
+ }
+ if (attr == nil)
+ return ep;
+ for (ap = ep->attrs; ap; ap = ap->next)
+ if (strcmp(ap->name, attr) == 0){
+ if (value == nil)
+ return ep;
+ if (strcmp(ap->value, value) == 0)
+ return ep;
+ }
+ if (ep->child)
+ if ((t = xmllook(ep->child, p, attr, value)) != nil)
+ return t;
+ }
+ return nil;
+}
--- /dev/null
+++ b/libxml/xmlnew.c
@@ -1,0 +1,15 @@
+#include <u.h>
+#include <libc.h>
+#include "xml.h"
+
+Xml *
+xmlnew(int blksize)
+{
+ Xml *xp;
+
+ xp = mallocz(sizeof(Xml), 1);
+ xp->alloc.blksiz = blksize;
+ if(xp == nil)
+ return nil;
+ return xp;
+}
--- /dev/null
+++ b/libxml/xmlparse.c
@@ -1,0 +1,570 @@
+#include <u.h>
+#include <libc.h>
+#include <ctype.h>
+#include <bio.h>
+#include "xml.h"
+#include "state-machine.h"
+
+int xmldebug = 0;
+
+enum {
+ Grain = 16
+};
+
+#define isname1(c) (isalpha((c)) || c == '_') /* FIXME: not enforced yet */
+#define isnameN(r) (isalpharune((r)) || isdigitrune((r)) || r == L'_' || r == L'-' || r == L'.' || r == L':')
+#define Roundup(x, g) (((x) + (unsigned)(g-1)) & ~((unsigned)(g-1)))
+
+enum {
+ Ntext = 1024, /* longest name or atribute value possible */
+ Nref = 32 /* longest entity reference name */
+};
+
+
+typedef struct {
+ int line; /* Line number (for errors) */
+ Biobuf *bp; /* input stream */
+ int flags; /* misc flags, see xml.h */
+ Xml *xml;
+ int failed;
+} State;
+
+typedef struct {
+ char *buf;
+ int sz;
+} Lexbuf;
+
+
+static int
+trimwhite(char *s)
+{
+ char *p;
+
+ for(p = s; *p; p++)
+ if(! isspace(*p))
+ return 0;
+
+ /* trim any whitespace into a single space */
+ s[0] = ' ';
+ s[1] = 0;
+ return 1;
+}
+
+static void
+growstr(State *st, Lexbuf *lb, char *str)
+{
+ int b, s, sz;
+
+ if(str == nil || *str == 0)
+ return;
+ if((st->flags & Fcrushwhite) && trimwhite(str) &&
+ (lb->buf == nil || lb->buf[0] == 0))
+ return;
+ b = 0;
+ if(lb->buf)
+ b = strlen(lb->buf);
+ s = strlen(str);
+
+ sz = Roundup(b+s+1, Grain);
+ if(sz >= lb->sz){
+ lb->buf = realloc(lb->buf, sz);
+ if(lb->buf == nil)
+ sysfatal("No memory, wanted %d bytes\n", sz);
+ lb->sz = sz;
+ }
+ strcpy(lb->buf+b, str);
+}
+
+static void
+growrune(State *st, Lexbuf *lb, Rune r)
+{
+ int n;
+ char str[UTFmax+1];
+
+ n = runetochar(str, &r);
+ str[n] = 0;
+ growstr(st, lb, str);
+}
+
+static void
+stripns(char *str)
+{
+ char *p;
+
+ if((p = strrchr(str, ':')) == nil)
+ return;
+ strcpy(str, p+1);
+}
+
+static void
+failed(State *st, char *fmt, ...)
+{
+ int n;
+ va_list arg;
+ char err[ERRMAX];
+
+ st->failed = 1;
+ va_start(arg, fmt);
+ n = snprint(err, sizeof(err), "%d ", st->line);
+ vsnprint(err+n, sizeof(err)-n, fmt, arg);
+ va_end(arg);
+ werrstr("%s", err);
+
+}
+
+static void
+unget(State *st, int c)
+{
+ if(c == '\n')
+ st->line--;
+ Bungetrune(st->bp);
+}
+
+static long
+get(State *st)
+{
+ long r;
+
+ r = Bgetrune(st->bp);
+ if(r == Runeerror){
+ failed(st, "bad UTF-8 sequence");
+ r = L' ';
+ }
+
+ if(r == L'\n')
+ st->line++;
+
+ if(xmldebug){
+ if(xmldebug == 3)
+ fprint(2, "%C", (Rune)r);
+ if(xmldebug == 1 && r == -1)
+ fprint(2, "EOF\n");
+ }
+ return r;
+}
+
+static struct {
+ char *name;
+ Rune rune;
+} Entities[] = {
+ { "amp", L'&' },
+ { "lt", L'<' },
+ { "gt", L'>' },
+ { "apos", L'\'' },
+ { "quot", L'"' },
+ { "nbsp", 0xa0 }, /* no-break space */
+};
+
+static long
+entityref(State *st)
+{
+ int n, i, l;
+ long r;
+ Rune x;
+ char *p, buf[Nref];
+
+ l = 0;
+ p = buf;
+ while((r = get(st)) != -1 && r != L';' && ! isspacerune(r))
+ if(l < (sizeof(buf) - UTFmax)){
+ x = r;
+ n = runetochar(p, &x);
+ p += n;
+ l += n;
+ }
+ *p = 0;
+ if(r == -1)
+ return -1;
+
+ /* false positive */
+ if(r != L';'){
+ fprint(2, "%d: unquoted '&' - ignored\n", st->line);
+ for(i = --l; i >= 0; i--)
+ unget(st, buf[i]);
+ return L'&';
+ }
+
+ if(buf[0] == '#'){
+ if(buf[1] == 'x' || buf[1] == 'X')
+ return strtol(buf+2, 0, 16);
+ return strtol(buf+1, 0, 10);
+ }
+
+ for(i = 0; i < nelem(Entities); i++)
+ if(memcmp(Entities[i].name, buf, l) == 0)
+ return Entities[i].rune;
+
+ fprint(2, "%d: '&%s;' unknown/unsupported entity reference\n", st->line, buf);
+ return L'?';
+}
+
+static int
+match(State *st, Rune *s)
+{
+ long r;
+ Rune *p;
+
+ r = -1;
+ for(p = s; *p; p++)
+ if((r = get(st)) != *p)
+ break;
+ if(r == -1)
+ return -1; /* EOF */
+ if(*p == 0)
+ return 0; /* match */
+ unget(st, r);
+ for(p--; p >= s; p--)
+ unget(st, *p);
+ return 1; /* no match */
+}
+
+static int
+comment(State *st)
+{
+ long r;
+ int startline;
+
+ startline = st->line;
+ do{
+ if(get(st) == -1)
+ break;
+ }while(match(st, L"--") == 1);
+
+ r = get(st);
+ if(r == -1){
+ failed(st, "EOF in comment (re: line %d)", startline);
+ return -1;
+ }
+ if(r != L'>'){
+ failed(st, "'--' illegal in a comment (re: line %d)", startline);
+ return Twhite;
+ }
+ return Twhite;
+}
+
+static int
+doctype(State *st, Lexbuf *lb)
+{
+ long r;
+ char *p;
+ int startline;
+
+ startline = st->line;
+
+ /* trim leading whitespace */
+ while((r = get(st)) != -1 && isspacerune(r))
+ continue;
+ unget(st, r);
+
+ if(lb->buf)
+ lb->buf[0] = 0;
+
+ while((r = get(st)) != -1 && r != L'>')
+ growrune(st, lb, r);
+
+ if(r == -1){
+ failed(st, "EOF in DOCTYPE (re: line %d)", startline);
+ return -1;
+ }
+ /* trim trailing whitespace */
+ p = strrchr(lb->buf, 0);
+ for(p--; p >= lb->buf && isspace(*p); p--)
+ *p = 0;
+
+ st->xml->doctype = xmlstrdup(st->xml, lb->buf, 0);
+ return Twhite;
+}
+
+static int
+cdata(State *st, Lexbuf *lb)
+{
+ long r;
+ int startline;
+
+ startline = st->line;
+ do{
+ if((r = get(st)) == -1)
+ break;
+ if(r == L'&')
+ if((r = entityref(st)) == -1)
+ break;
+ growrune(st, lb, r);
+ }while(match(st, L"]]>") == 1);
+
+ if(r == -1){
+ failed(st, "EOF in CDATA (re: line %d)", startline);
+ return -1;
+ }
+ return Tname;
+}
+
+/*
+ * byte order mark.
+ * This is pointless for utf8 which has defined byte order,
+ * and we don't support utf16 or utf32 but some xml seems to
+ * prepend them to utf8 so we need to find them and skip them
+ */
+static int
+bom(State *st)
+{
+ long r;
+
+ if((r = get(st)) == -1)
+ return -1;
+
+ if(r != 0xbb){
+ unget(st, 0xef);
+ unget(st, r);
+ return -1;
+ }
+ if((r = get(st)) == -1){
+ unget(st, 0xef);
+ unget(st, 0xbb);
+ return -1;
+ }
+ if(r != 0xbf){
+ unget(st, 0xef);
+ unget(st, 0xbb);
+ unget(st, r);
+ }
+ return 0;
+}
+
+static int
+xlex(State *st, Lexbuf *lb, int s)
+{
+ long r;
+ Rune q;
+
+ while((r = get(st)) != -1){
+ if(r == 0xef)
+ if(bom(st) == 0)
+ continue;
+ if(r == L'<'){
+ r = get(st);
+ switch(r){
+ case L'?':
+ while((r = get(st)) != -1 && r != L'>')
+ continue;
+ if(r == -1)
+ return -1;
+ return Twhite;
+ case L'!':
+ if(match(st, L"--") == 0)
+ return comment(st);
+ if(match(st, L"DOCTYPE ") == 0)
+ return doctype(st, lb);
+ if(match(st, L"[CDATA[") == 0)
+ return cdata(st, lb);
+ failed(st, "<!name not known");
+ case L'/':
+ return Tendblk;
+ case L' ':
+ case L'\t':
+ case L'\f':
+ case L'\n':
+ case L'\r':
+ failed(st, "whitespace following '<'");
+ break;
+ default:
+ unget(st, r);
+ return Topen;
+ }
+ continue;
+ }
+
+ if(s != Slost){
+ switch(r){
+ case '=':
+ return Tequal;
+ case '>':
+ return Tclose;
+ case '/':
+ r = get(st);
+ if(r == '>')
+ return Tnulblk;
+ unget(st, r);
+ continue;
+ case '\'':
+ case '"': /* attribute value */
+ q = r;
+ while((r = get(st)) != -1 && r != q){
+ if(r == L'&')
+ if((r = entityref(st)) == -1)
+ break;
+ growrune(st, lb, r);
+ }
+ if(r == -1)
+ return -1;
+ return Tname;
+ case '\n':
+ case '\r':
+ case ' ':
+ case '\v':
+ case '\f':
+ case '\t':
+ do
+ growrune(st, lb, r);
+ while((r = get(st)) != -1 && isspacerune(r));
+ if(r == -1)
+ return -1;
+ unget(st, r);
+ return Twhite;
+ default: /* attribute name */
+ do
+ growrune(st, lb, r);
+ while((r = get(st)) != -1 && isnameN(r));
+ if(r == -1)
+ return -1;
+ unget(st, r);
+ return Tname;
+ }
+ }
+
+ do{
+ if(r == L'&')
+ if((r = entityref(st)) == -1)
+ break;
+ growrune(st, lb, r);
+ }while((r = get(st)) != -1 && r != '<');
+ if(r == -1)
+ return -1;
+ unget(st, r);
+ return Tname;
+ }
+ return -1;
+}
+
+static Elem *
+_xmlparse(State *st, Elem *parent, int depth)
+{
+ Attr *ap;
+ Lexbuf lexbuf, *lb;
+ Lexbuf pcdata, *pc;
+ Elem *root, *ep;
+ int os, s, t, a;
+
+ ap = nil;
+ ep = nil;
+ s = Slost;
+ root = nil;
+ lb = &lexbuf;
+ memset(lb, 0, sizeof(Lexbuf));
+ pc = &pcdata;
+ memset(pc, 0, sizeof(Lexbuf));
+ while((t = xlex(st, lb, s)) != -1){
+ os = s;
+ s = statab[os][t];
+ a = acttab[os][t];
+ if(xmldebug == 2)
+ fprint(2, "depth=%d token=%s action=%s state=%s->%s str='%s'\n",
+ depth, tokstr[t], actstr[a], stastr[os], stastr[s], lb->buf);
+ switch(a){
+ case Aelem:
+ if(xmldebug == 1)
+ fprint(2, "%-3d %*.selem name='%s'\n", st->line, depth, "", lb->buf);
+ if(!isname1(lb->buf[0]))
+ failed(st, "'%s' is an illegal element name", lb->buf);
+ if(st->flags & Fstripnamespace)
+ stripns(lb->buf);
+ assert((ep = xmlelem(st->xml, &root, parent, lb->buf)) != nil);
+ ep->line = st->line;
+ break;
+ case Apcdata:
+ if(parent)
+ growstr(st, pc, lb->buf);
+ break;
+ case Aattr:
+ assert(ep != nil);
+ if(xmldebug == 1)
+ fprint(2, "%-3d %*.sattr name='%s'\n", st->line, depth, "", lb->buf);
+ if(!isname1(lb->buf[0]))
+ failed(st, "'%s' is an illegal attribute name", lb->buf);
+ if(st->flags & Fstripnamespace)
+ stripns(lb->buf);
+ assert((ap = xmlattr(st->xml, &(ep->attrs), ep, lb->buf, nil)) != nil);
+ break;
+ case Avalue:
+ assert(ep != nil);
+ assert(ap != nil);
+ ap->value = xmlstrdup(st->xml, lb->buf, 0);
+ ap = nil;
+ if(xmldebug == 1)
+ fprint(2, "%*.sattr value=%s\n", depth, "", lb->buf);
+ break;
+ case Adown:
+ assert(ep != nil);
+ if(xmldebug == 1)
+ fprint(2, "%*.sdown name=%s\n", depth, "", ep->name);
+ ep->child = _xmlparse(st, ep, depth+1);
+ if(xmldebug == 1 && ep->pcdata)
+ fprint(2, "%*.s name=%s pcdata len=%ld\n",
+ depth, "",
+ ep->name,
+ (ep->pcdata)? strlen(ep->pcdata): 0L);
+ break;
+ case Aup:
+ if(pc->buf){
+ parent->pcdata = xmlstrdup(st->xml, pc->buf, 0);
+ free(pc->buf);
+ }
+ free(lb->buf);
+ return root;
+ /* NOTREACHED */
+ break;
+ case Acheck:
+ assert(ep != nil);
+ if(st->flags & Fstripnamespace)
+ stripns(lb->buf);
+ if(ep->name && strcmp(lb->buf, ep->name) != 0)
+ failed(st, "</%s> found, expecting match for <%s> (re: line %d) - nesting error",
+ lb->buf, ep->name, ep->line);
+ break;
+ case Anop:
+ break;
+ case Aerr:
+ failed(st, "%s syntax error", lb->buf);
+ break;
+ default:
+ sysfatal("xmlparse: %d - internal error, unknown action\n", a);
+ break;
+ }
+ if(lb->buf)
+ lb->buf[0] = 0;
+ }
+ if(t == -1 && depth != 0)
+ failed(st, "unexpected EOF (depth=%d)", depth);
+
+ if(pc->buf){
+ parent->pcdata = xmlstrdup(st->xml, pc->buf, 0);
+ free(pc->buf);
+ }
+ free(lb->buf);
+ return root;
+}
+
+Xml *
+xmlparse(int fd, int blksize, int flags)
+{
+ State s;
+ Biobuf bio;
+ Xml *x;
+
+ memset(&s, 0, sizeof(s));
+ s.line = 1;
+ Binit(&bio, fd, OREAD);
+ s.bp = &bio;
+ s.flags = flags;
+
+ x = xmlnew(blksize);
+ s.xml = x;
+
+ x->root = _xmlparse(&s, nil, 0);
+ if(s.failed){
+ if(x)
+ xmlfree(x);
+ x = nil;
+ }
+ Bterm(&bio);
+ return x;
+}
--- /dev/null
+++ b/libxml/xmlprint.c
@@ -1,0 +1,83 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <ctype.h>
+#include "xml.h"
+
+static void
+prval(Biobuf *bp, char *s)
+{
+ char *p;
+ Rune r;
+
+ p = s;
+ while(*p){
+ p += chartorune(&r, p);
+ switch(r){
+ case L'&': Bprint(bp, "&"); break;
+ case L'<': Bprint(bp, "<"); break;
+ case L'>': Bprint(bp, ">"); break;
+ case L'"': Bprint(bp, """); break;
+ case L'\'': Bprint(bp, "'"); break;
+ default:
+ if(r >= L' ')
+ Bprint(bp, "%C", r);
+ else
+ Bprint(bp, "&#x%04x;", r);
+ break;
+ }
+ }
+}
+
+static void
+_xmlprint(Biobuf *bp, Elem *ep, int in)
+{
+ Attr *ap;
+ enum {indent = 4};
+
+ for(; ep; ep = ep->next){
+ Bprint(bp, "%*s<%s", in, "", ep->name);
+
+ for (ap = ep->attrs; ap; ap = ap->next){
+ Bprint(bp, " %s=\'", ap->name);
+ prval(bp, ap->value);
+ Bprint(bp, "\'");
+ }
+
+ if(ep->child){
+ if(ep->pcdata){
+ Bprint(bp, ">\n%*s\n", in+indent, "");
+ prval(bp, ep->pcdata);
+ }
+ else
+ Bprint(bp, ">\n");
+ _xmlprint(bp, ep->child, in+indent);
+ Bprint(bp, "%*s</%s>\n", in, "", ep->name);
+ }
+ else{
+ if(ep->pcdata){
+ Bprint(bp, ">\n%*s", in+indent, "");
+ prval(bp, ep->pcdata);
+ Bprint(bp, "\n%*s</%s>\n", in, "", ep->name);
+ }
+ else
+ Bprint(bp, "/>\n");
+ }
+ }
+}
+
+void
+xmlprint(Xml *xp, int fd)
+{
+ Biobuf bout;
+
+ Binit(&bout, fd, OWRITE);
+ if(xp->doctype){
+ Bprint(&bout, "<?xml version='1.0' encoding='utf-8'?>\n");
+ Bprint(&bout, "<!DOCTYPE %s>\n", xp->doctype);
+ }
+ else
+ Bprint(&bout, "<?xml version='1.0' encoding='utf-8' standalone='yes'?>\n");
+ _xmlprint(&bout, xp->root, 0);
+ Bterm(&bout);
+}
--- /dev/null
+++ b/libxml/xmlvalue.c
@@ -1,0 +1,19 @@
+#include <u.h>
+#include <libc.h>
+#include "xml.h"
+
+char *
+xmlvalue(Elem *ep, char *name)
+{
+ Attr *ap;
+
+ /*
+ * This enables the common idiom: xmlvalue(xmllook(), name);
+ */
+ if (ep == nil)
+ return nil;
+ for(ap = ep->attrs; ap; ap = ap->next)
+ if(strcmp(ap->name, name) == 0)
+ return ap->value;
+ return nil;
+}
--- /dev/null
+++ b/mkfile
@@ -1,0 +1,44 @@
+</$objtype/mkfile
+
+INSTALLFILES=\
+ /sys/man/1/xb \
+ /sys/man/2/xml \
+ /sys/include/xml.h \
+ /$objtype/lib/libxml.a \
+ /$objtype/bin/xb \
+
+CFLAGS=$CFLAGS -I..
+
+all:VQ:
+ echo only automatic install possible for now. >[1=2]
+ echo Otherwise, try some manual installation. >[1=2]
+
+install:V: $INSTALLFILES
+
+/sys/man/1/xb: xb
+ cp $prereq $target
+
+/sys/man/2/xml: xml
+ cp $prereq $target
+
+/sys/include/xml.h: xml.h
+ cp $prereq $target
+
+/$objtype/lib/libxml.a:
+ cd libxml && mk install && cd ..
+
+/$objtype/bin/xb: $O.xb
+ cp $prereq $target
+
+$O.%: %.$O
+ $LD $LDFLAGS -o $target $prereq
+
+%.$O: %.c
+ $CC $CFLAGS $stem.c
+
+clean:V:
+ cd libxml && mk clean && cd ..
+ rm -f [$OS].* *.[$OS]
+
+nuke:V:
+ rm -f $INSTALLFILES
--- /dev/null
+++ b/xb
@@ -1,0 +1,52 @@
+.TH XB 1
+.SH NAME
+xb \- XML beautifier
+.SH SYNOPSIS
+.B xb
+[
+.B dM
+][
+.I file
+]
+.SH DESCRIPTION
+.I Xb
+performs a DOM model, non-validating parse of the given XML
+file (or standard input if none specified). This is then written
+to standard output reformatted into an indented, one-line-per-element
+style.
+.PP
+.I Xb
+takes some liberties with the document which may effect later
+interpretation, though it is idempotent:
+.IP -
+Re-formatting the will change the meaning of PCdata in the
+XML file if leading and trailing whitespace is significant.
+.IP -
+All comments are stripped, though the
+.B DOCTYPE
+structured comment is preserved. A new XML version
+comment is always generated on the output.
+.IP -
+Attributes are always quoted with single quotes
+whether the source uses single or double quotes.
+.IP -
+Empty long form elements are replaced with their short form;
+for example:
+.EX
+ <fred a='1'>
+ </fred>
+.EE
+ is replaced with
+.EX
+ <fred a='1' />
+.EE
+.IP -
+.I Xml
+has its own view of the valid characterset for XML files, which might
+result in slightly different characters being escaped as entity references
+to those on the input. This is only a matter of interpretation, the meaning
+of the file will be unaffected.
+.SH SOURCE
+.B /sys/src/cmd/xb.c
+.SH "SEE ALSO"
+.IR xml (2)
--- /dev/null
+++ b/xb.c
@@ -1,0 +1,75 @@
+/*
+ * Almost an xml beautifier.
+ *
+ * It can be useful just for re-indenting machine generated xml
+ * to make it human readable. It also performs some basic checks
+ * on the file's validity which can be handy if hand editing the
+ * xml source.
+ *
+ * It does take some liberties with the data:
+ *
+ * - It strips the all comments, though the xml version
+ * and DOCTYPE "structured comments" are preserved.
+ *
+ * - attributes are always quoted with more plan9 style single
+ * quotes whether the source has single or double quotes.
+ *
+ * - empty elements are replaced with the short form, e.g.
+ * <fred a='1'> </fred> is mapped to <fred a='1' />
+ */
+
+#include <u.h>
+#include <libc.h>
+#include <pool.h>
+#include <xml.h>
+
+static void
+usage(void)
+{
+ fprint(2, "usage: %s [-dM] [file.xml]\n", argv0);
+ exits("usage");
+}
+
+void
+main(int argc, char *argv[])
+{
+ Xml *xp;
+ int fd, mem;
+
+ mem = 0;
+ ARGBEGIN{
+ case 'd':
+ xmldebug++;
+ break;
+ case 'M':
+ mem++;
+ break;
+ default:
+ usage();
+ }ARGEND;
+
+
+ if(mem > 1)
+ mainmem->flags |= POOL_NOREUSE|POOL_PARANOIA|POOL_ANTAGONISM;
+
+ if(argc == 0){
+ if((xp = xmlparse(0, 8192, Fcrushwhite)) == nil)
+ sysfatal("stdin:%r\n");
+ }
+ else{
+ if((fd = open(argv[0], OREAD)) == -1)
+ sysfatal("%s cannot open\n", argv[0]);
+ if((xp = xmlparse(fd, 8192, Fcrushwhite)) == nil)
+ sysfatal("%s:%r\n", argv[0]);
+ close(fd);
+ }
+ if(mem)
+ fprint(2, "%s: parsed - mem size=%lud free=%lud alloc=%lud nfree=%d\n",
+ argv0, mainmem->cursize, mainmem->curfree, mainmem->curalloc, mainmem->nfree);
+ xmlprint(xp, 1);
+ xmlfree(xp);
+ if(mem)
+ fprint(2, "%s: free'd - mem size=%lud free=%lud alloc=%lud nfree=%d\n",
+ argv0, mainmem->cursize, mainmem->curfree, mainmem->curalloc, mainmem->nfree);
+ exits("");
+}
--- /dev/null
+++ b/xml
@@ -1,0 +1,181 @@
+.TH XML 2
+.SH NAME
+xmlattr,
+xmlcalloc,
+xmlelem,
+xmlfind,
+xmlfree,
+xmllook,
+xmlmalloc,
+xmlnew,
+xmlparse,
+xmlprint,
+xmlstrdup,
+xmlvalue
+\- XML parser
+.SH SYNOPSIS
+.de PB
+.PP
+.ft L
+.nf
+..
+.PB
+#include <u.h>
+#include <libc.h>
+#include <xml.h>
+.PB
+enum {
+ Fcrushwhite = 1,
+ Fstripnamespace = 2,
+};
+.PB
+struct Xml{
+ Elem *root; /* root of tree */
+ char *doctype; /* DOCTYPE structured comment, or nil */
+ ...
+};
+.PB
+struct Elem {
+ Elem *next; /* next element at this hierarchy level */
+ Elem *child; /* first child of this node */
+ Elem *parent; /* parent of this node */
+ Attr *attrs; /* linked list of atributes */
+ char *name; /* element name */
+ char *pcdata; /* pcdata following this element */
+ int line; /* Line number (for errors) */
+};
+.PB
+struct Attr {
+ Attr *next; /* next atribute */
+ Elem *parent; /* parent element */
+ char *name; /* atributes name */
+ char *value; /* atributes value */
+};
+.PB
+.PD 0
+.ta +\w'\fL 'u +\w'\fL 'u +6n +4n
+Attr* xmlattr(Xml *xp, Attr **root, Elem *parent,
+ char *name, char *value)
+.PB
+Elem* xmlelem(Xml *xp, Elem **root, Elem *parent, char *name)
+.PB
+Elem* xmlfind(Xml *xp, Elem *ep, char *path)
+.PB
+Elem* xmllook(Elem *ep, char *path, char *attr, char *value)
+.PB
+Xml* xmlnew(int blksize)
+.PB
+Xml* xmlparse(int fd, int blksize, int flags)
+.PB
+char* xmlvalue(Elem *ep, char *name)
+.PB
+void* xmlmalloc(Xml *xp, usize size)
+.PB
+void* xmlcalloc(Xml *xp, usize nelem, usize elemsz)
+.BP
+void* xmlstrdup(Xml *xp, char *s)
+.PB
+void xmlfree(Xml *xp)
+.PB
+void xmlprint(Xml *xp, int fd)
+.SH DESCRIPTION
+.PP
+.I Libxml
+is a library for manipulating an XML document, in-memory
+(known as the DOM model). Each element may have a
+number of children, each of which has a number of attributes, each attribute
+has a single value. All elements contain a pointer to their parent
+element, the root element having a nil parent pointer.
+.I Pcdata
+(free form text) found between elements is attached to element which follows it.
+The line numbers where each element was found is stored to allow unambigious
+error messages during later analysis.
+.PP
+Strings are stored in two data structures: a binary tree for common names such as
+element and attribute names. Uncommon names such as values and
+pcdata are stored in a simple, unmanaged heap. These steps vastly reduce the memory
+footprint of the parsed file and the time needed to free the XML data.
+.PP
+.I Xmlparse
+reads the given file and builds an in-memory tree.
+.I Blocksize
+controls the granularity of allocation of the string heap described above,
+8192 is typically used.
+The
+.I flags
+field allows some control over the parser, it is a bitwise
+.B or
+of the following values:
+.TP 10
+.B Fcrushwhite
+All strings whitespace in PCdata is replaced by a single space and leading and trailing whitespace is
+removed.
+.TP
+.B Fstripsnamespace
+Remove leading namespace strings form all element and attribute names; this effectively ignores namespaces
+which can lead to parsing ambiguities, though in practice it has not been a problem—yet.
+.PP
+Xml trees may also be built up by calling
+.I xmlnew
+to create the XML tree, followed by
+.I xmlelem
+and
+.I xmlattr
+to create individual elements and attributes respectively.
+.I Xmlelem
+takes the address of the root of an element list to which the new
+element should be appended, the address of the parent node the new
+element should reference, and the name
+of the node to create; It returns the address of the created element.
+.PP
+.I Xmlattr
+attaches an attribute to an existing element. It takes a list pointer
+and parent pointer like
+.IR xmlelem ,
+but requires both an atribute name and value,
+and returns the address of the new attribute.
+.PP
+.I Xmllook
+descends through the tree rooted at
+.I ep
+using the path specified in
+.IR path .
+It then returns if
+.I elem
+is nil, or continues to search for a matching element.
+if
+.I attr
+and
+.I value
+are not nil, the search will continue for
+for an element which contains this attribute and value pair.
+.PP
+.I Xmlvalue
+searches the given element's attribute list and returns the
+value of the attribute found or nil if that attribute is not found.
+.PP
+.I Xmlprint
+writes the XML hierarchy rooted at \fIep\fR as text to the given
+file descriptor.
+.PP
+.IR Xmlmalloc ,
+.IR xmlcalloc ,
+and
+.IR xmlstrdup
+allocate memory within the
+.B Xml
+tree.
+.I Xmlfree
+frees all memory used by the given
+.B Xml
+tree.
+.SH SOURCE
+/sys/src/libxml
+.SH "SEE ALSO"
+.IR xb (1).
+.SH BUGS
+Namespaces should be handled properly.
+.PP
+A SAX model parser will probably be needed sometime (e.g. for Ebooks).
+.PP
+UTF-16 headers should be respected but UTF-16 files seems rare.
--- /dev/null
+++ b/xml.h
@@ -1,0 +1,61 @@
+
+#pragma lib "libxml.a"
+
+typedef struct Xml Xml;
+typedef struct Attr Attr;
+typedef struct Elem Elem;
+
+typedef struct Xtree Xtree;
+typedef struct Xblock Xblock;
+
+#pragma incomplete Xtree
+#pragma incomplete Xblock
+
+enum {
+ Fcrushwhite = 1,
+ Fstripnamespace = 2,
+};
+
+struct Xml {
+ Elem *root; /* root of tree */
+ char *doctype; /* DOCTYPE structured comment, or nil */
+ struct {
+ Xtree *root;
+ Xblock *active;
+ int blksiz;
+ } alloc;
+};
+
+struct Elem {
+ Elem *next; /* next element at this hierarchy level */
+ Elem *child; /* first child of this node */
+ Elem *parent; /* parent of this node */
+ Attr *attrs; /* linked list of atributes */
+ char *name; /* element name */
+ char *pcdata; /* pcdata following this element */
+ int line; /* Line number (for errors) */
+};
+
+struct Attr {
+ Attr *next; /* next atribute */
+ Elem *parent; /* parent element */
+ char *name; /* atributes name (nil for coments) */
+ char *value; /* atributes value */
+};
+
+extern int xmldebug;
+
+Attr* xmlattr(Xml *, Attr **, Elem *, char *, char *);
+Elem* xmlelem(Xml *, Elem **, Elem *, char *);
+Elem* xmlfind(Xml *, Elem *, char *);
+void xmlfree(Xml *);
+char* xmlstrdup(Xml*, char *, int);
+void* xmlcalloc(Xml *, int, int);
+void* xmlmalloc(Xml *, int);
+void _Xheapstats(void);
+void _Xheapfree(Xml *);
+Elem* xmllook(Elem *, char *, char *, char *);
+Xml* xmlnew(int);
+Xml* xmlparse(int, int, int);
+void xmlprint(Xml *, int);
+char* xmlvalue(Elem *, char *);