shithub: xml-9atom

Download patch

ref: 05cf7ced4f8ffc7dc0ab2fbc601d5d33e6e5ca42
author: sirjofri <sirjofri@sirjofri.de>
date: Mon Jul 8 07:59:39 EDT 2024

adds files from archive, adds rudimentary mkfile for easy installation

diff: cannot open b/libxml//null: file does not exist: 'b/libxml//null'
--- /dev/null
+++ b/README
@@ -1,0 +1,23 @@
+xml tools from 9atom as archived from
+https://github.com/Plan9-Archive/9atom
+
+plug-in repository for the following xml tools:
+
+- libxml:
+  /sys/include/xml.h
+  /$objtype/lib/libxml.a
+  /sys/man/2/xml
+- xb:
+  /$objtype/bin/xb
+  /sys/man/1/xb
+
+Installation:
+
+mk install
+mk clean (optional)
+
+Deinstallation:
+
+mk nuke
+
+This will remove all the installed files.
--- /dev/null
+++ b/libxml/doc.c
@@ -1,0 +1,32 @@
+/*
+ * Generate a state transition diagram from the state tables,
+ * we skip error transitions as they just confuse the drawing.
+ */
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include "state-machine.h"
+
+void
+main(void)
+{
+	int os, s, a, t;
+
+	print("digraph xml_parser {\n");
+	print("	rankdir=LR;\n");
+	print("	size=\"11,9\"\n");
+	print("	node [shape = circle];\n");
+
+	for(t = 0; t < NumToks; t++)
+		for(os = 0; os < NumStates; os++){
+			s = statab[os][t];
+			a = acttab[os][t];
+			if(a != Aerr)
+				print("	%s -> %s [ label = \"%s / %s\" ];\n",
+					stastr[os], stastr[s], tokstr[t], actstr[a]);
+		}
+
+	print("}\n");
+	exits(nil);
+}
+
--- /dev/null
+++ b/libxml/heap.c
@@ -1,0 +1,134 @@
+#include <u.h>
+#include <libc.h>
+#include "xml.h"
+
+#define Roundup(x, g)	(((x) + (unsigned)(g-1)) & ~((unsigned)(g-1)))
+
+struct Xtree {
+	Xtree *left;
+	Xtree *right;
+	char *str;
+	int hits;
+};
+
+struct Xblock {
+	Xblock *next;
+	char *free;
+	char *end;
+};
+
+static int Strdups, Commons, Unique, Memblocks;
+
+static void *
+getmem(Xml *xp, int len)
+{
+	int sz;
+	Xblock *b;
+	char *ret;
+
+	len = Roundup(len, sizeof(long long));
+
+	sz = xp->alloc.blksiz;		/* shorthand */
+	b = xp->alloc.active;
+
+	if(len > sz)
+		sysfatal("store: object too big (%d > %d)\n", len, sz);
+
+	if(xp->alloc.active == nil || b->free + len >= b->end){
+		Memblocks++;
+		b = mallocz(sizeof(Xblock) + sz, 0);
+		b->free = (char *)&b[1];
+		b->end = (char *)&b->free[sz];
+
+		b->next = xp->alloc.active;
+		xp->alloc.active = b;
+	}
+
+	ret = b->free;
+	b->free += len;
+	return ret;
+}
+
+static Xtree *
+lookadd(Xml *xp, Xtree *t, char *str, Xtree **match)
+{
+	int n;
+
+	if(t == nil){
+		Unique++;
+		t = getmem(xp, sizeof(Xtree) + strlen(str)+1);
+		t->left = nil;
+		t->right = nil;
+		t->str = (char *)&t[1];
+		strcpy(t->str, str);
+		*match = t;
+		t->hits = 1;
+		return t;
+	}
+
+	if((n = strcmp(str, t->str)) == 0){
+		*match = t;
+		t->hits++;
+	}
+	if(n < 0)
+		t->left = lookadd(xp, t->left, str, match);
+	if(n > 0)
+		t->right = lookadd(xp, t->right, str, match);
+	return t;
+}
+
+char *
+xmlstrdup(Xml *xp, char *str, int iscommon)
+{
+	char *s;
+	Xtree *t;
+
+	Strdups++;
+	if(iscommon){
+		Commons++;
+		xp->alloc.root = lookadd(xp, xp->alloc.root, str, &t);
+		return t->str;
+	}
+
+	s = getmem(xp, strlen(str)+1);
+	return strcpy(s, str);
+}
+
+void *
+xmlcalloc(Xml *xp, int n, int m)
+{
+	void *v;
+
+	v = getmem(xp, n * m);
+	memset(v, 0, n * m);
+	return v;
+}
+
+void *
+xmlmalloc(Xml *xp, int n)
+{
+	return getmem(xp, n);
+}
+
+
+void
+_Xheapstats(void)
+{
+	fprint(2, "total=%d common=%d -> unique=%d rare=%d memblocks=%d\n",
+		Strdups, Commons, Unique, Strdups - Commons, Memblocks);
+}
+
+void
+_Xheapfree(Xml *xp)
+{
+	Xblock *b, *n;
+
+	for(b = xp->alloc.active; b; b = n){
+		n = b->next;
+		if(xmldebug)
+			memset(b, 0x7e, xp->alloc.blksiz);
+		free(b);
+	}
+
+}
+
--- /dev/null
+++ b/libxml/mkfile
@@ -1,0 +1,34 @@
+</$objtype/mkfile
+
+LIB=/$objtype/lib/libxml.a
+
+OFILES=\
+	xmlattr.$O\
+	xmlelem.$O\
+	xmlfind.$O\
+	xmlfree.$O\
+	xmlparse.$O\
+	xmlprint.$O\
+	xmlnew.$O\
+	xmllook.$O\
+	xmlvalue.$O\
+	heap.$O\
+
+HFILES=\
+	/sys/include/xml.h
+
+CLEANFILES=doc.ps $O.doc
+
+UPDATE=\
+	mkfile\
+	$HFILES\
+	${OFILES:%.$O=%.c}\
+	${LIB:/$objtype/%=/386/%}
+
+</sys/src/cmd/mksyslib
+
+$O.doc: doc.$O
+	$LD -o $target $prereq
+
+doc.ps: 8.doc 
+	8.doc | dot '-Grotate=90' '-Gsize=10,8' -Tps > doc.ps
--- /dev/null
+++ b/libxml/state-machine.h
@@ -1,0 +1,83 @@
+enum {			/* Lexer Tokens */
+	Twhite = 0,
+	Topen,
+	Tname,
+	Tclose,
+	Tequal,
+	Tendblk,
+	Tnulblk,
+	NumToks
+};
+
+enum {			/* Parser states */
+	Slost	= 0,
+	Sopened	= 1,
+	Snamed	= 2,
+	Sattred	= 3,
+	Sequed	= 4,
+	Sendblk	= 5,
+	Sclosed	= 6,
+	NumStates
+
+};
+
+enum {			/* Parser Actions */
+	Aerr	= 0,
+	Anop	= 1,
+	Aelem	= 2,
+	Apcdata	= 3,
+	Aattr	= 4,
+	Avalue	= 5,
+	Aup	= 6,
+	Adown	= 7,
+	Acheck	= 8,
+	NumActions
+};
+
+static char *
+tokstr[] = {	/* lexer token names for debug */
+	[Twhite]	"white",	[Topen]		"open",
+	[Tname]		"name",		[Tclose]	"close",
+	[Tequal]	"equal",	[Tendblk]	"endblk",
+	[Tnulblk]	"nulblk"
+};
+
+static char *
+stastr[] = {	/* parser state names for debug */
+	[Slost]		"lost",		[Sopened]	"opened",
+	[Snamed]	"named",	[Sattred]	"attred",
+	[Sequed]	"equed",	[Sendblk]	"endblk",
+	[Sclosed]	"closed",
+};
+
+static char *
+actstr[] = {	/* parser action names for debug */
+	[Aerr]		"error",	[Anop]		"nop",
+	[Apcdata]	"pcdata",	[Aattr]		"attr",
+	[Avalue]	"value",	[Aelem]		"elem",
+	[Aup]		"up",		[Adown]		"down",
+	[Acheck]	"check"
+};
+
+
+static int statab[7][7] = {	/* Parser state transition table */
+/* 			Twhite	Topen	Tname	Tclose	Tequal	Tendblk	Tnulblk */ 
+	[Slost]     {	Slost, 	Sopened,Slost, 	Slost, 	Slost, 	Slost,	Slost },
+	[Sopened]   {	0, 	0, 	Snamed,	0, 	0, 	0, 	0 },
+	[Snamed]    {	Snamed, 0, 	Sattred,Sendblk,0, 	Slost,	Slost },            
+	[Sattred]   {	Sattred, 0, 	0, 	0, 	Sequed, 0, 	0 },
+	[Sequed]    {	Sequed, 0, 	Snamed,	0, 	0, 	0, 	0 },
+	[Sendblk]   {	0, 	0, 	Sclosed,0, 	0, 	0, 	0 },
+	[Sclosed]   {	0,	0,	0,	Slost,	0,	0,	0 },          
+};
+
+static int acttab[7][7] = {	/* Parser action table */
+/* 			Twhite	Topen	Tname	Tclose	Tequal	Tendblk	Tnulblk */         
+	[Slost]     {	Apcdata, Anop, 	Apcdata, Apcdata, Apcdata, Aup,	Apcdata },
+	[Sopened]   {	0, 	0, 	Aelem,	0, 	0, 	0, 	0 },
+	[Snamed]    {	Anop,	0, 	Aattr,	Adown,	0, 	Anop,	Anop },            
+	[Sattred]   {	Anop, 	0, 	0, 	0, 	Anop,	0, 	0 },
+	[Sequed]    {	Anop, 	0, 	Avalue,	0, 	0, 	0, 	0 },
+	[Sendblk]   {	0, 	0, 	Acheck, 0, 	0, 	0, 	0 },
+	[Sclosed]   {	0,	0,	0,	Anop,	0,	0,	0 },          
+};
--- /dev/null
+++ b/libxml/xmlattr.c
@@ -1,0 +1,31 @@
+#include <u.h>
+#include <libc.h>
+#include "xml.h"
+
+Attr *
+xmlattr(Xml *xp, Attr **root, Elem *parent, char *name, char *value)
+{
+	Attr *ap, *t;
+
+	USED(xp);
+	if((ap = xmlcalloc(xp, sizeof(Attr), 1)) == nil)
+		sysfatal("no memory - %r\n");
+	if(*root == nil){
+		*root = ap;
+	}
+	else{
+		for (t = *root; t->next; t = t->next)
+			continue;
+		t->next = ap;
+	}
+	ap->parent = parent;
+
+	if(name)
+		if((ap->name = xmlstrdup(xp, name, 1)) == nil)
+			sysfatal("no memory - %r\n");
+
+	if(value)
+		if((ap->value = xmlstrdup(xp, value, 0)) == nil)
+			sysfatal("no memory - %r\n");
+	return ap;
+}
--- /dev/null
+++ b/libxml/xmlelem.c
@@ -1,0 +1,27 @@
+#include <u.h>
+#include <libc.h>
+#include "xml.h"
+
+Elem *
+xmlelem(Xml *xp, Elem **root, Elem *parent, char *name)
+{
+	Elem *ep, *t;
+
+	USED(xp);
+	if((ep = xmlcalloc(xp, sizeof(Elem), 1)) == nil)
+		sysfatal("no memory - %r\n");
+	if(! *root){
+		*root = ep;
+	}
+	else{
+		for (t = *root; t->next; t = t->next)
+			continue;
+		t->next = ep;
+	}
+	ep->parent = parent;
+	if(name)
+		if((ep->name = xmlstrdup(xp, name, 1)) == nil)
+			sysfatal("no memory - %r\n");
+	return ep;
+}
+
--- /dev/null
+++ b/libxml/xmlfind.c
@@ -1,0 +1,35 @@
+#include <u.h>
+#include <libc.h>
+#include "xml.h"
+
+/*
+ * search for element, starting at ep.
+ */
+
+Elem *
+xmlfind(Xml *xp, Elem *ep, char *path)
+{
+	char *p;
+	Elem *t;
+
+	USED(xp);
+
+	if (path == nil)
+		return nil;
+	if (*path == '/')
+		path++;
+	if ((p = strchr(path, '/')) == nil)
+		if((p = strchr(path, 0)) == nil)
+			return nil;		// shut up lint !
+
+	for(; ep; ep = ep->next)
+		if (strncmp(ep->name, path, p-path) == 0){
+			if (*p == 0)
+				return ep;
+			if (! ep->child)
+				continue;
+			if ((t = xmlfind(xp, ep->child, p)) != nil)
+				return t;
+		}
+	return nil;
+}
--- /dev/null
+++ b/libxml/xmlfree.c
@@ -1,0 +1,10 @@
+#include <u.h>
+#include <libc.h>
+#include "xml.h"
+
+void
+xmlfree(Xml *xp)
+{
+	_Xheapfree(xp);
+	free(xp);
+}
--- /dev/null
+++ b/libxml/xmllook.c
@@ -1,0 +1,49 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include "xml.h"
+
+/*
+ * search for element, starting at ep.
+ * if attr!=nil the elem must have an attribute attr
+ * if value!=nil then the elem must have attr=value
+ */
+
+Elem *
+xmllook(Elem *ep, char *path, char *attr, char *value)
+{
+	char *p;
+	Elem *t;
+	Attr *ap;
+
+	if (path == nil)
+		return nil;
+	if (*path == '/')
+		path++;
+	if ((p = strchr(path, '/')) == nil)
+		if((p = strchr(path, 0)) == nil)
+			return nil;			// shut up lint !
+
+	for(; ep; ep = ep->next)
+		if (strncmp(ep->name, path, p-path) == 0){
+			if (*p == '/'){
+				if (ep->child)
+					if ((t = xmllook(ep->child, p, attr, value)) != nil)
+						return t;
+				continue;
+			}
+			if (attr == nil)
+				return ep;
+			for (ap = ep->attrs; ap; ap = ap->next)
+				if (strcmp(ap->name, attr) == 0){
+					if (value == nil)
+						return ep;
+					if (strcmp(ap->value, value) == 0)
+						return ep;
+				}
+			if (ep->child)
+				if ((t = xmllook(ep->child, p, attr, value)) != nil)
+					return t;
+		}
+	return nil;
+}
--- /dev/null
+++ b/libxml/xmlnew.c
@@ -1,0 +1,15 @@
+#include <u.h>
+#include <libc.h>
+#include "xml.h"
+
+Xml *
+xmlnew(int blksize)
+{
+	Xml *xp;
+
+	xp = mallocz(sizeof(Xml), 1);
+	xp->alloc.blksiz = blksize;
+	if(xp == nil)
+		return nil;
+	return xp;
+}
--- /dev/null
+++ b/libxml/xmlparse.c
@@ -1,0 +1,570 @@
+#include <u.h>
+#include <libc.h>
+#include <ctype.h>
+#include <bio.h>
+#include "xml.h"
+#include "state-machine.h"
+
+int xmldebug = 0;
+
+enum {
+	Grain = 16
+};
+
+#define isname1(c)	(isalpha((c)) || c == '_')	/* FIXME: not enforced yet */
+#define isnameN(r)	(isalpharune((r)) || isdigitrune((r)) || r == L'_' || r == L'-' || r == L'.' || r == L':')
+#define Roundup(x, g)	(((x) + (unsigned)(g-1)) & ~((unsigned)(g-1)))
+
+enum {
+	Ntext = 1024,	/* longest name or atribute value possible */
+	Nref = 32	/* longest entity reference name */
+};
+
+
+typedef struct {
+	int line;	/* Line number (for errors) */
+	Biobuf *bp;	/* input stream */
+	int flags;	/* misc flags, see xml.h */
+	Xml *xml;
+	int failed;
+} State;
+
+typedef struct {
+	char *buf;
+	int sz;
+} Lexbuf;
+
+
+static int 
+trimwhite(char *s)
+{
+	char *p;
+
+	for(p = s; *p; p++)
+		if(! isspace(*p))
+			return 0;
+
+	/* trim any whitespace into a single space */
+	s[0] = ' ';
+	s[1] = 0;
+	return 1;
+}
+
+static void
+growstr(State *st, Lexbuf *lb, char *str)
+{
+	int b, s, sz;
+
+	if(str == nil || *str == 0)
+		return;
+	if((st->flags & Fcrushwhite) && trimwhite(str) &&
+	   (lb->buf == nil || lb->buf[0] == 0))
+		return;
+	b = 0;
+	if(lb->buf)
+		b = strlen(lb->buf);
+	s = strlen(str);
+
+	sz = Roundup(b+s+1, Grain);
+	if(sz >= lb->sz){
+		lb->buf = realloc(lb->buf, sz);
+		if(lb->buf == nil)
+			sysfatal("No memory, wanted %d bytes\n", sz);
+		lb->sz = sz;
+	}
+	strcpy(lb->buf+b, str);
+}
+
+static void
+growrune(State *st, Lexbuf *lb, Rune r)
+{
+	int n;
+	char str[UTFmax+1];
+
+	n = runetochar(str, &r);
+	str[n] = 0;
+	growstr(st, lb, str);
+}
+
+static void
+stripns(char *str)
+{
+	char *p;
+	
+	if((p = strrchr(str, ':')) == nil)
+		return;
+	strcpy(str, p+1);
+}
+
+static void
+failed(State *st, char *fmt, ...)
+{
+	int n;
+	va_list arg;
+	char err[ERRMAX];
+
+	st->failed = 1;
+	va_start(arg, fmt);
+	n = snprint(err, sizeof(err), "%d ", st->line);
+	vsnprint(err+n, sizeof(err)-n, fmt, arg);
+	va_end(arg);
+	werrstr("%s", err);
+	
+}
+
+static void
+unget(State *st, int c)
+{
+	if(c == '\n')
+		st->line--;
+	Bungetrune(st->bp);
+}
+
+static long
+get(State *st)
+{
+	long r;
+
+	r = Bgetrune(st->bp);
+	if(r == Runeerror){
+		failed(st, "bad UTF-8 sequence");
+		r = L' ';
+	}
+
+	if(r == L'\n')
+		st->line++;
+
+	if(xmldebug){
+		if(xmldebug == 3)
+			fprint(2, "%C", (Rune)r);
+		if(xmldebug == 1 && r == -1)
+			fprint(2, "EOF\n");
+	}
+	return r;
+}
+
+static struct {
+	char *name;
+	Rune rune;
+} Entities[] = {
+	{ "amp",	L'&' },
+	{ "lt",		L'<' },
+	{ "gt",		L'>' },
+	{ "apos",	L'\'' },
+	{ "quot",	L'"' },
+	{ "nbsp",	0xa0 },		/* no-break space */
+};
+
+static long
+entityref(State *st)
+{
+	int n, i, l;
+	long r;
+	Rune x;
+	char *p, buf[Nref];
+
+	l = 0;
+	p = buf;
+	while((r = get(st)) != -1 && r != L';' && ! isspacerune(r))
+		if(l < (sizeof(buf) - UTFmax)){
+			x = r;
+			n = runetochar(p, &x);
+			p += n;
+			l += n;
+		}
+	*p = 0;
+	if(r == -1)
+		return -1;
+
+	/* false positive */
+	if(r != L';'){
+		fprint(2, "%d: unquoted '&' - ignored\n", st->line);
+		for(i = --l; i >= 0; i--)
+			unget(st, buf[i]);
+		return L'&';
+	}
+
+	if(buf[0] == '#'){
+		if(buf[1] == 'x' || buf[1] == 'X')
+			return strtol(buf+2, 0, 16);
+		return strtol(buf+1, 0, 10);
+	}
+
+	for(i = 0; i < nelem(Entities); i++)
+		if(memcmp(Entities[i].name, buf, l) == 0)
+			return Entities[i].rune;
+
+	fprint(2, "%d: '&%s;' unknown/unsupported entity reference\n", st->line, buf);
+	return L'?';
+}
+
+static int
+match(State *st, Rune *s)
+{
+	long r;
+	Rune *p;
+
+	r = -1;
+	for(p = s; *p; p++)
+		if((r = get(st)) != *p)
+			break;
+	if(r == -1)
+		return -1;	/* EOF */
+	if(*p == 0)
+		return 0;	/* match */
+	unget(st, r);
+	for(p--; p >= s; p--)
+		unget(st, *p);
+	return 1;		/* no match */
+}
+
+static int
+comment(State *st)
+{
+	long r;
+	int startline;
+
+	startline = st->line;
+	do{
+		if(get(st) == -1)
+			break;
+	}while(match(st, L"--") == 1);
+
+	r = get(st);
+	if(r == -1){
+		failed(st, "EOF in comment (re: line %d)", startline);
+		return -1;
+	}
+	if(r != L'>'){
+		failed(st, "'--' illegal in a comment (re: line %d)", startline);
+		return Twhite;
+	}
+	return Twhite;
+}
+
+static int
+doctype(State *st, Lexbuf *lb)
+{
+	long r;
+	char *p;
+	int startline;
+
+	startline = st->line;
+	
+	/* trim leading whitespace */
+	while((r = get(st)) != -1 && isspacerune(r))
+		continue;
+	unget(st, r);
+
+	if(lb->buf)
+		lb->buf[0] = 0;
+
+	while((r = get(st)) != -1 && r != L'>')
+		growrune(st, lb, r);
+
+	if(r == -1){
+		failed(st, "EOF in DOCTYPE (re: line %d)", startline);
+		return -1;
+	}
+	/* trim trailing whitespace */
+	p = strrchr(lb->buf, 0);
+	for(p--; p >= lb->buf && isspace(*p); p--)
+		*p = 0;
+
+	st->xml->doctype = xmlstrdup(st->xml, lb->buf, 0);
+	return Twhite;
+}
+
+static int
+cdata(State *st, Lexbuf *lb)
+{
+	long r;
+	int startline;
+
+	startline = st->line;
+	do{
+		if((r = get(st)) == -1)
+			break;
+		if(r == L'&')
+			if((r = entityref(st)) == -1)
+				break;
+		growrune(st, lb, r);
+	}while(match(st, L"]]>") == 1);
+
+	if(r == -1){
+		failed(st, "EOF in CDATA (re: line %d)", startline);
+		return -1;
+	}
+	return Tname;
+}
+
+/*
+ * byte order mark.
+ * This is pointless for utf8 which has defined byte order,
+ * and we don't support utf16 or utf32 but some xml seems to
+ * prepend them to utf8 so we need to find them and skip them
+ */
+static int
+bom(State *st)
+{
+	long r;
+
+	if((r = get(st)) == -1)
+		return -1;
+
+	if(r != 0xbb){
+		unget(st, 0xef);
+		unget(st, r);
+		return -1;
+	}
+	if((r = get(st)) == -1){
+		unget(st, 0xef);
+		unget(st, 0xbb);
+		return -1;
+	}
+	if(r != 0xbf){
+		unget(st, 0xef);
+		unget(st, 0xbb);
+		unget(st, r);
+	}
+	return 0;
+}
+
+static int
+xlex(State *st, Lexbuf *lb, int s)
+{
+	long r;
+	Rune q;
+
+	while((r = get(st)) != -1){
+		if(r == 0xef)
+			if(bom(st) == 0)
+				continue;
+		if(r == L'<'){
+			r = get(st);
+			switch(r){
+			case L'?':
+				while((r = get(st)) != -1 && r != L'>')
+					continue;
+				if(r == -1)
+					return -1;
+				return Twhite;
+			case L'!':
+				if(match(st, L"--") == 0)
+					return comment(st);
+				if(match(st, L"DOCTYPE ") == 0)
+					return doctype(st, lb);
+				if(match(st, L"[CDATA[") == 0)
+					return cdata(st, lb);
+				failed(st, "<!name not known");
+			case L'/':
+				return Tendblk;
+			case L' ':
+			case L'\t':
+			case L'\f':
+			case L'\n':
+			case L'\r':
+				failed(st, "whitespace following '<'");
+				break;
+			default:
+				unget(st, r);
+				return Topen;
+			}
+			continue;
+		}
+
+		if(s != Slost){
+			switch(r){
+			case '=':
+				return Tequal;
+			case '>':
+				return Tclose;
+			case '/':
+				r = get(st);
+				if(r == '>')
+					return Tnulblk;
+				unget(st, r);
+				continue;
+			case '\'':
+			case '"':		/* attribute value */
+				q = r;
+				while((r = get(st)) != -1 && r != q){
+					if(r == L'&')
+						if((r = entityref(st)) == -1)
+							break;
+					growrune(st, lb, r);
+				}
+				if(r == -1)
+					return -1;
+				return Tname;
+			case '\n':
+			case '\r':
+			case ' ':
+			case '\v':
+			case '\f':
+			case '\t':
+				do
+					growrune(st, lb, r);
+				while((r = get(st)) != -1 && isspacerune(r));
+				if(r == -1)
+					return -1;
+				unget(st, r);
+				return Twhite;
+			default:		/* attribute name */
+				do
+					growrune(st, lb, r);
+				while((r = get(st)) != -1 && isnameN(r));
+				if(r == -1)
+					return -1;
+				unget(st, r);
+				return Tname;
+			}
+		}
+
+		do{
+			if(r == L'&')
+				if((r = entityref(st)) == -1)
+					break;
+			growrune(st, lb, r);
+		}while((r = get(st)) != -1 && r != '<');
+		if(r == -1)
+			return -1;
+		unget(st, r);
+		return Tname;
+	}
+	return -1;
+}
+
+static Elem *
+_xmlparse(State *st, Elem *parent, int depth)
+{
+	Attr *ap;
+	Lexbuf lexbuf, *lb;
+	Lexbuf pcdata, *pc;
+	Elem *root, *ep;
+	int os, s, t, a;
+
+	ap = nil;
+	ep = nil;
+	s = Slost;
+	root = nil;
+	lb = &lexbuf;
+	memset(lb, 0, sizeof(Lexbuf));
+	pc = &pcdata;
+	memset(pc, 0, sizeof(Lexbuf));
+	while((t = xlex(st, lb, s)) != -1){
+		os = s;
+		s = statab[os][t];
+		a = acttab[os][t];
+		if(xmldebug == 2)
+			fprint(2, "depth=%d token=%s action=%s state=%s->%s str='%s'\n",
+				depth, tokstr[t], actstr[a], stastr[os], stastr[s], lb->buf);
+		switch(a){
+		case Aelem:
+			if(xmldebug == 1)
+				fprint(2, "%-3d %*.selem name='%s'\n", st->line, depth, "", lb->buf);
+			if(!isname1(lb->buf[0]))
+				failed(st, "'%s' is an illegal element name", lb->buf);
+			if(st->flags & Fstripnamespace)
+				stripns(lb->buf);
+			assert((ep = xmlelem(st->xml, &root, parent, lb->buf)) != nil);
+			ep->line = st->line;
+			break;
+		case Apcdata:
+			if(parent)
+				growstr(st, pc, lb->buf);
+			break;
+		case Aattr:
+			assert(ep != nil);
+			if(xmldebug == 1)
+				fprint(2, "%-3d %*.sattr name='%s'\n", st->line, depth, "", lb->buf);
+			if(!isname1(lb->buf[0]))
+				failed(st, "'%s' is an illegal attribute name", lb->buf);
+			if(st->flags & Fstripnamespace)
+				stripns(lb->buf);
+			assert((ap = xmlattr(st->xml, &(ep->attrs), ep, lb->buf, nil)) != nil);
+			break;
+		case Avalue:
+			assert(ep != nil);
+			assert(ap != nil);
+			ap->value = xmlstrdup(st->xml, lb->buf, 0);
+			ap = nil;
+			if(xmldebug == 1)
+				fprint(2, "%*.sattr value=%s\n", depth, "", lb->buf);
+			break;
+		case Adown:
+			assert(ep != nil);
+			if(xmldebug == 1)
+				fprint(2, "%*.sdown name=%s\n", depth, "", ep->name);
+			ep->child = _xmlparse(st, ep, depth+1);
+			if(xmldebug == 1 && ep->pcdata)
+				fprint(2, "%*.s     name=%s pcdata len=%ld\n", 
+					depth, "",
+					ep->name,
+					(ep->pcdata)? strlen(ep->pcdata): 0L);
+			break;
+		case Aup:
+			if(pc->buf){
+				parent->pcdata = xmlstrdup(st->xml, pc->buf, 0);
+				free(pc->buf);
+			}
+			free(lb->buf);
+			return root;
+			/* NOTREACHED */
+			break;
+		case Acheck:
+			assert(ep != nil);
+			if(st->flags & Fstripnamespace)
+				stripns(lb->buf);
+			if(ep->name && strcmp(lb->buf, ep->name) != 0)
+				failed(st, "</%s> found, expecting match for <%s> (re: line %d) - nesting error",
+					lb->buf, ep->name, ep->line);
+			break;
+		case Anop:
+			break;
+		case Aerr:
+			failed(st, "%s syntax error", lb->buf);
+			break;
+		default:
+			sysfatal("xmlparse: %d - internal error, unknown action\n", a);
+			break;
+		}
+		if(lb->buf)
+			lb->buf[0] = 0;
+	}
+	if(t == -1 && depth != 0)
+		failed(st, "unexpected EOF (depth=%d)", depth);
+
+	if(pc->buf){
+		parent->pcdata = xmlstrdup(st->xml, pc->buf, 0);
+		free(pc->buf);
+	}
+	free(lb->buf);
+	return root;
+}
+
+Xml *
+xmlparse(int fd, int blksize, int flags)
+{
+	State s;
+	Biobuf bio;
+	Xml *x;
+
+	memset(&s, 0, sizeof(s));
+	s.line = 1;
+	Binit(&bio, fd, OREAD);
+	s.bp = &bio;
+	s.flags = flags;
+
+	x = xmlnew(blksize);
+	s.xml = x;
+
+	x->root = _xmlparse(&s, nil, 0);
+	if(s.failed){
+		if(x)
+			xmlfree(x);
+		x = nil;
+	}
+	Bterm(&bio);
+	return x;
+}
--- /dev/null
+++ b/libxml/xmlprint.c
@@ -1,0 +1,83 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <ctype.h>
+#include "xml.h"
+
+static void
+prval(Biobuf *bp, char *s)
+{
+	char *p;
+	Rune r;
+
+	p = s;
+	while(*p){
+		p += chartorune(&r, p);
+		switch(r){
+		case L'&': Bprint(bp, "&amp;"); break;
+		case L'<': Bprint(bp, "&lt;"); break;
+		case L'>': Bprint(bp, "&gt;"); break;
+		case L'"': Bprint(bp, "&quot;"); break;
+		case L'\'': Bprint(bp, "&apos;"); break;
+		default: 
+			if(r >= L' ')
+				Bprint(bp, "%C", r);
+			else
+				Bprint(bp, "&#x%04x;", r);
+			break;
+		}
+	}
+}
+
+static void
+_xmlprint(Biobuf *bp, Elem *ep, int in)
+{
+	Attr *ap;
+	enum {indent = 4};
+
+	for(; ep; ep = ep->next){
+		Bprint(bp, "%*s<%s", in, "", ep->name);
+	
+		for (ap = ep->attrs; ap; ap = ap->next){
+			Bprint(bp, " %s=\'", ap->name);
+			prval(bp, ap->value);
+			Bprint(bp, "\'");
+		}
+
+		if(ep->child){
+			if(ep->pcdata){
+				Bprint(bp, ">\n%*s\n", in+indent, "");
+				prval(bp, ep->pcdata);
+			}
+			else
+				Bprint(bp, ">\n");
+			_xmlprint(bp, ep->child, in+indent);
+			Bprint(bp, "%*s</%s>\n", in, "", ep->name);
+		}
+		else{
+			if(ep->pcdata){
+				Bprint(bp, ">\n%*s", in+indent, "");
+				prval(bp, ep->pcdata);
+				Bprint(bp, "\n%*s</%s>\n", in, "", ep->name);
+			}
+			else
+				Bprint(bp, "/>\n");
+		}
+	}
+}
+
+void
+xmlprint(Xml *xp, int fd)
+{
+	Biobuf bout;
+
+	Binit(&bout, fd, OWRITE);
+	if(xp->doctype){
+		Bprint(&bout, "<?xml version='1.0' encoding='utf-8'?>\n");
+		Bprint(&bout, "<!DOCTYPE %s>\n", xp->doctype);
+	}
+	else
+		Bprint(&bout, "<?xml version='1.0' encoding='utf-8' standalone='yes'?>\n");
+	_xmlprint(&bout, xp->root, 0);
+	Bterm(&bout);
+}
--- /dev/null
+++ b/libxml/xmlvalue.c
@@ -1,0 +1,19 @@
+#include <u.h>
+#include <libc.h>
+#include "xml.h"
+
+char *
+xmlvalue(Elem *ep, char *name)
+{
+	Attr *ap;
+
+	/*
+	 * This enables the common idiom: xmlvalue(xmllook(), name);
+	 */
+	if (ep == nil)
+		return nil;
+	for(ap = ep->attrs; ap; ap = ap->next)
+		if(strcmp(ap->name, name) == 0)
+			return ap->value;
+	return nil;
+}
--- /dev/null
+++ b/mkfile
@@ -1,0 +1,44 @@
+</$objtype/mkfile
+
+INSTALLFILES=\
+	/sys/man/1/xb \
+	/sys/man/2/xml \
+	/sys/include/xml.h \
+	/$objtype/lib/libxml.a \
+	/$objtype/bin/xb \
+
+CFLAGS=$CFLAGS -I..
+
+all:VQ:
+	echo only automatic install possible for now. >[1=2]
+	echo Otherwise, try some manual installation. >[1=2]
+
+install:V: $INSTALLFILES
+
+/sys/man/1/xb: xb
+	cp $prereq $target
+
+/sys/man/2/xml: xml
+	cp $prereq $target
+
+/sys/include/xml.h: xml.h
+	cp $prereq $target
+
+/$objtype/lib/libxml.a:
+	cd libxml && mk install && cd ..
+
+/$objtype/bin/xb: $O.xb
+	cp $prereq $target
+
+$O.%: %.$O
+	$LD $LDFLAGS -o $target $prereq
+
+%.$O: %.c
+	$CC $CFLAGS $stem.c
+
+clean:V:
+	cd libxml && mk clean && cd ..
+	rm -f [$OS].* *.[$OS]
+
+nuke:V:
+	rm -f $INSTALLFILES
--- /dev/null
+++ b/xb
@@ -1,0 +1,52 @@
+.TH XB 1
+.SH NAME
+xb \- XML beautifier
+.SH SYNOPSIS
+.B xb
+[
+.B dM
+][
+.I file
+]
+.SH DESCRIPTION
+.I Xb
+performs a DOM model, non-validating parse of the given XML
+file (or standard input if none specified). This is then written
+to standard output reformatted into an indented, one-line-per-element
+style.
+.PP
+.I Xb
+takes some liberties with the document which may effect later
+interpretation, though it is idempotent:
+.IP -
+Re-formatting the will change the meaning of PCdata in the
+XML file if leading and trailing whitespace is significant.
+.IP -
+All comments are stripped, though the 
+.B DOCTYPE
+structured comment is preserved. A new XML version
+comment is always generated on the output.
+.IP -
+Attributes are always quoted with single quotes
+whether the source uses single or double quotes.
+.IP -
+Empty long form elements are replaced with their short form;
+for example:
+.EX
+	<fred a='1'>
+	</fred>
+.EE
+ is replaced with
+.EX
+	<fred a='1' />
+.EE
+.IP -
+.I Xml
+has its own view of the valid characterset for XML files, which might
+result in slightly different characters being escaped as entity references
+to those on the input. This is only a matter of interpretation, the meaning
+of the file will be unaffected.
+.SH SOURCE
+.B /sys/src/cmd/xb.c
+.SH "SEE ALSO"
+.IR xml (2)
--- /dev/null
+++ b/xb.c
@@ -1,0 +1,75 @@
+/*
+ * Almost an xml beautifier.
+ *
+ * It can be useful just for re-indenting machine generated xml
+ * to make it human readable. It also performs some basic checks
+ * on the file's validity which can be handy if hand editing the
+ * xml source.
+ *
+ * It does take some liberties with the data:
+ *
+ *	- It strips the all comments, though the xml version
+ *	   and DOCTYPE "structured comments" are preserved.
+ *
+ *	- attributes are always quoted with more plan9 style single
+ *	  quotes whether the source has single or double quotes.
+ *
+ *	- empty elements are replaced with the short form, e.g.
+ *	   <fred a='1'> </fred> is mapped to <fred a='1' />
+ */
+
+#include <u.h>
+#include <libc.h>
+#include <pool.h>
+#include <xml.h>
+
+static void
+usage(void)
+{
+	fprint(2, "usage: %s [-dM] [file.xml]\n", argv0);
+	exits("usage");
+}
+
+void
+main(int argc, char *argv[])
+{
+	Xml *xp;
+	int fd, mem;
+
+	mem = 0;
+	ARGBEGIN{
+	case 'd':
+		xmldebug++;
+		break;
+	case 'M':
+		mem++;
+		break;
+	default:
+		usage();
+	}ARGEND;
+
+
+	if(mem > 1)
+		mainmem->flags |= POOL_NOREUSE|POOL_PARANOIA|POOL_ANTAGONISM;
+
+	if(argc == 0){
+		if((xp = xmlparse(0, 8192, Fcrushwhite)) == nil)
+			sysfatal("stdin:%r\n");
+	}
+	else{
+		if((fd = open(argv[0], OREAD)) == -1)
+			sysfatal("%s cannot open\n", argv[0]);
+		if((xp = xmlparse(fd, 8192, Fcrushwhite)) == nil)
+			sysfatal("%s:%r\n", argv[0]);
+		close(fd);
+	}
+	if(mem)
+		fprint(2, "%s: parsed - mem size=%lud free=%lud alloc=%lud nfree=%d\n",
+			argv0, mainmem->cursize, mainmem->curfree, mainmem->curalloc, mainmem->nfree);
+	xmlprint(xp, 1);
+	xmlfree(xp);
+	if(mem)
+		fprint(2, "%s: free'd - mem size=%lud free=%lud alloc=%lud nfree=%d\n",
+			argv0, mainmem->cursize, mainmem->curfree, mainmem->curalloc, mainmem->nfree);
+	exits("");
+}
--- /dev/null
+++ b/xml
@@ -1,0 +1,181 @@
+.TH XML 2
+.SH NAME
+xmlattr,
+xmlcalloc,
+xmlelem,
+xmlfind,
+xmlfree,
+xmllook,
+xmlmalloc,
+xmlnew,
+xmlparse,
+xmlprint,
+xmlstrdup,
+xmlvalue
+\- XML parser
+.SH SYNOPSIS
+.de PB
+.PP
+.ft L
+.nf
+..
+.PB
+#include <u.h>
+#include <libc.h>
+#include <xml.h>
+.PB
+enum {
+	Fcrushwhite = 1,
+	Fstripnamespace = 2,
+};
+.PB
+struct Xml{
+	Elem *root;		/* root of tree */
+	char *doctype;		/* DOCTYPE structured comment, or nil */
+	...
+};
+.PB
+struct Elem {
+	Elem *next;		/* next element at this hierarchy level */
+	Elem *child;		/* first child of this node */
+	Elem *parent;		/* parent of this node */
+	Attr *attrs;		/* linked list of atributes */
+	char *name;		/* element name */
+	char *pcdata;		/* pcdata following this element */
+	int line;			/* Line number (for errors) */
+};
+.PB
+struct Attr {
+	Attr *next;		/* next atribute */
+	Elem *parent;		/* parent element */
+	char *name;		/* atributes name */
+	char *value;		/* atributes value */
+};
+.PB
+.PD 0
+.ta +\w'\fL      'u +\w'\fL    'u +6n +4n
+Attr*	xmlattr(Xml *xp, Attr **root, Elem *parent,
+		char *name, char *value)
+.PB
+Elem*	xmlelem(Xml *xp, Elem **root, Elem *parent, char *name)
+.PB
+Elem*	xmlfind(Xml *xp, Elem *ep, char *path)
+.PB
+Elem*	xmllook(Elem *ep, char *path, char *attr, char *value)
+.PB
+Xml*	xmlnew(int blksize)
+.PB
+Xml*	xmlparse(int fd, int blksize, int flags)
+.PB
+char*	xmlvalue(Elem *ep, char *name)
+.PB
+void*	xmlmalloc(Xml *xp, usize size)
+.PB
+void*	xmlcalloc(Xml *xp, usize nelem, usize elemsz)
+.BP
+void*	xmlstrdup(Xml *xp, char *s)
+.PB
+void	xmlfree(Xml *xp)
+.PB
+void	xmlprint(Xml *xp, int fd)
+.SH DESCRIPTION
+.PP
+.I Libxml
+is a library for manipulating an XML document, in-memory 
+(known as the DOM model). Each element may have a
+number of children, each of which has a number of attributes, each attribute
+has a single value. All elements contain a pointer to their parent
+element, the root element having a nil parent pointer.
+.I Pcdata
+(free form text) found between elements is attached to element which follows it.
+The line numbers where each element was found is stored to allow unambigious
+error messages during later analysis.
+.PP
+Strings are stored in two data structures: a binary tree for common names such as
+element and attribute names. Uncommon names such as values and
+pcdata are stored in a simple, unmanaged heap. These steps vastly reduce the memory
+footprint of the parsed file and the time needed to free the XML data.
+.PP
+.I Xmlparse
+reads the given file and builds an in-memory tree.
+.I Blocksize
+controls the granularity of allocation of the string heap described above,
+8192 is typically used.
+The
+.I flags
+field allows some control over the parser, it is a bitwise
+.B or
+of the following values: 
+.TP 10
+.B Fcrushwhite
+All strings whitespace in PCdata is replaced by a single space and leading and trailing whitespace is
+removed.
+.TP
+.B Fstripsnamespace
+Remove leading namespace strings form all element and attribute names; this effectively ignores namespaces
+which can lead to parsing ambiguities, though in practice it has not been a problem—yet.
+.PP
+Xml trees may also be built up by calling
+.I xmlnew
+to create the XML tree, followed by
+.I xmlelem
+and
+.I xmlattr
+to create individual elements and attributes  respectively.
+.I Xmlelem
+takes the address of the root of an element list to which the new
+element should be appended, the address of the parent node the new
+element should reference, and the name
+of the node to create; It returns the address of the created element.
+.PP
+.I Xmlattr
+attaches an attribute to an existing element. It takes a list pointer
+and parent pointer like 
+.IR xmlelem ,
+but requires both an atribute name and value,
+and returns the address of the new attribute.
+.PP
+.I Xmllook
+descends through the tree rooted at
+.I ep
+using the path specified in
+.IR path .
+It then returns if
+.I elem
+is nil, or continues to search for a matching element.
+if
+.I attr
+and 
+.I value
+are not nil, the search will continue for
+for an element which contains this attribute and value pair.
+.PP
+.I Xmlvalue
+searches the given element's attribute list and returns the
+value of the attribute found or nil if that attribute is not found. 
+.PP
+.I Xmlprint
+writes the XML hierarchy rooted at \fIep\fR as text to the given
+file descriptor.
+.PP
+.IR Xmlmalloc ,
+.IR xmlcalloc ,
+and
+.IR xmlstrdup
+allocate memory within the
+.B Xml
+tree.
+.I Xmlfree
+frees all memory used by the given
+.B Xml
+tree.
+.SH SOURCE
+/sys/src/libxml
+.SH "SEE ALSO"
+.IR xb (1).
+.SH BUGS
+Namespaces should be handled properly.
+.PP
+A SAX model parser will probably be needed sometime (e.g. for Ebooks).
+.PP
+UTF-16 headers should be respected but UTF-16 files seems rare.
--- /dev/null
+++ b/xml.h
@@ -1,0 +1,61 @@
+
+#pragma lib "libxml.a"
+
+typedef struct Xml Xml;
+typedef struct Attr Attr;
+typedef struct Elem Elem;
+
+typedef struct Xtree Xtree;
+typedef struct Xblock Xblock;
+
+#pragma incomplete Xtree
+#pragma incomplete Xblock
+
+enum {
+	Fcrushwhite = 1,
+	Fstripnamespace = 2,
+};
+
+struct Xml {
+	Elem	*root;			/* root of tree */
+	char	*doctype;		/* DOCTYPE structured comment, or nil */
+	struct {
+		Xtree	*root;
+		Xblock	*active;
+		int	blksiz;
+	} alloc;
+};
+
+struct Elem {
+	Elem	*next;			/* next element at this hierarchy level */
+	Elem	*child;		/* first child of this node */
+	Elem	*parent;		/* parent of this node */
+	Attr	*attrs;		/* linked list of atributes */
+	char	*name;			/* element name */
+	char	*pcdata;		/* pcdata following this element */
+	int	line;			/* Line number (for errors) */
+};
+
+struct Attr {
+	Attr	*next;			/* next atribute */
+	Elem	*parent;		/* parent element */
+	char	*name;			/* atributes name (nil for coments) */
+	char	*value;		/* atributes value */
+};
+
+extern int xmldebug;
+
+Attr*	xmlattr(Xml *, Attr **, Elem *, char *, char *);
+Elem*	xmlelem(Xml *, Elem **, Elem *, char *);
+Elem*	xmlfind(Xml *, Elem *, char *);
+void	xmlfree(Xml *);
+char*	xmlstrdup(Xml*, char *, int);
+void*	xmlcalloc(Xml *, int, int);
+void*	xmlmalloc(Xml *, int);
+void	_Xheapstats(void);
+void	_Xheapfree(Xml *);
+Elem*	xmllook(Elem *, char *, char *, char *);
+Xml*	xmlnew(int);
+Xml*	xmlparse(int, int, int);
+void	xmlprint(Xml *, int);
+char*	xmlvalue(Elem *, char *);