shithub: pprolog

ref: 13efe91101a11f41caf6321a8b2fbdd96ef9927a
dir: pprolog/parser.c

View raw version
#include <u.h>
#include <libc.h>
#include <bio.h>

#include "dat.h"
#include "fns.h"

typedef struct Token Token;
typedef struct OpInfo OpInfo;
struct Token
{
	int	tag;
	Rune	*text;
	double	dval;
	vlong	ival;
};

struct OpInfo
{
	int level;
	int type;
};

enum {
	AtomTok			= 1<<0, /* 1 */
	FunctorTok		= 1<<1, /* 2 */
	VarTok			= 1<<2, /* 4 */
	StringTok		= 1<<3, /* 8 */
	IntTok			= 1<<4, /* 16 */
	FloatTok		= 1<<5, /* 32 */
	ParenLeftTok		= 1<<6, /* 64 */
	ParenRightTok		= 1<<7, /* 128 */
	SquareBracketLeftTok	= 1<<8, /* 256 */
	SquareBracketRightTok	= 1<<9, /* 512 */
	CurlyBracketLeftTok	= 1<<10, /* 1024 */
	CurlyBracketRightTok	= 1<<11, /* 2048 */
	CommaTok		= 1<<12, /* 4096 */
	PipeTok			= 1<<13, /* 8192 */
	EofTok			= 1<<14, /* 16384 */
};

static Biobuf *parsein;
static Token lookahead;
static Module *currentmod;
static VarName *varnames;

void nexttoken(void);
Term *fullterm(int, Rune *, Term *);
Term *term(void);
Term *listterm(int);
Term *curlybracketterm(void);
Term *compound(void);
Term *parsevar(void);
Term *parseoperators(Term *);
void match(int);
void syntaxerror_parser(char *);
Term *parseterm(void);

Term *
parse(Biobuf *bio, Module *mod, VarName **vns)
{
	parsein = bio;
	currentmod = mod;
	varnames = nil;
	nexttoken();

	Term *result = parseterm();
	*vns = varnames;
	return result;
}

Term *
parseterm(void)
{
	if(lookahead.tag == EofTok)
		return nil;

	Term *t = fullterm(AtomTok, L".", nil);
	if(lookahead.tag != AtomTok || runestrcmp(lookahead.text, L".") != 0)
		syntaxerror_parser("parseterm");

	return t;
}

Term *
fullterm(int stoptag, Rune *stoptext, Term *list)
{
	if((lookahead.tag & stoptag) && (stoptext == nil || runestrcmp(stoptext, lookahead.text) == 0))
		return parseoperators(list);

	Term *t = term();
	list = appendterm(list, t);
	return fullterm(stoptag, stoptext, list);
}

Term *
term(void)
{
	Term *result;

	switch(lookahead.tag){
	case AtomTok:
		result = mkatom(lookahead.text);
		match(AtomTok);
		break;
	case FunctorTok:
		result = compound();
		break;
	case VarTok:
		result = parsevar();
		break;
	case IntTok:
		result = mkinteger(lookahead.ival);
		match(IntTok);
		break;
	case FloatTok:
		result = mkfloat(lookahead.dval);
		match(FloatTok);
		break;
	case CommaTok:
		result = mkatom(L",");
		match(CommaTok);
		break;
	case SquareBracketLeftTok:
		result = listterm(1);
		break;
	case CurlyBracketLeftTok:
		result = curlybracketterm();
		break;
	case ParenLeftTok:
		match(ParenLeftTok);
		result = fullterm(ParenRightTok, nil, nil);
		result->inparens = 1;
		match(ParenRightTok);
		break;
	case StringTok:
		result = mkstring(lookahead.text);
		match(StringTok);
		break;
	default:
		print("Can't parse term of token type %d\n", lookahead.tag);
		syntaxerror_parser("term");
		result = nil;
	}

	return result;
}

Term *
listterm(int start)
{
	if(start)
		match(SquareBracketLeftTok);

	if(lookahead.tag == SquareBracketRightTok){
		match(SquareBracketRightTok);
		return mkatom(L"[]");
	}else if(lookahead.tag == PipeTok){
		match(PipeTok);
		Term *t = fullterm(SquareBracketRightTok, nil, nil);
		match(SquareBracketRightTok);
		return t;
	}else{
		Term *t = fullterm(CommaTok|PipeTok|SquareBracketRightTok, nil, nil);
		if(lookahead.tag == CommaTok)
			match(CommaTok);
		t->next = listterm(0);
		return mkcompound(L".", 2, t);
	}
}

Term *
curlybracketterm(void)
{
	match(CurlyBracketLeftTok);
	Term *result = fullterm(CurlyBracketRightTok, nil, nil);
	match(CurlyBracketRightTok);
	return mkcompound(L"{}", 1, result);
}

Term *
compound(void)
{
	Rune *name = lookahead.text;
	Term *args = nil;
	int arity = 0;

	match(FunctorTok);
	match(ParenLeftTok);

	while(lookahead.tag != ParenRightTok){
		Term *t = fullterm(CommaTok|ParenRightTok, nil, nil);
		if(lookahead.tag == CommaTok)
			match(CommaTok);
		args = appendterm(args, t);
		arity++;
	}

	match(ParenRightTok);
	return mkcompound(name, arity, args);
}

Term *
parsevar(void)
{
	Rune *name = lookahead.text;
	match(VarTok);

	VarName *vn;
	uvlong i = 0;
	for(vn = varnames; vn != nil; vn = vn->next, i++)
		if(runestrcmp(vn->name, name) == 0 && !runestrcmp(vn->name, L"_") == 0){
			vn->count++;
			return copyterm(vn->var);
		}

	VarName *new = gmalloc(sizeof(VarName));
	new->name = name;
	new->var = mkvariable();
	new->count = 1;
	new->next = varnames;
	varnames = new;
	return new->var;
}

Term *
parseoperators(Term *list)
{
	int i;
	int length = termslength(list);
	Term *t;
	Term **terms = gmalloc(sizeof(Term *) * length);
	OpInfo *infos = gmalloc(sizeof(OpInfo) * length);

	for(i = 0, t = list; i < length; i++){
		Operator *op;
		if(t->tag == AtomTerm && !t->inparens && (op = getoperator(t->text, currentmod))){
			infos[i].type = op->type;
			infos[i].level = op->level;
		}else{
			infos[i].type = 0;
			infos[i].level = 0;
		}
		terms[i] = t;
		Term *tmp = t;
		t = t->next;
		tmp->next = nil;
	}

	while(length > 1){
		int index = -1;
		int lowest = PrecedenceLevels + 1;
		for(i = 0; i < length; i++){
			if(infos[i].type == 0)
				continue;
			if(infos[i].level == lowest && infos[i].type == Xfy)
				index = i;
			else if(infos[i].level < lowest){
				index = i;
				lowest = infos[i].level;
			}
		}

		if(index == -1){
			print("Can't parse, list of length %d contains no operators: ", length);
			for(i = 0; i < length; i++)
				print("%S(%d) ", prettyprint(terms[i], 0, 1, 0, currentmod), infos[i].level);
			print("\n");
			syntaxerror_parser("parseoperators");
		}

		int infixlevel = infos[index].type & (Xfx|Xfy|Yfx);
		int prefixlevel = infos[index].type & (Fx|Fy);
		int postfixlevel = infos[index].type & (Xf|Yf);

		if(infixlevel && index != 0 && index != length-1 && infos[index-1].type == 0 && infos[index-1].type == 0){
			infos[index-1].type = 0;
			infos[index-1].level = infos[index].level;
			Term *args = terms[index-1];
			args->next = terms[index+1];
			terms[index-1] = mkcompound(terms[index]->text, 2, args);
			length -= 2;
			for(i = index; i < length; i++){
				infos[i].level = infos[i+2].level;
				infos[i].type = infos[i+2].type;
				terms[i] = terms[i+2];
			}
		}else if(prefixlevel && index != length-1 && infos[index+1].type == 0){
			infos[index].type = 0;
			terms[index] = mkcompound(terms[index]->text, 1, terms[index+1]);
			length--;
			for(i = index+1; i < length; i++){
				infos[i].level = infos[i+1].level;
				infos[i].type = infos[i+1].type;
				terms[i] = terms[i+1];
			}
		}else if(postfixlevel && index != 0 && infos[index-1].type == 0){
			infos[index-1].type = 0;
			infos[index-1].level = infos[index].level;
			terms[index-1] = mkcompound(terms[index]->text, 1, terms[index-1]);
			length--;
			for(i = index; i < length; i++){
				infos[i].level = infos[i+1].level;
				infos[i].type = infos[i+1].type;
				terms[i] = terms[i+1];
			}
		}else{
			print("Parse error when parsing operator %S (prefix=%d, postfix=%d, infix=%d level=%d)\n", prettyprint(terms[index], 0, 0, 0, currentmod), prefixlevel, postfixlevel, infixlevel, infos[index].level);
			syntaxerror_parser("parseoperators");
		}
	}

	Term *result = terms[0];
	return result;
}

void
nexttoken(void)
{
	Rune buf[1024];
	Rune peek;
	int i = 0;
	double numD;
	vlong numI = 0;
	int negative = 0;
	static long replaypeek = -1;

	if(replaypeek == -1)
		peek = Bgetrune(parsein);
	else{
		peek = replaypeek;
		replaypeek = -1;
	}

SkipWhite:
	/* Skip whitespace */
	while(isspacerune(peek))
		peek = Bgetrune(parsein);

	/* Skip single line comments */
	if(peek == L'%'){
		while(peek != L'\n')
			peek = Bgetrune(parsein);
		goto SkipWhite;
	}

	/* Variables */
	if(peek == L'_' || isupperrune(peek)){
		while(peek == L'_' || isalpharune(peek) || isdigitrune(peek)){
			/* FIXME Should handle input length, also elsewhere in this function */
			buf[i++] = peek;
			peek = Bgetrune(parsein);
		}
		buf[i] = '\0';
		Bungetrune(parsein);

		lookahead.tag = VarTok;
		lookahead.text = runestrdup(buf);
		return;
	}

	/* Strings */
	if(peek == L'"'){
		peek = Bgetrune(parsein);
		while(peek != L'"'){
			buf[i++] = peek;
			peek = Bgetrune(parsein);
		}
		buf[i] = '\0';
		lookahead.tag = StringTok;
		lookahead.text = runestrdup(buf);
		return;
	}

	/* Numbers */
	if(peek == L'-'){
		peek = Bgetrune(parsein);
		if(isdigitrune(peek))
			negative = 1;
		else{
			Bungetrune(parsein);
			peek = L'-';
		}
	}
	if(isdigitrune(peek)){
		if(peek == L'0'){
			peek = Bgetrune(parsein);
			switch(peek){
			case L'\'': /* Character code */
				peek = Bgetrune(parsein);
				lookahead.tag = IntTok;
				lookahead.ival = peek;
				return;
			case L'o': /* Octal */
			case L'x': /* Hexadecimal */
			case L'b': /* Binary */
				print("Cant lex numbers in other bases than base 10 right now\n");
				exits("lexer");
			}
		}
		while(isdigitrune(peek)){
			numI *= 10;
			numI += peek - L'0';
			peek = Bgetrune(parsein);
		}
		if(peek == L'.'){
			int place = 1;
			numD = numI;
			peek = Bgetrune(parsein);
			if(!isdigitrune(peek)){
				Bungetrune(parsein);
				replaypeek = L'.';
				goto Integer;
			}
			while(isdigitrune(peek)){
				double addition = (peek - L'0') / (double)(10 * place);
				numD += addition;
				peek = Bgetrune(parsein);
				place *= 10;
			}
			Bungetrune(parsein);
			/* Should also lex 123.45E10 */
			lookahead.tag = FloatTok;
			lookahead.dval = negative ? -numD : numD;
		}else{
			Bungetrune(parsein);
Integer:
			lookahead.tag = IntTok;
			lookahead.ival = negative ? -numI : numI;
		}
		return;
	}

	/* Atoms */
	/* Atoms in single quotes */
	if(peek == L'\''){
		peek = Bgetrune(parsein);
		while(peek != L'\''){
QuotedAtomLoop:
			buf[i++] = peek;
			peek = Bgetrune(parsein);
		}
		peek = Bgetrune(parsein);
		if(peek == L'\'')
			goto QuotedAtomLoop;
		else
			Bungetrune(parsein);

		buf[i] = '\0';
		lookahead.tag = AtomTok;
		lookahead.text = runestrdup(buf);
		goto Functor;
	}

	/* Special atom [] */
	if(peek == L'['){
		peek = Bgetrune(parsein);
		if(peek == L']'){
			lookahead.tag = AtomTok;
			lookahead.text = runestrdup(L"[]");
			goto Functor;
		}else{
			Bungetrune(parsein);
			lookahead.tag = SquareBracketLeftTok;
			return;
		}
	}

	/* Special atom {} */
	if(peek == L'{'){
		peek = Bgetrune(parsein);
		if(peek == L'}'){
			lookahead.tag = AtomTok;
			lookahead.text = runestrdup(L"{}");
			goto Functor;
		}else{
			Bungetrune(parsein);
			lookahead.tag = CurlyBracketLeftTok;
			return;
		}
	}

	/* Graphic atom */
	Rune *graphics = L"#$&*+-./:<=>?@^~\\"; /* keep in sync with prettyprint*/
	if(runestrchr(graphics, peek)){
		while(runestrchr(graphics, peek)){
			buf[i++] = peek;
			peek = Bgetrune(parsein);
		}
		buf[i] = '\0';
		Bungetrune(parsein);
		lookahead.tag = AtomTok;
		lookahead.text = runestrdup(buf);
		goto Functor;
	}

	/* simple atom */
	if(islowerrune(peek)){
		while(isalpharune(peek) || isdigitrune(peek) || peek == L'_'){
			buf[i++] = peek;
			peek = Bgetrune(parsein);
		}
		buf[i] = '\0';
		Bungetrune(parsein);
		lookahead.tag = AtomTok;
		lookahead.text = runestrdup(buf);
		goto Functor;
	}

	/* Other */
	if(runestrchr(L",.()]}|!;", peek)){
		switch(peek){
		case L',': lookahead.tag = CommaTok; break;
		case L'(': lookahead.tag = ParenLeftTok; break;
		case L')': lookahead.tag = ParenRightTok; break;
		case L']': lookahead.tag = SquareBracketRightTok; break;
		case L'}': lookahead.tag = CurlyBracketRightTok; break;
		case L'|': lookahead.tag = PipeTok; break;
		case L'!': lookahead.tag = AtomTok; lookahead.text = runestrdup(L"!"); break;
		case L';': lookahead.tag = AtomTok; lookahead.text = runestrdup(L";"); break;
		}
		return;
	}

	/* EOF */
	if(peek == Beof){
		lookahead.tag = EofTok;
		return;
	}

	print("Could not lex rune %C (%ud)\n", peek, peek);
	exits("lexer");

Functor:
	if(peek == Beof){
		Bgetrune(parsein);
		return;
	}

	peek = Bgetrune(parsein);
	if(peek == L'(')
		lookahead.tag = FunctorTok;
	Bungetrune(parsein);
	return;
}

void
match(int tag)
{
	if(lookahead.tag == tag)
		nexttoken();
	else
		syntaxerror_parser("match");
}

void
syntaxerror_parser(char *where)
{
	print("Syntax error: Unexpected %d (%S) token in %s\n", lookahead.tag, lookahead.text, where);
	exits("syntax error");
}