ref: 3b7e03a30ed29ee0666c00568bf67b8179c28d12
dir: /sys/src/cmd/upas/bayes/msgtok.c/
/*
 * RFC822 message tokenizer (really feature generator) for spam filter.
 * 
 * See Paul Graham's musings on spam filtering for theory.
 */
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "regexp.h"
#include <ctype.h>
#include "dfa.h"
/* fill the array with the three DFAs read, in order, from refile */
void buildre(Dreprog*[3]);
/* set by -D: main echoes every span it consumes to standard error */
int debug;
/* file the DFAs are read from; -r replaces it */
char *refile = "/mail/lib/classify.re";
/* longest keyword match, in bytes, that main emits as a token; -n replaces it */
int maxtoklen = 20;
/* rewrite a token in place into its next simpler form; -1 when there is none */
int trim(char*);
/*
 * Print the command synopsis on standard error and exit with status "usage".
 * The synopsis lists every option main accepts, including -n.
 */
void
usage(void)
{
	fprint(2, "usage: msgtok [-D] [-n maxtoklen] [-r /mail/lib/classify.re] [file]\n");
	exits("usage");
}
void
main(int argc, char **argv)
{
	int i, hdr, n, eof, off;
	Dreprog *re[3];
	int m[3];
	char *p, *ep, *tag;
	Biobuf bout, bin;
	char msg[1024+1];
	char buf[1024];
	buildre(re);
	ARGBEGIN{
	case 'D':
		debug = 1;
		break;
	case 'n':
		maxtoklen = atoi(EARGF(usage()));
		break;
	case 'r':
		refile = EARGF(usage());
		break;
	default:
		usage();
	}ARGEND;
	if(argc > 1)
		usage();
	if(argc == 1){
		close(0);
		if(open(argv[0], OREAD) < 0)
			sysfatal("open %s: %r", argv[0]);
	}
	tag = nil;
	Binit(&bin, 0, OREAD);
	Binit(&bout, 1, OWRITE);
	ep = msg;
	p = msg;
	eof = 0;
	off = 0;
	hdr = 1;
	for(;;){
		/* replenish buffer */
		if(ep - p < 512 && !eof){
			if(p > msg + 1){
				n = ep - p;
				memmove(msg, p-1, ep-(p-1));
				off += (p-1) - msg;
				p = msg+1;
				ep = p + n;
			}
			n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep);
			if(n < 0)
				sysfatal("read error: %r");
			if(n == 0)
				eof = 1;
			ep += n;
			*ep = 0;
		}
		if(p >= ep)
			break;
		if(*p == 0){
			p++;
			continue;
		}
		if(hdr && p[-1]=='\n'){
			if(p[0]=='\n')
				hdr = 0;
			else if(cistrncmp(p-1, "\nfrom:", 6) == 0)
				tag = "From*";
			else if(cistrncmp(p-1, "\nto:", 4) == 0)
				tag = "To*";
			else if(cistrncmp(p-1, "\nsubject:", 9) == 0)
				tag = "Subject*";
			else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0)
				tag = "Return-Path*";
			else
				tag = nil;
		}
		m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n');
		m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n');
		m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n');
		n = m[0];
		if(n < m[1])
			n = m[1];
		if(n < m[2])
			n = m[2];
		if(n <= 0){
fprint(2, "«%s» %.2ux", p, p[0]);
			sysfatal("no regexps matched at %zd", off + (p-msg));
		}
		if(m[0] >= m[1] && m[0] >= m[2]){
			/* "From " marks start of new message */
			Bprint(&bout, "*From*\n");
			n = m[0];
			hdr = 1;
		}else if(m[2] > 1){
			/* ignore */
			n = m[2];
		}else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){
			/* keyword */
			/* should do UTF-aware lowercasing, too much bother */
/*
			for(i=0; i<n; i++)
				if('A' <= p[i] && p[i] <= 'Z')
					p[i] += 'a' - 'A';
*/
			if(tag){
				i = strlen(tag);	
				memmove(buf, tag, i);
				memmove(buf+i, p, m[1]);
				buf[i+m[1]] = 0;
			}else{
				memmove(buf, p, m[1]);
				buf[m[1]] = 0;
			}
			Bprint(&bout, "%s\n", buf);
			while(trim(buf) >= 0)
				Bprint(&bout, "stem*%s\n", buf);
			n = m[1];
		}else
			n = m[2];
		if(debug)
			fprint(2, "%.*s¦", utfnlen(p, n), p);
		p += n;
	}
	Bterm(&bout);
	exits(0);
}
/*
 * Open refile and fill re[0], re[1] and re[2] with three successive
 * Breaddfa results.  All three reads are attempted before any result is
 * checked; a failed open or any nil result is fatal.
 */
void
buildre(Dreprog *re[3])
{
	Biobuf *in;
	int k, bad;

	in = Bopen(refile, OREAD);
	if(in == nil)
		sysfatal("open %s: %r", refile);

	for(k = 0; k < 3; k++)
		re[k] = Breaddfa(in);

	bad = 0;
	for(k = 0; k < 3; k++)
		if(re[k] == nil)
			bad = 1;
	if(bad)
		sysfatal("Breaddfa: %r");

	Bterm(in);
}
/*
 * Rewrite the token in s, in place, into its next simpler form and
 * return 0, or return -1 when no further simplification applies.
 * One call makes the first change that applies:
 *	drop everything up to and including a '*' tag separator, plus
 *		leading ASCII punctuation
 *	free!!! -> free!
 *	free!   -> free
 *	FREE    -> Free
 *	Free    -> free
 * Dropping the prefix does not count as a change by itself: if nothing
 * else applies, s holds the stripped token and -1 is returned.
 * Bytes >= 0x80 (UTF-8 sequences) are never treated as punctuation, at
 * either end of the token.
 *
 * perhaps this belongs in the tokenizer
 */
int
trim(char *s)
{
	char *p, *op;
	int mix, mix1;
	long len;

	if(*s == '*')
		return -1;

	/* strip tag prefix and leading punctuation, but keep UTF-8 bytes */
	p = strchr(s, '*');
	if(!p)
		p = s;
	while(*p && (unsigned char)*p < 0x80 && !isalpha((unsigned char)*p))
		p++;
	len = strlen(p);
	if(len < 2)
		return -1;
	memmove(s, p, len+1);

	/* find the suffix of punctuation */
	op = s + len;
	p = op;
	while(p > s && (unsigned char)p[-1] < 0x80 && !isalpha((unsigned char)p[-1]))
		p--;

	/* chop punctuation */
	if(p > s){
		/* free!!! -> free! */
		if(p+1 < op){
			p[1] = 0;
			return 0;
		}
		/* free! -> free */
		if(p < op){
			p[0] = 0;
			return 0;
		}
	}

	/* mix: first letter is upper case; mix1: some later letter is */
	mix = mix1 = 0;
	if(isupper((unsigned char)s[0]))
		mix = 1;
	for(p=s+1; *p; p++)
		if(isupper((unsigned char)*p)){
			mix1 = 1;
			break;
		}

	/* turn FREE into Free */
	if(mix1){
		for(p=s+1; *p; p++)
			if(isupper((unsigned char)*p))
				*p += 'a'-'A';
		return 0;
	}

	/* turn Free into free */
	if(mix){
		*s += 'a'-'A';
		return 0;
	}
	return -1;
}