ref: 2aec1f8a09ae0bc9fb269d84ef53defebc47eede
dir: /sys/src/cmd/upas/scanmail/common.c/
#include	<u.h>
#include	<libc.h>
#include	<bio.h>
#include	<regexp.h>
#include	"spam.h"

/*
 * Shared pieces of the mail scanner:
 *   - readmsg/convert/conv64 turn a raw message into canonical text
 *     (lower case, single spaces, most HTML and some MIME escapes removed);
 *   - parsepats builds the pattern tables from a pattern file;
 *   - matchpat/xprint match canonical text against those tables and
 *     report a match with some surrounding context.
 */

enum {
	Quanta	= 8192,		/* bytes requested per Bread in readmsg's header loop */
	Minbody = 6000,		/* body bytes readmsg tries to have on hand */
	HdrMax	= 15,		/* size of the scratch buffer in endofhdr */
};

typedef struct keyword Keyword;
typedef struct word Word;

/* a literal string together with its length */
struct word{
	char	*string;
	int	n;
};

/* an action name from the pattern file and the action code it selects */
struct keyword{
	char	*string;
	int	value;
};

/* tags that switch htmlchk into "this message is HTML" mode */
Word	htmlcmds[] =
{
	"html",			4,
	"!doctype html",	13,
	0,

};

/* tag prefixes that htmlchk keeps in the output once in HTML mode */
Word	hrefs[] =
{
	"a href=",		7,
	"a title=",		8,
	"a target=",		9,
	"base href=",		10,
	"img src=",		8,
	"img border=",		11,
	"form action=",		12,
	"!--",			3,
	0,

};

/*
 *	RFC822 header keywords to look for for fractured header.
 *	all lengths must be less than HdrMax defined above.
 */
Word	hdrwords[] =
{
	"cc:",			3,
	"bcc:",			4,
	"to:",			3,
	0,			0,

};

/* action names accepted before the ':' on a pattern file line */
Keyword	keywords[] =
{
	"header",		HoldHeader,
	"line",			SaveLine,
	"hold",			Hold,
	"dump",			Dump,
	"loff",			Lineoff,
	0,			Nactions,
};

/* per-action pattern tables, filled in by parsepats */
Patterns patterns[] = {
[Dump]		{ "DUMP:", 0, 0 },
[HoldHeader]	{ "HEADER:", 0, 0 },
[Hold]		{ "HOLD:", 0, 0 },
[SaveLine]	{ "LINE:", 0, 0 },
[Lineoff]	{ "LINEOFF:", 0, 0 },
[Nactions]	{ 0, 0, 0 },
};

static char*	endofhdr(char*, char*);
static	int	escape(char**);
static	int	extract(char*);
static	int	findkey(char*);
static	int	hash(int);
static	int	isword(Word*, char*, int);
static	void	parsealt(Biobuf*, char*, Spat**);

/*
 *	The canonicalizer: convert input to canonical representation
 */

/*
 * Read a message from bp into a freshly allocated, NUL-terminated buffer.
 *
 * Data is read Quanta bytes at a time until endofhdr finds the end of the
 * header, the input is exhausted, or Maxread bytes have been read without
 * finding it.  If hsize is nil no header search is done and the loop stops
 * after the first chunk.  Afterwards, if fewer than Minbody bytes follow
 * the header, one more Bread is issued for the shortfall.
 *
 * On return *hsize (if non-nil) is the offset of the first byte after the
 * header and *bufsize (if non-nil) is the number of bytes in the buffer;
 * in the oversized-header case both are set to the number of bytes read.
 *
 * Returns the buffer (not retained here), or 0 on a read error or when
 * the input is empty.
 */
char*
readmsg(Biobuf *bp, int *hsize, int *bufsize)
{
	char *p, *buf;
	int n, offset, eoh, bsize, delta;

	buf = 0;
	offset = 0;
	if(bufsize)
		*bufsize = 0;
	if(hsize)
		*hsize = 0;
	for(;;) {
		buf = Realloc(buf, offset+Quanta+1);
		n = Bread(bp, buf+offset, Quanta);
		if(n < 0){
			free(buf);
			return 0;
		}
		p = buf+offset;			/* start of this chunk */
		offset += n;			/* end of this chunk */
		buf[offset] = 0;		/* endofhdr relies on this terminator */
		if(n == 0){
			if(offset == 0){
				/* empty input: release the chunk allocated above */
				free(buf);
				return 0;
			}
			break;
		}

		if(hsize == 0)			/* don't process header */
			break;
		if(p != buf && p[-1] == '\n')	/* check for EOH across buffer split */
			p--;
		p = endofhdr(p, buf+offset);
		if(p)
			break;
		if(offset >= Maxread)		/* gargantuan header - just punt*/
		{
			if(hsize)
				*hsize = offset;
			if(bufsize)
				*bufsize = offset;
			return buf;
		}
	}
	eoh = p-buf;				/* End of header */
	bsize = offset - eoh;			/* amount of body already read */

	/*
	 * Try to bring the body up to Minbody bytes with a single
	 * extra read; a short or failed read is simply accepted.
	 */
	if (bsize < Minbody){
		delta = Minbody-bsize;
		buf = Realloc(buf, offset+delta+1);
		n = Bread(bp, buf+offset, delta);
		if(n > 0) {
			offset += n;
			buf[offset] = 0;
		}
	}
	if(hsize)
		*hsize = eoh;
	if(bufsize)
		*bufsize = offset;
	return buf;
}

/*
 * Return 1 if the first len bytes of text begin with any entry of the
 * zero-terminated table wp, otherwise 0.
 */
static int
isword(Word *wp, char *text, int len)
{
	for(;wp->string; wp++)
		if(len >= wp->n && strncmp(text, wp->string, wp->n) == 0)
			return 1;
	return 0;
}

/*
 * Look for the blank line ending the header in [raw, end).
 *
 * A "\n\n" pair is not accepted as the end of the header when the text
 * right after it starts with one of hdrwords (case-insensitively); in
 * that case the search continues.  Returns a pointer to the first byte
 * after the blank line, or 0 if none was found.
 *
 * The byte at *end is read (p[1] when p is the last byte), so it must be
 * readable; readmsg stores a NUL there.
 */
static char*
endofhdr(char *raw, char *end)
{
	int i;
	char *p, *q;
	char buf[HdrMax];

	/*
 	 * can't use strchr to search for newlines because
	 * there may be embedded NULL's.
	 */
	for(p = raw; p < end; p++){
		if(*p != '\n' || p[1] != '\n')
			continue;
		p++;
		/* copy the start of the next line, lower-cased, up to and including ':' or '\n' */
		for(i = 0, q = p+1; i < sizeof(buf) && *q; q++){
			buf[i++] = tolower((uchar)*q);	/* uchar: bytes may have the high bit set */
			if(*q == ':' || *q == '\n')
				break;
		}
		if(!isword(hdrwords, buf, i))
			return p+1;
	}
	return 0;
}

/*
 * Canonicalize the tag text starting at text (just after a '<') and test
 * it against the table wp.
 *
 * Text is copied up to a '>' (or until end, or until the local buffer is
 * full): NUL and '\r' are dropped, runs of blank/tab/newline become one
 * space, '=' escapes are decoded by escape, everything else is lower-cased.
 * If n is non-nil, *n receives the number of input bytes consumed,
 * including the '>' when one was seen.
 *
 * Returns the result of isword on the canonical text.
 */
static int
htmlmatch(Word *wp, char *text, char *end, int *n)
{
	char *cp;
	int i, c, lastc;
	char buf[MaxHtml];

	/*
	 * extract a string up to '>'
	 */

	i = lastc = 0;
	cp = text;
	while (cp < end && i < sizeof(buf)-1){
		c = *cp++;
		if(c == '=')
			c = escape(&cp);
		switch(c){
		case 0:
		case '\r':
			continue;
		case '>':
			goto out;
		case '\n':
		case ' ':
		case '\t':
			if(lastc == ' ')
				continue;
			c = ' ';
			break;
		default:
			c = tolower((uchar)c);	/* uchar: c may be a negative char value */
			break;
		}
		buf[i++] = lastc = c;
	}
out:
	buf[i] = 0;
	if(n)
		*n = cp-text;
	return isword(wp, buf, i);
}

/*
 * Decode the escape that follows an '=' already consumed by the caller;
 * *msg points at the byte after the '='.
 *
 *	"=\n"	skipped; the byte after the newline is consumed and returned
 *	"=2e"	'.'	(second digit matched case-insensitively)
 *	"=2f"	'/'
 *	"=20"	' '
 *	"=3d"	'='
 *
 * Anything else returns '=' without consuming input.  *msg is advanced
 * past whatever was consumed.
 */
static int
escape(char **msg)
{
	int c;
	char *p;

	p = *msg;
	c = *p;
	if(c == '\n'){
		p++;
		c = *p++;
	} else
	if(c == '2'){
		c = tolower((uchar)p[1]);
		if(c == 'e'){
			p += 2;
			c = '.';
		}else
		if(c == 'f'){
			p += 2;
			c = '/';
		}else
		if(c == '0'){
			p += 2;
			c = ' ';
		} else
			c = '=';
	} else {
		if(c == '3' && tolower((uchar)p[1]) == 'd')
			p += 2;
		c = '=';
	}
	*msg = p;
	return c;
}

/*
 * Decide what to do with the tag whose text starts at *msg (just after
 * a '<').
 *
 * Returns '<' with *msg untouched when the tag should be copied to the
 * output.  Otherwise the tag is skipped: *msg is advanced past the tag
 * and one further byte, and that byte is returned for the caller to
 * process.
 *
 * ishtml is static, so once a tag from htmlcmds has been seen the HTML
 * mode stays on for every later call in this process.  Before that,
 * only tags beginning with '!' are skipped; afterwards only tags
 * matching hrefs are kept.
 */
static int
htmlchk(char **msg, char *end)
{
	int n;
	char *p;

	static int ishtml;

	p = *msg;
	if(ishtml == 0){
		ishtml = htmlmatch(htmlcmds, p, end, &n);

		/* If not an HTML keyword, check if it's
		 * an HTML comment (<!comment>).  if so,
		 * skip over it; otherwise copy it in.
		 */
		if(ishtml == 0 && *p != '!')	/* not comment */
			return '<';		/* copy it */

	} else if(htmlmatch(hrefs, p, end, &n))	/* if special HTML string  */
		return '<';			/* copy it */

	/*
	 * this is an uninteresting HTML command; skip over it.
	 */
	p += n;
	*msg = p+1;
	return *p;
}

/*
 * decode a base 64 encode body
 *
 * The decoded bytes go to a temporary buffer and are then canonicalized
 * into buf as body text by convert.
 */
void
conv64(char *msg, char *end, char *buf, int bufsize)
{
	int len, i;
	char *cp;

	len = end - msg;
	i = (len*3)/4+1;	/* room for max chars + null */
	cp = Malloc(i);
	len = dec64((uchar*)cp, i, msg, len);
	/*
	 * NOTE(review): dec64 is not visible here; if it can report failure
	 * with a negative count, treat that as "nothing decoded" rather than
	 * forming a pointer before cp.  The output is an empty string either way.
	 */
	if(len < 0)
		len = 0;
	convert(cp, cp+len, buf, bufsize, 1);
	free(cp);
}

/*
 * Canonicalize [msg, end) into buf.
 *
 * NUL and '\r' are dropped, runs of blank/tab/newline collapse to one
 * space and everything else is lower-cased.  When isbody is non-zero,
 * '<' is handed to htmlchk and '=' to escape first.  When isbody is
 * zero, the text is also checked for "content-transfer-encoding: base64"
 * (case-insensitive).
 *
 * At most bufsize bytes are stored, followed by a terminating NUL, so
 * buf must have room for bufsize+1 bytes.
 *
 * Returns 1 if the base64 header was seen, otherwise 0.
 */
int
convert(char *msg, char *end, char *buf, int bufsize, int isbody)
{

	char *p;
	int c, lastc, base64;

	lastc = 0;
	base64 = 0;
	while(msg < end && bufsize > 0){
		c = *msg++;

		/*
		 * In the body only, try to strip most HTML and
		 * replace certain MIME escape sequences with the character
		 */
		if(isbody) {
			/* repeat while a helper consumed input: it may have produced another '<' or '=' */
			do{
				p = msg;
				if(c == '<')
					c = htmlchk(&msg, end);
				if(c == '=')
					c = escape(&msg);
			} while(p != msg && p < end);
		}
		switch(c){
		case 0:
		case '\r':
			continue;
		case '\t':
		case ' ':
		case '\n':
			if(lastc == ' ')
				continue;
			c = ' ';
			break;
		case 'C':	/* check for MIME base 64 encoding in header */
		case 'c':
			if(isbody == 0)
			if(msg < end-32 && *msg == 'o' && msg[1] == 'n')
			if(cistrncmp(msg+2, "tent-transfer-encoding: base64", 30) == 0)
				base64 = 1;
			c = 'c';
			break;
		default:
			c = tolower((uchar)c);	/* uchar: c may be a negative char value */
			break;
		}
		*buf++ = c;
		lastc = c;
		bufsize--;
	}
	*buf = 0;
	return base64;
}

/*
 *	The pattern parser: build data structures from the pattern file
 */

/* bucket index for string patterns: the low 7 bits of the first character */
static int
hash(int c)
{
	return c & 127;
}

/*
 * Map an action name to its code.  An unknown name yields the value of
 * the table's terminating entry, Nactions.
 */
static	int
findkey(char *val)
{
	Keyword *kp;

	for(kp = keywords; kp->string; kp++)
		if(strcmp(val, kp->string) == 0)
				break;
	return kp->value;
}

#define	whitespace(c)	((c) == ' ' || (c) == '\t')

/*
 * Read the pattern file on bp and add each pattern to the patterns table.
 *
 * For each line: leading blanks and tabs are skipped; empty lines and
 * lines starting with '#' are ignored.  A leading '*' marks a literal
 * string pattern, otherwise the pattern is a regular expression.  The
 * text before the first ':' is the action keyword; lines with no ':' or
 * an unknown keyword are ignored.  The rest is canonicalized in place by
 * extract.  A "~~" ends the pattern and begins a list of alternates,
 * which parsealt collects (possibly reading further lines from bp).
 *
 * Regular expressions that fail to compile are dropped.  Regexps are
 * pushed on patterns[action].regexps; strings are pushed on the chain
 * patterns[action].strings->spat[hash(first character)].
 */
void
parsepats(Biobuf *bp)
{
	Pattern *p, *new;
	char *cp, *qp;
	int type, action, n, h;
	Spat *spat;

	for(;;){
		cp = Brdline(bp, '\n');
		if(cp == 0)
			break;
		cp[Blinelen(bp)-1] = 0;		/* replace the newline */
		while(*cp == ' ' || *cp == '\t')
			cp++;
		if(*cp == '#' || *cp == 0)
			continue;
		type = regexp;
		if(*cp == '*'){
			type = string;
			cp++;
		}
		qp = strchr(cp, ':');
		if(qp == 0)
			continue;
		*qp = 0;
		if(debug)
			fprint(2, "action = %s\n", cp);
		action = findkey(cp);
		if(action >= Nactions)
			continue;
		cp = qp+1;
		n = extract(cp);
		if(n <= 0 || *cp == 0)
			continue;

		qp = strstr(cp, "~~");
		if(qp){
			*qp = 0;		/* cut the pattern off at the alternates */
			n = strlen(cp);
		}
		if(debug)
			fprint(2, " Pattern: `%s'\n", cp);

			/* Hook regexps into a chain */
		if(type == regexp) {
			new = Malloc(sizeof(Pattern));
			new->action = action;
			new->pat = regcomp(cp);
			if(new->pat == 0){
				free(new);
				continue;
			}
			new->type = regexp;
			new->alt = 0;
			new->next = 0;

			if(qp)
				parsealt(bp, qp+2, &new->alt);

			new->next = patterns[action].regexps;
			patterns[action].regexps = new;
			continue;

		}
			/* not a Regexp - hook strings into Pattern hash chain */
		spat = Malloc(sizeof(*spat));
		spat->next = 0;
		spat->alt = 0;
		spat->len = n;
		spat->string = Malloc(n+1);
		spat->c1 = cp[1];		/* second character, checked by matchpat before the full compare */
		strcpy(spat->string, cp);

		if(qp)
			parsealt(bp, qp+2, &spat->alt);

		p = patterns[action].strings;
		if(p == 0) {
			p = Malloc(sizeof(Pattern));
			memset(p, 0, sizeof(*p));
			p->action = action;
			p->type = string;
			patterns[action].strings = p;
		}
		h = hash(*spat->string);
		spat->next = p->spat[h];
		p->spat[h] = spat;
	}
}

/*
 * Collect the "~~"-separated alternates starting at cp and push a copy
 * of each non-empty one on the list *head.
 *
 * When the text runs out right after a "~~", the list continues on the
 * following line(s) of bp: lines are read and canonicalized with extract
 * until a non-empty one is found.  Parsing stops when an alternate is
 * not followed by "~~", or when bp is exhausted.
 *
 * Only the string and next fields of each new Spat are set.
 */
static void
parsealt(Biobuf *bp, char *cp, Spat** head)
{
	char *p;
	Spat *alt;

	while(cp){
		if(*cp == 0){		/*escaped newline*/
			do{
				cp = Brdline(bp, '\n');
				if(cp == 0)
					return;
				cp[Blinelen(bp)-1] = 0;
			} while(extract(cp) <= 0 || *cp == 0);
		}

		p = cp;
		cp = strstr(p, "~~");
		if(cp){
			*cp = 0;
			cp += 2;
		}
		if(strlen(p)){
			alt = Malloc(sizeof(*alt));
			alt->string = strdup(p);
			alt->next = *head;
			*head = alt;
		}
	}
}

/*
 * Canonicalize a pattern in place and return its new length.
 *
 * Leading blanks and tabs are removed, an unquoted '#' ends the text,
 * 'A'-'Z' become lower case, and double-quoted sections are copied
 * without their quotes (\" inside quotes yields a literal quote).
 * Trailing blanks and tabs are trimmed, but never back into text that
 * came from a quoted section.
 */
static int
extract(char *cp)
{
	int c;
	char *p, *q, *r;

	p = q = r = cp;
	while(whitespace(*p))
		p++;
	while(c = *p++){
		if (c == '#')
			break;
		if(c == '"'){
			while(*p && *p != '"'){
				if(*p == '\\' && p[1] == '"')
					p++;
				if('A' <= *p && *p <= 'Z')
					*q++ = *p++ + ('a'-'A');
				else
					*q++ = *p++;
			}
			if(*p)
				p++;
			r = q;		/* never back up over a quoted string */
		} else {
			if('A' <= c && c <= 'Z')
				c += ('a'-'A');
			*q++ = c;
		}
	}
	while(q > r && whitespace(q[-1]))
		q--;
	*q = 0;
	return q-cp;
}

/*
 *	The matching engine: compare canonical input to pattern structures
 */

/*
 * Return the first alternate in the list alt whose string occurs in
 * message, in cmd (when cmd is non-empty and is not message itself) or
 * in header+1 (when that is not message itself); 0 if none does.
 * cmd and header are globals defined outside this file.
 */
static Spat*
isalt(char *message, Spat *alt)
{
	while(alt) {
		if(*cmd)
		if(message != cmd && strstr(cmd, alt->string))
			break;
		if(message != header+1 && strstr(header+1, alt->string))
			break;
		if(strstr(message, alt->string))
			break;
		alt = alt->next;
	}
	return alt;
}

/*
 * Match the canonical text message against p.
 *
 * For a string pattern set, every position of message is tried against
 * the chain selected by hash of the character there; the first entry
 * that matches and has no alternate present (see isalt) sets m->sp and
 * m->ep to the matched span.  For a regular expression, regexec fills
 * in m, and the match is rejected if one of the pattern's alternates is
 * present.
 *
 * Returns 1 on a match, 0 otherwise.
 */
int
matchpat(Pattern *p, char *message, Resub *m)
{
	Spat *spat;
	char *s;
	int c, c1;

	if(p->type == string){
		c1 = *message;
		for(s=message; c=c1; s++){
			c1 = s[1];
			for(spat=p->spat[hash(c)]; spat; spat=spat->next){
				if(c1 == spat->c1)
				/*
				 * strncmp rather than memcmp: it stops at the
				 * message's NUL, so a pattern longer than the
				 * remaining text cannot cause a read past it.
				 */
				if(strncmp(s, spat->string, spat->len) == 0)
				if(!isalt(message, spat->alt)){
					m->sp = s;
					m->ep = s + spat->len;
					return 1;
				}
			}
		}
		return 0;
	}
	m->sp = m->ep = 0;
	if(regexec(p->pat, message, m, 1) == 0)
		return 0;
	if(isalt(message, p->alt))
		return 0;
	return 1;
}

/*
 * Print type and the match described by m to fd, as
 * "before~match~after", with roughly 30 characters of context on each
 * side extended to the nearest space.  Nothing is printed when m holds
 * no match.
 *
 * NOTE(review): the backward scan stops only at a NUL byte, so the text
 * containing the match must be preceded by one; isalt's use of header+1
 * suggests the header buffer has such a byte - confirm for other callers.
 */
void
xprint(int fd, char *type, Resub *m)
{
	char *p, *q;
	int i;

	if(m->sp == 0 || m->ep == 0)
		return;

		/* back up approx 30 characters to whitespace */
	for(p = m->sp, i = 0; *p && i < 30; i++, p--)
			;
	while(*p && *p != ' ')
		p--;
	p++;

		/* grab about 30 more chars beyond the end of the match */
	for(q = m->ep, i = 0; *q && i < 30; i++, q++)
			;
	while(*q && *q != ' ')
		q++;

	fprint(fd, "%s %.*s~%.*s~%.*s\n", type, (int)(m->sp-p), p, (int)(m->ep-m->sp), m->sp, (int)(q-m->ep), m->ep);
}

enum
{
	INVAL=	255		/* marks bytes outside the base 64 alphabet in t64d */
};

/* value of each byte as a base 64 digit ('A'-'Z', 'a'-'z', '0'-'9', '+', '/'), or INVAL */
static uchar t64d[256] = {
/*00 */	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*10*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*20*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL,    62, INVAL, INVAL, INVAL,    63,
/*30*/	   52,	  53,	 54,	55,    56,    57,    58,    59,
	   60,	  61, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*40*/	INVAL,    0,      1,     2,     3,     4,     5,     6,
	    7,    8,      9,    10,    11,    12,    13,    14,
/*50*/	   15,   16,     17,    18,    19,    20,    21,    22,
	   23,   24,     25, INVAL, INVAL, INVAL, INVAL, INVAL,
/*60*/	INVAL,   26,     27,    28,    29,    30,    31,    32,
	   33,   34,     35,    36,    37,    38,    39,    40,
/*70*/	   41,   42,     43,    44,    45,    46,    47,    48,
	   49,   50,     51, INVAL, INVAL, INVAL, INVAL, INVAL,
/*80*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*90*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*A0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*B0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*C0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*D0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*E0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*F0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
};