ref: db809f2d4786af8fbdf221d59f638c6d0d0d439c
dir: /sys/src/cmd/upas/bayes/msgclass.c/
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <ctype.h>
#include "msgdb.h"

/*
 * msgclass: classify a message, given as a list of tokens with
 * counts, against one or more named token databases.  For each
 * input token it estimates a per-class probability from the
 * databases, keeps the mbest tokens whose strongest class
 * probability is highest, and multiplies those together to score
 * each class.  The winning class (or "inconclusive") is printed,
 * followed by the per-class scores and the tokens that were used.
 */

/* Print the command synopsis on standard error and exit with status "usage". */
void
usage(void)
{
	fprint(2, "usage: upas/msgclass [-a] [-d name dbfile]... [-l lockfile] [-m mul] [-t thresh] [tokenfile ...]\n");
	exits("usage");
}

enum
{
	MAXBEST = 32,	/* capacity of best[] */
	MAXLEN = 64,	/* bytes kept of a token, including the NUL */
	MAXTAB = 256,	/* maximum number of -d classes */
};

/* One class database named by a -d option. */
typedef struct Ndb Ndb;
struct Ndb
{
	char *name;	/* class name printed in the output */
	char *file;	/* database file handed to mdopen */
	Msgdb *db;	/* open handle */
	double p;	/* normalized score of this class for the input */
	long nmsg;	/* value stored under "*From*" in the database */
};

/* Per-token statistics across all classes. */
typedef struct Word Word;
struct Word
{
	char s[MAXLEN];		/* token text, possibly truncated */
	int count[MAXTAB];	/* weighted occurrence count per class */
	double p[MAXTAB];	/* clamped, normalized probability per class */
	double mp;		/* largest entry of p[] */
	int mi;		/* w.p[w.mi] = w.mp */
	int nmsg;
};

Ndb db[MAXTAB];
int ndb;
int add;	/* -a: fold the input tokens into the winning database */
int mul;	/* -m: multiplier applied to counts when adding */
		/* NOTE(review): stays 0 unless -m is given, so -a alone adds nothing -- confirm intent */
Msgdb *indb;	/* tokens of the message being classified */
Word best[MAXBEST];	/* kept in order of decreasing mp */
int mbest = 15;	/* how many words are actually kept in best[] */
int nbest;

void process(Biobuf*, char*);
void lockfile(char*);

/*
 * Insert *w into best[], which is kept sorted by decreasing mp and
 * holds at most mbest entries.  The new word goes in front of any
 * entries with equal or smaller mp; when the table is full the last
 * entry is dropped.  s is the token text; it is copied into the
 * stored entry and truncated to MAXLEN-1 bytes if necessary.
 */
void
noteword(Word *w, char *s)
{
	int i;

	for(i=nbest-1; i>=0; i--)
		if(w->mp < best[i].mp)
			break;
	i++;

	if(i >= mbest)
		return;
	if(nbest == mbest)
		nbest--;
	if(i < nbest)
		memmove(&best[i+1], &best[i], (nbest-i)*sizeof(best[0]));
	best[i] = *w;
	strecpy(best[i].s, best[i].s+MAXLEN, s);
	nbest++;
}

/*
 * Parse the options, read the token files (or standard input) into
 * indb, score every class database against those tokens and print
 * the verdict on standard output.  With -a the tokens are also added
 * to the winning database.
 */
void
main(int argc, char **argv)
{
	int i, bad, m, tot, nn, j;
	Biobuf bin, *b, bout;
	char *s, *lf;
	double totp, p, thresh;
	long n;
	Word w;

	lf = nil;
	thresh = 0;
	ARGBEGIN{
	case 'a':
		add = 1;
		break;
	case 'd':
		if(ndb >= MAXTAB)
			sysfatal("too many db classes");
		db[ndb].name = EARGF(usage());
		db[ndb].file = EARGF(usage());
		ndb++;
		break;
	case 'l':
		lf = EARGF(usage());
		break;
	case 'm':
		mul = atoi(EARGF(usage()));
		break;
	case 't':
		thresh = atof(EARGF(usage()));
		break;
	default:
		usage();
	}ARGEND

	if(ndb == 0){
		fprint(2, "must have at least one -d option\n");
		usage();
	}

	/* collect the input tokens */
	indb = mdopen(nil, 1);
	if(argc == 0){
		Binit(&bin, 0, OREAD);
		process(&bin, "<stdin>");
		Bterm(&bin);
	}else{
		bad = 0;
		for(i=0; i<argc; i++){
			if((b = Bopen(argv[i], OREAD)) == nil){
				fprint(2, "opening %s: %r\n", argv[i]);
				bad = 1;
				continue;
			}
			process(b, argv[i]);
			Bterm(b);
		}
		if(bad)
			exits("open inputs");
	}

	lockfile(lf);

	bad = 0;
	for(i=0; i<ndb; i++){
		if((db[i].db = mdopen(db[i].file, 0)) == nil){
			fprint(2, "opendb %s: %r\n", db[i].file);
			bad = 1;
			continue;	/* never hand a nil database to mdget */
		}
		db[i].nmsg = mdget(db[i].db, "*From*");
	}
	if(bad)
		exits("open databases");

	/* run conditional probabilities of input words, getting 15 most specific */
	mdenum(indb);
	nbest = 0;
	while(mdnext(indb, &s, &n) >= 0){
		tot = 0;
		totp = 0.0;
		for(i=0; i<ndb; i++){
			/* counts from the first database are weighted by 3 */
			nn = mdget(db[i].db, s)*(i==0 ? 3 : 1);
			tot += nn;
			w.count[i] = nn;
			/*
			 * A database with no recorded messages would make
			 * this a division by zero; treat any hit there as
			 * certain and a miss as impossible, which matches
			 * the clamp to 1.0 below.
			 */
			if(db[i].nmsg > 0)
				p = w.count[i]/(double)db[i].nmsg;
			else
				p = nn > 0 ? 1.0 : 0.0;
			if(p >= 1.0)
				p = 1.0;
			w.p[i] = p;
			totp += p;
		}
		/* too rare to say anything; tot >= 2 also guarantees totp > 0 */
		if(tot < 2)
			continue;
		w.mp = 0.0;
		for(i=0; i<ndb; i++){
			p = w.p[i];
			p /= totp;
			/* clamp so that no single word can force a product to 0 or 1 */
			if(p < 0.001)
				p = 0.001;
			else if(p > 0.999)
				p = 0.999;
			if(p > w.mp){
				w.mp = p;
				w.mi = i;
			}
			w.p[i] = p;
		}
		noteword(&w, s);
	}

	/* compute conditional probabilities of message classes using 15 most specific */
	totp = 0.0;
	for(i=0; i<ndb; i++){
		p = 1.0;
		for(j=0; j<nbest; j++)
			p *= best[j].p[i];
		db[i].p = p;
		totp += p;
	}
	for(i=0; i<ndb; i++)
		db[i].p /= totp;

	/* pick the most probable class; below the threshold there is no verdict */
	m = 0;
	for(i=1; i<ndb; i++)
		if(db[i].p > db[m].p)
			m = i;

	Binit(&bout, 1, OWRITE);
	if(db[m].p < thresh)
		m = -1;
	if(m >= 0)
		Bprint(&bout, "%s", db[m].name);
	else
		Bprint(&bout, "inconclusive");
	for(j=0; j<ndb; j++)
		Bprint(&bout, " %s=%g", db[j].name, db[j].p);
	Bprint(&bout, "\n");

	/*
	 * One line per word that contributed.  Nothing is printed after
	 * the loop: best[nbest] is not a filled slot, and m may be -1.
	 */
	for(i=0; i<nbest; i++){
		Bprint(&bout, "%s", best[i].s);
		for(j=0; j<ndb; j++)
			Bprint(&bout, " %s=%g", db[j].name, best[i].p[j]);
		Bprint(&bout, "\n");
	}
	Bterm(&bout);

	if(m >= 0 && add){
		mdenum(indb);
		while(mdnext(indb, &s, &n) >= 0)
			mdput(db[m].db, s, mdget(db[m].db, s)+n*mul);
		mdclose(db[m].db);
	}
	exits(nil);
}

/*
 * Read token lines from b and accumulate them in indb.  A line is
 * "token count"; the text after the last space is taken as the
 * count, and a line with no space counts once.  The second argument
 * (the input's name) is unused.
 */
void
process(Biobuf *b, char*)
{
	char *s;
	char *p;
	long n;

	while((s = Brdline(b, '\n')) != nil){
		/* replace the trailing newline with a NUL */
		s[Blinelen(b)-1] = 0;
		if((p = strrchr(s, ' ')) != nil){
			*p++ = 0;
			n = atoi(p);
		}else
			n = 1;
		mdput(indb, s, mdget(indb, s)+n);
	}
}

int tpid;	/* pid of the child forked by lockfile */

/* atexit handler: post the note "die" to the child that holds the lock file open. */
void
killtickle(void)
{
	postnote(PNPROC, tpid, "die");
}

/*
 * Take the lock file s; a nil s means no locking is wanted.
 * Opening is retried, with a delay growing from 50ms to about a
 * second, for as long as the error string says the file is locked,
 * and abandoned after two minutes or on any other error.  Once the
 * file is open a child process is forked that keeps the descriptor
 * and calls dirfstat on it every 30 seconds (presumably so the
 * exclusive-use lock does not go stale -- confirm against the file
 * server's lock timeout); the parent closes its own copy and
 * arranges for the child to be killed at exit.
 */
void
lockfile(char *s)
{
	int fd, t, w;
	char err[ERRMAX];

	if(s == nil)
		return;
	w = 50;
	t = 0;
	for(;;){
		fd = open(s, OREAD);
		if(fd >= 0)
			break;
		rerrstr(err, sizeof err);
		/* any failure other than "somebody else has it" is final */
		if(strstr(err, "file is locked")==nil && strstr(err, "exclusive lock")==nil)
			break;
		sleep(w);
		t += w;
		if(w < 1000)
			w = (w*3)/2;
		if(t > 120*1000)
			break;
	}
	if(fd < 0)
		sysfatal("could not lock %s", s);
	switch(tpid = fork()){
	case -1:
		sysfatal("fork: %r");
	case 0:
		for(;;){
			sleep(30*1000);
			free(dirfstat(fd));
		}
		_exits(nil);
	default:
		break;
	}
	close(fd);
	atexit(killtickle);
}