ref: f9096fcdfd46f5bf566e87107675bfd439771a44
dir: /sys/src/cmd/diff/diffreg.c/
#include <u.h> #include <libc.h> #include <bio.h> #include "diff.h" /* diff - differential file comparison * * Uses an algorithm due to Harold Stone, which finds * a pair of longest identical subsequences in the two * files. * * The major goal is to generate the match vector J. * J[i] is the index of the line in file1 corresponding * to line i file0. J[i] = 0 if there is no * such line in file1. * * Lines are hashed so as to work in core. All potential * matches are located by sorting the lines of each file * on the hash (called value). In particular, this * collects the equivalence classes in file1 together. * Subroutine equiv replaces the value of each line in * file0 by the index of the first element of its * matching equivalence in (the reordered) file1. * To save space equiv squeezes file1 into a single * array member in which the equivalence classes * are simply concatenated, except that their first * members are flagged by changing sign. * * Next the indices that point into member are unsorted into * array class according to the original order of file0. * * The cleverness lies in routine stone. This marches * through the lines of file0, developing a vector klist * of "k-candidates". At step i a k-candidate is a matched * pair of lines x,y (x in file0 y in file1) such that * there is a common subsequence of lenght k * between the first i lines of file0 and the first y * lines of file1, but there is no such subsequence for * any smaller y. x is the earliest possible mate to y * that occurs in such a subsequence. * * Whenever any of the members of the equivalence class of * lines in file1 matable to a line in file0 has serial number * less than the y of some k-candidate, that k-candidate * with the smallest such y is replaced. The new * k-candidate is chained (via pred) to the current * k-1 candidate so that the actual subsequence can * be recovered. When a member has serial number greater * that the y of all k-candidates, the klist is extended. * At the end, the longest subsequence is pulled out * and placed in the array J by unravel. * * With J in hand, the matches there recorded are * check'ed against reality to assure that no spurious * matches have crept in due to hashing. If they have, * they are broken, and "jackpot " is recorded--a harmless * matter except that a true match for a spuriously * mated line may now be unnecessarily reported as a change. * * Much of the complexity of the program comes simply * from trying to minimize core utilization and * maximize the range of doable problems by dynamically * allocating what is needed and reusing what is not. * The core requirements for problems larger than somewhat * are (in words) 2*length(file0) + length(file1) + * 3*(number of k-candidates installed), typically about * 6n words for files of length n. */ typedef struct Cand Cand; struct Cand { int x; int y; int pred; }; Cand cand; Line *file[2], line; int len[2]; int binary; Line *sfile[2]; /*shortened by pruning common prefix and suffix*/ int slen[2]; int pref, suff; /*length of prefix and suffix*/ int *class; /*will be overlaid on file[0]*/ int *member; /*will be overlaid on file[1]*/ int *klist; /*will be overlaid on file[0] after class*/ Cand *clist; /* merely a free storage pot for candidates */ int clen; int *J; /*will be overlaid on class*/ long *ixold; /*will be overlaid on klist*/ long *ixnew; /*will be overlaid on file[1]*/ static void sort(Line *a, int n) /*shellsort CACM #201*/ { int m; Line *ai, *aim, *j, *k; Line w; int i; m = 0; for (i = 1; i <= n; i *= 2) m = 2*i - 1; for (m /= 2; m != 0; m /= 2) { k = a+(n-m); for (j = a+1; j <= k; j++) { ai = j; aim = ai+m; do { if (aim->value > ai->value || aim->value == ai->value && aim->serial > ai->serial) break; w = *ai; *ai = *aim; *aim = w; aim = ai; ai -= m; } while (ai > a && aim >= ai); } } } static void unsort(Line *f, int l, int *b) { int *a; int i; a = malloc((l+1)*sizeof(int)); for(i=1;i<=l;i++) a[f[i].serial] = f[i].value; for(i=1;i<=l;i++) b[i] = a[i]; free(a); } static void prune(void) { int i,j; for(pref=0;pref<len[0]&&pref<len[1]&& file[0][pref+1].value==file[1][pref+1].value; pref++ ) ; for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&& file[0][len[0]-suff].value==file[1][len[1]-suff].value; suff++) ; for(j=0;j<2;j++) { sfile[j] = file[j]+pref; slen[j] = len[j]-pref-suff; for(i=0;i<=slen[j];i++) sfile[j][i].serial = i; } } static void equiv(Line *a, int n, Line *b, int m, int *c) { int i, j; i = j = 1; while(i<=n && j<=m) { if(a[i].value < b[j].value) a[i++].value = 0; else if(a[i].value == b[j].value) a[i++].value = j; else j++; } while(i <= n) a[i++].value = 0; b[m+1].value = 0; j = 0; while(++j <= m) { c[j] = -b[j].serial; while(b[j+1].value == b[j].value) { j++; c[j] = b[j].serial; } } c[j] = -1; } static int newcand(int x, int y, int pred) { Cand *q; clist = erealloc(clist, (clen+1)*sizeof(Cand)); q = clist + clen; q->x = x; q->y = y; q->pred = pred; return clen++; } static int search(int *c, int k, int y) { int i, j, l; int t; if(clist[c[k]].y < y) /*quick look for typical case*/ return k+1; i = 0; j = k+1; while((l=(i+j)/2) > i) { t = clist[c[l]].y; if(t > y) j = l; else if(t < y) i = l; else return l; } return l+1; } static int stone(int *a, int n, int *b, int *c) { int i, k,y; int j, l; int oldc, tc; int oldl; k = 0; c[0] = newcand(0,0,0); for(i=1; i<=n; i++) { j = a[i]; if(j==0) continue; y = -b[j]; oldl = 0; oldc = c[0]; do { if(y <= clist[oldc].y) continue; l = search(c, k, y); if(l!=oldl+1) oldc = c[l-1]; if(l<=k) { if(clist[c[l]].y <= y) continue; tc = c[l]; c[l] = newcand(i,y,oldc); oldc = tc; oldl = l; } else { c[l] = newcand(i,y,oldc); k++; break; } } while((y=b[++j]) > 0); } return k; } static void unravel(int p) { int i; Cand *q; for(i=0; i<=len[0]; i++) { if (i <= pref) J[i] = i; else if (i > len[0]-suff) J[i] = i+len[1]-len[0]; else J[i] = 0; } for(q=clist+p;q->y!=0;q=clist+q->pred) J[q->x+pref] = q->y+pref; } static void output(void) { int m, i0, i1, j0, j1; m = len[0]; J[0] = 0; J[m+1] = len[1]+1; if (mode != 'e') { for (i0 = 1; i0 <= m; i0 = i1+1) { while (i0 <= m && J[i0] == J[i0-1]+1) i0++; j0 = J[i0-1]+1; i1 = i0-1; while (i1 < m && J[i1+1] == 0) i1++; j1 = J[i1+1]-1; J[i1] = j1; change(i0, i1, j0, j1); } } else { for (i0 = m; i0 >= 1; i0 = i1-1) { while (i0 >= 1 && J[i0] == J[i0+1]-1 && J[i0]) i0--; j0 = J[i0+1]-1; i1 = i0+1; while (i1 > 1 && J[i1-1] == 0) i1--; j1 = J[i1-1]+1; J[i1] = j1; change(i1 , i0, j1, j0); } } if (m == 0) change(1, 0, 1, len[1]); flushchanges(); } #define BUF 4096 static int cmp(Biobuf* b1, Biobuf* b2) { int n; uchar buf1[BUF], buf2[BUF]; int f1, f2; vlong nc = 1; uchar *b1s, *b1e, *b2s, *b2e; f1 = Bfildes(b1); f2 = Bfildes(b2); seek(f1, 0, 0); seek(f2, 0, 0); b1s = b1e = buf1; b2s = b2e = buf2; for(;;){ if(b1s >= b1e){ if(b1s >= &buf1[BUF]) b1s = buf1; n = read(f1, b1s, &buf1[BUF] - b1s); b1e = b1s + n; } if(b2s >= b2e){ if(b2s >= &buf2[BUF]) b2s = buf2; n = read(f2, b2s, &buf2[BUF] - b2s); b2e = b2s + n; } n = b2e - b2s; if(n > b1e - b1s) n = b1e - b1s; if(n <= 0) break; if(memcmp((void *)b1s, (void *)b2s, n) != 0){ return 1; } nc += n; b1s += n; b2s += n; } if(b1e - b1s == b2e - b2s) return 0; return 1; } void diffreg(char *f, char *fo, char *t, char *to) { Biobuf *b0, *b1; int k; binary = 0; b0 = prepare(0, f, fo); if (!b0) return; b1 = prepare(1, t, to); if (!b1) { Bterm(b0); return; } if (binary){ // could use b0 and b1 but this is simpler. if (cmp(b0, b1)) print("binary files %s %s differ\n", f, t); Bterm(b0); Bterm(b1); return; } clen = 0; prune(); sort(sfile[0], slen[0]); sort(sfile[1], slen[1]); member = (int *)file[1]; equiv(sfile[0], slen[0], sfile[1], slen[1], member); member = erealloc(member, (slen[1]+2)*sizeof(int)); class = (int *)file[0]; unsort(sfile[0], slen[0], class); class = erealloc(class, (slen[0]+2)*sizeof(int)); klist = emalloc((slen[0]+2)*sizeof(int)); clist = emalloc(sizeof(Cand)); k = stone(class, slen[0], member, klist); free(member); free(class); J = emalloc((len[0]+2)*sizeof(int)); unravel(klist[k]); free(clist); free(klist); ixold = emalloc((len[0]+2)*sizeof(long)); ixnew = emalloc((len[1]+2)*sizeof(long)); Bseek(b0, 0, 0); Bseek(b1, 0, 0); check(b0, b1); output(); free(J); free(ixold); free(ixnew); Bterm(b0); Bterm(b1); }