ref: ff650e3049019496422fa3a437531a43567e5307
parent: bfbbf051c90844ab0e3e18c10d646013b17e38d9
author: JeffBezanson <jeff.bezanson@gmail.com>
date: Wed May 20 00:30:00 EDT 2009
fixing hash function to do a better job on long lists.
--- a/femtolisp/equal.c
+++ b/femtolisp/equal.c
@@ -259,9 +259,10 @@
#define doublehash(a) int64to32hash(a)
#endif
-// *ut means we had to start using the table
-static uptrint_t bounded_hash(value_t a, int bound, int *ut)
+// *oob: output argument, means we hit the limit specified by 'bound'
+static uptrint_t bounded_hash(value_t a, int bound, int *oob)
{
+ *oob = 0;
double d;
numerictype_t nt;
size_t i, len;
@@ -269,12 +270,7 @@
cprim_t *cp;
void *data;
uptrint_t h = 0;
- if (*ut) {
- h = (uptrint_t)ptrhash_get(&equal_eq_hashtable, (void*)a);
- if (h != (uptrint_t)HT_NOTFOUND)
- return h;
- }
- int tg = tag(a);
+ int oob2, tg = tag(a);
switch(tg) {
case TAG_NUM :
case TAG_NUM1:
@@ -282,7 +278,7 @@
return doublehash(*(int64_t*)&d);
case TAG_FUNCTION:
if (uintval(a) > N_BUILTINS)
- return bounded_hash(((function_t*)ptr(a))->bcode, bound, ut);
+ return bounded_hash(((function_t*)ptr(a))->bcode, bound, oob);
return inthash(a);
case TAG_SYM:
return ((symbol_t*)ptr(a))->hash;
@@ -296,39 +292,39 @@
cv = (cvalue_t*)ptr(a);
data = cv_data(cv);
return memhash(data, cv_len(cv));
+
case TAG_VECTOR:
if (bound <= 0) {
- h = ++(*ut) + (uptrint_t)HT_NOTFOUND;
- ptrhash_put(&equal_eq_hashtable, (void*)a, (void*)h);
- return h;
+ *oob = 1;
+ return 1;
}
len = vector_size(a);
for(i=0; i < len; i++) {
- h = MIX(h, bounded_hash(vector_elt(a,i), bound-1, ut)+1);
+ h = MIX(h, bounded_hash(vector_elt(a,i), bound/2, &oob2)+1);
+ if (oob2)
+ bound/=2;
+ *oob = *oob || oob2;
}
return h;
+
case TAG_CONS:
- if (bound <= 0)
+ if (bound <= 0) {
+ *oob = 1;
return 1;
- return MIX(bounded_hash(car_(a), bound/2, ut),
- bounded_hash(cdr_(a), bound/2, ut)+2);
- // this should be able to hash long lists with greater fidelity,
- // but it does not work yet.
- /*
- first = a;
- bb = BOUNDED_HASH_BOUND;
- do {
- h = MIX(h, bounded_hash(car_(a), bound-1, ut));
- a = cdr_(a);
- bb--;
- if (bb <= 0) {
- *ut = 1;
- ptrhash_put(&equal_eq_hashtable, (void*)first, (void*)h);
- return h;
- }
- } while (iscons(a));
- return MIX(h, bounded_hash(a, bound-1, ut));
- */
+ }
+ h = bounded_hash(car_(a), bound/2, oob);
+ // bounds balancing: try to share the bounds efficiently
+ // between the car and cdr so we can hash better when a list is
+ // car-shallow and cdr-deep (a common case) or vice-versa.
+ if (*oob)
+ bound/=2;
+ else
+ bound--;
+ h = MIX(h, bounded_hash(cdr_(a), bound, &oob2)+2);
+ // recursive OOB propagation. otherwise this case is slow:
+ // (hash '#2=('#0=(#1=(#1#) . #0#) . #2#))
+ *oob = *oob || oob2;
+ return h;
}
return 0;
}
@@ -342,10 +338,8 @@
uptrint_t hash_lispvalue(value_t a)
{
- int ut=0;
- uptrint_t n = bounded_hash(a, BOUNDED_HASH_BOUND, &ut);
- if (ut)
- htable_reset(&equal_eq_hashtable, 512);
+ int oob=0;
+ uptrint_t n = bounded_hash(a, BOUNDED_HASH_BOUND, &oob);
return n;
}
--- a/femtolisp/unittest.lsp
+++ b/femtolisp/unittest.lsp
@@ -154,5 +154,17 @@
(hash [6 1 [2 [[3 1 [2 [1]] 3]]] 3])
(hash [6 1 [2 [[1 1 [2 [1]] 3]]] 3]))))
+(assert (equal? (hash '#0=(1 . #0#))
+ (hash '#1=(1 1 . #1#))))
+
+(assert (not (equal? (hash '#0=(1 1 . #0#))
+ (hash '#1=(1 #0# . #1#)))))
+
+(assert (not (equal? (hash (iota 10))
+ (hash (iota 20)))))
+
+(assert (not (equal? (hash (iota 41))
+ (hash (iota 42)))))
+
(princ "all tests pass\n")
#t