ref: 08442dd9c3426e2dbe598ef1f6fb10f2f67638aa
parent: 76086513de9efd7a000d0bc229c3e990f0af5a2f
author: Ori Bernstein <ori@eigenstate.org>
date: Fri Oct 2 19:27:11 EDT 2015
Implement some asm optimized memcpy/memmove checks. TODO: memcmp
--- a/bench/bld.sub
+++ b/bench/bld.sub
@@ -31,6 +31,14 @@
lib @/lib/sys:sys
lib @/lib/regex:regex
;;
+
+bin many-memcpy =
+ memcpy.myr
+ lib @/lib/std:std
+ lib @/lib/sys:sys
+;;
+
+# benchmark runner
bin runbench =
runbench.myr
lib @/lib/std:std
@@ -45,4 +53,5 @@
bigfactorial
mandelbrot
regex-match
+ many-memcpy
;;
--- a/bench/copious-allocs.myr
+++ b/bench/copious-allocs.myr
@@ -5,32 +5,31 @@
;;
const main = {
- var i, j
var a : blob#[10000]
- for j = 0; j < 100; j++
+ for var j = 0; j < 100; j++
/* alloc forwards, dealloc forwards */
- for i = 0; i < a.len; i++
+ for var i = 0; i < a.len; i++
a[i] = std.alloc()
;;
- for i = 0; i < a.len; i++
+ for var i = 0; i < a.len; i++
std.free(a[i])
;;
/* alloc forwards, dealloc backwards */
- for i = 0; i < a.len; i++
+ for var i = 0; i < a.len; i++
a[i] = std.alloc()
;;
- for i = a.len; i > 0; i--
+ for var i = a.len; i > 0; i--
std.free(a[i - 1])
;;
/* alloc forwards, dealloc randomly */
- for i = 0; i < a.len; i++
+ for var i = 0; i < a.len; i++
a[i] = std.alloc()
;;
shuffle(a[:])
- for i = a.len; i > 0; i--
+ for var i = a.len; i > 0; i--
std.free(a[i - 1])
;;
;;
@@ -39,10 +38,11 @@
const shuffle = {a
var t
var rng
- var i, j
+ var j
+ /* we want determinism for benchmarking */
rng = std.mksrng(123)
- for i = 0; i < a.len - 1; i++
+ for var i = 0; i < a.len - 1; i++
j = std.rngrand(rng, i, a.len)
t = a[j]
a[j] = a[i]
--- /dev/null
+++ b/bench/many-memcpy.myr
@@ -1,0 +1,27 @@
+use std
+
+const main = {
+ var a : uint64[100000]
+
+ for var j = 0; j < 100; j++
+ /* independent copies forward */
+ for var i = 0; i < 10; i++
+ std.slcp(a[:a.len/2-1], a[a.len/2+1:])
+ ;;
+ /* independent copies backward */
+ for var i = 0; i < 10; i++
+ std.slcp(a[:a.len/2-1], a[a.len/2+1:])
+ ;;
+
+ /* dependent copies forward */
+ for var i = 0; i < 10; i++
+ std.slcp(a[:a.len/2+1000], a[a.len/2-1000:])
+ ;;
+ /* dependent copies backward */
+ for var i = 0; i < 10; i++
+ std.slcp(a[a.len/2-1000:], a[:a.len/2+1000])
+ ;;
+ ;;
+}
+
+
--- a/lib/std/alloc.myr
+++ b/lib/std/alloc.myr
@@ -3,6 +3,7 @@
use "types.use"
use "units.use"
use "syswrap.use"
+use "memops.use"
/*
The allocator implementation here is based on Bonwick's slab allocator.
@@ -181,11 +182,15 @@
/* Grows a slice, filling new entries with zero bytes */
generic slzgrow = {sl : @a[:], len
- var oldsz
+ var oldlen
+ var base
- oldsz = sl.len*sizeof(@a)
+ oldlen = sl.len
sl = slgrow(sl, len)
- zfill((sl castto(byte#))[oldsz:len*sizeof(@a)])
+ base = sl castto(byte#) castto(intptr)
+ if oldlen < len
+ memfill(sl[oldlen:] castto(byte#), 0, (len - oldlen)*sizeof(@a))
+ ;;
-> sl
}
@@ -200,16 +205,10 @@
var p
p = bytealloc(sz)
- zfill(p[0:sz])
+ memfill(p, 0, sz)
-> p
}
-const zfill = {sl
- for var i = 0; i < sl.len; i++
- sl[i] = 0
- ;;
-}
-
/* Allocates a blob that is 'sz' bytes long. Dies if the allocation fails */
const bytealloc = {sz
var bkt, p
@@ -229,12 +228,8 @@
/* frees a blob that is 'sz' bytes long. */
const bytefree = {p, sz
var bkt
- var b, i
- b = (p castto(uint64#))[:sz/8]
- for i = 0; i < sz>>3; i++
- b[i] = 0xa8a8a8a8a8a8a8a8
- ;;
+ memfill(p, 0xa8, sz)
if (sz < Bktmax)
bkt = &buckets[bktnum(sz)]
bktfree(bkt, p)
--- a/lib/std/bld.sub
+++ b/lib/std/bld.sub
@@ -62,6 +62,11 @@
utf.myr
varargs.myr
+ # asm optimizations
+ memops.myr
+ memops-impl.myr
+ memops-impl+posixy-x64.s
+
# platform specific files
env+plan9.myr
env+posixy.myr
--- /dev/null
+++ b/lib/std/memops-impl+posixy-x64.s
@@ -1,0 +1,50 @@
+/*
+std.memblit : (dst : byte#, src : byte#, len : std.size -> void)
+std.memfill : (dst : byte#, val : byte, len : std.size -> void)
+*/
+.globl _std$memblit
+.globl std$memblit
+_std$memblit:
+std$memblit:
+ cmpq %rdi,%rsi
+ jz .done
+ jg .fwdcpy
+ movq %rsi,%rax
+ subq %rdi,%rax
+ cmpq %rax,%rcx
+ jg .revcpy
+.fwdcpy:
+ movq %rdx,%rcx
+ shrq $3,%rcx
+ rep movsq
+ movq %rdx,%rcx
+ andq $7,%rcx
+ rep movsb
+ jmp .done
+.revcpy:
+ std
+ movq %rdx,%rcx
+ leaq -1(%rdx,%rsi),%rsi
+ leaq -1(%rdx,%rdi),%rdi
+ rep movsb
+ cld
+.done:
+ ret
+
+.globl _std$memfill
+.globl std$memfill
+_std$memfill:
+std$memfill:
+ /* generate 8 bytes of fill */
+ movzbq %sil,%rbx
+ mov $0x101010101010101,%rax
+ imul %rbx,%rax
+
+ /* and fill */
+ movq %rdx,%rcx
+ shrq $3,%rcx
+ rep stosq
+ movq %rdx,%rcx
+ andq $7,%rcx
+ rep stosb
+ ret
--- /dev/null
+++ b/lib/std/memops-impl.myr
@@ -1,0 +1,39 @@
+use "types.use"
+
+pkg std =
+ pkglocal const memblit : (dst : byte#, src : byte#, len : std.size -> void)
+ pkglocal const memfill : (dst : byte#, val : byte, len : std.size -> void)
+;;
+
+
+const memblit = {dst, src, len
+ var sa, da
+ var s, d
+
+ da = dst castto(intptr)
+ sa = src castto(intptr)
+ d = dst[:len]
+ s = src[:len]
+
+ if da == sa
+ ->
+ elif da < sa
+ for var i = 0; i < d.len; i++
+ d[i] = s[i]
+ ;;
+ else
+ for var i = d.len; i > 0; i--
+ d[i - 1] = s[i - 1]
+ ;;
+ ;;
+}
+
+const memfill = {dst, val, len
+ var d
+
+ d = dst[:len]
+ for var i = 0; i < d.len; i++
+ d[i] = val
+ ;;
+}
+
--- /dev/null
+++ b/lib/std/memops.myr
@@ -1,0 +1,7 @@
+use "types.use"
+
+pkg std =
+ pkglocal extern const memblit : (src : byte#, dst : byte#, len : std.size -> void)
+ pkglocal extern const memfill : (src : byte#, val : byte, len : std.size -> void)
+;;
+
--- a/lib/std/slcp.myr
+++ b/lib/std/slcp.myr
@@ -1,5 +1,6 @@
use "die.use"
use "types.use"
+use "memops.use"
pkg std =
generic slcp : (a : @a[:], b : @a[:] -> void)
@@ -6,20 +7,6 @@
;;
generic slcp = {a : @a[:], b : @a[:]
- var addr_a, addr_b
-
- assert(a.len == b.len, "arguments to slcp() must be of equal length")
-
- addr_a = a castto(@a#) castto(intptr)
- addr_b = b castto(@a#) castto(intptr)
- if addr_a <= addr_b
- for var i = 0; i < a.len; i++
- a[i] = b[i]
- ;;
- else
- for var i = a.len; i > 0; i--
- a[i - 1] = b[i - 1]
- ;;
- ;;
-
+ assert(a.len == b.len, "arguments to slcp() must be of equal length\n")
+ memblit(a castto(byte#), b castto(byte#), a.len * sizeof(@a))
}
--- a/lib/std/test/slcp.myr
+++ b/lib/std/test/slcp.myr
@@ -9,6 +9,8 @@
std.slcp(a[:a.len-2], a[2:])
std.slcp(b[2:], b[:b.len-2])
+ std.put("a: {}, a_cped: {}\n", a[:], a_cped[:])
+ std.put("b: {}, b_cped: {}\n", b[:], b_cped[:])
std.assert(std.sleq(a[:], a_cped[:]), "slcp of a failed")
- std.assert(std.sleq(b[:], b_cped[:]), "slcp of a failed")
+ std.assert(std.sleq(b[:], b_cped[:]), "slcp of b failed")
}