shithub: mc

Download patch

ref: fb0c459a47563a8f455962ae8dc26654ae1d4d9c
parent: 654eb21dce9327393d64b13d3eb3cbc8db35fe3b
author: Ori Bernstein <ori@markovcorp.com>
date: Wed Mar 28 07:34:06 EDT 2018

Improve comments on the regex interpreter.

--- a/lib/regex/interp.myr
+++ b/lib/regex/interp.myr
@@ -173,7 +173,13 @@
 	-> `std.Some ret
 }
 
-/* returns a matching thread, or Zthr if no threads matched */
+/*
+ * Run manages the virtual machine state, and schedules the
+ * regex threads. Each linear match runs in its own thread.
+ * When a new thread is created, it is carefully scheduled
+ * after the current thread, to ensure that match order is
+ * preserved.
+ */
 const run = {re, str, idx, wholestr
 	var bestmatch
 	var consumed
@@ -189,7 +195,13 @@
 	states = std.mkbs()
 	re.runq = mkthread(re, 0)
 	if re.debug
-		/* The last run could have left things here, since we need this info after the run */
+		/* 
+		  If we're in debug mode, then we keep
+		  the traces around, so we can show them
+		  to the user. To avoid leaking, we need
+		  to free the traces from the last run
+		  when we start a new one.
+		 */
 		for bs : re.traces
 			std.bsfree(bs)
 		;;
@@ -206,16 +218,34 @@
 			if re.trace
 				std.put("switch\n")
 			;;
-			/* set up the next thread */
 			thr = re.runq
 			re.runq = thr.next
 
 			ip = thr.ip
+			/* 
+			  Stepping continues until the first
+			  non-consuming operator is seen. This
+			  keeps all the threads in lockstep,
+			  which means that when a match is
+			  encountered, we know all the other
+			  threads have seen what they need
+			  to, and we can terminate them.
+			 */
 			consumed = step(re, thr, -1)
 			while !consumed
 				consumed = step(re, thr, ip)
 			;;
 
+			/*
+			 * Because threads have no memory,
+			 * their ip (and current input
+			 * character, which is the same
+			 * thanks to the above mentioned
+			 * lockstep) uniquely identify them.
+			 * As a result, if we have two
+			 * threads with the same ip, one of
+			 * them can be culled.
+			 */
 			if std.bshas(states, thr.ip)
 				die(re, thr)
 			;;
@@ -259,12 +289,27 @@
 	-> bestmatch
 }
 
-/* 
- Steps forward one instruction. Returns true if a byte of input was
- consumed, false otherwise.
-*/
-const step = {re, thr, curip
-	var str, nthr, inst
+/*
+ * Step executes a single step of the compiled regex.
+ *
+ * Operations fall into two overall categories. Consuming
+ * operators advance the match, and nonconsuming operators
+ * change the state of the regex virtual machine.
+ *
+ * Consuming operators are simple: They check if the current
+ * character matches a criteria, and then advance the regex.
+ *
+ * Nonconsuming operators can do one of several things. They
+ * can fork the vm, record a successful match, or mark a
+ * thread as a failure.
+ *
+ * A thread continues to run forward until a consuming
+ * opcode is encountered, after which it must switch.
+ * This is in order to keep all threads in lockstep
+ * operating over the same characters, and finishing
+ * at the same time.
+ */
+const step = {re, thr, curip var str, nthr, inst
 
 	str = re.str
 	inst = re.code[thr.ip]
@@ -275,7 +320,7 @@
 		std.bsput(re.traces[thr.tid], thr.ip)
 	;;
 	match inst & 0xf
-	/* Char matching. Consume exactly one byte from the string. */
+	/* Consuming opcodes */
 	| OpRange:
 		var lo = (inst >>  4 : byte)
 		var hi = (inst >> 16 : byte)
@@ -308,10 +353,7 @@
 		;;
 		thr.ip = lip
 		-> false
-	/*
-	  Non-consuming. All of these return false, and expect step to be
-	  called again until exactly one byte is consumed from the string.
-	 */
+	/* Non-consuming opcodes. */
 	| OpJmp:
 		var ip = (inst >> 4 : std.size)
 		thr.ip = ip
@@ -345,7 +387,6 @@
 		else
 			die(re, thr)
 		;;
-	/* check for word characters */
 	| OpBow:
 		if iswordchar(str[re.strp:]) && (re.strp == 0 || !iswordchar(prevchar(str, re.strp)))
 			thr.ip++
@@ -443,7 +484,6 @@
 	;;
 }
 
-/* must be called with i >= 1 */
 const prevchar = {s, i
 	std.assert(i != 0, "prevchar must be called with i >= 1\n")
 	i--