shithub: mc

ref: 21b5fb876c5f1e6188a3a6b0e9e77311f06525f0
dir: /doc/api/libregex/index.txt/

View raw version
{
	title: libregex
	description: Libregex API documentation.
}

Summary
-------

	pkg regex =
                type ast = union
                        /* basic string building */
                        `Alt	(ast#, ast#)
                        `Cat	(ast#, ast#)

                        /* repetition */
                        `Star	ast#
                        `Rstar  ast#
                        `Plus	ast#
                        `Rplus	ast#
                        `Quest	ast#	

                        /* end matches */
                        `Chr	char
                        `Ranges	char[2][:]

                        /* meta */
                        `Cap	(std.size, ast#) /* id, ast */
                        `Bol	/* beginning of line */
                        `Eol	/* end of line */
                        `Bow	/* beginning of word */
                        `Eow	/* end of word */
                ;;

                /* failure reasons carried in the result of parse and compile */
                type status = union
                        `Noimpl
                        `Incomplete
                        `Unbalanced char
                        `Emptyparen
                        `Badrep char
                        `Badrange byte[:]
                        `Badescape char
                ;;

	/* regex compilation */
        const parse	: (re : byte[:]	-> std.result(ast#, status))
        const compile	: (re : byte[:] -> std.result(regex#, status))
        const dbgcompile	: (re : byte[:] -> std.result(regex#, status))
        const free	: (re : regex# -> void)

        /* regex execution */
        const exec	: (re : regex#, str : byte[:] -> std.option(byte[:][:]))
        const search	: (re : regex#, str : byte[:] -> std.option(byte[:][:]))

        /* regex substitution */
        const sub	: (re : regex#, str : byte[:], subst : byte[:][:] -> std.option(byte[:]))
        const sbsub	: (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> bool)
        const suball	: (re : regex#, str : byte[:], subst : byte[:][:] -> byte[:])
        const sbsuball	: (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> void)

        /* releases the match list produced by exec */
        const matchfree	: (pat : byte[:][:] -> void)
    ;;


Overview
--------

Libregex is a simple regex API that uses a parallel NFA implementation. This
means that while it is not blazingly fast, it does not exhibit the pathological
behavior on regexes like `(aa|aab?)*` that many common regex APIs suffer from.

Regex Syntax
-------------

The grammar for regexes that are accepted is sketched out below.

           regex       : altexpr
           altexpr     : catexpr ('|' altexpr)?
           catexpr     : repexpr (catexpr)?
           repexpr     : baseexpr ([*+?] '?'?)?
           baseexpr    : literal
                       | charclass
                       | charrange
                       | '.'
                       | '^'
                       | '$'
                       | '(' regex ')'
           charclass   : see below
           charrange   : '[' (literal('-' literal)?)+']'

The following metacharacters have the meanings listed below:

<table>
	<tr><th>Metachar</th>            <th>Description</th></tr>
	<tr><td><code>.</code></td>      <td>Matches a single unicode character.</td></tr>
	<tr><td><code>^</code></td>      <td>Matches the beginning of a line. Does not consume any characters.</td></tr>
	<tr><td><code>$</code></td>      <td>Matches the end of a line. Does not consume any characters.</td></tr>
	<tr><td><code>*</code></td>      <td>Matches any number of repetitions of the preceding regex fragment.</td></tr>
	<tr><td><code>+</code></td>      <td>Matches one or more repetitions of the preceding regex fragment.</td></tr>
	<tr><td><code>?</code></td>      <td>Matches zero or one of the preceding regex fragment.</td></tr>
</table>

In order to match a literal metacharacter, it needs to be preceded by a '\' character.

The following character classes are supported:

<table>
	<tr><th>Charclass</th>           <th>Description</th></tr>
	<tr><td><code>\d </code></td>    <td>ASCII digits</td></tr>
	<tr><td><code>\D </code></td>    <td>Negation of ASCII digits</td></tr>
	<tr><td><code>\x </code></td>    <td>ASCII Hex digits</td></tr>
	<tr><td><code>\X </code></td>    <td>Negation of ASCII Hex digits</td></tr>
	<tr><td><code>\s </code></td>    <td>ASCII spaces</td></tr>
	<tr><td><code>\S </code></td>    <td>Negation of ASCII spaces</td></tr>
	<tr><td><code>\w </code></td>    <td>ASCII word characters</td></tr>
	<tr><td><code>\W </code></td>    <td>Negation of ASCII word characters</td></tr>
	<tr><td><code>\h </code></td>    <td>ASCII whitespace characters</td></tr>
	<tr><td><code>\H </code></td>    <td>Negation of ASCII whitespace characters</td></tr>
	<tr><td><code>\pX</code></td>    <td>Characters with unicode property 'X'</td></tr>
	<tr><td><code>\PX</code></td>    <td>Negation of characters with property 'X'</td></tr>
</table>

The currently supported Unicode character classes `X` are:

<table>
	<tr><th>Abbrev</th> <th>Full name</th>      <th>Description</th></tr>
	<tr>
		<td><code>L</code></td>  <td><code>Letter</code></td>
		<td>All letters, including lowercase, uppercase, titlecase,
		and uncased.</td>
	</tr>
	<tr>
		<td><code>Lu</code></td> <td><code>Uppercase_Letter</code></td>
		<td>All uppercase letters.</td>
	</tr>
	<tr>
		<td><code>Ll</code></td> <td><code>Lowercase_Letter</code></td>
		<td>All lowercase letters.</td>
	</tr>
	<tr>
		<td><code>Lt</code></td> <td><code>Titlecase_Letter</code></td>
		<td>All titlecase letters.</td>
	</tr>
	<tr>
		<td><code>N</code></td>  <td><code>Number</code></td>
		<td>All numbers.</td>
	</tr>
	<tr>
		<td><code>Z</code></td>  <td><code>Separator</code></td>
		<td>All separators, including spaces and control characters.</td>
	</tr>
	<tr>
		<td><code>Zs</code></td> <td><code>Space_Separator</code></td>
		<td>All space separators, including tabs and ASCII spaces.</td>
        </tr>
</table>

Functions
---------

    const parse	: (re : byte[:]	-> std.result(ast#, status))

`parse` takes a regex string and converts it to a regex syntax tree, returning
`\`std.Ok ast#` if the regex was valid, or `\`std.Fail r` if the regex could
not be parsed. This AST can be used to further process the regex, possibly
turning it into a multiregex as in Hairless, or using it for NFA and DFA
tricks.

    const compile	: (re : byte[:] -> std.result(regex#, status))
    const dbgcompile	: (re : byte[:] -> std.result(regex#, status))

`compile` takes a regex string and converts it to a compiled regex, returning
`\`std.Ok regex` if the regex was valid, or `\`std.Fail r` with the reason
that the compilation failed. `dbgcompile` is similar, except that the regex is
compiled so that it spits out a good deal of debugging output when run. Unless
you are intent on debugging the internals of the regex engine, this is likely
only of academic interest.

    const free	: (re : regex# -> void)

`free` must be called on a compiled regex to release its resources after you
are finished using it.

    const exec	: (re : regex#, str : byte[:] -> std.option(byte[:][:]))

`exec` runs the regex over the specified text, returning `\`std.Some matches`
if the text matched, or `\`std.None` if the text did not match. `matches[0]` is
always the full text that was matched, and will always be returned regardless
of whether capture groups are specified.

    const search	: (re : regex#, str : byte[:] -> std.option(byte[:][:]))

`search` searches the specified text for a sub-segment that matches the regex,
returning `\`std.Some matches` if a match was found, or `\`std.None` if
nothing matched.  `matches[0]` is always the full text that was matched,
and will always be returned regardless of whether capture groups are
specified. `search` returns the earliest match in the string provided.


    const sub	: (re : regex#, str : byte[:], subst : byte[:][:] -> std.option(byte[:]))
    const sbsub	: (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> bool)

`sub` will take a pattern, an input string, and a set of substitutions, and
attempt to match. If the match is successful, it will replace each group
within `str` with `subst`, returning a freshly allocated string.  `sbsub`
behaves identically, however it inserts the new string into the string
buffer provided, instead of allocating a new string.

If there is no match, then `\`std.None` will be returned.

    const suball	: (re : regex#, str : byte[:], subst : byte[:][:] -> byte[:])
    const sbsuball	: (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> void)

`suball` replaces every match within the string using the given substitutions.
Only captured groups will be substituted. The remaining text will be left in
place.

Example
------

#### Pattern matching

```{runmyr regex}
use std
use regex

/*
 * Compiles a regex with one capture group, runs it against a sample
 * string, and then releases the compiled regex.
 */
const main = {
	match regex.compile("ab(c+)")
	| `std.Ok re:
		runwith(re, "abccc")
		/* a compiled regex must be freed once we are done with it */
		regex.free(re)
	| `std.Fail m:
		std.fatal("Failed to compile regex\n")
	;;
}

/*
 * Runs the compiled regex `re` against `txt`. On a match, prints the
 * number of matches and each matched slice, then frees the match list.
 * Otherwise reports that `txt` did not match.
 */
const runwith = {re, txt
	match regex.exec(re, txt)
	| `std.Some matches:
		std.put("matched {}, got {} matches\n", txt, matches.len)
		for m in matches
			std.put("Match: {}\n", m)
		;;
		regex.matchfree(matches)
	| `std.None:
		/* use the same {} format style as above and pass the text being reported */
		std.put("{} did not match\n", txt)
	;;
}
```

#### Substitution

```{runmyr regex}
use std
use regex

/*
 * Substitutes the two capture groups of a single match with "HEY" and
 * "X", prints the resulting string, and releases everything allocated.
 */
const main = {
	var re

	re = std.try(regex.compile("(a*)bc(d)e"))
	match regex.sub(re, "aaabcdef", ["HEY", "X"][:])
	| `std.Some sub:
		/* sub is the whole substituted string, not a list of matches */
		std.put("{}\n", sub)
		/* regex.sub returns a freshly allocated string, so free it as a slice */
		std.slfree(sub)
	| `std.None:
		std.fatal("should have matched")
	;;
	regex.free(re)
}
```

```{runmyr regex}
use std
use regex

/*
 * Replaces every captured `b` or `e` in the input with "SUB", prints the
 * result, and releases both the result string and the compiled regex.
 */
const main = {
	var re, sub

	re = std.try(regex.compile("(b|e)"))
	sub = regex.suball(re, "aaabbbcdef", ["SUB"][:])
	std.put("subst: {}\n", sub)
	std.slfree(sub)
	/* a compiled regex must be freed once we are done with it */
	regex.free(re)
}
```