This article is a continuation of the earlier post, "Compiling regular expressions (I)".
Automata are modeled as 'state' records with two fields. The pos
field contains the set of positions that are valid for recognition in the given state. Transitions are modeled as lists of pairs of symbols and states. In this way a state may contain transitions that reference itself.
type state = { pos : Int_set.t; mutable trans : (char * state) list ; }
We will require a function that for each input symbol $a$ and a given set of positions $s$, computes the list of pairs $(a, s')$ where $s'$ is the subset of $s$ that correspond to $a$.
let (partition : char option array -> Int_set.t -> (char option * Int_set.t) list) = fun chars s -> let f acc c = match c with | Some _ -> if List.mem_assoc c acc then acc else let f i acc = if chars.(i) <> c then acc else Int_set.add i acc in (c, Int_set.fold f s (Int_set.empty)) :: acc | None -> if List.mem_assoc c acc then acc else (c, Int_set.empty) :: acc in List.rev (Array.fold_left f [] chars)This function makes a list from a set of ints.
let list_of_int_set : Int_set.t -> Int_set.elt list = fun s -> List.rev (Int_set.fold (fun e acc -> e :: acc) s [])This function,
accessible
given a state, computes the list of sets that accessible from that state.
let (accessible : state -> Int_set.t array -> char option array -> (char * Int_set.t) list) = fun s follow chars -> let part = partition chars s.pos in let f p rest = match p with | (Some c, l) -> (c, List.fold_left (Int_set.union) (Int_set.empty) (List.map (Array.get follow) (list_of_int_set l)) ) :: rest | _ -> rest in List.fold_right f part []
find_state
takes a set $s$ and two lists of states (marked and unmarked). It searches for a state which has a pos
field equal to $s$ and returns this state or it fails.
let (find_state : Int_set.t -> state list -> state list -> state) = fun s l m -> let test e = e.pos = s in try List.find test l with | Not_found -> List.find test m
The algorithm to compute the automata works like this. Two lists are maintained, marked and unmarked states. The algorithm is initialized such that the only state is unmarked with a pos
field containing first_pos
$n_{0}$ where $n_{0}$ is the root of the syntax tree; the list of transitions is empty.
For an unmarked state $st$, the algorithm does these things:
- Calculate a set of numbers accessible from $st$. That is, a set of pairs $(c, s)$, where $c$ is a character and $s$ a set of positions. A position $j$ is accessible from $st$ by $c$ if there is an $i$ in
st.pos
such that $j$ is infollow
$i$ and $i$ numbers the character $c$. - For each of the pairs $(c, s)$
- If there exists a state
st'
(whether marked or unmarked) such that $s = $st'.pos
, it adds $(c, st')$ to the transitions of $st$; - Otherwise, a new state $st'$ without transitions is created, added to the transitions of $st$, and $st'$ is added to the list of unmarked states.
- If there exists a state
- It marks $st$.
Accept
. Here then is the algorithm in code.
let rec (compute_states : state list -> state list -> Int_set.t array -> char option array -> state array) = fun marked unmarked follow chars -> match unmarked with | [] -> Array.of_list marked | st :: umsts -> let access = accessible st follow chars in let marked1 = st :: marked in let f (c, s) umsts = if Int_set.is_empty s then umsts (*Suppress empty sets*) else try st.trans <- (c, find_state s marked1 umsts) ::st.trans ; umsts with | Not_found -> let state1 = {pos = s; trans = []} in st.trans <- (c, state1) :: st.trans; state1 :: umsts in let unmarked1 = List.fold_right f access umsts in compute_states marked1 unmarked1 follow chars
We are just about ready to write the function to compute the automaton. It is fundamentally a call to compute_states
but does one more thing. That is, it searches the resulting array for the index of the initial state and puts the index in the first slot of the array. To do this it uses the utility function array_indexq
which performs the search for the index using physical equality. This is because the usual test using structural equality will not terminate on structures that loop.
let (array_indexq : 'a array -> 'a -> int) = fun arr e -> let rec loop i = if i = Array.length arr then raise (Not_found) else if Array.get arr i == e then i else loop (i + 1) in loop 0So, here it is,
dfa_of
, the function to compute the automaton.
let (dfa_of : augmented_regexp * Int_set.t array * char option array -> state array) = fun (e, follow, chars) -> let init_state = {pos = first_pos e; trans = []} in let dfa = compute_states [] [init_state] follow chars in (*Installing initial state at index 0*) let idx_start = array_indexq dfa init_state in dfa.(idx_start) <- dfa.(0); dfa.(0) <- init_state; dfa
We are now on the home stretch. All that remains is to write a function to interpret the automaton. To do this, we'll make use of a mini-combinator library of recognizers. I'll not provide the OCaml code for that today - you could reverse engineer from my earlier 'Recognizers' blog-post or, consult [1].
let (interpret_dfa : state array -> int -> char Recognizer.recognizer) = fun dfa accept -> let num_states = Array.length dfa in let fvect = Array.make (num_states) (fun _ -> failwith "no value") in for i = 0 to num_states - 1 do let trans = dfa.(i).trans in let f (c, st) = let pc = Recognizer.recognizer_of_char c in let j = array_indexq dfa st in Recognizer.compose_and pc (fun l -> fvect.(j) l) in let parsers = List.map f trans in if Int_set.mem accept (dfa.(i).pos) then fvect.(i) <- compose_or_list (Recognizer.end_of_input) parsers else match parsers with | [] -> failwith "Impossible" | p :: ps -> fvect.(i) <- Recognizer.compose_or_list p ps done; fvect.(0)We wrap up with a couple of high level convenience functions :
compile
produces a recognizer from a string representation of a regular expression and match
takes a recognizer (that is, a compiled regular expression) and a string and uses the recognizer to categorize the given string as admissible or not (where explode
is a simple function that transforms a string
into a char list
- recognizers operate on lists).
let compile xpr = let ((e, follow, chars) as ast) = regexp_follow xpr in let dfa = dfa_of ast in let parser = interpret_dfa dfa (Array.length chars - 1) in fun s -> parser (explode s) let re_match xpr s = let result = xpr s in match result with | Recognizer.Remains [] -> true | _ -> false
Here's a simple test driver that shows how these functions can be used.
let test xpr s = match re_match xpr s with | true -> Printf.printf "\"%s\" : success\n" s | false -> Printf.printf "\"%s\" : fail\n" s let _ = try let xpr = compile "(a|b)*abb" in Printf.printf "Pattern: \"%s\"\n" "(a|b)*abb" ; test xpr "abb" ; test xpr "aabb" ; test xpr "baabb" ; test xpr "bbbbbbbbbbbbbaabb" ; test xpr "aaaaaaabbbaabbbaabbabaabb" ; test xpr "baab" ; test xpr "aa" ; test xpr "ab" ; test xpr "bb" ; test xpr "" ; test xpr "ccabb" ; with | Failure msg -> print_endline msg
So that's it for this series of posts on building recognizers for regular expressions. Hope you enjoyed it!
References
[1] "The Functional Approach to Programming" - Cousineau & Mauny
[2] "Compilers Principles, Techniques & Tools" - Aho et. al.