Monday, December 22, 2014

Compiling regular expressions (II)

This article is a continuation of the earlier post, "Compiling regular expressions (I)".

Automata are modeled as 'state' records with two fields. The pos field contains the set of positions that are valid for recognition in the given state. Transitions are modeled as lists of pairs of symbols and states. In this way a state may contain transitions that reference itself.

type state = {
  pos : Int_set.t;
  mutable trans : (char * state) list ;

We will require a function that for each input symbol $a$ and a given set of positions $s$, computes the list of pairs $(a, s')$ where $s'$ is the subset of $s$ that correspond to $a$.

let (partition : char option array -> Int_set.t 
               -> (char option * Int_set.t) list) =
  fun chars s ->
    let f acc c =
      match c with
      | Some _ ->
        if List.mem_assoc c acc then acc 
          let f i acc = 
            if chars.(i) <> c then acc else Int_set.add i acc in
          (c, Int_set.fold f s (Int_set.empty)) :: acc
      | None -> 
        if List.mem_assoc c acc then acc else (c, Int_set.empty) :: acc in
    List.rev (Array.fold_left f [] chars)
This function makes a list from a set of ints.
let list_of_int_set : Int_set.t -> Int_set.elt list = 
  fun s -> List.rev (Int_set.fold (fun e acc -> e :: acc) s [])
This function, accessible given a state, computes the list of sets that accessible from that state.
let (accessible : state -> Int_set.t array -> char option array 
                                      -> (char * Int_set.t) list) =
  fun s follow chars ->
    let part = partition chars s.pos in
    let f p rest =
      match p with
      | (Some c, l) -> 
           ( (Array.get follow) (list_of_int_set l))
        ) :: rest
      | _ -> rest  in
    List.fold_right f part []
find_state takes a set $s$ and two lists of states (marked and unmarked). It searches for a state which has a pos field equal to $s$ and returns this state or it fails.
let (find_state : Int_set.t -> state list -> state list -> state) =
  fun s l m ->
    let test e = e.pos = s in
      List.find test l
    | Not_found -> List.find test m

The algorithm to compute the automata works like this. Two lists are maintained, marked and unmarked states. The algorithm is initialized such that the only state is unmarked with a pos field containing first_pos $n_{0}$ where $n_{0}$ is the root of the syntax tree; the list of transitions is empty.

For an unmarked state $st$, the algorithm does these things:

  • Calculate a set of numbers accessible from $st$. That is, a set of pairs $(c, s)$, where $c$ is a character and $s$ a set of positions. A position $j$ is accessible from $st$ by $c$ if there is an $i$ in st.pos such that $j$ is in follow $i$ and $i$ numbers the character $c$.
  • For each of the pairs $(c, s)$
    • If there exists a state st' (whether marked or unmarked) such that $s = $st'.pos, it adds $(c, st')$ to the transitions of $st$;
    • Otherwise, a new state $st'$ without transitions is created, added to the transitions of $st$, and $st'$ is added to the list of unmarked states.
  • It marks $st$.
The algorithm terminates only when there are no remaining unmarked states. The result is an array of states obtained from the list of marked states. The terminal states are all those containing the number associated with Accept. Here then is the algorithm in code.
let rec (compute_states : state list -> state list -> Int_set.t array 
                                   -> char option array -> state array) =
  fun marked unmarked follow chars ->
    match unmarked with
    | [] -> Array.of_list marked
    | st :: umsts ->
      let access = accessible st follow chars in
      let marked1 = st :: marked in
      let f (c, s) umsts =
        if Int_set.is_empty s then 
          umsts (*Suppress empty sets*)
            st.trans <- (c, find_state s marked1 umsts) ::st.trans ;
          | Not_found -> 
            let state1 = {pos = s; trans = []} in
            st.trans <- (c, state1) :: st.trans;
            state1 :: umsts in
      let unmarked1 = List.fold_right f access umsts in
      compute_states marked1 unmarked1 follow chars

We are just about ready to write the function to compute the automaton. It is fundamentally a call to compute_states but does one more thing. That is, it searches the resulting array for the index of the initial state and puts the index in the first slot of the array. To do this it uses the utility function array_indexq which performs the search for the index using physical equality. This is because the usual test using structural equality will not terminate on structures that loop.

let (array_indexq : 'a array -> 'a -> int) =
  fun arr e ->
    let rec loop i =
      if i = Array.length arr then
        raise (Not_found)
      else if Array.get arr i == e then i
      else loop (i + 1) in
    loop 0
So, here it is, dfa_of, the function to compute the automaton.
let (dfa_of : augmented_regexp * Int_set.t array * char option array 
                                                        -> state array) =
  fun (e, follow, chars) ->
    let init_state = {pos = first_pos e; trans = []} in
    let dfa = compute_states [] [init_state] follow chars in
    (*Installing initial state at index 0*)
    let idx_start = array_indexq dfa init_state in
    dfa.(idx_start) <- dfa.(0);
    dfa.(0) <- init_state;

We are now on the home stretch. All that remains is to write a function to interpret the automaton. To do this, we'll make use of a mini-combinator library of recognizers. I'll not provide the OCaml code for that today - you could reverse engineer from my earlier 'Recognizers' blog-post or, consult [1].

let (interpret_dfa : state array -> int -> char Recognizer.recognizer) =
  fun dfa accept ->
    let num_states = Array.length dfa in
    let fvect = Array.make (num_states) (fun _ -> failwith "no value") in
    for i = 0 to num_states - 1 do
      let trans = dfa.(i).trans in
      let f (c, st) =
        let pc = Recognizer.recognizer_of_char c in
        let j = array_indexq dfa st in
        Recognizer.compose_and pc (fun l -> fvect.(j) l) in
      let parsers = f trans in
      if Int_set.mem accept (dfa.(i).pos) then
        fvect.(i) <- compose_or_list 
                        (Recognizer.end_of_input) parsers
      else match parsers with
      | [] -> failwith "Impossible"
      | p :: ps -> fvect.(i) <- Recognizer.compose_or_list p ps
We wrap up with a couple of high level convenience functions : compile produces a recognizer from a string representation of a regular expression and match takes a recognizer (that is, a compiled regular expression) and a string and uses the recognizer to categorize the given string as admissible or not (where explode is a simple function that transforms a string into a char list - recognizers operate on lists).
let compile xpr =
    let ((e, follow, chars) as ast) = regexp_follow xpr in
    let dfa = dfa_of ast in
    let parser = interpret_dfa dfa (Array.length chars - 1) in
    fun s -> parser (explode s)

let re_match xpr s =
  let result = xpr s in
  match result with
  | Recognizer.Remains [] -> true
  | _ -> false

Here's a simple test driver that shows how these functions can be used.

let test xpr s = 
  match re_match xpr s with
  | true -> Printf.printf "\"%s\" : success\n" s
  | false -> Printf.printf "\"%s\" : fail\n" s

let _ = 
    let xpr = compile "(a|b)*abb" in
    Printf.printf "Pattern: \"%s\"\n" "(a|b)*abb" ;
    test xpr "abb" ;
    test xpr "aabb" ;
    test xpr "baabb" ;
    test xpr "bbbbbbbbbbbbbaabb" ;
    test xpr "aaaaaaabbbaabbbaabbabaabb" ;
    test xpr "baab" ;
    test xpr "aa" ;
    test xpr "ab" ;
    test xpr "bb" ;
    test xpr "" ;
    test xpr "ccabb" ;
  | Failure msg -> print_endline msg

So that's it for this series of posts on building recognizers for regular expressions. Hope you enjoyed it!

[1] "The Functional Approach to Programming" - Cousineau & Mauny
[2] "Compilers Principles, Techniques & Tools" - Aho et. al.