Writing compilers and interpreters requires rigorous management of source code locations harvested during syntax analysis and associated error handling mechanisms that involve reporting those locations along with details of errors they associate to.
This article does a "deep dive" into the the Location
module of the OCaml compiler. The original source can be found in
the ocaml/parsing directory of an OCaml distribution
(copyright Xavier
Leroy).
Location is a masterclass in
using the standard library Format module. If you have
had difficulties understanding Format and what it
provides the OCaml programmer, then this is for
you. Furthermore, Location contains invaluable idioms
for error reporting & exception handling. Learn them here to be
able to apply them in your own programs.
Describing locations
A location corresponds to a range of characters in a source file.
Location defines this type and a suite of functions
for the production of location values.
type t = {
loc_start : Lexing.position;
loc_end : Lexing.position;
loc_ghost : bool
}
val none : t
val in_file : string → t
val init : Lexing.lexbuf → string → unit
val curr : Lexing.lexbuf → t
val symbol_rloc : unit → t
val symbol_gloc : unit → t
val rhs_loc : int → t
type 'a loc = {
txt : 'a;
loc : t;
}
val mkloc : 'a → t → 'a loc
val mknoloc : 'a → 'a loc
A value of the (standard library)
type Lexing.position describes a point in a source
file.
type position = {
pos_fname : string;
pos_lnum : int;
pos_bol : int;
pos_cnum : int
}
The fields of this record have the following meanings:
pos_fnameis the file namepos_lnumis the line numberpos_bolis the offset of the beginning of the line (the number of characters between the beginning of the lexbuf and the beginning of the line)pos_cnumis the offset of the position (number of characters between the beginning of the lexbuf (details below) and the position)
pos_cnum
and pos_bol is the character offset within the line
(i.e. the column number, assuming each character is one column
wide).
A location in a source file is defined by two positions : where the location starts and where the location ends.
type t = {
loc_start : position;
loc_end : position;
loc_ghost : bool
}
The third field loc_ghost is used to disambiguate
locations that do not appear explicitly in the source file. A
location will not be marked as ghost if it contains a
piece of code that is syntactically valid and corresponds to an
AST node and will be marked as a ghost location otherwise.
There is a specific value denoting a null position. It is
called none and it is defined by the
function in_file.
let in_file (name : string) : t =
let loc : position = {
pos_fname = name; (*The name of the file*)
pos_lnum = 1; (*The line number of the position*)
pos_bol = 0; (*Offset from the beginning of the lexbuf of the line*)
pos_cnum = -1; (*Offset of the position from the beginning of the lexbuf*)
} in
{ loc_start = loc; loc_end = loc; loc_ghost = true }
let none : t = in_file "_none_"
Lexing.lexbuf is the (standard library) type of
lexer buffers. A lexer buffer is the argument passed to the
scanning functions defined by generated scanners (lexical
analyzers). The lexer buffer holds the current state of the
scanner plus a function to refill the buffer from the input.
type lexbuf = {
refill_buff : lexbuf → unit;
mutable lex_buffer : bytes;
mutable lex_buffer_len : int;
mutable lex_abs_pos : int;
mutable lex_start_pos : int;
mutable lex_curr_pos : int;
mutable lex_last_pos : int;
mutable lex_last_action : int;
mutable lex_eof_reached : bool;
mutable lex_mem : int array;
mutable lex_start_p : position;
mutable lex_curr_p : position;
}
At each token, the lexing engine will copy lex_curr_p
to lex_start_p then change the pos_cnum
field of lex_curr_p by updating it with the number of
characters read since the start of the lexbuf. The
other fields are left unchanged by the the lexing engine. In order
to keep them accurate, they must be initialized before the first
use of the lexbuf and updated by the relevant lexer actions.
(*Set the file name and line number of the [lexbuf] to be the start
of the named file*)
let init (lexbuf : Lexing.lexbuf) (fname : string) : unit =
let open Lexing in
lexbuf.lex_curr_p <- {
pos_fname = fname;
pos_lnum = 1;
pos_bol = 0;
pos_cnum = 0;
}
The location of the current token in a lexbuf is computed by the
function curr.
(*Get the location of the current token from the [lexbuf]*)
let curr (lexbuf : Lexing.lexbuf) : t =
let open Lexing in {
loc_start = lexbuf.lex_start_p;
loc_end = lexbuf.lex_curr_p;
loc_ghost = false;
}
Parsing is the run-time library for parsers generated
by ocamlyacc. The functions symbol_start
and symbol_end are to be called in the action part of
a grammar rule (only). They return the offset of the string that
matches the left-hand-side of the rule : symbol_start
returns the offset of the first character; symbol_end
returns the offset after the last character. The first
character in a file is at offset 0.
symbol_start_pos and symbol_end_pos are
like symbol_start and symbol_end but
return Lexing.position values instead of offsets.
(*Compute the span of the left-hand-side of the matched rule in the
program source*)
let symbol_rloc () : t = {
loc_start = Parsing.symbol_start_pos ();
loc_end = Parsing.symbol_end_pos ();
loc_ghost = false
}
(*Same as [symbol_rloc] but designates the span as a ghost range*)
let symbol_gloc () : t = {
loc_start = Parsing.symbol_start_pos ();
loc_end = Parsing.symbol_end_pos ();
loc_ghost = true
}
The Parsing functions rhs_start
and rhs_end are the same
as symbol_start and symbol_end but
return the offset of the string matching the n-th
item on the right-hand-side of the rule where n is
the integer parameter to rhs_start
and rhs_end. n is 1 for the leftmost
item. rhs_start_pos and rhs_end_pos
return a position instead of an offset.
(*Compute the span of the [n]th item of the right-hand-side of the
matched rule*)
let rhs_loc n = {
loc_start = Parsing.rhs_start_pos n;
loc_end = Parsing.rhs_end_pos n;
loc_ghost = false;
}
The type 'a loc associates a value with a
location.
(*A type for the association of a value with a location*)
type 'a loc = {
txt : 'a;
loc : t;
}
(*Create an ['a loc] value from an ['a] value and location*)
let mkloc (txt : 'a) (loc : t) : 'a loc = { txt ; loc }
(*Create an ['a loc] value bound to the distinguished location called
[none]*)
let mknoloc (txt : 'a) : 'a loc = mkloc txt none
Error reporting with locations
Location has a framework for error reporting across
modules concerned with locations (think lexer, parser,
type-checker, etc).
open Format
type error =
{
loc : t;
msg : string;
sub : error list;
}
val error_of_printer : t → (formatter → 'a → unit) → 'a → error
val errorf_prefixed : ?loc : t → ?sub : error list → ('a, Format.formatter, unit, error) format4 → 'a
So, in the definition of the error
record, loc is a location in the source
code, msg an explanation of the error
and sub a list of related errors. We deal here with
the error formatting functions. The utility
function print_error_prefix simply writes an error
prefix to a formatter.
let error_prefix = "Error"
let warning_prefix = "Warning"
let print_error_prefix (ppf : formatter) () : unit =
fprintf ppf "@{<error>%s@}:" error_prefix;
()
The syntax, "@{<error>%s}@" associates the embedded text
with the named tag "error".
Next another utility, pp_ksprintf.
let pp_ksprintf
?(before : (formatter → unit) option)
(k : string → 'd)
(fmt : ('a, formatter, unit, 'd) format4) : 'a =
let buf = Buffer.create 64 in
let ppf = Format.formatter_of_buffer buf in
begin match before with
| None → ()
| Some f → f ppf
end;
kfprintf
(fun (_ : formatter) : 'd →
pp_print_flush ppf ();
let msg = Buffer.contents buf in
k msg)
ppf fmt
It proceeds as follows. A buffer and a formatter over that buffer
is created. When presented with all of the arguments of the format
operations implied by the fmt argument, if
the before argument is non-empty, call it on the
formatter. Finally, call kfprintf (from the standard
library Format module) which performs the format
operations on the buffer before handing control to a function that
retrieves the contents of the now formatted buffer and passes them
to the user provided continuation k.
With pp_ksprintf at our disposal, one can write
the function errorf_prefixed.
let errorf_prefixed
?(loc:t = none)
?(sub : error list = [])
(fmt : ('a, Format.formatter, unit, error) format4) : 'a =
let e : 'a =
pp_ksprintf
~before:(fun ppf → fprintf ppf "%a " print_error_prefix ())
(fun (msg : string) : error → {loc; msg; sub})
fmt
in e
errorf_prefixed computes a function. The function it
computes provides the means to produce error values
by way of formatting operations to produce the msg
field of the error result value. The formatting
operations include prefixing the msg field with
the error_prefix string. The type of the arguments of
the computed function unifies with the type
variable 'a. In other words, the type of the computed
function is 'a → error. For example, the type
of errorf_prefixed "%d %s" is int → string →
error.
The groundwork laid with errorf_prefixed above means
a polymorphic function error_of_printer can now be
produced.
let error_of_printer
(loc : t)
(printer : formatter → 'error_t → unit)
(x : 'error_t) : error =
let mk_err : 'error_t → error =
errorf_prefixed ~loc "%a@?" printer in
mk_err x
The idea is that error_of_printer is provided a
function that can format a value of type 'error. This
function is composed with errorf_prefixed thereby
producing a function of type 'error → error. For
example, we can illustrate how this works by making an error of a
simple integer with code like the following:
# error_of_printer none (fun ppf x → Format.fprintf ppf "Code %d" x) 3;;
- : error =
{loc =
{loc_start =
{pos_fname = "_none_"; pos_lnum = 1; pos_bol = 0; pos_cnum = -1};
loc_end = {pos_fname = "_none_"; pos_lnum = 1; pos_bol = 0; pos_cnum = -1};
loc_ghost = true};
msg = "Error: Code 3"; sub = []}
So, that's error_of_printer. The following utility
function is much simpler - it simply writes a given filename to a
formatter.
let print_filename (ppf : formatter) (file : string) : unit =
fprintf ppf "%s" file
Next, a set of constants for consistent messages that involve
locations and a function to get the file, line and column of a
position.
let (msg_file, msg_line, msg_chars, msg_to, msg_colon) =
("File \"", (*'msg file'*)
"\", line ", (*'msg line'*)
", characters ", (*'msg chars'*)
"-", (*'msg to'*)
":") (*'msg colon'*)
let get_pos_info pos = (pos.pos_fname, pos.pos_lnum, pos.pos_cnum - pos.pos_bol)
Making use of the above we have now print_loc : a
function to print a location on a formatter in terms of file, line
and character numbers.
let print_loc (ppf : formatter) (loc : t) : unit =
let (file, line, startchar) = get_pos_info loc.loc_start in
let endchar = loc.loc_end.pos_cnum - loc.loc_start.pos_cnum + startchar in
if file = "//toplevel//" then
fprintf ppf "Characters %i-%i"
loc.loc_start.pos_cnum loc.loc_end.pos_cnum
else begin
fprintf ppf "%s@{<loc>%a%s%i" msg_file print_filename file msg_line line;
if startchar >= 0 then
fprintf ppf "%s%i%s%i" msg_chars startchar msg_to endchar;
fprintf ppf "@}"
end
Locations generally speaking come out in a
format along the lines of: File "<string>, line 1,
characters 0-10:"
let print (ppf : formatter) (loc : t) : unit =
(* The syntax, [@{<loc>%a@}] associates the embedded text with the
named tag 'loc'*)
fprintf ppf "@{<loc>%a@}%s@." print_loc loc msg_colon
That last function, print is just a small wrapper
over print_loc that appends a colon to the location.
Exception handling involving errors with locations
This section is concerned with the following section of
the Location's signature.
val register_error_of_exn : (exn → error option) → unit
val error_of_exn : exn → error option
val error_reporter : (formatter → error → unit) ref
val report_error : formatter → error → unit
val report_exception : formatter → exn → unit
Location contains a mutable list of exception
handlers where an exception handler is a function of
type exn → error option.
let error_of_exn : (exn → error option) list ref = ref []
A function is provided that adds an exception handler to the above
list.
let register_error_of_exn f = error_of_exn := f :: !error_of_exn
The next function error_of_exn (yes, it is the only
remaining function that manipulates the
list error_exn previously defined directly) walks the
list looking for a handler returning the contents of the result of
the first such function that doesn't return a None
value.
let error_of_exn exn =
let rec loop = function
| [] → None
| f :: rest →
match f exn with
| Some _ as r → r
| None → loop rest
in
loop !error_of_exn
We define now a "default" error reporting function. Given a formatter and an error, write the error location, an explanation of the error to the formatter and the same for any associated "sub" errors.
let rec default_error_reporter
(ppf : formatter) ({loc; msg; sub} : error) : unit =
print ppf loc;
Format.pp_print_string ppf msg;
List.iter (Format.fprintf ppf "@\n@[<2>%a@]" default_error_reporter) sub
Now, error_reporter itself is a reference cell with
default value default_error_reporter.
let error_reporter = ref default_error_reporter
This next function, print_updating_num_loc_lines
looks more complicated than it is but does demonstrate a rather
advanced usage of Format by containing calls to the
functions pp_get_formatter_out_functions,
pp_set_formatter_out_functions to tempoarily replace
the default function for writing strings. The semantic of the
function is to print an error on a formatter incidentally
recording the number of lines required to do so.
(* A mutable line count*)
let num_loc_lines : int ref = ref 0
(*Prints an error on a formatter incidentally recording the number of
lines required to do so*)
let print_updating_num_loc_lines
(ppf : formatter)
(f : formatter → error → unit)
(arg : error) : unit =
(*A record of functions of output primitives*)
let out_functions : formatter_out_functions
= pp_get_formatter_out_functions ppf () in
(*The strategoy is to temporarily replace the basic function for
writing a string with this one*)
let out_string (str : string) (start : int) (len : int) : unit =
(*A function for counting line breaks in [str]. [c] is the current
count, [i] is the current char under consideration*)
let rec count (i : int) (c : int) : int=
if i = start + len then c
else if String.get str i = '\n' then count (succ i) (succ c)
else count (succ i) c
in
(*Update the count*)
num_loc_lines := !num_loc_lines + count start 0;
(*Write the string to the formatter*)
out_functions.out_string str start len
in
(*Replace the standard string output primitive with the one just
defined *)
pp_set_formatter_out_functions ppf
{out_functions with out_string} ;
(*Write the error to the formatter*)
f ppf arg ;
pp_print_flush ppf ();
(*Restore the standard primitive*)
pp_set_formatter_out_functions ppf out_functions
The function report_error uses the currently
installed error reporter to write an error report for a given
error and formatter incidentally updating a count indicating the
number of lines written.
let report_error (ppf : formatter) (err : error) : unit=
print_updating_num_loc_lines ppf !error_reporter err
This next function, report_exception_rec tries a
number of times to find a handler for a given error and if
successful formats it. In the worst case a handler is never found
and the exception propogates.
let rec report_exception_rec (n : int) (ppf : formatter) (exn : exn) : unit =
(*Try to find a handler for the exception*)
try match error_of_exn exn with
| Some err →
(*Format the resulting error using the current error reporter*)
fprintf ppf "@[%a@]@." report_error err
(*The syntax @[%a@]@ writes function output in a box followed by a
'cut' break hint*)
| None → raise exn (*A handler could not be found*)
with exn when n > 0 →
(*A handler wasn't found. Try again*)
report_exception_rec (n - 1) ppf exn
The last piece is report_exception. It attempts to
write an error report for the given exception on the provided
formatter. The exception can be re-raised if no handler is found.
let report_exception (ppf : formatter) (exn : exn) : unit =
report_exception_rec 5 ppf exn
Usage
In this section we see how an example of how the above
machinery is used. Consider defining a lexical analyzer as an
example. Suppose the scanner is defined by the
file lexer.mll (the input file
to ocamllex). We can imagine its header containing
code like the following.
{
(*The cases of lexer errors*)
type error =
| Illegal_character of char
| Unterminated_comment of Location.t
(*The lexer exception type*)
exception Error of error * Location.t
(*This function takes a formatter and an instance of type
[error] and writes a message to the formatter explaining the
meaning. This is a "printer"*)
let report_error (ppf : Format.formatter) : error → unit = function
| Illegal_character c →
Format.fprintf ppf "Illegal character (%s)" (Char.escaped c)
| Unterminated_comment _ →
Format.fprintf ppf "Comment not terminated"
(*Note that [report_error] is a function that unifies with
the [formatter → 'a → unit] parameter of
[error_of_printer]*)
(*Register an exception handler for the lexer exception type*)
let () =
Location.register_error_of_exn
(function
| Error (err, loc) →
Some (Location.error_of_printer loc report_error err)
| _ → None
)
}
/*...*/
rule token = ...
A function to handle errors with attached locations (in a REPL for
example) is expressible as an idiom as simple as something like
this.
let handle_interpreter_error ?(finally=(fun () → ())) ex =
finally ();
Location.report_exception (Format.std_formatter) ex
let safe_proc ?finally f =
try f ()
with exn → handle_interpreter_error ?finally exn