🐫

TaPLのコード解説(Chapter4)

2021/09/23に公開

parser

TaPL(Types and Programming Language)を読んでいるとOCamlのサンプルコードが載っている章があります。

これら章には型は書いてあるものの、Lexerや、Parserなどについては書いておりません。そのため、HPからImplementationsに飛ぶとソースコードが載っています。そこで完全なコードが書かれています。

英語でコメントも付いていますが、まずは概観が理解ができるように本記事を書いています。

初回の今回はChapter4のコードに関して触れていきます。

コードは私がホームページから持ってきて、日本語でコメントしたものをGitHubに置いており、そちらから抜粋したコードをここでは載せています。

まず、本文にあるMakefileによるビルドよりも記述が簡単(主観)なDuneを使うようにしてみました。

`dune`

以下のように記述するとMakefileいらずで起動ができます。

パット見、duneで使われている文法は目新しいかもしれませんが、構文木をそのまま書いているだけで、targetは生成ファイルでdepsが依存ファイル。actionで実際にどうするかを記述しています。envの部分はビルド時のwarningを抑えるためのフラグが書かれています。

dune

(rule
 (target lexer.ml)
 (deps lexer.mll)
 (action
  (chdir
   %{workspace_root}
   (run %{bin:ocamllex} -q -o %{target} %{deps}))))

(rule
 (targets parser.ml parser.mli)
 (deps parser.mly)
 (action
  (chdir
   %{workspace_root}
   (run %{bin:ocamlyacc} %{deps}))))

(env
 (dev
  (flags
   (:standard -w -a))))

(executable
 (name main))

また、dune-projectは生成時のまま使っています。

dune-project

(lang dune 2.9)

全体像

各ファイルの説明

このプロジェクトではdune、dune-project、.ocamlformatとテストで使うtest.fを除くと.ml、.mli、.mll、.mlyの4つの拡張子の付いたファイルがあります。

.mlは実装ファイルで定義を書きます。
.mliはインターフェースファイルで、仕様を書きます。
.mllはocamllexというソフトが字句解析プログラムを生成するのに使います。
.mlyはocamlyaccというソフトが構文解析プログラムを生成するのに使います。

このうち.mliのみコメントがありしっかりと作者は理解をしてほしそうです。ということで、今回はこちらのファイルに焦点を絞って解説をします。

4種類の総行数は次の通りで短いです。

❯ fd "(ml|mli|mll|mly)$" | xargs wc -l
      49 core.ml
       9 core.mli
     216 lexer.mll
     100 main.ml
     141 parser.mly
      73 support.ml
      57 support.mli
     102 syntax.ml
      24 syntax.mli
     771 total

依存関係の順に説明していきます。

`support.ml` `support.mli`

ここでは各種ファイルで使われる関数が書かれています。

最初はPervasiveというサブモジュールのインターフェースを制限しています。ここではprのシグネチャが書かれています。これは標準出力に使われており、短縮して呼び出せるようにしています。

support.mli

(* module Support

   Collects a number of low-level facilities used by the other modules
   in the typechecker/evaluator.
*)

(* ------------------------------------------------------------------------ *)
(* Some pervasive abbreviations -- opened everywhere by convention *)
(* 短縮形の定義 *)

module Pervasive : sig
  val pr : string -> unit
end

次の部分ではErrorというサブモジュールのインターフェースを制限しています。

最初にファイル情報に関する型や関数仕様を書いています。

supprot.mli

(* ------------------------------------------------------------------------ *)
(* Error printing utilities -- opened everywhere by convention *)

module Error : sig
  (* An exception raised by the low-level error printer; exported
     here so that it can be caught in module Main and converted into
     an exit status for the whole program. *)
  (* ポインタに関するエラーが生じた際に使う例外 *)
  exception Exit of int

  (* An element of the type info represents a "file position": a
     file name, line number, and character position within the line.
     Used for printing error messages. *)
  (* ファイル情報に関する要素に使う型 *)
  type info

  (* ダミーに使われる値 *)
  val dummyinfo : info

  (* Create file position info: filename lineno column *)
  (* ファイル情報に関するデータを作る関数 *)
  val createInfo : string -> int -> int -> info

  (* ファイル情報を出力する関数 *)
  val printInfo : info -> unit

次の部分はパーサーで用いるファイル情報の型やそれに関する関数の仕様が書かれています。

support.mli

  (* A convenient datatype for a "value with file info."  Used in
     the lexer and parser. *)
  (* パーサーで用いるファイル情報に関する型 *)
  type 'a withinfo = { i : info; v : 'a }

  (* Print an error message and fail.  The printing function is called
     in a context where the formatter is processing an hvbox.  Insert
     calls to Format.print_space to print a space or, if necessary,
     break the line at that point. *)
  (* エラーメッセージを出してプログラムを終了する *)
  val errf : (unit -> unit) -> 'a

  (* エラーがどこで起きたかを出力 *)
  val errfAt : info -> (unit -> unit) -> 'a

  (* Convenient wrappers for the above, for the common case where the
     action to be performed is just to print a given string. *)
  (* 上の関数をまとめる関数 *)
  val err : string -> 'a

  (* 同上 *)
  val error : info -> string -> 'a

  (* Variants that print a message but do not fail afterwards *)
  (* 上の場合はプログラムが終了するが警告だけで終了はしない *)
  val warning : string -> unit

  (* 警告がどこで起きたかを出力 *)
  val warningAt : info -> string -> unit
end

support.mlではそれらの実装が書かれています。

support.ml

open Format

module Error = struct
  exception Exit of int

  type info = FI of string * int * int | UNKNOWN

  type 'a withinfo = { i : info; v : 'a }

  let dummyinfo = UNKNOWN

  let createInfo f l c = FI (f, l, c)

  let errf f =
    print_flush ();
    open_vbox 0;
    open_hvbox 0;
    f ();
    print_cut ();
    close_box ();
    print_newline ();
    raise (Exit 1)

  let printInfo =
    (* In the text of the book, file positions in error messages are replaced
       with the string "Error:" *)
    function
    | FI (f, l, c) ->
        print_string f;
        print_string ":";
        print_int l;
        print_string ".";
        print_int c;
        print_string ":"
    | UNKNOWN -> print_string "<Unknown file and line>: "

  let errfAt fi f =
    errf (fun () ->
        printInfo fi;
        print_space ();
        f ())

  let err s =
    errf (fun () ->
        print_string "Error: ";
        print_string s;
        print_newline ())

  let error fi s =
    errfAt fi (fun () ->
        print_string s;
        print_newline ())

  let warning s =
    print_string "Warning: ";
    print_string s;
    print_newline ()

  let warningAt fi s =
    printInfo fi;
    print_string " Warning: ";
    print_string s;
    print_newline ()
end

(* ---------------------------------------------------------------------- *)

module Pervasive = struct
  type info = Error.info

  let pr = Format.print_string
end
(* module pervasive *)

`lexer.mll` `parser.mly`

これはocamllexとocamlyaccというレクサー、パーサーのコードを生成するツールに関するコードです。

ocamllexの基本的な文法は以下のようになっています。

{ header }
let ident = regexp …
[refill { refill-handler }]
rule entrypoint [arg1… argn] =
  parse regexp { action }
      | …
      | regexp { action }
and entrypoint [arg1… argn] =
  parse …
and …
{ trailer }

最初と最後に{}で囲まれたheaderとtrailerがあり、ヘッダーでは予約語を決めています。
headerとentrypointの間にはidentというよく使う正規表現が定義できます。これを書くことで後で労力を節約できるというわけです。TaPLのサンプルコードではここに関数がまとめられています。
entrypointでは実際に字句解析する際の文法を正規表現で書き、その際に実行する処理を{ action }内に書きます。正規表現は最大までマッチするようになっています。

細かい文法はHPを参照ください。

lexerl.mll

(* 
   The lexical analyzer: lexer.ml is generated automatically
   from lexer.mll.
   
   The only modification commonly needed here is adding new keywords to the 
   list of reserved words at the top.  
*)

{
open Support.Error

(* 予約語 *)
let reservedWords = [
  (* Keywords *)
  (* 予約キーワード *)
  ("import", fun i -> Parser.IMPORT i);
  ("if", fun i -> Parser.IF i);
  ("then", fun i -> Parser.THEN i);
  ("else", fun i -> Parser.ELSE i);
  ("true", fun i -> Parser.TRUE i);
  ("false", fun i -> Parser.FALSE i);
  ("succ", fun i -> Parser.SUCC i);
  ("pred", fun i -> Parser.PRED i);
  ("iszero", fun i -> Parser.ISZERO i);
  
  (* 予約記号 *)
  (* Symbols *)
  ("_", fun i -> Parser.USCORE i);
  ("'", fun i -> Parser.APOSTROPHE i);
  ("\"", fun i -> Parser.DQUOTE i);
  ("!", fun i -> Parser.BANG i);
  ("#", fun i -> Parser.HASH i);
  ("$", fun i -> Parser.TRIANGLE i);
  ("*", fun i -> Parser.STAR i);
  ("|", fun i -> Parser.VBAR i);
  (".", fun i -> Parser.DOT i);
  (";", fun i -> Parser.SEMI i);
  (",", fun i -> Parser.COMMA i);
  ("/", fun i -> Parser.SLASH i);
  (":", fun i -> Parser.COLON i);
  ("::", fun i -> Parser.COLONCOLON i);
  ("=", fun i -> Parser.EQ i);
  ("==", fun i -> Parser.EQEQ i);
  ("[", fun i -> Parser.LSQUARE i); 
  ("<", fun i -> Parser.LT i);
  ("{", fun i -> Parser.LCURLY i); 
  ("(", fun i -> Parser.LPAREN i); 
  ("<-", fun i -> Parser.LEFTARROW i); 
  ("{|", fun i -> Parser.LCURLYBAR i); 
  ("[|", fun i -> Parser.LSQUAREBAR i); 
  ("}", fun i -> Parser.RCURLY i);
  (")", fun i -> Parser.RPAREN i);
  ("]", fun i -> Parser.RSQUARE i);
  (">", fun i -> Parser.GT i);
  ("|}", fun i -> Parser.BARRCURLY i);
  ("|>", fun i -> Parser.BARGT i);
  ("|]", fun i -> Parser.BARRSQUARE i);

  (* Special compound symbols: *)
  (* 上の記号が複数結合してできた記号 *)
  (":=", fun i -> Parser.COLONEQ i);
  ("->", fun i -> Parser.ARROW i);
  ("=>", fun i -> Parser.DARROW i);
  ("==>", fun i -> Parser.DDARROW i);
]

(* Support functions *)
(* 諸々の補助関数 *)
type buildfun = info -> Parser.token
let (symbolTable : (string,buildfun) Hashtbl.t) = Hashtbl.create 1024
let _ =
  List.iter (fun (str,f) -> Hashtbl.add symbolTable str f) reservedWords

let createID i str =
  try (Hashtbl.find symbolTable str) i
  with _ ->
    if (String.get str 0) >= 'A' && (String.get str 0) <= 'Z' then
       Parser.UCID {i=i;v=str}
    else 
       Parser.LCID {i=i;v=str}

let lineno   = ref 1
and depth    = ref 0
and start    = ref 0

and filename = ref ""
and startLex = ref dummyinfo

let create inFile stream =
  if not (Filename.is_implicit inFile) then filename := inFile
  else filename := Filename.concat (Sys.getcwd()) inFile;
  lineno := 1; start := 0; Lexing.from_channel stream

let newline lexbuf = incr lineno; start := (Lexing.lexeme_start lexbuf)

let info lexbuf =
  createInfo (!filename) (!lineno) (Lexing.lexeme_start lexbuf - !start)

let text = Lexing.lexeme

let stringBuffer = ref (Bytes.create 2048)
let stringEnd = ref 0

let resetStr () = stringEnd := 0

let addStr ch =
  let x = !stringEnd in
  let buffer = !stringBuffer
in
  if x = Bytes.length buffer then
    begin
      let newBuffer = Bytes.create (x*2) in
      Bytes.blit buffer 0 newBuffer 0 x;
      Bytes.set newBuffer x ch;
      stringBuffer := newBuffer;
      stringEnd := x+1
    end
  else
    begin
      Bytes.set buffer x ch;
      stringEnd := x+1
    end

let getStr () = Bytes.sub_string (!stringBuffer) 0 (!stringEnd)

let extractLineno yytext offset =
  int_of_string (String.sub yytext offset (String.length yytext - offset))
}


(* The main body of the lexical analyzer *)
(* 字句解析 *)
rule main = parse
  [' ' '\009' '\012']+     { main lexbuf }

| [' ' '\009' '\012']*"\n" { newline lexbuf; main lexbuf }

| "*/" { error (info lexbuf) "Unmatched end of comment" }

| "/*" { depth := 1; startLex := info lexbuf; comment lexbuf; main lexbuf }

| "# " ['0'-'9']+
    { lineno := extractLineno (text lexbuf) 2 - 1; getFile lexbuf }

| "# line " ['0'-'9']+
    { lineno := extractLineno (text lexbuf) 7 - 1; getFile lexbuf }

| ['0'-'9']+
    { Parser.INTV{i=info lexbuf; v=int_of_string (text lexbuf)} }

| ['0'-'9']+ '.' ['0'-'9']+
    { Parser.FLOATV{i=info lexbuf; v=float_of_string (text lexbuf)} }

| ['A'-'Z' 'a'-'z' '_']
  ['A'-'Z' 'a'-'z' '_' '0'-'9' '\'']*
    { createID (info lexbuf) (text lexbuf) }

| ":=" | "<:" | "<-" | "->" | "=>" | "==>"
| "{|" | "|}" | "<|" | "|>" | "[|" | "|]" | "=="
    { createID (info lexbuf) (text lexbuf) }

| ['~' '%' '\\' '+' '-' '&' '|' ':' '@' '`' '$']+
    { createID (info lexbuf) (text lexbuf) }

| ['*' '#' '/' '!' '?' '^' '(' ')' '{' '}' '[' ']' '<' '>' '.' ';' '_' ','
   '=' '\'']
    { createID (info lexbuf) (text lexbuf) }

| "\"" { resetStr(); startLex := info lexbuf; string lexbuf }

| eof { Parser.EOF(info lexbuf) }

| _  { error (info lexbuf) "Illegal character" }

(* コメント *)
and comment = parse
  "/*"
    { depth := succ !depth; comment lexbuf }
| "*/"
    { depth := pred !depth; if !depth > 0 then comment lexbuf }
| eof
    { error (!startLex) "Comment not terminated" }
| [^ '\n']
    { comment lexbuf }
| "\n"
    { newline lexbuf; comment lexbuf }

(* getFile, getName, finishNameでファイル名を取得している *)
and getFile = parse
  " "* "\"" { getName lexbuf }

and getName = parse
  [^ '"' '\n']+ { filename := (text lexbuf); finishName lexbuf }

and finishName = parse
  '"' [^ '\n']* { main lexbuf }

(* 文字列 *)
and string = parse
  '"'  { Parser.STRINGV {i = !startLex; v=getStr()} }
| '\\' { addStr(escaped lexbuf); string lexbuf }
| '\n' { addStr '\n'; newline lexbuf; string lexbuf }
| eof  { error (!startLex) "String not terminated" }
| _    { addStr (Lexing.lexeme_char lexbuf 0); string lexbuf }

(* 文字列のエスケープ文字 *)
and escaped = parse
  'n'	 { '\n' }
| 't'	 { '\t' }
| '\\'	 { '\\' }
| '"'    { '\034'  }
| '\''	 { '\'' }
| ['0'-'9']['0'-'9']['0'-'9']
    {
      let x = int_of_string(text lexbuf) in
      if x > 255 then
	error (info lexbuf) "Illegal character constant"
      else
	Char.chr x
    }
| [^ '"' '\\' 't' 'n' '\'']
    { error (info lexbuf) "Illegal character constant" }

(*  *)

ocamlyaccのファイルは以下のように書かれている。

%{
  header
%}
  declarations
%%
  rules
%%
  trailer

parser.mlyにはheaderとtrailerはなく、declarationsに当たる部分でトークンの宣言がされ、rulesに当たる部分でパースの規則を書いている。字句解析と同様にrulesではパターンマッチをして、{ }内に処理を書いています。

parser.mly

/*  
 *  Yacc grammar for the parser.  The files parser.mli and parser.ml
 *  are generated automatically from parser.mly.
 */

%{
open Support.Error
open Support.Pervasive
open Syntax
%}

/* ---------------------------------------------------------------------- */
/* Preliminaries */

/* We first list all the tokens mentioned in the parsing rules
   below.  The names of the tokens are common to the parser and the
   generated lexical analyzer.  Each token is annotated with the type
   of data that it carries; normally, this is just file information
   (which is used by the parser to annotate the abstract syntax trees
   that it constructs), but sometimes -- in the case of identifiers and
   constant values -- more information is provided.
 */

/* Keyword tokens */
(* トークン *)
%token <Support.Error.info> IMPORT
%token <Support.Error.info> IF
%token <Support.Error.info> THEN
%token <Support.Error.info> ELSE
%token <Support.Error.info> TRUE
%token <Support.Error.info> FALSE
%token <Support.Error.info> SUCC
%token <Support.Error.info> PRED
%token <Support.Error.info> ISZERO

/* Identifier and constant value tokens */
(* 識別子、定数に使う *)
%token <string Support.Error.withinfo> UCID  /* uppercase-initial */
%token <string Support.Error.withinfo> LCID  /* lowercase/symbolic-initial */
%token <int Support.Error.withinfo> INTV
%token <float Support.Error.withinfo> FLOATV
%token <string Support.Error.withinfo> STRINGV

/* Symbolic tokens */
(* 記号に使う *)
%token <Support.Error.info> APOSTROPHE
%token <Support.Error.info> DQUOTE
%token <Support.Error.info> ARROW
%token <Support.Error.info> BANG
%token <Support.Error.info> BARGT
%token <Support.Error.info> BARRCURLY
%token <Support.Error.info> BARRSQUARE
%token <Support.Error.info> COLON
%token <Support.Error.info> COLONCOLON
%token <Support.Error.info> COLONEQ
%token <Support.Error.info> COLONHASH
%token <Support.Error.info> COMMA
%token <Support.Error.info> DARROW
%token <Support.Error.info> DDARROW
%token <Support.Error.info> DOT
%token <Support.Error.info> EOF
%token <Support.Error.info> EQ
%token <Support.Error.info> EQEQ
%token <Support.Error.info> EXISTS
%token <Support.Error.info> GT
%token <Support.Error.info> HASH
%token <Support.Error.info> LCURLY
%token <Support.Error.info> LCURLYBAR
%token <Support.Error.info> LEFTARROW
%token <Support.Error.info> LPAREN
%token <Support.Error.info> LSQUARE
%token <Support.Error.info> LSQUAREBAR
%token <Support.Error.info> LT
%token <Support.Error.info> RCURLY
%token <Support.Error.info> RPAREN
%token <Support.Error.info> RSQUARE
%token <Support.Error.info> SEMI
%token <Support.Error.info> SLASH
%token <Support.Error.info> STAR
%token <Support.Error.info> TRIANGLE
%token <Support.Error.info> USCORE
%token <Support.Error.info> VBAR

/* ---------------------------------------------------------------------- */
/* The starting production of the generated parser is the syntactic class
   toplevel.  The type that is returned when a toplevel is recognized is
   Syntax.command list.
*/

%start toplevel
%type < Syntax.command list > toplevel
%%

/* ---------------------------------------------------------------------- */
/* Main body of the parser definition */

/* The top level of a file is a sequence of commands, each terminated
   by a semicolon. */
(* ここから開始 *)
toplevel :
    EOF
      { [] }
  | Command SEMI toplevel
      { let cmd = $1 in
          let cmds = $3 in
          cmd::cmds }

/* A top-level command */
(* インポートもしくは Term を生成 *)
Command :
    IMPORT STRINGV { (Import($2.v)) }
  | Term 
      { (let t = $1 in Eval(tmInfo t,t)) }

(* ATerm もしくは AppTerm を生成 *)
Term :
    AppTerm
      { $1 }
  | IF Term THEN Term ELSE Term
      { TmIf($1, $2, $4, $6) }

(* ATerm を生成 *)
AppTerm :
    ATerm
      { $1 }
  | SUCC ATerm
      { TmSucc($1, $2) }
  | PRED ATerm
      { TmPred($1, $2) }
  | ISZERO ATerm
      { TmIsZero($1, $2) }

/* Atomic terms are ones that never require extra parentheses */
(* 終端 *)
ATerm :
    LPAREN Term RPAREN  
      { $2 } 
  | TRUE
      { TmTrue($1) }
  | FALSE
      { TmFalse($1) }
  | INTV
      { let rec f n = match n with
              0 -> TmZero($1.i)
            | n -> TmSucc($1.i, f (n-1))
          in f $1.v }


/*   */

`syntax.ml` `syntax.mli`

これはそのままですね。

syntax.mli

(* module Syntax: syntax trees and associated support functions *)

open Support.Pervasive
open Support.Error

(* Data type definitions *)
type term =
  | TmTrue of info
  | TmFalse of info
  | TmIf of info * term * term * term
  | TmZero of info
  | TmSucc of info * term
  | TmPred of info * term
  | TmIsZero of info * term

type command = Import of string | Eval of info * term

(* Printing *)
val printtm : term -> unit

val printtm_ATerm : bool -> term -> unit

(* Misc *)
val tmInfo : term -> info

`core.ml` `core.mli`

すべてのtermは評価するとtermになるということです。

core.mli

(* module Core

   Core typechecking and evaluation functions
*)

open Syntax
open Support.Error

val eval : term -> term

`main.ml`

これらを組み合わせてこの処理を実行します。すると本書で触れられている言語が動きます。

main.ml

(* Module Main: The main program.  Deals with processing the command
   line, reading files, building and connecting lexers and parsers, etc.

   For most experiments with the implementation, it should not be
   necessary to change this file.
*)

open Format
open Support.Pervasive
open Support.Error
open Syntax
open Core

let searchpath = ref [ "" ]

let argDefs =
  [
    ( "-I",
      Arg.String (fun f -> searchpath := f :: !searchpath),
      "Append a directory to the search path" );
  ]

let parseArgs () =
  let inFile = ref (None : string option) in
  Arg.parse argDefs
    (fun s ->
      match !inFile with
      | Some _ -> err "You must specify exactly one input file"
      | None -> inFile := Some s)
    "";
  match !inFile with
  | None -> err "You must specify an input file"
  | Some s -> s

let openfile infile =
  let rec trynext l =
    match l with
    | [] -> err ("Could not find " ^ infile)
    | d :: rest -> (
        let name = if d = "" then infile else d ^ "/" ^ infile in
        try open_in name with Sys_error m -> trynext rest)
  in
  trynext !searchpath

let parseFile inFile =
  let pi = openfile inFile in
  let lexbuf = Lexer.create inFile pi in
  let result =
    try Parser.toplevel Lexer.main lexbuf
    with Parsing.Parse_error -> error (Lexer.info lexbuf) "Parse error"
  in
  Parsing.clear_parser ();
  close_in pi;
  result

let alreadyImported = ref ([] : string list)

let rec process_file f =
  if List.mem f !alreadyImported then ()
  else (
    alreadyImported := f :: !alreadyImported;
    let cmds = parseFile f in
    let g c =
      open_hvbox 0;
      let results = process_command c in
      print_flush ();
      results
    in
    List.iter g cmds)

and process_command cmd =
  match cmd with
  | Import f -> process_file f
  | Eval (fi, t) ->
      let t' = eval t in
      printtm_ATerm true t';
      force_newline ();
      ()

let main () =
  let inFile = parseArgs () in
  let _ = process_file inFile in
  ()

let () = set_max_boxes 1000

let () = set_margin 67

let res =
  Printexc.catch
    (fun () ->
      try
        main ();
        0
      with Exit x -> x)
    ()

let () = print_flush ()

let () = exit res

注意

ちなみに自分の環境だと.mlyファイルに日本語のコメントを書くとエラーとなるので日本語のコメントはあくまで記事のためという理解でよろしくお願いします。

TaPLのコード解説(Chapter4)

`dune`

全体像

各ファイルの説明

`support.ml` `support.mli`

`lexer.mll` `parser.mly`

`syntax.ml` `syntax.mli`

`core.ml` `core.mli`

`main.ml`

注意

参考

Discussion

dune

全体像

各ファイルの説明

support.ml support.mli

lexer.mll parser.mly

syntax.ml syntax.mli

core.ml core.mli

main.ml

注意

参考

Discussion

`dune`

`support.ml` `support.mli`

`lexer.mll` `parser.mly`

`syntax.ml` `syntax.mli`

`core.ml` `core.mli`

`main.ml`