| 
24579
 | 
     1  | 
(*  Title:      Pure/ML/ml_lex.ML
  | 
| 
 | 
     2  | 
    ID:         $Id$
  | 
| 
 | 
     3  | 
    Author:     Makarius
  | 
| 
 | 
     4  | 
  | 
| 
 | 
     5  | 
Lexical syntax for SML.
  | 
| 
 | 
     6  | 
*)
  | 
| 
 | 
     7  | 
  | 
| 
 | 
     8  | 
signature ML_LEX =
  | 
| 
 | 
     9  | 
sig
  | 
| 
 | 
    10  | 
  datatype token_kind =
  | 
| 
24596
 | 
    11  | 
    Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
  | 
| 
24579
 | 
    12  | 
    Space | Comment | Error of string | EOF
  | 
| 
 | 
    13  | 
  eqtype token
  | 
| 
27732
 | 
    14  | 
  val stopper: token Scan.stopper
  | 
| 
24596
 | 
    15  | 
  val is_regular: token -> bool
  | 
| 
 | 
    16  | 
  val is_improper: token -> bool
  | 
| 
 | 
    17  | 
  val pos_of: token -> string
  | 
| 
24579
 | 
    18  | 
  val kind_of: token -> token_kind
  | 
| 
27817
 | 
    19  | 
  val content_of: token -> string
  | 
| 
24579
 | 
    20  | 
  val keywords: string list
  | 
| 
24596
 | 
    21  | 
  val source: (Symbol.symbol, 'a) Source.source ->
  | 
| 
27772
 | 
    22  | 
    (token, (SymbolPos.T, Position.T * (Symbol.symbol, 'a) Source.source)
  | 
| 
 | 
    23  | 
      Source.source) Source.source
  | 
| 
24579
 | 
    24  | 
end;
  | 
| 
 | 
    25  | 
  | 
| 
 | 
    26  | 
structure ML_Lex: ML_LEX =
  | 
| 
 | 
    27  | 
struct
  | 
| 
 | 
    28  | 
  | 
| 
 | 
    29  | 
(** tokens **)
  | 
| 
 | 
    30  | 
  | 
| 
 | 
    31  | 
(* datatype token *)
  | 
| 
 | 
    32  | 
  | 
| 
 | 
    33  | 
datatype token_kind =
  | 
| 
24596
 | 
    34  | 
  Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
  | 
| 
24579
 | 
    35  | 
  Space | Comment | Error of string | EOF;
  | 
| 
 | 
    36  | 
  | 
| 
27772
 | 
    37  | 
datatype token = Token of Position.range * (token_kind * string);
  | 
| 
 | 
    38  | 
  | 
| 
 | 
    39  | 
  | 
| 
 | 
    40  | 
(* position *)
  | 
| 
 | 
    41  | 
  | 
| 
 | 
    42  | 
fun position_of (Token ((pos, _), _)) = pos;
  | 
| 
 | 
    43  | 
fun end_position_of (Token ((_, pos), _)) = pos;
  | 
| 
 | 
    44  | 
  | 
| 
 | 
    45  | 
val pos_of = Position.str_of o position_of;
  | 
| 
24579
 | 
    46  | 
  | 
| 
 | 
    47  | 
  | 
| 
24596
 | 
    48  | 
(* control tokens *)
  | 
| 
24579
 | 
    49  | 
  | 
| 
27772
 | 
    50  | 
fun mk_eof pos = Token ((pos, Position.none), (EOF, ""));
  | 
| 
 | 
    51  | 
val eof = mk_eof Position.none;
  | 
| 
24579
 | 
    52  | 
  | 
| 
 | 
    53  | 
fun is_eof (Token (_, (EOF, _))) = true
  | 
| 
 | 
    54  | 
  | is_eof _ = false;
  | 
| 
 | 
    55  | 
  | 
| 
27772
 | 
    56  | 
val stopper =
  | 
| 
 | 
    57  | 
  Scan.stopper (fn [] => eof | toks => mk_eof (end_position_of (List.last toks))) is_eof;
  | 
| 
 | 
    58  | 
  | 
| 
24579
 | 
    59  | 
  | 
| 
27772
 | 
    60  | 
(* token content *)
  | 
| 
 | 
    61  | 
  | 
| 
27817
 | 
    62  | 
fun content_of (Token (_, (_, x))) = x;
  | 
| 
 | 
    63  | 
fun token_leq (tok, tok') = content_of tok <= content_of tok';
  | 
| 
27772
 | 
    64  | 
  | 
| 
 | 
    65  | 
fun kind_of (Token (_, (k, _))) = k;
  | 
| 
24579
 | 
    66  | 
  | 
| 
24596
 | 
    67  | 
fun is_regular (Token (_, (Error _, _))) = false
  | 
| 
 | 
    68  | 
  | is_regular (Token (_, (EOF, _))) = false
  | 
| 
 | 
    69  | 
  | is_regular _ = true;
  | 
| 
 | 
    70  | 
  | 
| 
 | 
    71  | 
fun is_improper (Token (_, (Space, _))) = true
  | 
| 
 | 
    72  | 
  | is_improper (Token (_, (Comment, _))) = true
  | 
| 
 | 
    73  | 
  | is_improper _ = false;
  | 
| 
 | 
    74  | 
  | 
| 
 | 
    75  | 
  | 
| 
24579
 | 
    76  | 
  | 
| 
 | 
    77  | 
(** scanners **)
  | 
| 
 | 
    78  | 
  | 
| 
27772
 | 
    79  | 
open BasicSymbolPos;
  | 
| 
24579
 | 
    80  | 
  | 
| 
27772
 | 
    81  | 
fun !!! msg = SymbolPos.!!! ("SML lexical error: " ^ msg);
 | 
| 
24579
 | 
    82  | 
  | 
| 
 | 
    83  | 
  | 
| 
27772
 | 
    84  | 
(* blanks *)
  | 
| 
24579
 | 
    85  | 
  | 
| 
27772
 | 
    86  | 
val scan_blank = Scan.one (Symbol.is_ascii_blank o symbol);
  | 
| 
 | 
    87  | 
val scan_blanks1 = Scan.repeat1 scan_blank;
  | 
| 
24579
 | 
    88  | 
  | 
| 
 | 
    89  | 
  | 
| 
 | 
    90  | 
(* keywords *)
  | 
| 
 | 
    91  | 
  | 
| 
 | 
    92  | 
val keywords = ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
 | 
| 
 | 
    93  | 
  "=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
 | 
| 
 | 
    94  | 
  "case", "datatype", "do", "else", "end", "eqtype", "exception", "fn",
  | 
| 
 | 
    95  | 
  "fun", "functor", "handle", "if", "in", "include", "infix", "infixr",
  | 
| 
 | 
    96  | 
  "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec",
  | 
| 
 | 
    97  | 
  "sharing", "sig", "signature", "struct", "structure", "then", "type",
  | 
| 
 | 
    98  | 
  "val", "where", "while", "with", "withtype"];
  | 
| 
 | 
    99  | 
  | 
| 
27772
 | 
   100  | 
val lex = Scan.make_lexicon (map explode keywords);
  | 
| 
 | 
   101  | 
fun scan_keyword x = Scan.literal lex x;
  | 
| 
24579
 | 
   102  | 
  | 
| 
 | 
   103  | 
  | 
| 
 | 
   104  | 
(* identifiers *)
  | 
| 
 | 
   105  | 
  | 
| 
24596
 | 
   106  | 
local
  | 
| 
 | 
   107  | 
  | 
| 
24579
 | 
   108  | 
val scan_letdigs =
  | 
| 
27772
 | 
   109  | 
  Scan.many ((Symbol.is_ascii_letter orf Symbol.is_ascii_digit orf Symbol.is_ascii_quasi) o symbol);
  | 
| 
24579
 | 
   110  | 
  | 
| 
27772
 | 
   111  | 
val scan_alphanumeric = Scan.one (Symbol.is_ascii_letter o symbol) -- scan_letdigs >> op ::;
  | 
| 
24579
 | 
   112  | 
  | 
| 
27772
 | 
   113  | 
val scan_symbolic = Scan.many1 (member (op =) (explode "!#$%&*+-/:<=>?@\\^`|~") o symbol);
  | 
| 
24579
 | 
   114  | 
  | 
| 
24596
 | 
   115  | 
in
  | 
| 
 | 
   116  | 
  | 
| 
24579
 | 
   117  | 
val scan_ident = scan_alphanumeric || scan_symbolic;
  | 
| 
 | 
   118  | 
  | 
| 
 | 
   119  | 
val scan_longident =
  | 
| 
27772
 | 
   120  | 
  (Scan.repeat1 (scan_alphanumeric @@@ $$$ ".") >> flat) @@@ (scan_ident || $$$ "=");
  | 
| 
24579
 | 
   121  | 
  | 
| 
27772
 | 
   122  | 
val scan_typevar = $$$ "'" @@@ scan_letdigs;
  | 
| 
24579
 | 
   123  | 
  | 
| 
24596
 | 
   124  | 
end;
  | 
| 
24579
 | 
   125  | 
  | 
| 
 | 
   126  | 
  | 
| 
 | 
   127  | 
(* numerals *)
  | 
| 
 | 
   128  | 
  | 
| 
24596
 | 
   129  | 
local
  | 
| 
 | 
   130  | 
  | 
| 
27772
 | 
   131  | 
val scan_dec = Scan.many1 (Symbol.is_ascii_digit o symbol);
  | 
| 
 | 
   132  | 
val scan_hex = Scan.many1 (Symbol.is_ascii_hex o symbol);
  | 
| 
 | 
   133  | 
val scan_sign = Scan.optional ($$$ "~") [];
  | 
| 
 | 
   134  | 
val scan_decint = scan_sign @@@ scan_dec;
  | 
| 
24579
 | 
   135  | 
  | 
| 
24596
 | 
   136  | 
in
  | 
| 
 | 
   137  | 
  | 
| 
27772
 | 
   138  | 
val scan_word =
  | 
| 
 | 
   139  | 
  $$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex ||
  | 
| 
 | 
   140  | 
  $$$ "0" @@@ $$$ "w" @@@ scan_dec;
  | 
| 
24579
 | 
   141  | 
  | 
| 
27772
 | 
   142  | 
val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec);
  | 
| 
24579
 | 
   143  | 
  | 
| 
27772
 | 
   144  | 
val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint;
  | 
| 
24579
 | 
   145  | 
  | 
| 
 | 
   146  | 
val scan_real =
  | 
| 
27772
 | 
   147  | 
  scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] ||
  | 
| 
 | 
   148  | 
  scan_decint @@@ scan_exp;
  | 
| 
24579
 | 
   149  | 
  | 
| 
24596
 | 
   150  | 
end;
  | 
| 
 | 
   151  | 
  | 
| 
24579
 | 
   152  | 
  | 
| 
 | 
   153  | 
(* chars and strings *)
  | 
| 
 | 
   154  | 
  | 
| 
24596
 | 
   155  | 
local
  | 
| 
 | 
   156  | 
  | 
| 
 | 
   157  | 
val scan_escape =
  | 
| 
27772
 | 
   158  | 
  Scan.one (member (op =) (explode "\"\\abtnvfr") o symbol) >> single ||
  | 
| 
 | 
   159  | 
  $$$ "^" @@@ (Scan.one (fn (s, _) => ord "@" <= ord s andalso ord s <= ord "_") >> single) ||
  | 
| 
 | 
   160  | 
  Scan.one (Symbol.is_ascii_digit o symbol) --
  | 
| 
 | 
   161  | 
    Scan.one (Symbol.is_ascii_digit o symbol) --
  | 
| 
 | 
   162  | 
    Scan.one (Symbol.is_ascii_digit o symbol) >> (fn ((a, b), c) => [a, b, c]);
  | 
| 
24596
 | 
   163  | 
  | 
| 
 | 
   164  | 
val scan_str =
  | 
| 
27772
 | 
   165  | 
  Scan.one (fn (s, _) => Symbol.is_printable s andalso s <> "\"" andalso s <> "\\") >> single ||
  | 
| 
 | 
   166  | 
  $$$ "\\" @@@ !!! "bad escape character in string" scan_escape;
  | 
| 
24596
 | 
   167  | 
  | 
| 
27772
 | 
   168  | 
val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\";
  | 
| 
 | 
   169  | 
val scan_gaps = Scan.repeat scan_gap >> flat;
  | 
| 
24579
 | 
   170  | 
  | 
| 
24596
 | 
   171  | 
in
  | 
| 
24579
 | 
   172  | 
  | 
| 
 | 
   173  | 
val scan_char =
  | 
| 
27772
 | 
   174  | 
  $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\"";
  | 
| 
24579
 | 
   175  | 
  | 
| 
 | 
   176  | 
val scan_string =
  | 
| 
27772
 | 
   177  | 
  $$$ "\"" @@@ !!! "missing quote at end of string"
  | 
| 
 | 
   178  | 
    ((Scan.repeat (scan_gap || scan_str) >> flat) @@@ $$$ "\"");
  | 
| 
24596
 | 
   179  | 
  | 
| 
 | 
   180  | 
end;
  | 
| 
24579
 | 
   181  | 
  | 
| 
 | 
   182  | 
  | 
| 
24596
 | 
   183  | 
(* token source *)
  | 
| 
24579
 | 
   184  | 
  | 
| 
 | 
   185  | 
local
  | 
| 
 | 
   186  | 
  | 
| 
27799
 | 
   187  | 
fun token k ss = Token (SymbolPos.range ss, (k, SymbolPos.implode ss));
  | 
| 
24579
 | 
   188  | 
  | 
| 
27772
 | 
   189  | 
val scan = !!! "bad input"
  | 
| 
 | 
   190  | 
 (scan_char >> token Char ||
  | 
| 
 | 
   191  | 
  scan_string >> token String ||
  | 
| 
 | 
   192  | 
  scan_blanks1 >> token Space ||
  | 
| 
 | 
   193  | 
  SymbolPos.scan_comment !!! >> token Comment ||
  | 
| 
 | 
   194  | 
  Scan.max token_leq
  | 
| 
 | 
   195  | 
   (scan_keyword >> token Keyword)
  | 
| 
 | 
   196  | 
   (scan_word >> token Word ||
  | 
| 
 | 
   197  | 
    scan_real >> token Real ||
  | 
| 
 | 
   198  | 
    scan_int >> token Int ||
  | 
| 
 | 
   199  | 
    scan_longident >> token LongIdent ||
  | 
| 
 | 
   200  | 
    scan_ident >> token Ident ||
  | 
| 
 | 
   201  | 
    scan_typevar >> token TypeVar));
  | 
| 
 | 
   202  | 
  | 
| 
 | 
   203  | 
fun recover msg =
  | 
| 
 | 
   204  | 
  Scan.many (((not o Symbol.is_blank) andf Symbol.is_regular) o symbol)
  | 
| 
 | 
   205  | 
  >> (fn cs => [token (Error msg) cs]);
  | 
| 
24579
 | 
   206  | 
  | 
| 
 | 
   207  | 
in
  | 
| 
 | 
   208  | 
  | 
| 
24596
 | 
   209  | 
fun source src =
  | 
| 
27772
 | 
   210  | 
  SymbolPos.source (Position.line 1) src
  | 
| 
 | 
   211  | 
  |> Source.source SymbolPos.stopper (Scan.bulk scan) (SOME (false, recover));
  | 
| 
24579
 | 
   212  | 
  | 
| 
 | 
   213  | 
end;
  | 
| 
 | 
   214  | 
  | 
| 
24596
 | 
   215  | 
end;
  | 
| 
24579
 | 
   216  | 
  |