| 24579 |      1 | (*  Title:      Pure/ML/ml_lex.ML
 | 
|  |      2 |     Author:     Makarius
 | 
|  |      3 | 
 | 
|  |      4 | Lexical syntax for SML.
 | 
|  |      5 | *)
 | 
|  |      6 | 
 | 
|  |      7 | signature ML_LEX =
 | 
|  |      8 | sig
 | 
|  |      9 |   datatype token_kind =
 | 
| 24596 |     10 |     Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
 | 
| 24579 |     11 |     Space | Comment | Error of string | EOF
 | 
|  |     12 |   eqtype token
 | 
| 27732 |     13 |   val stopper: token Scan.stopper
 | 
| 24596 |     14 |   val is_regular: token -> bool
 | 
|  |     15 |   val is_improper: token -> bool
 | 
|  |     16 |   val pos_of: token -> string
 | 
| 24579 |     17 |   val kind_of: token -> token_kind
 | 
| 27817 |     18 |   val content_of: token -> string
 | 
| 24579 |     19 |   val keywords: string list
 | 
| 24596 |     20 |   val source: (Symbol.symbol, 'a) Source.source ->
 | 
| 27772 |     21 |     (token, (SymbolPos.T, Position.T * (Symbol.symbol, 'a) Source.source)
 | 
|  |     22 |       Source.source) Source.source
 | 
| 24579 |     23 | end;
 | 
|  |     24 | 
 | 
|  |     25 | structure ML_Lex: ML_LEX =
 | 
|  |     26 | struct
 | 
|  |     27 | 
 | 
|  |     28 | (** tokens **)
 | 
|  |     29 | 
 | 
|  |     30 | (* datatype token *)
 | 
|  |     31 | 
 | 
|  |     32 | datatype token_kind =
 | 
| 24596 |     33 |   Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
 | 
| 24579 |     34 |   Space | Comment | Error of string | EOF;
 | 
|  |     35 | 
 | 
| 27772 |     36 | datatype token = Token of Position.range * (token_kind * string);
 | 
|  |     37 | 
 | 
|  |     38 | 
 | 
|  |     39 | (* position *)
 | 
|  |     40 | 
 | 
|  |     41 | fun position_of (Token ((pos, _), _)) = pos;
 | 
|  |     42 | fun end_position_of (Token ((_, pos), _)) = pos;
 | 
|  |     43 | 
 | 
|  |     44 | val pos_of = Position.str_of o position_of;
 | 
| 24579 |     45 | 
 | 
|  |     46 | 
 | 
| 24596 |     47 | (* control tokens *)
 | 
| 24579 |     48 | 
 | 
| 27772 |     49 | fun mk_eof pos = Token ((pos, Position.none), (EOF, ""));
 | 
|  |     50 | val eof = mk_eof Position.none;
 | 
| 24579 |     51 | 
 | 
|  |     52 | fun is_eof (Token (_, (EOF, _))) = true
 | 
|  |     53 |   | is_eof _ = false;
 | 
|  |     54 | 
 | 
| 27772 |     55 | val stopper =
 | 
|  |     56 |   Scan.stopper (fn [] => eof | toks => mk_eof (end_position_of (List.last toks))) is_eof;
 | 
|  |     57 | 
 | 
| 24579 |     58 | 
 | 
| 27772 |     59 | (* token content *)
 | 
|  |     60 | 
 | 
| 27817 |     61 | fun content_of (Token (_, (_, x))) = x;
 | 
|  |     62 | fun token_leq (tok, tok') = content_of tok <= content_of tok';
 | 
| 27772 |     63 | 
 | 
|  |     64 | fun kind_of (Token (_, (k, _))) = k;
 | 
| 24579 |     65 | 
 | 
| 24596 |     66 | fun is_regular (Token (_, (Error _, _))) = false
 | 
|  |     67 |   | is_regular (Token (_, (EOF, _))) = false
 | 
|  |     68 |   | is_regular _ = true;
 | 
|  |     69 | 
 | 
|  |     70 | fun is_improper (Token (_, (Space, _))) = true
 | 
|  |     71 |   | is_improper (Token (_, (Comment, _))) = true
 | 
|  |     72 |   | is_improper _ = false;
 | 
|  |     73 | 
 | 
|  |     74 | 
 | 
| 24579 |     75 | 
 | 
|  |     76 | (** scanners **)
 | 
|  |     77 | 
 | 
| 27772 |     78 | open BasicSymbolPos;
 | 
| 24579 |     79 | 
 | 
| 27772 |     80 | fun !!! msg = SymbolPos.!!! ("SML lexical error: " ^ msg);
 | 
| 24579 |     81 | 
 | 
|  |     82 | 
 | 
| 27772 |     83 | (* blanks *)
 | 
| 24579 |     84 | 
 | 
| 27772 |     85 | val scan_blank = Scan.one (Symbol.is_ascii_blank o symbol);
 | 
|  |     86 | val scan_blanks1 = Scan.repeat1 scan_blank;
 | 
| 24579 |     87 | 
 | 
|  |     88 | 
 | 
|  |     89 | (* keywords *)
 | 
|  |     90 | 
 | 
|  |     91 | val keywords = ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
 | 
|  |     92 |   "=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
 | 
|  |     93 |   "case", "datatype", "do", "else", "end", "eqtype", "exception", "fn",
 | 
|  |     94 |   "fun", "functor", "handle", "if", "in", "include", "infix", "infixr",
 | 
|  |     95 |   "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec",
 | 
|  |     96 |   "sharing", "sig", "signature", "struct", "structure", "then", "type",
 | 
|  |     97 |   "val", "where", "while", "with", "withtype"];
 | 
|  |     98 | 
 | 
| 27772 |     99 | val lex = Scan.make_lexicon (map explode keywords);
 | 
|  |    100 | fun scan_keyword x = Scan.literal lex x;
 | 
| 24579 |    101 | 
 | 
|  |    102 | 
 | 
|  |    103 | (* identifiers *)
 | 
|  |    104 | 
 | 
| 24596 |    105 | local
 | 
|  |    106 | 
 | 
| 24579 |    107 | val scan_letdigs =
 | 
| 27772 |    108 |   Scan.many ((Symbol.is_ascii_letter orf Symbol.is_ascii_digit orf Symbol.is_ascii_quasi) o symbol);
 | 
| 24579 |    109 | 
 | 
| 27772 |    110 | val scan_alphanumeric = Scan.one (Symbol.is_ascii_letter o symbol) -- scan_letdigs >> op ::;
 | 
| 24579 |    111 | 
 | 
| 27772 |    112 | val scan_symbolic = Scan.many1 (member (op =) (explode "!#$%&*+-/:<=>?@\\^`|~") o symbol);
 | 
| 24579 |    113 | 
 | 
| 24596 |    114 | in
 | 
|  |    115 | 
 | 
| 24579 |    116 | val scan_ident = scan_alphanumeric || scan_symbolic;
 | 
|  |    117 | 
 | 
|  |    118 | val scan_longident =
 | 
| 27772 |    119 |   (Scan.repeat1 (scan_alphanumeric @@@ $$$ ".") >> flat) @@@ (scan_ident || $$$ "=");
 | 
| 24579 |    120 | 
 | 
| 27772 |    121 | val scan_typevar = $$$ "'" @@@ scan_letdigs;
 | 
| 24579 |    122 | 
 | 
| 24596 |    123 | end;
 | 
| 24579 |    124 | 
 | 
|  |    125 | 
 | 
|  |    126 | (* numerals *)
 | 
|  |    127 | 
 | 
| 24596 |    128 | local
 | 
|  |    129 | 
 | 
| 27772 |    130 | val scan_dec = Scan.many1 (Symbol.is_ascii_digit o symbol);
 | 
|  |    131 | val scan_hex = Scan.many1 (Symbol.is_ascii_hex o symbol);
 | 
|  |    132 | val scan_sign = Scan.optional ($$$ "~") [];
 | 
|  |    133 | val scan_decint = scan_sign @@@ scan_dec;
 | 
| 24579 |    134 | 
 | 
| 24596 |    135 | in
 | 
|  |    136 | 
 | 
| 27772 |    137 | val scan_word =
 | 
|  |    138 |   $$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex ||
 | 
|  |    139 |   $$$ "0" @@@ $$$ "w" @@@ scan_dec;
 | 
| 24579 |    140 | 
 | 
| 27772 |    141 | val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec);
 | 
| 24579 |    142 | 
 | 
| 27772 |    143 | val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint;
 | 
| 24579 |    144 | 
 | 
|  |    145 | val scan_real =
 | 
| 27772 |    146 |   scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] ||
 | 
|  |    147 |   scan_decint @@@ scan_exp;
 | 
| 24579 |    148 | 
 | 
| 24596 |    149 | end;
 | 
|  |    150 | 
 | 
| 24579 |    151 | 
 | 
|  |    152 | (* chars and strings *)
 | 
|  |    153 | 
 | 
| 24596 |    154 | local
 | 
|  |    155 | 
 | 
|  |    156 | val scan_escape =
 | 
| 27772 |    157 |   Scan.one (member (op =) (explode "\"\\abtnvfr") o symbol) >> single ||
 | 
|  |    158 |   $$$ "^" @@@ (Scan.one (fn (s, _) => ord "@" <= ord s andalso ord s <= ord "_") >> single) ||
 | 
|  |    159 |   Scan.one (Symbol.is_ascii_digit o symbol) --
 | 
|  |    160 |     Scan.one (Symbol.is_ascii_digit o symbol) --
 | 
|  |    161 |     Scan.one (Symbol.is_ascii_digit o symbol) >> (fn ((a, b), c) => [a, b, c]);
 | 
| 24596 |    162 | 
 | 
|  |    163 | val scan_str =
 | 
| 27772 |    164 |   Scan.one (fn (s, _) => Symbol.is_printable s andalso s <> "\"" andalso s <> "\\") >> single ||
 | 
|  |    165 |   $$$ "\\" @@@ !!! "bad escape character in string" scan_escape;
 | 
| 24596 |    166 | 
 | 
| 27772 |    167 | val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\";
 | 
|  |    168 | val scan_gaps = Scan.repeat scan_gap >> flat;
 | 
| 24579 |    169 | 
 | 
| 24596 |    170 | in
 | 
| 24579 |    171 | 
 | 
|  |    172 | val scan_char =
 | 
| 27772 |    173 |   $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\"";
 | 
| 24579 |    174 | 
 | 
|  |    175 | val scan_string =
 | 
| 27772 |    176 |   $$$ "\"" @@@ !!! "missing quote at end of string"
 | 
|  |    177 |     ((Scan.repeat (scan_gap || scan_str) >> flat) @@@ $$$ "\"");
 | 
| 24596 |    178 | 
 | 
|  |    179 | end;
 | 
| 24579 |    180 | 
 | 
|  |    181 | 
 | 
| 24596 |    182 | (* token source *)
 | 
| 24579 |    183 | 
 | 
|  |    184 | local
 | 
|  |    185 | 
 | 
| 27799 |    186 | fun token k ss = Token (SymbolPos.range ss, (k, SymbolPos.implode ss));
 | 
| 24579 |    187 | 
 | 
| 27772 |    188 | val scan = !!! "bad input"
 | 
|  |    189 |  (scan_char >> token Char ||
 | 
|  |    190 |   scan_string >> token String ||
 | 
|  |    191 |   scan_blanks1 >> token Space ||
 | 
|  |    192 |   SymbolPos.scan_comment !!! >> token Comment ||
 | 
|  |    193 |   Scan.max token_leq
 | 
|  |    194 |    (scan_keyword >> token Keyword)
 | 
|  |    195 |    (scan_word >> token Word ||
 | 
|  |    196 |     scan_real >> token Real ||
 | 
|  |    197 |     scan_int >> token Int ||
 | 
|  |    198 |     scan_longident >> token LongIdent ||
 | 
|  |    199 |     scan_ident >> token Ident ||
 | 
|  |    200 |     scan_typevar >> token TypeVar));
 | 
|  |    201 | 
 | 
|  |    202 | fun recover msg =
 | 
|  |    203 |   Scan.many (((not o Symbol.is_blank) andf Symbol.is_regular) o symbol)
 | 
|  |    204 |   >> (fn cs => [token (Error msg) cs]);
 | 
| 24579 |    205 | 
 | 
|  |    206 | in
 | 
|  |    207 | 
 | 
| 24596 |    208 | fun source src =
 | 
| 27772 |    209 |   SymbolPos.source (Position.line 1) src
 | 
|  |    210 |   |> Source.source SymbolPos.stopper (Scan.bulk scan) (SOME (false, recover));
 | 
| 24579 |    211 | 
 | 
|  |    212 | end;
 | 
|  |    213 | 
 | 
| 24596 |    214 | end;
 | 
| 24579 |    215 | 
 |