src/Pure/ML/ml_lex.ML
author wenzelm
Thu Oct 04 20:29:42 2007 +0200 (2007-10-04)
changeset 24850 0cfd722ab579
parent 24596 f1333a841b26
child 27732 8dbf5761a24a
permissions -rw-r--r--
Name.uu, Name.aT;
     1 (*  Title:      Pure/ML/ml_lex.ML
     2     ID:         $Id$
     3     Author:     Makarius
     4 
     5 Lexical syntax for SML.
     6 *)
     7 
     8 signature ML_LEX =
     9 sig
    10   datatype token_kind =
    11     Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
    12     Space | Comment | Error of string | EOF
    13   eqtype token
    14   val stopper: token * (token -> bool)
    15   val is_regular: token -> bool
    16   val is_improper: token -> bool
    17   val pos_of: token -> string
    18   val kind_of: token -> token_kind
    19   val val_of: token -> string
    20   val keywords: string list
    21   val source: (Symbol.symbol, 'a) Source.source ->
    22     (token, int * (Symbol.symbol, 'a) Source.source) Source.source
    23 end;
    24 
    25 structure ML_Lex: ML_LEX =
    26 struct
    27 
    28 (** tokens **)
    29 
    30 (* datatype token *)
    31 
    32 datatype token_kind =
    33   Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
    34   Space | Comment | Error of string | EOF;
    35 
    36 datatype token = Token of int * (token_kind * string);
    37 
    38 
    39 (* control tokens *)
    40 
    41 val eof = Token (0, (EOF, ""));
    42 
    43 fun is_eof (Token (_, (EOF, _))) = true
    44   | is_eof _ = false;
    45 
    46 val stopper = (eof, is_eof);
    47 
    48 
    49 fun is_regular (Token (_, (Error _, _))) = false
    50   | is_regular (Token (_, (EOF, _))) = false
    51   | is_regular _ = true;
    52 
    53 fun is_improper (Token (_, (Space, _))) = true
    54   | is_improper (Token (_, (Comment, _))) = true
    55   | is_improper _ = false;
    56 
    57 
    58 (* token content *)
    59 
    60 fun pos_of (Token (n, _)) = " (line " ^ string_of_int n ^ ")";
    61 
    62 fun kind_of (Token (_, (k, _))) = k;
    63 fun val_of (Token (_, (_, x))) = x;
    64 
    65 fun token_leq (Token (_, (_, x)), Token (_, (_, x'))) = x <= x';
    66 
    67 
    68 
    69 (** scanners **)
    70 
    71 (* diagnostics *)
    72 
    73 fun lex_err msg ((n, cs), _) = "SML lexical error (line " ^ string_of_int n ^ "): " ^ msg cs;
    74 fun !!! msg scan = Scan.!! (lex_err (K msg)) scan;
    75 
    76 
    77 (* line numbering *)
    78 
    79 fun incr_line scan = Scan.depend (fn (n: int) => scan >> pair (n + 1));
    80 val keep_line = Scan.lift;
    81 
    82 val scan_blank =
    83   incr_line ($$ "\n") ||
    84   keep_line (Scan.one Symbol.is_ascii_blank);
    85 
    86 val scan_blanks = Scan.repeat scan_blank >> implode;
    87 val scan_blanks1 = Scan.repeat1 scan_blank >> implode;
    88 
    89 
    90 (* keywords *)
    91 
    92 val keywords = ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
    93   "=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
    94   "case", "datatype", "do", "else", "end", "eqtype", "exception", "fn",
    95   "fun", "functor", "handle", "if", "in", "include", "infix", "infixr",
    96   "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec",
    97   "sharing", "sig", "signature", "struct", "structure", "then", "type",
    98   "val", "where", "while", "with", "withtype"];
    99 
   100 val scan_keyword = Scan.literal (Scan.make_lexicon (map explode keywords)) >> implode;
   101 
   102 
   103 (* identifiers *)
   104 
   105 local
   106 
   107 val scan_letdigs =
   108   Scan.many (Symbol.is_ascii_letter orf Symbol.is_ascii_digit orf Symbol.is_ascii_quasi) >> implode;
   109 
   110 val scan_alphanumeric = Scan.one Symbol.is_ascii_letter ^^ scan_letdigs;
   111 
   112 val scan_symbolic = Scan.many1 (member (op =) (explode "!#$%&*+-/:<=>?@\\^`|~")) >> implode;
   113 
   114 in
   115 
   116 val scan_ident = scan_alphanumeric || scan_symbolic;
   117 
   118 val scan_longident =
   119   (Scan.repeat1 (scan_alphanumeric ^^ $$ ".") >> implode) ^^ (scan_ident || $$ "=");
   120 
   121 val scan_typevar = $$ "'" ^^ scan_letdigs;
   122 
   123 end;
   124 
   125 
   126 (* numerals *)
   127 
   128 local
   129 
   130 val scan_dec = Scan.many1 Symbol.is_ascii_digit >> implode;
   131 val scan_hex = Scan.many1 Symbol.is_ascii_hex >> implode;
   132 val scan_sign = Scan.optional ($$ "~") "";
   133 val scan_decint = scan_sign ^^ scan_dec;
   134 
   135 in
   136 
   137 val scan_word = Scan.this_string "0wx" ^^ scan_hex || Scan.this_string "0w" ^^ scan_dec;
   138 
   139 val scan_int = scan_sign ^^ (Scan.this_string "0x" ^^ scan_hex || scan_dec);
   140 
   141 val scan_exp = ($$ "E" || $$ "e") ^^ scan_decint;
   142 
   143 val scan_real =
   144   scan_decint ^^ $$ "." ^^ scan_dec ^^ Scan.optional scan_exp "" ||
   145   scan_decint ^^ scan_exp;
   146 
   147 end;
   148 
   149 
   150 (* chars and strings *)
   151 
   152 local
   153 
   154 val scan_escape =
   155   Scan.one (member (op =) (explode "\"\\abtnvfr")) ||
   156   $$ "^" ^^ Scan.one (fn s => ord "@" <= ord s andalso ord s <= ord "_") ||
   157   Scan.one Symbol.is_ascii_digit ^^
   158     Scan.one Symbol.is_ascii_digit ^^
   159     Scan.one Symbol.is_ascii_digit;
   160 
   161 val scan_str =
   162   keep_line (Scan.one (fn s => Symbol.is_printable s andalso s <> "\"" andalso s <> "\\")) ||
   163   keep_line ($$ "\\") ^^ !!! "bad escape character in string" (keep_line scan_escape);
   164 
   165 val scan_gap = keep_line ($$ "\\") ^^ scan_blanks1 ^^ keep_line ($$ "\\");
   166 val scan_gaps = Scan.repeat scan_gap >> implode;
   167 
   168 in
   169 
   170 val scan_char =
   171   keep_line ($$ "#" ^^ $$ "\"") ^^ scan_gaps ^^ scan_str ^^ scan_gaps ^^ keep_line ($$ "\"");
   172 
   173 val scan_string =
   174   keep_line ($$ "\"") ^^
   175   !!! "missing quote at end of string"
   176     ((Scan.repeat (scan_gap || scan_str) >> implode) ^^ keep_line ($$ "\""));
   177 
   178 end;
   179 
   180 
   181 (* scan nested comments *)
   182 
   183 local
   184 
   185 val scan_cmt =
   186   Scan.lift scan_blank ||
   187   Scan.depend (fn (d: int) => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
   188   Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
   189   Scan.lift (keep_line ($$ "*" --| Scan.ahead (~$$ ")"))) ||
   190   Scan.lift (keep_line (Scan.one (fn s => s <> "*" andalso Symbol.is_regular s)));
   191 
   192 in
   193 
   194 val scan_comment =
   195   keep_line ($$ "(" ^^ $$ "*") ^^
   196   !!! "missing end of comment"
   197     (Scan.pass 0 (Scan.repeat scan_cmt >> implode) ^^ keep_line ($$ "*" ^^ $$ ")"));
   198 
   199 end;
   200 
   201 
   202 (* scan token *)
   203 
   204 val scan =
   205   let
   206     val scanner = Scan.state :|-- (fn n =>
   207       let
   208         fun token k x = Token (n, (k, x));
   209       in
   210         scan_char >> token Char ||
   211         scan_string >> token String ||
   212         scan_blanks1 >> token Space ||
   213         scan_comment >> token Comment ||
   214         keep_line (Scan.max token_leq
   215           (scan_keyword >> token Keyword)
   216           (scan_word >> token Word ||
   217             scan_real >> token Real ||
   218             scan_int >> token Int ||
   219             scan_longident >> token LongIdent ||
   220             scan_ident >> token Ident ||
   221             scan_typevar >> token TypeVar))
   222       end);
   223   in !! (lex_err (fn cs => "bad input " ^ quote (Symbol.beginning 10 cs))) scanner end;
   224 
   225 
   226 (* token source *)
   227 
   228 local
   229 
   230 val is_junk = (not o Symbol.is_blank) andf Symbol.is_regular;
   231 
   232 fun recover msg = Scan.state :|-- (fn n =>
   233   keep_line (Scan.many is_junk) >> (fn cs => [Token (n, (Error msg, implode cs))]));
   234 
   235 in
   236 
   237 fun source src =
   238   Source.source' 1 Symbol.stopper (Scan.bulk scan) (SOME (false, recover)) src;
   239 
   240 end;
   241 
   242 end;
   243