src/Pure/ML/ml_lex.ML
author wenzelm
Thu Oct 04 20:29:42 2007 +0200 (2007-10-04)
changeset 24850 0cfd722ab579
parent 24596 f1333a841b26
child 27732 8dbf5761a24a
permissions -rw-r--r--
Name.uu, Name.aT;
wenzelm@24579
     1
(*  Title:      Pure/ML/ml_lex.ML
wenzelm@24579
     2
    ID:         $Id$
wenzelm@24579
     3
    Author:     Makarius
wenzelm@24579
     4
wenzelm@24579
     5
Lexical syntax for SML.
wenzelm@24579
     6
*)
wenzelm@24579
     7
wenzelm@24579
     8
signature ML_LEX =
wenzelm@24579
     9
sig
wenzelm@24579
    10
  datatype token_kind =
wenzelm@24596
    11
    Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
wenzelm@24579
    12
    Space | Comment | Error of string | EOF
wenzelm@24579
    13
  eqtype token
wenzelm@24579
    14
  val stopper: token * (token -> bool)
wenzelm@24596
    15
  val is_regular: token -> bool
wenzelm@24596
    16
  val is_improper: token -> bool
wenzelm@24596
    17
  val pos_of: token -> string
wenzelm@24579
    18
  val kind_of: token -> token_kind
wenzelm@24579
    19
  val val_of: token -> string
wenzelm@24579
    20
  val keywords: string list
wenzelm@24596
    21
  val source: (Symbol.symbol, 'a) Source.source ->
wenzelm@24596
    22
    (token, int * (Symbol.symbol, 'a) Source.source) Source.source
wenzelm@24579
    23
end;
wenzelm@24579
    24
wenzelm@24579
    25
structure ML_Lex: ML_LEX =
wenzelm@24579
    26
struct
wenzelm@24579
    27
wenzelm@24579
    28
(** tokens **)
wenzelm@24579
    29
wenzelm@24579
    30
(* datatype token *)
wenzelm@24579
    31
wenzelm@24579
    32
datatype token_kind =
wenzelm@24596
    33
  Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
wenzelm@24579
    34
  Space | Comment | Error of string | EOF;
wenzelm@24579
    35
wenzelm@24579
    36
datatype token = Token of int * (token_kind * string);
wenzelm@24579
    37
wenzelm@24579
    38
wenzelm@24596
    39
(* control tokens *)
wenzelm@24579
    40
wenzelm@24579
    41
val eof = Token (0, (EOF, ""));
wenzelm@24579
    42
wenzelm@24579
    43
fun is_eof (Token (_, (EOF, _))) = true
wenzelm@24579
    44
  | is_eof _ = false;
wenzelm@24579
    45
wenzelm@24579
    46
val stopper = (eof, is_eof);
wenzelm@24579
    47
wenzelm@24579
    48
wenzelm@24596
    49
fun is_regular (Token (_, (Error _, _))) = false
wenzelm@24596
    50
  | is_regular (Token (_, (EOF, _))) = false
wenzelm@24596
    51
  | is_regular _ = true;
wenzelm@24596
    52
wenzelm@24596
    53
fun is_improper (Token (_, (Space, _))) = true
wenzelm@24596
    54
  | is_improper (Token (_, (Comment, _))) = true
wenzelm@24596
    55
  | is_improper _ = false;
wenzelm@24596
    56
wenzelm@24596
    57
wenzelm@24596
    58
(* token content *)
wenzelm@24596
    59
wenzelm@24596
    60
fun pos_of (Token (n, _)) = " (line " ^ string_of_int n ^ ")";
wenzelm@24579
    61
wenzelm@24579
    62
fun kind_of (Token (_, (k, _))) = k;
wenzelm@24579
    63
fun val_of (Token (_, (_, x))) = x;
wenzelm@24579
    64
wenzelm@24579
    65
fun token_leq (Token (_, (_, x)), Token (_, (_, x'))) = x <= x';
wenzelm@24579
    66
wenzelm@24579
    67
wenzelm@24579
    68
wenzelm@24579
    69
(** scanners **)
wenzelm@24579
    70
wenzelm@24579
    71
(* diagnostics *)
wenzelm@24579
    72
wenzelm@24579
    73
fun lex_err msg ((n, cs), _) = "SML lexical error (line " ^ string_of_int n ^ "): " ^ msg cs;
wenzelm@24579
    74
fun !!! msg scan = Scan.!! (lex_err (K msg)) scan;
wenzelm@24579
    75
wenzelm@24579
    76
wenzelm@24579
    77
(* line numbering *)
wenzelm@24579
    78
wenzelm@24596
    79
fun incr_line scan = Scan.depend (fn (n: int) => scan >> pair (n + 1));
wenzelm@24579
    80
val keep_line = Scan.lift;
wenzelm@24579
    81
wenzelm@24579
    82
val scan_blank =
wenzelm@24579
    83
  incr_line ($$ "\n") ||
wenzelm@24579
    84
  keep_line (Scan.one Symbol.is_ascii_blank);
wenzelm@24579
    85
wenzelm@24579
    86
val scan_blanks = Scan.repeat scan_blank >> implode;
wenzelm@24579
    87
val scan_blanks1 = Scan.repeat1 scan_blank >> implode;
wenzelm@24579
    88
wenzelm@24579
    89
wenzelm@24579
    90
(* keywords *)
wenzelm@24579
    91
wenzelm@24579
    92
val keywords = ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
wenzelm@24579
    93
  "=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
wenzelm@24579
    94
  "case", "datatype", "do", "else", "end", "eqtype", "exception", "fn",
wenzelm@24579
    95
  "fun", "functor", "handle", "if", "in", "include", "infix", "infixr",
wenzelm@24579
    96
  "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec",
wenzelm@24579
    97
  "sharing", "sig", "signature", "struct", "structure", "then", "type",
wenzelm@24579
    98
  "val", "where", "while", "with", "withtype"];
wenzelm@24579
    99
wenzelm@24579
   100
val scan_keyword = Scan.literal (Scan.make_lexicon (map explode keywords)) >> implode;
wenzelm@24579
   101
wenzelm@24579
   102
wenzelm@24579
   103
(* identifiers *)
wenzelm@24579
   104
wenzelm@24596
   105
local
wenzelm@24596
   106
wenzelm@24579
   107
val scan_letdigs =
wenzelm@24579
   108
  Scan.many (Symbol.is_ascii_letter orf Symbol.is_ascii_digit orf Symbol.is_ascii_quasi) >> implode;
wenzelm@24579
   109
wenzelm@24579
   110
val scan_alphanumeric = Scan.one Symbol.is_ascii_letter ^^ scan_letdigs;
wenzelm@24579
   111
wenzelm@24579
   112
val scan_symbolic = Scan.many1 (member (op =) (explode "!#$%&*+-/:<=>?@\\^`|~")) >> implode;
wenzelm@24579
   113
wenzelm@24596
   114
in
wenzelm@24596
   115
wenzelm@24579
   116
val scan_ident = scan_alphanumeric || scan_symbolic;
wenzelm@24579
   117
wenzelm@24579
   118
val scan_longident =
wenzelm@24579
   119
  (Scan.repeat1 (scan_alphanumeric ^^ $$ ".") >> implode) ^^ (scan_ident || $$ "=");
wenzelm@24579
   120
wenzelm@24579
   121
val scan_typevar = $$ "'" ^^ scan_letdigs;
wenzelm@24579
   122
wenzelm@24596
   123
end;
wenzelm@24579
   124
wenzelm@24579
   125
wenzelm@24579
   126
(* numerals *)
wenzelm@24579
   127
wenzelm@24596
   128
local
wenzelm@24596
   129
wenzelm@24579
   130
val scan_dec = Scan.many1 Symbol.is_ascii_digit >> implode;
wenzelm@24579
   131
val scan_hex = Scan.many1 Symbol.is_ascii_hex >> implode;
wenzelm@24579
   132
val scan_sign = Scan.optional ($$ "~") "";
wenzelm@24579
   133
val scan_decint = scan_sign ^^ scan_dec;
wenzelm@24579
   134
wenzelm@24596
   135
in
wenzelm@24596
   136
wenzelm@24579
   137
val scan_word = Scan.this_string "0wx" ^^ scan_hex || Scan.this_string "0w" ^^ scan_dec;
wenzelm@24579
   138
wenzelm@24579
   139
val scan_int = scan_sign ^^ (Scan.this_string "0x" ^^ scan_hex || scan_dec);
wenzelm@24579
   140
wenzelm@24579
   141
val scan_exp = ($$ "E" || $$ "e") ^^ scan_decint;
wenzelm@24579
   142
wenzelm@24579
   143
val scan_real =
wenzelm@24579
   144
  scan_decint ^^ $$ "." ^^ scan_dec ^^ Scan.optional scan_exp "" ||
wenzelm@24579
   145
  scan_decint ^^ scan_exp;
wenzelm@24579
   146
wenzelm@24596
   147
end;
wenzelm@24596
   148
wenzelm@24579
   149
wenzelm@24579
   150
(* chars and strings *)
wenzelm@24579
   151
wenzelm@24596
   152
local
wenzelm@24596
   153
wenzelm@24596
   154
val scan_escape =
wenzelm@24596
   155
  Scan.one (member (op =) (explode "\"\\abtnvfr")) ||
wenzelm@24596
   156
  $$ "^" ^^ Scan.one (fn s => ord "@" <= ord s andalso ord s <= ord "_") ||
wenzelm@24596
   157
  Scan.one Symbol.is_ascii_digit ^^
wenzelm@24596
   158
    Scan.one Symbol.is_ascii_digit ^^
wenzelm@24596
   159
    Scan.one Symbol.is_ascii_digit;
wenzelm@24596
   160
wenzelm@24596
   161
val scan_str =
wenzelm@24596
   162
  keep_line (Scan.one (fn s => Symbol.is_printable s andalso s <> "\"" andalso s <> "\\")) ||
wenzelm@24596
   163
  keep_line ($$ "\\") ^^ !!! "bad escape character in string" (keep_line scan_escape);
wenzelm@24596
   164
wenzelm@24579
   165
val scan_gap = keep_line ($$ "\\") ^^ scan_blanks1 ^^ keep_line ($$ "\\");
wenzelm@24579
   166
val scan_gaps = Scan.repeat scan_gap >> implode;
wenzelm@24579
   167
wenzelm@24596
   168
in
wenzelm@24579
   169
wenzelm@24579
   170
val scan_char =
wenzelm@24579
   171
  keep_line ($$ "#" ^^ $$ "\"") ^^ scan_gaps ^^ scan_str ^^ scan_gaps ^^ keep_line ($$ "\"");
wenzelm@24579
   172
wenzelm@24579
   173
val scan_string =
wenzelm@24579
   174
  keep_line ($$ "\"") ^^
wenzelm@24596
   175
  !!! "missing quote at end of string"
wenzelm@24596
   176
    ((Scan.repeat (scan_gap || scan_str) >> implode) ^^ keep_line ($$ "\""));
wenzelm@24596
   177
wenzelm@24596
   178
end;
wenzelm@24579
   179
wenzelm@24579
   180
wenzelm@24579
   181
(* scan nested comments *)
wenzelm@24579
   182
wenzelm@24596
   183
local
wenzelm@24596
   184
wenzelm@24579
   185
val scan_cmt =
wenzelm@24579
   186
  Scan.lift scan_blank ||
wenzelm@24596
   187
  Scan.depend (fn (d: int) => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
wenzelm@24579
   188
  Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
wenzelm@24579
   189
  Scan.lift (keep_line ($$ "*" --| Scan.ahead (~$$ ")"))) ||
wenzelm@24579
   190
  Scan.lift (keep_line (Scan.one (fn s => s <> "*" andalso Symbol.is_regular s)));
wenzelm@24579
   191
wenzelm@24596
   192
in
wenzelm@24596
   193
wenzelm@24579
   194
val scan_comment =
wenzelm@24579
   195
  keep_line ($$ "(" ^^ $$ "*") ^^
wenzelm@24596
   196
  !!! "missing end of comment"
wenzelm@24596
   197
    (Scan.pass 0 (Scan.repeat scan_cmt >> implode) ^^ keep_line ($$ "*" ^^ $$ ")"));
wenzelm@24596
   198
wenzelm@24596
   199
end;
wenzelm@24579
   200
wenzelm@24579
   201
wenzelm@24579
   202
(* scan token *)
wenzelm@24579
   203
wenzelm@24579
   204
val scan =
wenzelm@24579
   205
  let
wenzelm@24579
   206
    val scanner = Scan.state :|-- (fn n =>
wenzelm@24579
   207
      let
wenzelm@24579
   208
        fun token k x = Token (n, (k, x));
wenzelm@24579
   209
      in
wenzelm@24579
   210
        scan_char >> token Char ||
wenzelm@24579
   211
        scan_string >> token String ||
wenzelm@24579
   212
        scan_blanks1 >> token Space ||
wenzelm@24579
   213
        scan_comment >> token Comment ||
wenzelm@24579
   214
        keep_line (Scan.max token_leq
wenzelm@24579
   215
          (scan_keyword >> token Keyword)
wenzelm@24596
   216
          (scan_word >> token Word ||
wenzelm@24596
   217
            scan_real >> token Real ||
wenzelm@24596
   218
            scan_int >> token Int ||
wenzelm@24596
   219
            scan_longident >> token LongIdent ||
wenzelm@24579
   220
            scan_ident >> token Ident ||
wenzelm@24596
   221
            scan_typevar >> token TypeVar))
wenzelm@24579
   222
      end);
wenzelm@24579
   223
  in !! (lex_err (fn cs => "bad input " ^ quote (Symbol.beginning 10 cs))) scanner end;
wenzelm@24579
   224
wenzelm@24579
   225
wenzelm@24596
   226
(* token source *)
wenzelm@24579
   227
wenzelm@24579
   228
local
wenzelm@24579
   229
wenzelm@24579
   230
val is_junk = (not o Symbol.is_blank) andf Symbol.is_regular;
wenzelm@24579
   231
wenzelm@24579
   232
fun recover msg = Scan.state :|-- (fn n =>
wenzelm@24579
   233
  keep_line (Scan.many is_junk) >> (fn cs => [Token (n, (Error msg, implode cs))]));
wenzelm@24579
   234
wenzelm@24579
   235
in
wenzelm@24579
   236
wenzelm@24596
   237
fun source src =
wenzelm@24596
   238
  Source.source' 1 Symbol.stopper (Scan.bulk scan) (SOME (false, recover)) src;
wenzelm@24579
   239
wenzelm@24579
   240
end;
wenzelm@24579
   241
wenzelm@24596
   242
end;
wenzelm@24579
   243