src/Pure/ML/ml_lex.ML
author wenzelm
Sat Nov 04 15:24:40 2017 +0100 (20 months ago)
changeset 67003 49850a679c2c
parent 66067 cdbcb417db67
child 67362 221612c942de
permissions -rw-r--r--
more robust sorted_entries;
wenzelm@24579
     1
(*  Title:      Pure/ML/ml_lex.ML
wenzelm@24579
     2
    Author:     Makarius
wenzelm@24579
     3
wenzelm@59109
     4
Lexical syntax for Isabelle/ML and Standard ML.
wenzelm@24579
     5
*)
wenzelm@24579
     6
wenzelm@24579
     7
signature ML_LEX =
wenzelm@24579
     8
sig
wenzelm@55505
     9
  val keywords: string list
wenzelm@24579
    10
  datatype token_kind =
wenzelm@59082
    11
    Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String |
wenzelm@61596
    12
    Space | Comment | Error of string | EOF
wenzelm@24579
    13
  eqtype token
wenzelm@27732
    14
  val stopper: token Scan.stopper
wenzelm@24596
    15
  val is_regular: token -> bool
wenzelm@24596
    16
  val is_improper: token -> bool
wenzelm@30683
    17
  val set_range: Position.range -> token -> token
wenzelm@59112
    18
  val range_of: token -> Position.range
wenzelm@30636
    19
  val pos_of: token -> Position.T
wenzelm@31474
    20
  val end_pos_of: token -> Position.T
wenzelm@24579
    21
  val kind_of: token -> token_kind
wenzelm@27817
    22
  val content_of: token -> string
wenzelm@31426
    23
  val check_content_of: token -> string
wenzelm@31332
    24
  val flatten: token list -> string
wenzelm@24596
    25
  val source: (Symbol.symbol, 'a) Source.source ->
wenzelm@30573
    26
    (token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source)
wenzelm@27772
    27
      Source.source) Source.source
wenzelm@30591
    28
  val tokenize: string -> token list
wenzelm@59067
    29
  val read_pos: Position.T -> Symbol_Pos.text -> token Antiquote.antiquote list
wenzelm@59067
    30
  val read: Symbol_Pos.text -> token Antiquote.antiquote list
wenzelm@58978
    31
  val read_set_range: Position.range -> Symbol_Pos.text -> token Antiquote.antiquote list
wenzelm@59064
    32
  val read_source: bool -> Input.source -> token Antiquote.antiquote list
wenzelm@24579
    33
end;
wenzelm@24579
    34
wenzelm@24579
    35
structure ML_Lex: ML_LEX =
wenzelm@24579
    36
struct
wenzelm@24579
    37
wenzelm@55505
    38
(** keywords **)
wenzelm@55505
    39
wenzelm@55505
    40
val keywords =
wenzelm@55505
    41
 ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>", "[",
wenzelm@55505
    42
  "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as", "case",
wenzelm@55505
    43
  "datatype", "do", "else", "end", "eqtype", "exception", "fn", "fun",
wenzelm@55505
    44
  "functor", "handle", "if", "in", "include", "infix", "infixr",
wenzelm@55505
    45
  "let", "local", "nonfix", "of", "op", "open", "orelse", "raise",
wenzelm@55505
    46
  "rec", "sharing", "sig", "signature", "struct", "structure", "then",
wenzelm@55505
    47
  "type", "val", "where", "while", "with", "withtype"];
wenzelm@55505
    48
wenzelm@55505
    49
val keywords2 =
wenzelm@58933
    50
 ["and", "case", "do", "else", "end", "if", "in", "let", "local", "of",
wenzelm@55505
    51
  "sig", "struct", "then", "while", "with"];
wenzelm@55505
    52
wenzelm@55505
    53
val keywords3 =
wenzelm@55505
    54
 ["handle", "open", "raise"];
wenzelm@55505
    55
wenzelm@55505
    56
val lexicon = Scan.make_lexicon (map raw_explode keywords);
wenzelm@55505
    57
wenzelm@55505
    58
wenzelm@55505
    59
wenzelm@24579
    60
(** tokens **)
wenzelm@24579
    61
wenzelm@24579
    62
(* datatype token *)
wenzelm@24579
    63
wenzelm@24579
    64
datatype token_kind =
wenzelm@59082
    65
  Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String |
wenzelm@61596
    66
  Space | Comment | Error of string | EOF;
wenzelm@24579
    67
wenzelm@27772
    68
datatype token = Token of Position.range * (token_kind * string);
wenzelm@27772
    69
wenzelm@27772
    70
wenzelm@27772
    71
(* position *)
wenzelm@27772
    72
wenzelm@30683
    73
fun set_range range (Token (_, x)) = Token (range, x);
wenzelm@59112
    74
fun range_of (Token (range, _)) = range;
wenzelm@30683
    75
wenzelm@59112
    76
val pos_of = #1 o range_of;
wenzelm@59112
    77
val end_pos_of = #2 o range_of;
wenzelm@24579
    78
wenzelm@24579
    79
wenzelm@58855
    80
(* stopper *)
wenzelm@24579
    81
wenzelm@27772
    82
fun mk_eof pos = Token ((pos, Position.none), (EOF, ""));
wenzelm@27772
    83
val eof = mk_eof Position.none;
wenzelm@24579
    84
wenzelm@24579
    85
fun is_eof (Token (_, (EOF, _))) = true
wenzelm@24579
    86
  | is_eof _ = false;
wenzelm@24579
    87
wenzelm@27772
    88
val stopper =
wenzelm@30636
    89
  Scan.stopper (fn [] => eof | toks => mk_eof (end_pos_of (List.last toks))) is_eof;
wenzelm@27772
    90
wenzelm@24579
    91
wenzelm@27772
    92
(* token content *)
wenzelm@27772
    93
wenzelm@31426
    94
fun kind_of (Token (_, (k, _))) = k;
wenzelm@31426
    95
wenzelm@27817
    96
fun content_of (Token (_, (_, x))) = x;
wenzelm@27817
    97
fun token_leq (tok, tok') = content_of tok <= content_of tok';
wenzelm@27772
    98
wenzelm@55505
    99
fun is_keyword (Token (_, (Keyword, _))) = true
wenzelm@55505
   100
  | is_keyword _ = false;
wenzelm@55505
   101
wenzelm@55505
   102
fun is_delimiter (Token (_, (Keyword, x))) = not (Symbol.is_ascii_identifier x)
wenzelm@55505
   103
  | is_delimiter _ = false;
wenzelm@55505
   104
wenzelm@41502
   105
fun is_regular (Token (_, (Error _, _))) = false
wenzelm@41502
   106
  | is_regular (Token (_, (EOF, _))) = false
wenzelm@41502
   107
  | is_regular _ = true;
wenzelm@41502
   108
wenzelm@41502
   109
fun is_improper (Token (_, (Space, _))) = true
wenzelm@41502
   110
  | is_improper (Token (_, (Comment, _))) = true
wenzelm@41502
   111
  | is_improper _ = false;
wenzelm@41502
   112
wenzelm@40337
   113
fun warn tok =
wenzelm@40337
   114
  (case tok of
wenzelm@40337
   115
    Token (_, (Keyword, ":>")) =>
wenzelm@40337
   116
      warning ("Opaque signature matching (:>) fails to work with ML pretty printing --\n\
wenzelm@40337
   117
        \prefer non-opaque matching (:) possibly with abstype" ^
wenzelm@48992
   118
        Position.here (pos_of tok))
wenzelm@40337
   119
  | _ => ());
wenzelm@40337
   120
wenzelm@31426
   121
fun check_content_of tok =
wenzelm@30636
   122
  (case kind_of tok of
wenzelm@30645
   123
    Error msg => error msg
wenzelm@40337
   124
  | _ => content_of tok);
wenzelm@30636
   125
wenzelm@59112
   126
wenzelm@59112
   127
(* flatten *)
wenzelm@59112
   128
wenzelm@41502
   129
fun flatten_content (tok :: (toks as tok' :: _)) =
wenzelm@41502
   130
      Symbol.escape (check_content_of tok) ::
wenzelm@41502
   131
        (if is_improper tok orelse is_improper tok' then flatten_content toks
wenzelm@41502
   132
         else Symbol.space :: flatten_content toks)
wenzelm@41502
   133
  | flatten_content toks = map (Symbol.escape o check_content_of) toks;
wenzelm@31332
   134
wenzelm@41502
   135
val flatten = implode o flatten_content;
wenzelm@24596
   136
wenzelm@24596
   137
wenzelm@30614
   138
(* markup *)
wenzelm@30614
   139
wenzelm@37195
   140
local
wenzelm@37195
   141
wenzelm@56278
   142
fun token_kind_markup SML =
wenzelm@59124
   143
 fn Type_Var => (Markup.ML_tvar, "")
wenzelm@59082
   144
  | Word => (Markup.ML_numeral, "")
wenzelm@59082
   145
  | Int => (Markup.ML_numeral, "")
wenzelm@59082
   146
  | Real => (Markup.ML_numeral, "")
wenzelm@59082
   147
  | Char => (Markup.ML_char, "")
wenzelm@59082
   148
  | String => (if SML then Markup.SML_string else Markup.ML_string, "")
wenzelm@59082
   149
  | Comment => (if SML then Markup.SML_comment else Markup.ML_comment, "")
wenzelm@64677
   150
  | Error msg => (Markup.bad (), msg)
wenzelm@59124
   151
  | _ => (Markup.empty, "");
wenzelm@30614
   152
wenzelm@37195
   153
in
wenzelm@37195
   154
wenzelm@61457
   155
fun token_report SML (tok as Token ((pos, _), (kind, x))) =
wenzelm@55505
   156
  let
wenzelm@55505
   157
    val (markup, txt) =
wenzelm@56278
   158
      if not (is_keyword tok) then token_kind_markup SML kind
wenzelm@55505
   159
      else if is_delimiter tok then (Markup.ML_delimiter, "")
wenzelm@66067
   160
      else if member (op =) keywords2 x then (Markup.ML_keyword2 |> Markup.keyword_properties, "")
wenzelm@66067
   161
      else if member (op =) keywords3 x then (Markup.ML_keyword3 |> Markup.keyword_properties, "")
wenzelm@66067
   162
      else (Markup.ML_keyword1 |> Markup.keyword_properties, "");
wenzelm@48768
   163
  in ((pos, markup), txt) end;
wenzelm@37195
   164
wenzelm@37195
   165
end;
wenzelm@30614
   166
wenzelm@30614
   167
wenzelm@24579
   168
wenzelm@24579
   169
(** scanners **)
wenzelm@24579
   170
wenzelm@30573
   171
open Basic_Symbol_Pos;
wenzelm@24579
   172
wenzelm@55105
   173
val err_prefix = "SML lexical error: ";
wenzelm@55105
   174
wenzelm@55105
   175
fun !!! msg = Symbol_Pos.!!! (fn () => err_prefix ^ msg);
wenzelm@24579
   176
wenzelm@24579
   177
wenzelm@24579
   178
(* identifiers *)
wenzelm@24579
   179
wenzelm@24596
   180
local
wenzelm@24596
   181
wenzelm@24579
   182
val scan_letdigs =
wenzelm@55496
   183
  Scan.many (Symbol.is_ascii_letdig o Symbol_Pos.symbol);
wenzelm@24579
   184
wenzelm@40525
   185
val scan_alphanumeric =
wenzelm@61814
   186
  Scan.one (Symbol.is_ascii_letter o Symbol_Pos.symbol) ::: scan_letdigs;
wenzelm@24579
   187
wenzelm@40525
   188
val scan_symbolic =
wenzelm@40627
   189
  Scan.many1 (member (op =) (raw_explode "!#$%&*+-/:<=>?@\\^`|~") o Symbol_Pos.symbol);
wenzelm@24579
   190
wenzelm@24596
   191
in
wenzelm@24596
   192
wenzelm@24579
   193
val scan_ident = scan_alphanumeric || scan_symbolic;
wenzelm@24579
   194
wenzelm@55496
   195
val scan_long_ident =
wenzelm@61476
   196
  Scan.repeats1 (scan_alphanumeric @@@ $$$ ".") @@@ (scan_ident || $$$ "=");
wenzelm@24579
   197
wenzelm@55496
   198
val scan_type_var = $$$ "'" @@@ scan_letdigs;
wenzelm@24579
   199
wenzelm@24596
   200
end;
wenzelm@24579
   201
wenzelm@24579
   202
wenzelm@24579
   203
(* numerals *)
wenzelm@24579
   204
wenzelm@24596
   205
local
wenzelm@24596
   206
wenzelm@40525
   207
val scan_dec = Scan.many1 (Symbol.is_ascii_digit o Symbol_Pos.symbol);
wenzelm@40525
   208
val scan_hex = Scan.many1 (Symbol.is_ascii_hex o Symbol_Pos.symbol);
wenzelm@27772
   209
val scan_sign = Scan.optional ($$$ "~") [];
wenzelm@27772
   210
val scan_decint = scan_sign @@@ scan_dec;
wenzelm@55496
   211
val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint;
wenzelm@24579
   212
wenzelm@24596
   213
in
wenzelm@24596
   214
wenzelm@27772
   215
val scan_word =
wenzelm@27772
   216
  $$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex ||
wenzelm@27772
   217
  $$$ "0" @@@ $$$ "w" @@@ scan_dec;
wenzelm@24579
   218
wenzelm@27772
   219
val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec);
wenzelm@24579
   220
wenzelm@63204
   221
val scan_rat = scan_decint @@@ Scan.optional ($$$ "/" @@@ scan_dec) [];
wenzelm@63204
   222
wenzelm@24579
   223
val scan_real =
wenzelm@27772
   224
  scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] ||
wenzelm@27772
   225
  scan_decint @@@ scan_exp;
wenzelm@24579
   226
wenzelm@24596
   227
end;
wenzelm@24596
   228
wenzelm@24579
   229
wenzelm@24579
   230
(* chars and strings *)
wenzelm@24579
   231
wenzelm@55496
   232
val scan_blanks1 = Scan.many1 (Symbol.is_ascii_blank o Symbol_Pos.symbol);
wenzelm@55496
   233
wenzelm@24596
   234
local
wenzelm@24596
   235
wenzelm@24596
   236
val scan_escape =
wenzelm@40627
   237
  Scan.one (member (op =) (raw_explode "\"\\abtnvfr") o Symbol_Pos.symbol) >> single ||
wenzelm@64275
   238
  $$$ "^" @@@
wenzelm@64275
   239
    (Scan.one (fn (s, _) => Char.ord #"@" <= ord s andalso ord s <= Char.ord #"_") >> single) ||
wenzelm@40525
   240
  Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) --
wenzelm@40525
   241
    Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) --
wenzelm@40525
   242
    Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) >> (fn ((a, b), c) => [a, b, c]);
wenzelm@24596
   243
wenzelm@24596
   244
val scan_str =
wenzelm@58854
   245
  Scan.one (fn (s, _) => Symbol.not_eof s andalso s <> "\"" andalso s <> "\\" andalso
wenzelm@30600
   246
    (not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single ||
wenzelm@27772
   247
  $$$ "\\" @@@ !!! "bad escape character in string" scan_escape;
wenzelm@24596
   248
wenzelm@27772
   249
val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\";
wenzelm@61476
   250
val scan_gaps = Scan.repeats scan_gap;
wenzelm@24579
   251
wenzelm@24596
   252
in
wenzelm@24579
   253
wenzelm@24579
   254
val scan_char =
wenzelm@27772
   255
  $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\"";
wenzelm@24579
   256
wenzelm@48743
   257
val recover_char =
wenzelm@48743
   258
  $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ Scan.optional (Scan.permissive scan_str @@@ scan_gaps) [];
wenzelm@48743
   259
wenzelm@24579
   260
val scan_string =
wenzelm@55103
   261
  Scan.ahead ($$ "\"") |--
wenzelm@55103
   262
    !!! "unclosed string literal"
wenzelm@61476
   263
      ($$$ "\"" @@@ Scan.repeats (scan_gap || scan_str) @@@ $$$ "\"");
wenzelm@24596
   264
wenzelm@48743
   265
val recover_string =
wenzelm@61476
   266
  $$$ "\"" @@@ Scan.repeats (scan_gap || Scan.permissive scan_str);
wenzelm@48743
   267
wenzelm@24596
   268
end;
wenzelm@24579
   269
wenzelm@24579
   270
wenzelm@63204
   271
(* rat antiquotation *)
wenzelm@63204
   272
wenzelm@63204
   273
val rat_name = Symbol_Pos.explode ("Pure.rat ", Position.none);
wenzelm@63204
   274
wenzelm@63204
   275
val scan_rat_antiq =
wenzelm@63204
   276
  Symbol_Pos.scan_pos -- ($$ "@" |-- Symbol_Pos.scan_pos -- scan_rat) -- Symbol_Pos.scan_pos
wenzelm@63204
   277
    >> (fn ((pos1, (pos2, body)), pos3) =>
wenzelm@63204
   278
      {start = Position.range_position (pos1, pos2),
wenzelm@63204
   279
       stop = Position.none,
wenzelm@63204
   280
       range = Position.range (pos1, pos3),
wenzelm@63204
   281
       body = rat_name @ body});
wenzelm@63204
   282
wenzelm@63204
   283
wenzelm@30645
   284
(* scan tokens *)
wenzelm@24579
   285
wenzelm@24579
   286
local
wenzelm@24579
   287
wenzelm@30593
   288
fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss));
wenzelm@24579
   289
wenzelm@30593
   290
val scan_ml =
wenzelm@27772
   291
 (scan_char >> token Char ||
wenzelm@27772
   292
  scan_string >> token String ||
wenzelm@27772
   293
  scan_blanks1 >> token Space ||
wenzelm@55105
   294
  Symbol_Pos.scan_comment err_prefix >> token Comment ||
wenzelm@27772
   295
  Scan.max token_leq
wenzelm@55505
   296
   (Scan.literal lexicon >> token Keyword)
wenzelm@27772
   297
   (scan_word >> token Word ||
wenzelm@27772
   298
    scan_real >> token Real ||
wenzelm@27772
   299
    scan_int >> token Int ||
wenzelm@59082
   300
    scan_long_ident >> token Long_Ident ||
wenzelm@27772
   301
    scan_ident >> token Ident ||
wenzelm@59082
   302
    scan_type_var >> token Type_Var));
wenzelm@27772
   303
wenzelm@59112
   304
val scan_sml = scan_ml >> Antiquote.Text;
wenzelm@59112
   305
wenzelm@59112
   306
val scan_ml_antiq =
wenzelm@61471
   307
  Antiquote.scan_control >> Antiquote.Control ||
wenzelm@59112
   308
  Antiquote.scan_antiq >> Antiquote.Antiq ||
wenzelm@63204
   309
  scan_rat_antiq >> Antiquote.Antiq ||
wenzelm@59112
   310
  scan_ml >> Antiquote.Text;
wenzelm@30645
   311
wenzelm@27772
   312
fun recover msg =
wenzelm@48743
   313
 (recover_char ||
wenzelm@48743
   314
  recover_string ||
wenzelm@59112
   315
  Symbol_Pos.recover_cartouche ||
wenzelm@48743
   316
  Symbol_Pos.recover_comment ||
wenzelm@48743
   317
  Scan.one (Symbol.not_eof o Symbol_Pos.symbol) >> single)
wenzelm@48743
   318
  >> (single o token (Error msg));
wenzelm@24579
   319
wenzelm@56278
   320
fun gen_read SML pos text =
wenzelm@37198
   321
  let
wenzelm@59110
   322
    val syms =
wenzelm@59110
   323
      Symbol_Pos.explode (text, pos)
wenzelm@59110
   324
      |> SML ? maps (fn (s, p) => raw_explode s |> map (rpair p));
wenzelm@59110
   325
wenzelm@37209
   326
    val termination =
wenzelm@37209
   327
      if null syms then []
wenzelm@37209
   328
      else
wenzelm@37209
   329
        let
wenzelm@37209
   330
          val pos1 = List.last syms |-> Position.advance;
wenzelm@37209
   331
          val pos2 = Position.advance Symbol.space pos1;
wenzelm@62797
   332
        in [Antiquote.Text (Token (Position.range (pos1, pos2), (Space, Symbol.space)))] end;
wenzelm@56278
   333
wenzelm@59112
   334
    val scan = if SML then scan_sml else scan_ml_antiq;
wenzelm@56278
   335
    fun check (Antiquote.Text tok) = (check_content_of tok; if SML then () else warn tok)
wenzelm@56278
   336
      | check _ = ();
wenzelm@37209
   337
    val input =
wenzelm@56278
   338
      Source.of_list syms
wenzelm@58864
   339
      |> Source.source Symbol_Pos.stopper
wenzelm@58864
   340
        (Scan.recover (Scan.bulk (!!! "bad input" scan))
wenzelm@58864
   341
          (fn msg => recover msg >> map Antiquote.Text))
wenzelm@61482
   342
      |> Source.exhaust;
wenzelm@61482
   343
    val _ = Position.reports (Antiquote.antiq_reports input);
wenzelm@61482
   344
    val _ =
wenzelm@61482
   345
      Position.reports_text (maps (fn Antiquote.Text t => [token_report SML t] | _ => []) input);
wenzelm@61482
   346
    val _ = List.app check input;
wenzelm@37209
   347
  in input @ termination end;
wenzelm@30645
   348
wenzelm@56278
   349
in
wenzelm@56278
   350
wenzelm@56278
   351
fun source src =
wenzelm@56278
   352
  Symbol_Pos.source (Position.line 1) src
wenzelm@58864
   353
  |> Source.source Symbol_Pos.stopper (Scan.recover (Scan.bulk (!!! "bad input" scan_ml)) recover);
wenzelm@56278
   354
wenzelm@63936
   355
val tokenize = Symbol.explode #> Source.of_list #> source #> Source.exhaust;
wenzelm@56278
   356
wenzelm@59067
   357
val read_pos = gen_read false;
wenzelm@59067
   358
wenzelm@59067
   359
val read = read_pos Position.none;
wenzelm@56278
   360
wenzelm@58978
   361
fun read_set_range range =
wenzelm@59067
   362
  read #> map (fn Antiquote.Text tok => Antiquote.Text (set_range range tok) | antiq => antiq);
wenzelm@58978
   363
wenzelm@59064
   364
fun read_source SML source =
wenzelm@56459
   365
  let
wenzelm@59064
   366
    val pos = Input.pos_of source;
wenzelm@56459
   367
    val language = if SML then Markup.language_SML else Markup.language_ML;
wenzelm@56459
   368
    val _ =
wenzelm@59064
   369
      if Position.is_reported_range pos
wenzelm@59064
   370
      then Position.report pos (language (Input.is_delimited source))
wenzelm@56459
   371
      else ();
wenzelm@59064
   372
  in gen_read SML pos (Input.text_of source) end;
wenzelm@55828
   373
wenzelm@24579
   374
end;
wenzelm@24579
   375
wenzelm@24596
   376
end;