src/Pure/ML/ml_lex.ML
author wenzelm
Mon, 20 Jan 2014 16:56:18 +0100
changeset 55103 57d87ec3da4c
parent 53167 4e7ddd76e632
child 55105 75815b3b38a1
permissions -rw-r--r--
tuned errors;
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
     1
(*  Title:      Pure/ML/ml_lex.ML
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
     2
    Author:     Makarius
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
     3
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
     4
Lexical syntax for SML.
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
     5
*)
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
     6
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
     7
signature ML_LEX =
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
     8
sig
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
     9
  datatype token_kind =
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
    10
    Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    11
    Space | Comment | Error of string | EOF
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    12
  eqtype token
27732
8dbf5761a24a abstract type Scan.stopper;
wenzelm
parents: 24596
diff changeset
    13
  val stopper: token Scan.stopper
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
    14
  val is_regular: token -> bool
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
    15
  val is_improper: token -> bool
30683
e8ac1f9d9469 datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
wenzelm
parents: 30645
diff changeset
    16
  val set_range: Position.range -> token -> token
30636
bd8813d7774d ML_Lex.pos_of: regular position;
wenzelm
parents: 30614
diff changeset
    17
  val pos_of: token -> Position.T
31474
0ae32184bde0 export end_pos_of;
wenzelm
parents: 31426
diff changeset
    18
  val end_pos_of: token -> Position.T
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    19
  val kind_of: token -> token_kind
27817
78cae5cca09e renamed ML_Lex.val_of to content_of;
wenzelm
parents: 27799
diff changeset
    20
  val content_of: token -> string
31426
5c9dbd511510 tuned signature;
wenzelm
parents: 31332
diff changeset
    21
  val check_content_of: token -> string
31332
9639a6d4d714 added flatten;
wenzelm
parents: 30683
diff changeset
    22
  val flatten: token list -> string
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    23
  val keywords: string list
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
    24
  val source: (Symbol.symbol, 'a) Source.source ->
30573
49899f26fbd1 de-camelized Symbol_Pos;
wenzelm
parents: 29606
diff changeset
    25
    (token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source)
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    26
      Source.source) Source.source
30591
6c9c00ea4ae6 added tokenize;
wenzelm
parents: 30573
diff changeset
    27
  val tokenize: string -> token list
37198
3af985b10550 replaced ML_Lex.read_antiq by more concise ML_Lex.read, which includes full read/report with explicit position information;
wenzelm
parents: 37195
diff changeset
    28
  val read: Position.T -> Symbol_Pos.text -> token Antiquote.antiquote list
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    29
end;
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    30
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    31
structure ML_Lex: ML_LEX =
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    32
struct
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    33
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    34
(** tokens **)
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    35
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    36
(* datatype token *)
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    37
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    38
datatype token_kind =
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
    39
  Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    40
  Space | Comment | Error of string | EOF;
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    41
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    42
datatype token = Token of Position.range * (token_kind * string);
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    43
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    44
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    45
(* position *)
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    46
30683
e8ac1f9d9469 datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
wenzelm
parents: 30645
diff changeset
    47
fun set_range range (Token (_, x)) = Token (range, x);
e8ac1f9d9469 datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
wenzelm
parents: 30645
diff changeset
    48
30636
bd8813d7774d ML_Lex.pos_of: regular position;
wenzelm
parents: 30614
diff changeset
    49
fun pos_of (Token ((pos, _), _)) = pos;
bd8813d7774d ML_Lex.pos_of: regular position;
wenzelm
parents: 30614
diff changeset
    50
fun end_pos_of (Token ((_, pos), _)) = pos;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    51
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    52
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
    53
(* control tokens *)
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    54
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    55
fun mk_eof pos = Token ((pos, Position.none), (EOF, ""));
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    56
val eof = mk_eof Position.none;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    57
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    58
fun is_eof (Token (_, (EOF, _))) = true
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    59
  | is_eof _ = false;
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    60
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    61
val stopper =
30636
bd8813d7774d ML_Lex.pos_of: regular position;
wenzelm
parents: 30614
diff changeset
    62
  Scan.stopper (fn [] => eof | toks => mk_eof (end_pos_of (List.last toks))) is_eof;
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    63
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    64
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    65
(* token content *)
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    66
31426
5c9dbd511510 tuned signature;
wenzelm
parents: 31332
diff changeset
    67
fun kind_of (Token (_, (k, _))) = k;
5c9dbd511510 tuned signature;
wenzelm
parents: 31332
diff changeset
    68
27817
78cae5cca09e renamed ML_Lex.val_of to content_of;
wenzelm
parents: 27799
diff changeset
    69
fun content_of (Token (_, (_, x))) = x;
78cae5cca09e renamed ML_Lex.val_of to content_of;
wenzelm
parents: 27799
diff changeset
    70
fun token_leq (tok, tok') = content_of tok <= content_of tok';
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    71
41502
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
    72
fun is_regular (Token (_, (Error _, _))) = false
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
    73
  | is_regular (Token (_, (EOF, _))) = false
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
    74
  | is_regular _ = true;
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
    75
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
    76
fun is_improper (Token (_, (Space, _))) = true
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
    77
  | is_improper (Token (_, (Comment, _))) = true
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
    78
  | is_improper _ = false;
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
    79
40337
d25bbb5f1c9e warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents: 40319
diff changeset
    80
fun warn tok =
d25bbb5f1c9e warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents: 40319
diff changeset
    81
  (case tok of
d25bbb5f1c9e warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents: 40319
diff changeset
    82
    Token (_, (Keyword, ":>")) =>
d25bbb5f1c9e warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents: 40319
diff changeset
    83
      warning ("Opaque signature matching (:>) fails to work with ML pretty printing --\n\
d25bbb5f1c9e warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents: 40319
diff changeset
    84
        \prefer non-opaque matching (:) possibly with abstype" ^
48992
0518bf89c777 renamed Position.str_of to Position.here;
wenzelm
parents: 48768
diff changeset
    85
        Position.here (pos_of tok))
40337
d25bbb5f1c9e warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents: 40319
diff changeset
    86
  | _ => ());
d25bbb5f1c9e warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents: 40319
diff changeset
    87
31426
5c9dbd511510 tuned signature;
wenzelm
parents: 31332
diff changeset
    88
fun check_content_of tok =
30636
bd8813d7774d ML_Lex.pos_of: regular position;
wenzelm
parents: 30614
diff changeset
    89
  (case kind_of tok of
30645
e7943202d177 added read_antiq, with improved error reporting;
wenzelm
parents: 30636
diff changeset
    90
    Error msg => error msg
40337
d25bbb5f1c9e warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents: 40319
diff changeset
    91
  | _ => content_of tok);
30636
bd8813d7774d ML_Lex.pos_of: regular position;
wenzelm
parents: 30614
diff changeset
    92
41502
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
    93
fun flatten_content (tok :: (toks as tok' :: _)) =
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
    94
      Symbol.escape (check_content_of tok) ::
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
    95
        (if is_improper tok orelse is_improper tok' then flatten_content toks
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
    96
         else Symbol.space :: flatten_content toks)
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
    97
  | flatten_content toks = map (Symbol.escape o check_content_of) toks;
31332
9639a6d4d714 added flatten;
wenzelm
parents: 30683
diff changeset
    98
41502
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
    99
val flatten = implode o flatten_content;
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   100
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   101
30614
845bcfd3e9cd report markup for ML tokens;
wenzelm
parents: 30600
diff changeset
   102
(* markup *)
845bcfd3e9cd report markup for ML tokens;
wenzelm
parents: 30600
diff changeset
   103
37195
e87d305a4490 separate markup for ML delimiters;
wenzelm
parents: 31543
diff changeset
   104
local
e87d305a4490 separate markup for ML delimiters;
wenzelm
parents: 31543
diff changeset
   105
30645
e7943202d177 added read_antiq, with improved error reporting;
wenzelm
parents: 30636
diff changeset
   106
val token_kind_markup =
50201
c26369c9eda6 Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
wenzelm
parents: 48992
diff changeset
   107
 fn Keyword   => (Markup.ML_keyword, "")
48768
abc45de5bb22 clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
wenzelm
parents: 48756
diff changeset
   108
  | Ident     => (Markup.empty, "")
abc45de5bb22 clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
wenzelm
parents: 48756
diff changeset
   109
  | LongIdent => (Markup.empty, "")
50201
c26369c9eda6 Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
wenzelm
parents: 48992
diff changeset
   110
  | TypeVar   => (Markup.ML_tvar, "")
c26369c9eda6 Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
wenzelm
parents: 48992
diff changeset
   111
  | Word      => (Markup.ML_numeral, "")
c26369c9eda6 Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
wenzelm
parents: 48992
diff changeset
   112
  | Int       => (Markup.ML_numeral, "")
c26369c9eda6 Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
wenzelm
parents: 48992
diff changeset
   113
  | Real      => (Markup.ML_numeral, "")
c26369c9eda6 Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
wenzelm
parents: 48992
diff changeset
   114
  | Char      => (Markup.ML_char, "")
c26369c9eda6 Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
wenzelm
parents: 48992
diff changeset
   115
  | String    => (Markup.ML_string, "")
48768
abc45de5bb22 clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
wenzelm
parents: 48756
diff changeset
   116
  | Space     => (Markup.empty, "")
50201
c26369c9eda6 Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
wenzelm
parents: 48992
diff changeset
   117
  | Comment   => (Markup.ML_comment, "")
c26369c9eda6 Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
wenzelm
parents: 48992
diff changeset
   118
  | Error msg => (Markup.bad, msg)
48768
abc45de5bb22 clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
wenzelm
parents: 48756
diff changeset
   119
  | EOF       => (Markup.empty, "");
30614
845bcfd3e9cd report markup for ML tokens;
wenzelm
parents: 30600
diff changeset
   120
37195
e87d305a4490 separate markup for ML delimiters;
wenzelm
parents: 31543
diff changeset
   121
fun token_markup kind x =
e87d305a4490 separate markup for ML delimiters;
wenzelm
parents: 31543
diff changeset
   122
  if kind = Keyword andalso exists_string (not o Symbol.is_ascii_letter) x
50201
c26369c9eda6 Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
wenzelm
parents: 48992
diff changeset
   123
  then (Markup.ML_delimiter, "")
37195
e87d305a4490 separate markup for ML delimiters;
wenzelm
parents: 31543
diff changeset
   124
  else token_kind_markup kind;
e87d305a4490 separate markup for ML delimiters;
wenzelm
parents: 31543
diff changeset
   125
e87d305a4490 separate markup for ML delimiters;
wenzelm
parents: 31543
diff changeset
   126
in
e87d305a4490 separate markup for ML delimiters;
wenzelm
parents: 31543
diff changeset
   127
48768
abc45de5bb22 clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
wenzelm
parents: 48756
diff changeset
   128
fun report_of_token (Token ((pos, _), (kind, x))) =
abc45de5bb22 clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
wenzelm
parents: 48756
diff changeset
   129
  let val (markup, txt) = token_markup kind x
abc45de5bb22 clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
wenzelm
parents: 48756
diff changeset
   130
  in ((pos, markup), txt) end;
37195
e87d305a4490 separate markup for ML delimiters;
wenzelm
parents: 31543
diff changeset
   131
e87d305a4490 separate markup for ML delimiters;
wenzelm
parents: 31543
diff changeset
   132
end;
30614
845bcfd3e9cd report markup for ML tokens;
wenzelm
parents: 30600
diff changeset
   133
845bcfd3e9cd report markup for ML tokens;
wenzelm
parents: 30600
diff changeset
   134
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   135
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   136
(** scanners **)
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   137
30573
49899f26fbd1 de-camelized Symbol_Pos;
wenzelm
parents: 29606
diff changeset
   138
open Basic_Symbol_Pos;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   139
43947
9b00f09f7721 defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
wenzelm
parents: 41502
diff changeset
   140
fun !!! msg = Symbol_Pos.!!! (fn () => "SML lexical error: " ^ msg);
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   141
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   142
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   143
(* blanks *)
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   144
40525
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   145
val scan_blank = Scan.one (Symbol.is_ascii_blank o Symbol_Pos.symbol);
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   146
val scan_blanks1 = Scan.repeat1 scan_blank;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   147
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   148
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   149
(* keywords *)
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   150
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   151
val keywords = ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   152
  "=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   153
  "case", "datatype", "do", "else", "end", "eqtype", "exception", "fn",
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   154
  "fun", "functor", "handle", "if", "in", "include", "infix", "infixr",
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   155
  "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec",
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   156
  "sharing", "sig", "signature", "struct", "structure", "then", "type",
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   157
  "val", "where", "while", "with", "withtype"];
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   158
40627
becf5d5187cc renamed raw "explode" function to "raw_explode" to emphasize its meaning;
wenzelm
parents: 40525
diff changeset
   159
val lex = Scan.make_lexicon (map raw_explode keywords);
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   160
fun scan_keyword x = Scan.literal lex x;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   161
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   162
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   163
(* identifiers *)
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   164
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   165
local
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   166
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   167
val scan_letdigs =
40525
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   168
  Scan.many
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   169
    ((Symbol.is_ascii_letter orf Symbol.is_ascii_digit orf Symbol.is_ascii_quasi) o
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   170
      Symbol_Pos.symbol);
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   171
40525
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   172
val scan_alphanumeric =
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   173
  Scan.one (Symbol.is_ascii_letter o Symbol_Pos.symbol) -- scan_letdigs >> op ::;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   174
40525
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   175
val scan_symbolic =
40627
becf5d5187cc renamed raw "explode" function to "raw_explode" to emphasize its meaning;
wenzelm
parents: 40525
diff changeset
   176
  Scan.many1 (member (op =) (raw_explode "!#$%&*+-/:<=>?@\\^`|~") o Symbol_Pos.symbol);
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   177
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   178
in
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   179
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   180
val scan_ident = scan_alphanumeric || scan_symbolic;
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   181
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   182
val scan_longident =
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   183
  (Scan.repeat1 (scan_alphanumeric @@@ $$$ ".") >> flat) @@@ (scan_ident || $$$ "=");
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   184
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   185
val scan_typevar = $$$ "'" @@@ scan_letdigs;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   186
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   187
end;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   188
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   189
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   190
(* numerals *)
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   191
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   192
local
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   193
40525
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   194
val scan_dec = Scan.many1 (Symbol.is_ascii_digit o Symbol_Pos.symbol);
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   195
val scan_hex = Scan.many1 (Symbol.is_ascii_hex o Symbol_Pos.symbol);
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   196
val scan_sign = Scan.optional ($$$ "~") [];
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   197
val scan_decint = scan_sign @@@ scan_dec;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   198
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   199
in
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   200
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   201
val scan_word =
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   202
  $$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex ||
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   203
  $$$ "0" @@@ $$$ "w" @@@ scan_dec;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   204
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   205
val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec);
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   206
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   207
val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   208
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   209
val scan_real =
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   210
  scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] ||
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   211
  scan_decint @@@ scan_exp;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   212
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   213
end;
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   214
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   215
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   216
(* chars and strings *)
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   217
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   218
local
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   219
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   220
val scan_escape =
40627
becf5d5187cc renamed raw "explode" function to "raw_explode" to emphasize its meaning;
wenzelm
parents: 40525
diff changeset
   221
  Scan.one (member (op =) (raw_explode "\"\\abtnvfr") o Symbol_Pos.symbol) >> single ||
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   222
  $$$ "^" @@@ (Scan.one (fn (s, _) => ord "@" <= ord s andalso ord s <= ord "_") >> single) ||
40525
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   223
  Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) --
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   224
    Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) --
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   225
    Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) >> (fn ((a, b), c) => [a, b, c]);
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   226
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   227
val scan_str =
30600
de241396389c allow non-printable symbols within string tokens;
wenzelm
parents: 30593
diff changeset
   228
  Scan.one (fn (s, _) => Symbol.is_regular s andalso s <> "\"" andalso s <> "\\" andalso
de241396389c allow non-printable symbols within string tokens;
wenzelm
parents: 30593
diff changeset
   229
    (not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single ||
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   230
  $$$ "\\" @@@ !!! "bad escape character in string" scan_escape;
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   231
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   232
val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\";
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   233
val scan_gaps = Scan.repeat scan_gap >> flat;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   234
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   235
in
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   236
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   237
val scan_char =
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   238
  $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\"";
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   239
48743
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   240
val recover_char =
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   241
  $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ Scan.optional (Scan.permissive scan_str @@@ scan_gaps) [];
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   242
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   243
val scan_string =
55103
57d87ec3da4c tuned errors;
wenzelm
parents: 53167
diff changeset
   244
  Scan.ahead ($$ "\"") |--
57d87ec3da4c tuned errors;
wenzelm
parents: 53167
diff changeset
   245
    !!! "unclosed string literal"
57d87ec3da4c tuned errors;
wenzelm
parents: 53167
diff changeset
   246
      ($$$ "\"" @@@ (Scan.repeat (scan_gap || scan_str) >> flat) @@@ $$$ "\"");
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   247
48743
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   248
val recover_string =
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   249
  $$$ "\"" @@@ (Scan.repeat (scan_gap || Scan.permissive scan_str) >> flat);
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   250
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   251
end;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   252
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   253
30645
e7943202d177 added read_antiq, with improved error reporting;
wenzelm
parents: 30636
diff changeset
   254
(* scan tokens *)
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   255
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   256
local
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   257
30593
495672058d97 added scan_antiq;
wenzelm
parents: 30591
diff changeset
   258
fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss));
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   259
30593
495672058d97 added scan_antiq;
wenzelm
parents: 30591
diff changeset
   260
val scan_ml =
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   261
 (scan_char >> token Char ||
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   262
  scan_string >> token String ||
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   263
  scan_blanks1 >> token Space ||
30573
49899f26fbd1 de-camelized Symbol_Pos;
wenzelm
parents: 29606
diff changeset
   264
  Symbol_Pos.scan_comment !!! >> token Comment ||
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   265
  Scan.max token_leq
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   266
   (scan_keyword >> token Keyword)
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   267
   (scan_word >> token Word ||
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   268
    scan_real >> token Real ||
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   269
    scan_int >> token Int ||
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   270
    scan_longident >> token LongIdent ||
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   271
    scan_ident >> token Ident ||
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   272
    scan_typevar >> token TypeVar));
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   273
53167
4e7ddd76e632 discontinued unused antiquotation blocks;
wenzelm
parents: 50201
diff changeset
   274
val scan_antiq = Antiquote.scan_antiq >> Antiquote.Antiq || scan_ml >> Antiquote.Text;
30645
e7943202d177 added read_antiq, with improved error reporting;
wenzelm
parents: 30636
diff changeset
   275
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   276
fun recover msg =
48743
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   277
 (recover_char ||
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   278
  recover_string ||
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   279
  Symbol_Pos.recover_comment ||
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   280
  Scan.one (Symbol.not_eof o Symbol_Pos.symbol) >> single)
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   281
  >> (single o token (Error msg));
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   282
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   283
in
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   284
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   285
fun source src =
30573
49899f26fbd1 de-camelized Symbol_Pos;
wenzelm
parents: 29606
diff changeset
   286
  Symbol_Pos.source (Position.line 1) src
30593
495672058d97 added scan_antiq;
wenzelm
parents: 30591
diff changeset
   287
  |> Source.source Symbol_Pos.stopper (Scan.bulk (!!! "bad input" scan_ml)) (SOME (false, recover));
30591
6c9c00ea4ae6 added tokenize;
wenzelm
parents: 30573
diff changeset
   288
40523
1050315f6ee2 simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
wenzelm
parents: 40337
diff changeset
   289
val tokenize = Source.of_string #> Symbol.source #> source #> Source.exhaust;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   290
37198
3af985b10550 replaced ML_Lex.read_antiq by more concise ML_Lex.read, which includes full read/report with explicit position information;
wenzelm
parents: 37195
diff changeset
   291
fun read pos txt =
3af985b10550 replaced ML_Lex.read_antiq by more concise ML_Lex.read, which includes full read/report with explicit position information;
wenzelm
parents: 37195
diff changeset
   292
  let
50201
c26369c9eda6 Isabelle-specific implementation of quasi-abstract markup elements -- back to module arrangement before d83797ef0d2d;
wenzelm
parents: 48992
diff changeset
   293
    val _ = Position.report pos Markup.ML_source;
37198
3af985b10550 replaced ML_Lex.read_antiq by more concise ML_Lex.read, which includes full read/report with explicit position information;
wenzelm
parents: 37195
diff changeset
   294
    val syms = Symbol_Pos.explode (txt, pos);
37209
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   295
    val termination =
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   296
      if null syms then []
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   297
      else
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   298
        let
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   299
          val pos1 = List.last syms |-> Position.advance;
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   300
          val pos2 = Position.advance Symbol.space pos1;
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   301
        in [Antiquote.Text (Token (Position.range pos1 pos2, (Space, Symbol.space)))] end;
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   302
    val input =
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   303
      (Source.of_list syms
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   304
        |> Source.source Symbol_Pos.stopper (Scan.bulk (!!! "bad input" scan_antiq))
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   305
          (SOME (false, fn msg => recover msg >> map Antiquote.Text))
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   306
        |> Source.exhaust
48768
abc45de5bb22 clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
wenzelm
parents: 48756
diff changeset
   307
        |> tap (Position.reports_text o Antiquote.reports_of (single o report_of_token))
55103
57d87ec3da4c tuned errors;
wenzelm
parents: 53167
diff changeset
   308
        |> tap (List.app (fn Antiquote.Text tok => (check_content_of tok; warn tok) | _ => ())));
37209
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   309
  in input @ termination end;
30645
e7943202d177 added read_antiq, with improved error reporting;
wenzelm
parents: 30636
diff changeset
   310
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   311
end;
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   312
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   313
end;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   314