src/Pure/ML/ml_lex.ML
author wenzelm
Thu, 15 Aug 2024 12:43:17 +0200
changeset 80712 05b16602a683
parent 78701 c7b2697094ac
permissions -rw-r--r--
clarified signature;
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
     1
(*  Title:      Pure/ML/ml_lex.ML
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
     2
    Author:     Makarius
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
     3
59109
364992cd3c50 tuned comment;
wenzelm
parents: 59082
diff changeset
     4
Lexical syntax for Isabelle/ML and Standard ML.
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
     5
*)
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
     6
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
     7
signature ML_LEX =
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
     8
sig
55505
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
     9
  val keywords: string list
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    10
  datatype token_kind =
59082
wenzelm
parents: 59067
diff changeset
    11
    Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String |
67425
7d4a088dbc0e clarified modules: uniform notion of formal comments;
wenzelm
parents: 67363
diff changeset
    12
    Space | Comment of Comment.kind option | Error of string | EOF
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    13
  eqtype token
27732
8dbf5761a24a abstract type Scan.stopper;
wenzelm
parents: 24596
diff changeset
    14
  val stopper: token Scan.stopper
78701
c7b2697094ac more general ML_Antiquotation.special_form;
wenzelm
parents: 78687
diff changeset
    15
  val is_ident_with: (string -> bool) -> token -> bool
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
    16
  val is_regular: token -> bool
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
    17
  val is_improper: token -> bool
67362
221612c942de allow formal comments in ML;
wenzelm
parents: 66067
diff changeset
    18
  val is_comment: token -> bool
30683
e8ac1f9d9469 datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
wenzelm
parents: 30645
diff changeset
    19
  val set_range: Position.range -> token -> token
59112
e670969f34df expand ML cartouches to Input.source;
wenzelm
parents: 59110
diff changeset
    20
  val range_of: token -> Position.range
30636
bd8813d7774d ML_Lex.pos_of: regular position;
wenzelm
parents: 30614
diff changeset
    21
  val pos_of: token -> Position.T
31474
0ae32184bde0 export end_pos_of;
wenzelm
parents: 31426
diff changeset
    22
  val end_pos_of: token -> Position.T
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    23
  val kind_of: token -> token_kind
27817
78cae5cca09e renamed ML_Lex.val_of to content_of;
wenzelm
parents: 27799
diff changeset
    24
  val content_of: token -> string
31426
5c9dbd511510 tuned signature;
wenzelm
parents: 31332
diff changeset
    25
  val check_content_of: token -> string
31332
9639a6d4d714 added flatten;
wenzelm
parents: 30683
diff changeset
    26
  val flatten: token list -> string
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
    27
  val source: (Symbol.symbol, 'a) Source.source ->
30573
49899f26fbd1 de-camelized Symbol_Pos;
wenzelm
parents: 29606
diff changeset
    28
    (token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source)
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    29
      Source.source) Source.source
30591
6c9c00ea4ae6 added tokenize;
wenzelm
parents: 30573
diff changeset
    30
  val tokenize: string -> token list
73549
a2c589d5e1e4 clarified signature: more detailed token positions for antiquotations;
wenzelm
parents: 69891
diff changeset
    31
  val tokenize_range: Position.range -> string -> token list
74291
b83fa8f3a271 ML antiquotations for type constructors and term constants;
wenzelm
parents: 74174
diff changeset
    32
  val tokenize_no_range: string -> token list
68823
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
    33
  val read_text: Symbol_Pos.text * Position.T -> token Antiquote.antiquote list
59067
dd8ec9138112 tuned signature;
wenzelm
parents: 59064
diff changeset
    34
  val read: Symbol_Pos.text -> token Antiquote.antiquote list
73549
a2c589d5e1e4 clarified signature: more detailed token positions for antiquotations;
wenzelm
parents: 69891
diff changeset
    35
  val read_range: Position.range -> Symbol_Pos.text -> token Antiquote.antiquote list
68823
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
    36
  val read_source': {language: bool -> Markup.T, opaque_warning: bool, symbols: bool} ->
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
    37
    token Antiquote.antiquote Symbol_Pos.scanner -> Input.source -> token Antiquote.antiquote list
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
    38
  val read_source: Input.source -> token Antiquote.antiquote list
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
    39
  val read_source_sml: Input.source -> token Antiquote.antiquote list
74373
6e4093927dbb outer syntax: support for control-cartouche tokens;
wenzelm
parents: 74291
diff changeset
    40
  val read_symbols: Symbol_Pos.T list -> token Antiquote.antiquote list
69841
b3c9291b5fc7 clarified signature, notably for hol4isabelle (by Fabian Immler);
wenzelm
parents: 68823
diff changeset
    41
  val read_symbols_sml: Symbol_Pos.T list -> token Antiquote.antiquote list
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    42
end;
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    43
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    44
structure ML_Lex: ML_LEX =
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    45
struct
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    46
55505
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    47
(** keywords **)
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    48
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    49
val keywords =
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    50
 ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>", "[",
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    51
  "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as", "case",
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    52
  "datatype", "do", "else", "end", "eqtype", "exception", "fn", "fun",
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    53
  "functor", "handle", "if", "in", "include", "infix", "infixr",
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    54
  "let", "local", "nonfix", "of", "op", "open", "orelse", "raise",
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    55
  "rec", "sharing", "sig", "signature", "struct", "structure", "then",
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    56
  "type", "val", "where", "while", "with", "withtype"];
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    57
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    58
val keywords2 =
78687
5fe4c11b5ecb minor performance tuning;
wenzelm
parents: 74373
diff changeset
    59
 Symset.make ["and", "case", "do", "else", "end", "if", "in", "let", "local", "of",
55505
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    60
  "sig", "struct", "then", "while", "with"];
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    61
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    62
val keywords3 =
78687
5fe4c11b5ecb minor performance tuning;
wenzelm
parents: 74373
diff changeset
    63
 Symset.make ["handle", "open", "raise"];
55505
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    64
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    65
val lexicon = Scan.make_lexicon (map raw_explode keywords);
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    66
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    67
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
    68
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    69
(** tokens **)
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    70
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    71
(* datatype token *)
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    72
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    73
datatype token_kind =
59082
wenzelm
parents: 59067
diff changeset
    74
  Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String |
67425
7d4a088dbc0e clarified modules: uniform notion of formal comments;
wenzelm
parents: 67363
diff changeset
    75
  Space | Comment of Comment.kind option | Error of string | EOF;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    76
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    77
datatype token = Token of Position.range * (token_kind * string);
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    78
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    79
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    80
(* position *)
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    81
30683
e8ac1f9d9469 datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
wenzelm
parents: 30645
diff changeset
    82
fun set_range range (Token (_, x)) = Token (range, x);
59112
e670969f34df expand ML cartouches to Input.source;
wenzelm
parents: 59110
diff changeset
    83
fun range_of (Token (range, _)) = range;
30683
e8ac1f9d9469 datatype antiquote: maintain original Position.range, which is eventually attached to the resulting ML tokens;
wenzelm
parents: 30645
diff changeset
    84
59112
e670969f34df expand ML cartouches to Input.source;
wenzelm
parents: 59110
diff changeset
    85
val pos_of = #1 o range_of;
e670969f34df expand ML cartouches to Input.source;
wenzelm
parents: 59110
diff changeset
    86
val end_pos_of = #2 o range_of;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    87
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    88
58855
2885e2eaa0fb removed pointless markup;
wenzelm
parents: 58854
diff changeset
    89
(* stopper *)
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    90
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    91
fun mk_eof pos = Token ((pos, Position.none), (EOF, ""));
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    92
val eof = mk_eof Position.none;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    93
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    94
fun is_eof (Token (_, (EOF, _))) = true
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    95
  | is_eof _ = false;
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
    96
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    97
val stopper =
30636
bd8813d7774d ML_Lex.pos_of: regular position;
wenzelm
parents: 30614
diff changeset
    98
  Scan.stopper (fn [] => eof | toks => mk_eof (end_pos_of (List.last toks))) is_eof;
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
    99
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   100
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   101
(* token content *)
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   102
31426
5c9dbd511510 tuned signature;
wenzelm
parents: 31332
diff changeset
   103
fun kind_of (Token (_, (k, _))) = k;
5c9dbd511510 tuned signature;
wenzelm
parents: 31332
diff changeset
   104
27817
78cae5cca09e renamed ML_Lex.val_of to content_of;
wenzelm
parents: 27799
diff changeset
   105
fun content_of (Token (_, (_, x))) = x;
78cae5cca09e renamed ML_Lex.val_of to content_of;
wenzelm
parents: 27799
diff changeset
   106
fun token_leq (tok, tok') = content_of tok <= content_of tok';
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   107
55505
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
   108
fun is_keyword (Token (_, (Keyword, _))) = true
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
   109
  | is_keyword _ = false;
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
   110
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
   111
fun is_delimiter (Token (_, (Keyword, x))) = not (Symbol.is_ascii_identifier x)
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
   112
  | is_delimiter _ = false;
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
   113
78701
c7b2697094ac more general ML_Antiquotation.special_form;
wenzelm
parents: 78687
diff changeset
   114
fun is_ident_with pred (Token (_, (Ident, x))) = pred x
c7b2697094ac more general ML_Antiquotation.special_form;
wenzelm
parents: 78687
diff changeset
   115
  | is_ident_with _ _ = false;
c7b2697094ac more general ML_Antiquotation.special_form;
wenzelm
parents: 78687
diff changeset
   116
41502
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
   117
fun is_regular (Token (_, (Error _, _))) = false
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
   118
  | is_regular (Token (_, (EOF, _))) = false
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
   119
  | is_regular _ = true;
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
   120
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
   121
fun is_improper (Token (_, (Space, _))) = true
67425
7d4a088dbc0e clarified modules: uniform notion of formal comments;
wenzelm
parents: 67363
diff changeset
   122
  | is_improper (Token (_, (Comment _, _))) = true
41502
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
   123
  | is_improper _ = false;
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
   124
67425
7d4a088dbc0e clarified modules: uniform notion of formal comments;
wenzelm
parents: 67363
diff changeset
   125
fun is_comment (Token (_, (Comment _, _))) = true
67362
221612c942de allow formal comments in ML;
wenzelm
parents: 66067
diff changeset
   126
  | is_comment _ = false;
221612c942de allow formal comments in ML;
wenzelm
parents: 66067
diff changeset
   127
68823
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   128
fun warning_opaque tok =
40337
d25bbb5f1c9e warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents: 40319
diff changeset
   129
  (case tok of
d25bbb5f1c9e warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents: 40319
diff changeset
   130
    Token (_, (Keyword, ":>")) =>
d25bbb5f1c9e warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents: 40319
diff changeset
   131
      warning ("Opaque signature matching (:>) fails to work with ML pretty printing --\n\
d25bbb5f1c9e warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents: 40319
diff changeset
   132
        \prefer non-opaque matching (:) possibly with abstype" ^
48992
0518bf89c777 renamed Position.str_of to Position.here;
wenzelm
parents: 48768
diff changeset
   133
        Position.here (pos_of tok))
40337
d25bbb5f1c9e warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents: 40319
diff changeset
   134
  | _ => ());
d25bbb5f1c9e warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents: 40319
diff changeset
   135
31426
5c9dbd511510 tuned signature;
wenzelm
parents: 31332
diff changeset
   136
fun check_content_of tok =
30636
bd8813d7774d ML_Lex.pos_of: regular position;
wenzelm
parents: 30614
diff changeset
   137
  (case kind_of tok of
30645
e7943202d177 added read_antiq, with improved error reporting;
wenzelm
parents: 30636
diff changeset
   138
    Error msg => error msg
40337
d25bbb5f1c9e warn in correlation with report -- avoid spurious message duplicates;
wenzelm
parents: 40319
diff changeset
   139
  | _ => content_of tok);
30636
bd8813d7774d ML_Lex.pos_of: regular position;
wenzelm
parents: 30614
diff changeset
   140
59112
e670969f34df expand ML cartouches to Input.source;
wenzelm
parents: 59110
diff changeset
   141
e670969f34df expand ML cartouches to Input.source;
wenzelm
parents: 59110
diff changeset
   142
(* flatten *)
e670969f34df expand ML cartouches to Input.source;
wenzelm
parents: 59110
diff changeset
   143
41502
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
   144
fun flatten_content (tok :: (toks as tok' :: _)) =
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
   145
      Symbol.escape (check_content_of tok) ::
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
   146
        (if is_improper tok orelse is_improper tok' then flatten_content toks
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
   147
         else Symbol.space :: flatten_content toks)
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
   148
  | flatten_content toks = map (Symbol.escape o check_content_of) toks;
31332
9639a6d4d714 added flatten;
wenzelm
parents: 30683
diff changeset
   149
41502
967cbbc77abd refined ML_Lex.flatten: ensure proper separation of tokens;
wenzelm
parents: 40627
diff changeset
   150
val flatten = implode o flatten_content;
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   151
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   152
30614
845bcfd3e9cd report markup for ML tokens;
wenzelm
parents: 30600
diff changeset
   153
(* markup *)
845bcfd3e9cd report markup for ML tokens;
wenzelm
parents: 30600
diff changeset
   154
37195
e87d305a4490 separate markup for ML delimiters;
wenzelm
parents: 31543
diff changeset
   155
local
e87d305a4490 separate markup for ML delimiters;
wenzelm
parents: 31543
diff changeset
   156
68822
253f04c1e814 simplified markup;
wenzelm
parents: 68821
diff changeset
   157
val token_kind_markup =
59124
wenzelm
parents: 59112
diff changeset
   158
 fn Type_Var => (Markup.ML_tvar, "")
59082
wenzelm
parents: 59067
diff changeset
   159
  | Word => (Markup.ML_numeral, "")
wenzelm
parents: 59067
diff changeset
   160
  | Int => (Markup.ML_numeral, "")
wenzelm
parents: 59067
diff changeset
   161
  | Real => (Markup.ML_numeral, "")
wenzelm
parents: 59067
diff changeset
   162
  | Char => (Markup.ML_char, "")
68822
253f04c1e814 simplified markup;
wenzelm
parents: 68821
diff changeset
   163
  | String => (Markup.ML_string, "")
253f04c1e814 simplified markup;
wenzelm
parents: 68821
diff changeset
   164
  | Comment _ => (Markup.ML_comment, "")
64677
8dc24130e8fe more uniform treatment of "bad" like other messages (with serial number);
wenzelm
parents: 64275
diff changeset
   165
  | Error msg => (Markup.bad (), msg)
59124
wenzelm
parents: 59112
diff changeset
   166
  | _ => (Markup.empty, "");
30614
845bcfd3e9cd report markup for ML tokens;
wenzelm
parents: 30600
diff changeset
   167
37195
e87d305a4490 separate markup for ML delimiters;
wenzelm
parents: 31543
diff changeset
   168
in
e87d305a4490 separate markup for ML delimiters;
wenzelm
parents: 31543
diff changeset
   169
68822
253f04c1e814 simplified markup;
wenzelm
parents: 68821
diff changeset
   170
fun token_report (tok as Token ((pos, _), (kind, x))) =
55505
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
   171
  let
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
   172
    val (markup, txt) =
68822
253f04c1e814 simplified markup;
wenzelm
parents: 68821
diff changeset
   173
      if not (is_keyword tok) then token_kind_markup kind
55505
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
   174
      else if is_delimiter tok then (Markup.ML_delimiter, "")
78687
5fe4c11b5ecb minor performance tuning;
wenzelm
parents: 74373
diff changeset
   175
      else if Symset.member keywords2 x then (Markup.ML_keyword2 |> Markup.keyword_properties, "")
5fe4c11b5ecb minor performance tuning;
wenzelm
parents: 74373
diff changeset
   176
      else if Symset.member keywords3 x then (Markup.ML_keyword3 |> Markup.keyword_properties, "")
66067
cdbcb417db67 more markup for HTML rendering;
wenzelm
parents: 64677
diff changeset
   177
      else (Markup.ML_keyword1 |> Markup.keyword_properties, "");
48768
abc45de5bb22 clarified "bad" markup: proper body text, invent missing serial on Scala side (counting backwards);
wenzelm
parents: 48756
diff changeset
   178
  in ((pos, markup), txt) end;
37195
e87d305a4490 separate markup for ML delimiters;
wenzelm
parents: 31543
diff changeset
   179
e87d305a4490 separate markup for ML delimiters;
wenzelm
parents: 31543
diff changeset
   180
end;
30614
845bcfd3e9cd report markup for ML tokens;
wenzelm
parents: 30600
diff changeset
   181
845bcfd3e9cd report markup for ML tokens;
wenzelm
parents: 30600
diff changeset
   182
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   183
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   184
(** scanners **)
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   185
30573
49899f26fbd1 de-camelized Symbol_Pos;
wenzelm
parents: 29606
diff changeset
   186
open Basic_Symbol_Pos;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   187
55105
75815b3b38a1 tuned -- more direct err_prefix;
wenzelm
parents: 55103
diff changeset
   188
val err_prefix = "SML lexical error: ";
75815b3b38a1 tuned -- more direct err_prefix;
wenzelm
parents: 55103
diff changeset
   189
75815b3b38a1 tuned -- more direct err_prefix;
wenzelm
parents: 55103
diff changeset
   190
fun !!! msg = Symbol_Pos.!!! (fn () => err_prefix ^ msg);
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   191
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   192
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   193
(* identifiers *)
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   194
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   195
local
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   196
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   197
val scan_letdigs =
55496
wenzelm
parents: 55105
diff changeset
   198
  Scan.many (Symbol.is_ascii_letdig o Symbol_Pos.symbol);
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   199
40525
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   200
val scan_alphanumeric =
61814
1ca1142e1711 clarified type Token.src: plain token list, with usual implicit value assignment;
wenzelm
parents: 61596
diff changeset
   201
  Scan.one (Symbol.is_ascii_letter o Symbol_Pos.symbol) ::: scan_letdigs;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   202
78687
5fe4c11b5ecb minor performance tuning;
wenzelm
parents: 74373
diff changeset
   203
val symbolic = Symset.make (raw_explode "!#$%&*+-/:<=>?@\\^`|~");
5fe4c11b5ecb minor performance tuning;
wenzelm
parents: 74373
diff changeset
   204
5fe4c11b5ecb minor performance tuning;
wenzelm
parents: 74373
diff changeset
   205
val scan_symbolic = Scan.many1 (Symset.member symbolic o Symbol_Pos.symbol);
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   206
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   207
in
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   208
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   209
val scan_ident = scan_alphanumeric || scan_symbolic;
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   210
55496
wenzelm
parents: 55105
diff changeset
   211
val scan_long_ident =
61476
1884c40f1539 tuned signature;
wenzelm
parents: 61471
diff changeset
   212
  Scan.repeats1 (scan_alphanumeric @@@ $$$ ".") @@@ (scan_ident || $$$ "=");
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   213
55496
wenzelm
parents: 55105
diff changeset
   214
val scan_type_var = $$$ "'" @@@ scan_letdigs;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   215
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   216
end;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   217
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   218
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   219
(* numerals *)
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   220
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   221
local
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   222
40525
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   223
val scan_dec = Scan.many1 (Symbol.is_ascii_digit o Symbol_Pos.symbol);
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   224
val scan_hex = Scan.many1 (Symbol.is_ascii_hex o Symbol_Pos.symbol);
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   225
val scan_sign = Scan.optional ($$$ "~") [];
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   226
val scan_decint = scan_sign @@@ scan_dec;
55496
wenzelm
parents: 55105
diff changeset
   227
val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   228
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   229
in
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   230
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   231
val scan_word =
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   232
  $$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex ||
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   233
  $$$ "0" @@@ $$$ "w" @@@ scan_dec;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   234
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   235
val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec);
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   236
63204
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 62797
diff changeset
   237
val scan_rat = scan_decint @@@ Scan.optional ($$$ "/" @@@ scan_dec) [];
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 62797
diff changeset
   238
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   239
val scan_real =
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   240
  scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] ||
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   241
  scan_decint @@@ scan_exp;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   242
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   243
end;
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   244
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   245
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   246
(* chars and strings *)
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   247
55496
wenzelm
parents: 55105
diff changeset
   248
val scan_blanks1 = Scan.many1 (Symbol.is_ascii_blank o Symbol_Pos.symbol);
wenzelm
parents: 55105
diff changeset
   249
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   250
local
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   251
78687
5fe4c11b5ecb minor performance tuning;
wenzelm
parents: 74373
diff changeset
   252
val escape = Symset.make (raw_explode "\"\\abtnvfr");
5fe4c11b5ecb minor performance tuning;
wenzelm
parents: 74373
diff changeset
   253
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   254
val scan_escape =
78687
5fe4c11b5ecb minor performance tuning;
wenzelm
parents: 74373
diff changeset
   255
  Scan.one (Symset.member escape o Symbol_Pos.symbol) >> single ||
64275
ac2abc987cf9 accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents: 63936
diff changeset
   256
  $$$ "^" @@@
ac2abc987cf9 accomodate Poly/ML repository version, which treats singleton strings as boxed;
wenzelm
parents: 63936
diff changeset
   257
    (Scan.one (fn (s, _) => Char.ord #"@" <= ord s andalso ord s <= Char.ord #"_") >> single) ||
40525
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   258
  Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) --
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   259
    Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) --
14a2e686bdac eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents: 40523
diff changeset
   260
    Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) >> (fn ((a, b), c) => [a, b, c]);
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   261
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   262
val scan_str =
58854
b979c781c2db discontinued obsolete \<^sync> marker;
wenzelm
parents: 56459
diff changeset
   263
  Scan.one (fn (s, _) => Symbol.not_eof s andalso s <> "\"" andalso s <> "\\" andalso
30600
de241396389c allow non-printable symbols within string tokens;
wenzelm
parents: 30593
diff changeset
   264
    (not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single ||
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   265
  $$$ "\\" @@@ !!! "bad escape character in string" scan_escape;
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   266
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   267
val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\";
61476
1884c40f1539 tuned signature;
wenzelm
parents: 61471
diff changeset
   268
val scan_gaps = Scan.repeats scan_gap;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   269
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   270
in
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   271
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   272
val scan_char =
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   273
  $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\"";
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   274
48743
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   275
val recover_char =
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   276
  $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ Scan.optional (Scan.permissive scan_str @@@ scan_gaps) [];
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   277
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   278
val scan_string =
55103
57d87ec3da4c tuned errors;
wenzelm
parents: 53167
diff changeset
   279
  Scan.ahead ($$ "\"") |--
57d87ec3da4c tuned errors;
wenzelm
parents: 53167
diff changeset
   280
    !!! "unclosed string literal"
61476
1884c40f1539 tuned signature;
wenzelm
parents: 61471
diff changeset
   281
      ($$$ "\"" @@@ Scan.repeats (scan_gap || scan_str) @@@ $$$ "\"");
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   282
48743
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   283
val recover_string =
61476
1884c40f1539 tuned signature;
wenzelm
parents: 61471
diff changeset
   284
  $$$ "\"" @@@ Scan.repeats (scan_gap || Scan.permissive scan_str);
48743
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   285
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   286
end;
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   287
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   288
63204
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 62797
diff changeset
   289
(* rat antiquotation *)
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 62797
diff changeset
   290
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 62797
diff changeset
   291
val rat_name = Symbol_Pos.explode ("Pure.rat ", Position.none);
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 62797
diff changeset
   292
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 62797
diff changeset
   293
val scan_rat_antiq =
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 62797
diff changeset
   294
  Symbol_Pos.scan_pos -- ($$ "@" |-- Symbol_Pos.scan_pos -- scan_rat) -- Symbol_Pos.scan_pos
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 62797
diff changeset
   295
    >> (fn ((pos1, (pos2, body)), pos3) =>
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 62797
diff changeset
   296
      {start = Position.range_position (pos1, pos2),
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 62797
diff changeset
   297
       stop = Position.none,
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 62797
diff changeset
   298
       range = Position.range (pos1, pos3),
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 62797
diff changeset
   299
       body = rat_name @ body});
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 62797
diff changeset
   300
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 62797
diff changeset
   301
30645
e7943202d177 added read_antiq, with improved error reporting;
wenzelm
parents: 30636
diff changeset
   302
(* scan tokens *)
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   303
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   304
local
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   305
30593
495672058d97 added scan_antiq;
wenzelm
parents: 30591
diff changeset
   306
fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss));
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   307
67427
5409cfd41e7f clarified SML (no formal comments) vs. Isabelle/ML (arbitrary comments);
wenzelm
parents: 67425
diff changeset
   308
val scan_sml =
5409cfd41e7f clarified SML (no formal comments) vs. Isabelle/ML (arbitrary comments);
wenzelm
parents: 67425
diff changeset
   309
  scan_char >> token Char ||
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   310
  scan_string >> token String ||
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   311
  scan_blanks1 >> token Space ||
67425
7d4a088dbc0e clarified modules: uniform notion of formal comments;
wenzelm
parents: 67363
diff changeset
   312
  Symbol_Pos.scan_comment err_prefix >> token (Comment NONE) ||
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   313
  Scan.max token_leq
55505
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55496
diff changeset
   314
   (Scan.literal lexicon >> token Keyword)
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   315
   (scan_word >> token Word ||
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   316
    scan_real >> token Real ||
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   317
    scan_int >> token Int ||
59082
wenzelm
parents: 59067
diff changeset
   318
    scan_long_ident >> token Long_Ident ||
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   319
    scan_ident >> token Ident ||
67427
5409cfd41e7f clarified SML (no formal comments) vs. Isabelle/ML (arbitrary comments);
wenzelm
parents: 67425
diff changeset
   320
    scan_type_var >> token Type_Var);
59112
e670969f34df expand ML cartouches to Input.source;
wenzelm
parents: 59110
diff changeset
   321
68823
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   322
val scan_sml_antiq = scan_sml >> Antiquote.Text;
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   323
59112
e670969f34df expand ML cartouches to Input.source;
wenzelm
parents: 59110
diff changeset
   324
val scan_ml_antiq =
69891
def3ec9cdb7e document markers are formal comments, and may thus occur anywhere in the command-span;
wenzelm
parents: 69842
diff changeset
   325
  Comment.scan_inner >> (fn (kind, ss) => Antiquote.Text (token (Comment (SOME kind)) ss)) ||
74373
6e4093927dbb outer syntax: support for control-cartouche tokens;
wenzelm
parents: 74291
diff changeset
   326
  Antiquote.scan_control Antiquote.err_prefix >> Antiquote.Control ||
59112
e670969f34df expand ML cartouches to Input.source;
wenzelm
parents: 59110
diff changeset
   327
  Antiquote.scan_antiq >> Antiquote.Antiq ||
63204
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 62797
diff changeset
   328
  scan_rat_antiq >> Antiquote.Antiq ||
68823
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   329
  scan_sml_antiq;
30645
e7943202d177 added read_antiq, with improved error reporting;
wenzelm
parents: 30636
diff changeset
   330
27772
b933c997eab3 improved position handling due to SymbolPos.T;
wenzelm
parents: 27732
diff changeset
   331
fun recover msg =
48743
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   332
 (recover_char ||
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   333
  recover_string ||
59112
e670969f34df expand ML cartouches to Input.source;
wenzelm
parents: 59110
diff changeset
   334
  Symbol_Pos.recover_cartouche ||
48743
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   335
  Symbol_Pos.recover_comment ||
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   336
  Scan.one (Symbol.not_eof o Symbol_Pos.symbol) >> single)
a72f8ffecf31 refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
wenzelm
parents: 45666
diff changeset
   337
  >> (single o token (Error msg));
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   338
68823
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   339
fun reader {opaque_warning} scan syms =
37198
3af985b10550 replaced ML_Lex.read_antiq by more concise ML_Lex.read, which includes full read/report with explicit position information;
wenzelm
parents: 37195
diff changeset
   340
  let
37209
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   341
    val termination =
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   342
      if null syms then []
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   343
      else
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   344
        let
74174
a3b0fc510705 clarified signature;
wenzelm
parents: 74171
diff changeset
   345
          val pos1 = List.last syms |-> Position.symbol;
a3b0fc510705 clarified signature;
wenzelm
parents: 74171
diff changeset
   346
          val pos2 = Position.symbol Symbol.space pos1;
62797
e08c44eed27f tuned signature;
wenzelm
parents: 61814
diff changeset
   347
        in [Antiquote.Text (Token (Position.range (pos1, pos2), (Space, Symbol.space)))] end;
56278
2576d3a40ed6 separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents: 55828
diff changeset
   348
68823
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   349
    fun check (Antiquote.Text tok) =
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   350
          (check_content_of tok; if opaque_warning then warning_opaque tok else ())
56278
2576d3a40ed6 separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents: 55828
diff changeset
   351
      | check _ = ();
37209
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   352
    val input =
56278
2576d3a40ed6 separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents: 55828
diff changeset
   353
      Source.of_list syms
58864
505a8150368a recover via scanner;
wenzelm
parents: 58863
diff changeset
   354
      |> Source.source Symbol_Pos.stopper
505a8150368a recover via scanner;
wenzelm
parents: 58863
diff changeset
   355
        (Scan.recover (Scan.bulk (!!! "bad input" scan))
505a8150368a recover via scanner;
wenzelm
parents: 58863
diff changeset
   356
          (fn msg => recover msg >> map Antiquote.Text))
61482
wenzelm
parents: 61476
diff changeset
   357
      |> Source.exhaust;
wenzelm
parents: 61476
diff changeset
   358
    val _ = Position.reports (Antiquote.antiq_reports input);
68822
253f04c1e814 simplified markup;
wenzelm
parents: 68821
diff changeset
   359
    val _ = Position.reports_text (maps (fn Antiquote.Text t => [token_report t] | _ => []) input);
61482
wenzelm
parents: 61476
diff changeset
   360
    val _ = List.app check input;
37209
1c8cf0048934 terminate ML compiler input produced by ML_Lex.read (cf. 85e864045497);
wenzelm
parents: 37198
diff changeset
   361
  in input @ termination end;
30645
e7943202d177 added read_antiq, with improved error reporting;
wenzelm
parents: 30636
diff changeset
   362
56278
2576d3a40ed6 separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents: 55828
diff changeset
   363
in
2576d3a40ed6 separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents: 55828
diff changeset
   364
2576d3a40ed6 separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents: 55828
diff changeset
   365
fun source src =
2576d3a40ed6 separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents: 55828
diff changeset
   366
  Symbol_Pos.source (Position.line 1) src
67427
5409cfd41e7f clarified SML (no formal comments) vs. Isabelle/ML (arbitrary comments);
wenzelm
parents: 67425
diff changeset
   367
  |> Source.source Symbol_Pos.stopper (Scan.recover (Scan.bulk (!!! "bad input" scan_sml)) recover);
56278
2576d3a40ed6 separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents: 55828
diff changeset
   368
63936
b87784e19a77 discontinued raw symbols;
wenzelm
parents: 63204
diff changeset
   369
val tokenize = Symbol.explode #> Source.of_list #> source #> Source.exhaust;
73549
a2c589d5e1e4 clarified signature: more detailed token positions for antiquotations;
wenzelm
parents: 69891
diff changeset
   370
fun tokenize_range range = tokenize #> map (set_range range);
74291
b83fa8f3a271 ML antiquotations for type constructors and term constants;
wenzelm
parents: 74174
diff changeset
   371
val tokenize_no_range = tokenize_range Position.no_range;
56278
2576d3a40ed6 separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents: 55828
diff changeset
   372
68823
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   373
val read_text = reader {opaque_warning = true} scan_ml_antiq o Symbol_Pos.explode;
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   374
fun read text = read_text (text, Position.none);
56278
2576d3a40ed6 separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents: 55828
diff changeset
   375
73549
a2c589d5e1e4 clarified signature: more detailed token positions for antiquotations;
wenzelm
parents: 69891
diff changeset
   376
fun read_range range =
59067
dd8ec9138112 tuned signature;
wenzelm
parents: 59064
diff changeset
   377
  read #> map (fn Antiquote.Text tok => Antiquote.Text (set_range range tok) | antiq => antiq);
58978
e42da880c61e more position information, e.g. relevant for errors in generated ML source;
wenzelm
parents: 58933
diff changeset
   378
68823
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   379
fun read_source' {language, symbols, opaque_warning} scan source =
56459
38d0b2099743 no report for position singularity, notably for aux. file, especially empty one;
wenzelm
parents: 56278
diff changeset
   380
  let
59064
a8bcb5a446c8 more abstract type Input.source;
wenzelm
parents: 58978
diff changeset
   381
    val pos = Input.pos_of source;
56459
38d0b2099743 no report for position singularity, notably for aux. file, especially empty one;
wenzelm
parents: 56278
diff changeset
   382
    val _ =
59064
a8bcb5a446c8 more abstract type Input.source;
wenzelm
parents: 58978
diff changeset
   383
      if Position.is_reported_range pos
a8bcb5a446c8 more abstract type Input.source;
wenzelm
parents: 58978
diff changeset
   384
      then Position.report pos (language (Input.is_delimited source))
56459
38d0b2099743 no report for position singularity, notably for aux. file, especially empty one;
wenzelm
parents: 56278
diff changeset
   385
      else ();
69842
wenzelm
parents: 69841
diff changeset
   386
  in
wenzelm
parents: 69841
diff changeset
   387
    Input.source_explode source
wenzelm
parents: 69841
diff changeset
   388
    |> not symbols ? maps (fn (s, p) => raw_explode s |> map (rpair p))
wenzelm
parents: 69841
diff changeset
   389
    |> reader {opaque_warning = opaque_warning} scan
wenzelm
parents: 69841
diff changeset
   390
  end;
68823
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   391
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   392
val read_source =
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   393
  read_source' {language = Markup.language_ML, symbols = true, opaque_warning = true}
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   394
    scan_ml_antiq;
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   395
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   396
val read_source_sml =
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   397
  read_source' {language = Markup.language_SML, symbols = false, opaque_warning = false}
5e7b1ae10eb8 clarified signature;
wenzelm
parents: 68822
diff changeset
   398
    scan_sml_antiq;
55828
42ac3cfb89f6 clarified language markup: added "delimited" property;
wenzelm
parents: 55612
diff changeset
   399
74373
6e4093927dbb outer syntax: support for control-cartouche tokens;
wenzelm
parents: 74291
diff changeset
   400
val read_symbols = reader {opaque_warning = true} scan_ml_antiq;
69841
b3c9291b5fc7 clarified signature, notably for hol4isabelle (by Fabian Immler);
wenzelm
parents: 68823
diff changeset
   401
val read_symbols_sml = reader {opaque_warning = false} scan_sml_antiq;
b3c9291b5fc7 clarified signature, notably for hol4isabelle (by Fabian Immler);
wenzelm
parents: 68823
diff changeset
   402
24579
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   403
end;
852fc50927b1 Lexical syntax for SML.
wenzelm
parents:
diff changeset
   404
24596
f1333a841b26 removed obsolete Selector token;
wenzelm
parents: 24579
diff changeset
   405
end;