src/Pure/ML/ml_lex.ML
author wenzelm
Sat Nov 04 15:24:40 2017 +0100 (20 months ago)
changeset 67003 49850a679c2c
parent 66067 cdbcb417db67
child 67362 221612c942de
permissions -rw-r--r--
more robust sorted_entries;
     1 (*  Title:      Pure/ML/ml_lex.ML
     2     Author:     Makarius
     3 
     4 Lexical syntax for Isabelle/ML and Standard ML.
     5 *)
     6 
     7 signature ML_LEX =
     8 sig
     9   val keywords: string list
    10   datatype token_kind =
    11     Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String |
    12     Space | Comment | Error of string | EOF
    13   eqtype token
    14   val stopper: token Scan.stopper
    15   val is_regular: token -> bool
    16   val is_improper: token -> bool
    17   val set_range: Position.range -> token -> token
    18   val range_of: token -> Position.range
    19   val pos_of: token -> Position.T
    20   val end_pos_of: token -> Position.T
    21   val kind_of: token -> token_kind
    22   val content_of: token -> string
    23   val check_content_of: token -> string
    24   val flatten: token list -> string
    25   val source: (Symbol.symbol, 'a) Source.source ->
    26     (token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source)
    27       Source.source) Source.source
    28   val tokenize: string -> token list
    29   val read_pos: Position.T -> Symbol_Pos.text -> token Antiquote.antiquote list
    30   val read: Symbol_Pos.text -> token Antiquote.antiquote list
    31   val read_set_range: Position.range -> Symbol_Pos.text -> token Antiquote.antiquote list
    32   val read_source: bool -> Input.source -> token Antiquote.antiquote list
    33 end;
    34 
    35 structure ML_Lex: ML_LEX =
    36 struct
    37 
    38 (** keywords **)
    39 
    40 val keywords =
    41  ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>", "[",
    42   "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as", "case",
    43   "datatype", "do", "else", "end", "eqtype", "exception", "fn", "fun",
    44   "functor", "handle", "if", "in", "include", "infix", "infixr",
    45   "let", "local", "nonfix", "of", "op", "open", "orelse", "raise",
    46   "rec", "sharing", "sig", "signature", "struct", "structure", "then",
    47   "type", "val", "where", "while", "with", "withtype"];
    48 
    49 val keywords2 =
    50  ["and", "case", "do", "else", "end", "if", "in", "let", "local", "of",
    51   "sig", "struct", "then", "while", "with"];
    52 
    53 val keywords3 =
    54  ["handle", "open", "raise"];
    55 
    56 val lexicon = Scan.make_lexicon (map raw_explode keywords);
    57 
    58 
    59 
    60 (** tokens **)
    61 
    62 (* datatype token *)
    63 
    64 datatype token_kind =
    65   Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String |
    66   Space | Comment | Error of string | EOF;
    67 
    68 datatype token = Token of Position.range * (token_kind * string);
    69 
    70 
    71 (* position *)
    72 
    73 fun set_range range (Token (_, x)) = Token (range, x);
    74 fun range_of (Token (range, _)) = range;
    75 
    76 val pos_of = #1 o range_of;
    77 val end_pos_of = #2 o range_of;
    78 
    79 
    80 (* stopper *)
    81 
    82 fun mk_eof pos = Token ((pos, Position.none), (EOF, ""));
    83 val eof = mk_eof Position.none;
    84 
    85 fun is_eof (Token (_, (EOF, _))) = true
    86   | is_eof _ = false;
    87 
    88 val stopper =
    89   Scan.stopper (fn [] => eof | toks => mk_eof (end_pos_of (List.last toks))) is_eof;
    90 
    91 
    92 (* token content *)
    93 
    94 fun kind_of (Token (_, (k, _))) = k;
    95 
    96 fun content_of (Token (_, (_, x))) = x;
    97 fun token_leq (tok, tok') = content_of tok <= content_of tok';
    98 
    99 fun is_keyword (Token (_, (Keyword, _))) = true
   100   | is_keyword _ = false;
   101 
   102 fun is_delimiter (Token (_, (Keyword, x))) = not (Symbol.is_ascii_identifier x)
   103   | is_delimiter _ = false;
   104 
   105 fun is_regular (Token (_, (Error _, _))) = false
   106   | is_regular (Token (_, (EOF, _))) = false
   107   | is_regular _ = true;
   108 
   109 fun is_improper (Token (_, (Space, _))) = true
   110   | is_improper (Token (_, (Comment, _))) = true
   111   | is_improper _ = false;
   112 
   113 fun warn tok =
   114   (case tok of
   115     Token (_, (Keyword, ":>")) =>
   116       warning ("Opaque signature matching (:>) fails to work with ML pretty printing --\n\
   117         \prefer non-opaque matching (:) possibly with abstype" ^
   118         Position.here (pos_of tok))
   119   | _ => ());
   120 
   121 fun check_content_of tok =
   122   (case kind_of tok of
   123     Error msg => error msg
   124   | _ => content_of tok);
   125 
   126 
   127 (* flatten *)
   128 
   129 fun flatten_content (tok :: (toks as tok' :: _)) =
   130       Symbol.escape (check_content_of tok) ::
   131         (if is_improper tok orelse is_improper tok' then flatten_content toks
   132          else Symbol.space :: flatten_content toks)
   133   | flatten_content toks = map (Symbol.escape o check_content_of) toks;
   134 
   135 val flatten = implode o flatten_content;
   136 
   137 
   138 (* markup *)
   139 
   140 local
   141 
   142 fun token_kind_markup SML =
   143  fn Type_Var => (Markup.ML_tvar, "")
   144   | Word => (Markup.ML_numeral, "")
   145   | Int => (Markup.ML_numeral, "")
   146   | Real => (Markup.ML_numeral, "")
   147   | Char => (Markup.ML_char, "")
   148   | String => (if SML then Markup.SML_string else Markup.ML_string, "")
   149   | Comment => (if SML then Markup.SML_comment else Markup.ML_comment, "")
   150   | Error msg => (Markup.bad (), msg)
   151   | _ => (Markup.empty, "");
   152 
   153 in
   154 
   155 fun token_report SML (tok as Token ((pos, _), (kind, x))) =
   156   let
   157     val (markup, txt) =
   158       if not (is_keyword tok) then token_kind_markup SML kind
   159       else if is_delimiter tok then (Markup.ML_delimiter, "")
   160       else if member (op =) keywords2 x then (Markup.ML_keyword2 |> Markup.keyword_properties, "")
   161       else if member (op =) keywords3 x then (Markup.ML_keyword3 |> Markup.keyword_properties, "")
   162       else (Markup.ML_keyword1 |> Markup.keyword_properties, "");
   163   in ((pos, markup), txt) end;
   164 
   165 end;
   166 
   167 
   168 
   169 (** scanners **)
   170 
   171 open Basic_Symbol_Pos;
   172 
   173 val err_prefix = "SML lexical error: ";
   174 
   175 fun !!! msg = Symbol_Pos.!!! (fn () => err_prefix ^ msg);
   176 
   177 
   178 (* identifiers *)
   179 
   180 local
   181 
   182 val scan_letdigs =
   183   Scan.many (Symbol.is_ascii_letdig o Symbol_Pos.symbol);
   184 
   185 val scan_alphanumeric =
   186   Scan.one (Symbol.is_ascii_letter o Symbol_Pos.symbol) ::: scan_letdigs;
   187 
   188 val scan_symbolic =
   189   Scan.many1 (member (op =) (raw_explode "!#$%&*+-/:<=>?@\\^`|~") o Symbol_Pos.symbol);
   190 
   191 in
   192 
   193 val scan_ident = scan_alphanumeric || scan_symbolic;
   194 
   195 val scan_long_ident =
   196   Scan.repeats1 (scan_alphanumeric @@@ $$$ ".") @@@ (scan_ident || $$$ "=");
   197 
   198 val scan_type_var = $$$ "'" @@@ scan_letdigs;
   199 
   200 end;
   201 
   202 
   203 (* numerals *)
   204 
   205 local
   206 
   207 val scan_dec = Scan.many1 (Symbol.is_ascii_digit o Symbol_Pos.symbol);
   208 val scan_hex = Scan.many1 (Symbol.is_ascii_hex o Symbol_Pos.symbol);
   209 val scan_sign = Scan.optional ($$$ "~") [];
   210 val scan_decint = scan_sign @@@ scan_dec;
   211 val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint;
   212 
   213 in
   214 
   215 val scan_word =
   216   $$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex ||
   217   $$$ "0" @@@ $$$ "w" @@@ scan_dec;
   218 
   219 val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec);
   220 
   221 val scan_rat = scan_decint @@@ Scan.optional ($$$ "/" @@@ scan_dec) [];
   222 
   223 val scan_real =
   224   scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] ||
   225   scan_decint @@@ scan_exp;
   226 
   227 end;
   228 
   229 
   230 (* chars and strings *)
   231 
   232 val scan_blanks1 = Scan.many1 (Symbol.is_ascii_blank o Symbol_Pos.symbol);
   233 
   234 local
   235 
   236 val scan_escape =
   237   Scan.one (member (op =) (raw_explode "\"\\abtnvfr") o Symbol_Pos.symbol) >> single ||
   238   $$$ "^" @@@
   239     (Scan.one (fn (s, _) => Char.ord #"@" <= ord s andalso ord s <= Char.ord #"_") >> single) ||
   240   Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) --
   241     Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) --
   242     Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) >> (fn ((a, b), c) => [a, b, c]);
   243 
   244 val scan_str =
   245   Scan.one (fn (s, _) => Symbol.not_eof s andalso s <> "\"" andalso s <> "\\" andalso
   246     (not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single ||
   247   $$$ "\\" @@@ !!! "bad escape character in string" scan_escape;
   248 
   249 val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\";
   250 val scan_gaps = Scan.repeats scan_gap;
   251 
   252 in
   253 
   254 val scan_char =
   255   $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\"";
   256 
   257 val recover_char =
   258   $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ Scan.optional (Scan.permissive scan_str @@@ scan_gaps) [];
   259 
   260 val scan_string =
   261   Scan.ahead ($$ "\"") |--
   262     !!! "unclosed string literal"
   263       ($$$ "\"" @@@ Scan.repeats (scan_gap || scan_str) @@@ $$$ "\"");
   264 
   265 val recover_string =
   266   $$$ "\"" @@@ Scan.repeats (scan_gap || Scan.permissive scan_str);
   267 
   268 end;
   269 
   270 
   271 (* rat antiquotation *)
   272 
   273 val rat_name = Symbol_Pos.explode ("Pure.rat ", Position.none);
   274 
   275 val scan_rat_antiq =
   276   Symbol_Pos.scan_pos -- ($$ "@" |-- Symbol_Pos.scan_pos -- scan_rat) -- Symbol_Pos.scan_pos
   277     >> (fn ((pos1, (pos2, body)), pos3) =>
   278       {start = Position.range_position (pos1, pos2),
   279        stop = Position.none,
   280        range = Position.range (pos1, pos3),
   281        body = rat_name @ body});
   282 
   283 
   284 (* scan tokens *)
   285 
   286 local
   287 
   288 fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss));
   289 
   290 val scan_ml =
   291  (scan_char >> token Char ||
   292   scan_string >> token String ||
   293   scan_blanks1 >> token Space ||
   294   Symbol_Pos.scan_comment err_prefix >> token Comment ||
   295   Scan.max token_leq
   296    (Scan.literal lexicon >> token Keyword)
   297    (scan_word >> token Word ||
   298     scan_real >> token Real ||
   299     scan_int >> token Int ||
   300     scan_long_ident >> token Long_Ident ||
   301     scan_ident >> token Ident ||
   302     scan_type_var >> token Type_Var));
   303 
   304 val scan_sml = scan_ml >> Antiquote.Text;
   305 
   306 val scan_ml_antiq =
   307   Antiquote.scan_control >> Antiquote.Control ||
   308   Antiquote.scan_antiq >> Antiquote.Antiq ||
   309   scan_rat_antiq >> Antiquote.Antiq ||
   310   scan_ml >> Antiquote.Text;
   311 
   312 fun recover msg =
   313  (recover_char ||
   314   recover_string ||
   315   Symbol_Pos.recover_cartouche ||
   316   Symbol_Pos.recover_comment ||
   317   Scan.one (Symbol.not_eof o Symbol_Pos.symbol) >> single)
   318   >> (single o token (Error msg));
   319 
   320 fun gen_read SML pos text =
   321   let
   322     val syms =
   323       Symbol_Pos.explode (text, pos)
   324       |> SML ? maps (fn (s, p) => raw_explode s |> map (rpair p));
   325 
   326     val termination =
   327       if null syms then []
   328       else
   329         let
   330           val pos1 = List.last syms |-> Position.advance;
   331           val pos2 = Position.advance Symbol.space pos1;
   332         in [Antiquote.Text (Token (Position.range (pos1, pos2), (Space, Symbol.space)))] end;
   333 
   334     val scan = if SML then scan_sml else scan_ml_antiq;
   335     fun check (Antiquote.Text tok) = (check_content_of tok; if SML then () else warn tok)
   336       | check _ = ();
   337     val input =
   338       Source.of_list syms
   339       |> Source.source Symbol_Pos.stopper
   340         (Scan.recover (Scan.bulk (!!! "bad input" scan))
   341           (fn msg => recover msg >> map Antiquote.Text))
   342       |> Source.exhaust;
   343     val _ = Position.reports (Antiquote.antiq_reports input);
   344     val _ =
   345       Position.reports_text (maps (fn Antiquote.Text t => [token_report SML t] | _ => []) input);
   346     val _ = List.app check input;
   347   in input @ termination end;
   348 
   349 in
   350 
   351 fun source src =
   352   Symbol_Pos.source (Position.line 1) src
   353   |> Source.source Symbol_Pos.stopper (Scan.recover (Scan.bulk (!!! "bad input" scan_ml)) recover);
   354 
   355 val tokenize = Symbol.explode #> Source.of_list #> source #> Source.exhaust;
   356 
   357 val read_pos = gen_read false;
   358 
   359 val read = read_pos Position.none;
   360 
   361 fun read_set_range range =
   362   read #> map (fn Antiquote.Text tok => Antiquote.Text (set_range range tok) | antiq => antiq);
   363 
   364 fun read_source SML source =
   365   let
   366     val pos = Input.pos_of source;
   367     val language = if SML then Markup.language_SML else Markup.language_ML;
   368     val _ =
   369       if Position.is_reported_range pos
   370       then Position.report pos (language (Input.is_delimited source))
   371       else ();
   372   in gen_read SML pos (Input.text_of source) end;
   373 
   374 end;
   375 
   376 end;