src/Pure/Thy/thy_scan.ML
changeset 4705 f71017706887
parent 3831 45e2e7ba31b8
child 4921 74bc10921f7d
equal deleted inserted replaced
4704:4fce39cc7136 4705:f71017706887
     1 (*  Title:      Pure/Thy/thy_scan.ML
     1 (*  Title:	Pure/Thy/thy_scan.ML
     2     ID:         $Id$
     2     ID:		$Id$
     3     Author:     Markus Wenzel, TU Muenchen
     3     Author:	Markus Wenzel, TU Muenchen
     4 
     4 
     5 The scanner for theory files.
     5 Lexer for the outer Isabelle syntax.
       
     6 
       
     7 TODO:
       
     8   - old vs. new: interpreted strings, no 'ML', var!?;
     6 *)
     9 *)
     7 
    10 
     8 signature THY_SCAN =
    11 signature THY_SCAN =
     9   sig
    12 sig
    10   datatype token_kind =
    13   datatype token_kind =
    11     Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF
    14     Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF
    12   val name_of_kind: token_kind -> string
    15   val name_of_kind: token_kind -> string
    13   type lexicon
    16   val tokenize: Scan.lexicon -> string -> (token_kind * string * int) list
    14   val make_lexicon: string list -> lexicon
    17 end;
    15   val tokenize: lexicon -> string -> (token_kind * string * int) list
       
    16   end;
       
    17 
    18 
    18 
    19 structure ThyScan: THY_SCAN =
    19 structure ThyScan : THY_SCAN =
       
    20 struct
    20 struct
    21 
       
    22 open Scanner;
       
    23 
    21 
    24 
    22 
    25 (** token kinds **)
    23 (** token kinds **)
    26 
    24 
    27 datatype token_kind =
    25 datatype token_kind =
    28   Keyword | Ident | LongIdent | TypeVar | Nat | String | Verbatim | EOF;
    26   Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF;
    29 
    27 
    30 fun name_of_kind Keyword = "keyword"
    28 fun name_of_kind Keyword = "keyword"
    31   | name_of_kind Ident = "identifier"
    29   | name_of_kind Ident = "identifier"
    32   | name_of_kind LongIdent = "long identifier"
    30   | name_of_kind LongIdent = "long identifier"
       
    31   | name_of_kind Var = "schematic variable"
    33   | name_of_kind TypeVar = "type variable"
    32   | name_of_kind TypeVar = "type variable"
    34   | name_of_kind Nat = "natural number"
    33   | name_of_kind Nat = "natural number"
    35   | name_of_kind String = "string"
    34   | name_of_kind String = "string"
    36   | name_of_kind Verbatim = "verbatim text"
    35   | name_of_kind Verbatim = "verbatim text"
       
    36   | name_of_kind Ignore = "ignore"
    37   | name_of_kind EOF = "end-of-file";
    37   | name_of_kind EOF = "end-of-file";
    38 
    38 
    39 
    39 
    40 
    40 
    41 (** scanners **)
    41 (** scanners **)
    42 
    42 
    43 fun lex_err line msg =
    43 (* diagnostics *)
    44   error ("Lexical error on line " ^ string_of_int line ^ ": " ^ msg);
    44 
       
    45 fun lex_err None msg = "Outer lexical error: " ^ msg
       
    46   | lex_err (Some n) msg = "Outer lexical error on line " ^ string_of_int n ^ ": " ^ msg;
    45 
    47 
    46 
    48 
    47 (* misc utilities *)
    49 (* line numbering *)
    48 
    50 
    49 fun count_lines cs =
    51 val incr = apsome (fn n:int => n + 1);
    50   foldl (fn (sum, c) => if c = "\n" then sum + 1 else sum) (0, cs);
       
    51 
    52 
    52 fun inc_line n s = n + count_lines (explode s);
    53 fun incr_line scan = Scan.depend (fn n => scan >> pair (incr n));
       
    54 val keep_line = Scan.lift;
    53 
    55 
    54 fun cons_fst c (cs, x, y) = (c :: cs, x, y);
    56 val scan_blank =
       
    57   incr_line ($$ "\n") ||
       
    58   keep_line (Scan.one Symbol.is_blank);
    55 
    59 
    56 
    60 
    57 (* scan strings *)
    61 (* scan ML-style strings *)
    58 
    62 
    59 val scan_ctrl = $$ "^" ^^ scan_one (fn c => ord c >= 64 andalso ord c <= 95);
    63 val scan_ctrl =
       
    64   $$ "^" ^^ Scan.one (fn c => Symbol.is_ascii c andalso ord c >= 64 andalso ord c <= 95);
    60 
    65 
    61 val scan_dig = scan_one is_digit;
    66 val scan_dig = Scan.one Symbol.is_digit;
    62 
       
    63 val scan_blanks1 = scan_any1 is_blank >> implode;
       
    64 
    67 
    65 val scan_esc =
    68 val scan_esc =
    66   $$ "n" || $$ "t" || $$ "\"" || $$ "\\" ||
    69   keep_line ($$ "\\") ^^ !! (fn (n, _) => lex_err n "bad escape sequence in string")
    67   scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig ||
    70     (keep_line ($$ "n" || $$ "t" || $$ "\"" || $$ "\\" ||
    68   scan_blanks1 ^^ $$ "\\";
    71       scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig) ||
       
    72       (Scan.repeat1 scan_blank >> implode) ^^ keep_line ($$ "\\"));
    69 
    73 
    70 fun string ("\"" :: cs) n = (["\""], cs, n)
    74 val scan_str =
    71   | string ("\\" :: cs) n =
    75   scan_esc ||
    72       (case optional scan_esc cs of
    76   scan_blank >> K " " ||
    73         (None, _) => lex_err n "bad escape sequence in string"
    77   keep_line (Scan.one (not_equal "\"" andf Symbol.not_eof));
    74       | (Some s, cs') => cons_fst ("\\" ^ s) (string cs' (inc_line n s)))
    78 
    75   | string ("\n" :: cs) n = cons_fst " " (string cs (n + 1))
    79 val scan_string =
    76   | string (c :: cs) n = 
    80   keep_line ($$ "\"") ^^
    77       if is_blank c then cons_fst " " (string cs n)
    81   !! (fn (n, _) => lex_err n "missing quote at end of string")
    78       else cons_fst c (string cs n)
    82     ((Scan.repeat scan_str >> implode) ^^ keep_line ($$ "\""));
    79   | string [] n = lex_err n "missing quote at end of string";
       
    80 
    83 
    81 
    84 
    82 (* scan verbatim text *)
    85 (* scan verbatim text *)
    83 
    86 
    84 fun verbatim ("|" :: "}" :: cs) n = ([], cs, n)
    87 val scan_verb =
    85   | verbatim (c :: cs) n = cons_fst c (verbatim cs (inc_line n c))
    88   scan_blank ||
    86   | verbatim [] n = lex_err n "missing end of verbatim text";
    89   keep_line ($$ "|" --| Scan.ahead (Scan.one (not_equal "}"))) ||
       
    90   keep_line (Scan.one (not_equal "|" andf Symbol.not_eof));
       
    91 
       
    92 val scan_verbatim =
       
    93   keep_line ($$ "{" -- $$ "|") |--
       
    94   !! (fn (n, _) => lex_err n "missing end of verbatim text")
       
    95     ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "|" -- $$ "}"));
    87 
    96 
    88 
    97 
    89 (* scan nested comments *)
    98 (* scan nested comments *)
    90 
    99 
    91 fun comment ("*" :: ")" :: cs) n 1 = (cs, n)
   100 val scan_cmt =
    92   | comment ("*" :: ")" :: cs) n d = comment cs n (d - 1)
   101   Scan.lift scan_blank ||
    93   | comment ("(" :: "*" :: cs) n d = comment cs n (d + 1)
   102   Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
    94   | comment (c :: cs) n d = comment cs (inc_line n c) d
   103   Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
    95   | comment [] n _ = lex_err n "missing end of comment";
   104   Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) ||
       
   105   Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_eof)));
       
   106 
       
   107 val scan_comment =
       
   108   keep_line ($$ "(" -- $$ "*") |--
       
   109   !! (fn (n, _) => lex_err n "missing end of comment")
       
   110     (Scan.pass 0 (Scan.repeat scan_cmt) |-- keep_line ($$ "*" -- $$ ")") >> K "");
    96 
   111 
    97 
   112 
       
   113 (* scan token *)
    98 
   114 
    99 (** tokenize **)
   115 fun token k None x = (k, x, 0)
       
   116   | token k (Some n) x = (k, x, n);
   100 
   117 
   101 fun scan_word lex =
   118 fun scan_tok lex (n, cs) =
   102   max_of
   119  (scan_string >> token String n ||
   103     (scan_literal lex >> pair Keyword)
   120   scan_verbatim >> token Verbatim n ||
   104     (scan_longid >> pair LongIdent ||
   121   Scan.repeat1 scan_blank >> (token Ignore n o implode) ||
   105       scan_id >> pair Ident ||
   122   scan_comment >> token Ignore n ||
   106       scan_tid >> pair TypeVar ||
   123   keep_line (Scan.max (fn ((_, x, _), (_, x', _)) => x <= x')
   107       scan_nat >> pair Nat);
   124     (Scan.literal lex >> (token Keyword n o implode))
       
   125     (Syntax.scan_longid >> token LongIdent n ||
       
   126       Syntax.scan_id >> token Ident n ||
       
   127       Syntax.scan_var >> token Var n ||
       
   128       Syntax.scan_tid >> token TypeVar n ||
       
   129       Syntax.scan_nat >> token Nat n))) (n, cs);
   108 
   130 
   109 fun add_tok toks kind n (cs, cs', n') =
   131 val scan_rest = Scan.any Symbol.not_eof >> implode;
   110   ((kind, implode cs, n) :: toks, cs', n');
       
   111 
   132 
   112 val take_line = implode o fst o take_prefix (not_equal "\n");
   133 fun scan_token lex x =
       
   134   (case scan_tok lex x of
       
   135     ((Keyword, "ML", n), x') => (keep_line scan_rest >> token Verbatim (Some n)) x'
       
   136   | tok_x' => tok_x');
   113 
   137 
       
   138 
       
   139 (* tokenize *)
   114 
   140 
   115 fun tokenize lex str =
   141 fun tokenize lex str =
   116   let
   142   let
   117     fun scan (toks, [], n) = rev ((EOF, "end-of-file", n) :: toks)
   143     val chs = explode str;	(*sic!*)
   118       | scan (toks, "\"" :: cs, n) =
   144     val scan_toks = Scan.repeat (scan_token lex);
   119           scan (add_tok toks String n (cons_fst "\"" (string cs n)))
   145     val ignore = fn (Ignore, _, _) => true | _ => false;
   120       | scan (toks, "{" :: "|" :: cs, n) =
       
   121           scan (add_tok toks Verbatim n (verbatim cs n))
       
   122       | scan (toks, "(" :: "*" :: cs, n) =
       
   123           scan ((fn (cs', n') => (toks, cs', n')) (comment cs n 1))
       
   124       | scan (toks, chs as c :: cs, n) =
       
   125           if is_blank c then scan (toks, cs, inc_line n c)
       
   126           else
       
   127             (case scan_word lex chs of
       
   128               (None, _) => lex_err n (quote (take_line chs))
       
   129             | (Some (Keyword, "ML"), chs') => scan ((Verbatim, implode chs', n)
       
   130                 :: (Keyword, "ML", n) :: toks, [], n + count_lines chs')
       
   131             | (Some (tk, s), chs') => scan ((tk, s, n) :: toks, chs', n));
       
   132   in
   146   in
   133     scan ([], explode str, 1)
   147     (case Scan.error (Scan.finite' Symbol.eof scan_toks) (Some 1, chs) of
       
   148       (toks, (n, [])) => filter_out ignore toks @ [token EOF n "end-of-file"]
       
   149     | (_, (n, cs)) => error (lex_err n ("Bad input " ^ quote (beginning cs))))
   134   end;
   150   end;
   135 
   151 
   136 
   152 
   137 end;
   153 end;
   138