wenzelm@5825: (* Title: Pure/Isar/outer_lex.ML wenzelm@5825: ID: $Id$ wenzelm@5825: Author: Markus Wenzel, TU Muenchen wenzelm@5825: wenzelm@5825: Outer lexical syntax for Isabelle/Isar. wenzelm@5825: *) wenzelm@5825: wenzelm@5825: signature OUTER_LEX = wenzelm@5825: sig wenzelm@5825: datatype token_kind = wenzelm@7477: Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar | wenzelm@17164: Nat | String | AltString | Verbatim | Space | Comment | Sync | Malformed | EOF aspinall@15143: eqtype token wenzelm@5825: val str_of_kind: token_kind -> string wenzelm@5825: val stopper: token * (token -> bool) wenzelm@6859: val not_sync: token -> bool wenzelm@5825: val not_eof: token -> bool wenzelm@5825: val position_of: token -> Position.T wenzelm@5825: val pos_of: token -> string wenzelm@5825: val is_kind: token_kind -> token -> bool wenzelm@7026: val keyword_with: (string -> bool) -> token -> bool wenzelm@16029: val ident_with: (string -> bool) -> token -> bool wenzelm@5825: val name_of: token -> string wenzelm@5825: val is_proper: token -> bool wenzelm@9130: val is_semicolon: token -> bool wenzelm@17069: val is_comment: token -> bool wenzelm@8580: val is_begin_ignore: token -> bool wenzelm@8580: val is_end_ignore: token -> bool wenzelm@17069: val is_blank: token -> bool wenzelm@8651: val is_newline: token -> bool wenzelm@14991: val unparse: token -> string wenzelm@5825: val val_of: token -> string wenzelm@5876: val is_sid: string -> bool wenzelm@9130: val !!! : string -> (Position.T * 'a -> 'b) -> Position.T * 'a -> 'b wenzelm@9130: val incr_line: ('a -> 'b * 'c) -> Position.T * 'a -> 'b * (Position.T * 'c) wenzelm@9130: val keep_line: ('a -> 'b * 'c) -> Position.T * 'a -> 'b * (Position.T * 'c) wenzelm@9130: val scan_blank: Position.T * Symbol.symbol list wenzelm@9130: -> Symbol.symbol * (Position.T * Symbol.symbol list) wenzelm@9130: val scan_string: Position.T * Symbol.symbol list -> string * (Position.T * Symbol.symbol list) wenzelm@7026: val scan: (Scan.lexicon * Scan.lexicon) -> wenzelm@5825: Position.T * Symbol.symbol list -> token * (Position.T * Symbol.symbol list) wenzelm@7026: val source: bool -> (unit -> (Scan.lexicon * Scan.lexicon)) -> wenzelm@7026: Position.T -> (Symbol.symbol, 'a) Source.source -> wenzelm@7682: (token, Position.T * (Symbol.symbol, 'a) Source.source) Source.source wenzelm@17164: val source_proper: (token, 'a) Source.source -> wenzelm@17164: (token, (token, 'a) Source.source) Source.source wenzelm@9130: val make_lexicon: string list -> Scan.lexicon wenzelm@5825: end; wenzelm@5825: wenzelm@5825: structure OuterLex: OUTER_LEX = wenzelm@5825: struct wenzelm@5825: wenzelm@5825: wenzelm@5825: (** tokens **) wenzelm@5825: wenzelm@5825: (* datatype token *) wenzelm@5825: wenzelm@5825: datatype token_kind = wenzelm@7477: Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar | wenzelm@17164: Nat | String | AltString | Verbatim | Space | Comment | Sync | Malformed | EOF; wenzelm@5825: wenzelm@5825: datatype token = Token of Position.T * (token_kind * string); wenzelm@5825: wenzelm@5825: val str_of_kind = wenzelm@7026: fn Command => "command" wenzelm@7026: | Keyword => "keyword" wenzelm@5825: | Ident => "identifier" wenzelm@5825: | LongIdent => "long identifier" wenzelm@5825: | SymIdent => "symbolic identifier" wenzelm@5825: | Var => "schematic variable" wenzelm@5825: | TypeIdent => "type variable" wenzelm@5825: | TypeVar => "schematic type variable" wenzelm@5825: | Nat => "number" wenzelm@5825: | String => "string" wenzelm@17164: | AltString => "back-quoted string" wenzelm@5825: | Verbatim => "verbatim text" wenzelm@7682: | Space => "white space" wenzelm@7682: | Comment => "comment text" wenzelm@6859: | Sync => "sync marker" wenzelm@10748: | Malformed => "bad input" wenzelm@5825: | EOF => "end-of-file"; wenzelm@5825: wenzelm@5825: wenzelm@10748: (* control tokens *) wenzelm@6859: wenzelm@6859: fun not_sync (Token (_, (Sync, _))) = false wenzelm@6859: | not_sync _ = true; wenzelm@6859: wenzelm@10748: val malformed = Token (Position.none, (Malformed, "")); aspinall@15224: fun malformed_of xs = Token (Position.none, (Malformed, implode xs)); wenzelm@10748: wenzelm@6859: wenzelm@5825: (* eof token *) wenzelm@5825: wenzelm@5825: val eof = Token (Position.none, (EOF, "")); wenzelm@5825: wenzelm@5825: fun is_eof (Token (_, (EOF, _))) = true wenzelm@5825: | is_eof _ = false; wenzelm@5825: wenzelm@5825: val stopper = (eof, is_eof); wenzelm@5825: val not_eof = not o is_eof; wenzelm@5825: wenzelm@5825: wenzelm@5825: (* get position *) wenzelm@5825: wenzelm@5825: fun position_of (Token (pos, _)) = pos; wenzelm@5825: val pos_of = Position.str_of o position_of; wenzelm@5825: wenzelm@5825: wenzelm@5825: (* kind of token *) wenzelm@5825: wenzelm@5825: fun is_kind k (Token (_, (k', _))) = k = k'; wenzelm@5825: wenzelm@7026: fun keyword_with pred (Token (_, (Keyword, x))) = pred x wenzelm@7026: | keyword_with _ _ = false; wenzelm@5825: wenzelm@16029: fun ident_with pred (Token (_, (Ident, x))) = pred x wenzelm@16029: | ident_with _ _ = false; wenzelm@16029: wenzelm@7682: fun is_proper (Token (_, (Space, _))) = false wenzelm@7682: | is_proper (Token (_, (Comment, _))) = false wenzelm@5825: | is_proper _ = true; wenzelm@5825: wenzelm@9195: fun is_semicolon (Token (_, (Keyword, ";"))) = true wenzelm@9130: | is_semicolon _ = false; wenzelm@9130: wenzelm@17069: fun is_comment (Token (_, (Comment, _))) = true wenzelm@17069: | is_comment _ = false; wenzelm@17069: wenzelm@8580: fun is_begin_ignore (Token (_, (Comment, "<"))) = true wenzelm@8580: | is_begin_ignore _ = false; wenzelm@8580: wenzelm@8580: fun is_end_ignore (Token (_, (Comment, ">"))) = true wenzelm@8580: | is_end_ignore _ = false; wenzelm@8580: wenzelm@8651: wenzelm@17069: (* blanks and newlines -- space tokens obey lines *) wenzelm@8651: wenzelm@17069: fun is_blank (Token (_, (Space, s))) = not (String.isSuffix "\n" s) wenzelm@17069: | is_blank _ = false; wenzelm@17069: wenzelm@17069: fun is_newline (Token (_, (Space, s))) = String.isSuffix "\n" s wenzelm@8651: | is_newline _ = false; wenzelm@8651: wenzelm@5825: wenzelm@14991: (* token content *) wenzelm@9155: wenzelm@9155: fun name_of (tok as Token (_, (k, x))) = wenzelm@10748: if is_semicolon tok then "terminator" wenzelm@10748: else if x = "" then str_of_kind k wenzelm@10748: else str_of_kind k ^ " " ^ quote x; wenzelm@5825: wenzelm@18547: fun escape q = wenzelm@18547: implode o map (fn s => if s = q orelse s = "\\" then "\\" ^ s else s) o Symbol.explode; wenzelm@18547: wenzelm@14991: fun unparse (Token (_, (kind, x))) = wenzelm@14991: (case kind of wenzelm@18547: String => x |> quote o escape "\"" wenzelm@18547: | AltString => x |> enclose "`" "`" o escape "`" wenzelm@14991: | Verbatim => x |> enclose "{*" "*}" wenzelm@14991: | Comment => x |> enclose "(*" "*)" wenzelm@14991: | _ => x); wenzelm@14991: wenzelm@5825: fun val_of (Token (_, (_, x))) = x; wenzelm@5825: wenzelm@5825: fun token_leq (Token (_, (_, x)), Token (_, (_, x'))) = x <= x'; wenzelm@5825: wenzelm@5825: wenzelm@5825: wenzelm@5825: (** scanners **) wenzelm@5825: wenzelm@5825: fun change_prompt scan = Scan.prompt "# " scan; wenzelm@5825: wenzelm@5825: wenzelm@5825: (* diagnostics *) wenzelm@5825: wenzelm@5825: fun lex_err msg ((pos, cs), _) = "Outer lexical error" ^ Position.str_of pos ^ ": " ^ msg cs; wenzelm@9130: fun !!! msg scan = Scan.!! (lex_err (K msg)) scan; wenzelm@5825: wenzelm@5825: wenzelm@5825: (* line numbering *) wenzelm@5825: wenzelm@5825: fun incr_line scan = Scan.depend (fn pos => scan >> pair (Position.inc pos)); wenzelm@5825: val keep_line = Scan.lift; wenzelm@5825: wenzelm@5825: val scan_blank = wenzelm@5825: incr_line ($$ "\n") || wenzelm@5825: keep_line (Scan.one Symbol.is_blank); wenzelm@5825: wenzelm@5825: wenzelm@5825: (* scan symbolic idents *) wenzelm@5825: wenzelm@20664: val is_sym_char = member (op =) (explode "!#$%&*+-/<=>?@^_|~"); wenzelm@5825: wenzelm@8231: val scan_symid = wenzelm@21858: Scan.many1 is_sym_char >> implode || wenzelm@8231: Scan.one Symbol.is_symbolic; wenzelm@5825: wenzelm@8231: fun is_symid str = wenzelm@8231: (case try Symbol.explode str of skalberg@15531: SOME [s] => Symbol.is_symbolic s orelse is_sym_char s skalberg@15531: | SOME ss => forall is_sym_char ss wenzelm@8231: | _ => false); wenzelm@8231: wenzelm@20982: fun is_sid "begin" = false wenzelm@20982: | is_sid ":" = true wenzelm@22873: | is_sid "::" = true wenzelm@20982: | is_sid s = is_symid s orelse Syntax.is_identifier s; wenzelm@5825: wenzelm@5825: wenzelm@5825: (* scan strings *) wenzelm@5825: wenzelm@17164: local wenzelm@17164: wenzelm@17164: fun scan_str q = wenzelm@7682: scan_blank || wenzelm@9130: keep_line ($$ "\\") |-- !!! "bad escape character in string" wenzelm@17164: (scan_blank || keep_line ($$ q || $$ "\\")) || wenzelm@19305: keep_line (Scan.one (fn s => s <> q andalso s <> "\\" andalso wenzelm@19305: Symbol.not_sync s andalso Symbol.not_eof s)); wenzelm@5825: wenzelm@17164: fun scan_strs q = wenzelm@17164: keep_line ($$ q) |-- wenzelm@9130: !!! "missing quote at end of string" wenzelm@17164: (change_prompt ((Scan.repeat (scan_str q) >> implode) --| keep_line ($$ q))); wenzelm@17164: wenzelm@17164: in wenzelm@17164: wenzelm@17164: val scan_string = scan_strs "\""; wenzelm@17164: val scan_alt_string = scan_strs "`"; wenzelm@17164: wenzelm@17164: end; wenzelm@5825: wenzelm@5825: wenzelm@5825: (* scan verbatim text *) wenzelm@5825: wenzelm@5825: val scan_verb = wenzelm@5825: scan_blank || wenzelm@19305: keep_line ($$ "*" --| Scan.ahead (~$$ "}")) || wenzelm@19305: keep_line (Scan.one (fn s => s <> "*" andalso Symbol.not_sync s andalso Symbol.not_eof s)); wenzelm@5825: wenzelm@5825: val scan_verbatim = wenzelm@6743: keep_line ($$ "{" -- $$ "*") |-- wenzelm@9130: !!! "missing end of verbatim text" wenzelm@6743: (change_prompt ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "*" -- $$ "}"))); wenzelm@5825: wenzelm@5825: wenzelm@5825: (* scan space *) wenzelm@5825: wenzelm@19305: fun is_space s = Symbol.is_blank s andalso s <> "\n"; wenzelm@5825: wenzelm@5825: val scan_space = wenzelm@21858: (keep_line (Scan.many1 is_space) -- Scan.optional (incr_line ($$ "\n")) "" || wenzelm@21858: keep_line (Scan.many is_space) -- incr_line ($$ "\n")) >> (fn (cs, c) => implode cs ^ c); wenzelm@5825: wenzelm@5825: wenzelm@5825: (* scan nested comments *) wenzelm@5825: wenzelm@5825: val scan_cmt = wenzelm@5825: Scan.lift scan_blank || wenzelm@5825: Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) || wenzelm@5825: Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) || wenzelm@19305: Scan.lift (keep_line ($$ "*" --| Scan.ahead (~$$ ")"))) || wenzelm@19305: Scan.lift (keep_line (Scan.one (fn s => wenzelm@19305: s <> "*" andalso Symbol.not_sync s andalso Symbol.not_eof s))); wenzelm@5825: wenzelm@5825: val scan_comment = wenzelm@5825: keep_line ($$ "(" -- $$ "*") |-- wenzelm@9130: !!! "missing end of comment" wenzelm@5825: (change_prompt wenzelm@7682: (Scan.pass 0 (Scan.repeat scan_cmt >> implode) --| keep_line ($$ "*" -- $$ ")"))); wenzelm@5825: wenzelm@5825: wenzelm@5825: (* scan token *) wenzelm@5825: wenzelm@9130: fun scan (lex1, lex2) = wenzelm@5825: let wenzelm@9130: val scanner = Scan.state :-- (fn pos => wenzelm@9130: let wenzelm@9130: fun token k x = Token (pos, (k, x)); wenzelm@9130: fun sync _ = token Sync Symbol.sync; wenzelm@9130: in wenzelm@9130: scan_string >> token String || wenzelm@17164: scan_alt_string >> token AltString || wenzelm@9130: scan_verbatim >> token Verbatim || wenzelm@9130: scan_space >> token Space || wenzelm@9130: scan_comment >> token Comment || wenzelm@9130: keep_line (Scan.one Symbol.is_sync >> sync) || wenzelm@9130: keep_line (Scan.max token_leq wenzelm@9130: (Scan.max token_leq wenzelm@9130: (Scan.literal lex1 >> (token Keyword o implode)) wenzelm@9130: (Scan.literal lex2 >> (token Command o implode))) wenzelm@9130: (Syntax.scan_longid >> token LongIdent || wenzelm@9130: Syntax.scan_id >> token Ident || wenzelm@9130: Syntax.scan_var >> token Var || wenzelm@9130: Syntax.scan_tid >> token TypeIdent || wenzelm@9130: Syntax.scan_tvar >> token TypeVar || wenzelm@9130: Syntax.scan_nat >> token Nat || wenzelm@9130: scan_symid >> token SymIdent)) wenzelm@9130: end) >> #2; wenzelm@14729: in !! (lex_err (fn cs => "bad input " ^ quote (Symbol.beginning 10 cs))) scanner end; wenzelm@5825: wenzelm@5825: wenzelm@9130: (* token sources *) wenzelm@5825: wenzelm@6859: val is_junk = (not o Symbol.is_blank) andf Symbol.not_sync andf Symbol.not_eof; wenzelm@21858: fun recover xs = (keep_line (Scan.many is_junk) >> (fn ts => [malformed_of ts])) xs; wenzelm@5825: wenzelm@5825: fun source do_recover get_lex pos src = wenzelm@5825: Source.source' pos Symbol.stopper (Scan.bulk (fn xs => scan (get_lex ()) xs)) skalberg@15531: (if do_recover then SOME recover else NONE) src; wenzelm@5825: wenzelm@9130: fun source_proper src = src |> Source.filter is_proper; wenzelm@9130: wenzelm@9130: wenzelm@9130: (* lexicons *) wenzelm@9130: wenzelm@9130: val make_lexicon = Scan.make_lexicon o map Symbol.explode; wenzelm@5825: wenzelm@5825: end;