# HG changeset patch # User wenzelm # Date 1289672513 -3600 # Node ID 1050315f6ee268dd894580b6d80f4a7b59e8610f # Parent 37b79d789d91ba0f8e5bc185899dfdbd9104150d simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.); allow malformed symbols inside quoted material, comments etc. -- for improved user experience with incremental re-parsing; refined treatment of malformed surrogates (Scala); diff -r 37b79d789d91 -r 1050315f6ee2 src/Pure/General/scan.scala --- a/src/Pure/General/scan.scala Sat Nov 13 16:46:00 2010 +0100 +++ b/src/Pure/General/scan.scala Sat Nov 13 19:21:53 2010 +0100 @@ -1,7 +1,7 @@ /* Title: Pure/General/scan.scala Author: Makarius -Efficient scanning of keywords. +Efficient scanning of keywords and tokens. */ package isabelle @@ -181,8 +181,7 @@ private def quoted(quote: String): Parser[String] = { quote ~ - rep(many1(sym => sym != quote && sym != "\\" && Symbol.is_wellformed(sym)) | - "\\" + quote | "\\\\" | + rep(many1(sym => sym != quote && sym != "\\") | "\\" + quote | "\\\\" | (("""\\\d\d\d""".r) ^? { case x if x.substring(1, 4).toInt <= 255 => x })) ~ quote ^^ { case x ~ ys ~ z => x + ys.mkString + z } }.named("quoted") @@ -193,7 +192,7 @@ val body = source.substring(1, source.length - 1) if (body.exists(_ == '\\')) { val content = - rep(many1(sym => sym != quote && sym != "\\" && Symbol.is_wellformed(sym)) | + rep(many1(sym => sym != quote && sym != "\\") | "\\" ~> (quote | "\\" | """\d\d\d""".r ^^ { case x => x.toInt.toChar.toString })) parseAll(content ^^ (_.mkString), body).get } @@ -205,7 +204,7 @@ private def verbatim: Parser[String] = { - "{*" ~ rep(many1(sym => sym != "*" && Symbol.is_wellformed(sym)) | """\*(?!\})""".r) ~ "*}" ^^ + "{*" ~ rep(many1(sym => sym != "*") | """\*(?!\})""".r) ~ "*}" ^^ { case x ~ ys ~ z => x + ys.mkString + z } }.named("verbatim") @@ -221,8 +220,7 @@ def comment: Parser[String] = new Parser[String] { val comment_text = - rep(many1(sym => sym != "*" && sym != "(" && Symbol.is_wellformed(sym)) | - """\*(?!\))|\((?!\*)""".r) + rep(many1(sym => sym != "*" && sym != "(") | """\*(?!\))|\((?!\*)""".r) val comment_open = "(*" ~ comment_text ^^^ () val comment_close = comment_text ~ "*)" ^^^ () @@ -277,8 +275,7 @@ ("-" ~ natdot ^^ { case x ~ y => x + y } | natdot) ^^ (x => Token(Token.Kind.FLOAT, x)) val sym_ident = - (many1(symbols.is_symbolic_char) | - one(sym => symbols.is_symbolic(sym) & Symbol.is_wellformed(sym))) ^^ + (many1(symbols.is_symbolic_char) | one(sym => symbols.is_symbolic(sym))) ^^ (x => Token(Token.Kind.SYM_IDENT, x)) val space = many1(symbols.is_blank) ^^ (x => Token(Token.Kind.SPACE, x)) diff -r 37b79d789d91 -r 1050315f6ee2 src/Pure/General/symbol.ML --- a/src/Pure/General/symbol.ML Sat Nov 13 16:46:00 2010 +0100 +++ b/src/Pure/General/symbol.ML Sat Nov 13 19:21:53 2010 +0100 @@ -24,10 +24,9 @@ val stopper: symbol Scan.stopper val sync: symbol val is_sync: symbol -> bool - val malformed: symbol - val end_malformed: symbol - val separate_chars: string -> string val is_regular: symbol -> bool + val is_malformed: symbol -> bool + val malformed_msg: symbol -> string val is_ascii: symbol -> bool val is_ascii_letter: symbol -> bool val is_ascii_digit: symbol -> bool @@ -42,7 +41,9 @@ val is_raw: symbol -> bool val decode_raw: symbol -> string val encode_raw: string -> string - datatype sym = Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string + datatype sym = + Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string | + Malformed of string val decode: symbol -> sym datatype kind = Letter | Digit | Quasi | Blank | Other val kind: symbol -> kind @@ -56,8 +57,7 @@ val beginning: int -> symbol list -> string val scanner: string -> (string list -> 'a * string list) -> symbol list -> 'a val scan_id: string list -> string * string list - val source: {do_recover: bool} -> (string, 'a) Source.source -> - (symbol, (string, 'a) Source.source) Source.source + val source: (string, 'a) Source.source -> (symbol, (string, 'a) Source.source) Source.source val explode: string -> symbol list val esc: symbol -> string val escape: string -> string @@ -111,11 +111,11 @@ fun is_utf8 s = size s > 0 andalso forall_string (fn c => ord c >= 128) s; fun is_symbolic s = - String.isPrefix "\\<" s andalso not (String.isPrefix "\\<^" s); + String.isPrefix "\\<" s andalso String.isSuffix ">" s andalso not (String.isPrefix "\\<^" s); fun is_printable s = if is_char s then ord space <= ord s andalso ord s <= ord "~" - else is_utf8 s orelse not (String.isPrefix "\\<^" s); + else is_utf8 s orelse is_symbolic s; (* input source control *) @@ -128,14 +128,10 @@ val sync = "\\<^sync>"; fun is_sync s = s = sync; -val malformed = "[["; -val end_malformed = "]]"; +fun is_regular s = not_eof s andalso s <> sync; -val separate_chars = explode #> space_implode space; -fun malformed_msg s = "Malformed symbolic character: " ^ quote (separate_chars s); - -fun is_regular s = - not_eof s andalso s <> sync andalso s <> malformed andalso s <> end_malformed; +fun is_malformed s = String.isPrefix "\\<" s andalso not (String.isSuffix ">" s); +fun malformed_msg s = "Malformed symbolic character: " ^ quote s; (* ascii symbols *) @@ -225,15 +221,16 @@ (* symbol variants *) datatype sym = - Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string; + Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string | + Malformed of string; fun decode s = if is_char s then Char s else if is_utf8 s then UTF8 s else if is_raw s then Raw (decode_raw s) + else if is_malformed s then Malformed s else if String.isPrefix "\\<^" s then Ctrl (String.substring (s, 3, size s - 4)) - else if String.isPrefix "\\<" s then Sym (String.substring (s, 2, size s - 3)) - else error (malformed_msg s); + else Sym (String.substring (s, 2, size s - 3)); (* standard symbol kinds *) @@ -445,30 +442,18 @@ Scan.this_string "raw:" ^^ (Scan.many raw_chr >> implode) || Scan.this_string "raw" ^^ (Scan.many1 is_ascii_digit >> implode); -val scan = +val scan_total = Scan.one is_plain || Scan.one is_utf8 ::: Scan.many is_utf8_trailer >> implode_pseudo_utf8 || scan_encoded_newline || - ($$ "\\" ^^ $$ "<" ^^ - !! (fn (cs, _) => malformed_msg (beginning 10 ("\\" :: "<" :: cs))) - (($$ "^" ^^ (scan_raw || scan_id) || scan_id) ^^ $$ ">")) || + ($$ "\\" ^^ $$ "<" ^^ (($$ "^" ^^ (scan_raw || scan_id) || scan_id) ^^ $$ ">")) || + Scan.this_string "\\<^" || + Scan.this_string "\\<" || Scan.one not_eof; -val scan_resync = - Scan.one is_ascii_blank || $$ "\"" || $$ "`" || $$ "\\" || - Scan.this_string "(*" || Scan.this_string "*)" || - Scan.this_string "{*" || Scan.this_string "*}"; - -val recover = - Scan.this ["\\", "<"] @@@ - Scan.repeat (Scan.unless scan_resync (Scan.one not_eof)) - >> (fn ss => malformed :: ss @ [end_malformed]); - in -fun source {do_recover} src = - Source.source stopper (Scan.bulk scan) - (if do_recover then SOME (false, K recover) else NONE) src; +fun source src = Source.source stopper (Scan.bulk scan_total) NONE src; end; @@ -487,7 +472,7 @@ fun sym_explode str = let val chs = explode str in if no_explode chs then chs - else Source.exhaust (source {do_recover = false} (Source.of_list chs)) + else Source.exhaust (source (Source.of_list chs)) end; end; diff -r 37b79d789d91 -r 1050315f6ee2 src/Pure/General/symbol.scala --- a/src/Pure/General/symbol.scala Sat Nov 13 16:46:00 2010 +0100 +++ b/src/Pure/General/symbol.scala Sat Nov 13 19:21:53 2010 +0100 @@ -40,13 +40,11 @@ \^? [A-Za-z][A-Za-z0-9_']* | \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""") - // FIXME cover bad surrogates!? - // FIXME check wrt. Isabelle/ML version - private val bad_symbol = new Regex("(?xs) (?!" + symbol + ")" + - """ \\ < (?: (?! \s | [\"`\\] | \(\* | \*\) | \{\* | \*\} ) . )*""") + private val malformed_symbol = new Regex("(?xs) (?!" + symbol + ")" + + """ [\ud800-\udbff] | \\<^? """) - // total pattern - val regex = new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + bad_symbol + "| .") + val regex_total = + new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + malformed_symbol + "| .") /* basic matching */ @@ -56,12 +54,12 @@ def is_physical_newline(s: CharSequence): Boolean = "\n".contentEquals(s) || "\r".contentEquals(s) || "\r\n".contentEquals(s) - def is_wellformed(s: CharSequence): Boolean = - s.length == 1 && is_plain(s.charAt(0)) || !bad_symbol.pattern.matcher(s).matches + def is_malformed(s: CharSequence): Boolean = + !(s.length == 1 && is_plain(s.charAt(0))) && malformed_symbol.pattern.matcher(s).matches class Matcher(text: CharSequence) { - private val matcher = regex.pattern.matcher(text) + private val matcher = regex_total.pattern.matcher(text) def apply(start: Int, end: Int): Int = { require(0 <= start && start < end && end <= text.length) @@ -162,7 +160,7 @@ def recode(text: String): String = { val len = text.length - val matcher = regex.pattern.matcher(text) + val matcher = regex_total.pattern.matcher(text) val result = new StringBuilder(len) var i = 0 while (i < len) { @@ -204,7 +202,7 @@ } } decl.split("\\s+").toList match { - case sym :: props if sym.length > 1 && is_wellformed(sym) => (sym, read_props(props)) + case sym :: props if sym.length > 1 && !is_malformed(sym) => (sym, read_props(props)) case _ => err() } } @@ -313,6 +311,7 @@ def is_letdig(sym: String): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym) def is_blank(sym: String): Boolean = blanks.contains(sym) def is_symbolic_char(sym: String): Boolean = sym_chars.contains(sym) - def is_symbolic(sym: String): Boolean = sym.startsWith("\\<") && !sym.startsWith("\\<^") + def is_symbolic(sym: String): Boolean = + sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^") } } diff -r 37b79d789d91 -r 1050315f6ee2 src/Pure/Isar/outer_syntax.ML --- a/src/Pure/Isar/outer_syntax.ML Sat Nov 13 16:46:00 2010 +0100 +++ b/src/Pure/Isar/outer_syntax.ML Sat Nov 13 19:21:53 2010 +0100 @@ -187,13 +187,13 @@ fun scan pos str = Source.of_string str - |> Symbol.source {do_recover = false} + |> Symbol.source |> Token.source {do_recover = SOME false} Keyword.get_lexicons pos |> Source.exhaust; fun parse pos str = Source.of_string str - |> Symbol.source {do_recover = false} + |> Symbol.source |> Token.source {do_recover = SOME false} Keyword.get_lexicons pos |> toplevel_source false NONE get_command |> Source.exhaust; @@ -224,7 +224,7 @@ fun isar in_stream term : isar = Source.tty in_stream - |> Symbol.source {do_recover = true} + |> Symbol.source |> Token.source {do_recover = SOME true} Keyword.get_lexicons Position.none |> toplevel_source term (SOME true) get_command; diff -r 37b79d789d91 -r 1050315f6ee2 src/Pure/Isar/token.ML --- a/src/Pure/Isar/token.ML Sat Nov 13 16:46:00 2010 +0100 +++ b/src/Pure/Isar/token.ML Sat Nov 13 19:21:53 2010 +0100 @@ -210,8 +210,7 @@ else let val k = str_of_kind (kind_of tok); - val s = unparse tok - handle ERROR _ => Symbol.separate_chars (content_of tok); + val s = unparse tok; in if s = "" then (k, "") else if size s < 40 andalso not (exists_string (fn c => c = "\n") s) then (k ^ " " ^ s, "") @@ -312,14 +311,6 @@ Symbol_Pos.scan_pos -- (Symbol_Pos.scan_comment_body !!! -- Symbol_Pos.scan_pos); -(* scan malformed symbols *) - -val scan_malformed = - $$$ Symbol.malformed |-- - Symbol_Pos.change_prompt (Scan.many (Symbol.is_regular o symbol)) - --| Scan.option ($$$ Symbol.end_malformed); - - (** token sources **) @@ -341,7 +332,7 @@ scan_verbatim >> token_range Verbatim || scan_comment >> token_range Comment || scan_space >> token Space || - scan_malformed >> token Malformed || + Scan.one (Symbol.is_malformed o symbol) >> (token Malformed o single) || Scan.one (Symbol.is_sync o symbol) >> (token Sync o single) || (Scan.max token_leq (Scan.max token_leq diff -r 37b79d789d91 -r 1050315f6ee2 src/Pure/ML/ml_lex.ML --- a/src/Pure/ML/ml_lex.ML Sat Nov 13 16:46:00 2010 +0100 +++ b/src/Pure/ML/ml_lex.ML Sat Nov 13 19:21:53 2010 +0100 @@ -265,11 +265,7 @@ Symbol_Pos.source (Position.line 1) src |> Source.source Symbol_Pos.stopper (Scan.bulk (!!! "bad input" scan_ml)) (SOME (false, recover)); -val tokenize = - Source.of_string #> - Symbol.source {do_recover = true} #> - source #> - Source.exhaust; +val tokenize = Source.of_string #> Symbol.source #> source #> Source.exhaust; fun read pos txt = let diff -r 37b79d789d91 -r 1050315f6ee2 src/Pure/Thy/latex.ML --- a/src/Pure/Thy/latex.ML Sat Nov 13 16:46:00 2010 +0100 +++ b/src/Pure/Thy/latex.ML Sat Nov 13 19:21:53 2010 +0100 @@ -101,7 +101,8 @@ | Symbol.Ctrl s => if known_ctrl s then literal sym ^ "{}" ^ enclose "\\isactrl" " " s else output_chrs sym - | Symbol.Raw s => s); + | Symbol.Raw s => s + | Symbol.Malformed s => error (Symbol.malformed_msg s)); in diff -r 37b79d789d91 -r 1050315f6ee2 src/Pure/Thy/thy_header.ML --- a/src/Pure/Thy/thy_header.ML Sat Nov 13 16:46:00 2010 +0100 +++ b/src/Pure/Thy/thy_header.ML Sat Nov 13 19:21:53 2010 +0100 @@ -57,7 +57,7 @@ let val res = str |> Source.of_string_limited 8000 - |> Symbol.source {do_recover = false} + |> Symbol.source |> Token.source {do_recover = NONE} (fn () => (header_lexicon, Scan.empty_lexicon)) pos |> Token.source_proper |> Source.source Token.stopper (Scan.single (Scan.error (Parse.!!! header))) NONE diff -r 37b79d789d91 -r 1050315f6ee2 src/Pure/Thy/thy_syntax.ML --- a/src/Pure/Thy/thy_syntax.ML Sat Nov 13 16:46:00 2010 +0100 +++ b/src/Pure/Thy/thy_syntax.ML Sat Nov 13 19:21:53 2010 +0100 @@ -32,14 +32,11 @@ (* parse *) -fun token_source lexs pos src = - Symbol.source {do_recover = true} src - |> Token.source {do_recover = SOME false} (K lexs) pos; +fun token_source lexs pos = + Symbol.source #> Token.source {do_recover = SOME false} (K lexs) pos; -fun parse_tokens lexs pos str = - Source.of_string str - |> token_source lexs pos - |> Source.exhaust; +fun parse_tokens lexs pos = + Source.of_string #> token_source lexs pos #> Source.exhaust; (* present *)