# HG changeset patch # User wenzelm # Date 1344711246 -7200 # Node ID 0e1bab274672e440ad78e23db46bb39f2027615c # Parent e46cd0d2648181bd11134b06b8abef7993aa297d more liberal scanning of potentially malformed symbols; diff -r e46cd0d26481 -r 0e1bab274672 src/Pure/General/symbol.ML --- a/src/Pure/General/symbol.ML Sat Aug 11 19:34:36 2012 +0200 +++ b/src/Pure/General/symbol.ML Sat Aug 11 20:54:06 2012 +0200 @@ -157,8 +157,9 @@ (* encode_raw *) fun raw_chr c = - ord space <= ord c andalso ord c <= ord "~" andalso c <> "." andalso c <> ">" - orelse ord c >= 128; + is_char c andalso + (ord space <= ord c andalso ord c <= ord "~" andalso c <> "." andalso c <> ">" + orelse ord c >= 128); fun encode_raw "" = "" | encode_raw str = @@ -434,7 +435,8 @@ Scan.one is_plain || Scan.one is_utf8 ::: Scan.many is_utf8_trailer >> implode_pseudo_utf8 || scan_encoded_newline || - ($$ "\\" ^^ $$ "<" ^^ (($$ "^" ^^ (scan_raw || scan_id) || scan_id) ^^ $$ ">")) || + ($$ "\\" ^^ $$ "<" ^^ + (($$ "^" ^^ (scan_raw || scan_id) || scan_id) ^^ Scan.optional ($$ ">") "")) || Scan.this_string "\\<^" || Scan.this_string "\\<" || Scan.one not_eof; diff -r e46cd0d26481 -r 0e1bab274672 src/Pure/General/symbol.scala --- a/src/Pure/General/symbol.scala Sat Aug 11 19:34:36 2012 +0200 +++ b/src/Pure/General/symbol.scala Sat Aug 11 20:54:06 2012 +0200 @@ -37,26 +37,31 @@ private val symbol = new Regex("""(?xs) \\ < (?: - \^? [A-Za-z][A-Za-z0-9_']* | - \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""") + \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* | + \^? ([A-Za-z][A-Za-z0-9_']*)? ) >?""") - private val malformed_symbol = new Regex("(?xs) (?!" + symbol + ")" + - """ [\ud800-\udbff\ufffd] | \\<\^? """) - - val regex_total = - new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + malformed_symbol + "| .") + val regex_total = new Regex(plain + "|" + physical_newline + "|" + symbol + "| .") /* basic matching */ - def is_plain(c: Char): Boolean = !(c == '\r' || c == '\\' || '\ud800' <= c && c <= '\udfff') + def is_plain(c: Char): Boolean = !(c == '\r' || c == '\\' || Character.isHighSurrogate(c)) + + def is_malformed(s: Symbol): Boolean = + s.length match { + case 1 => + val c = s(0) + Character.isHighSurrogate(c) || Character.isLowSurrogate(c) || c == '\ufffd' + case 2 => + val c1 = s(0) + val c2 = s(1) + !(c1 == '\r' && c2 == '\n' || Character.isSurrogatePair(c1, c2)) + case _ => !s.endsWith(">") + } def is_physical_newline(s: Symbol): Boolean = s == "\n" || s == "\r" || s == "\r\n" - def is_malformed(s: Symbol): Boolean = - !(s.length == 1 && is_plain(s(0))) && malformed_symbol.pattern.matcher(s).matches - class Matcher(text: CharSequence) { private val matcher = regex_total.pattern.matcher(text) diff -r e46cd0d26481 -r 0e1bab274672 src/Pure/Thy/thy_syntax.ML --- a/src/Pure/Thy/thy_syntax.ML Sat Aug 11 19:34:36 2012 +0200 +++ b/src/Pure/Thy/thy_syntax.ML Sat Aug 11 20:54:06 2012 +0200 @@ -77,7 +77,7 @@ Symbol_Pos.explode (Token.source_position_of tok) |> map_filter (fn (sym, pos) => if Symbol.is_malformed sym - then SOME ((pos, Isabelle_Markup.bad), "Malformed symbol") else NONE); + then SOME ((pos, Isabelle_Markup.bad), "Malformed symbolic character") else NONE); val is_malformed = Token.is_error tok orelse not (null malformed_symbols); val (markup, txt) = token_markup tok; val reports = ((Token.position_of tok, markup), txt) :: malformed_symbols;