more liberal scanning of potentially malformed symbols;
authorwenzelm
Sat, 11 Aug 2012 20:54:06 +0200
changeset 48773 0e1bab274672
parent 48772 e46cd0d26481
child 48774 c4bd5bb3ae69
more liberal scanning of potentially malformed symbols;
src/Pure/General/symbol.ML
src/Pure/General/symbol.scala
src/Pure/Thy/thy_syntax.ML
--- a/src/Pure/General/symbol.ML	Sat Aug 11 19:34:36 2012 +0200
+++ b/src/Pure/General/symbol.ML	Sat Aug 11 20:54:06 2012 +0200
@@ -157,8 +157,9 @@
 (* encode_raw *)
 
 fun raw_chr c =
-  ord space <= ord c andalso ord c <= ord "~" andalso c <> "." andalso c <> ">"
-  orelse ord c >= 128;
+  is_char c andalso
+    (ord space <= ord c andalso ord c <= ord "~" andalso c <> "." andalso c <> ">"
+      orelse ord c >= 128);
 
 fun encode_raw "" = ""
   | encode_raw str =
@@ -434,7 +435,8 @@
   Scan.one is_plain ||
   Scan.one is_utf8 ::: Scan.many is_utf8_trailer >> implode_pseudo_utf8 ||
   scan_encoded_newline ||
-  ($$ "\\" ^^ $$ "<" ^^ (($$ "^" ^^ (scan_raw || scan_id) || scan_id) ^^ $$ ">")) ||
+  ($$ "\\" ^^ $$ "<" ^^
+    (($$ "^" ^^ (scan_raw || scan_id) || scan_id) ^^ Scan.optional ($$ ">") "")) ||
   Scan.this_string "\\<^" ||
   Scan.this_string "\\<" ||
   Scan.one not_eof;
--- a/src/Pure/General/symbol.scala	Sat Aug 11 19:34:36 2012 +0200
+++ b/src/Pure/General/symbol.scala	Sat Aug 11 20:54:06 2012 +0200
@@ -37,26 +37,31 @@
 
   private val symbol = new Regex("""(?xs)
       \\ < (?:
-      \^? [A-Za-z][A-Za-z0-9_']* |
-      \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")
+      \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* |
+      \^? ([A-Za-z][A-Za-z0-9_']*)? ) >?""")
 
-  private val malformed_symbol = new Regex("(?xs) (?!" + symbol + ")" +
-    """ [\ud800-\udbff\ufffd] | \\<\^? """)
-
-  val regex_total =
-    new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + malformed_symbol + "| .")
+  val regex_total = new Regex(plain + "|" + physical_newline + "|" + symbol + "| .")
 
 
   /* basic matching */
 
-  def is_plain(c: Char): Boolean = !(c == '\r' || c == '\\' || '\ud800' <= c && c <= '\udfff')
+  def is_plain(c: Char): Boolean = !(c == '\r' || c == '\\' || Character.isHighSurrogate(c))
+
+  def is_malformed(s: Symbol): Boolean =
+    s.length match {
+      case 1 =>
+        val c = s(0)
+        Character.isHighSurrogate(c) || Character.isLowSurrogate(c) || c == '\ufffd'
+      case 2 =>
+        val c1 = s(0)
+        val c2 = s(1)
+        !(c1 == '\r' && c2 == '\n' || Character.isSurrogatePair(c1, c2))
+      case _ => !s.endsWith(">")
+    }
 
   def is_physical_newline(s: Symbol): Boolean =
     s == "\n" || s == "\r" || s == "\r\n"
 
-  def is_malformed(s: Symbol): Boolean =
-    !(s.length == 1 && is_plain(s(0))) && malformed_symbol.pattern.matcher(s).matches
-
   class Matcher(text: CharSequence)
   {
     private val matcher = regex_total.pattern.matcher(text)
--- a/src/Pure/Thy/thy_syntax.ML	Sat Aug 11 19:34:36 2012 +0200
+++ b/src/Pure/Thy/thy_syntax.ML	Sat Aug 11 20:54:06 2012 +0200
@@ -77,7 +77,7 @@
       Symbol_Pos.explode (Token.source_position_of tok)
       |> map_filter (fn (sym, pos) =>
           if Symbol.is_malformed sym
-          then SOME ((pos, Isabelle_Markup.bad), "Malformed symbol") else NONE);
+          then SOME ((pos, Isabelle_Markup.bad), "Malformed symbolic character") else NONE);
     val is_malformed = Token.is_error tok orelse not (null malformed_symbols);
     val (markup, txt) = token_markup tok;
     val reports = ((Token.position_of tok, markup), txt) :: malformed_symbols;