simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
authorwenzelm
Sat, 13 Nov 2010 19:21:53 +0100
changeset 40523 1050315f6ee2
parent 40522 37b79d789d91
child 40524 6131d7a78ad3
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.); allow malformed symbols inside quoted material, comments etc. -- for improved user experience with incremental re-parsing; refined treatment of malformed surrogates (Scala);
src/Pure/General/scan.scala
src/Pure/General/symbol.ML
src/Pure/General/symbol.scala
src/Pure/Isar/outer_syntax.ML
src/Pure/Isar/token.ML
src/Pure/ML/ml_lex.ML
src/Pure/Thy/latex.ML
src/Pure/Thy/thy_header.ML
src/Pure/Thy/thy_syntax.ML
--- a/src/Pure/General/scan.scala	Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/General/scan.scala	Sat Nov 13 19:21:53 2010 +0100
@@ -1,7 +1,7 @@
 /*  Title:      Pure/General/scan.scala
     Author:     Makarius
 
-Efficient scanning of keywords.
+Efficient scanning of keywords and tokens.
 */
 
 package isabelle
@@ -181,8 +181,7 @@
     private def quoted(quote: String): Parser[String] =
     {
       quote ~
-        rep(many1(sym => sym != quote && sym != "\\" && Symbol.is_wellformed(sym)) |
-          "\\" + quote | "\\\\" |
+        rep(many1(sym => sym != quote && sym != "\\") | "\\" + quote | "\\\\" |
           (("""\\\d\d\d""".r) ^? { case x if x.substring(1, 4).toInt <= 255 => x })) ~
       quote ^^ { case x ~ ys ~ z => x + ys.mkString + z }
     }.named("quoted")
@@ -193,7 +192,7 @@
       val body = source.substring(1, source.length - 1)
       if (body.exists(_ == '\\')) {
         val content =
-          rep(many1(sym => sym != quote && sym != "\\" && Symbol.is_wellformed(sym)) |
+          rep(many1(sym => sym != quote && sym != "\\") |
               "\\" ~> (quote | "\\" | """\d\d\d""".r ^^ { case x => x.toInt.toChar.toString }))
         parseAll(content ^^ (_.mkString), body).get
       }
@@ -205,7 +204,7 @@
 
     private def verbatim: Parser[String] =
     {
-      "{*" ~ rep(many1(sym => sym != "*" && Symbol.is_wellformed(sym)) | """\*(?!\})""".r) ~ "*}" ^^
+      "{*" ~ rep(many1(sym => sym != "*") | """\*(?!\})""".r) ~ "*}" ^^
         { case x ~ ys ~ z => x + ys.mkString + z }
     }.named("verbatim")
 
@@ -221,8 +220,7 @@
     def comment: Parser[String] = new Parser[String]
     {
       val comment_text =
-        rep(many1(sym => sym != "*" && sym != "(" && Symbol.is_wellformed(sym)) |
-          """\*(?!\))|\((?!\*)""".r)
+        rep(many1(sym => sym != "*" && sym != "(") | """\*(?!\))|\((?!\*)""".r)
       val comment_open = "(*" ~ comment_text ^^^ ()
       val comment_close = comment_text ~ "*)" ^^^ ()
 
@@ -277,8 +275,7 @@
         ("-" ~ natdot ^^ { case x ~ y => x + y } | natdot) ^^ (x => Token(Token.Kind.FLOAT, x))
 
       val sym_ident =
-        (many1(symbols.is_symbolic_char) |
-          one(sym => symbols.is_symbolic(sym) & Symbol.is_wellformed(sym))) ^^
+        (many1(symbols.is_symbolic_char) | one(sym => symbols.is_symbolic(sym))) ^^
         (x => Token(Token.Kind.SYM_IDENT, x))
 
       val space = many1(symbols.is_blank) ^^ (x => Token(Token.Kind.SPACE, x))
--- a/src/Pure/General/symbol.ML	Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/General/symbol.ML	Sat Nov 13 19:21:53 2010 +0100
@@ -24,10 +24,9 @@
   val stopper: symbol Scan.stopper
   val sync: symbol
   val is_sync: symbol -> bool
-  val malformed: symbol
-  val end_malformed: symbol
-  val separate_chars: string -> string
   val is_regular: symbol -> bool
+  val is_malformed: symbol -> bool
+  val malformed_msg: symbol -> string
   val is_ascii: symbol -> bool
   val is_ascii_letter: symbol -> bool
   val is_ascii_digit: symbol -> bool
@@ -42,7 +41,9 @@
   val is_raw: symbol -> bool
   val decode_raw: symbol -> string
   val encode_raw: string -> string
-  datatype sym = Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string
+  datatype sym =
+    Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string |
+    Malformed of string
   val decode: symbol -> sym
   datatype kind = Letter | Digit | Quasi | Blank | Other
   val kind: symbol -> kind
@@ -56,8 +57,7 @@
   val beginning: int -> symbol list -> string
   val scanner: string -> (string list -> 'a * string list) -> symbol list -> 'a
   val scan_id: string list -> string * string list
-  val source: {do_recover: bool} -> (string, 'a) Source.source ->
-    (symbol, (string, 'a) Source.source) Source.source
+  val source: (string, 'a) Source.source -> (symbol, (string, 'a) Source.source) Source.source
   val explode: string -> symbol list
   val esc: symbol -> string
   val escape: string -> string
@@ -111,11 +111,11 @@
 fun is_utf8 s = size s > 0 andalso forall_string (fn c => ord c >= 128) s;
 
 fun is_symbolic s =
-  String.isPrefix "\\<" s andalso not (String.isPrefix "\\<^" s);
+  String.isPrefix "\\<" s andalso String.isSuffix ">" s andalso not (String.isPrefix "\\<^" s);
 
 fun is_printable s =
   if is_char s then ord space <= ord s andalso ord s <= ord "~"
-  else is_utf8 s orelse not (String.isPrefix "\\<^" s);
+  else is_utf8 s orelse is_symbolic s;
 
 
 (* input source control *)
@@ -128,14 +128,10 @@
 val sync = "\\<^sync>";
 fun is_sync s = s = sync;
 
-val malformed = "[[";
-val end_malformed = "]]";
+fun is_regular s = not_eof s andalso s <> sync;
 
-val separate_chars = explode #> space_implode space;
-fun malformed_msg s = "Malformed symbolic character: " ^ quote (separate_chars s);
-
-fun is_regular s =
-  not_eof s andalso s <> sync andalso s <> malformed andalso s <> end_malformed;
+fun is_malformed s = String.isPrefix "\\<" s andalso not (String.isSuffix ">" s);
+fun malformed_msg s = "Malformed symbolic character: " ^ quote s;
 
 
 (* ascii symbols *)
@@ -225,15 +221,16 @@
 (* symbol variants *)
 
 datatype sym =
-  Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string;
+  Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string |
+  Malformed of string;
 
 fun decode s =
   if is_char s then Char s
   else if is_utf8 s then UTF8 s
   else if is_raw s then Raw (decode_raw s)
+  else if is_malformed s then Malformed s
   else if String.isPrefix "\\<^" s then Ctrl (String.substring (s, 3, size s - 4))
-  else if String.isPrefix "\\<" s then Sym (String.substring (s, 2, size s - 3))
-  else error (malformed_msg s);
+  else Sym (String.substring (s, 2, size s - 3));
 
 
 (* standard symbol kinds *)
@@ -445,30 +442,18 @@
   Scan.this_string "raw:" ^^ (Scan.many raw_chr >> implode) ||
   Scan.this_string "raw" ^^ (Scan.many1 is_ascii_digit >> implode);
 
-val scan =
+val scan_total =
   Scan.one is_plain ||
   Scan.one is_utf8 ::: Scan.many is_utf8_trailer >> implode_pseudo_utf8 ||
   scan_encoded_newline ||
-  ($$ "\\" ^^ $$ "<" ^^
-    !! (fn (cs, _) => malformed_msg (beginning 10 ("\\" :: "<" :: cs)))
-      (($$ "^" ^^ (scan_raw || scan_id) || scan_id) ^^ $$ ">")) ||
+  ($$ "\\" ^^ $$ "<" ^^ (($$ "^" ^^ (scan_raw || scan_id) || scan_id) ^^ $$ ">")) ||
+  Scan.this_string "\\<^" ||
+  Scan.this_string "\\<" ||
   Scan.one not_eof;
 
-val scan_resync =
-  Scan.one is_ascii_blank || $$ "\"" || $$ "`" || $$ "\\" ||
-  Scan.this_string "(*" || Scan.this_string "*)" ||
-  Scan.this_string "{*" || Scan.this_string "*}";
-
-val recover =
-  Scan.this ["\\", "<"] @@@
-    Scan.repeat (Scan.unless scan_resync (Scan.one not_eof))
-  >> (fn ss => malformed :: ss @ [end_malformed]);
-
 in
 
-fun source {do_recover} src =
-  Source.source stopper (Scan.bulk scan)
-    (if do_recover then SOME (false, K recover) else NONE) src;
+fun source src = Source.source stopper (Scan.bulk scan_total) NONE src;
 
 end;
 
@@ -487,7 +472,7 @@
 fun sym_explode str =
   let val chs = explode str in
     if no_explode chs then chs
-    else Source.exhaust (source {do_recover = false} (Source.of_list chs))
+    else Source.exhaust (source (Source.of_list chs))
   end;
 
 end;
--- a/src/Pure/General/symbol.scala	Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/General/symbol.scala	Sat Nov 13 19:21:53 2010 +0100
@@ -40,13 +40,11 @@
       \^? [A-Za-z][A-Za-z0-9_']* |
       \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")
 
-  // FIXME cover bad surrogates!?
-  // FIXME check wrt. Isabelle/ML version
-  private val bad_symbol = new Regex("(?xs) (?!" + symbol + ")" +
-    """ \\ < (?: (?! \s | [\"`\\] | \(\* | \*\) | \{\* | \*\} ) . )*""")
+  private val malformed_symbol = new Regex("(?xs) (?!" + symbol + ")" +
+    """ [\ud800-\udbff] | \\<^? """)
 
-  // total pattern
-  val regex = new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + bad_symbol + "| .")
+  val regex_total =
+    new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + malformed_symbol + "| .")
 
 
   /* basic matching */
@@ -56,12 +54,12 @@
   def is_physical_newline(s: CharSequence): Boolean =
     "\n".contentEquals(s) || "\r".contentEquals(s) || "\r\n".contentEquals(s)
 
-  def is_wellformed(s: CharSequence): Boolean =
-    s.length == 1 && is_plain(s.charAt(0)) || !bad_symbol.pattern.matcher(s).matches
+  def is_malformed(s: CharSequence): Boolean =
+    !(s.length == 1 && is_plain(s.charAt(0))) && malformed_symbol.pattern.matcher(s).matches
 
   class Matcher(text: CharSequence)
   {
-    private val matcher = regex.pattern.matcher(text)
+    private val matcher = regex_total.pattern.matcher(text)
     def apply(start: Int, end: Int): Int =
     {
       require(0 <= start && start < end && end <= text.length)
@@ -162,7 +160,7 @@
     def recode(text: String): String =
     {
       val len = text.length
-      val matcher = regex.pattern.matcher(text)
+      val matcher = regex_total.pattern.matcher(text)
       val result = new StringBuilder(len)
       var i = 0
       while (i < len) {
@@ -204,7 +202,7 @@
         }
       }
       decl.split("\\s+").toList match {
-        case sym :: props if sym.length > 1 && is_wellformed(sym) => (sym, read_props(props))
+        case sym :: props if sym.length > 1 && !is_malformed(sym) => (sym, read_props(props))
         case _ => err()
       }
     }
@@ -313,6 +311,7 @@
     def is_letdig(sym: String): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym)
     def is_blank(sym: String): Boolean = blanks.contains(sym)
     def is_symbolic_char(sym: String): Boolean = sym_chars.contains(sym)
-    def is_symbolic(sym: String): Boolean = sym.startsWith("\\<") && !sym.startsWith("\\<^")
+    def is_symbolic(sym: String): Boolean =
+      sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^")
   }
 }
--- a/src/Pure/Isar/outer_syntax.ML	Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/Isar/outer_syntax.ML	Sat Nov 13 19:21:53 2010 +0100
@@ -187,13 +187,13 @@
 
 fun scan pos str =
   Source.of_string str
-  |> Symbol.source {do_recover = false}
+  |> Symbol.source
   |> Token.source {do_recover = SOME false} Keyword.get_lexicons pos
   |> Source.exhaust;
 
 fun parse pos str =
   Source.of_string str
-  |> Symbol.source {do_recover = false}
+  |> Symbol.source
   |> Token.source {do_recover = SOME false} Keyword.get_lexicons pos
   |> toplevel_source false NONE get_command
   |> Source.exhaust;
@@ -224,7 +224,7 @@
 
 fun isar in_stream term : isar =
   Source.tty in_stream
-  |> Symbol.source {do_recover = true}
+  |> Symbol.source
   |> Token.source {do_recover = SOME true} Keyword.get_lexicons Position.none
   |> toplevel_source term (SOME true) get_command;
 
--- a/src/Pure/Isar/token.ML	Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/Isar/token.ML	Sat Nov 13 19:21:53 2010 +0100
@@ -210,8 +210,7 @@
   else
     let
       val k = str_of_kind (kind_of tok);
-      val s = unparse tok
-        handle ERROR _ => Symbol.separate_chars (content_of tok);
+      val s = unparse tok;
     in
       if s = "" then (k, "")
       else if size s < 40 andalso not (exists_string (fn c => c = "\n") s) then (k ^ " " ^ s, "")
@@ -312,14 +311,6 @@
   Symbol_Pos.scan_pos -- (Symbol_Pos.scan_comment_body !!! -- Symbol_Pos.scan_pos);
 
 
-(* scan malformed symbols *)
-
-val scan_malformed =
-  $$$ Symbol.malformed |--
-    Symbol_Pos.change_prompt (Scan.many (Symbol.is_regular o symbol))
-  --| Scan.option ($$$ Symbol.end_malformed);
-
-
 
 (** token sources **)
 
@@ -341,7 +332,7 @@
     scan_verbatim >> token_range Verbatim ||
     scan_comment >> token_range Comment ||
     scan_space >> token Space ||
-    scan_malformed >> token Malformed ||
+    Scan.one (Symbol.is_malformed o symbol) >> (token Malformed o single) ||
     Scan.one (Symbol.is_sync o symbol) >> (token Sync o single) ||
     (Scan.max token_leq
       (Scan.max token_leq
--- a/src/Pure/ML/ml_lex.ML	Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/ML/ml_lex.ML	Sat Nov 13 19:21:53 2010 +0100
@@ -265,11 +265,7 @@
   Symbol_Pos.source (Position.line 1) src
   |> Source.source Symbol_Pos.stopper (Scan.bulk (!!! "bad input" scan_ml)) (SOME (false, recover));
 
-val tokenize =
-  Source.of_string #>
-  Symbol.source {do_recover = true} #>
-  source #>
-  Source.exhaust;
+val tokenize = Source.of_string #> Symbol.source #> source #> Source.exhaust;
 
 fun read pos txt =
   let
--- a/src/Pure/Thy/latex.ML	Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/Thy/latex.ML	Sat Nov 13 19:21:53 2010 +0100
@@ -101,7 +101,8 @@
   | Symbol.Ctrl s =>
       if known_ctrl s then literal sym ^ "{}" ^ enclose "\\isactrl" " " s
       else output_chrs sym
-  | Symbol.Raw s => s);
+  | Symbol.Raw s => s
+  | Symbol.Malformed s => error (Symbol.malformed_msg s));
 
 in
 
--- a/src/Pure/Thy/thy_header.ML	Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/Thy/thy_header.ML	Sat Nov 13 19:21:53 2010 +0100
@@ -57,7 +57,7 @@
   let val res =
     str
     |> Source.of_string_limited 8000
-    |> Symbol.source {do_recover = false}
+    |> Symbol.source
     |> Token.source {do_recover = NONE} (fn () => (header_lexicon, Scan.empty_lexicon)) pos
     |> Token.source_proper
     |> Source.source Token.stopper (Scan.single (Scan.error (Parse.!!! header))) NONE
--- a/src/Pure/Thy/thy_syntax.ML	Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/Thy/thy_syntax.ML	Sat Nov 13 19:21:53 2010 +0100
@@ -32,14 +32,11 @@
 
 (* parse *)
 
-fun token_source lexs pos src =
-  Symbol.source {do_recover = true} src
-  |> Token.source {do_recover = SOME false} (K lexs) pos;
+fun token_source lexs pos =
+  Symbol.source #> Token.source {do_recover = SOME false} (K lexs) pos;
 
-fun parse_tokens lexs pos str =
-  Source.of_string str
-  |> token_source lexs pos
-  |> Source.exhaust;
+fun parse_tokens lexs pos =
+  Source.of_string #> token_source lexs pos #> Source.exhaust;
 
 
 (* present *)