simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
allow malformed symbols inside quoted material, comments etc. -- for improved user experience with incremental re-parsing;
refined treatment of malformed surrogates (Scala);
--- a/src/Pure/General/scan.scala Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/General/scan.scala Sat Nov 13 19:21:53 2010 +0100
@@ -1,7 +1,7 @@
/* Title: Pure/General/scan.scala
Author: Makarius
-Efficient scanning of keywords.
+Efficient scanning of keywords and tokens.
*/
package isabelle
@@ -181,8 +181,7 @@
private def quoted(quote: String): Parser[String] =
{
quote ~
- rep(many1(sym => sym != quote && sym != "\\" && Symbol.is_wellformed(sym)) |
- "\\" + quote | "\\\\" |
+ rep(many1(sym => sym != quote && sym != "\\") | "\\" + quote | "\\\\" |
(("""\\\d\d\d""".r) ^? { case x if x.substring(1, 4).toInt <= 255 => x })) ~
quote ^^ { case x ~ ys ~ z => x + ys.mkString + z }
}.named("quoted")
@@ -193,7 +192,7 @@
val body = source.substring(1, source.length - 1)
if (body.exists(_ == '\\')) {
val content =
- rep(many1(sym => sym != quote && sym != "\\" && Symbol.is_wellformed(sym)) |
+ rep(many1(sym => sym != quote && sym != "\\") |
"\\" ~> (quote | "\\" | """\d\d\d""".r ^^ { case x => x.toInt.toChar.toString }))
parseAll(content ^^ (_.mkString), body).get
}
@@ -205,7 +204,7 @@
private def verbatim: Parser[String] =
{
- "{*" ~ rep(many1(sym => sym != "*" && Symbol.is_wellformed(sym)) | """\*(?!\})""".r) ~ "*}" ^^
+ "{*" ~ rep(many1(sym => sym != "*") | """\*(?!\})""".r) ~ "*}" ^^
{ case x ~ ys ~ z => x + ys.mkString + z }
}.named("verbatim")
@@ -221,8 +220,7 @@
def comment: Parser[String] = new Parser[String]
{
val comment_text =
- rep(many1(sym => sym != "*" && sym != "(" && Symbol.is_wellformed(sym)) |
- """\*(?!\))|\((?!\*)""".r)
+ rep(many1(sym => sym != "*" && sym != "(") | """\*(?!\))|\((?!\*)""".r)
val comment_open = "(*" ~ comment_text ^^^ ()
val comment_close = comment_text ~ "*)" ^^^ ()
@@ -277,8 +275,7 @@
("-" ~ natdot ^^ { case x ~ y => x + y } | natdot) ^^ (x => Token(Token.Kind.FLOAT, x))
val sym_ident =
- (many1(symbols.is_symbolic_char) |
- one(sym => symbols.is_symbolic(sym) & Symbol.is_wellformed(sym))) ^^
+ (many1(symbols.is_symbolic_char) | one(sym => symbols.is_symbolic(sym))) ^^
(x => Token(Token.Kind.SYM_IDENT, x))
val space = many1(symbols.is_blank) ^^ (x => Token(Token.Kind.SPACE, x))
--- a/src/Pure/General/symbol.ML Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/General/symbol.ML Sat Nov 13 19:21:53 2010 +0100
@@ -24,10 +24,9 @@
val stopper: symbol Scan.stopper
val sync: symbol
val is_sync: symbol -> bool
- val malformed: symbol
- val end_malformed: symbol
- val separate_chars: string -> string
val is_regular: symbol -> bool
+ val is_malformed: symbol -> bool
+ val malformed_msg: symbol -> string
val is_ascii: symbol -> bool
val is_ascii_letter: symbol -> bool
val is_ascii_digit: symbol -> bool
@@ -42,7 +41,9 @@
val is_raw: symbol -> bool
val decode_raw: symbol -> string
val encode_raw: string -> string
- datatype sym = Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string
+ datatype sym =
+ Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string |
+ Malformed of string
val decode: symbol -> sym
datatype kind = Letter | Digit | Quasi | Blank | Other
val kind: symbol -> kind
@@ -56,8 +57,7 @@
val beginning: int -> symbol list -> string
val scanner: string -> (string list -> 'a * string list) -> symbol list -> 'a
val scan_id: string list -> string * string list
- val source: {do_recover: bool} -> (string, 'a) Source.source ->
- (symbol, (string, 'a) Source.source) Source.source
+ val source: (string, 'a) Source.source -> (symbol, (string, 'a) Source.source) Source.source
val explode: string -> symbol list
val esc: symbol -> string
val escape: string -> string
@@ -111,11 +111,11 @@
fun is_utf8 s = size s > 0 andalso forall_string (fn c => ord c >= 128) s;
fun is_symbolic s =
- String.isPrefix "\\<" s andalso not (String.isPrefix "\\<^" s);
+ String.isPrefix "\\<" s andalso String.isSuffix ">" s andalso not (String.isPrefix "\\<^" s);
fun is_printable s =
if is_char s then ord space <= ord s andalso ord s <= ord "~"
- else is_utf8 s orelse not (String.isPrefix "\\<^" s);
+ else is_utf8 s orelse is_symbolic s;
(* input source control *)
@@ -128,14 +128,10 @@
val sync = "\\<^sync>";
fun is_sync s = s = sync;
-val malformed = "[[";
-val end_malformed = "]]";
+fun is_regular s = not_eof s andalso s <> sync;
-val separate_chars = explode #> space_implode space;
-fun malformed_msg s = "Malformed symbolic character: " ^ quote (separate_chars s);
-
-fun is_regular s =
- not_eof s andalso s <> sync andalso s <> malformed andalso s <> end_malformed;
+fun is_malformed s = String.isPrefix "\\<" s andalso not (String.isSuffix ">" s);
+fun malformed_msg s = "Malformed symbolic character: " ^ quote s;
(* ascii symbols *)
@@ -225,15 +221,16 @@
(* symbol variants *)
datatype sym =
- Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string;
+ Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string |
+ Malformed of string;
fun decode s =
if is_char s then Char s
else if is_utf8 s then UTF8 s
else if is_raw s then Raw (decode_raw s)
+ else if is_malformed s then Malformed s
else if String.isPrefix "\\<^" s then Ctrl (String.substring (s, 3, size s - 4))
- else if String.isPrefix "\\<" s then Sym (String.substring (s, 2, size s - 3))
- else error (malformed_msg s);
+ else Sym (String.substring (s, 2, size s - 3));
(* standard symbol kinds *)
@@ -445,30 +442,18 @@
Scan.this_string "raw:" ^^ (Scan.many raw_chr >> implode) ||
Scan.this_string "raw" ^^ (Scan.many1 is_ascii_digit >> implode);
-val scan =
+val scan_total =
Scan.one is_plain ||
Scan.one is_utf8 ::: Scan.many is_utf8_trailer >> implode_pseudo_utf8 ||
scan_encoded_newline ||
- ($$ "\\" ^^ $$ "<" ^^
- !! (fn (cs, _) => malformed_msg (beginning 10 ("\\" :: "<" :: cs)))
- (($$ "^" ^^ (scan_raw || scan_id) || scan_id) ^^ $$ ">")) ||
+ ($$ "\\" ^^ $$ "<" ^^ (($$ "^" ^^ (scan_raw || scan_id) || scan_id) ^^ $$ ">")) ||
+ Scan.this_string "\\<^" ||
+ Scan.this_string "\\<" ||
Scan.one not_eof;
-val scan_resync =
- Scan.one is_ascii_blank || $$ "\"" || $$ "`" || $$ "\\" ||
- Scan.this_string "(*" || Scan.this_string "*)" ||
- Scan.this_string "{*" || Scan.this_string "*}";
-
-val recover =
- Scan.this ["\\", "<"] @@@
- Scan.repeat (Scan.unless scan_resync (Scan.one not_eof))
- >> (fn ss => malformed :: ss @ [end_malformed]);
-
in
-fun source {do_recover} src =
- Source.source stopper (Scan.bulk scan)
- (if do_recover then SOME (false, K recover) else NONE) src;
+fun source src = Source.source stopper (Scan.bulk scan_total) NONE src;
end;
@@ -487,7 +472,7 @@
fun sym_explode str =
let val chs = explode str in
if no_explode chs then chs
- else Source.exhaust (source {do_recover = false} (Source.of_list chs))
+ else Source.exhaust (source (Source.of_list chs))
end;
end;
--- a/src/Pure/General/symbol.scala Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/General/symbol.scala Sat Nov 13 19:21:53 2010 +0100
@@ -40,13 +40,11 @@
\^? [A-Za-z][A-Za-z0-9_']* |
\^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")
- // FIXME cover bad surrogates!?
- // FIXME check wrt. Isabelle/ML version
- private val bad_symbol = new Regex("(?xs) (?!" + symbol + ")" +
- """ \\ < (?: (?! \s | [\"`\\] | \(\* | \*\) | \{\* | \*\} ) . )*""")
+ private val malformed_symbol = new Regex("(?xs) (?!" + symbol + ")" +
+ """ [\ud800-\udbff] | \\<^? """)
- // total pattern
- val regex = new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + bad_symbol + "| .")
+ val regex_total =
+ new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + malformed_symbol + "| .")
/* basic matching */
@@ -56,12 +54,12 @@
def is_physical_newline(s: CharSequence): Boolean =
"\n".contentEquals(s) || "\r".contentEquals(s) || "\r\n".contentEquals(s)
- def is_wellformed(s: CharSequence): Boolean =
- s.length == 1 && is_plain(s.charAt(0)) || !bad_symbol.pattern.matcher(s).matches
+ def is_malformed(s: CharSequence): Boolean =
+ !(s.length == 1 && is_plain(s.charAt(0))) && malformed_symbol.pattern.matcher(s).matches
class Matcher(text: CharSequence)
{
- private val matcher = regex.pattern.matcher(text)
+ private val matcher = regex_total.pattern.matcher(text)
def apply(start: Int, end: Int): Int =
{
require(0 <= start && start < end && end <= text.length)
@@ -162,7 +160,7 @@
def recode(text: String): String =
{
val len = text.length
- val matcher = regex.pattern.matcher(text)
+ val matcher = regex_total.pattern.matcher(text)
val result = new StringBuilder(len)
var i = 0
while (i < len) {
@@ -204,7 +202,7 @@
}
}
decl.split("\\s+").toList match {
- case sym :: props if sym.length > 1 && is_wellformed(sym) => (sym, read_props(props))
+ case sym :: props if sym.length > 1 && !is_malformed(sym) => (sym, read_props(props))
case _ => err()
}
}
@@ -313,6 +311,7 @@
def is_letdig(sym: String): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym)
def is_blank(sym: String): Boolean = blanks.contains(sym)
def is_symbolic_char(sym: String): Boolean = sym_chars.contains(sym)
- def is_symbolic(sym: String): Boolean = sym.startsWith("\\<") && !sym.startsWith("\\<^")
+ def is_symbolic(sym: String): Boolean =
+ sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^")
}
}
--- a/src/Pure/Isar/outer_syntax.ML Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/Isar/outer_syntax.ML Sat Nov 13 19:21:53 2010 +0100
@@ -187,13 +187,13 @@
fun scan pos str =
Source.of_string str
- |> Symbol.source {do_recover = false}
+ |> Symbol.source
|> Token.source {do_recover = SOME false} Keyword.get_lexicons pos
|> Source.exhaust;
fun parse pos str =
Source.of_string str
- |> Symbol.source {do_recover = false}
+ |> Symbol.source
|> Token.source {do_recover = SOME false} Keyword.get_lexicons pos
|> toplevel_source false NONE get_command
|> Source.exhaust;
@@ -224,7 +224,7 @@
fun isar in_stream term : isar =
Source.tty in_stream
- |> Symbol.source {do_recover = true}
+ |> Symbol.source
|> Token.source {do_recover = SOME true} Keyword.get_lexicons Position.none
|> toplevel_source term (SOME true) get_command;
--- a/src/Pure/Isar/token.ML Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/Isar/token.ML Sat Nov 13 19:21:53 2010 +0100
@@ -210,8 +210,7 @@
else
let
val k = str_of_kind (kind_of tok);
- val s = unparse tok
- handle ERROR _ => Symbol.separate_chars (content_of tok);
+ val s = unparse tok;
in
if s = "" then (k, "")
else if size s < 40 andalso not (exists_string (fn c => c = "\n") s) then (k ^ " " ^ s, "")
@@ -312,14 +311,6 @@
Symbol_Pos.scan_pos -- (Symbol_Pos.scan_comment_body !!! -- Symbol_Pos.scan_pos);
-(* scan malformed symbols *)
-
-val scan_malformed =
- $$$ Symbol.malformed |--
- Symbol_Pos.change_prompt (Scan.many (Symbol.is_regular o symbol))
- --| Scan.option ($$$ Symbol.end_malformed);
-
-
(** token sources **)
@@ -341,7 +332,7 @@
scan_verbatim >> token_range Verbatim ||
scan_comment >> token_range Comment ||
scan_space >> token Space ||
- scan_malformed >> token Malformed ||
+ Scan.one (Symbol.is_malformed o symbol) >> (token Malformed o single) ||
Scan.one (Symbol.is_sync o symbol) >> (token Sync o single) ||
(Scan.max token_leq
(Scan.max token_leq
--- a/src/Pure/ML/ml_lex.ML Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/ML/ml_lex.ML Sat Nov 13 19:21:53 2010 +0100
@@ -265,11 +265,7 @@
Symbol_Pos.source (Position.line 1) src
|> Source.source Symbol_Pos.stopper (Scan.bulk (!!! "bad input" scan_ml)) (SOME (false, recover));
-val tokenize =
- Source.of_string #>
- Symbol.source {do_recover = true} #>
- source #>
- Source.exhaust;
+val tokenize = Source.of_string #> Symbol.source #> source #> Source.exhaust;
fun read pos txt =
let
--- a/src/Pure/Thy/latex.ML Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/Thy/latex.ML Sat Nov 13 19:21:53 2010 +0100
@@ -101,7 +101,8 @@
| Symbol.Ctrl s =>
if known_ctrl s then literal sym ^ "{}" ^ enclose "\\isactrl" " " s
else output_chrs sym
- | Symbol.Raw s => s);
+ | Symbol.Raw s => s
+ | Symbol.Malformed s => error (Symbol.malformed_msg s));
in
--- a/src/Pure/Thy/thy_header.ML Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/Thy/thy_header.ML Sat Nov 13 19:21:53 2010 +0100
@@ -57,7 +57,7 @@
let val res =
str
|> Source.of_string_limited 8000
- |> Symbol.source {do_recover = false}
+ |> Symbol.source
|> Token.source {do_recover = NONE} (fn () => (header_lexicon, Scan.empty_lexicon)) pos
|> Token.source_proper
|> Source.source Token.stopper (Scan.single (Scan.error (Parse.!!! header))) NONE
--- a/src/Pure/Thy/thy_syntax.ML Sat Nov 13 16:46:00 2010 +0100
+++ b/src/Pure/Thy/thy_syntax.ML Sat Nov 13 19:21:53 2010 +0100
@@ -32,14 +32,11 @@
(* parse *)
-fun token_source lexs pos src =
- Symbol.source {do_recover = true} src
- |> Token.source {do_recover = SOME false} (K lexs) pos;
+fun token_source lexs pos =
+ Symbol.source #> Token.source {do_recover = SOME false} (K lexs) pos;
-fun parse_tokens lexs pos str =
- Source.of_string str
- |> token_source lexs pos
- |> Source.exhaust;
+fun parse_tokens lexs pos =
+ Source.of_string #> token_source lexs pos #> Source.exhaust;
(* present *)