# HG changeset patch # User wenzelm # Date 1646665314 -3600 # Node ID 90eaac98b3faa0bd4452b427aaa6e8002f2a8a19 # Parent 240cb0cfba5c0a95d1a60155341e8aaad34ecf5e more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML); diff -r 240cb0cfba5c -r 90eaac98b3fa src/Pure/General/scan.scala --- a/src/Pure/General/scan.scala Mon Mar 07 13:45:09 2022 +0100 +++ b/src/Pure/General/scan.scala Mon Mar 07 16:01:54 2022 +0100 @@ -61,9 +61,8 @@ var count = 0 var finished = false while (!finished && i < end && count < max_count) { - val n = matcher(i, end) - val sym = in.source.subSequence(i, i + n).toString - if (pred(sym)) { i += n; count += 1 } + val sym = matcher.match_symbol(i) + if (pred(sym)) { i += sym.length; count += 1 } else finished = true } if (count < min_count) Failure("bad input", in) @@ -151,8 +150,8 @@ var d = depth var finished = false while (!finished && i < end) { - val n = matcher(i, end) - val sym = in.source.subSequence(i, i + n).toString + val sym = matcher.match_symbol(i) + val n = sym.length if (Symbol.is_open(sym)) { i += n; d += 1 } else if (Symbol.is_close(sym) && d > 0) { i += n; d -= 1; if (d == 0) finished = true } else if (d > 0) i += n diff -r 240cb0cfba5c -r 90eaac98b3fa src/Pure/General/symbol.scala --- a/src/Pure/General/symbol.scala Mon Mar 07 13:45:09 2022 +0100 +++ b/src/Pure/General/symbol.scala Mon Mar 07 16:01:54 2022 +0100 @@ -70,13 +70,6 @@ /* symbol matching */ - private val symbol_total = - new Regex("(?xs) [\ud800-\udbff][\udc00-\udfff] " + - """ | \r\n | \\ < \^? ([A-Za-z][A-Za-z0-9_']*)? >? | .""") - - private def is_plain(c: Char): Boolean = - !(c == '\r' || c == '\\' || Character.isHighSurrogate(c)) - def is_malformed(s: Symbol): Boolean = s.length match { case 1 => @@ -94,16 +87,36 @@ class Matcher(text: CharSequence) { - private val matcher = symbol_total.pattern.matcher(text) - def apply(start: Int, end: Int): Int = + private def ok(i: Int): Boolean = 0 <= i && i < text.length + private def char(i: Int): Char = if (ok(i)) text.charAt(i) else 0 + private def maybe_char(c: Char, i: Int): Int = if (char(i) == c) i + 1 else i + + @tailrec private def many_ascii_letdig(i: Int): Int = + if (is_ascii_letdig(char(i))) many_ascii_letdig(i + 1) else i + private def maybe_ascii_id(i: Int): Int = + if (is_ascii_letter(char(i))) many_ascii_letdig(i + 1) else i + + def match_length(i: Int): Int = { - require(0 <= start && start < end && end <= text.length, "bad matcher range") - if (is_plain(text.charAt(start))) 1 - else { - matcher.region(start, end).lookingAt - matcher.group.length + val a = char(i) + val b = char(i + 1) + + if (Character.isHighSurrogate(a) && Character.isLowSurrogate(b) || a == '\r' && b == '\n') 2 + else if (a == '\\' && b == '<') maybe_char('>', maybe_ascii_id(maybe_char('^', i + 2))) - i + else if (ok(i)) 1 + else 0 + } + + def match_symbol(i: Int): String = + match_length(i) match { + case 0 => "" + case 1 => + val c = text.charAt(i) + if (c < char_symbols.length) char_symbols(c) + else c.toString + case n => + text.subSequence(i, i + n).toString } - } } @@ -120,16 +133,8 @@ def hasNext: Boolean = i < text.length def next(): Symbol = { - val n = matcher(i, text.length) - val s = - if (n == 0) "" - else if (n == 1) { - val c = text.charAt(i) - if (c < char_symbols.length) char_symbols(c) - else text.subSequence(i, i + n).toString - } - else text.subSequence(i, i + n).toString - i += n + val s = matcher.match_symbol(i) + i += s.length s } } @@ -163,7 +168,7 @@ var chr = 0 var sym = 0 while (chr < text.length) { - val n = matcher(chr, text.length) + val n = matcher.match_length(chr) chr += n sym += 1 if (n > 1) buf += Entry(chr, sym) @@ -276,16 +281,15 @@ def recode(text: String): String = { val len = text.length - val matcher = symbol_total.pattern.matcher(text) + val matcher = new Symbol.Matcher(text) val result = new StringBuilder(len) var i = 0 while (i < len) { val c = text(i) if (min <= c && c <= max) { - matcher.region(i, len).lookingAt - val x = matcher.group - result.append(table.getOrElse(x, x)) - i = matcher.end + val s = matcher.match_symbol(i) + result.append(table.getOrElse(s, s)) + i += s.length } else { result.append(c); i += 1 } }