| author | wenzelm | 
| Fri, 05 May 2023 12:01:09 +0200 | |
| changeset 77967 | 6bb2f9b32804 | 
| parent 76235 | 16c12979c132 | 
| child 78592 | fdfe9b91d96e | 
| permissions | -rw-r--r-- | 
| 27901 | 1 | /* Title: Pure/General/symbol.scala | 
| 2 | Author: Makarius | |
| 3 | ||
| 69490 | 4 | Isabelle text symbols. | 
| 27901 | 5 | */ | 
| 6 | ||
| 7 | package isabelle | |
| 8 | ||
| 55618 | 9 | |
| 36011 
3ff725ac13a4
adapted to Scala 2.8.0 Beta1 -- with notable changes to scala.collection;
 wenzelm parents: 
34316diff
changeset | 10 | import scala.collection.mutable | 
| 31522 | 11 | import scala.util.matching.Regex | 
| 48922 | 12 | import scala.annotation.tailrec | 
| 27901 | 13 | |
| 14 | ||
| 75393 | 15 | object Symbol {
 | 
| 43696 | 16 | type Symbol = String | 
| 17 | ||
| 55884 
f2c0eaedd579
tuned signature -- emphasize symbol positions (prover) vs. decoded text offsets (editor);
 wenzelm parents: 
55618diff
changeset | 18 | // counting Isabelle symbols, starting from 1 | 
| 
f2c0eaedd579
tuned signature -- emphasize symbol positions (prover) vs. decoded text offsets (editor);
 wenzelm parents: 
55618diff
changeset | 19 | type Offset = Text.Offset | 
| 
f2c0eaedd579
tuned signature -- emphasize symbol positions (prover) vs. decoded text offsets (editor);
 wenzelm parents: 
55618diff
changeset | 20 | type Range = Text.Range | 
| 
f2c0eaedd579
tuned signature -- emphasize symbol positions (prover) vs. decoded text offsets (editor);
 wenzelm parents: 
55618diff
changeset | 21 | |
| 43696 | 22 | |
| 61865 | 23 | /* spaces */ | 
| 24 | ||
| 71649 | 25 | val space_char = ' ' | 
| 61865 | 26 | val space = " " | 
| 27 | ||
| 28 | private val static_spaces = space * 4000 | |
| 29 | ||
| 75393 | 30 |   def spaces(n: Int): String = {
 | 
| 73120 
c3589f2dff31
more informative errors: simplify diagnosis of spurious failures reported by users;
 wenzelm parents: 
73030diff
changeset | 31 | require(n >= 0, "negative spaces") | 
| 61865 | 32 | if (n < static_spaces.length) static_spaces.substring(0, n) | 
| 33 | else space * n | |
| 34 | } | |
| 35 | ||
| 36 | ||
| 75238 
e74d162ddf9f
clarified char symbols: cover most European languages;
 wenzelm parents: 
75237diff
changeset | 37 | /* char symbols */ | 
| 
e74d162ddf9f
clarified char symbols: cover most European languages;
 wenzelm parents: 
75237diff
changeset | 38 | |
| 
e74d162ddf9f
clarified char symbols: cover most European languages;
 wenzelm parents: 
75237diff
changeset | 39 | private val char_symbols: Array[Symbol] = | 
| 
e74d162ddf9f
clarified char symbols: cover most European languages;
 wenzelm parents: 
75237diff
changeset | 40 | (0 until 0x500).iterator.map(i => new String(Array(i.toChar))).toArray | 
| 
e74d162ddf9f
clarified char symbols: cover most European languages;
 wenzelm parents: 
75237diff
changeset | 41 | |
| 
e74d162ddf9f
clarified char symbols: cover most European languages;
 wenzelm parents: 
75237diff
changeset | 42 | def char_symbol(c: Char): String = | 
| 
e74d162ddf9f
clarified char symbols: cover most European languages;
 wenzelm parents: 
75237diff
changeset | 43 | if (c < char_symbols.length) char_symbols(c) | 
| 
e74d162ddf9f
clarified char symbols: cover most European languages;
 wenzelm parents: 
75237diff
changeset | 44 | else c.toString | 
| 
e74d162ddf9f
clarified char symbols: cover most European languages;
 wenzelm parents: 
75237diff
changeset | 45 | |
| 
e74d162ddf9f
clarified char symbols: cover most European languages;
 wenzelm parents: 
75237diff
changeset | 46 | |
| 43418 | 47 | /* ASCII characters */ | 
| 48 | ||
| 71649 | 49 | def is_ascii_printable(c: Char): Boolean = space_char <= c && c <= '~' | 
| 50 | ||
| 43418 | 51 | def is_ascii_letter(c: Char): Boolean = 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' | 
| 55497 | 52 | |
| 43418 | 53 | def is_ascii_digit(c: Char): Boolean = '0' <= c && c <= '9' | 
| 55497 | 54 | |
| 55 | def is_ascii_hex(c: Char): Boolean = | |
| 56 | '0' <= c && c <= '9' || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' | |
| 57 | ||
| 43418 | 58 | def is_ascii_quasi(c: Char): Boolean = c == '_' || c == '\'' | 
| 59 | ||
| 55497 | 60 | def is_ascii_blank(c: Char): Boolean = " \t\n\u000b\f\r".contains(c) | 
| 61 | ||
| 69448 | 62 | def is_ascii_line_terminator(c: Char): Boolean = "\r\n".contains(c) | 
| 63 | ||
| 43418 | 64 | def is_ascii_letdig(c: Char): Boolean = | 
| 65 | is_ascii_letter(c) || is_ascii_digit(c) || is_ascii_quasi(c) | |
| 66 | ||
| 67 | def is_ascii_identifier(s: String): Boolean = | |
| 75192 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 68 | s.nonEmpty && is_ascii_letter(s(0)) && s.forall(is_ascii_letdig) | 
| 43418 | 69 | |
| 75393 | 70 |   def ascii(c: Char): Symbol = {
 | 
| 62528 | 71 |     if (c > 127) error("Non-ASCII character: " + quote(c.toString))
 | 
| 75238 
e74d162ddf9f
clarified char symbols: cover most European languages;
 wenzelm parents: 
75237diff
changeset | 72 | else char_symbol(c) | 
| 62528 | 73 | } | 
| 74 | ||
| 66919 | 75 | def is_ascii(s: Symbol): Boolean = s.length == 1 && s(0) < 128 | 
| 76 | ||
| 43418 | 77 | |
| 48775 | 78 | /* symbol matching */ | 
| 27901 | 79 | |
| 48773 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 80 | def is_malformed(s: Symbol): Boolean = | 
| 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 81 |     s.length match {
 | 
| 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 82 | case 1 => | 
| 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 83 | val c = s(0) | 
| 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 84 | Character.isHighSurrogate(c) || Character.isLowSurrogate(c) || c == '\ufffd' | 
| 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 85 | case 2 => | 
| 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 86 | val c1 = s(0) | 
| 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 87 | val c2 = s(1) | 
| 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 88 | !(c1 == '\r' && c2 == '\n' || Character.isSurrogatePair(c1, c2)) | 
| 48774 | 89 |       case _ => !s.endsWith(">") || s == "\\<>" || s == "\\<^>"
 | 
| 48773 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 90 | } | 
| 34137 | 91 | |
| 54734 
b91afc3aa3e6
clarified Proof General legacy: special treatment of \<^newline> only in TTY mode;
 wenzelm parents: 
53400diff
changeset | 92 | def is_newline(s: Symbol): Boolean = | 
| 43675 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
 wenzelm parents: 
43511diff
changeset | 93 | s == "\n" || s == "\r" || s == "\r\n" | 
| 38877 | 94 | |
| 75393 | 95 |   class Matcher(text: CharSequence) {
 | 
| 75237 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 96 | private def ok(i: Int): Boolean = 0 <= i && i < text.length | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 97 | private def char(i: Int): Char = if (ok(i)) text.charAt(i) else 0 | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 98 | private def maybe_char(c: Char, i: Int): Int = if (char(i) == c) i + 1 else i | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 99 | |
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 100 | @tailrec private def many_ascii_letdig(i: Int): Int = | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 101 | if (is_ascii_letdig(char(i))) many_ascii_letdig(i + 1) else i | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 102 | private def maybe_ascii_id(i: Int): Int = | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 103 | if (is_ascii_letter(char(i))) many_ascii_letdig(i + 1) else i | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 104 | |
| 75393 | 105 |     def match_length(i: Int): Int = {
 | 
| 75237 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 106 | val a = char(i) | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 107 | val b = char(i + 1) | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 108 | |
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 109 | if (Character.isHighSurrogate(a) && Character.isLowSurrogate(b) || a == '\r' && b == '\n') 2 | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 110 |       else if (a == '\\' && b == '<') maybe_char('>', maybe_ascii_id(maybe_char('^', i + 2))) - i
 | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 111 | else if (ok(i)) 1 | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 112 | else 0 | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 113 | } | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 114 | |
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 115 | def match_symbol(i: Int): String = | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 116 |       match_length(i) match {
 | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 117 | case 0 => "" | 
| 75238 
e74d162ddf9f
clarified char symbols: cover most European languages;
 wenzelm parents: 
75237diff
changeset | 118 | case 1 => char_symbol(text.charAt(i)) | 
| 
e74d162ddf9f
clarified char symbols: cover most European languages;
 wenzelm parents: 
75237diff
changeset | 119 | case n => text.subSequence(i, i + n).toString | 
| 34137 | 120 | } | 
| 31522 | 121 | } | 
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 122 | |
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 123 | |
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 124 | /* iterator */ | 
| 33998 | 125 | |
| 43696 | 126 | def iterator(text: CharSequence): Iterator[Symbol] = | 
| 75393 | 127 |     new Iterator[Symbol] {
 | 
| 43489 | 128 | private val matcher = new Matcher(text) | 
| 129 | private var i = 0 | |
| 71601 | 130 | def hasNext: Boolean = i < text.length | 
| 75393 | 131 |       def next(): Symbol = {
 | 
| 75237 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 132 | val s = matcher.match_symbol(i) | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 133 | i += s.length | 
| 43489 | 134 | s | 
| 135 | } | |
| 33998 | 136 | } | 
| 43489 | 137 | |
| 44949 | 138 | def explode(text: CharSequence): List[Symbol] = iterator(text).toList | 
| 139 | ||
| 64615 | 140 | def length(text: CharSequence): Int = iterator(text).length | 
| 64617 | 141 | |
| 67435 
f83c1842a559
trim blanks -- more thoroughly than in update_cartouches (for single-line comments);
 wenzelm parents: 
67389diff
changeset | 142 | def trim_blanks(text: CharSequence): String = | 
| 71601 | 143 | Library.trim(is_blank, explode(text)).mkString | 
| 67435 
f83c1842a559
trim blanks -- more thoroughly than in update_cartouches (for single-line comments);
 wenzelm parents: 
67389diff
changeset | 144 | |
| 69318 | 145 | def all_blank(str: String): Boolean = | 
| 71601 | 146 | iterator(str).forall(is_blank) | 
| 69318 | 147 | |
| 148 | def trim_blank_lines(text: String): String = | |
| 149 | cat_lines(split_lines(text).dropWhile(all_blank).reverse.dropWhile(all_blank).reverse) | |
| 150 | ||
| 33998 | 151 | |
| 152 | /* decoding offsets */ | |
| 153 | ||
| 75393 | 154 |   object Index {
 | 
| 56471 
2293a4350716
more frugal Symbol.Index -- no need to waste space on mostly empty arrays;
 wenzelm parents: 
56338diff
changeset | 155 | private sealed case class Entry(chr: Int, sym: Int) | 
| 52507 | 156 | |
| 56472 | 157 | val empty: Index = new Index(Nil) | 
| 56471 
2293a4350716
more frugal Symbol.Index -- no need to waste space on mostly empty arrays;
 wenzelm parents: 
56338diff
changeset | 158 | |
| 75393 | 159 |     def apply(text: CharSequence): Index = {
 | 
| 34137 | 160 | val matcher = new Matcher(text) | 
| 56471 
2293a4350716
more frugal Symbol.Index -- no need to waste space on mostly empty arrays;
 wenzelm parents: 
56338diff
changeset | 161 | val buf = new mutable.ListBuffer[Entry] | 
| 31929 | 162 | var chr = 0 | 
| 163 | var sym = 0 | |
| 33998 | 164 |       while (chr < text.length) {
 | 
| 75237 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 165 | val n = matcher.match_length(chr) | 
| 34137 | 166 | chr += n | 
| 31929 | 167 | sym += 1 | 
| 34137 | 168 | if (n > 1) buf += Entry(chr, sym) | 
| 31929 | 169 | } | 
| 56472 | 170 | if (buf.isEmpty) empty else new Index(buf.toList) | 
| 31929 | 171 | } | 
| 56471 
2293a4350716
more frugal Symbol.Index -- no need to waste space on mostly empty arrays;
 wenzelm parents: 
56338diff
changeset | 172 | } | 
| 55430 | 173 | |
| 75393 | 174 |   final class Index private(entries: List[Index.Entry]) {
 | 
| 56472 | 175 | private val hash: Int = entries.hashCode | 
| 176 | private val index: Array[Index.Entry] = entries.toArray | |
| 177 | ||
| 75393 | 178 |     def decode(symbol_offset: Offset): Text.Offset = {
 | 
| 55884 
f2c0eaedd579
tuned signature -- emphasize symbol positions (prover) vs. decoded text offsets (editor);
 wenzelm parents: 
55618diff
changeset | 179 | val sym = symbol_offset - 1 | 
| 31929 | 180 | val end = index.length | 
| 75393 | 181 |       @tailrec def bisect(a: Int, b: Int): Int = {
 | 
| 31929 | 182 |         if (a < b) {
 | 
| 183 | val c = (a + b) / 2 | |
| 184 | if (sym < index(c).sym) bisect(a, c) | |
| 185 | else if (c + 1 == end || sym < index(c + 1).sym) c | |
| 186 | else bisect(c + 1, b) | |
| 187 | } | |
| 188 | else -1 | |
| 189 | } | |
| 190 | val i = bisect(0, end) | |
| 191 | if (i < 0) sym | |
| 192 | else index(i).chr + sym - index(i).sym | |
| 193 | } | |
| 71601 | 194 | def decode(symbol_range: Range): Text.Range = symbol_range.map(decode) | 
| 56335 
8953d4cc060a
store blob content within document node: aux. files that were once open are made persistent;
 wenzelm parents: 
55884diff
changeset | 195 | |
| 56338 
f968f4e3d520
proper structural hashCode, which is required for Command.File equals (NB: Array has physical object identity);
 wenzelm parents: 
56335diff
changeset | 196 | override def hashCode: Int = hash | 
| 56335 
8953d4cc060a
store blob content within document node: aux. files that were once open are made persistent;
 wenzelm parents: 
55884diff
changeset | 197 | override def equals(that: Any): Boolean = | 
| 
8953d4cc060a
store blob content within document node: aux. files that were once open are made persistent;
 wenzelm parents: 
55884diff
changeset | 198 |       that match {
 | 
| 
8953d4cc060a
store blob content within document node: aux. files that were once open are made persistent;
 wenzelm parents: 
55884diff
changeset | 199 | case other: Index => index.sameElements(other.index) | 
| 
8953d4cc060a
store blob content within document node: aux. files that were once open are made persistent;
 wenzelm parents: 
55884diff
changeset | 200 | case _ => false | 
| 
8953d4cc060a
store blob content within document node: aux. files that were once open are made persistent;
 wenzelm parents: 
55884diff
changeset | 201 | } | 
| 31929 | 202 | } | 
| 203 | ||
| 204 | ||
| 64477 | 205 | /* symbolic text chunks -- without actual text */ | 
| 56746 | 206 | |
| 75393 | 207 |   object Text_Chunk {
 | 
| 56746 | 208 | sealed abstract class Name | 
| 209 | case object Default extends Name | |
| 210 | case class Id(id: Document_ID.Generic) extends Name | |
| 211 | case class File(name: String) extends Name | |
| 212 | ||
| 213 | def apply(text: CharSequence): Text_Chunk = | |
| 76235 | 214 | new Text_Chunk(Text.Range.length(text), Index(text)) | 
| 56746 | 215 | } | 
| 216 | ||
| 75393 | 217 |   final class Text_Chunk private(val range: Text.Range, private val index: Index) {
 | 
| 56746 | 218 | override def hashCode: Int = (range, index).hashCode | 
| 219 | override def equals(that: Any): Boolean = | |
| 220 |       that match {
 | |
| 221 | case other: Text_Chunk => | |
| 222 | range == other.range && | |
| 223 | index == other.index | |
| 224 | case _ => false | |
| 225 | } | |
| 226 | ||
| 57840 | 227 | override def toString: String = "Text_Chunk" + range.toString | 
| 228 | ||
| 56746 | 229 | def decode(symbol_offset: Offset): Text.Offset = index.decode(symbol_offset) | 
| 230 | def decode(symbol_range: Range): Text.Range = index.decode(symbol_range) | |
| 75393 | 231 |     def incorporate(symbol_range: Range): Option[Text.Range] = {
 | 
| 75659 
9bd92ac9328f
more robust Scala 3 indentation, for the sake of IntelliJ IDEA;
 wenzelm parents: 
75393diff
changeset | 232 |       def in(r: Range): Option[Text.Range] = {
 | 
| 56746 | 233 |         range.try_restrict(decode(r)) match {
 | 
| 234 | case Some(r1) if !r1.is_singularity => Some(r1) | |
| 235 | case _ => None | |
| 236 | } | |
| 75659 
9bd92ac9328f
more robust Scala 3 indentation, for the sake of IntelliJ IDEA;
 wenzelm parents: 
75393diff
changeset | 237 | } | 
| 
9bd92ac9328f
more robust Scala 3 indentation, for the sake of IntelliJ IDEA;
 wenzelm parents: 
75393diff
changeset | 238 | in(symbol_range) orElse in(symbol_range - 1) | 
| 56746 | 239 | } | 
| 240 | } | |
| 241 | ||
| 242 | ||
| 33998 | 243 | /* recoding text */ | 
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 244 | |
| 75393 | 245 |   private class Recoder(list: List[(String, String)]) {
 | 
| 246 |     private val (min, max) = {
 | |
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 247 | var min = '\uffff' | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 248 | var max = '\u0000' | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 249 |       for ((x, _) <- list) {
 | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 250 | val c = x(0) | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 251 | if (c < min) min = c | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 252 | if (c > max) max = c | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 253 | } | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 254 | (min, max) | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 255 | } | 
| 75393 | 256 |     private val table = {
 | 
| 40443 | 257 | var tab = Map[String, String]() | 
| 258 |       for ((x, y) <- list) {
 | |
| 259 |         tab.get(x) match {
 | |
| 260 | case None => tab += (x -> y) | |
| 261 | case Some(z) => | |
| 62230 | 262 |             error("Duplicate symbol mapping of " + quote(x) + " to " + quote(y) + " vs. " + quote(z))
 | 
| 40443 | 263 | } | 
| 264 | } | |
| 265 | tab | |
| 266 | } | |
| 75393 | 267 |     def recode(text: String): String = {
 | 
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 268 | val len = text.length | 
| 75237 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 269 | val matcher = new Symbol.Matcher(text) | 
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 270 | val result = new StringBuilder(len) | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 271 | var i = 0 | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 272 |       while (i < len) {
 | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 273 | val c = text(i) | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 274 |         if (min <= c && c <= max) {
 | 
| 75237 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 275 | val s = matcher.match_symbol(i) | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 276 | result.append(table.getOrElse(s, s)) | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
75199diff
changeset | 277 | i += s.length | 
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 278 | } | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 279 |         else { result.append(c); i += 1 }
 | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 280 | } | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 281 | result.toString | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 282 | } | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 283 | } | 
| 27924 | 284 | |
| 27918 | 285 | |
| 27923 
7ebe9d38743a
use scala.collection.jcl.HashMap, which seems to be more efficient;
 wenzelm parents: 
27918diff
changeset | 286 | |
| 75194 | 287 | /** defined symbols **/ | 
| 27927 | 288 | |
| 75393 | 289 |   object Argument extends Enumeration {
 | 
| 75194 | 290 | val none, cartouche, space_cartouche = Value | 
| 67311 | 291 | |
| 75194 | 292 | def unapply(s: String): Option[Value] = | 
| 293 |       try { Some(withName(s)) }
 | |
| 294 |       catch { case _: NoSuchElementException => None}
 | |
| 75197 | 295 | } | 
| 75193 | 296 | |
| 75393 | 297 |   object Entry {
 | 
| 75197 | 298 |     private val Name = new Regex("""\\<\^?([A-Za-z][A-Za-z0-9_']*)>""")
 | 
| 299 |     private val Argument = new Properties.String("argument")
 | |
| 300 |     private val Abbrev = new Properties.String("abbrev")
 | |
| 301 |     private val Code = new Properties.String("code")
 | |
| 302 |     private val Font = new Properties.String("font")
 | |
| 303 |     private val Group = new Properties.String("group")
 | |
| 304 | ||
| 75393 | 305 |     def apply(symbol: Symbol, props: Properties.T): Entry = {
 | 
| 75197 | 306 | def err(msg: String): Nothing = error(msg + " for symbol " + quote(symbol)) | 
| 307 | ||
| 308 | val name = | |
| 309 |         symbol match { case Name(a) => a case _ => err("Cannot determine name") }
 | |
| 310 | ||
| 311 | val argument = | |
| 312 |         props match {
 | |
| 313 | case Argument(arg) => | |
| 314 | Symbol.Argument.unapply(arg) getOrElse | |
| 315 |               error("Bad argument: " + quote(arg) + " for symbol " + quote(symbol))
 | |
| 316 | case _ => Symbol.Argument.none | |
| 317 | } | |
| 318 | ||
| 319 | val code = | |
| 320 |         props match {
 | |
| 321 | case Code(s) => | |
| 322 |             try {
 | |
| 323 | val code = Integer.decode(s).intValue | |
| 324 |               if (code >= 128) Some(code) else err("Illegal ASCII code")
 | |
| 325 | } | |
| 326 |             catch { case _: NumberFormatException => err("Bad code") }
 | |
| 327 | case _ => None | |
| 328 | } | |
| 329 | ||
| 330 |       val groups = proper_list(for ((Group.name, a) <- props) yield a).getOrElse(List("unsorted"))
 | |
| 331 | ||
| 332 | val abbrevs = for ((Abbrev.name, a) <- props) yield a | |
| 333 | ||
| 334 | new Entry(symbol, name, argument, code, Font.unapply(props), groups, abbrevs) | |
| 335 | } | |
| 336 | } | |
| 337 | ||
| 338 | class Entry private( | |
| 339 | val symbol: Symbol, | |
| 340 | val name: String, | |
| 341 | val argument: Symbol.Argument.Value, | |
| 342 | val code: Option[Int], | |
| 343 | val font: Option[String], | |
| 344 | val groups: List[String], | |
| 75393 | 345 | val abbrevs: List[String] | 
| 346 |   ) {
 | |
| 75197 | 347 | override def toString: String = symbol | 
| 348 | ||
| 349 | val decode: Option[String] = | |
| 350 | code.map(c => new String(Character.toChars(c))) | |
| 75194 | 351 | } | 
| 352 | ||
| 75195 | 353 | lazy val symbols: Symbols = Symbols.load() | 
| 75194 | 354 | |
| 75393 | 355 |   object Symbols {
 | 
| 356 |     def load(static: Boolean = false): Symbols = {
 | |
| 75252 
41dfe941c3da
inline Isabelle symbols into source text, so that "isabelle vscode" can start up properly without access to process.env or fs;
 wenzelm parents: 
75238diff
changeset | 357 | val paths = | 
| 
41dfe941c3da
inline Isabelle symbols into source text, so that "isabelle vscode" can start up properly without access to process.env or fs;
 wenzelm parents: 
75238diff
changeset | 358 |         if (static) List(Path.explode("~~/etc/symbols"))
 | 
| 
41dfe941c3da
inline Isabelle symbols into source text, so that "isabelle vscode" can start up properly without access to process.env or fs;
 wenzelm parents: 
75238diff
changeset | 359 |         else Path.split(Isabelle_System.getenv("ISABELLE_SYMBOLS"))
 | 
| 
41dfe941c3da
inline Isabelle symbols into source text, so that "isabelle vscode" can start up properly without access to process.env or fs;
 wenzelm parents: 
75238diff
changeset | 360 | make(cat_lines(for (path <- paths if path.is_file) yield File.read(path))) | 
| 75193 | 361 | } | 
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 362 | |
| 75393 | 363 |     def make(symbols_spec: String): Symbols = {
 | 
| 75193 | 364 |       val No_Decl = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
 | 
| 365 |       val Key = new Regex("""(?xs) (.+): """)
 | |
| 31522 | 366 | |
| 75393 | 367 |       def read_decl(decl: String): (Symbol, Properties.T) = {
 | 
| 75193 | 368 |         def err() = error("Bad symbol declaration: " + decl)
 | 
| 31522 | 369 | |
| 75393 | 370 |         def read_props(props: List[String]): Properties.T = {
 | 
| 75193 | 371 |           props match {
 | 
| 372 | case Nil => Nil | |
| 373 | case _ :: Nil => err() | |
| 374 |             case Key(x) :: y :: rest => (x -> y.replace('\u2423', ' ')) :: read_props(rest)
 | |
| 375 | case _ => err() | |
| 376 | } | |
| 377 | } | |
| 378 |         decl.split("\\s+").toList match {
 | |
| 379 | case sym :: props if sym.length > 1 && !is_malformed(sym) => | |
| 380 | (sym, read_props(props)) | |
| 31522 | 381 | case _ => err() | 
| 382 | } | |
| 383 | } | |
| 384 | ||
| 75197 | 385 | new Symbols( | 
| 75193 | 386 | split_lines(symbols_spec).reverse. | 
| 75197 | 387 |           foldLeft((List.empty[Entry], Set.empty[Symbol])) {
 | 
| 75193 | 388 | case (res, No_Decl()) => res | 
| 389 | case ((list, known), decl) => | |
| 390 | val (sym, props) = read_decl(decl) | |
| 391 | if (known(sym)) (list, known) | |
| 75197 | 392 | else (Entry(sym, props) :: list, known + sym) | 
| 393 | }._1) | |
| 75193 | 394 | } | 
| 395 | } | |
| 31522 | 396 | |
| 75393 | 397 |   class Symbols(val entries: List[Entry]) {
 | 
| 75197 | 398 |     override def toString: String = entries.mkString("Symbols(", ", ", ")")
 | 
| 399 | ||
| 400 | ||
| 53400 | 401 | /* basic properties */ | 
| 402 | ||
| 75197 | 403 | private val entries_map: Map[Symbol, Entry] = | 
| 404 | (for (entry <- entries.iterator) yield entry.symbol -> entry).toMap | |
| 31651 | 405 | |
| 75197 | 406 | def get(sym: Symbol): Option[Entry] = entries_map.get(sym) | 
| 407 | def defined(sym: Symbol): Boolean = entries_map.isDefinedAt(sym) | |
| 408 | ||
| 409 | def defined_code(sym: Symbol): Boolean = | |
| 410 |       get(sym) match { case Some(entry) => entry.code.isDefined case _ => false }
 | |
| 31651 | 411 | |
| 75197 | 412 | val code_defined: Int => Boolean = | 
| 413 | (for (entry <- entries.iterator; code <- entry.code) yield code).toSet | |
| 50136 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
48922diff
changeset | 414 | |
| 75197 | 415 | val groups_code: List[(String, List[Symbol])] = | 
| 416 |       (for {
 | |
| 417 | entry <- entries.iterator if entry.code.isDefined | |
| 418 | group <- entry.groups.iterator | |
| 419 | } yield entry.symbol -> group) | |
| 420 |         .toList.groupBy(_._2).toList.map({ case (g, list) => (g, list.map(_._1)) }).sortBy(_._1)
 | |
| 75195 | 421 | |
| 75199 | 422 | def get_abbrevs(sym: Symbol): List[String] = | 
| 423 |       get(sym) match { case Some(entry) => entry.abbrevs case _ => Nil }
 | |
| 75195 | 424 | |
| 43488 | 425 | |
| 43490 | 426 | /* recoding */ | 
| 31522 | 427 | |
| 75393 | 428 |     private val (decoder, encoder) = {
 | 
| 31522 | 429 | val mapping = | 
| 75197 | 430 | for (entry <- entries; s <- entry.decode) yield entry.symbol -> s | 
| 61376 
93224745477f
output HTML text according to Isabelle/Scala Symbol.Interpretation;
 wenzelm parents: 
61174diff
changeset | 431 | (new Recoder(mapping), new Recoder(for ((x, y) <- mapping) yield (y, x))) | 
| 31522 | 432 | } | 
| 27918 | 433 | |
| 34098 | 434 | def decode(text: String): String = decoder.recode(text) | 
| 435 | def encode(text: String): String = encoder.recode(text) | |
| 34134 | 436 | |
| 75198 | 437 | private def recode_set(elems: Iterable[String]): Set[String] = | 
| 438 | (elems.iterator ++ elems.iterator.map(decode)).toSet | |
| 43490 | 439 | |
| 75198 | 440 | private def recode_map[A](elems: Iterable[(String, A)]): Map[String, A] = | 
| 441 |       (elems.iterator ++ elems.iterator.map({ case (sym, a) => (decode(sym), a) })).toMap
 | |
| 43490 | 442 | |
| 443 | ||
| 444 | /* user fonts */ | |
| 445 | ||
| 43696 | 446 | val fonts: Map[Symbol, String] = | 
| 75198 | 447 | recode_map(for (entry <- entries; font <- entry.font) yield entry.symbol -> font) | 
| 43490 | 448 | |
| 75192 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 449 | val font_names: List[String] = fonts.iterator.map(_._2).toSet.toList | 
| 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 450 | val font_index: Map[String, Int] = (font_names zip font_names.indices.toList).toMap | 
| 43490 | 451 | |
| 75195 | 452 | def lookup_font(sym: Symbol): Option[Int] = fonts.get(sym).map(font_index(_)) | 
| 453 | ||
| 34134 | 454 | |
| 455 | /* classification */ | |
| 456 | ||
| 75198 | 457 | val letters: Set[String] = recode_set(List( | 
| 34134 | 458 | "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", | 
| 459 | "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", | |
| 460 | "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", | |
| 461 | "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", | |
| 462 | ||
| 463 | "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>", | |
| 464 | "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>", | |
| 465 | "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>", | |
| 466 | "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>", | |
| 467 | "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>", | |
| 468 | "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>", | |
| 469 | "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>", | |
| 470 | "\\<x>", "\\<y>", "\\<z>", | |
| 471 | ||
| 472 | "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>", | |
| 473 | "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>", | |
| 474 | "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>", | |
| 475 | "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>", | |
| 476 | "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>", | |
| 477 | "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>", | |
| 478 | "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>", | |
| 479 | "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>", | |
| 480 | "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>", | |
| 481 | ||
| 482 | "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>", | |
| 483 | "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>", | |
| 484 | "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>", | |
| 485 | "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>", | |
| 486 | "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>", | |
| 487 | "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>", | |
| 75198 | 488 | "\\<Psi>", "\\<Omega>")) | 
| 34134 | 489 | |
| 75198 | 490 | val blanks: Set[String] = recode_set(List(space, "\t", "\n", "\u000B", "\f", "\r", "\r\n")) | 
| 34138 | 491 | |
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 492 | val sym_chars = | 
| 34138 | 493 |       Set("!", "#", "$", "%", "&", "*", "+", "-", "/", "<", "=", ">", "?", "@", "^", "_", "|", "~")
 | 
| 34134 | 494 | |
| 75197 | 495 | val symbolic: Set[String] = | 
| 75198 | 496 | recode_set(for (entry <- entries if raw_symbolic(entry.symbol)) yield entry.symbol) | 
| 44992 
aa34d2d049ce
refined Symbol.is_symbolic -- cover recoded versions as well;
 wenzelm parents: 
44949diff
changeset | 497 | |
| 43455 | 498 | |
| 63528 
0f39f59317c1
completion templates for commands involving "begin ... end" blocks;
 wenzelm parents: 
62528diff
changeset | 499 | /* misc symbols */ | 
| 61579 | 500 | |
| 75192 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 501 | val newline_decoded: Symbol = decode(newline) | 
| 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 502 | val comment_decoded: Symbol = decode(comment) | 
| 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 503 | val cancel_decoded: Symbol = decode(cancel) | 
| 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 504 | val latex_decoded: Symbol = decode(latex) | 
| 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 505 | val marker_decoded: Symbol = decode(marker) | 
| 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 506 | val open_decoded: Symbol = decode(open) | 
| 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 507 | val close_decoded: Symbol = decode(close) | 
| 55033 | 508 | |
| 509 | ||
| 43488 | 510 | /* control symbols */ | 
| 511 | ||
| 59107 | 512 | val control_decoded: Set[Symbol] = | 
| 75197 | 513 |       (for (entry <- entries.iterator if entry.symbol.startsWith("\\<^"); s <- entry.decode)
 | 
| 514 | yield s).toSet | |
| 43488 | 515 | |
| 75192 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 516 | val sub_decoded: Symbol = decode(sub) | 
| 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 517 | val sup_decoded: Symbol = decode(sup) | 
| 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 518 | val bold_decoded: Symbol = decode(bold) | 
| 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 519 | val emph_decoded: Symbol = decode(emph) | 
| 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 520 | val bsub_decoded: Symbol = decode(bsub) | 
| 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 521 | val esub_decoded: Symbol = decode(esub) | 
| 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 522 | val bsup_decoded: Symbol = decode(bsup) | 
| 
7d680dcd69b1
misc tuning, based on suggestions by IntelliJ IDEA;
 wenzelm parents: 
73359diff
changeset | 523 | val esup_decoded: Symbol = decode(esup) | 
| 27918 | 524 | } | 
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 525 | |
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 526 | |
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 527 | /* tables */ | 
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 528 | |
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 529 | def decode(text: String): String = symbols.decode(text) | 
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 530 | def encode(text: String): String = symbols.encode(text) | 
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 531 | |
| 73030 | 532 | def decode_yxml(text: String, cache: XML.Cache = XML.Cache.none): XML.Body = | 
| 533 | YXML.parse_body(decode(text), cache = cache) | |
| 534 | ||
| 535 | def decode_yxml_failsafe(text: String, cache: XML.Cache = XML.Cache.none): XML.Body = | |
| 536 | YXML.parse_body_failsafe(decode(text), cache = cache) | |
| 537 | ||
| 65344 
b99283eed13c
clarified YXML vs. symbol encoding: operate on whole message;
 wenzelm parents: 
65335diff
changeset | 538 | def encode_yxml(body: XML.Body): String = encode(YXML.string_of_body(body)) | 
| 53337 
b3817a0e3211
sort items according to persistent history of frequency of use;
 wenzelm parents: 
53316diff
changeset | 539 | |
| 75393 | 540 |   def decode_strict(text: String): String = {
 | 
| 50291 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 541 | val decoded = decode(text) | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 542 | if (encode(decoded) == text) decoded | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 543 |     else {
 | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 544 | val bad = new mutable.ListBuffer[Symbol] | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 545 | for (s <- iterator(text) if encode(decode(s)) != s && !bad.contains(s)) | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 546 | bad += s | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 547 |       error("Bad Unicode symbols in text: " + commas_quote(bad))
 | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 548 | } | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 549 | } | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 550 | |
| 72866 | 551 | def output(unicode_symbols: Boolean, text: String): String = | 
| 552 | if (unicode_symbols) Symbol.decode(text) else Symbol.encode(text) | |
| 553 | ||
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 554 | |
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 555 | /* classification */ | 
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 556 | |
| 43696 | 557 | def is_letter(sym: Symbol): Boolean = symbols.letters.contains(sym) | 
| 558 | def is_digit(sym: Symbol): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9' | |
| 559 | def is_quasi(sym: Symbol): Boolean = sym == "_" || sym == "'" | |
| 560 | def is_letdig(sym: Symbol): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym) | |
| 561 | def is_blank(sym: Symbol): Boolean = symbols.blanks.contains(sym) | |
| 44992 
aa34d2d049ce
refined Symbol.is_symbolic -- cover recoded versions as well;
 wenzelm parents: 
44949diff
changeset | 562 | |
| 55033 | 563 | |
| 67438 | 564 | /* symbolic newline */ | 
| 63528 
0f39f59317c1
completion templates for commands involving "begin ... end" blocks;
 wenzelm parents: 
62528diff
changeset | 565 | |
| 
0f39f59317c1
completion templates for commands involving "begin ... end" blocks;
 wenzelm parents: 
62528diff
changeset | 566 | val newline: Symbol = "\\<newline>" | 
| 
0f39f59317c1
completion templates for commands involving "begin ... end" blocks;
 wenzelm parents: 
62528diff
changeset | 567 | def newline_decoded: Symbol = symbols.newline_decoded | 
| 
0f39f59317c1
completion templates for commands involving "begin ... end" blocks;
 wenzelm parents: 
62528diff
changeset | 568 | |
| 
0f39f59317c1
completion templates for commands involving "begin ... end" blocks;
 wenzelm parents: 
62528diff
changeset | 569 | def print_newlines(str: String): String = | 
| 
0f39f59317c1
completion templates for commands involving "begin ... end" blocks;
 wenzelm parents: 
62528diff
changeset | 570 |     if (str.contains('\n'))
 | 
| 
0f39f59317c1
completion templates for commands involving "begin ... end" blocks;
 wenzelm parents: 
62528diff
changeset | 571 |       (for (s <- iterator(str)) yield { if (s == "\n") newline_decoded else s }).mkString
 | 
| 
0f39f59317c1
completion templates for commands involving "begin ... end" blocks;
 wenzelm parents: 
62528diff
changeset | 572 | else str | 
| 61579 | 573 | |
| 67438 | 574 | |
| 575 | /* formal comments */ | |
| 576 | ||
| 61579 | 577 | val comment: Symbol = "\\<comment>" | 
| 67449 | 578 | val cancel: Symbol = "\\<^cancel>" | 
| 579 | val latex: Symbol = "\\<^latex>" | |
| 69891 
def3ec9cdb7e
document markers are formal comments, and may thus occur anywhere in the command-span;
 wenzelm parents: 
69887diff
changeset | 580 | val marker: Symbol = "\\<^marker>" | 
| 67449 | 581 | |
| 61579 | 582 | def comment_decoded: Symbol = symbols.comment_decoded | 
| 67438 | 583 | def cancel_decoded: Symbol = symbols.cancel_decoded | 
| 584 | def latex_decoded: Symbol = symbols.latex_decoded | |
| 69891 
def3ec9cdb7e
document markers are formal comments, and may thus occur anywhere in the command-span;
 wenzelm parents: 
69887diff
changeset | 585 | def marker_decoded: Symbol = symbols.marker_decoded | 
| 61579 | 586 | |
| 587 | ||
| 55033 | 588 | /* cartouches */ | 
| 589 | ||
| 61579 | 590 | val open: Symbol = "\\<open>" | 
| 591 | val close: Symbol = "\\<close>" | |
| 55033 | 592 | |
| 593 | def open_decoded: Symbol = symbols.open_decoded | |
| 594 | def close_decoded: Symbol = symbols.close_decoded | |
| 595 | ||
| 596 | def is_open(sym: Symbol): Boolean = sym == open_decoded || sym == open | |
| 597 | def is_close(sym: Symbol): Boolean = sym == close_decoded || sym == close | |
| 598 | ||
| 67131 | 599 | def cartouche(s: String): String = open + s + close | 
| 600 | def cartouche_decoded(s: String): String = open_decoded + s + close_decoded | |
| 601 | ||
| 55033 | 602 | |
| 603 | /* symbols for symbolic identifiers */ | |
| 44992 
aa34d2d049ce
refined Symbol.is_symbolic -- cover recoded versions as well;
 wenzelm parents: 
44949diff
changeset | 604 | |
| 
aa34d2d049ce
refined Symbol.is_symbolic -- cover recoded versions as well;
 wenzelm parents: 
44949diff
changeset | 605 | private def raw_symbolic(sym: Symbol): Boolean = | 
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 606 |     sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^")
 | 
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 607 | |
| 55033 | 608 | def is_symbolic(sym: Symbol): Boolean = | 
| 609 | !is_open(sym) && !is_close(sym) && (raw_symbolic(sym) || symbols.symbolic.contains(sym)) | |
| 610 | ||
| 611 | def is_symbolic_char(sym: Symbol): Boolean = symbols.sym_chars.contains(sym) | |
| 612 | ||
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 613 | |
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 614 | /* control symbols */ | 
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 615 | |
| 67127 
cf111622c9f8
font style for literal control symbols, notably for antiquotations;
 wenzelm parents: 
66919diff
changeset | 616 | val control_prefix = "\\<^" | 
| 
cf111622c9f8
font style for literal control symbols, notably for antiquotations;
 wenzelm parents: 
66919diff
changeset | 617 | val control_suffix = ">" | 
| 
cf111622c9f8
font style for literal control symbols, notably for antiquotations;
 wenzelm parents: 
66919diff
changeset | 618 | |
| 67255 
f1f983484878
HTML rendering of \<^control> as in Isabelle/jEdit;
 wenzelm parents: 
67131diff
changeset | 619 | def control_name(sym: Symbol): Option[String] = | 
| 
f1f983484878
HTML rendering of \<^control> as in Isabelle/jEdit;
 wenzelm parents: 
67131diff
changeset | 620 | if (is_control_encoded(sym)) | 
| 
f1f983484878
HTML rendering of \<^control> as in Isabelle/jEdit;
 wenzelm parents: 
67131diff
changeset | 621 | Some(sym.substring(control_prefix.length, sym.length - control_suffix.length)) | 
| 
f1f983484878
HTML rendering of \<^control> as in Isabelle/jEdit;
 wenzelm parents: 
67131diff
changeset | 622 | else None | 
| 
f1f983484878
HTML rendering of \<^control> as in Isabelle/jEdit;
 wenzelm parents: 
67131diff
changeset | 623 | |
| 67127 
cf111622c9f8
font style for literal control symbols, notably for antiquotations;
 wenzelm parents: 
66919diff
changeset | 624 | def is_control_encoded(sym: Symbol): Boolean = | 
| 
cf111622c9f8
font style for literal control symbols, notably for antiquotations;
 wenzelm parents: 
66919diff
changeset | 625 | sym.startsWith(control_prefix) && sym.endsWith(control_suffix) | 
| 
cf111622c9f8
font style for literal control symbols, notably for antiquotations;
 wenzelm parents: 
66919diff
changeset | 626 | |
| 59107 | 627 | def is_control(sym: Symbol): Boolean = | 
| 67127 
cf111622c9f8
font style for literal control symbols, notably for antiquotations;
 wenzelm parents: 
66919diff
changeset | 628 | is_control_encoded(sym) || symbols.control_decoded.contains(sym) | 
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 629 | |
| 43696 | 630 | def is_controllable(sym: Symbol): Boolean = | 
| 66006 
cec184536dfd
uniform notion of Symbol.is_controllable (see also 265d9300d523);
 wenzelm parents: 
65997diff
changeset | 631 | !is_blank(sym) && !is_control(sym) && !is_open(sym) && !is_close(sym) && | 
| 
cec184536dfd
uniform notion of Symbol.is_controllable (see also 265d9300d523);
 wenzelm parents: 
65997diff
changeset | 632 | !is_malformed(sym) && sym != "\"" | 
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 633 | |
| 73208 | 634 | val sub: Symbol = "\\<^sub>" | 
| 635 | val sup: Symbol = "\\<^sup>" | |
| 636 | val bold: Symbol = "\\<^bold>" | |
| 637 | val emph: Symbol = "\\<^emph>" | |
| 638 | val bsub: Symbol = "\\<^bsub>" | |
| 639 | val esub: Symbol = "\\<^esub>" | |
| 640 | val bsup: Symbol = "\\<^bsup>" | |
| 641 | val esup: Symbol = "\\<^esup>" | |
| 62103 | 642 | |
| 44238 
36120feb70ed
some convenience actions/shortcuts for control symbols;
 wenzelm parents: 
44181diff
changeset | 643 | def sub_decoded: Symbol = symbols.sub_decoded | 
| 
36120feb70ed
some convenience actions/shortcuts for control symbols;
 wenzelm parents: 
44181diff
changeset | 644 | def sup_decoded: Symbol = symbols.sup_decoded | 
| 62103 | 645 | def bold_decoded: Symbol = symbols.bold_decoded | 
| 646 | def emph_decoded: Symbol = symbols.emph_decoded | |
| 44238 
36120feb70ed
some convenience actions/shortcuts for control symbols;
 wenzelm parents: 
44181diff
changeset | 647 | def bsub_decoded: Symbol = symbols.bsub_decoded | 
| 
36120feb70ed
some convenience actions/shortcuts for control symbols;
 wenzelm parents: 
44181diff
changeset | 648 | def esub_decoded: Symbol = symbols.esub_decoded | 
| 
36120feb70ed
some convenience actions/shortcuts for control symbols;
 wenzelm parents: 
44181diff
changeset | 649 | def bsup_decoded: Symbol = symbols.bsup_decoded | 
| 
36120feb70ed
some convenience actions/shortcuts for control symbols;
 wenzelm parents: 
44181diff
changeset | 650 | def esup_decoded: Symbol = symbols.esup_decoded | 
| 71649 | 651 | |
| 652 | ||
| 653 | /* metric */ | |
| 654 | ||
| 655 | def is_printable(sym: Symbol): Boolean = | |
| 656 | if (is_ascii(sym)) is_ascii_printable(sym(0)) | |
| 657 | else !is_control(sym) | |
| 658 | ||
| 75393 | 659 |   object Metric extends Pretty.Metric {
 | 
| 71649 | 660 | val unit = 1.0 | 
| 661 | def apply(str: String): Double = | |
| 662 |       (for (s <- iterator(str)) yield {
 | |
| 663 | val sym = encode(s) | |
| 664 |         if (sym.startsWith("\\<long") || sym.startsWith("\\<Long")) 2
 | |
| 665 | else if (is_printable(sym)) 1 | |
| 666 | else 0 | |
| 667 | }).sum | |
| 668 | } | |
| 27901 | 669 | } |