| author | blanchet | 
| Mon, 04 Nov 2013 17:25:36 +0100 | |
| changeset 54247 | 81ee85f56e2d | 
| parent 53400 | 673eb869e6ee | 
| child 54734 | b91afc3aa3e6 | 
| permissions | -rw-r--r-- | 
| 27901 | 1 | /* Title: Pure/General/symbol.scala | 
| 2 | Author: Makarius | |
| 3 | ||
| 27924 | 4 | Detecting and recoding Isabelle symbols. | 
| 27901 | 5 | */ | 
| 6 | ||
| 7 | package isabelle | |
| 8 | ||
| 36011 
3ff725ac13a4
adapted to Scala 2.8.0 Beta1 -- with notable changes to scala.collection;
 wenzelm parents: 
34316diff
changeset | 9 | import scala.collection.mutable | 
| 31522 | 10 | import scala.util.matching.Regex | 
| 48922 | 11 | import scala.annotation.tailrec | 
| 27901 | 12 | |
| 13 | ||
| 31522 | 14 | object Symbol | 
| 15 | {
 | |
| 43696 | 16 | type Symbol = String | 
| 17 | ||
| 18 | ||
| 43418 | 19 | /* ASCII characters */ | 
| 20 | ||
| 21 | def is_ascii_letter(c: Char): Boolean = 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' | |
| 22 | def is_ascii_digit(c: Char): Boolean = '0' <= c && c <= '9' | |
| 23 | def is_ascii_quasi(c: Char): Boolean = c == '_' || c == '\'' | |
| 24 | ||
| 25 | def is_ascii_letdig(c: Char): Boolean = | |
| 26 | is_ascii_letter(c) || is_ascii_digit(c) || is_ascii_quasi(c) | |
| 27 | ||
| 28 | def is_ascii_identifier(s: String): Boolean = | |
| 50238 
98d35a7368bd
more uniform Symbol.is_ascii_identifier in ML/Scala;
 wenzelm parents: 
50233diff
changeset | 29 | s.length > 0 && is_ascii_letter(s(0)) && s.forall(is_ascii_letdig) | 
| 43418 | 30 | |
| 31 | ||
| 48775 | 32 | /* symbol matching */ | 
| 27901 | 33 | |
| 48775 | 34 |   private val symbol_total = new Regex("""(?xs)
 | 
| 35 | [\ud800-\udbff][\udc00-\udfff] | \r\n | | |
| 36 | \\ < (?: \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* | \^? ([A-Za-z][A-Za-z0-9_']*)? ) >? | | |
| 37 | .""") | |
| 27924 | 38 | |
| 48775 | 39 | private def is_plain(c: Char): Boolean = | 
| 40 | !(c == '\r' || c == '\\' || Character.isHighSurrogate(c)) | |
| 48773 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 41 | |
| 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 42 | def is_malformed(s: Symbol): Boolean = | 
| 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 43 |     s.length match {
 | 
| 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 44 | case 1 => | 
| 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 45 | val c = s(0) | 
| 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 46 | Character.isHighSurrogate(c) || Character.isLowSurrogate(c) || c == '\ufffd' | 
| 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 47 | case 2 => | 
| 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 48 | val c1 = s(0) | 
| 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 49 | val c2 = s(1) | 
| 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 50 | !(c1 == '\r' && c2 == '\n' || Character.isSurrogatePair(c1, c2)) | 
| 48774 | 51 |       case _ => !s.endsWith(">") || s == "\\<>" || s == "\\<^>"
 | 
| 48773 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 wenzelm parents: 
48704diff
changeset | 52 | } | 
| 34137 | 53 | |
| 43696 | 54 | def is_physical_newline(s: Symbol): Boolean = | 
| 43675 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
 wenzelm parents: 
43511diff
changeset | 55 | s == "\n" || s == "\r" || s == "\r\n" | 
| 38877 | 56 | |
| 34137 | 57 | class Matcher(text: CharSequence) | 
| 58 |   {
 | |
| 48775 | 59 | private val matcher = symbol_total.pattern.matcher(text) | 
| 34137 | 60 | def apply(start: Int, end: Int): Int = | 
| 61 |     {
 | |
| 62 | require(0 <= start && start < end && end <= text.length) | |
| 34316 
f879b649ac4c
clarified Symbol.is_plain/is_wellformed -- is_closed was rejecting plain backslashes;
 wenzelm parents: 
34193diff
changeset | 63 | if (is_plain(text.charAt(start))) 1 | 
| 34138 | 64 |       else {
 | 
| 34137 | 65 | matcher.region(start, end).lookingAt | 
| 66 | matcher.group.length | |
| 67 | } | |
| 68 | } | |
| 31522 | 69 | } | 
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 70 | |
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 71 | |
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 72 | /* iterator */ | 
| 33998 | 73 | |
| 43696 | 74 | private val char_symbols: Array[Symbol] = | 
| 43675 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
 wenzelm parents: 
43511diff
changeset | 75 | (0 until 256).iterator.map(i => new String(Array(i.toChar))).toArray | 
| 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
 wenzelm parents: 
43511diff
changeset | 76 | |
| 43696 | 77 | def iterator(text: CharSequence): Iterator[Symbol] = | 
| 78 | new Iterator[Symbol] | |
| 40522 | 79 |     {
 | 
| 43489 | 80 | private val matcher = new Matcher(text) | 
| 81 | private var i = 0 | |
| 82 | def hasNext = i < text.length | |
| 83 | def next = | |
| 84 |       {
 | |
| 85 | val n = matcher(i, text.length) | |
| 43675 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
 wenzelm parents: 
43511diff
changeset | 86 | val s = | 
| 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
 wenzelm parents: 
43511diff
changeset | 87 | if (n == 0) "" | 
| 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
 wenzelm parents: 
43511diff
changeset | 88 |           else if (n == 1) {
 | 
| 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
 wenzelm parents: 
43511diff
changeset | 89 | val c = text.charAt(i) | 
| 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
 wenzelm parents: 
43511diff
changeset | 90 | if (c < char_symbols.length) char_symbols(c) | 
| 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
 wenzelm parents: 
43511diff
changeset | 91 | else text.subSequence(i, i + n).toString | 
| 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
 wenzelm parents: 
43511diff
changeset | 92 | } | 
| 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
 wenzelm parents: 
43511diff
changeset | 93 | else text.subSequence(i, i + n).toString | 
| 43489 | 94 | i += n | 
| 95 | s | |
| 96 | } | |
| 33998 | 97 | } | 
| 43489 | 98 | |
| 44949 | 99 | def explode(text: CharSequence): List[Symbol] = iterator(text).toList | 
| 100 | ||
| 48922 | 101 | def advance_line_column(pos: (Int, Int), text: CharSequence): (Int, Int) = | 
| 102 |   {
 | |
| 103 | var (line, column) = pos | |
| 104 |     for (sym <- iterator(text)) {
 | |
| 105 |       if (is_physical_newline(sym)) { line += 1; column = 1 }
 | |
| 106 | else column += 1 | |
| 107 | } | |
| 108 | (line, column) | |
| 109 | } | |
| 110 | ||
| 33998 | 111 | |
| 112 | /* decoding offsets */ | |
| 113 | ||
| 52507 | 114 | object Index | 
| 115 |   {
 | |
| 116 | def apply(text: CharSequence): Index = new Index(text) | |
| 117 | } | |
| 118 | ||
| 119 | final class Index private(text: CharSequence) | |
| 31929 | 120 |   {
 | 
| 43714 | 121 | sealed case class Entry(chr: Int, sym: Int) | 
| 31929 | 122 | val index: Array[Entry] = | 
| 123 |     {
 | |
| 34137 | 124 | val matcher = new Matcher(text) | 
| 31929 | 125 | val buf = new mutable.ArrayBuffer[Entry] | 
| 126 | var chr = 0 | |
| 127 | var sym = 0 | |
| 33998 | 128 |       while (chr < text.length) {
 | 
| 34137 | 129 | val n = matcher(chr, text.length) | 
| 130 | chr += n | |
| 31929 | 131 | sym += 1 | 
| 34137 | 132 | if (n > 1) buf += Entry(chr, sym) | 
| 31929 | 133 | } | 
| 134 | buf.toArray | |
| 135 | } | |
| 38479 
e628da370072
more efficient Markup_Tree, based on branches sorted by quasi-order;
 wenzelm parents: 
37556diff
changeset | 136 | def decode(sym1: Int): Int = | 
| 31929 | 137 |     {
 | 
| 38479 
e628da370072
more efficient Markup_Tree, based on branches sorted by quasi-order;
 wenzelm parents: 
37556diff
changeset | 138 | val sym = sym1 - 1 | 
| 31929 | 139 | val end = index.length | 
| 48922 | 140 | @tailrec def bisect(a: Int, b: Int): Int = | 
| 31929 | 141 |       {
 | 
| 142 |         if (a < b) {
 | |
| 143 | val c = (a + b) / 2 | |
| 144 | if (sym < index(c).sym) bisect(a, c) | |
| 145 | else if (c + 1 == end || sym < index(c + 1).sym) c | |
| 146 | else bisect(c + 1, b) | |
| 147 | } | |
| 148 | else -1 | |
| 149 | } | |
| 150 | val i = bisect(0, end) | |
| 151 | if (i < 0) sym | |
| 152 | else index(i).chr + sym - index(i).sym | |
| 153 | } | |
| 38479 
e628da370072
more efficient Markup_Tree, based on branches sorted by quasi-order;
 wenzelm parents: 
37556diff
changeset | 154 | def decode(range: Text.Range): Text.Range = range.map(decode(_)) | 
| 31929 | 155 | } | 
| 156 | ||
| 157 | ||
| 33998 | 158 | /* recoding text */ | 
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 159 | |
| 31522 | 160 | private class Recoder(list: List[(String, String)]) | 
| 161 |   {
 | |
| 162 | private val (min, max) = | |
| 163 |     {
 | |
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 164 | var min = '\uffff' | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 165 | var max = '\u0000' | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 166 |       for ((x, _) <- list) {
 | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 167 | val c = x(0) | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 168 | if (c < min) min = c | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 169 | if (c > max) max = c | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 170 | } | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 171 | (min, max) | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 172 | } | 
| 40443 | 173 | private val table = | 
| 174 |     {
 | |
| 175 | var tab = Map[String, String]() | |
| 176 |       for ((x, y) <- list) {
 | |
| 177 |         tab.get(x) match {
 | |
| 178 | case None => tab += (x -> y) | |
| 179 | case Some(z) => | |
| 44181 | 180 |             error("Duplicate mapping of " + quote(x) + " to " + quote(y) + " vs. " + quote(z))
 | 
| 40443 | 181 | } | 
| 182 | } | |
| 183 | tab | |
| 184 | } | |
| 31522 | 185 | def recode(text: String): String = | 
| 186 |     {
 | |
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 187 | val len = text.length | 
| 48775 | 188 | val matcher = symbol_total.pattern.matcher(text) | 
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 189 | val result = new StringBuilder(len) | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 190 | var i = 0 | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 191 |       while (i < len) {
 | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 192 | val c = text(i) | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 193 |         if (min <= c && c <= max) {
 | 
| 31929 | 194 | matcher.region(i, len).lookingAt | 
| 27938 | 195 | val x = matcher.group | 
| 52888 | 196 | result.append(table.getOrElse(x, x)) | 
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 197 | i = matcher.end | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 198 | } | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 199 |         else { result.append(c); i += 1 }
 | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 200 | } | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 201 | result.toString | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 202 | } | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 203 | } | 
| 27924 | 204 | |
| 27918 | 205 | |
| 27923 
7ebe9d38743a
use scala.collection.jcl.HashMap, which seems to be more efficient;
 wenzelm parents: 
27918diff
changeset | 206 | |
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 207 | /** symbol interpretation **/ | 
| 27927 | 208 | |
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 209 | private lazy val symbols = | 
| 50564 
c6fde2fc4217
allow to suppress ISABELLE_SYMBOLS for experiments;
 wenzelm parents: 
50291diff
changeset | 210 |     new Interpretation(File.try_read(Path.split(Isabelle_System.getenv("ISABELLE_SYMBOLS"))))
 | 
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 211 | |
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 212 | private class Interpretation(symbols_spec: String) | 
| 29569 
f3f529b5d8fb
more general init of Symbol.Interpretation, independent of IsabelleSystem instance;
 wenzelm parents: 
29174diff
changeset | 213 |   {
 | 
| 31522 | 214 | /* read symbols */ | 
| 215 | ||
| 50136 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
48922diff
changeset | 216 |     private val No_Decl = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
 | 
| 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
48922diff
changeset | 217 |     private val Key = new Regex("""(?xs) (.+): """)
 | 
| 31522 | 218 | |
| 53316 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 219 | private def read_decl(decl: String): (Symbol, Properties.T) = | 
| 31522 | 220 |     {
 | 
| 221 |       def err() = error("Bad symbol declaration: " + decl)
 | |
| 222 | ||
| 53316 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 223 | def read_props(props: List[String]): Properties.T = | 
| 31522 | 224 |       {
 | 
| 225 |         props match {
 | |
| 53316 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 226 | case Nil => Nil | 
| 31522 | 227 | case _ :: Nil => err() | 
| 53316 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 228 | case Key(x) :: y :: rest => (x -> y) :: read_props(rest) | 
| 31522 | 229 | case _ => err() | 
| 230 | } | |
| 231 | } | |
| 232 |       decl.split("\\s+").toList match {
 | |
| 53316 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 233 | case sym :: props if sym.length > 1 && !is_malformed(sym) => | 
| 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 234 | (sym, read_props(props)) | 
| 34193 | 235 | case _ => err() | 
| 31522 | 236 | } | 
| 237 | } | |
| 238 | ||
| 53316 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 239 | private val symbols: List[(Symbol, Properties.T)] = | 
| 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 240 | (((List.empty[(Symbol, Properties.T)], Set.empty[Symbol]) /: | 
| 50136 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
48922diff
changeset | 241 | split_lines(symbols_spec).reverse) | 
| 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
48922diff
changeset | 242 |         { case (res, No_Decl()) => res
 | 
| 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
48922diff
changeset | 243 | case ((list, known), decl) => | 
| 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
48922diff
changeset | 244 | val (sym, props) = read_decl(decl) | 
| 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
48922diff
changeset | 245 | if (known(sym)) (list, known) | 
| 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
48922diff
changeset | 246 | else ((sym, props) :: list, known + sym) | 
| 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
48922diff
changeset | 247 | })._1 | 
| 31522 | 248 | |
| 249 | ||
| 53400 | 250 | /* basic properties */ | 
| 251 | ||
| 252 | val properties: Map[Symbol, Properties.T] = Map(symbols: _*) | |
| 31651 | 253 | |
| 43696 | 254 | val names: Map[Symbol, String] = | 
| 34134 | 255 |     {
 | 
| 43456 
8a6de1a6e1dc
names for control symbols without "^", which is relevant for completion;
 wenzelm parents: 
43455diff
changeset | 256 |       val name = new Regex("""\\<\^?([A-Za-z][A-Za-z0-9_']*)>""")
 | 
| 31651 | 257 | Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*) | 
| 258 | } | |
| 259 | ||
| 50136 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
48922diff
changeset | 260 | val groups: List[(String, List[Symbol])] = | 
| 53316 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 261 |       symbols.map({ case (sym, props) =>
 | 
| 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 262 |         val gs = for (("group", g) <- props) yield g
 | 
| 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 263 | if (gs.isEmpty) List(sym -> "unsorted") else gs.map(sym -> _) | 
| 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 264 | }).flatten | 
| 50136 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
48922diff
changeset | 265 |         .groupBy(_._2).toList.map({ case (group, list) => (group, list.map(_._1)) })
 | 
| 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
48922diff
changeset | 266 | .sortBy(_._1) | 
| 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
48922diff
changeset | 267 | |
| 53316 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 268 | val abbrevs: Multi_Map[Symbol, String] = | 
| 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 269 | Multi_Map(( | 
| 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 270 |         for {
 | 
| 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 271 | (sym, props) <- symbols | 
| 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 272 |           ("abbrev", a) <- props.reverse
 | 
| 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 273 | } yield (sym -> a)): _*) | 
| 43488 | 274 | |
| 275 | ||
| 43490 | 276 | /* recoding */ | 
| 31522 | 277 | |
| 53316 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 278 |     private val Code = new Properties.String("code")
 | 
| 31522 | 279 | private val (decoder, encoder) = | 
| 280 |     {
 | |
| 281 | val mapping = | |
| 282 |         for {
 | |
| 283 | (sym, props) <- symbols | |
| 46997 | 284 | code = | 
| 53316 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 285 |             props match {
 | 
| 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 286 | case Code(s) => | 
| 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 287 |                 try { Integer.decode(s).intValue }
 | 
| 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 288 |                 catch { case _: NumberFormatException => error("Bad code for symbol " + sym) }
 | 
| 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 289 |               case _ => error("Missing code for symbol " + sym)
 | 
| 31522 | 290 | } | 
| 46997 | 291 | ch = new String(Character.toChars(code)) | 
| 34193 | 292 |         } yield {
 | 
| 293 |           if (code < 128) error("Illegal ASCII code for symbol " + sym)
 | |
| 294 | else (sym, ch) | |
| 295 | } | |
| 31545 
5f1f0a20af4d
discontinued escaped symbols such as \\<forall> -- only one backslash should be used;
 wenzelm parents: 
31523diff
changeset | 296 | (new Recoder(mapping), | 
| 31548 | 297 |        new Recoder(mapping map { case (x, y) => (y, x) }))
 | 
| 31522 | 298 | } | 
| 27918 | 299 | |
| 34098 | 300 | def decode(text: String): String = decoder.recode(text) | 
| 301 | def encode(text: String): String = encoder.recode(text) | |
| 34134 | 302 | |
| 43490 | 303 | private def recode_set(elems: String*): Set[String] = | 
| 304 |     {
 | |
| 305 | val content = elems.toList | |
| 306 | Set((content ::: content.map(decode)): _*) | |
| 307 | } | |
| 308 | ||
| 309 | private def recode_map[A](elems: (String, A)*): Map[String, A] = | |
| 310 |     {
 | |
| 311 | val content = elems.toList | |
| 312 |       Map((content ::: content.map({ case (sym, a) => (decode(sym), a) })): _*)
 | |
| 313 | } | |
| 314 | ||
| 315 | ||
| 316 | /* user fonts */ | |
| 317 | ||
| 53316 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 318 |     private val Font = new Properties.String("font")
 | 
| 43696 | 319 | val fonts: Map[Symbol, String] = | 
| 53316 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 320 | recode_map((for ((sym, Font(font)) <- symbols) yield (sym -> font)): _*) | 
| 43490 | 321 | |
| 322 | val font_names: List[String] = Set(fonts.toList.map(_._2): _*).toList | |
| 323 | val font_index: Map[String, Int] = Map((font_names zip (0 until font_names.length).toList): _*) | |
| 324 | ||
| 34134 | 325 | |
| 326 | /* classification */ | |
| 327 | ||
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 328 | val letters = recode_set( | 
| 34134 | 329 | "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", | 
| 330 | "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", | |
| 331 | "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", | |
| 332 | "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", | |
| 333 | ||
| 334 | "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>", | |
| 335 | "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>", | |
| 336 | "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>", | |
| 337 | "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>", | |
| 338 | "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>", | |
| 339 | "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>", | |
| 340 | "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>", | |
| 341 | "\\<x>", "\\<y>", "\\<z>", | |
| 342 | ||
| 343 | "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>", | |
| 344 | "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>", | |
| 345 | "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>", | |
| 346 | "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>", | |
| 347 | "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>", | |
| 348 | "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>", | |
| 349 | "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>", | |
| 350 | "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>", | |
| 351 | "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>", | |
| 352 | ||
| 353 | "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>", | |
| 354 | "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>", | |
| 355 | "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>", | |
| 356 | "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>", | |
| 357 | "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>", | |
| 358 | "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>", | |
| 52616 
3ac2878764f9
more robust identifier syntax: sub/superscript counts as modifier of LETDIG part instead of LETTER, both isub/isup and sub/sup are allowed;
 wenzelm parents: 
52507diff
changeset | 359 | "\\<Psi>", "\\<Omega>") | 
| 34134 | 360 | |
| 50233 
eef21a0726f1
removed remains of Oheimb's double-space (cf. 0a5af667dc75);
 wenzelm parents: 
50188diff
changeset | 361 |     val blanks = recode_set(" ", "\t", "\n", "\u000B", "\f", "\r", "\r\n", "\\<^newline>")
 | 
| 34138 | 362 | |
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 363 | val sym_chars = | 
| 34138 | 364 |       Set("!", "#", "$", "%", "&", "*", "+", "-", "/", "<", "=", ">", "?", "@", "^", "_", "|", "~")
 | 
| 34134 | 365 | |
| 44992 
aa34d2d049ce
refined Symbol.is_symbolic -- cover recoded versions as well;
 wenzelm parents: 
44949diff
changeset | 366 |     val symbolic = recode_set((for { (sym, _) <- symbols; if raw_symbolic(sym) } yield sym): _*)
 | 
| 
aa34d2d049ce
refined Symbol.is_symbolic -- cover recoded versions as well;
 wenzelm parents: 
44949diff
changeset | 367 | |
| 43455 | 368 | |
| 43488 | 369 | /* control symbols */ | 
| 370 | ||
| 43696 | 371 | val ctrl_decoded: Set[Symbol] = | 
| 43488 | 372 |       Set((for ((sym, _) <- symbols if sym.startsWith("\\<^")) yield decode(sym)): _*)
 | 
| 373 | ||
| 44238 
36120feb70ed
some convenience actions/shortcuts for control symbols;
 wenzelm parents: 
44181diff
changeset | 374 |     val sub_decoded = decode("\\<^sub>")
 | 
| 
36120feb70ed
some convenience actions/shortcuts for control symbols;
 wenzelm parents: 
44181diff
changeset | 375 |     val sup_decoded = decode("\\<^sup>")
 | 
| 43511 | 376 |     val bsub_decoded = decode("\\<^bsub>")
 | 
| 377 |     val esub_decoded = decode("\\<^esub>")
 | |
| 378 |     val bsup_decoded = decode("\\<^bsup>")
 | |
| 379 |     val esup_decoded = decode("\\<^esup>")
 | |
| 44238 
36120feb70ed
some convenience actions/shortcuts for control symbols;
 wenzelm parents: 
44181diff
changeset | 380 |     val bold_decoded = decode("\\<^bold>")
 | 
| 27918 | 381 | } | 
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 382 | |
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 383 | |
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 384 | /* tables */ | 
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 385 | |
| 53400 | 386 | def properties: Map[Symbol, Properties.T] = symbols.properties | 
| 43696 | 387 | def names: Map[Symbol, String] = symbols.names | 
| 50136 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
 wenzelm parents: 
48922diff
changeset | 388 | def groups: List[(String, List[Symbol])] = symbols.groups | 
| 53316 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
 wenzelm parents: 
53021diff
changeset | 389 | def abbrevs: Multi_Map[Symbol, String] = symbols.abbrevs | 
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 390 | |
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 391 | def decode(text: String): String = symbols.decode(text) | 
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 392 | def encode(text: String): String = symbols.encode(text) | 
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 393 | |
| 53337 
b3817a0e3211
sort items according to persistent history of frequency of use;
 wenzelm parents: 
53316diff
changeset | 394 | def decode_string: XML.Decode.T[String] = (x => decode(XML.Decode.string(x))) | 
| 
b3817a0e3211
sort items according to persistent history of frequency of use;
 wenzelm parents: 
53316diff
changeset | 395 | def encode_string: XML.Encode.T[String] = (x => XML.Encode.string(encode(x))) | 
| 
b3817a0e3211
sort items according to persistent history of frequency of use;
 wenzelm parents: 
53316diff
changeset | 396 | |
| 50291 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 397 | def decode_strict(text: String): String = | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 398 |   {
 | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 399 | val decoded = decode(text) | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 400 | if (encode(decoded) == text) decoded | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 401 |     else {
 | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 402 | val bad = new mutable.ListBuffer[Symbol] | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 403 | for (s <- iterator(text) if encode(decode(s)) != s && !bad.contains(s)) | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 404 | bad += s | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 405 |       error("Bad Unicode symbols in text: " + commas_quote(bad))
 | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 406 | } | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 407 | } | 
| 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
 wenzelm parents: 
50238diff
changeset | 408 | |
| 43696 | 409 | def fonts: Map[Symbol, String] = symbols.fonts | 
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 410 | def font_names: List[String] = symbols.font_names | 
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 411 | def font_index: Map[String, Int] = symbols.font_index | 
| 43696 | 412 | def lookup_font(sym: Symbol): Option[Int] = symbols.fonts.get(sym).map(font_index(_)) | 
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 413 | |
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 414 | |
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 415 | /* classification */ | 
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 416 | |
| 43696 | 417 | def is_letter(sym: Symbol): Boolean = symbols.letters.contains(sym) | 
| 418 | def is_digit(sym: Symbol): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9' | |
| 419 | def is_quasi(sym: Symbol): Boolean = sym == "_" || sym == "'" | |
| 420 | def is_letdig(sym: Symbol): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym) | |
| 421 | def is_blank(sym: Symbol): Boolean = symbols.blanks.contains(sym) | |
| 44992 
aa34d2d049ce
refined Symbol.is_symbolic -- cover recoded versions as well;
 wenzelm parents: 
44949diff
changeset | 422 | |
| 43696 | 423 | def is_symbolic_char(sym: Symbol): Boolean = symbols.sym_chars.contains(sym) | 
| 44992 
aa34d2d049ce
refined Symbol.is_symbolic -- cover recoded versions as well;
 wenzelm parents: 
44949diff
changeset | 424 | def is_symbolic(sym: Symbol): Boolean = raw_symbolic(sym) || symbols.symbolic.contains(sym) | 
| 
aa34d2d049ce
refined Symbol.is_symbolic -- cover recoded versions as well;
 wenzelm parents: 
44949diff
changeset | 425 | |
| 
aa34d2d049ce
refined Symbol.is_symbolic -- cover recoded versions as well;
 wenzelm parents: 
44949diff
changeset | 426 | private def raw_symbolic(sym: Symbol): Boolean = | 
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 427 |     sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^")
 | 
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 428 | |
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 429 | |
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 430 | /* control symbols */ | 
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 431 | |
| 43696 | 432 | def is_ctrl(sym: Symbol): Boolean = | 
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 433 |     sym.startsWith("\\<^") || symbols.ctrl_decoded.contains(sym)
 | 
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 434 | |
| 43696 | 435 | def is_controllable(sym: Symbol): Boolean = | 
| 43695 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 436 | !is_blank(sym) && !is_ctrl(sym) && !is_malformed(sym) | 
| 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
 wenzelm parents: 
43675diff
changeset | 437 | |
| 44238 
36120feb70ed
some convenience actions/shortcuts for control symbols;
 wenzelm parents: 
44181diff
changeset | 438 | def sub_decoded: Symbol = symbols.sub_decoded | 
| 
36120feb70ed
some convenience actions/shortcuts for control symbols;
 wenzelm parents: 
44181diff
changeset | 439 | def sup_decoded: Symbol = symbols.sup_decoded | 
| 
36120feb70ed
some convenience actions/shortcuts for control symbols;
 wenzelm parents: 
44181diff
changeset | 440 | def bsub_decoded: Symbol = symbols.bsub_decoded | 
| 
36120feb70ed
some convenience actions/shortcuts for control symbols;
 wenzelm parents: 
44181diff
changeset | 441 | def esub_decoded: Symbol = symbols.esub_decoded | 
| 
36120feb70ed
some convenience actions/shortcuts for control symbols;
 wenzelm parents: 
44181diff
changeset | 442 | def bsup_decoded: Symbol = symbols.bsup_decoded | 
| 
36120feb70ed
some convenience actions/shortcuts for control symbols;
 wenzelm parents: 
44181diff
changeset | 443 | def esup_decoded: Symbol = symbols.esup_decoded | 
| 
36120feb70ed
some convenience actions/shortcuts for control symbols;
 wenzelm parents: 
44181diff
changeset | 444 | def bold_decoded: Symbol = symbols.bold_decoded | 
| 27901 | 445 | } |