| author | bulwahn | 
| Wed, 21 Jul 2010 18:13:15 +0200 | |
| changeset 37921 | 1e846be00ddf | 
| parent 37556 | 2bf29095d26f | 
| child 38479 | e628da370072 | 
| permissions | -rw-r--r-- | 
| 27901 | 1 | /* Title: Pure/General/symbol.scala | 
| 2 | Author: Makarius | |
| 3 | ||
| 27924 | 4 | Detecting and recoding Isabelle symbols. | 
| 27901 | 5 | */ | 
| 6 | ||
| 7 | package isabelle | |
| 8 | ||
| 27918 | 9 | import scala.io.Source | 
| 36011 
3ff725ac13a4
adapted to Scala 2.8.0 Beta1 -- with notable changes to scala.collection;
 wenzelm parents: 
34316diff
changeset | 10 | import scala.collection.mutable | 
| 31522 | 11 | import scala.util.matching.Regex | 
| 27901 | 12 | |
| 13 | ||
| 31522 | 14 | object Symbol | 
| 15 | {
 | |
| 36763 | 16 | /* spaces */ | 
| 17 | ||
| 36816 | 18 | val spc = ' ' | 
| 19 | val space = " " | |
| 20 | ||
| 21 | private val static_spaces = space * 4000 | |
| 36763 | 22 | |
| 23 | def spaces(k: Int): String = | |
| 24 |   {
 | |
| 25 | require(k >= 0) | |
| 26 | if (k < static_spaces.length) static_spaces.substring(0, k) | |
| 36816 | 27 | else space * k | 
| 36763 | 28 | } | 
| 29 | ||
| 30 | ||
| 33998 | 31 | /* Symbol regexps */ | 
| 27901 | 32 | |
| 31522 | 33 |   private val plain = new Regex("""(?xs)
 | 
| 37556 
2bf29095d26f
treat alternative newline symbols as in Isabelle/ML;
 wenzelm parents: 
36816diff
changeset | 34 | [^\r\\ \ud800-\udfff] | [\ud800-\udbff][\udc00-\udfff] """) | 
| 
2bf29095d26f
treat alternative newline symbols as in Isabelle/ML;
 wenzelm parents: 
36816diff
changeset | 35 | |
| 
2bf29095d26f
treat alternative newline symbols as in Isabelle/ML;
 wenzelm parents: 
36816diff
changeset | 36 |   private val newline = new Regex("""(?xs) \r\n | \r """)
 | 
| 27901 | 37 | |
| 31522 | 38 |   private val symbol = new Regex("""(?xs)
 | 
| 31545 
5f1f0a20af4d
discontinued escaped symbols such as \\<forall> -- only one backslash should be used;
 wenzelm parents: 
31523diff
changeset | 39 | \\ < (?: | 
| 27924 | 40 | \^? [A-Za-z][A-Za-z0-9_']* | | 
| 41 | \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""") | |
| 42 | ||
| 34316 
f879b649ac4c
clarified Symbol.is_plain/is_wellformed -- is_closed was rejecting plain backslashes;
 wenzelm parents: 
34193diff
changeset | 43 | // FIXME cover bad surrogates!? | 
| 37556 
2bf29095d26f
treat alternative newline symbols as in Isabelle/ML;
 wenzelm parents: 
36816diff
changeset | 44 | // FIXME check wrt. Isabelle/ML version | 
| 31522 | 45 |   private val bad_symbol = new Regex("(?xs) (?!" + symbol + ")" +
 | 
| 31545 
5f1f0a20af4d
discontinued escaped symbols such as \\<forall> -- only one backslash should be used;
 wenzelm parents: 
31523diff
changeset | 46 |     """ \\ < (?: (?! \s | [\"`\\] | \(\* | \*\) | \{\* | \*\} ) . )*""")
 | 
| 27924 | 47 | |
| 27939 | 48 | // total pattern | 
| 37556 
2bf29095d26f
treat alternative newline symbols as in Isabelle/ML;
 wenzelm parents: 
36816diff
changeset | 49 | val regex = new Regex(plain + "|" + newline + "|" + symbol + "|" + bad_symbol + "| .") | 
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 50 | |
| 34137 | 51 | |
| 52 | /* basic matching */ | |
| 53 | ||
| 37556 
2bf29095d26f
treat alternative newline symbols as in Isabelle/ML;
 wenzelm parents: 
36816diff
changeset | 54 | def is_plain(c: Char): Boolean = !(c == '\r' || c == '\\' || '\ud800' <= c && c <= '\udfff') | 
| 34137 | 55 | |
| 34316 
f879b649ac4c
clarified Symbol.is_plain/is_wellformed -- is_closed was rejecting plain backslashes;
 wenzelm parents: 
34193diff
changeset | 56 | def is_wellformed(s: CharSequence): Boolean = | 
| 
f879b649ac4c
clarified Symbol.is_plain/is_wellformed -- is_closed was rejecting plain backslashes;
 wenzelm parents: 
34193diff
changeset | 57 | s.length == 1 && is_plain(s.charAt(0)) || !bad_symbol.pattern.matcher(s).matches | 
| 34137 | 58 | |
| 59 | class Matcher(text: CharSequence) | |
| 60 |   {
 | |
| 61 | private val matcher = regex.pattern.matcher(text) | |
| 62 | def apply(start: Int, end: Int): Int = | |
| 63 |     {
 | |
| 64 | require(0 <= start && start < end && end <= text.length) | |
| 34316 
f879b649ac4c
clarified Symbol.is_plain/is_wellformed -- is_closed was rejecting plain backslashes;
 wenzelm parents: 
34193diff
changeset | 65 | if (is_plain(text.charAt(start))) 1 | 
| 34138 | 66 |       else {
 | 
| 34137 | 67 | matcher.region(start, end).lookingAt | 
| 68 | matcher.group.length | |
| 69 | } | |
| 70 | } | |
| 31522 | 71 | } | 
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 72 | |
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 73 | |
| 36011 
3ff725ac13a4
adapted to Scala 2.8.0 Beta1 -- with notable changes to scala.collection;
 wenzelm parents: 
34316diff
changeset | 74 | /* iterator */ | 
| 33998 | 75 | |
| 36011 
3ff725ac13a4
adapted to Scala 2.8.0 Beta1 -- with notable changes to scala.collection;
 wenzelm parents: 
34316diff
changeset | 76 | def iterator(text: CharSequence) = new Iterator[CharSequence] | 
| 34134 | 77 |   {
 | 
| 34137 | 78 | private val matcher = new Matcher(text) | 
| 33998 | 79 | private var i = 0 | 
| 80 | def hasNext = i < text.length | |
| 81 |     def next = {
 | |
| 34137 | 82 | val n = matcher(i, text.length) | 
| 83 | val s = text.subSequence(i, i + n) | |
| 84 | i += n | |
| 85 | s | |
| 33998 | 86 | } | 
| 87 | } | |
| 88 | ||
| 89 | ||
| 90 | /* decoding offsets */ | |
| 91 | ||
| 92 | class Index(text: CharSequence) | |
| 31929 | 93 |   {
 | 
| 94 | case class Entry(chr: Int, sym: Int) | |
| 95 | val index: Array[Entry] = | |
| 96 |     {
 | |
| 34137 | 97 | val matcher = new Matcher(text) | 
| 31929 | 98 | val buf = new mutable.ArrayBuffer[Entry] | 
| 99 | var chr = 0 | |
| 100 | var sym = 0 | |
| 33998 | 101 |       while (chr < text.length) {
 | 
| 34137 | 102 | val n = matcher(chr, text.length) | 
| 103 | chr += n | |
| 31929 | 104 | sym += 1 | 
| 34137 | 105 | if (n > 1) buf += Entry(chr, sym) | 
| 31929 | 106 | } | 
| 107 | buf.toArray | |
| 108 | } | |
| 109 | def decode(sym: Int): Int = | |
| 110 |     {
 | |
| 111 | val end = index.length | |
| 112 | def bisect(a: Int, b: Int): Int = | |
| 113 |       {
 | |
| 114 |         if (a < b) {
 | |
| 115 | val c = (a + b) / 2 | |
| 116 | if (sym < index(c).sym) bisect(a, c) | |
| 117 | else if (c + 1 == end || sym < index(c + 1).sym) c | |
| 118 | else bisect(c + 1, b) | |
| 119 | } | |
| 120 | else -1 | |
| 121 | } | |
| 122 | val i = bisect(0, end) | |
| 123 | if (i < 0) sym | |
| 124 | else index(i).chr + sym - index(i).sym | |
| 125 | } | |
| 126 | } | |
| 127 | ||
| 128 | ||
| 33998 | 129 | /* recoding text */ | 
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 130 | |
| 31522 | 131 | private class Recoder(list: List[(String, String)]) | 
| 132 |   {
 | |
| 133 | private val (min, max) = | |
| 134 |     {
 | |
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 135 | var min = '\uffff' | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 136 | var max = '\u0000' | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 137 |       for ((x, _) <- list) {
 | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 138 | val c = x(0) | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 139 | if (c < min) min = c | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 140 | if (c > max) max = c | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 141 | } | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 142 | (min, max) | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 143 | } | 
| 36011 
3ff725ac13a4
adapted to Scala 2.8.0 Beta1 -- with notable changes to scala.collection;
 wenzelm parents: 
34316diff
changeset | 144 | private val table = Map[String, String]() ++ list | 
| 31522 | 145 | def recode(text: String): String = | 
| 146 |     {
 | |
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 147 | val len = text.length | 
| 31522 | 148 | val matcher = regex.pattern.matcher(text) | 
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 149 | val result = new StringBuilder(len) | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 150 | var i = 0 | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 151 |       while (i < len) {
 | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 152 | val c = text(i) | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 153 |         if (min <= c && c <= max) {
 | 
| 31929 | 154 | matcher.region(i, len).lookingAt | 
| 27938 | 155 | val x = matcher.group | 
| 31522 | 156 | result.append(table.get(x) getOrElse x) | 
| 27937 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 157 | i = matcher.end | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 158 | } | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 159 |         else { result.append(c); i += 1 }
 | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 160 | } | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 161 | result.toString | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 162 | } | 
| 
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
 wenzelm parents: 
27935diff
changeset | 163 | } | 
| 27924 | 164 | |
| 27918 | 165 | |
| 27923 
7ebe9d38743a
use scala.collection.jcl.HashMap, which seems to be more efficient;
 wenzelm parents: 
27918diff
changeset | 166 | |
| 27927 | 167 | /** Symbol interpretation **/ | 
| 168 | ||
| 34137 | 169 | class Interpretation(symbol_decls: List[String]) | 
| 29569 
f3f529b5d8fb
more general init of Symbol.Interpretation, independent of IsabelleSystem instance;
 wenzelm parents: 
29174diff
changeset | 170 |   {
 | 
| 31522 | 171 | /* read symbols */ | 
| 172 | ||
| 173 |     private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
 | |
| 174 |     private val key = new Regex("""(?xs) (.+): """)
 | |
| 175 | ||
| 176 | private def read_decl(decl: String): (String, Map[String, String]) = | |
| 177 |     {
 | |
| 178 |       def err() = error("Bad symbol declaration: " + decl)
 | |
| 179 | ||
| 180 | def read_props(props: List[String]): Map[String, String] = | |
| 181 |       {
 | |
| 182 |         props match {
 | |
| 183 | case Nil => Map() | |
| 184 | case _ :: Nil => err() | |
| 185 | case key(x) :: y :: rest => read_props(rest) + (x -> y) | |
| 186 | case _ => err() | |
| 187 | } | |
| 188 | } | |
| 189 |       decl.split("\\s+").toList match {
 | |
| 34316 
f879b649ac4c
clarified Symbol.is_plain/is_wellformed -- is_closed was rejecting plain backslashes;
 wenzelm parents: 
34193diff
changeset | 190 | case sym :: props if sym.length > 1 && is_wellformed(sym) => (sym, read_props(props)) | 
| 34193 | 191 | case _ => err() | 
| 31522 | 192 | } | 
| 193 | } | |
| 194 | ||
| 195 | private val symbols: List[(String, Map[String, String])] = | |
| 34137 | 196 | for (decl <- symbol_decls if !empty.pattern.matcher(decl).matches) | 
| 31522 | 197 | yield read_decl(decl) | 
| 198 | ||
| 199 | ||
| 31651 | 200 | /* misc properties */ | 
| 201 | ||
| 34134 | 202 | val names: Map[String, String] = | 
| 203 |     {
 | |
| 31651 | 204 |       val name = new Regex("""\\<([A-Za-z][A-Za-z0-9_']*)>""")
 | 
| 205 | Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*) | |
| 206 | } | |
| 207 | ||
| 208 | val abbrevs: Map[String, String] = Map(( | |
| 209 |       for ((sym, props) <- symbols if props.isDefinedAt("abbrev"))
 | |
| 210 |         yield (sym -> props("abbrev"))): _*)
 | |
| 211 | ||
| 212 | ||
| 31522 | 213 | /* main recoder methods */ | 
| 214 | ||
| 215 | private val (decoder, encoder) = | |
| 216 |     {
 | |
| 217 | val mapping = | |
| 218 |         for {
 | |
| 219 | (sym, props) <- symbols | |
| 220 | val code = | |
| 221 |             try { Integer.decode(props("code")).intValue }
 | |
| 222 |             catch {
 | |
| 223 |               case _: NoSuchElementException => error("Missing code for symbol " + sym)
 | |
| 224 |               case _: NumberFormatException => error("Bad code for symbol " + sym)
 | |
| 225 | } | |
| 226 | val ch = new String(Character.toChars(code)) | |
| 34193 | 227 |         } yield {
 | 
| 228 |           if (code < 128) error("Illegal ASCII code for symbol " + sym)
 | |
| 229 | else (sym, ch) | |
| 230 | } | |
| 31545 
5f1f0a20af4d
discontinued escaped symbols such as \\<forall> -- only one backslash should be used;
 wenzelm parents: 
31523diff
changeset | 231 | (new Recoder(mapping), | 
| 31548 | 232 |        new Recoder(mapping map { case (x, y) => (y, x) }))
 | 
| 31522 | 233 | } | 
| 27918 | 234 | |
| 34098 | 235 | def decode(text: String): String = decoder.recode(text) | 
| 236 | def encode(text: String): String = encoder.recode(text) | |
| 34134 | 237 | |
| 238 | ||
| 239 | /* classification */ | |
| 240 | ||
| 34138 | 241 | private object Decode_Set | 
| 242 |     {
 | |
| 243 | def apply(elems: String*): Set[String] = | |
| 244 |       {
 | |
| 245 | val content = elems.toList | |
| 246 | Set((content ::: content.map(decode)): _*) | |
| 247 | } | |
| 248 | } | |
| 249 | ||
| 250 | private val letters = Decode_Set( | |
| 34134 | 251 | "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", | 
| 252 | "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", | |
| 253 | "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", | |
| 254 | "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", | |
| 255 | ||
| 256 | "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>", | |
| 257 | "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>", | |
| 258 | "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>", | |
| 259 | "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>", | |
| 260 | "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>", | |
| 261 | "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>", | |
| 262 | "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>", | |
| 263 | "\\<x>", "\\<y>", "\\<z>", | |
| 264 | ||
| 265 | "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>", | |
| 266 | "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>", | |
| 267 | "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>", | |
| 268 | "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>", | |
| 269 | "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>", | |
| 270 | "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>", | |
| 271 | "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>", | |
| 272 | "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>", | |
| 273 | "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>", | |
| 274 | ||
| 275 | "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>", | |
| 276 | "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>", | |
| 277 | "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>", | |
| 278 | "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>", | |
| 279 | "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>", | |
| 280 | "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>", | |
| 281 | "\\<Psi>", "\\<Omega>", | |
| 282 | ||
| 283 | "\\<^isub>", "\\<^isup>") | |
| 284 | ||
| 34138 | 285 | private val blanks = | 
| 36816 | 286 | Decode_Set(space, "\t", "\n", "\u000B", "\f", "\r", "\\<spacespace>", "\\<^newline>") | 
| 34138 | 287 | |
| 288 | private val sym_chars = | |
| 289 |       Set("!", "#", "$", "%", "&", "*", "+", "-", "/", "<", "=", ">", "?", "@", "^", "_", "|", "~")
 | |
| 34134 | 290 | |
| 291 | def is_letter(sym: String): Boolean = letters.contains(sym) | |
| 34138 | 292 | def is_digit(sym: String): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9' | 
| 34134 | 293 | def is_quasi(sym: String): Boolean = sym == "_" || sym == "'" | 
| 34138 | 294 | def is_letdig(sym: String): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym) | 
| 34134 | 295 | def is_blank(sym: String): Boolean = blanks.contains(sym) | 
| 34138 | 296 | def is_symbolic_char(sym: String): Boolean = sym_chars.contains(sym) | 
| 297 |     def is_symbolic(sym: String): Boolean = sym.startsWith("\\<") && !sym.startsWith("\\<^")
 | |
| 27918 | 298 | } | 
| 27901 | 299 | } |