author | wenzelm |
Mon, 29 Mar 2010 22:43:56 +0200 | |
changeset 36011 | 3ff725ac13a4 |
parent 34316 | f879b649ac4c |
child 36763 | 096ebe74aeaf |
permissions | -rw-r--r-- |
27901 | 1 |
/* Title: Pure/General/symbol.scala |
2 |
Author: Makarius |
|
3 |
||
27924 | 4 |
Detecting and recoding Isabelle symbols. |
27901 | 5 |
*/ |
6 |
||
7 |
package isabelle |
|
8 |
||
27918 | 9 |
import scala.io.Source |
36011
3ff725ac13a4
adapted to Scala 2.8.0 Beta1 -- with notable changes to scala.collection;
wenzelm
parents:
34316
diff
changeset
|
10 |
import scala.collection.mutable |
31522 | 11 |
import scala.util.matching.Regex |
27901 | 12 |
|
13 |
||
31522 | 14 |
object Symbol |
15 |
{ |
|
33998 | 16 |
/* Symbol regexps */ |
27901 | 17 |
|
31522 | 18 |
// one plain char (neither backslash nor surrogate half), or a well-paired surrogate couple
private val plain =
  """(?xs)
    [^\\ \ud800-\udfff] | [\ud800-\udbff][\udc00-\udfff] """.r

// closed symbol: backslash, "<", then a name (optionally prefixed by "^")
// or "^raw:" followed by raw text without "." or ">", then ">"
private val symbol =
  """(?xs)
    \\ < (?:
    \^? [A-Za-z][A-Za-z0-9_']* |
    \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""".r

// FIXME cover bad surrogates!?
// FIXME check wrt. ML version
// malformed symbol: starts like a symbol but is not a closed one; extends until
// whitespace, a quote, backquote, backslash, or one of the delimiters (* *) {* *}
private val bad_symbol =
  ("(?xs) (?!" + symbol.toString + ")" +
    """ \\ < (?: (?! \s | [\"`\\] | \(\* | \*\) | \{\* | \*\} ) . )*""").r

// total pattern: alternatives in order of priority, ending with a catch-all
// for any single char
val regex: Regex =
  new Regex(List(plain, symbol, bad_symbol).mkString("|") + "| .")
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
33 |
|
34137 | 34 |
|
35 |
/* basic matching */ |
|
36 |
||
34316
f879b649ac4c
clarified Symbol.is_plain/is_wellformed -- is_closed was rejecting plain backslashes;
wenzelm
parents:
34193
diff
changeset
|
37 |
// A char is plain unless it is a backslash or lies in the UTF-16 surrogate range.
def is_plain(c: Char): Boolean =
  c != '\\' && (c < '\ud800' || '\udfff' < c)
34137 | 38 |
|
34316
f879b649ac4c
clarified Symbol.is_plain/is_wellformed -- is_closed was rejecting plain backslashes;
wenzelm
parents:
34193
diff
changeset
|
39 |
// Wellformed means: a single plain char, or anything that is not (entirely)
// a malformed symbol according to bad_symbol.
def is_wellformed(s: CharSequence): Boolean =
{
  val single_plain = s.length == 1 && is_plain(s.charAt(0))
  single_plain || !bad_symbol.pattern.matcher(s).matches
}
34137 | 41 |
|
42 |
// Reusable matcher over a fixed text: yields the length (in chars) of the
// symbol that starts at a given offset.
class Matcher(text: CharSequence)
{
  private val m = regex.pattern.matcher(text)

  // Length of the symbol at offset start, looking no further than offset end.
  // Requires 0 <= start < end <= text.length.
  def apply(start: Int, end: Int): Int =
  {
    require(0 <= start && start < end && end <= text.length)
    if (!is_plain(text.charAt(start))) {
      // slow path: let the total pattern decide how far the symbol extends
      m.region(start, end)
      m.lookingAt
      m.end - m.start
    }
    else 1  // fast path: a plain char is a symbol of its own
  }
}
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
55 |
|
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
56 |
|
36011
3ff725ac13a4
adapted to Scala 2.8.0 Beta1 -- with notable changes to scala.collection;
wenzelm
parents:
34316
diff
changeset
|
57 |
/* iterator */ |
33998 | 58 |
|
36011
3ff725ac13a4
adapted to Scala 2.8.0 Beta1 -- with notable changes to scala.collection;
wenzelm
parents:
34316
diff
changeset
|
59 |
// Iterate over the symbols of text, each given as a subsequence of the text.
// Calling next() after the text is exhausted throws NoSuchElementException
// (instead of failing on the internal precondition of Matcher).
def iterator(text: CharSequence): Iterator[CharSequence] = new Iterator[CharSequence]
{
  private val matcher = new Matcher(text)
  private var i = 0  // char offset of the next symbol

  def hasNext: Boolean = i < text.length

  def next(): CharSequence =
  {
    if (!hasNext) throw new NoSuchElementException("next on exhausted symbol iterator")
    val n = matcher(i, text.length)
    val s = text.subSequence(i, i + n)
    i += n
    s
  }
}
|
71 |
||
72 |
||
73 |
/* decoding offsets */ |
|
74 |
||
75 |
// Index for converting symbol offsets of text into char offsets.
class Index(text: CharSequence)
{
  // chr: char offset just after some multi-char symbol,
  // sym: number of symbols up to and including that symbol
  case class Entry(chr: Int, sym: Int)

  // One entry per symbol that is longer than one char, in text order.
  val index: Array[Entry] =
  {
    val symbol_length = new Matcher(text)
    val entries = new mutable.ArrayBuffer[Entry]
    var char_offset = 0
    var sym_offset = 0
    while (char_offset < text.length) {
      val len = symbol_length(char_offset, text.length)
      char_offset += len
      sym_offset += 1
      if (len > 1) entries += Entry(char_offset, sym_offset)
    }
    entries.toArray
  }

  // Char offset for symbol offset sym: locate the last entry whose symbol
  // offset does not exceed sym and continue from there one char per symbol;
  // without such an entry the offset is returned unchanged.
  def decode(sym: Int): Int =
  {
    var lo = 0
    var hi = index.length
    var found = -1
    while (lo < hi) {
      val mid = (lo + hi) / 2
      if (index(mid).sym <= sym) { found = mid; lo = mid + 1 }
      else hi = mid
    }
    if (found < 0) sym
    else {
      val entry = index(found)
      entry.chr + sym - entry.sym
    }
  }
}
|
110 |
||
111 |
||
33998 | 112 |
/* recoding text */ |
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
113 |
|
31522 | 114 |
// Replace symbols of a text according to a table of (source, target) pairs.
private class Recoder(list: List[(String, String)])
{
  // range of first chars over all source strings (an empty range for an empty list);
  // chars outside of it cannot start a table entry
  private val (min, max) =
    list.foldLeft(('\uffff', '\u0000')) {
      case ((lo, hi), (x, _)) =>
        val c = x(0)
        (if (c < lo) c else lo, if (c > hi) c else hi)
    }

  private val table = Map(list: _*)

  // Copy text, replacing every symbol that has a table entry by its target.
  def recode(text: String): String =
  {
    val len = text.length
    val matcher = regex.pattern.matcher(text)
    val result = new StringBuilder(len)
    var i = 0
    while (i < len) {
      val c = text.charAt(i)
      if (c < min || max < c) {
        // cannot start a table entry: copy the char as it is
        result.append(c)
        i += 1
      }
      else {
        matcher.region(i, len).lookingAt
        val x = matcher.group
        result.append(table.getOrElse(x, x))
        i = matcher.end
      }
    }
    result.toString
  }
}
27924 | 147 |
|
27918 | 148 |
|
27923
7ebe9d38743a
use scala.collection.jcl.HashMap, which seems to be more efficient;
wenzelm
parents:
27918
diff
changeset
|
149 |
|
27927 | 150 |
/** Symbol interpretation **/ |
151 |
||
34137 | 152 |
class Interpretation(symbol_decls: List[String]) |
29569
f3f529b5d8fb
more general init of Symbol.Interpretation, independent of IsabelleSystem instance;
wenzelm
parents:
29174
diff
changeset
|
153 |
{ |
31522 | 154 |
/* read symbols */ |
155 |
||
156 |
// blank line, or line consisting of a "#" comment only
private val empty = """(?xs) ^\s* (?: \#.* )? $ """.r
// property key: some text followed by a final colon
private val key = """(?xs) (.+): """.r

// Parse one declaration line: a symbol followed by "key: value" pairs,
// all separated by whitespace.
private def read_decl(decl: String): (String, Map[String, String]) =
{
  def err() = error("Bad symbol declaration: " + decl)

  // key/value pairs; on duplicate keys the leftmost pair wins
  def read_props(props: List[String]): Map[String, String] =
    props match {
      case Nil => Map()
      case key(x) :: y :: rest => read_props(rest) + (x -> y)
      case _ => err()  // dangling word, or key without final colon
    }

  val words = decl.split("\\s+").toList
  words match {
    case sym :: props =>
      if (sym.length > 1 && is_wellformed(sym)) (sym, read_props(props))
      else err()
    case Nil => err()
  }
}

// all declarations, ignoring blank and comment lines
private val symbols: List[(String, Map[String, String])] =
  symbol_decls.filter(decl => !empty.pattern.matcher(decl).matches).map(read_decl)
181 |
||
182 |
||
31651 | 183 |
/* misc properties */ |
184 |
||
34134 | 185 |
// symbol -> its bare name, for declared symbols of the plain named form
// (backslash, "<", name, ">")
val names: Map[String, String] =
{
  val name = new Regex("""\\<([A-Za-z][A-Za-z0-9_']*)>""")
  val pairs =
    symbols.flatMap {
      case (sym @ name(a), _) => List(sym -> a)
      case _ => Nil
    }
  Map(pairs: _*)
}

// symbol -> its "abbrev" property, for declared symbols that have one
val abbrevs: Map[String, String] =
{
  val pairs =
    symbols.flatMap { case (sym, props) => props.get("abbrev").toList.map(a => sym -> a) }
  Map(pairs: _*)
}
|
194 |
||
195 |
||
31522 | 196 |
/* main recoder methods */ |
197 |
||
198 |
// Recoders between symbols and the Unicode chars given by their "code" property.
// Every declared symbol needs a valid non-ASCII code point; a violation is
// reported as an error that names the symbol.
private val (decoder, encoder) =
{
  def char_of(sym: String, props: Map[String, String]): String =
  {
    val code =
      try { Integer.decode(props("code")).intValue }
      catch {
        case _: NoSuchElementException => error("Missing code for symbol " + sym)
        case _: NumberFormatException => error("Bad code for symbol " + sym)
      }
    // validate before Character.toChars, which rejects invalid code points
    // with an IllegalArgumentException that does not mention the symbol
    if (!Character.isValidCodePoint(code)) error("Bad code for symbol " + sym)
    else if (code < 128) error("Illegal ASCII code for symbol " + sym)
    else new String(Character.toChars(code))
  }

  val mapping = for ((sym, props) <- symbols) yield (sym, char_of(sym, props))

  (new Recoder(mapping),
   new Recoder(mapping map { case (x, y) => (y, x) }))
}

// symbols -> Unicode chars
def decode(text: String): String = decoder.recode(text)
// Unicode chars -> symbols
def encode(text: String): String = encoder.recode(text)
|
34134 | 220 |
|
221 |
||
222 |
/* classification */ |
|
223 |
||
34138 | 224 |
// Set of the given symbols together with their decoded forms.
private object Decode_Set
{
  def apply(elems: String*): Set[String] =
  {
    val raw = elems.toList
    val decoded = raw.map(decode)
    Set.empty[String] ++ raw ++ decoded
  }
}
|
232 |
||
233 |
// letters: ASCII letters, their single and doubled symbol variants
// (e.g. "A", then A and AA wrapped as symbols), the listed Greek letters,
// and the two control symbols isub/isup
private val letters =
{
  val ascii = (('A' to 'Z') ++ ('a' to 'z')).map(_.toString).toList
  val single = ascii.map(a => "\\<" + a + ">")
  val doubled = ascii.map(a => "\\<" + a + a + ">")
  val greek =
    List(
      "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>",
      "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>",
      "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>",
      "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>",
      "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>",
      "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>",
      "\\<Psi>", "\\<Omega>")
  val control = List("\\<^isub>", "\\<^isup>")

  Decode_Set((ascii ::: single ::: doubled ::: greek ::: control): _*)
}

// blanks: ASCII whitespace chars and the two blank symbols
private val blanks =
  Decode_Set(
    " ", "\t", "\n", "\u000B", "\f", "\r",
    "\\<spacespace>", "\\<^newline>")

// ASCII chars that count as symbolic
private val sym_chars =
  Set("!#$%&*+-/<=>?@^_|~".toList.map(_.toString): _*)
|
34134 | 273 |
|
274 |
// member of the letters table (symbols or their decoded forms)
def is_letter(sym: String): Boolean = letters(sym)

// single ASCII digit
def is_digit(sym: String): Boolean =
  sym.length == 1 && { val c = sym.charAt(0); '0' <= c && c <= '9' }

// underscore or prime
def is_quasi(sym: String): Boolean =
  sym match {
    case "_" | "'" => true
    case _ => false
  }

def is_letdig(sym: String): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym)

// member of the blanks table (symbols or their decoded forms)
def is_blank(sym: String): Boolean = blanks(sym)

def is_symbolic_char(sym: String): Boolean = sym_chars(sym)

// starts like a symbol, but not like a control symbol (no "^" after the "<")
def is_symbolic(sym: String): Boolean =
  sym.startsWith("\\<") && !sym.startsWith("^", 2)
|
27918 | 281 |
} |
27901 | 282 |
} |