author  wenzelm 
Sun, 20 Dec 2009 15:41:57 +0100  
changeset 34138  4008c2f5a46e 
parent 34137  6cc9a0cbaf55 
child 34193  d3358b909c40 
permissions  rwrr 
27901  1 
/* Title: Pure/General/symbol.scala 
2 
Author: Makarius 

3 

27924  4 
Detecting and recoding Isabelle symbols. 
27901  5 
*/ 
6 

7 
package isabelle 

8 

27918  9 
import scala.io.Source 
31929  10 
import scala.collection.{jcl, mutable} 
31522  11 
import scala.util.matching.Regex 
27901  12 

13 

31522  14 
object Symbol 
15 
{ 

33998  16 
/* Symbol regexps */ 
27901  17 

31522  18 
private val plain = new Regex("""(?xs) 
19 
[^\\ \ud800\udfff]  [\ud800\udbff][\udc00\udfff] """) 

27901  20 

31522  21 
private val symbol = new Regex("""(?xs) 
31545
5f1f0a20af4d
discontinued escaped symbols such as \\<forall>  only one backslash should be used;
wenzelm
parents:
31523
diff
changeset

22 
\\ < (?: 
27924  23 
\^? [AZaz][AZaz09_']*  
24 
\^raw: [\x20\x7e\u0100\uffff && [^.>]]* ) >""") 

25 

31522  26 
private val bad_symbol = new Regex("(?xs) (?!" + symbol + ")" + 
31545
5f1f0a20af4d
discontinued escaped symbols such as \\<forall>  only one backslash should be used;
wenzelm
parents:
31523
diff
changeset

27 
""" \\ < (?: (?! \s  [\"`\\]  \(\*  \*\)  \{\*  \*\} ) . )*""") 
27924  28 

27939  29 
// total pattern 
31522  30 
val regex = new Regex(plain + "" + symbol + "" + bad_symbol + " .") 
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

31 

34137  32 

33 
/* basic matching */ 

34 

34138  35 
def is_closed(c: Char): Boolean = 
36 
!(c == '\\'  Character.isHighSurrogate(c)) 

34137  37 

34138  38 
def is_closed(s: CharSequence): Boolean = 
31522  39 
{ 
34138  40 
if (s.length == 1) is_closed(s.charAt(0)) 
41 
else !bad_symbol.pattern.matcher(s).matches 

34137  42 
} 
43 

44 
class Matcher(text: CharSequence) 

45 
{ 

46 
private val matcher = regex.pattern.matcher(text) 

47 
def apply(start: Int, end: Int): Int = 

48 
{ 

49 
require(0 <= start && start < end && end <= text.length) 

34138  50 
if (is_closed(text.charAt(start))) 1 
51 
else { 

34137  52 
matcher.region(start, end).lookingAt 
53 
matcher.group.length 

54 
} 

55 
} 

31522  56 
} 
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

57 

fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

58 

33998  59 
/* elements */ 
60 

34137  61 
def elements(text: CharSequence) = new Iterator[CharSequence] 
34134  62 
{ 
34137  63 
private val matcher = new Matcher(text) 
33998  64 
private var i = 0 
65 
def hasNext = i < text.length 

66 
def next = { 

34137  67 
val n = matcher(i, text.length) 
68 
val s = text.subSequence(i, i + n) 

69 
i += n 

70 
s 

33998  71 
} 
72 
} 

73 

74 

75 
/* decoding offsets */ 

76 

77 
class Index(text: CharSequence) 

31929  78 
{ 
79 
case class Entry(chr: Int, sym: Int) 

80 
val index: Array[Entry] = 

81 
{ 

34137  82 
val matcher = new Matcher(text) 
31929  83 
val buf = new mutable.ArrayBuffer[Entry] 
84 
var chr = 0 

85 
var sym = 0 

33998  86 
while (chr < text.length) { 
34137  87 
val n = matcher(chr, text.length) 
88 
chr += n 

31929  89 
sym += 1 
34137  90 
if (n > 1) buf += Entry(chr, sym) 
31929  91 
} 
92 
buf.toArray 

93 
} 

94 
def decode(sym: Int): Int = 

95 
{ 

96 
val end = index.length 

97 
def bisect(a: Int, b: Int): Int = 

98 
{ 

99 
if (a < b) { 

100 
val c = (a + b) / 2 

101 
if (sym < index(c).sym) bisect(a, c) 

102 
else if (c + 1 == end  sym < index(c + 1).sym) c 

103 
else bisect(c + 1, b) 

104 
} 

105 
else 1 

106 
} 

107 
val i = bisect(0, end) 

108 
if (i < 0) sym 

109 
else index(i).chr + sym  index(i).sym 

110 
} 

111 
} 

112 

113 

33998  114 
/* recoding text */ 
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

115 

31522  116 
private class Recoder(list: List[(String, String)]) 
117 
{ 

118 
private val (min, max) = 

119 
{ 

27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

120 
var min = '\uffff' 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

121 
var max = '\u0000' 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

122 
for ((x, _) < list) { 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

123 
val c = x(0) 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

124 
if (c < min) min = c 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

125 
if (c > max) max = c 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

126 
} 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

127 
(min, max) 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

128 
} 
31522  129 
private val table = 
130 
{ 

131 
val table = new jcl.HashMap[String, String] // reasonably efficient? 

27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

132 
for ((x, y) < list) table + (x > y) 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

133 
table 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

134 
} 
31522  135 
def recode(text: String): String = 
136 
{ 

27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

137 
val len = text.length 
31522  138 
val matcher = regex.pattern.matcher(text) 
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

139 
val result = new StringBuilder(len) 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

140 
var i = 0 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

141 
while (i < len) { 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

142 
val c = text(i) 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

143 
if (min <= c && c <= max) { 
31929  144 
matcher.region(i, len).lookingAt 
27938  145 
val x = matcher.group 
31522  146 
result.append(table.get(x) getOrElse x) 
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

147 
i = matcher.end 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

148 
} 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

149 
else { result.append(c); i += 1 } 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

150 
} 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

151 
result.toString 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

152 
} 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

153 
} 
27924  154 

27918  155 

27923
7ebe9d38743a
use scala.collection.jcl.HashMap, which seems to be more efficient;
wenzelm
parents:
27918
diff
changeset

156 

27927  157 
/** Symbol interpretation **/ 
158 

34137  159 
class Interpretation(symbol_decls: List[String]) 
29569
f3f529b5d8fb
more general init of Symbol.Interpretation, independent of IsabelleSystem instance;
wenzelm
parents:
29174
diff
changeset

160 
{ 
31522  161 
/* read symbols */ 
162 

163 
private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """) 

164 
private val key = new Regex("""(?xs) (.+): """) 

165 

166 
private def read_decl(decl: String): (String, Map[String, String]) = 

167 
{ 

168 
def err() = error("Bad symbol declaration: " + decl) 

169 

170 
def read_props(props: List[String]): Map[String, String] = 

171 
{ 

172 
props match { 

173 
case Nil => Map() 

174 
case _ :: Nil => err() 

175 
case key(x) :: y :: rest => read_props(rest) + (x > y) 

176 
case _ => err() 

177 
} 

178 
} 

179 
decl.split("\\s+").toList match { 

180 
case Nil => err() 

181 
case sym :: props => (sym, read_props(props)) 

182 
} 

183 
} 

184 

185 
private val symbols: List[(String, Map[String, String])] = 

34137  186 
for (decl < symbol_decls if !empty.pattern.matcher(decl).matches) 
31522  187 
yield read_decl(decl) 
188 

189 

31651  190 
/* misc properties */ 
191 

34134  192 
val names: Map[String, String] = 
193 
{ 

31651  194 
val name = new Regex("""\\<([AZaz][AZaz09_']*)>""") 
195 
Map((for ((sym @ name(a), _) < symbols) yield (sym > a)): _*) 

196 
} 

197 

198 
val abbrevs: Map[String, String] = Map(( 

199 
for ((sym, props) < symbols if props.isDefinedAt("abbrev")) 

200 
yield (sym > props("abbrev"))): _*) 

201 

202 

31522  203 
/* main recoder methods */ 
204 

205 
private val (decoder, encoder) = 

206 
{ 

207 
val mapping = 

208 
for { 

209 
(sym, props) < symbols 

210 
val code = 

211 
try { Integer.decode(props("code")).intValue } 

212 
catch { 

213 
case _: NoSuchElementException => error("Missing code for symbol " + sym) 

214 
case _: NumberFormatException => error("Bad code for symbol " + sym) 

215 
} 

216 
val ch = new String(Character.toChars(code)) 

217 
} yield (sym, ch) 

31545
5f1f0a20af4d
discontinued escaped symbols such as \\<forall>  only one backslash should be used;
wenzelm
parents:
31523
diff
changeset

218 
(new Recoder(mapping), 
31548  219 
new Recoder(mapping map { case (x, y) => (y, x) })) 
31522  220 
} 
27918  221 

34098  222 
def decode(text: String): String = decoder.recode(text) 
223 
def encode(text: String): String = encoder.recode(text) 

34134  224 

225 

226 
/* classification */ 

227 

34138  228 
private object Decode_Set 
229 
{ 

230 
def apply(elems: String*): Set[String] = 

231 
{ 

232 
val content = elems.toList 

233 
Set((content ::: content.map(decode)): _*) 

234 
} 

235 
} 

236 

237 
private val letters = Decode_Set( 

34134  238 
"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", 
239 
"N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", 

240 
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", 

241 
"n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", 

242 

243 
"\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>", 

244 
"\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>", 

245 
"\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>", 

246 
"\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>", 

247 
"\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>", 

248 
"\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>", 

249 
"\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>", 

250 
"\\<x>", "\\<y>", "\\<z>", 

251 

252 
"\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>", 

253 
"\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>", 

254 
"\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>", 

255 
"\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>", 

256 
"\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>", 

257 
"\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>", 

258 
"\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>", 

259 
"\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>", 

260 
"\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>", 

261 

262 
"\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>", 

263 
"\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>", 

264 
"\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>", 

265 
"\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>", 

266 
"\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>", 

267 
"\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>", 

268 
"\\<Psi>", "\\<Omega>", 

269 

270 
"\\<^isub>", "\\<^isup>") 

271 

34138  272 
private val blanks = 
273 
Decode_Set(" ", "\t", "\n", "\u000B", "\f", "\r", "\\<spacespace>", "\\<^newline>") 

274 

275 
private val sym_chars = 

276 
Set("!", "#", "$", "%", "&", "*", "+", "", "/", "<", "=", ">", "?", "@", "^", "_", "", "~") 

34134  277 

278 
def is_letter(sym: String): Boolean = letters.contains(sym) 

34138  279 
def is_digit(sym: String): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9' 
34134  280 
def is_quasi(sym: String): Boolean = sym == "_"  sym == "'" 
34138  281 
def is_letdig(sym: String): Boolean = is_letter(sym)  is_digit(sym)  is_quasi(sym) 
34134  282 
def is_blank(sym: String): Boolean = blanks.contains(sym) 
34138  283 
def is_symbolic_char(sym: String): Boolean = sym_chars.contains(sym) 
284 
def is_symbolic(sym: String): Boolean = sym.startsWith("\\<") && !sym.startsWith("\\<^") 

27918  285 
} 
27901  286 
} 