author  wenzelm 
Sat, 19 Dec 2009 11:45:14 +0100  
changeset 34134  d8d9df8407f6 
parent 34098  2b9cdf23c188 
child 34137  6cc9a0cbaf55 
permissions  rwrr 
27901  1 
/* Title: Pure/General/symbol.scala 
2 
Author: Makarius 

3 

27924  4 
Detecting and recoding Isabelle symbols. 
27901  5 
*/ 
6 

7 
package isabelle 

8 

27918  9 
import scala.io.Source 
31929  10 
import scala.collection.{jcl, mutable} 
31522  11 
import scala.util.matching.Regex 
27901  12 

13 

31522  14 
object Symbol 
15 
{ 

33998  16 
/* Symbol regexps */ 
27901  17 

31522  18 
private val plain = new Regex("""(?xs) 
19 
[^\\ \ud800\udfff]  [\ud800\udbff][\udc00\udfff] """) 

27901  20 

31522  21 
private val symbol = new Regex("""(?xs) 
31545
5f1f0a20af4d
discontinued escaped symbols such as \\<forall>  only one backslash should be used;
wenzelm
parents:
31523
diff
changeset

22 
\\ < (?: 
27924  23 
\^? [AZaz][AZaz09_']*  
24 
\^raw: [\x20\x7e\u0100\uffff && [^.>]]* ) >""") 

25 

31522  26 
private val bad_symbol = new Regex("(?xs) (?!" + symbol + ")" + 
31545
5f1f0a20af4d
discontinued escaped symbols such as \\<forall>  only one backslash should be used;
wenzelm
parents:
31523
diff
changeset

27 
""" \\ < (?: (?! \s  [\"`\\]  \(\*  \*\)  \{\*  \*\} ) . )*""") 
27924  28 

27939  29 
// total pattern 
31522  30 
val regex = new Regex(plain + "" + symbol + "" + bad_symbol + " .") 
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

31 

31522  32 
// prefix of another symbol 
33998  33 
def is_open(s: CharSequence): Boolean = 
31522  34 
{ 
35 
val len = s.length 

33998  36 
len == 1 && Character.isHighSurrogate(s.charAt(0))  
31522  37 
s == "\\"  
38 
s == "\\<"  

34134  39 
len > 2 && s.charAt(len  1) != '>' // FIXME bad_symbol !?? 
31522  40 
} 
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

41 

fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

42 

33998  43 
/* elements */ 
44 

34134  45 
def could_open(c: Char): Boolean = 
33998  46 
c == '\\'  Character.isHighSurrogate(c) 
31929  47 

34134  48 
def elements(text: CharSequence) = new Iterator[String] 
49 
{ 

33998  50 
private val matcher = regex.pattern.matcher(text) 
51 
private var i = 0 

52 
def hasNext = i < text.length 

53 
def next = { 

54 
val len = 

55 
if (could_open(text.charAt(i))) { 

56 
matcher.region(i, text.length).lookingAt 

57 
matcher.group.length 

58 
} 

59 
else 1 

60 
val s = text.subSequence(i, i + len) 

61 
i += len 

34001  62 
s.toString 
33998  63 
} 
64 
} 

65 

66 

67 
/* decoding offsets */ 

68 

69 
class Index(text: CharSequence) 

31929  70 
{ 
71 
case class Entry(chr: Int, sym: Int) 

72 
val index: Array[Entry] = 

73 
{ 

74 
val matcher = regex.pattern.matcher(text) 

75 
val buf = new mutable.ArrayBuffer[Entry] 

76 
var chr = 0 

77 
var sym = 0 

33998  78 
while (chr < text.length) { 
31929  79 
val len = 
33998  80 
if (could_open(text.charAt(chr))) { 
81 
matcher.region(chr, text.length).lookingAt 

31929  82 
matcher.group.length 
33998  83 
} 
84 
else 1 

31929  85 
chr += len 
86 
sym += 1 

87 
if (len > 1) buf += Entry(chr, sym) 

88 
} 

89 
buf.toArray 

90 
} 

91 
def decode(sym: Int): Int = 

92 
{ 

93 
val end = index.length 

94 
def bisect(a: Int, b: Int): Int = 

95 
{ 

96 
if (a < b) { 

97 
val c = (a + b) / 2 

98 
if (sym < index(c).sym) bisect(a, c) 

99 
else if (c + 1 == end  sym < index(c + 1).sym) c 

100 
else bisect(c + 1, b) 

101 
} 

102 
else 1 

103 
} 

104 
val i = bisect(0, end) 

105 
if (i < 0) sym 

106 
else index(i).chr + sym  index(i).sym 

107 
} 

108 
} 

109 

110 

33998  111 
/* recoding text */ 
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

112 

31522  113 
private class Recoder(list: List[(String, String)]) 
114 
{ 

115 
private val (min, max) = 

116 
{ 

27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

117 
var min = '\uffff' 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

118 
var max = '\u0000' 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

119 
for ((x, _) < list) { 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

120 
val c = x(0) 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

121 
if (c < min) min = c 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

122 
if (c > max) max = c 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

123 
} 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

124 
(min, max) 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

125 
} 
31522  126 
private val table = 
127 
{ 

128 
val table = new jcl.HashMap[String, String] // reasonably efficient? 

27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

129 
for ((x, y) < list) table + (x > y) 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

130 
table 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

131 
} 
31522  132 
def recode(text: String): String = 
133 
{ 

27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

134 
val len = text.length 
31522  135 
val matcher = regex.pattern.matcher(text) 
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

136 
val result = new StringBuilder(len) 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

137 
var i = 0 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

138 
while (i < len) { 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

139 
val c = text(i) 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

140 
if (min <= c && c <= max) { 
31929  141 
matcher.region(i, len).lookingAt 
27938  142 
val x = matcher.group 
31522  143 
result.append(table.get(x) getOrElse x) 
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

144 
i = matcher.end 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

145 
} 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

146 
else { result.append(c); i += 1 } 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

147 
} 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

148 
result.toString 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

149 
} 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

150 
} 
27924  151 

27918  152 

27923
7ebe9d38743a
use scala.collection.jcl.HashMap, which seems to be more efficient;
wenzelm
parents:
27918
diff
changeset

153 

27927  154 
/** Symbol interpretation **/ 
155 

29569
f3f529b5d8fb
more general init of Symbol.Interpretation, independent of IsabelleSystem instance;
wenzelm
parents:
29174
diff
changeset

156 
class Interpretation(symbol_decls: Iterator[String]) 
f3f529b5d8fb
more general init of Symbol.Interpretation, independent of IsabelleSystem instance;
wenzelm
parents:
29174
diff
changeset

157 
{ 
31522  158 
/* read symbols */ 
159 

160 
private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """) 

161 
private val key = new Regex("""(?xs) (.+): """) 

162 

163 
private def read_decl(decl: String): (String, Map[String, String]) = 

164 
{ 

165 
def err() = error("Bad symbol declaration: " + decl) 

166 

167 
def read_props(props: List[String]): Map[String, String] = 

168 
{ 

169 
props match { 

170 
case Nil => Map() 

171 
case _ :: Nil => err() 

172 
case key(x) :: y :: rest => read_props(rest) + (x > y) 

173 
case _ => err() 

174 
} 

175 
} 

176 
decl.split("\\s+").toList match { 

177 
case Nil => err() 

178 
case sym :: props => (sym, read_props(props)) 

179 
} 

180 
} 

181 

182 
private val symbols: List[(String, Map[String, String])] = 

183 
for (decl < symbol_decls.toList if !empty.pattern.matcher(decl).matches) 

184 
yield read_decl(decl) 

185 

186 

31651  187 
/* misc properties */ 
188 

34134  189 
val names: Map[String, String] = 
190 
{ 

31651  191 
val name = new Regex("""\\<([AZaz][AZaz09_']*)>""") 
192 
Map((for ((sym @ name(a), _) < symbols) yield (sym > a)): _*) 

193 
} 

194 

195 
val abbrevs: Map[String, String] = Map(( 

196 
for ((sym, props) < symbols if props.isDefinedAt("abbrev")) 

197 
yield (sym > props("abbrev"))): _*) 

198 

199 

31522  200 
/* main recoder methods */ 
201 

202 
private val (decoder, encoder) = 

203 
{ 

204 
val mapping = 

205 
for { 

206 
(sym, props) < symbols 

207 
val code = 

208 
try { Integer.decode(props("code")).intValue } 

209 
catch { 

210 
case _: NoSuchElementException => error("Missing code for symbol " + sym) 

211 
case _: NumberFormatException => error("Bad code for symbol " + sym) 

212 
} 

213 
val ch = new String(Character.toChars(code)) 

214 
} yield (sym, ch) 

31545
5f1f0a20af4d
discontinued escaped symbols such as \\<forall>  only one backslash should be used;
wenzelm
parents:
31523
diff
changeset

215 
(new Recoder(mapping), 
31548  216 
new Recoder(mapping map { case (x, y) => (y, x) })) 
31522  217 
} 
27918  218 

34098  219 
def decode(text: String): String = decoder.recode(text) 
220 
def encode(text: String): String = encoder.recode(text) 

34134  221 

222 

223 
/* classification */ 

224 

225 
private val raw_letters = Set( 

226 
"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", 

227 
"N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", 

228 
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", 

229 
"n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", 

230 

231 
"\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>", 

232 
"\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>", 

233 
"\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>", 

234 
"\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>", 

235 
"\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>", 

236 
"\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>", 

237 
"\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>", 

238 
"\\<x>", "\\<y>", "\\<z>", 

239 

240 
"\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>", 

241 
"\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>", 

242 
"\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>", 

243 
"\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>", 

244 
"\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>", 

245 
"\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>", 

246 
"\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>", 

247 
"\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>", 

248 
"\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>", 

249 

250 
"\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>", 

251 
"\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>", 

252 
"\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>", 

253 
"\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>", 

254 
"\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>", 

255 
"\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>", 

256 
"\\<Psi>", "\\<Omega>", 

257 

258 
"\\<^isub>", "\\<^isup>") 

259 

260 
private val letters = raw_letters ++ raw_letters.map(decode(_)) 

261 

262 
def is_letter(sym: String): Boolean = letters.contains(sym) 

263 

264 
def is_digit(sym: String): Boolean = 

265 
if (sym.length == 1) { 

266 
val c = sym(0) 

267 
'0' <= c && c <= '9' 

268 
} 

269 
else false 

270 

271 
def is_quasi(sym: String): Boolean = sym == "_"  sym == "'" 

272 

273 

274 
private val raw_blanks = 

275 
Set(" ", "\t", "\n", "\u000B", "\f", "\r", "\\<spacespace>", "\\<^newline>") 

276 
private val blanks = raw_blanks ++ raw_blanks.map(decode(_)) 

277 

278 
def is_blank(sym: String): Boolean = blanks.contains(sym) 

27918  279 
} 
27901  280 
} 