author  wenzelm 
Fri, 30 Aug 2013 23:38:18 +0200  
changeset 53337  b3817a0e3211 
parent 53316  c3e549e0d3c7 
child 53400  673eb869e6ee 
permissions  rwrr 
27901  1 
/* Title: Pure/General/symbol.scala 
2 
Author: Makarius 

3 

27924  4 
Detecting and recoding Isabelle symbols. 
27901  5 
*/ 
6 

7 
package isabelle 

8 

36011
3ff725ac13a4
adapted to Scala 2.8.0 Beta1  with notable changes to scala.collection;
wenzelm
parents:
34316
diff
changeset

9 
import scala.collection.mutable 
31522  10 
import scala.util.matching.Regex 
48922  11 
import scala.annotation.tailrec 
27901  12 

13 

31522  14 
object Symbol 
15 
{ 

43696  16 
type Symbol = String 
17 

18 

43418  19 
/* ASCII characters */ 
20 

21 
def is_ascii_letter(c: Char): Boolean = 'A' <= c && c <= 'Z'  'a' <= c && c <= 'z' 

22 
def is_ascii_digit(c: Char): Boolean = '0' <= c && c <= '9' 

23 
def is_ascii_quasi(c: Char): Boolean = c == '_'  c == '\'' 

24 

25 
def is_ascii_letdig(c: Char): Boolean = 

26 
is_ascii_letter(c)  is_ascii_digit(c)  is_ascii_quasi(c) 

27 

28 
def is_ascii_identifier(s: String): Boolean = 

50238
98d35a7368bd
more uniform Symbol.is_ascii_identifier in ML/Scala;
wenzelm
parents:
50233
diff
changeset

29 
s.length > 0 && is_ascii_letter(s(0)) && s.forall(is_ascii_letdig) 
43418  30 

31 

48775  32 
/* symbol matching */ 
27901  33 

48775  34 
private val symbol_total = new Regex("""(?xs) 
35 
[\ud800\udbff][\udc00\udfff]  \r\n  

36 
\\ < (?: \^raw: [\x20\x7e\u0100\uffff && [^.>]]*  \^? ([AZaz][AZaz09_']*)? ) >?  

37 
.""") 

27924  38 

48775  39 
private def is_plain(c: Char): Boolean = 
40 
!(c == '\r'  c == '\\'  Character.isHighSurrogate(c)) 

48773
0e1bab274672
more liberal scanning of potentially malformed symbols;
wenzelm
parents:
48704
diff
changeset

41 

0e1bab274672
more liberal scanning of potentially malformed symbols;
wenzelm
parents:
48704
diff
changeset

42 
def is_malformed(s: Symbol): Boolean = 
0e1bab274672
more liberal scanning of potentially malformed symbols;
wenzelm
parents:
48704
diff
changeset

43 
s.length match { 
0e1bab274672
more liberal scanning of potentially malformed symbols;
wenzelm
parents:
48704
diff
changeset

44 
case 1 => 
0e1bab274672
more liberal scanning of potentially malformed symbols;
wenzelm
parents:
48704
diff
changeset

45 
val c = s(0) 
0e1bab274672
more liberal scanning of potentially malformed symbols;
wenzelm
parents:
48704
diff
changeset

46 
Character.isHighSurrogate(c)  Character.isLowSurrogate(c)  c == '\ufffd' 
0e1bab274672
more liberal scanning of potentially malformed symbols;
wenzelm
parents:
48704
diff
changeset

47 
case 2 => 
0e1bab274672
more liberal scanning of potentially malformed symbols;
wenzelm
parents:
48704
diff
changeset

48 
val c1 = s(0) 
0e1bab274672
more liberal scanning of potentially malformed symbols;
wenzelm
parents:
48704
diff
changeset

49 
val c2 = s(1) 
0e1bab274672
more liberal scanning of potentially malformed symbols;
wenzelm
parents:
48704
diff
changeset

50 
!(c1 == '\r' && c2 == '\n'  Character.isSurrogatePair(c1, c2)) 
48774  51 
case _ => !s.endsWith(">")  s == "\\<>"  s == "\\<^>" 
48773
0e1bab274672
more liberal scanning of potentially malformed symbols;
wenzelm
parents:
48704
diff
changeset

52 
} 
34137  53 

43696  54 
def is_physical_newline(s: Symbol): Boolean = 
43675
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
wenzelm
parents:
43511
diff
changeset

55 
s == "\n"  s == "\r"  s == "\r\n" 
38877  56 

34137  57 
class Matcher(text: CharSequence) 
58 
{ 

48775  59 
private val matcher = symbol_total.pattern.matcher(text) 
34137  60 
def apply(start: Int, end: Int): Int = 
61 
{ 

62 
require(0 <= start && start < end && end <= text.length) 

34316
f879b649ac4c
clarified Symbol.is_plain/is_wellformed  is_closed was rejecting plain backslashes;
wenzelm
parents:
34193
diff
changeset

63 
if (is_plain(text.charAt(start))) 1 
34138  64 
else { 
34137  65 
matcher.region(start, end).lookingAt 
66 
matcher.group.length 

67 
} 

68 
} 

31522  69 
} 
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

70 

fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

71 

43695
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

72 
/* iterator */ 
33998  73 

43696  74 
private val char_symbols: Array[Symbol] = 
43675
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
wenzelm
parents:
43511
diff
changeset

75 
(0 until 256).iterator.map(i => new String(Array(i.toChar))).toArray 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
wenzelm
parents:
43511
diff
changeset

76 

43696  77 
def iterator(text: CharSequence): Iterator[Symbol] = 
78 
new Iterator[Symbol] 

40522  79 
{ 
43489  80 
private val matcher = new Matcher(text) 
81 
private var i = 0 

82 
def hasNext = i < text.length 

83 
def next = 

84 
{ 

85 
val n = matcher(i, text.length) 

43675
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
wenzelm
parents:
43511
diff
changeset

86 
val s = 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
wenzelm
parents:
43511
diff
changeset

87 
if (n == 0) "" 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
wenzelm
parents:
43511
diff
changeset

88 
else if (n == 1) { 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
wenzelm
parents:
43511
diff
changeset

89 
val c = text.charAt(i) 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
wenzelm
parents:
43511
diff
changeset

90 
if (c < char_symbols.length) char_symbols(c) 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
wenzelm
parents:
43511
diff
changeset

91 
else text.subSequence(i, i + n).toString 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
wenzelm
parents:
43511
diff
changeset

92 
} 
8252d51d70e2
simplified Symbol.iterator: produce strings, which are mostly preallocated;
wenzelm
parents:
43511
diff
changeset

93 
else text.subSequence(i, i + n).toString 
43489  94 
i += n 
95 
s 

96 
} 

33998  97 
} 
43489  98 

44949  99 
def explode(text: CharSequence): List[Symbol] = iterator(text).toList 
100 

48922  101 
def advance_line_column(pos: (Int, Int), text: CharSequence): (Int, Int) = 
102 
{ 

103 
var (line, column) = pos 

104 
for (sym < iterator(text)) { 

105 
if (is_physical_newline(sym)) { line += 1; column = 1 } 

106 
else column += 1 

107 
} 

108 
(line, column) 

109 
} 

110 

33998  111 

112 
/* decoding offsets */ 

113 

52507  114 
object Index 
115 
{ 

116 
def apply(text: CharSequence): Index = new Index(text) 

117 
} 

118 

119 
final class Index private(text: CharSequence) 

31929  120 
{ 
43714  121 
sealed case class Entry(chr: Int, sym: Int) 
31929  122 
val index: Array[Entry] = 
123 
{ 

34137  124 
val matcher = new Matcher(text) 
31929  125 
val buf = new mutable.ArrayBuffer[Entry] 
126 
var chr = 0 

127 
var sym = 0 

33998  128 
while (chr < text.length) { 
34137  129 
val n = matcher(chr, text.length) 
130 
chr += n 

31929  131 
sym += 1 
34137  132 
if (n > 1) buf += Entry(chr, sym) 
31929  133 
} 
134 
buf.toArray 

135 
} 

38479
e628da370072
more efficient Markup_Tree, based on branches sorted by quasiorder;
wenzelm
parents:
37556
diff
changeset

136 
def decode(sym1: Int): Int = 
31929  137 
{ 
38479
e628da370072
more efficient Markup_Tree, based on branches sorted by quasiorder;
wenzelm
parents:
37556
diff
changeset

138 
val sym = sym1  1 
31929  139 
val end = index.length 
48922  140 
@tailrec def bisect(a: Int, b: Int): Int = 
31929  141 
{ 
142 
if (a < b) { 

143 
val c = (a + b) / 2 

144 
if (sym < index(c).sym) bisect(a, c) 

145 
else if (c + 1 == end  sym < index(c + 1).sym) c 

146 
else bisect(c + 1, b) 

147 
} 

148 
else 1 

149 
} 

150 
val i = bisect(0, end) 

151 
if (i < 0) sym 

152 
else index(i).chr + sym  index(i).sym 

153 
} 

38479
e628da370072
more efficient Markup_Tree, based on branches sorted by quasiorder;
wenzelm
parents:
37556
diff
changeset

154 
def decode(range: Text.Range): Text.Range = range.map(decode(_)) 
31929  155 
} 
156 

157 

33998  158 
/* recoding text */ 
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

159 

31522  160 
private class Recoder(list: List[(String, String)]) 
161 
{ 

162 
private val (min, max) = 

163 
{ 

27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

164 
var min = '\uffff' 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

165 
var max = '\u0000' 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

166 
for ((x, _) < list) { 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

167 
val c = x(0) 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

168 
if (c < min) min = c 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

169 
if (c > max) max = c 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

170 
} 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

171 
(min, max) 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

172 
} 
40443  173 
private val table = 
174 
{ 

175 
var tab = Map[String, String]() 

176 
for ((x, y) < list) { 

177 
tab.get(x) match { 

178 
case None => tab += (x > y) 

179 
case Some(z) => 

44181  180 
error("Duplicate mapping of " + quote(x) + " to " + quote(y) + " vs. " + quote(z)) 
40443  181 
} 
182 
} 

183 
tab 

184 
} 

31522  185 
def recode(text: String): String = 
186 
{ 

27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

187 
val len = text.length 
48775  188 
val matcher = symbol_total.pattern.matcher(text) 
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

189 
val result = new StringBuilder(len) 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

190 
var i = 0 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

191 
while (i < len) { 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

192 
val c = text(i) 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

193 
if (min <= c && c <= max) { 
31929  194 
matcher.region(i, len).lookingAt 
27938  195 
val x = matcher.group 
52888  196 
result.append(table.getOrElse(x, x)) 
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

197 
i = matcher.end 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

198 
} 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

199 
else { result.append(c); i += 1 } 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

200 
} 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

201 
result.toString 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

202 
} 
fdf77e7be01a
more robust pattern: look at longer matches first, added catchall case;
wenzelm
parents:
27935
diff
changeset

203 
} 
27924  204 

27918  205 

27923
7ebe9d38743a
use scala.collection.jcl.HashMap, which seems to be more efficient;
wenzelm
parents:
27918
diff
changeset

206 

43695
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

207 
/** symbol interpretation **/ 
27927  208 

43695
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

209 
private lazy val symbols = 
50564
c6fde2fc4217
allow to suppress ISABELLE_SYMBOLS for experiments;
wenzelm
parents:
50291
diff
changeset

210 
new Interpretation(File.try_read(Path.split(Isabelle_System.getenv("ISABELLE_SYMBOLS")))) 
43695
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

211 

5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

212 
private class Interpretation(symbols_spec: String) 
29569
f3f529b5d8fb
more general init of Symbol.Interpretation, independent of IsabelleSystem instance;
wenzelm
parents:
29174
diff
changeset

213 
{ 
31522  214 
/* read symbols */ 
215 

50136
a96bd08258a2
support for symbol groups, retaining original order of declarations;
wenzelm
parents:
48922
diff
changeset

216 
private val No_Decl = new Regex("""(?xs) ^\s* (?: \#.* )? $ """) 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
wenzelm
parents:
48922
diff
changeset

217 
private val Key = new Regex("""(?xs) (.+): """) 
31522  218 

53316
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

219 
private def read_decl(decl: String): (Symbol, Properties.T) = 
31522  220 
{ 
221 
def err() = error("Bad symbol declaration: " + decl) 

222 

53316
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

223 
def read_props(props: List[String]): Properties.T = 
31522  224 
{ 
225 
props match { 

53316
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

226 
case Nil => Nil 
31522  227 
case _ :: Nil => err() 
53316
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

228 
case Key(x) :: y :: rest => (x > y) :: read_props(rest) 
31522  229 
case _ => err() 
230 
} 

231 
} 

232 
decl.split("\\s+").toList match { 

53316
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

233 
case sym :: props if sym.length > 1 && !is_malformed(sym) => 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

234 
(sym, read_props(props)) 
34193  235 
case _ => err() 
31522  236 
} 
237 
} 

238 

53316
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

239 
private val symbols: List[(Symbol, Properties.T)] = 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

240 
(((List.empty[(Symbol, Properties.T)], Set.empty[Symbol]) /: 
50136
a96bd08258a2
support for symbol groups, retaining original order of declarations;
wenzelm
parents:
48922
diff
changeset

241 
split_lines(symbols_spec).reverse) 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
wenzelm
parents:
48922
diff
changeset

242 
{ case (res, No_Decl()) => res 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
wenzelm
parents:
48922
diff
changeset

243 
case ((list, known), decl) => 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
wenzelm
parents:
48922
diff
changeset

244 
val (sym, props) = read_decl(decl) 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
wenzelm
parents:
48922
diff
changeset

245 
if (known(sym)) (list, known) 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
wenzelm
parents:
48922
diff
changeset

246 
else ((sym, props) :: list, known + sym) 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
wenzelm
parents:
48922
diff
changeset

247 
})._1 
31522  248 

249 

31651  250 
/* misc properties */ 
251 

43696  252 
val names: Map[Symbol, String] = 
34134  253 
{ 
43456
8a6de1a6e1dc
names for control symbols without "^", which is relevant for completion;
wenzelm
parents:
43455
diff
changeset

254 
val name = new Regex("""\\<\^?([AZaz][AZaz09_']*)>""") 
31651  255 
Map((for ((sym @ name(a), _) < symbols) yield (sym > a)): _*) 
256 
} 

257 

50136
a96bd08258a2
support for symbol groups, retaining original order of declarations;
wenzelm
parents:
48922
diff
changeset

258 
val groups: List[(String, List[Symbol])] = 
53316
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

259 
symbols.map({ case (sym, props) => 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

260 
val gs = for (("group", g) < props) yield g 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

261 
if (gs.isEmpty) List(sym > "unsorted") else gs.map(sym > _) 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

262 
}).flatten 
50136
a96bd08258a2
support for symbol groups, retaining original order of declarations;
wenzelm
parents:
48922
diff
changeset

263 
.groupBy(_._2).toList.map({ case (group, list) => (group, list.map(_._1)) }) 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
wenzelm
parents:
48922
diff
changeset

264 
.sortBy(_._1) 
a96bd08258a2
support for symbol groups, retaining original order of declarations;
wenzelm
parents:
48922
diff
changeset

265 

53316
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

266 
val abbrevs: Multi_Map[Symbol, String] = 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

267 
Multi_Map(( 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

268 
for { 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

269 
(sym, props) < symbols 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

270 
("abbrev", a) < props.reverse 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

271 
} yield (sym > a)): _*) 
43488  272 

273 

43490  274 
/* recoding */ 
31522  275 

53316
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

276 
private val Code = new Properties.String("code") 
31522  277 
private val (decoder, encoder) = 
278 
{ 

279 
val mapping = 

280 
for { 

281 
(sym, props) < symbols 

46997  282 
code = 
53316
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

283 
props match { 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

284 
case Code(s) => 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

285 
try { Integer.decode(s).intValue } 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

286 
catch { case _: NumberFormatException => error("Bad code for symbol " + sym) } 
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

287 
case _ => error("Missing code for symbol " + sym) 
31522  288 
} 
46997  289 
ch = new String(Character.toChars(code)) 
34193  290 
} yield { 
291 
if (code < 128) error("Illegal ASCII code for symbol " + sym) 

292 
else (sym, ch) 

293 
} 

31545
5f1f0a20af4d
discontinued escaped symbols such as \\<forall>  only one backslash should be used;
wenzelm
parents:
31523
diff
changeset

294 
(new Recoder(mapping), 
31548  295 
new Recoder(mapping map { case (x, y) => (y, x) })) 
31522  296 
} 
27918  297 

34098  298 
def decode(text: String): String = decoder.recode(text) 
299 
def encode(text: String): String = encoder.recode(text) 

34134  300 

43490  301 
private def recode_set(elems: String*): Set[String] = 
302 
{ 

303 
val content = elems.toList 

304 
Set((content ::: content.map(decode)): _*) 

305 
} 

306 

307 
private def recode_map[A](elems: (String, A)*): Map[String, A] = 

308 
{ 

309 
val content = elems.toList 

310 
Map((content ::: content.map({ case (sym, a) => (decode(sym), a) })): _*) 

311 
} 

312 

313 

314 
/* user fonts */ 

315 

53316
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

316 
private val Font = new Properties.String("font") 
43696  317 
val fonts: Map[Symbol, String] = 
53316
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

318 
recode_map((for ((sym, Font(font)) < symbols) yield (sym > font)): _*) 
43490  319 

320 
val font_names: List[String] = Set(fonts.toList.map(_._2): _*).toList 

321 
val font_index: Map[String, Int] = Map((font_names zip (0 until font_names.length).toList): _*) 

322 

34134  323 

324 
/* classification */ 

325 

43695
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

326 
val letters = recode_set( 
34134  327 
"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", 
328 
"N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", 

329 
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", 

330 
"n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", 

331 

332 
"\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>", 

333 
"\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>", 

334 
"\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>", 

335 
"\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>", 

336 
"\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>", 

337 
"\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>", 

338 
"\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>", 

339 
"\\<x>", "\\<y>", "\\<z>", 

340 

341 
"\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>", 

342 
"\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>", 

343 
"\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>", 

344 
"\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>", 

345 
"\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>", 

346 
"\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>", 

347 
"\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>", 

348 
"\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>", 

349 
"\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>", 

350 

351 
"\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>", 

352 
"\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>", 

353 
"\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>", 

354 
"\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>", 

355 
"\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>", 

356 
"\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>", 

52616
3ac2878764f9
more robust identifier syntax: sub/superscript counts as modifier of LETDIG part instead of LETTER, both isub/isup and sub/sup are allowed;
wenzelm
parents:
52507
diff
changeset

357 
"\\<Psi>", "\\<Omega>") 
34134  358 

50233
eef21a0726f1
removed remains of Oheimb's doublespace (cf. 0a5af667dc75);
wenzelm
parents:
50188
diff
changeset

359 
val blanks = recode_set(" ", "\t", "\n", "\u000B", "\f", "\r", "\r\n", "\\<^newline>") 
34138  360 

43695
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

361 
val sym_chars = 
34138  362 
Set("!", "#", "$", "%", "&", "*", "+", "", "/", "<", "=", ">", "?", "@", "^", "_", "", "~") 
34134  363 

44992
aa34d2d049ce
refined Symbol.is_symbolic  cover recoded versions as well;
wenzelm
parents:
44949
diff
changeset

364 
val symbolic = recode_set((for { (sym, _) < symbols; if raw_symbolic(sym) } yield sym): _*) 
aa34d2d049ce
refined Symbol.is_symbolic  cover recoded versions as well;
wenzelm
parents:
44949
diff
changeset

365 

43455  366 

43488  367 
/* control symbols */ 
368 

43696  369 
val ctrl_decoded: Set[Symbol] = 
43488  370 
Set((for ((sym, _) < symbols if sym.startsWith("\\<^")) yield decode(sym)): _*) 
371 

44238
36120feb70ed
some convenience actions/shortcuts for control symbols;
wenzelm
parents:
44181
diff
changeset

372 
val sub_decoded = decode("\\<^sub>") 
36120feb70ed
some convenience actions/shortcuts for control symbols;
wenzelm
parents:
44181
diff
changeset

373 
val sup_decoded = decode("\\<^sup>") 
43511  374 
val bsub_decoded = decode("\\<^bsub>") 
375 
val esub_decoded = decode("\\<^esub>") 

376 
val bsup_decoded = decode("\\<^bsup>") 

377 
val esup_decoded = decode("\\<^esup>") 

44238
36120feb70ed
some convenience actions/shortcuts for control symbols;
wenzelm
parents:
44181
diff
changeset

378 
val bold_decoded = decode("\\<^bold>") 
27918  379 
} 
43695
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

380 

5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

381 

5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

382 
/* tables */ 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

383 

43696  384 
def names: Map[Symbol, String] = symbols.names 
50136
a96bd08258a2
support for symbol groups, retaining original order of declarations;
wenzelm
parents:
48922
diff
changeset

385 
def groups: List[(String, List[Symbol])] = symbols.groups 
53316
c3e549e0d3c7
allow multiple symbol properties, notably groups and abbrevs;
wenzelm
parents:
53021
diff
changeset

386 
def abbrevs: Multi_Map[Symbol, String] = symbols.abbrevs 
43695
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

387 

5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

388 
def decode(text: String): String = symbols.decode(text) 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

389 
def encode(text: String): String = symbols.encode(text) 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

390 

53337
b3817a0e3211
sort items according to persistent history of frequency of use;
wenzelm
parents:
53316
diff
changeset

391 
def decode_string: XML.Decode.T[String] = (x => decode(XML.Decode.string(x))) 
b3817a0e3211
sort items according to persistent history of frequency of use;
wenzelm
parents:
53316
diff
changeset

392 
def encode_string: XML.Encode.T[String] = (x => XML.Encode.string(encode(x))) 
b3817a0e3211
sort items according to persistent history of frequency of use;
wenzelm
parents:
53316
diff
changeset

393 

50291
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
wenzelm
parents:
50238
diff
changeset

394 
def decode_strict(text: String): String = 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
wenzelm
parents:
50238
diff
changeset

395 
{ 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
wenzelm
parents:
50238
diff
changeset

396 
val decoded = decode(text) 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
wenzelm
parents:
50238
diff
changeset

397 
if (encode(decoded) == text) decoded 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
wenzelm
parents:
50238
diff
changeset

398 
else { 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
wenzelm
parents:
50238
diff
changeset

399 
val bad = new mutable.ListBuffer[Symbol] 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
wenzelm
parents:
50238
diff
changeset

400 
for (s < iterator(text) if encode(decode(s)) != s && !bad.contains(s)) 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
wenzelm
parents:
50238
diff
changeset

401 
bad += s 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
wenzelm
parents:
50238
diff
changeset

402 
error("Bad Unicode symbols in text: " + commas_quote(bad)) 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
wenzelm
parents:
50238
diff
changeset

403 
} 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
wenzelm
parents:
50238
diff
changeset

404 
} 
674893679352
prefer Symbol.decode_strict in batch mode, to avoid files with spurious Unicode symbols that clash with Isabelle symbol interpretation;
wenzelm
parents:
50238
diff
changeset

405 

43696  406 
def fonts: Map[Symbol, String] = symbols.fonts 
43695
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

407 
def font_names: List[String] = symbols.font_names 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

408 
def font_index: Map[String, Int] = symbols.font_index 
43696  409 
def lookup_font(sym: Symbol): Option[Int] = symbols.fonts.get(sym).map(font_index(_)) 
43695
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

410 

5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

411 

5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

412 
/* classification */ 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

413 

43696  414 
def is_letter(sym: Symbol): Boolean = symbols.letters.contains(sym) 
415 
def is_digit(sym: Symbol): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9' 

416 
def is_quasi(sym: Symbol): Boolean = sym == "_"  sym == "'" 

417 
def is_letdig(sym: Symbol): Boolean = is_letter(sym)  is_digit(sym)  is_quasi(sym) 

418 
def is_blank(sym: Symbol): Boolean = symbols.blanks.contains(sym) 

44992
aa34d2d049ce
refined Symbol.is_symbolic  cover recoded versions as well;
wenzelm
parents:
44949
diff
changeset

419 

43696  420 
def is_symbolic_char(sym: Symbol): Boolean = symbols.sym_chars.contains(sym) 
44992
aa34d2d049ce
refined Symbol.is_symbolic  cover recoded versions as well;
wenzelm
parents:
44949
diff
changeset

421 
def is_symbolic(sym: Symbol): Boolean = raw_symbolic(sym)  symbols.symbolic.contains(sym) 
aa34d2d049ce
refined Symbol.is_symbolic  cover recoded versions as well;
wenzelm
parents:
44949
diff
changeset

422 

aa34d2d049ce
refined Symbol.is_symbolic  cover recoded versions as well;
wenzelm
parents:
44949
diff
changeset

423 
private def raw_symbolic(sym: Symbol): Boolean = 
43695
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

424 
sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^") 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

425 

5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

426 

5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

427 
/* control symbols */ 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

428 

43696  429 
def is_ctrl(sym: Symbol): Boolean = 
43695
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

430 
sym.startsWith("\\<^")  symbols.ctrl_decoded.contains(sym) 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

431 

43696  432 
def is_controllable(sym: Symbol): Boolean = 
43695
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

433 
!is_blank(sym) && !is_ctrl(sym) && !is_malformed(sym) 
5130dfe1b7be
simplified Symbol based on lazy Symbol.Interpretation  reduced odd "functorial style";
wenzelm
parents:
43675
diff
changeset

434 

44238
36120feb70ed
some convenience actions/shortcuts for control symbols;
wenzelm
parents:
44181
diff
changeset

435 
def sub_decoded: Symbol = symbols.sub_decoded 
36120feb70ed
some convenience actions/shortcuts for control symbols;
wenzelm
parents:
44181
diff
changeset

436 
def sup_decoded: Symbol = symbols.sup_decoded 
36120feb70ed
some convenience actions/shortcuts for control symbols;
wenzelm
parents:
44181
diff
changeset

437 
def bsub_decoded: Symbol = symbols.bsub_decoded 
36120feb70ed
some convenience actions/shortcuts for control symbols;
wenzelm
parents:
44181
diff
changeset

438 
def esub_decoded: Symbol = symbols.esub_decoded 
36120feb70ed
some convenience actions/shortcuts for control symbols;
wenzelm
parents:
44181
diff
changeset

439 
def bsup_decoded: Symbol = symbols.bsup_decoded 
36120feb70ed
some convenience actions/shortcuts for control symbols;
wenzelm
parents:
44181
diff
changeset

440 
def esup_decoded: Symbol = symbols.esup_decoded 
36120feb70ed
some convenience actions/shortcuts for control symbols;
wenzelm
parents:
44181
diff
changeset

441 
def bold_decoded: Symbol = symbols.bold_decoded 
27901  442 
} 