author | wenzelm |
Sun, 06 Dec 2009 21:56:23 +0100 | |
changeset 33998 | fc56cfc6906e |
parent 31929 | ecfc667cac53 |
child 34001 | 6e5eafb373b3 |
permissions | -rw-r--r-- |
27901 | 1 |
/* Title: Pure/General/symbol.scala |
2 |
Author: Makarius |
|
3 |
||
27924 | 4 |
Detecting and recoding Isabelle symbols. |
27901 | 5 |
*/ |
6 |
||
7 |
package isabelle |
|
8 |
||
27918 | 9 |
import scala.io.Source |
31929 | 10 |
import scala.collection.{jcl, mutable} |
31522 | 11 |
import scala.util.matching.Regex |
27901 | 12 |
|
13 |
||
31522 | 14 |
object Symbol |
15 |
{ |
|
33998 | 16 |
/* Symbol regexps */ |
27901 | 17 |
|
31522 | 18 |
private val plain = new Regex("""(?xs) |
19 |
[^\\ \ud800-\udfff] | [\ud800-\udbff][\udc00-\udfff] """) |
|
27901 | 20 |
|
31522 | 21 |
private val symbol = new Regex("""(?xs) |
31545
5f1f0a20af4d
discontinued escaped symbols such as \\<forall> -- only one backslash should be used;
wenzelm
parents:
31523
diff
changeset
|
22 |
\\ < (?: |
27924 | 23 |
\^? [A-Za-z][A-Za-z0-9_']* | |
24 |
\^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""") |
|
25 |
||
31522 | 26 |
private val bad_symbol = new Regex("(?xs) (?!" + symbol + ")" + |
31545
5f1f0a20af4d
discontinued escaped symbols such as \\<forall> -- only one backslash should be used;
wenzelm
parents:
31523
diff
changeset
|
27 |
""" \\ < (?: (?! \s | [\"`\\] | \(\* | \*\) | \{\* | \*\} ) . )*""") |
27924 | 28 |
|
27939 | 29 |
// total pattern |
31522 | 30 |
val regex = new Regex(plain + "|" + symbol + "|" + bad_symbol + "| .") |
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
31 |
|
31522 | 32 |
// prefix of another symbol |
33998 | 33 |
def is_open(s: CharSequence): Boolean = |
31522 | 34 |
{ |
35 |
val len = s.length |
|
33998 | 36 |
len == 1 && Character.isHighSurrogate(s.charAt(0)) || |
31522 | 37 |
s == "\\" || |
38 |
s == "\\<" || |
|
33998 | 39 |
len > 2 && s.charAt(len - 1) != '>' |
31522 | 40 |
} |
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
41 |
|
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
42 |
|
33998 | 43 |
/* elements */ |
44 |
||
45 |
private def could_open(c: Char): Boolean = |
|
46 |
c == '\\' || Character.isHighSurrogate(c) |
|
31929 | 47 |
|
33998 | 48 |
def elements(text: CharSequence) = new Iterator[CharSequence] { |
49 |
private val matcher = regex.pattern.matcher(text) |
|
50 |
private var i = 0 |
|
51 |
def hasNext = i < text.length |
|
52 |
def next = { |
|
53 |
val len = |
|
54 |
if (could_open(text.charAt(i))) { |
|
55 |
matcher.region(i, text.length).lookingAt |
|
56 |
matcher.group.length |
|
57 |
} |
|
58 |
else 1 |
|
59 |
val s = text.subSequence(i, i + len) |
|
60 |
i += len |
|
61 |
s |
|
62 |
} |
|
63 |
} |
|
64 |
||
65 |
||
66 |
/* decoding offsets */ |
|
67 |
||
68 |
class Index(text: CharSequence) |
|
31929 | 69 |
{ |
70 |
case class Entry(chr: Int, sym: Int) |
|
71 |
val index: Array[Entry] = |
|
72 |
{ |
|
73 |
val matcher = regex.pattern.matcher(text) |
|
74 |
val buf = new mutable.ArrayBuffer[Entry] |
|
75 |
var chr = 0 |
|
76 |
var sym = 0 |
|
33998 | 77 |
while (chr < text.length) { |
31929 | 78 |
val len = |
33998 | 79 |
if (could_open(text.charAt(chr))) { |
80 |
matcher.region(chr, text.length).lookingAt |
|
31929 | 81 |
matcher.group.length |
33998 | 82 |
} |
83 |
else 1 |
|
31929 | 84 |
chr += len |
85 |
sym += 1 |
|
86 |
if (len > 1) buf += Entry(chr, sym) |
|
87 |
} |
|
88 |
buf.toArray |
|
89 |
} |
|
90 |
def decode(sym: Int): Int = |
|
91 |
{ |
|
92 |
val end = index.length |
|
93 |
def bisect(a: Int, b: Int): Int = |
|
94 |
{ |
|
95 |
if (a < b) { |
|
96 |
val c = (a + b) / 2 |
|
97 |
if (sym < index(c).sym) bisect(a, c) |
|
98 |
else if (c + 1 == end || sym < index(c + 1).sym) c |
|
99 |
else bisect(c + 1, b) |
|
100 |
} |
|
101 |
else -1 |
|
102 |
} |
|
103 |
val i = bisect(0, end) |
|
104 |
if (i < 0) sym |
|
105 |
else index(i).chr + sym - index(i).sym |
|
106 |
} |
|
107 |
} |
|
108 |
||
109 |
||
33998 | 110 |
/* recoding text */ |
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
111 |
|
31522 | 112 |
private class Recoder(list: List[(String, String)]) |
113 |
{ |
|
114 |
private val (min, max) = |
|
115 |
{ |
|
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
116 |
var min = '\uffff' |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
117 |
var max = '\u0000' |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
118 |
for ((x, _) <- list) { |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
119 |
val c = x(0) |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
120 |
if (c < min) min = c |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
121 |
if (c > max) max = c |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
122 |
} |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
123 |
(min, max) |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
124 |
} |
31522 | 125 |
private val table = |
126 |
{ |
|
127 |
val table = new jcl.HashMap[String, String] // reasonably efficient? |
|
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
128 |
for ((x, y) <- list) table + (x -> y) |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
129 |
table |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
130 |
} |
31522 | 131 |
def recode(text: String): String = |
132 |
{ |
|
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
133 |
val len = text.length |
31522 | 134 |
val matcher = regex.pattern.matcher(text) |
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
135 |
val result = new StringBuilder(len) |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
136 |
var i = 0 |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
137 |
while (i < len) { |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
138 |
val c = text(i) |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
139 |
if (min <= c && c <= max) { |
31929 | 140 |
matcher.region(i, len).lookingAt |
27938 | 141 |
val x = matcher.group |
31522 | 142 |
result.append(table.get(x) getOrElse x) |
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
143 |
i = matcher.end |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
144 |
} |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
145 |
else { result.append(c); i += 1 } |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
146 |
} |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
147 |
result.toString |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
148 |
} |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
149 |
} |
27924 | 150 |
|
27918 | 151 |
|
27923
7ebe9d38743a
use scala.collection.jcl.HashMap, which seems to be more efficient;
wenzelm
parents:
27918
diff
changeset
|
152 |
|
27927 | 153 |
/** Symbol interpretation **/ |
154 |
||
29569
f3f529b5d8fb
more general init of Symbol.Interpretation, independent of IsabelleSystem instance;
wenzelm
parents:
29174
diff
changeset
|
155 |
class Interpretation(symbol_decls: Iterator[String]) |
f3f529b5d8fb
more general init of Symbol.Interpretation, independent of IsabelleSystem instance;
wenzelm
parents:
29174
diff
changeset
|
156 |
{ |
31522 | 157 |
/* read symbols */ |
158 |
||
159 |
private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """) |
|
160 |
private val key = new Regex("""(?xs) (.+): """) |
|
161 |
||
162 |
private def read_decl(decl: String): (String, Map[String, String]) = |
|
163 |
{ |
|
164 |
def err() = error("Bad symbol declaration: " + decl) |
|
165 |
||
166 |
def read_props(props: List[String]): Map[String, String] = |
|
167 |
{ |
|
168 |
props match { |
|
169 |
case Nil => Map() |
|
170 |
case _ :: Nil => err() |
|
171 |
case key(x) :: y :: rest => read_props(rest) + (x -> y) |
|
172 |
case _ => err() |
|
173 |
} |
|
174 |
} |
|
175 |
decl.split("\\s+").toList match { |
|
176 |
case Nil => err() |
|
177 |
case sym :: props => (sym, read_props(props)) |
|
178 |
} |
|
179 |
} |
|
180 |
||
181 |
private val symbols: List[(String, Map[String, String])] = |
|
182 |
for (decl <- symbol_decls.toList if !empty.pattern.matcher(decl).matches) |
|
183 |
yield read_decl(decl) |
|
184 |
||
185 |
||
31651 | 186 |
/* misc properties */ |
187 |
||
188 |
val names: Map[String, String] = { |
|
189 |
val name = new Regex("""\\<([A-Za-z][A-Za-z0-9_']*)>""") |
|
190 |
Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*) |
|
191 |
} |
|
192 |
||
193 |
val abbrevs: Map[String, String] = Map(( |
|
194 |
for ((sym, props) <- symbols if props.isDefinedAt("abbrev")) |
|
195 |
yield (sym -> props("abbrev"))): _*) |
|
196 |
||
197 |
||
31522 | 198 |
/* main recoder methods */ |
199 |
||
200 |
private val (decoder, encoder) = |
|
201 |
{ |
|
202 |
val mapping = |
|
203 |
for { |
|
204 |
(sym, props) <- symbols |
|
205 |
val code = |
|
206 |
try { Integer.decode(props("code")).intValue } |
|
207 |
catch { |
|
208 |
case _: NoSuchElementException => error("Missing code for symbol " + sym) |
|
209 |
case _: NumberFormatException => error("Bad code for symbol " + sym) |
|
210 |
} |
|
211 |
val ch = new String(Character.toChars(code)) |
|
212 |
} yield (sym, ch) |
|
31545
5f1f0a20af4d
discontinued escaped symbols such as \\<forall> -- only one backslash should be used;
wenzelm
parents:
31523
diff
changeset
|
213 |
(new Recoder(mapping), |
31548 | 214 |
new Recoder(mapping map { case (x, y) => (y, x) })) |
31522 | 215 |
} |
27918 | 216 |
|
27924 | 217 |
def decode(text: String) = decoder.recode(text) |
218 |
def encode(text: String) = encoder.recode(text) |
|
27918 | 219 |
} |
27901 | 220 |
} |