author | wenzelm |
Tue, 09 Jun 2009 20:40:19 +0200 | |
changeset 31523 | 2c0b67a0e5e7 |
parent 31522 | 0466cb17064f |
child 31545 | 5f1f0a20af4d |
permissions | -rw-r--r-- |
27901 | 1 |
/* Title: Pure/General/symbol.scala |
2 |
Author: Makarius |
|
3 |
||
27924 | 4 |
Detecting and recoding Isabelle symbols. |
27901 | 5 |
*/ |
6 |
||
7 |
package isabelle |
|
8 |
||
27918 | 9 |
import scala.io.Source |
31522 | 10 |
import scala.collection.jcl |
11 |
import scala.util.matching.Regex |
|
27901 | 12 |
|
13 |
||
31522 | 14 |
object Symbol |
15 |
{ |
|
27901 | 16 |
|
27924 | 17 |
/** Symbol regexps **/ |
27901 | 18 |
|
31522 | 19 |
// One plain character: any single char other than a backslash or a
// surrogate code unit (U+D800..U+DFFF), or a high surrogate immediately
// followed by a low surrogate.
private val plain = new Regex("""(?xs)
  [^\\ \ud800-\udfff] | [\ud800-\udbff][\udc00-\udfff] """)

// A well-formed symbol: one or two backslashes, '<', then either an
// optional '^' followed by an identifier (letter, then letters, digits,
// '_' or '), or "^raw:" followed by characters from the ranges
// 0x20..0x7e / U+0100..U+FFFF other than '.' and '>'; terminated by '>'.
private val symbol = new Regex("""(?xs)
  \\ \\? < (?:
  \^? [A-Za-z][A-Za-z0-9_']* |
  \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")

// A malformed symbol: the position must not start a well-formed symbol
// (negative lookahead), yet it opens like one.  The match then extends
// over following characters and stops before whitespace, '"', '`', a
// backslash, or one of the two-character sequences (* *) {* *}.
private val bad_symbol =
{
  val not_a_symbol = "(?xs) (?!" + symbol.toString + ")"
  val open_symbol =
    """ \\ \\? < (?: (?! \s | [\"`\\] | \(\* | \*\) | \{\* | \*\} ) . )*"""
  new Regex(not_a_symbol + open_symbol)
}

// total pattern: alternatives are tried in this order, and the final
// single-character case matches whatever the others reject
val regex: Regex =
{
  val alternatives =
    List(plain.toString, symbol.toString, bad_symbol.toString, " .")
  new Regex(alternatives.mkString("|"))
}
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
32 |
|
31522 | 33 |
// Tests whether s is a prefix of another symbol: a lone high surrogate,
// a lone backslash, the opening "\<", or any string longer than two
// characters that does not end with '>'.
def is_open(s: String): Boolean =
  s match {
    case "\\" | "\\<" => true
    case _ =>
      s.length match {
        case 1 => Character.isHighSurrogate(s.charAt(0))
        case n => n > 2 && !s.endsWith(">")
      }
  }
|
27937
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
42 |
|
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
43 |
|
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
44 |
/** Recoding **/ |
fdf77e7be01a
more robust pattern: look at longer matches first, added catch-all case;
wenzelm
parents:
27935
diff
changeset
|
45 |
|
31522 | 46 |
// Rewrites text token by token according to a fixed list of
// (source, target) pairs: every token of the total pattern `regex` that
// equals some source string is replaced by the corresponding target, all
// other text is copied unchanged.
private class Recoder(list: List[(String, String)])
{
  // Smallest and largest first character over all source strings.  Only
  // text positions whose character lies in this range are passed to the
  // regex matcher.  Empty source strings are skipped: they have no first
  // character and can never equal a (non-empty) token.  With no usable
  // entries the range stays empty (min > max).
  private val (min, max) =
  {
    var min = '\uffff'
    var max = '\u0000'
    for ((x, _) <- list if x.length > 0) {
      val c = x(0)
      if (c < min) min = c
      if (c > max) max = c
    }
    (min, max)
  }

  // Lookup table source -> target.
  private val table =
  {
    val table = new jcl.HashMap[String, String]   // reasonably efficient?
    // explicit update, instead of discarding the result of `+`
    for ((x, y) <- list) { table(x) = y }
    table
  }

  // Returns text with every recognized token replaced by its table entry.
  def recode(text: String): String =
  {
    val len = text.length
    val matcher = regex.pattern.matcher(text)
    val result = new StringBuilder(len)
    var i = 0
    while (i < len) {
      val c = text(i)
      if (min <= c && c <= max) {
        // The result of lookingAt is not checked: the total pattern ends
        // with a single-character catch-all, and the region is non-empty.
        matcher.region(i, len)
        matcher.lookingAt
        val x = matcher.group
        result.append(table.get(x) getOrElse x)
        i = matcher.end
      }
      else { result.append(c); i += 1 }
    }
    result.toString
  }
}
27924 | 85 |
|
27918 | 86 |
|
27923
7ebe9d38743a
use scala.collection.jcl.HashMap, which seems to be more efficient;
wenzelm
parents:
27918
diff
changeset
|
87 |
|
27927 | 88 |
/** Symbol interpretation **/ |
89 |
||
29569
f3f529b5d8fb
more general init of Symbol.Interpretation, independent of IsabelleSystem instance;
wenzelm
parents:
29174
diff
changeset
|
90 |
// Interpretation of symbols, built from declaration lines.  Each relevant
// line consists of whitespace-separated tokens: the symbol itself,
// followed by pairs of a key token ending in ':' and a value token.  The
// "code" property is required; it is parsed by Integer.decode and taken
// as the code point of the character that the symbol stands for.
class Interpretation(symbol_decls: Iterator[String])
{
  /* read symbols */

  // a line that is blank or holds only a '#' comment
  private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
  // a property key token: some text followed by a final ':'
  private val key = new Regex("""(?xs) (.+): """)

  // Splits one declaration into the symbol and its property map; fails
  // with "Bad symbol declaration" if the properties do not come in
  // key/value pairs.
  private def read_decl(decl: String): (String, Map[String, String]) =
  {
    def err() = error("Bad symbol declaration: " + decl)

    def read_props(props: List[String]): Map[String, String] =
    {
      props match {
        case Nil => Map()
        case _ :: Nil => err()
        // the pair is added after the rest, so the first occurrence of a
        // repeated key takes precedence
        case key(x) :: y :: rest => read_props(rest) + (x -> y)
        case _ => err()
      }
    }
    decl.split("\\s+").toList match {
      case Nil => err()
      case sym :: props => (sym, read_props(props))
    }
  }

  private val symbols: List[(String, Map[String, String])] =
    for (decl <- symbol_decls.toList if !empty.pattern.matcher(decl).matches)
      yield read_decl(decl)


  /* main recoder methods */

  private val (decoder, encoder) =
  {
    val mapping =
      for {
        (sym, props) <- symbols
        val ch =
          // The conversion to characters happens inside the try, so that a
          // number that is not a valid code point is reported as a bad
          // code as well, instead of escaping as a raw exception.
          try { new String(Character.toChars(Integer.decode(props("code")).intValue)) }
          catch {
            case _: NoSuchElementException => error("Missing code for symbol " + sym)
            case _: NumberFormatException => error("Bad code for symbol " + sym)
            case _: IllegalArgumentException => error("Bad code for symbol " + sym)
          }
      } yield (sym, ch)
    // decoder: accepts each symbol both as declared and with one extra
    // leading backslash; encoder: the inverse direction
    (new Recoder(mapping ++ (for ((x, y) <- mapping) yield ("\\" + x, y))),
     new Recoder(for ((x, y) <- mapping) yield (y, x)))
  }

  // Replaces declared symbols in text by their characters.
  def decode(text: String): String = decoder.recode(text)
  // Replaces characters in text by their declared symbols.
  def encode(text: String): String = encoder.recode(text)
}
27901 | 144 |
} |