src/Pure/General/symbol.scala
author wenzelm
Sat Sep 17 16:19:40 2011 +0200 (2011-09-17)
changeset 44949 b49d7f1066c8
parent 44238 36120feb70ed
child 44992 aa34d2d049ce
permissions -rw-r--r--
Symbol.explode as in ML;
     1 /*  Title:      Pure/General/symbol.scala
     2     Author:     Makarius
     3 
     4 Detecting and recoding Isabelle symbols.
     5 */
     6 
     7 package isabelle
     8 
     9 import scala.io.Source
    10 import scala.collection.mutable
    11 import scala.util.matching.Regex
    12 
    13 
    14 object Symbol
    15 {
    16   type Symbol = String
    17 
    18 
    19   /* spaces */
    20 
    21   val spc = ' '
    22   val space = " "
    23 
    24   private val static_spaces = space * 4000
    25 
    26   def spaces(k: Int): String =
    27   {
    28     require(k >= 0)
    29     if (k < static_spaces.length) static_spaces.substring(0, k)
    30     else space * k
    31   }
    32 
    33 
    34   /* ASCII characters */
    35 
    36   def is_ascii_letter(c: Char): Boolean = 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
    37   def is_ascii_digit(c: Char): Boolean = '0' <= c && c <= '9'
    38   def is_ascii_quasi(c: Char): Boolean = c == '_' || c == '\''
    39 
    40   def is_ascii_letdig(c: Char): Boolean =
    41     is_ascii_letter(c) || is_ascii_digit(c) || is_ascii_quasi(c)
    42 
    43   def is_ascii_identifier(s: String): Boolean =
    44     s.length > 0 && is_ascii_letter(s(0)) && s.substring(1).forall(is_ascii_letdig)
    45 
    46 
    47   /* Symbol regexps */
    48 
    49   private val plain = new Regex("""(?xs)
    50       [^\r\\\ud800-\udfff\ufffd] | [\ud800-\udbff][\udc00-\udfff] """)
    51 
    52   private val physical_newline = new Regex("""(?xs) \n | \r\n | \r """)
    53 
    54   private val symbol = new Regex("""(?xs)
    55       \\ < (?:
    56       \^? [A-Za-z][A-Za-z0-9_']* |
    57       \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")
    58 
    59   private val malformed_symbol = new Regex("(?xs) (?!" + symbol + ")" +
    60     """ [\ud800-\udbff\ufffd] | \\<\^? """)
    61 
    62   val regex_total =
    63     new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + malformed_symbol + "| .")
    64 
    65 
    66   /* basic matching */
    67 
    68   def is_plain(c: Char): Boolean = !(c == '\r' || c == '\\' || '\ud800' <= c && c <= '\udfff')
    69 
    70   def is_physical_newline(s: Symbol): Boolean =
    71     s == "\n" || s == "\r" || s == "\r\n"
    72 
    73   def is_malformed(s: Symbol): Boolean =
    74     !(s.length == 1 && is_plain(s(0))) && malformed_symbol.pattern.matcher(s).matches
    75 
    76   class Matcher(text: CharSequence)
    77   {
    78     private val matcher = regex_total.pattern.matcher(text)
    79     def apply(start: Int, end: Int): Int =
    80     {
    81       require(0 <= start && start < end && end <= text.length)
    82       if (is_plain(text.charAt(start))) 1
    83       else {
    84         matcher.region(start, end).lookingAt
    85         matcher.group.length
    86       }
    87     }
    88   }
    89 
    90 
    91   /* iterator */
    92 
    93   private val char_symbols: Array[Symbol] =
    94     (0 until 256).iterator.map(i => new String(Array(i.toChar))).toArray
    95 
    96   def iterator(text: CharSequence): Iterator[Symbol] =
    97     new Iterator[Symbol]
    98     {
    99       private val matcher = new Matcher(text)
   100       private var i = 0
   101       def hasNext = i < text.length
   102       def next =
   103       {
   104         val n = matcher(i, text.length)
   105         val s =
   106           if (n == 0) ""
   107           else if (n == 1) {
   108             val c = text.charAt(i)
   109             if (c < char_symbols.length) char_symbols(c)
   110             else text.subSequence(i, i + n).toString
   111           }
   112           else text.subSequence(i, i + n).toString
   113         i += n
   114         s
   115       }
   116     }
   117 
   118   def explode(text: CharSequence): List[Symbol] = iterator(text).toList
   119 
   120 
   121   /* decoding offsets */
   122 
   123   class Index(text: CharSequence)
   124   {
   125     sealed case class Entry(chr: Int, sym: Int)
   126     val index: Array[Entry] =
   127     {
   128       val matcher = new Matcher(text)
   129       val buf = new mutable.ArrayBuffer[Entry]
   130       var chr = 0
   131       var sym = 0
   132       while (chr < text.length) {
   133         val n = matcher(chr, text.length)
   134         chr += n
   135         sym += 1
   136         if (n > 1) buf += Entry(chr, sym)
   137       }
   138       buf.toArray
   139     }
   140     def decode(sym1: Int): Int =
   141     {
   142       val sym = sym1 - 1
   143       val end = index.length
   144       def bisect(a: Int, b: Int): Int =
   145       {
   146         if (a < b) {
   147           val c = (a + b) / 2
   148           if (sym < index(c).sym) bisect(a, c)
   149           else if (c + 1 == end || sym < index(c + 1).sym) c
   150           else bisect(c + 1, b)
   151         }
   152         else -1
   153       }
   154       val i = bisect(0, end)
   155       if (i < 0) sym
   156       else index(i).chr + sym - index(i).sym
   157     }
   158     def decode(range: Text.Range): Text.Range = range.map(decode(_))
   159   }
   160 
   161 
   162   /* recoding text */
   163 
   164   private class Recoder(list: List[(String, String)])
   165   {
   166     private val (min, max) =
   167     {
   168       var min = '\uffff'
   169       var max = '\u0000'
   170       for ((x, _) <- list) {
   171         val c = x(0)
   172         if (c < min) min = c
   173         if (c > max) max = c
   174       }
   175       (min, max)
   176     }
   177     private val table =
   178     {
   179       var tab = Map[String, String]()
   180       for ((x, y) <- list) {
   181         tab.get(x) match {
   182           case None => tab += (x -> y)
   183           case Some(z) =>
   184             error("Duplicate mapping of " + quote(x) + " to " + quote(y) + " vs. " + quote(z))
   185         }
   186       }
   187       tab
   188     }
   189     def recode(text: String): String =
   190     {
   191       val len = text.length
   192       val matcher = regex_total.pattern.matcher(text)
   193       val result = new StringBuilder(len)
   194       var i = 0
   195       while (i < len) {
   196         val c = text(i)
   197         if (min <= c && c <= max) {
   198           matcher.region(i, len).lookingAt
   199           val x = matcher.group
   200           result.append(table.get(x) getOrElse x)
   201           i = matcher.end
   202         }
   203         else { result.append(c); i += 1 }
   204       }
   205       result.toString
   206     }
   207   }
   208 
   209 
   210 
   211   /** symbol interpretation **/
   212 
   213   private lazy val symbols =
   214     new Interpretation(
   215       Isabelle_System.try_read(Path.split(Isabelle_System.getenv_strict("ISABELLE_SYMBOLS"))))
   216 
   217   private class Interpretation(symbols_spec: String)
   218   {
   219     /* read symbols */
   220 
   221     private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
   222     private val key = new Regex("""(?xs) (.+): """)
   223 
   224     private def read_decl(decl: String): (Symbol, Map[String, String]) =
   225     {
   226       def err() = error("Bad symbol declaration: " + decl)
   227 
   228       def read_props(props: List[String]): Map[String, String] =
   229       {
   230         props match {
   231           case Nil => Map()
   232           case _ :: Nil => err()
   233           case key(x) :: y :: rest => read_props(rest) + (x -> y)
   234           case _ => err()
   235         }
   236       }
   237       decl.split("\\s+").toList match {
   238         case sym :: props if sym.length > 1 && !is_malformed(sym) => (sym, read_props(props))
   239         case _ => err()
   240       }
   241     }
   242 
   243     private val symbols: List[(Symbol, Map[String, String])] =
   244       Map((
   245         for (decl <- split_lines(symbols_spec) if !empty.pattern.matcher(decl).matches)
   246           yield read_decl(decl)): _*) toList
   247 
   248 
   249     /* misc properties */
   250 
   251     val names: Map[Symbol, String] =
   252     {
   253       val name = new Regex("""\\<\^?([A-Za-z][A-Za-z0-9_']*)>""")
   254       Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*)
   255     }
   256 
   257     val abbrevs: Map[Symbol, String] =
   258       Map((
   259         for ((sym, props) <- symbols if props.isDefinedAt("abbrev"))
   260           yield (sym -> props("abbrev"))): _*)
   261 
   262 
   263     /* recoding */
   264 
   265     private val (decoder, encoder) =
   266     {
   267       val mapping =
   268         for {
   269           (sym, props) <- symbols
   270           val code =
   271             try { Integer.decode(props("code")).intValue }
   272             catch {
   273               case _: NoSuchElementException => error("Missing code for symbol " + sym)
   274               case _: NumberFormatException => error("Bad code for symbol " + sym)
   275             }
   276           val ch = new String(Character.toChars(code))
   277         } yield {
   278           if (code < 128) error("Illegal ASCII code for symbol " + sym)
   279           else (sym, ch)
   280         }
   281       (new Recoder(mapping),
   282        new Recoder(mapping map { case (x, y) => (y, x) }))
   283     }
   284 
   285     def decode(text: String): String = decoder.recode(text)
   286     def encode(text: String): String = encoder.recode(text)
   287 
   288     private def recode_set(elems: String*): Set[String] =
   289     {
   290       val content = elems.toList
   291       Set((content ::: content.map(decode)): _*)
   292     }
   293 
   294     private def recode_map[A](elems: (String, A)*): Map[String, A] =
   295     {
   296       val content = elems.toList
   297       Map((content ::: content.map({ case (sym, a) => (decode(sym), a) })): _*)
   298     }
   299 
   300 
   301     /* user fonts */
   302 
   303     val fonts: Map[Symbol, String] =
   304       recode_map((
   305         for ((sym, props) <- symbols if props.isDefinedAt("font"))
   306           yield (sym -> props("font"))): _*)
   307 
   308     val font_names: List[String] = Set(fonts.toList.map(_._2): _*).toList
   309     val font_index: Map[String, Int] = Map((font_names zip (0 until font_names.length).toList): _*)
   310 
   311 
   312     /* classification */
   313 
   314     val letters = recode_set(
   315       "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
   316       "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
   317       "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
   318       "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
   319 
   320       "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>",
   321       "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>",
   322       "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>",
   323       "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>",
   324       "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>",
   325       "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>",
   326       "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>",
   327       "\\<x>", "\\<y>", "\\<z>",
   328 
   329       "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>",
   330       "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>",
   331       "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>",
   332       "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>",
   333       "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>",
   334       "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>",
   335       "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>",
   336       "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>",
   337       "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>",
   338 
   339       "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>",
   340       "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>",
   341       "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>",
   342       "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>",
   343       "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>",
   344       "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>",
   345       "\\<Psi>", "\\<Omega>",
   346 
   347       "\\<^isub>", "\\<^isup>")
   348 
   349     val blanks =
   350       recode_set(space, "\t", "\n", "\u000B", "\f", "\r", "\\<spacespace>", "\\<^newline>")
   351 
   352     val sym_chars =
   353       Set("!", "#", "$", "%", "&", "*", "+", "-", "/", "<", "=", ">", "?", "@", "^", "_", "|", "~")
   354 
   355 
   356     /* control symbols */
   357 
   358     val ctrl_decoded: Set[Symbol] =
   359       Set((for ((sym, _) <- symbols if sym.startsWith("\\<^")) yield decode(sym)): _*)
   360 
   361     val sub_decoded = decode("\\<^sub>")
   362     val sup_decoded = decode("\\<^sup>")
   363     val isub_decoded = decode("\\<^isub>")
   364     val isup_decoded = decode("\\<^isup>")
   365     val bsub_decoded = decode("\\<^bsub>")
   366     val esub_decoded = decode("\\<^esub>")
   367     val bsup_decoded = decode("\\<^bsup>")
   368     val esup_decoded = decode("\\<^esup>")
   369     val bold_decoded = decode("\\<^bold>")
   370   }
   371 
   372 
   373   /* tables */
   374 
   375   def names: Map[Symbol, String] = symbols.names
   376   def abbrevs: Map[Symbol, String] = symbols.abbrevs
   377 
   378   def decode(text: String): String = symbols.decode(text)
   379   def encode(text: String): String = symbols.encode(text)
   380 
   381   def fonts: Map[Symbol, String] = symbols.fonts
   382   def font_names: List[String] = symbols.font_names
   383   def font_index: Map[String, Int] = symbols.font_index
   384   def lookup_font(sym: Symbol): Option[Int] = symbols.fonts.get(sym).map(font_index(_))
   385 
   386 
   387   /* classification */
   388 
   389   def is_letter(sym: Symbol): Boolean = symbols.letters.contains(sym)
   390   def is_digit(sym: Symbol): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9'
   391   def is_quasi(sym: Symbol): Boolean = sym == "_" || sym == "'"
   392   def is_letdig(sym: Symbol): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym)
   393   def is_blank(sym: Symbol): Boolean = symbols.blanks.contains(sym)
   394   def is_symbolic_char(sym: Symbol): Boolean = symbols.sym_chars.contains(sym)
   395   def is_symbolic(sym: Symbol): Boolean =
   396     sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^")
   397 
   398 
   399   /* control symbols */
   400 
   401   def is_ctrl(sym: Symbol): Boolean =
   402     sym.startsWith("\\<^") || symbols.ctrl_decoded.contains(sym)
   403 
   404   def is_controllable(sym: Symbol): Boolean =
   405     !is_blank(sym) && !is_ctrl(sym) && !is_malformed(sym)
   406 
   407   def sub_decoded: Symbol = symbols.sub_decoded
   408   def sup_decoded: Symbol = symbols.sup_decoded
   409   def isub_decoded: Symbol = symbols.isub_decoded
   410   def isup_decoded: Symbol = symbols.isup_decoded
   411   def bsub_decoded: Symbol = symbols.bsub_decoded
   412   def esub_decoded: Symbol = symbols.esub_decoded
   413   def bsup_decoded: Symbol = symbols.bsup_decoded
   414   def esup_decoded: Symbol = symbols.esup_decoded
   415   def bold_decoded: Symbol = symbols.bold_decoded
   416 }