src/Pure/General/symbol.scala
author wenzelm
Wed Aug 17 16:01:27 2011 +0200 (2011-08-17)
changeset 44238 36120feb70ed
parent 44181 bbce0417236d
child 44949 b49d7f1066c8
permissions -rw-r--r--
some convenience actions/shortcuts for control symbols;
     1 /*  Title:      Pure/General/symbol.scala
     2     Author:     Makarius
     3 
     4 Detecting and recoding Isabelle symbols.
     5 */
     6 
     7 package isabelle
     8 
     9 import scala.io.Source
    10 import scala.collection.mutable
    11 import scala.util.matching.Regex
    12 
    13 
    14 object Symbol
    15 {
    16   type Symbol = String
    17 
    18 
    19   /* spaces */
    20 
    21   val spc = ' '
    22   val space = " "
    23 
    24   private val static_spaces = space * 4000
    25 
    26   def spaces(k: Int): String =
    27   {
    28     require(k >= 0)
    29     if (k < static_spaces.length) static_spaces.substring(0, k)
    30     else space * k
    31   }
    32 
    33 
    34   /* ASCII characters */
    35 
    36   def is_ascii_letter(c: Char): Boolean = 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
    37   def is_ascii_digit(c: Char): Boolean = '0' <= c && c <= '9'
    38   def is_ascii_quasi(c: Char): Boolean = c == '_' || c == '\''
    39 
    40   def is_ascii_letdig(c: Char): Boolean =
    41     is_ascii_letter(c) || is_ascii_digit(c) || is_ascii_quasi(c)
    42 
    43   def is_ascii_identifier(s: String): Boolean =
    44     s.length > 0 && is_ascii_letter(s(0)) && s.substring(1).forall(is_ascii_letdig)
    45 
    46 
    47   /* Symbol regexps */
    48 
    49   private val plain = new Regex("""(?xs)
    50       [^\r\\\ud800-\udfff\ufffd] | [\ud800-\udbff][\udc00-\udfff] """)
    51 
    52   private val physical_newline = new Regex("""(?xs) \n | \r\n | \r """)
    53 
    54   private val symbol = new Regex("""(?xs)
    55       \\ < (?:
    56       \^? [A-Za-z][A-Za-z0-9_']* |
    57       \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")
    58 
    59   private val malformed_symbol = new Regex("(?xs) (?!" + symbol + ")" +
    60     """ [\ud800-\udbff\ufffd] | \\<\^? """)
    61 
    62   val regex_total =
    63     new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + malformed_symbol + "| .")
    64 
    65 
    66   /* basic matching */
    67 
    68   def is_plain(c: Char): Boolean = !(c == '\r' || c == '\\' || '\ud800' <= c && c <= '\udfff')
    69 
    70   def is_physical_newline(s: Symbol): Boolean =
    71     s == "\n" || s == "\r" || s == "\r\n"
    72 
    73   def is_malformed(s: Symbol): Boolean =
    74     !(s.length == 1 && is_plain(s(0))) && malformed_symbol.pattern.matcher(s).matches
    75 
    76   class Matcher(text: CharSequence)
    77   {
    78     private val matcher = regex_total.pattern.matcher(text)
    79     def apply(start: Int, end: Int): Int =
    80     {
    81       require(0 <= start && start < end && end <= text.length)
    82       if (is_plain(text.charAt(start))) 1
    83       else {
    84         matcher.region(start, end).lookingAt
    85         matcher.group.length
    86       }
    87     }
    88   }
    89 
    90 
    91   /* iterator */
    92 
    93   private val char_symbols: Array[Symbol] =
    94     (0 until 256).iterator.map(i => new String(Array(i.toChar))).toArray
    95 
    96   def iterator(text: CharSequence): Iterator[Symbol] =
    97     new Iterator[Symbol]
    98     {
    99       private val matcher = new Matcher(text)
   100       private var i = 0
   101       def hasNext = i < text.length
   102       def next =
   103       {
   104         val n = matcher(i, text.length)
   105         val s =
   106           if (n == 0) ""
   107           else if (n == 1) {
   108             val c = text.charAt(i)
   109             if (c < char_symbols.length) char_symbols(c)
   110             else text.subSequence(i, i + n).toString
   111           }
   112           else text.subSequence(i, i + n).toString
   113         i += n
   114         s
   115       }
   116     }
   117 
   118 
   119   /* decoding offsets */
   120 
   121   class Index(text: CharSequence)
   122   {
   123     sealed case class Entry(chr: Int, sym: Int)
   124     val index: Array[Entry] =
   125     {
   126       val matcher = new Matcher(text)
   127       val buf = new mutable.ArrayBuffer[Entry]
   128       var chr = 0
   129       var sym = 0
   130       while (chr < text.length) {
   131         val n = matcher(chr, text.length)
   132         chr += n
   133         sym += 1
   134         if (n > 1) buf += Entry(chr, sym)
   135       }
   136       buf.toArray
   137     }
   138     def decode(sym1: Int): Int =
   139     {
   140       val sym = sym1 - 1
   141       val end = index.length
   142       def bisect(a: Int, b: Int): Int =
   143       {
   144         if (a < b) {
   145           val c = (a + b) / 2
   146           if (sym < index(c).sym) bisect(a, c)
   147           else if (c + 1 == end || sym < index(c + 1).sym) c
   148           else bisect(c + 1, b)
   149         }
   150         else -1
   151       }
   152       val i = bisect(0, end)
   153       if (i < 0) sym
   154       else index(i).chr + sym - index(i).sym
   155     }
   156     def decode(range: Text.Range): Text.Range = range.map(decode(_))
   157   }
   158 
   159 
   160   /* recoding text */
   161 
   162   private class Recoder(list: List[(String, String)])
   163   {
   164     private val (min, max) =
   165     {
   166       var min = '\uffff'
   167       var max = '\u0000'
   168       for ((x, _) <- list) {
   169         val c = x(0)
   170         if (c < min) min = c
   171         if (c > max) max = c
   172       }
   173       (min, max)
   174     }
   175     private val table =
   176     {
   177       var tab = Map[String, String]()
   178       for ((x, y) <- list) {
   179         tab.get(x) match {
   180           case None => tab += (x -> y)
   181           case Some(z) =>
   182             error("Duplicate mapping of " + quote(x) + " to " + quote(y) + " vs. " + quote(z))
   183         }
   184       }
   185       tab
   186     }
   187     def recode(text: String): String =
   188     {
   189       val len = text.length
   190       val matcher = regex_total.pattern.matcher(text)
   191       val result = new StringBuilder(len)
   192       var i = 0
   193       while (i < len) {
   194         val c = text(i)
   195         if (min <= c && c <= max) {
   196           matcher.region(i, len).lookingAt
   197           val x = matcher.group
   198           result.append(table.get(x) getOrElse x)
   199           i = matcher.end
   200         }
   201         else { result.append(c); i += 1 }
   202       }
   203       result.toString
   204     }
   205   }
   206 
   207 
   208 
   209   /** symbol interpretation **/
   210 
   211   private lazy val symbols =
   212     new Interpretation(
   213       Isabelle_System.try_read(Path.split(Isabelle_System.getenv_strict("ISABELLE_SYMBOLS"))))
   214 
   215   private class Interpretation(symbols_spec: String)
   216   {
   217     /* read symbols */
   218 
   219     private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
   220     private val key = new Regex("""(?xs) (.+): """)
   221 
   222     private def read_decl(decl: String): (Symbol, Map[String, String]) =
   223     {
   224       def err() = error("Bad symbol declaration: " + decl)
   225 
   226       def read_props(props: List[String]): Map[String, String] =
   227       {
   228         props match {
   229           case Nil => Map()
   230           case _ :: Nil => err()
   231           case key(x) :: y :: rest => read_props(rest) + (x -> y)
   232           case _ => err()
   233         }
   234       }
   235       decl.split("\\s+").toList match {
   236         case sym :: props if sym.length > 1 && !is_malformed(sym) => (sym, read_props(props))
   237         case _ => err()
   238       }
   239     }
   240 
   241     private val symbols: List[(Symbol, Map[String, String])] =
   242       Map((
   243         for (decl <- split_lines(symbols_spec) if !empty.pattern.matcher(decl).matches)
   244           yield read_decl(decl)): _*) toList
   245 
   246 
   247     /* misc properties */
   248 
   249     val names: Map[Symbol, String] =
   250     {
   251       val name = new Regex("""\\<\^?([A-Za-z][A-Za-z0-9_']*)>""")
   252       Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*)
   253     }
   254 
   255     val abbrevs: Map[Symbol, String] =
   256       Map((
   257         for ((sym, props) <- symbols if props.isDefinedAt("abbrev"))
   258           yield (sym -> props("abbrev"))): _*)
   259 
   260 
   261     /* recoding */
   262 
   263     private val (decoder, encoder) =
   264     {
   265       val mapping =
   266         for {
   267           (sym, props) <- symbols
   268           val code =
   269             try { Integer.decode(props("code")).intValue }
   270             catch {
   271               case _: NoSuchElementException => error("Missing code for symbol " + sym)
   272               case _: NumberFormatException => error("Bad code for symbol " + sym)
   273             }
   274           val ch = new String(Character.toChars(code))
   275         } yield {
   276           if (code < 128) error("Illegal ASCII code for symbol " + sym)
   277           else (sym, ch)
   278         }
   279       (new Recoder(mapping),
   280        new Recoder(mapping map { case (x, y) => (y, x) }))
   281     }
   282 
   283     def decode(text: String): String = decoder.recode(text)
   284     def encode(text: String): String = encoder.recode(text)
   285 
   286     private def recode_set(elems: String*): Set[String] =
   287     {
   288       val content = elems.toList
   289       Set((content ::: content.map(decode)): _*)
   290     }
   291 
   292     private def recode_map[A](elems: (String, A)*): Map[String, A] =
   293     {
   294       val content = elems.toList
   295       Map((content ::: content.map({ case (sym, a) => (decode(sym), a) })): _*)
   296     }
   297 
   298 
   299     /* user fonts */
   300 
   301     val fonts: Map[Symbol, String] =
   302       recode_map((
   303         for ((sym, props) <- symbols if props.isDefinedAt("font"))
   304           yield (sym -> props("font"))): _*)
   305 
   306     val font_names: List[String] = Set(fonts.toList.map(_._2): _*).toList
   307     val font_index: Map[String, Int] = Map((font_names zip (0 until font_names.length).toList): _*)
   308 
   309 
   310     /* classification */
   311 
   312     val letters = recode_set(
   313       "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
   314       "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
   315       "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
   316       "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
   317 
   318       "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>",
   319       "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>",
   320       "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>",
   321       "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>",
   322       "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>",
   323       "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>",
   324       "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>",
   325       "\\<x>", "\\<y>", "\\<z>",
   326 
   327       "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>",
   328       "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>",
   329       "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>",
   330       "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>",
   331       "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>",
   332       "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>",
   333       "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>",
   334       "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>",
   335       "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>",
   336 
   337       "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>",
   338       "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>",
   339       "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>",
   340       "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>",
   341       "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>",
   342       "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>",
   343       "\\<Psi>", "\\<Omega>",
   344 
   345       "\\<^isub>", "\\<^isup>")
   346 
   347     val blanks =
   348       recode_set(space, "\t", "\n", "\u000B", "\f", "\r", "\\<spacespace>", "\\<^newline>")
   349 
   350     val sym_chars =
   351       Set("!", "#", "$", "%", "&", "*", "+", "-", "/", "<", "=", ">", "?", "@", "^", "_", "|", "~")
   352 
   353 
   354     /* control symbols */
   355 
   356     val ctrl_decoded: Set[Symbol] =
   357       Set((for ((sym, _) <- symbols if sym.startsWith("\\<^")) yield decode(sym)): _*)
   358 
   359     val sub_decoded = decode("\\<^sub>")
   360     val sup_decoded = decode("\\<^sup>")
   361     val isub_decoded = decode("\\<^isub>")
   362     val isup_decoded = decode("\\<^isup>")
   363     val bsub_decoded = decode("\\<^bsub>")
   364     val esub_decoded = decode("\\<^esub>")
   365     val bsup_decoded = decode("\\<^bsup>")
   366     val esup_decoded = decode("\\<^esup>")
   367     val bold_decoded = decode("\\<^bold>")
   368   }
   369 
   370 
   371   /* tables */
   372 
   373   def names: Map[Symbol, String] = symbols.names
   374   def abbrevs: Map[Symbol, String] = symbols.abbrevs
   375 
   376   def decode(text: String): String = symbols.decode(text)
   377   def encode(text: String): String = symbols.encode(text)
   378 
   379   def fonts: Map[Symbol, String] = symbols.fonts
   380   def font_names: List[String] = symbols.font_names
   381   def font_index: Map[String, Int] = symbols.font_index
   382   def lookup_font(sym: Symbol): Option[Int] = symbols.fonts.get(sym).map(font_index(_))
   383 
   384 
   385   /* classification */
   386 
   387   def is_letter(sym: Symbol): Boolean = symbols.letters.contains(sym)
   388   def is_digit(sym: Symbol): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9'
   389   def is_quasi(sym: Symbol): Boolean = sym == "_" || sym == "'"
   390   def is_letdig(sym: Symbol): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym)
   391   def is_blank(sym: Symbol): Boolean = symbols.blanks.contains(sym)
   392   def is_symbolic_char(sym: Symbol): Boolean = symbols.sym_chars.contains(sym)
   393   def is_symbolic(sym: Symbol): Boolean =
   394     sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^")
   395 
   396 
   397   /* control symbols */
   398 
   399   def is_ctrl(sym: Symbol): Boolean =
   400     sym.startsWith("\\<^") || symbols.ctrl_decoded.contains(sym)
   401 
   402   def is_controllable(sym: Symbol): Boolean =
   403     !is_blank(sym) && !is_ctrl(sym) && !is_malformed(sym)
   404 
   405   def sub_decoded: Symbol = symbols.sub_decoded
   406   def sup_decoded: Symbol = symbols.sup_decoded
   407   def isub_decoded: Symbol = symbols.isub_decoded
   408   def isup_decoded: Symbol = symbols.isup_decoded
   409   def bsub_decoded: Symbol = symbols.bsub_decoded
   410   def esub_decoded: Symbol = symbols.esub_decoded
   411   def bsup_decoded: Symbol = symbols.bsup_decoded
   412   def esup_decoded: Symbol = symbols.esup_decoded
   413   def bold_decoded: Symbol = symbols.bold_decoded
   414 }