src/Pure/General/symbol.scala
author wenzelm
Tue Dec 20 09:52:01 2016 +0100 (2016-12-20)
changeset 64612 08e4b1aeac50
parent 64477 8be21ca788ca
child 64615 fd0d6de380c6
permissions -rw-r--r--
tuned;
     1 /*  Title:      Pure/General/symbol.scala
     2     Author:     Makarius
     3 
     4 Detecting and recoding Isabelle symbols.
     5 */
     6 
     7 package isabelle
     8 
     9 
    10 import scala.collection.mutable
    11 import scala.util.matching.Regex
    12 import scala.annotation.tailrec
    13 
    14 
    15 object Symbol
    16 {
    17   type Symbol = String
    18 
    19   // counting Isabelle symbols, starting from 1
    20   type Offset = Text.Offset
    21   type Range = Text.Range
    22 
    23 
    24   /* spaces */
    25 
    26   val space = " "
    27 
    28   private val static_spaces = space * 4000
    29 
    30   def spaces(n: Int): String =
    31   {
    32     require(n >= 0)
    33     if (n < static_spaces.length) static_spaces.substring(0, n)
    34     else space * n
    35   }
    36 
    37 
    38   /* ASCII characters */
    39 
    40   def is_ascii_letter(c: Char): Boolean = 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
    41 
    42   def is_ascii_digit(c: Char): Boolean = '0' <= c && c <= '9'
    43 
    44   def is_ascii_hex(c: Char): Boolean =
    45     '0' <= c && c <= '9' || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f'
    46 
    47   def is_ascii_quasi(c: Char): Boolean = c == '_' || c == '\''
    48 
    49   def is_ascii_blank(c: Char): Boolean = " \t\n\u000b\f\r".contains(c)
    50 
    51   def is_ascii_letdig(c: Char): Boolean =
    52     is_ascii_letter(c) || is_ascii_digit(c) || is_ascii_quasi(c)
    53 
    54   def is_ascii_identifier(s: String): Boolean =
    55     s.length > 0 && is_ascii_letter(s(0)) && s.forall(is_ascii_letdig)
    56 
    57   def ascii(c: Char): Symbol =
    58   {
    59     if (c > 127) error("Non-ASCII character: " + quote(c.toString))
    60     else char_symbols(c.toInt)
    61   }
    62 
    63 
    64   /* symbol matching */
    65 
    66   private val symbol_total = new Regex("""(?xs)
    67     [\ud800-\udbff][\udc00-\udfff] | \r\n | \\ < \^? ([A-Za-z][A-Za-z0-9_']*)? >? | .""")
    68 
    69   private def is_plain(c: Char): Boolean =
    70     !(c == '\r' || c == '\\' || Character.isHighSurrogate(c))
    71 
    72   def is_malformed(s: Symbol): Boolean =
    73     s.length match {
    74       case 1 =>
    75         val c = s(0)
    76         Character.isHighSurrogate(c) || Character.isLowSurrogate(c) || c == '\ufffd'
    77       case 2 =>
    78         val c1 = s(0)
    79         val c2 = s(1)
    80         !(c1 == '\r' && c2 == '\n' || Character.isSurrogatePair(c1, c2))
    81       case _ => !s.endsWith(">") || s == "\\<>" || s == "\\<^>"
    82     }
    83 
    84   def is_newline(s: Symbol): Boolean =
    85     s == "\n" || s == "\r" || s == "\r\n"
    86 
    87   class Matcher(text: CharSequence)
    88   {
    89     private val matcher = symbol_total.pattern.matcher(text)
    90     def apply(start: Int, end: Int): Int =
    91     {
    92       require(0 <= start && start < end && end <= text.length)
    93       if (is_plain(text.charAt(start))) 1
    94       else {
    95         matcher.region(start, end).lookingAt
    96         matcher.group.length
    97       }
    98     }
    99   }
   100 
   101 
   102   /* iterator */
   103 
   104   private val char_symbols: Array[Symbol] =
   105     (0 until 256).iterator.map(i => new String(Array(i.toChar))).toArray
   106 
   107   def iterator(text: CharSequence): Iterator[Symbol] =
   108     new Iterator[Symbol]
   109     {
   110       private val matcher = new Matcher(text)
   111       private var i = 0
   112       def hasNext = i < text.length
   113       def next =
   114       {
   115         val n = matcher(i, text.length)
   116         val s =
   117           if (n == 0) ""
   118           else if (n == 1) {
   119             val c = text.charAt(i)
   120             if (c < char_symbols.length) char_symbols(c)
   121             else text.subSequence(i, i + n).toString
   122           }
   123           else text.subSequence(i, i + n).toString
   124         i += n
   125         s
   126       }
   127     }
   128 
   129   def explode(text: CharSequence): List[Symbol] = iterator(text).toList
   130 
   131 
   132   /* decoding offsets */
   133 
   134   object Index
   135   {
   136     private sealed case class Entry(chr: Int, sym: Int)
   137 
   138     val empty: Index = new Index(Nil)
   139 
   140     def apply(text: CharSequence): Index =
   141     {
   142       val matcher = new Matcher(text)
   143       val buf = new mutable.ListBuffer[Entry]
   144       var chr = 0
   145       var sym = 0
   146       while (chr < text.length) {
   147         val n = matcher(chr, text.length)
   148         chr += n
   149         sym += 1
   150         if (n > 1) buf += Entry(chr, sym)
   151       }
   152       if (buf.isEmpty) empty else new Index(buf.toList)
   153     }
   154   }
   155 
   156   final class Index private(entries: List[Index.Entry])
   157   {
   158     private val hash: Int = entries.hashCode
   159     private val index: Array[Index.Entry] = entries.toArray
   160 
   161     def decode(symbol_offset: Offset): Text.Offset =
   162     {
   163       val sym = symbol_offset - 1
   164       val end = index.length
   165       @tailrec def bisect(a: Int, b: Int): Int =
   166       {
   167         if (a < b) {
   168           val c = (a + b) / 2
   169           if (sym < index(c).sym) bisect(a, c)
   170           else if (c + 1 == end || sym < index(c + 1).sym) c
   171           else bisect(c + 1, b)
   172         }
   173         else -1
   174       }
   175       val i = bisect(0, end)
   176       if (i < 0) sym
   177       else index(i).chr + sym - index(i).sym
   178     }
   179     def decode(symbol_range: Range): Text.Range = symbol_range.map(decode(_))
   180 
   181     override def hashCode: Int = hash
   182     override def equals(that: Any): Boolean =
   183       that match {
   184         case other: Index => index.sameElements(other.index)
   185         case _ => false
   186       }
   187   }
   188 
   189 
   190   /* symbolic text chunks -- without actual text */
   191 
   192   object Text_Chunk
   193   {
   194     sealed abstract class Name
   195     case object Default extends Name
   196     case class Id(id: Document_ID.Generic) extends Name
   197     case class File(name: String) extends Name
   198 
   199     def apply(text: CharSequence): Text_Chunk =
   200       new Text_Chunk(Text.Range(0, text.length), Index(text))
   201   }
   202 
   203   final class Text_Chunk private(val range: Text.Range, private val index: Index)
   204   {
   205     override def hashCode: Int = (range, index).hashCode
   206     override def equals(that: Any): Boolean =
   207       that match {
   208         case other: Text_Chunk =>
   209           range == other.range &&
   210           index == other.index
   211         case _ => false
   212       }
   213 
   214     override def toString: String = "Text_Chunk" + range.toString
   215 
   216     def decode(symbol_offset: Offset): Text.Offset = index.decode(symbol_offset)
   217     def decode(symbol_range: Range): Text.Range = index.decode(symbol_range)
   218     def incorporate(symbol_range: Range): Option[Text.Range] =
   219     {
   220       def in(r: Range): Option[Text.Range] =
   221         range.try_restrict(decode(r)) match {
   222           case Some(r1) if !r1.is_singularity => Some(r1)
   223           case _ => None
   224         }
   225      in(symbol_range) orElse in(symbol_range - 1)
   226     }
   227   }
   228 
   229 
   230   /* recoding text */
   231 
   232   private class Recoder(list: List[(String, String)])
   233   {
   234     private val (min, max) =
   235     {
   236       var min = '\uffff'
   237       var max = '\u0000'
   238       for ((x, _) <- list) {
   239         val c = x(0)
   240         if (c < min) min = c
   241         if (c > max) max = c
   242       }
   243       (min, max)
   244     }
   245     private val table =
   246     {
   247       var tab = Map[String, String]()
   248       for ((x, y) <- list) {
   249         tab.get(x) match {
   250           case None => tab += (x -> y)
   251           case Some(z) =>
   252             error("Duplicate symbol mapping of " + quote(x) + " to " + quote(y) + " vs. " + quote(z))
   253         }
   254       }
   255       tab
   256     }
   257     def recode(text: String): String =
   258     {
   259       val len = text.length
   260       val matcher = symbol_total.pattern.matcher(text)
   261       val result = new StringBuilder(len)
   262       var i = 0
   263       while (i < len) {
   264         val c = text(i)
   265         if (min <= c && c <= max) {
   266           matcher.region(i, len).lookingAt
   267           val x = matcher.group
   268           result.append(table.getOrElse(x, x))
   269           i = matcher.end
   270         }
   271         else { result.append(c); i += 1 }
   272       }
   273       result.toString
   274     }
   275   }
   276 
   277 
   278 
   279   /** symbol interpretation **/
   280 
   281   private lazy val symbols =
   282   {
   283     val contents =
   284       for (path <- Path.split(Isabelle_System.getenv("ISABELLE_SYMBOLS")) if path.is_file)
   285         yield (File.read(path))
   286     new Interpretation(cat_lines(contents))
   287   }
   288 
   289   private class Interpretation(symbols_spec: String)
   290   {
   291     /* read symbols */
   292 
   293     private val No_Decl = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
   294     private val Key = new Regex("""(?xs) (.+): """)
   295 
   296     private def read_decl(decl: String): (Symbol, Properties.T) =
   297     {
   298       def err() = error("Bad symbol declaration: " + decl)
   299 
   300       def read_props(props: List[String]): Properties.T =
   301       {
   302         props match {
   303           case Nil => Nil
   304           case _ :: Nil => err()
   305           case Key(x) :: y :: rest => (x -> y.replace('\u2423', ' ')) :: read_props(rest)
   306           case _ => err()
   307         }
   308       }
   309       decl.split("\\s+").toList match {
   310         case sym :: props if sym.length > 1 && !is_malformed(sym) =>
   311           (sym, read_props(props))
   312         case _ => err()
   313       }
   314     }
   315 
   316     private val symbols: List[(Symbol, Properties.T)] =
   317       (((List.empty[(Symbol, Properties.T)], Set.empty[Symbol]) /:
   318           split_lines(symbols_spec).reverse)
   319         { case (res, No_Decl()) => res
   320           case ((list, known), decl) =>
   321             val (sym, props) = read_decl(decl)
   322             if (known(sym)) (list, known)
   323             else ((sym, props) :: list, known + sym)
   324         })._1
   325 
   326 
   327     /* basic properties */
   328 
   329     val properties: Map[Symbol, Properties.T] = Map(symbols: _*)
   330 
   331     val names: Map[Symbol, String] =
   332     {
   333       val name = new Regex("""\\<\^?([A-Za-z][A-Za-z0-9_']*)>""")
   334       Map((for ((sym @ name(a), _) <- symbols) yield sym -> a): _*)
   335     }
   336 
   337     val groups: List[(String, List[Symbol])] =
   338       symbols.map({ case (sym, props) =>
   339         val gs = for (("group", g) <- props) yield g
   340         if (gs.isEmpty) List(sym -> "unsorted") else gs.map(sym -> _)
   341       }).flatten
   342         .groupBy(_._2).toList.map({ case (group, list) => (group, list.map(_._1)) })
   343         .sortBy(_._1)
   344 
   345     val abbrevs: Multi_Map[Symbol, String] =
   346       Multi_Map((
   347         for {
   348           (sym, props) <- symbols
   349           ("abbrev", a) <- props.reverse
   350         } yield sym -> a): _*)
   351 
   352     val codes: List[(String, Int)] =
   353     {
   354       val Code = new Properties.String("code")
   355       for {
   356         (sym, props) <- symbols
   357         code =
   358           props match {
   359             case Code(s) =>
   360               try { Integer.decode(s).intValue }
   361               catch { case _: NumberFormatException => error("Bad code for symbol " + sym) }
   362             case _ => error("Missing code for symbol " + sym)
   363           }
   364       } yield {
   365         if (code < 128) error("Illegal ASCII code for symbol " + sym)
   366         else (sym, code)
   367       }
   368     }
   369 
   370 
   371     /* recoding */
   372 
   373     private val (decoder, encoder) =
   374     {
   375       val mapping =
   376         for ((sym, code) <- codes) yield (sym, new String(Character.toChars(code)))
   377       (new Recoder(mapping), new Recoder(for ((x, y) <- mapping) yield (y, x)))
   378     }
   379 
   380     def decode(text: String): String = decoder.recode(text)
   381     def encode(text: String): String = encoder.recode(text)
   382 
   383     private def recode_set(elems: String*): Set[String] =
   384     {
   385       val content = elems.toList
   386       Set((content ::: content.map(decode)): _*)
   387     }
   388 
   389     private def recode_map[A](elems: (String, A)*): Map[String, A] =
   390     {
   391       val content = elems.toList
   392       Map((content ::: content.map({ case (sym, a) => (decode(sym), a) })): _*)
   393     }
   394 
   395 
   396     /* user fonts */
   397 
   398     private val Font = new Properties.String("font")
   399     val fonts: Map[Symbol, String] =
   400       recode_map((for ((sym, Font(font)) <- symbols) yield sym -> font): _*)
   401 
   402     val font_names: List[String] = Set(fonts.toList.map(_._2): _*).toList
   403     val font_index: Map[String, Int] = Map((font_names zip (0 until font_names.length).toList): _*)
   404 
   405 
   406     /* classification */
   407 
   408     val letters = recode_set(
   409       "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
   410       "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
   411       "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
   412       "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
   413 
   414       "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>",
   415       "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>",
   416       "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>",
   417       "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>",
   418       "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>",
   419       "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>",
   420       "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>",
   421       "\\<x>", "\\<y>", "\\<z>",
   422 
   423       "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>",
   424       "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>",
   425       "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>",
   426       "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>",
   427       "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>",
   428       "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>",
   429       "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>",
   430       "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>",
   431       "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>",
   432 
   433       "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>",
   434       "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>",
   435       "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>",
   436       "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>",
   437       "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>",
   438       "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>",
   439       "\\<Psi>", "\\<Omega>")
   440 
   441     val blanks = recode_set(space, "\t", "\n", "\u000B", "\f", "\r", "\r\n")
   442 
   443     val sym_chars =
   444       Set("!", "#", "$", "%", "&", "*", "+", "-", "/", "<", "=", ">", "?", "@", "^", "_", "|", "~")
   445 
   446     val symbolic = recode_set((for { (sym, _) <- symbols; if raw_symbolic(sym) } yield sym): _*)
   447 
   448 
   449     /* misc symbols */
   450 
   451     val newline_decoded = decode(newline)
   452     val comment_decoded = decode(comment)
   453 
   454 
   455     /* cartouches */
   456 
   457     val open_decoded = decode(open)
   458     val close_decoded = decode(close)
   459 
   460 
   461     /* control symbols */
   462 
   463     val control_decoded: Set[Symbol] =
   464       Set((for ((sym, _) <- symbols if sym.startsWith("\\<^")) yield decode(sym)): _*)
   465 
   466     val sub_decoded = decode(sub)
   467     val sup_decoded = decode(sup)
   468     val bold_decoded = decode(bold)
   469     val emph_decoded = decode(emph)
   470     val bsub_decoded = decode("\\<^bsub>")
   471     val esub_decoded = decode("\\<^esub>")
   472     val bsup_decoded = decode("\\<^bsup>")
   473     val esup_decoded = decode("\\<^esup>")
   474   }
   475 
   476 
   477   /* tables */
   478 
   479   def properties: Map[Symbol, Properties.T] = symbols.properties
   480   def names: Map[Symbol, String] = symbols.names
   481   def groups: List[(String, List[Symbol])] = symbols.groups
   482   def abbrevs: Multi_Map[Symbol, String] = symbols.abbrevs
   483   def codes: List[(String, Int)] = symbols.codes
   484 
   485   def decode(text: String): String = symbols.decode(text)
   486   def encode(text: String): String = symbols.encode(text)
   487 
   488   def decode_string: XML.Decode.T[String] = (x => decode(XML.Decode.string(x)))
   489   def encode_string: XML.Encode.T[String] = (x => XML.Encode.string(encode(x)))
   490 
   491   def decode_strict(text: String): String =
   492   {
   493     val decoded = decode(text)
   494     if (encode(decoded) == text) decoded
   495     else {
   496       val bad = new mutable.ListBuffer[Symbol]
   497       for (s <- iterator(text) if encode(decode(s)) != s && !bad.contains(s))
   498         bad += s
   499       error("Bad Unicode symbols in text: " + commas_quote(bad))
   500     }
   501   }
   502 
   503   def fonts: Map[Symbol, String] = symbols.fonts
   504   def font_names: List[String] = symbols.font_names
   505   def font_index: Map[String, Int] = symbols.font_index
   506   def lookup_font(sym: Symbol): Option[Int] = symbols.fonts.get(sym).map(font_index(_))
   507 
   508 
   509   /* classification */
   510 
   511   def is_letter(sym: Symbol): Boolean = symbols.letters.contains(sym)
   512   def is_digit(sym: Symbol): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9'
   513   def is_quasi(sym: Symbol): Boolean = sym == "_" || sym == "'"
   514   def is_letdig(sym: Symbol): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym)
   515   def is_blank(sym: Symbol): Boolean = symbols.blanks.contains(sym)
   516 
   517 
   518   /* misc symbols */
   519 
   520   val newline: Symbol = "\\<newline>"
   521   def newline_decoded: Symbol = symbols.newline_decoded
   522 
   523   def print_newlines(str: String): String =
   524     if (str.contains('\n'))
   525       (for (s <- iterator(str)) yield { if (s == "\n") newline_decoded else s }).mkString
   526     else str
   527 
   528   val comment: Symbol = "\\<comment>"
   529   def comment_decoded: Symbol = symbols.comment_decoded
   530 
   531 
   532   /* cartouches */
   533 
   534   val open: Symbol = "\\<open>"
   535   val close: Symbol = "\\<close>"
   536 
   537   def open_decoded: Symbol = symbols.open_decoded
   538   def close_decoded: Symbol = symbols.close_decoded
   539 
   540   def is_open(sym: Symbol): Boolean = sym == open_decoded || sym == open
   541   def is_close(sym: Symbol): Boolean = sym == close_decoded || sym == close
   542 
   543 
   544   /* symbols for symbolic identifiers */
   545 
   546   private def raw_symbolic(sym: Symbol): Boolean =
   547     sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^")
   548 
   549   def is_symbolic(sym: Symbol): Boolean =
   550     !is_open(sym) && !is_close(sym) && (raw_symbolic(sym) || symbols.symbolic.contains(sym))
   551 
   552   def is_symbolic_char(sym: Symbol): Boolean = symbols.sym_chars.contains(sym)
   553 
   554 
   555   /* control symbols */
   556 
   557   def is_control(sym: Symbol): Boolean =
   558     (sym.startsWith("\\<^") && sym.endsWith(">")) || symbols.control_decoded.contains(sym)
   559 
   560   def is_controllable(sym: Symbol): Boolean =
   561     !is_blank(sym) && !is_control(sym) && !is_open(sym) && !is_close(sym) && !is_malformed(sym)
   562 
   563   val sub = "\\<^sub>"
   564   val sup = "\\<^sup>"
   565   val bold = "\\<^bold>"
   566   val emph = "\\<^emph>"
   567 
   568   def sub_decoded: Symbol = symbols.sub_decoded
   569   def sup_decoded: Symbol = symbols.sup_decoded
   570   def bold_decoded: Symbol = symbols.bold_decoded
   571   def emph_decoded: Symbol = symbols.emph_decoded
   572   def bsub_decoded: Symbol = symbols.bsub_decoded
   573   def esub_decoded: Symbol = symbols.esub_decoded
   574   def bsup_decoded: Symbol = symbols.bsup_decoded
   575   def esup_decoded: Symbol = symbols.esup_decoded
   576 }