src/Pure/General/symbol.scala
author wenzelm
Sat Apr 01 19:17:15 2017 +0200 (2017-04-01)
changeset 65344 b99283eed13c
parent 65335 7634d33c1a79
child 65521 e307a781416a
permissions -rw-r--r--
clarified YXML vs. symbol encoding: operate on whole message;
     1 /*  Title:      Pure/General/symbol.scala
     2     Author:     Makarius
     3 
     4 Detecting and recoding Isabelle symbols.
     5 */
     6 
     7 package isabelle
     8 
     9 
    10 import scala.collection.mutable
    11 import scala.util.matching.Regex
    12 import scala.annotation.tailrec
    13 
    14 
    15 object Symbol
    16 {
    17   type Symbol = String
    18 
    19   // counting Isabelle symbols, starting from 1
    20   type Offset = Text.Offset
    21   type Range = Text.Range
    22 
    23 
    24   /* spaces */
    25 
    26   val space = " "
    27 
    28   private val static_spaces = space * 4000
    29 
    30   def spaces(n: Int): String =
    31   {
    32     require(n >= 0)
    33     if (n < static_spaces.length) static_spaces.substring(0, n)
    34     else space * n
    35   }
    36 
    37 
    38   /* ASCII characters */
    39 
    40   def is_ascii_letter(c: Char): Boolean = 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
    41 
    42   def is_ascii_digit(c: Char): Boolean = '0' <= c && c <= '9'
    43 
    44   def is_ascii_hex(c: Char): Boolean =
    45     '0' <= c && c <= '9' || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f'
    46 
    47   def is_ascii_quasi(c: Char): Boolean = c == '_' || c == '\''
    48 
    49   def is_ascii_blank(c: Char): Boolean = " \t\n\u000b\f\r".contains(c)
    50 
    51   def is_ascii_letdig(c: Char): Boolean =
    52     is_ascii_letter(c) || is_ascii_digit(c) || is_ascii_quasi(c)
    53 
    54   def is_ascii_identifier(s: String): Boolean =
    55     s.length > 0 && is_ascii_letter(s(0)) && s.forall(is_ascii_letdig)
    56 
    57   def ascii(c: Char): Symbol =
    58   {
    59     if (c > 127) error("Non-ASCII character: " + quote(c.toString))
    60     else char_symbols(c.toInt)
    61   }
    62 
    63 
    64   /* symbol matching */
    65 
    66   private val symbol_total = new Regex("""(?xs)
    67     [\ud800-\udbff][\udc00-\udfff] | \r\n | \\ < \^? ([A-Za-z][A-Za-z0-9_']*)? >? | .""")
    68 
    69   private def is_plain(c: Char): Boolean =
    70     !(c == '\r' || c == '\\' || Character.isHighSurrogate(c))
    71 
    72   def is_malformed(s: Symbol): Boolean =
    73     s.length match {
    74       case 1 =>
    75         val c = s(0)
    76         Character.isHighSurrogate(c) || Character.isLowSurrogate(c) || c == '\ufffd'
    77       case 2 =>
    78         val c1 = s(0)
    79         val c2 = s(1)
    80         !(c1 == '\r' && c2 == '\n' || Character.isSurrogatePair(c1, c2))
    81       case _ => !s.endsWith(">") || s == "\\<>" || s == "\\<^>"
    82     }
    83 
    84   def is_newline(s: Symbol): Boolean =
    85     s == "\n" || s == "\r" || s == "\r\n"
    86 
    87   class Matcher(text: CharSequence)
    88   {
    89     private val matcher = symbol_total.pattern.matcher(text)
    90     def apply(start: Int, end: Int): Int =
    91     {
    92       require(0 <= start && start < end && end <= text.length)
    93       if (is_plain(text.charAt(start))) 1
    94       else {
    95         matcher.region(start, end).lookingAt
    96         matcher.group.length
    97       }
    98     }
    99   }
   100 
   101 
   102   /* iterator */
   103 
   104   private val char_symbols: Array[Symbol] =
   105     (0 until 256).iterator.map(i => new String(Array(i.toChar))).toArray
   106 
   107   def iterator(text: CharSequence): Iterator[Symbol] =
   108     new Iterator[Symbol]
   109     {
   110       private val matcher = new Matcher(text)
   111       private var i = 0
   112       def hasNext = i < text.length
   113       def next =
   114       {
   115         val n = matcher(i, text.length)
   116         val s =
   117           if (n == 0) ""
   118           else if (n == 1) {
   119             val c = text.charAt(i)
   120             if (c < char_symbols.length) char_symbols(c)
   121             else text.subSequence(i, i + n).toString
   122           }
   123           else text.subSequence(i, i + n).toString
   124         i += n
   125         s
   126       }
   127     }
   128 
   129   def explode(text: CharSequence): List[Symbol] = iterator(text).toList
   130 
   131   def length(text: CharSequence): Int = iterator(text).length
   132 
   133 
   134   /* decoding offsets */
   135 
   136   object Index
   137   {
   138     private sealed case class Entry(chr: Int, sym: Int)
   139 
   140     val empty: Index = new Index(Nil)
   141 
   142     def apply(text: CharSequence): Index =
   143     {
   144       val matcher = new Matcher(text)
   145       val buf = new mutable.ListBuffer[Entry]
   146       var chr = 0
   147       var sym = 0
   148       while (chr < text.length) {
   149         val n = matcher(chr, text.length)
   150         chr += n
   151         sym += 1
   152         if (n > 1) buf += Entry(chr, sym)
   153       }
   154       if (buf.isEmpty) empty else new Index(buf.toList)
   155     }
   156   }
   157 
   158   final class Index private(entries: List[Index.Entry])
   159   {
   160     private val hash: Int = entries.hashCode
   161     private val index: Array[Index.Entry] = entries.toArray
   162 
   163     def decode(symbol_offset: Offset): Text.Offset =
   164     {
   165       val sym = symbol_offset - 1
   166       val end = index.length
   167       @tailrec def bisect(a: Int, b: Int): Int =
   168       {
   169         if (a < b) {
   170           val c = (a + b) / 2
   171           if (sym < index(c).sym) bisect(a, c)
   172           else if (c + 1 == end || sym < index(c + 1).sym) c
   173           else bisect(c + 1, b)
   174         }
   175         else -1
   176       }
   177       val i = bisect(0, end)
   178       if (i < 0) sym
   179       else index(i).chr + sym - index(i).sym
   180     }
   181     def decode(symbol_range: Range): Text.Range = symbol_range.map(decode(_))
   182 
   183     override def hashCode: Int = hash
   184     override def equals(that: Any): Boolean =
   185       that match {
   186         case other: Index => index.sameElements(other.index)
   187         case _ => false
   188       }
   189   }
   190 
   191 
   192   /* symbolic text chunks -- without actual text */
   193 
   194   object Text_Chunk
   195   {
   196     sealed abstract class Name
   197     case object Default extends Name
   198     case class Id(id: Document_ID.Generic) extends Name
   199     case class File(name: String) extends Name
   200 
   201     val encode_name: XML.Encode.T[Name] =
   202     {
   203       import XML.Encode._
   204       variant(List(
   205         { case Default => (Nil, Nil) },
   206         { case Id(a) => (List(long_atom(a)), Nil) },
   207         { case File(a) => (List(a), Nil) }))
   208     }
   209 
   210     val decode_name: XML.Decode.T[Name] =
   211     {
   212       import XML.Decode._
   213       variant(List(
   214         { case (Nil, Nil) => Default },
   215         { case (List(a), Nil) => Id(long_atom(a)) },
   216         { case (List(a), Nil) => File(a) }))
   217     }
   218 
   219 
   220     def apply(text: CharSequence): Text_Chunk =
   221       new Text_Chunk(Text.Range(0, text.length), Index(text))
   222   }
   223 
   224   final class Text_Chunk private(val range: Text.Range, private val index: Index)
   225   {
   226     override def hashCode: Int = (range, index).hashCode
   227     override def equals(that: Any): Boolean =
   228       that match {
   229         case other: Text_Chunk =>
   230           range == other.range &&
   231           index == other.index
   232         case _ => false
   233       }
   234 
   235     override def toString: String = "Text_Chunk" + range.toString
   236 
   237     def decode(symbol_offset: Offset): Text.Offset = index.decode(symbol_offset)
   238     def decode(symbol_range: Range): Text.Range = index.decode(symbol_range)
   239     def incorporate(symbol_range: Range): Option[Text.Range] =
   240     {
   241       def in(r: Range): Option[Text.Range] =
   242         range.try_restrict(decode(r)) match {
   243           case Some(r1) if !r1.is_singularity => Some(r1)
   244           case _ => None
   245         }
   246      in(symbol_range) orElse in(symbol_range - 1)
   247     }
   248   }
   249 
   250 
   251   /* recoding text */
   252 
   253   private class Recoder(list: List[(String, String)])
   254   {
   255     private val (min, max) =
   256     {
   257       var min = '\uffff'
   258       var max = '\u0000'
   259       for ((x, _) <- list) {
   260         val c = x(0)
   261         if (c < min) min = c
   262         if (c > max) max = c
   263       }
   264       (min, max)
   265     }
   266     private val table =
   267     {
   268       var tab = Map[String, String]()
   269       for ((x, y) <- list) {
   270         tab.get(x) match {
   271           case None => tab += (x -> y)
   272           case Some(z) =>
   273             error("Duplicate symbol mapping of " + quote(x) + " to " + quote(y) + " vs. " + quote(z))
   274         }
   275       }
   276       tab
   277     }
   278     def recode(text: String): String =
   279     {
   280       val len = text.length
   281       val matcher = symbol_total.pattern.matcher(text)
   282       val result = new StringBuilder(len)
   283       var i = 0
   284       while (i < len) {
   285         val c = text(i)
   286         if (min <= c && c <= max) {
   287           matcher.region(i, len).lookingAt
   288           val x = matcher.group
   289           result.append(table.getOrElse(x, x))
   290           i = matcher.end
   291         }
   292         else { result.append(c); i += 1 }
   293       }
   294       result.toString
   295     }
   296   }
   297 
   298 
   299 
   300   /** symbol interpretation **/
   301 
   302   private lazy val symbols =
   303   {
   304     val contents =
   305       for (path <- Path.split(Isabelle_System.getenv("ISABELLE_SYMBOLS")) if path.is_file)
   306         yield (File.read(path))
   307     new Interpretation(cat_lines(contents))
   308   }
   309 
   310   private class Interpretation(symbols_spec: String)
   311   {
   312     /* read symbols */
   313 
   314     private val No_Decl = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
   315     private val Key = new Regex("""(?xs) (.+): """)
   316 
   317     private def read_decl(decl: String): (Symbol, Properties.T) =
   318     {
   319       def err() = error("Bad symbol declaration: " + decl)
   320 
   321       def read_props(props: List[String]): Properties.T =
   322       {
   323         props match {
   324           case Nil => Nil
   325           case _ :: Nil => err()
   326           case Key(x) :: y :: rest => (x -> y.replace('\u2423', ' ')) :: read_props(rest)
   327           case _ => err()
   328         }
   329       }
   330       decl.split("\\s+").toList match {
   331         case sym :: props if sym.length > 1 && !is_malformed(sym) =>
   332           (sym, read_props(props))
   333         case _ => err()
   334       }
   335     }
   336 
   337     private val symbols: List[(Symbol, Properties.T)] =
   338       (((List.empty[(Symbol, Properties.T)], Set.empty[Symbol]) /:
   339           split_lines(symbols_spec).reverse)
   340         { case (res, No_Decl()) => res
   341           case ((list, known), decl) =>
   342             val (sym, props) = read_decl(decl)
   343             if (known(sym)) (list, known)
   344             else ((sym, props) :: list, known + sym)
   345         })._1
   346 
   347 
   348     /* basic properties */
   349 
   350     val properties: Map[Symbol, Properties.T] = Map(symbols: _*)
   351 
   352     val names: Map[Symbol, String] =
   353     {
   354       val name = new Regex("""\\<\^?([A-Za-z][A-Za-z0-9_']*)>""")
   355       Map((for ((sym @ name(a), _) <- symbols) yield sym -> a): _*)
   356     }
   357 
   358     val groups: List[(String, List[Symbol])] =
   359       symbols.map({ case (sym, props) =>
   360         val gs = for (("group", g) <- props) yield g
   361         if (gs.isEmpty) List(sym -> "unsorted") else gs.map(sym -> _)
   362       }).flatten
   363         .groupBy(_._2).toList.map({ case (group, list) => (group, list.map(_._1)) })
   364         .sortBy(_._1)
   365 
   366     val abbrevs: Multi_Map[Symbol, String] =
   367       Multi_Map((
   368         for {
   369           (sym, props) <- symbols
   370           ("abbrev", a) <- props.reverse
   371         } yield sym -> a): _*)
   372 
   373     val codes: List[(String, Int)] =
   374     {
   375       val Code = new Properties.String("code")
   376       for {
   377         (sym, props) <- symbols
   378         code =
   379           props match {
   380             case Code(s) =>
   381               try { Integer.decode(s).intValue }
   382               catch { case _: NumberFormatException => error("Bad code for symbol " + sym) }
   383             case _ => error("Missing code for symbol " + sym)
   384           }
   385       } yield {
   386         if (code < 128) error("Illegal ASCII code for symbol " + sym)
   387         else (sym, code)
   388       }
   389     }
   390 
   391 
   392     /* recoding */
   393 
   394     private val (decoder, encoder) =
   395     {
   396       val mapping =
   397         for ((sym, code) <- codes) yield (sym, new String(Character.toChars(code)))
   398       (new Recoder(mapping), new Recoder(for ((x, y) <- mapping) yield (y, x)))
   399     }
   400 
   401     def decode(text: String): String = decoder.recode(text)
   402     def encode(text: String): String = encoder.recode(text)
   403 
   404     private def recode_set(elems: String*): Set[String] =
   405     {
   406       val content = elems.toList
   407       Set((content ::: content.map(decode)): _*)
   408     }
   409 
   410     private def recode_map[A](elems: (String, A)*): Map[String, A] =
   411     {
   412       val content = elems.toList
   413       Map((content ::: content.map({ case (sym, a) => (decode(sym), a) })): _*)
   414     }
   415 
   416 
   417     /* user fonts */
   418 
   419     private val Font = new Properties.String("font")
   420     val fonts: Map[Symbol, String] =
   421       recode_map((for ((sym, Font(font)) <- symbols) yield sym -> font): _*)
   422 
   423     val font_names: List[String] = Set(fonts.toList.map(_._2): _*).toList
   424     val font_index: Map[String, Int] = Map((font_names zip (0 until font_names.length).toList): _*)
   425 
   426 
   427     /* classification */
   428 
   429     val letters = recode_set(
   430       "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
   431       "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
   432       "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
   433       "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
   434 
   435       "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>",
   436       "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>",
   437       "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>",
   438       "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>",
   439       "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>",
   440       "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>",
   441       "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>",
   442       "\\<x>", "\\<y>", "\\<z>",
   443 
   444       "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>",
   445       "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>",
   446       "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>",
   447       "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>",
   448       "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>",
   449       "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>",
   450       "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>",
   451       "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>",
   452       "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>",
   453 
   454       "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>",
   455       "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>",
   456       "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>",
   457       "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>",
   458       "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>",
   459       "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>",
   460       "\\<Psi>", "\\<Omega>")
   461 
   462     val blanks = recode_set(space, "\t", "\n", "\u000B", "\f", "\r", "\r\n")
   463 
   464     val sym_chars =
   465       Set("!", "#", "$", "%", "&", "*", "+", "-", "/", "<", "=", ">", "?", "@", "^", "_", "|", "~")
   466 
   467     val symbolic = recode_set((for { (sym, _) <- symbols; if raw_symbolic(sym) } yield sym): _*)
   468 
   469 
   470     /* misc symbols */
   471 
   472     val newline_decoded = decode(newline)
   473     val comment_decoded = decode(comment)
   474 
   475 
   476     /* cartouches */
   477 
   478     val open_decoded = decode(open)
   479     val close_decoded = decode(close)
   480 
   481 
   482     /* control symbols */
   483 
   484     val control_decoded: Set[Symbol] =
   485       Set((for ((sym, _) <- symbols if sym.startsWith("\\<^")) yield decode(sym)): _*)
   486 
   487     val sub_decoded = decode(sub)
   488     val sup_decoded = decode(sup)
   489     val bold_decoded = decode(bold)
   490     val emph_decoded = decode(emph)
   491     val bsub_decoded = decode("\\<^bsub>")
   492     val esub_decoded = decode("\\<^esub>")
   493     val bsup_decoded = decode("\\<^bsup>")
   494     val esup_decoded = decode("\\<^esup>")
   495   }
   496 
   497 
   498   /* tables */
   499 
   500   def properties: Map[Symbol, Properties.T] = symbols.properties
   501   def names: Map[Symbol, String] = symbols.names
   502   def groups: List[(String, List[Symbol])] = symbols.groups
   503   def abbrevs: Multi_Map[Symbol, String] = symbols.abbrevs
   504   def codes: List[(String, Int)] = symbols.codes
   505 
   506   def decode(text: String): String = symbols.decode(text)
   507   def encode(text: String): String = symbols.encode(text)
   508 
   509   def decode_yxml(text: String): XML.Body = YXML.parse_body(decode(text))
   510   def decode_yxml_failsafe(text: String): XML.Body = YXML.parse_body_failsafe(decode(text))
   511   def encode_yxml(body: XML.Body): String = encode(YXML.string_of_body(body))
   512 
   513   def decode_strict(text: String): String =
   514   {
   515     val decoded = decode(text)
   516     if (encode(decoded) == text) decoded
   517     else {
   518       val bad = new mutable.ListBuffer[Symbol]
   519       for (s <- iterator(text) if encode(decode(s)) != s && !bad.contains(s))
   520         bad += s
   521       error("Bad Unicode symbols in text: " + commas_quote(bad))
   522     }
   523   }
   524 
   525   def fonts: Map[Symbol, String] = symbols.fonts
   526   def font_names: List[String] = symbols.font_names
   527   def font_index: Map[String, Int] = symbols.font_index
   528   def lookup_font(sym: Symbol): Option[Int] = symbols.fonts.get(sym).map(font_index(_))
   529 
   530 
   531   /* classification */
   532 
   533   def is_letter(sym: Symbol): Boolean = symbols.letters.contains(sym)
   534   def is_digit(sym: Symbol): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9'
   535   def is_quasi(sym: Symbol): Boolean = sym == "_" || sym == "'"
   536   def is_letdig(sym: Symbol): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym)
   537   def is_blank(sym: Symbol): Boolean = symbols.blanks.contains(sym)
   538 
   539 
   540   /* misc symbols */
   541 
   542   val newline: Symbol = "\\<newline>"
   543   def newline_decoded: Symbol = symbols.newline_decoded
   544 
   545   def print_newlines(str: String): String =
   546     if (str.contains('\n'))
   547       (for (s <- iterator(str)) yield { if (s == "\n") newline_decoded else s }).mkString
   548     else str
   549 
   550   val comment: Symbol = "\\<comment>"
   551   def comment_decoded: Symbol = symbols.comment_decoded
   552 
   553 
   554   /* cartouches */
   555 
   556   val open: Symbol = "\\<open>"
   557   val close: Symbol = "\\<close>"
   558 
   559   def open_decoded: Symbol = symbols.open_decoded
   560   def close_decoded: Symbol = symbols.close_decoded
   561 
   562   def is_open(sym: Symbol): Boolean = sym == open_decoded || sym == open
   563   def is_close(sym: Symbol): Boolean = sym == close_decoded || sym == close
   564 
   565 
   566   /* symbols for symbolic identifiers */
   567 
   568   private def raw_symbolic(sym: Symbol): Boolean =
   569     sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^")
   570 
   571   def is_symbolic(sym: Symbol): Boolean =
   572     !is_open(sym) && !is_close(sym) && (raw_symbolic(sym) || symbols.symbolic.contains(sym))
   573 
   574   def is_symbolic_char(sym: Symbol): Boolean = symbols.sym_chars.contains(sym)
   575 
   576 
   577   /* control symbols */
   578 
   579   def is_control(sym: Symbol): Boolean =
   580     (sym.startsWith("\\<^") && sym.endsWith(">")) || symbols.control_decoded.contains(sym)
   581 
   582   def is_controllable(sym: Symbol): Boolean =
   583     !is_blank(sym) && !is_control(sym) && !is_open(sym) && !is_close(sym) && !is_malformed(sym)
   584 
   585   val sub = "\\<^sub>"
   586   val sup = "\\<^sup>"
   587   val bold = "\\<^bold>"
   588   val emph = "\\<^emph>"
   589 
   590   def sub_decoded: Symbol = symbols.sub_decoded
   591   def sup_decoded: Symbol = symbols.sup_decoded
   592   def bold_decoded: Symbol = symbols.bold_decoded
   593   def emph_decoded: Symbol = symbols.emph_decoded
   594   def bsub_decoded: Symbol = symbols.bsub_decoded
   595   def esub_decoded: Symbol = symbols.esub_decoded
   596   def bsup_decoded: Symbol = symbols.bsup_decoded
   597   def esup_decoded: Symbol = symbols.esup_decoded
   598 }