src/Pure/General/codepoint.scala
author haftmann
Mon, 06 Feb 2017 20:56:34 +0100
changeset 64990 c6a7de505796
parent 64682 7e119f32276a
child 65196 e8760a98db78
permissions -rw-r--r--
more explicit errors in pathological cases
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
64610
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
     1
/*  Title:      Pure/General/codepoint.scala
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
     2
    Author:     Makarius
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
     3
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
     4
Unicode codepoints vs. Unicode string encoding.
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
     5
*/
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
     6
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
     7
package isabelle
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
     8
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
     9
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
    10
object Codepoint
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
    11
{
64615
fd0d6de380c6 more systematic text length;
wenzelm
parents: 64610
diff changeset
    12
  def string(c: Int): String = new String(Array(c), 0, 1)
fd0d6de380c6 more systematic text length;
wenzelm
parents: 64610
diff changeset
    13
fd0d6de380c6 more systematic text length;
wenzelm
parents: 64610
diff changeset
    14
  def iterator(s: String): Iterator[Int] =
64610
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
    15
    new Iterator[Int] {
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
    16
      var offset = 0
64615
fd0d6de380c6 more systematic text length;
wenzelm
parents: 64610
diff changeset
    17
      def hasNext: Boolean = offset < s.length
64610
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
    18
      def next: Int =
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
    19
      {
64615
fd0d6de380c6 more systematic text length;
wenzelm
parents: 64610
diff changeset
    20
        val c = s.codePointAt(offset)
64610
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
    21
        offset += Character.charCount(c)
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
    22
        c
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
    23
      }
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
    24
    }
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
    25
64615
fd0d6de380c6 more systematic text length;
wenzelm
parents: 64610
diff changeset
    26
  def length(s: String): Int = iterator(s).length
64617
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    27
64682
7e119f32276a clarified modules;
wenzelm
parents: 64617
diff changeset
    28
  trait Length extends Text.Length
64617
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    29
  {
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    30
    protected def codepoint_length(c: Int): Int = 1
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    31
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    32
    def apply(s: String): Int =
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    33
      (0 /: iterator(s)) { case (n, c) => n + codepoint_length(c) }
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    34
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    35
    def offset(s: String, i: Int): Option[Text.Offset] =
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    36
    {
64682
7e119f32276a clarified modules;
wenzelm
parents: 64617
diff changeset
    37
      if (i <= 0 || s.forall(_ < 0x80)) Text.Length.offset(s, i)
64617
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    38
      else {
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    39
        val length = s.length
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    40
        var offset = 0
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    41
        var j = 0
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    42
        while (offset < length && j < i) {
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    43
          val c = s.codePointAt(offset)
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    44
          offset += Character.charCount(c)
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    45
          j += codepoint_length(c)
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    46
        }
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    47
        if (j >= i) Some(offset) else None
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    48
      }
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    49
    }
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    50
  }
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    51
01e50039edc9 more systematic text length wrt. encoding;
wenzelm
parents: 64615
diff changeset
    52
  object Length extends Length
64610
1b89608974e9 clarified modules;
wenzelm
parents:
diff changeset
    53
}