more systematic text length wrt. encoding;
authorwenzelm
Tue, 20 Dec 2016 16:08:02 +0100 (2016-12-20)
changeset 64617 01e50039edc9
parent 64616 dc3ec40fe41b
child 64618 c81bd30839a6
more systematic text length wrt. encoding;
src/Pure/General/codepoint.scala
src/Pure/General/length.scala
src/Pure/General/symbol.scala
src/Pure/PIDE/line.scala
src/Pure/System/utf8.scala
src/Pure/build-jars
--- a/src/Pure/General/codepoint.scala	Tue Dec 20 10:45:20 2016 +0100
+++ b/src/Pure/General/codepoint.scala	Tue Dec 20 16:08:02 2016 +0100
@@ -24,5 +24,30 @@
     }
 
   def length(s: String): Int = iterator(s).length
-  object Length extends Line.Length { def apply(s: String): Int = Codepoint.length(s) }
+
+  trait Length extends isabelle.Length
+  {
+    protected def codepoint_length(c: Int): Int = 1
+
+    def apply(s: String): Int =
+      (0 /: iterator(s)) { case (n, c) => n + codepoint_length(c) }
+
+    def offset(s: String, i: Int): Option[Text.Offset] =
+    {
+      if (i <= 0 || s.forall(_ < 0x80)) isabelle.Length.offset(s, i)
+      else {
+        val length = s.length
+        var offset = 0
+        var j = 0
+        while (offset < length && j < i) {
+          val c = s.codePointAt(offset)
+          offset += Character.charCount(c)
+          j += codepoint_length(c)
+        }
+        if (j >= i) Some(offset) else None
+      }
+    }
+  }
+
+  object Length extends Length
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Pure/General/length.scala	Tue Dec 20 16:08:02 2016 +0100
@@ -0,0 +1,31 @@
+/*  Title:      Pure/General/length.scala
+    Author:     Makarius
+
+Text length wrt. encoding.
+*/
+
+package isabelle
+
+
+trait Length
+{
+  def apply(text: String): Int
+  def offset(text: String, i: Int): Option[Text.Offset]
+}
+
+object Length extends Length
+{
+  def apply(text: String): Int = text.length
+  def offset(text: String, i: Int): Option[Text.Offset] =
+    if (0 <= i && i <= text.length) Some(i) else None
+
+  def encoding(name: String): Length =
+    name match {
+      case "UTF-8" => UTF8.Length
+      case "UTF-16" => Length
+      case "codepoints" => Codepoint.Length
+      case "symbols" => Symbol.Length
+      case _ =>
+        error("Bad text encoding: " + name + " (expected UTF-8, UTF-16, codepoints, symbols)")
+    }
+}
--- a/src/Pure/General/symbol.scala	Tue Dec 20 10:45:20 2016 +0100
+++ b/src/Pure/General/symbol.scala	Tue Dec 20 16:08:02 2016 +0100
@@ -129,7 +129,15 @@
   def explode(text: CharSequence): List[Symbol] = iterator(text).toList
 
   def length(text: CharSequence): Int = iterator(text).length
-  object Length extends Line.Length { def apply(text: String): Int = length(text) }
+
+  object Length extends isabelle.Length
+  {
+    def apply(text: String): Int = length(text)
+    def offset(text: String, i: Int): Option[Text.Offset] =
+      if (i <= 0 || iterator(text).forall(s => s.length == 1)) isabelle.Length.offset(text, i)
+      else if (i <= length(text)) Some(iterator(text).take(i).mkString.length)
+      else None
+  }
 
 
   /* decoding offsets */
--- a/src/Pure/PIDE/line.scala	Tue Dec 20 10:45:20 2016 +0100
+++ b/src/Pure/PIDE/line.scala	Tue Dec 20 16:08:02 2016 +0100
@@ -12,12 +12,6 @@
 
 object Line
 {
-  /* length wrt. encoding */
-
-  trait Length { def apply(text: String): Int }
-  object Length extends Length { def apply(text: String): Int = text.length }
-
-
   /* position */
 
   object Position
@@ -37,21 +31,12 @@
         case i => i
       }
 
-    private def advance[A](iterator: Iterator[A], is_newline: A => Boolean): Position =
-    {
-      var l = line
-      var c = column
-      for (x <- iterator) {
-        if (is_newline(x)) { l += 1; c = 0 } else c += 1
+    def advance(text: String, length: Length = Length): Position =
+      if (text.isEmpty) this
+      else {
+        val lines = Library.split_lines(text)
+        Position(line + lines.length - 1, column + length(Library.trim_line(lines.last)))
       }
-      Position(l, c)
-    }
-
-    def advance(text: String): Position =
-      if (text.isEmpty) this else advance[Char](text.iterator, _ == '\n')
-
-    def advance_codepoints(text: String): Position =
-      if (text.isEmpty) this else advance[Int](Codepoint.iterator(text), _ == 10)
   }
 
 
@@ -94,7 +79,7 @@
       }
     override def hashCode(): Int = lines.hashCode
 
-    private def position(advance: (Position, String) => Position, offset: Text.Offset): Position =
+    def position(offset: Text.Offset, length: Length = Length): Position =
     {
       @tailrec def move(i: Text.Offset, lines_count: Int, lines_rest: List[Line]): Position =
       {
@@ -102,26 +87,23 @@
           case Nil => require(i == 0); Position(lines_count, 0)
           case line :: ls =>
             val n = line.text.length
-            if (ls.isEmpty || i <= n) advance(Position(lines_count, 0), line.text.substring(n - i))
+            if (ls.isEmpty || i <= n)
+              Position(lines_count, 0).advance(line.text.substring(n - i), length)
             else move(i - (n + 1), lines_count + 1, ls)
         }
       }
       move(offset, 0, lines)
     }
 
-    def position(i: Text.Offset): Position = position(_.advance(_), i)
-    def position_codepoints(i: Text.Offset): Position = position(_.advance_codepoints(_), i)
-
-    // FIXME codepoints
-    def offset(pos: Position): Option[Text.Offset] =
+    def offset(pos: Position, length: Length = Length): Option[Text.Offset] =
     {
       val l = pos.line
       val c = pos.column
       if (0 <= l && l < lines.length) {
         val line_offset =
           if (l == 0) 0
-          else (0 /: lines.take(l - 1))({ case (n, line) => n + line.text.length + 1 })
-        if (c <= lines(l).text.length) Some(line_offset + c) else None
+          else (0 /: lines.iterator.take(l - 1))({ case (n, line) => n + length(line.text) + 1 })
+        length.offset(lines(l).text, c).map(line_offset + _)
       }
       else None
     }
--- a/src/Pure/System/utf8.scala	Tue Dec 20 10:45:20 2016 +0100
+++ b/src/Pure/System/utf8.scala	Tue Dec 20 16:08:02 2016 +0100
@@ -21,16 +21,14 @@
 
   def bytes(s: String): Array[Byte] = s.getBytes(charset)
 
-  def bytes_length(s: String): Int =
-    (0 /: Codepoint.iterator(s)) {
-      case (n, i) =>
-        if (i < 0x80) n + 1
-        else if (i < 0x800) n + 2
-        else if (i < 0x10000) n + 3
-        else n + 4
-    }
-
-  object Length extends Line.Length { def apply(s: String): Int = bytes_length(s) }
+  object Length extends Codepoint.Length
+  {
+    override def codepoint_length(c: Int): Int =
+      if (c < 0x80) 1
+      else if (c < 0x800) 2
+      else if (c < 0x10000) 3
+      else 4
+  }
 
 
   /* permissive UTF-8 decoding */
@@ -99,4 +97,3 @@
     new Decode_Chars(decode, buffer, start, end)
   }
 }
-
--- a/src/Pure/build-jars	Tue Dec 20 10:45:20 2016 +0100
+++ b/src/Pure/build-jars	Tue Dec 20 16:08:02 2016 +0100
@@ -49,6 +49,7 @@
   General/graphics_file.scala
   General/http_server.scala
   General/json.scala
+  General/length.scala
   General/linear_set.scala
   General/logger.scala
   General/long_name.scala