# HG changeset patch # User wenzelm # Date 1482227076 -3600 # Node ID fd0d6de380c67f2d2d4d72d4f293ad5786cb2bc3 # Parent 88211daacf93511885dbff01910b18cf852eb465 more systematic text length; diff -r 88211daacf93 -r fd0d6de380c6 src/Pure/General/codepoint.scala --- a/src/Pure/General/codepoint.scala Tue Dec 20 10:06:18 2016 +0100 +++ b/src/Pure/General/codepoint.scala Tue Dec 20 10:44:36 2016 +0100 @@ -9,17 +9,20 @@ object Codepoint { - def iterator(str: String): Iterator[Int] = + def string(c: Int): String = new String(Array(c), 0, 1) + + def iterator(s: String): Iterator[Int] = new Iterator[Int] { var offset = 0 - def hasNext: Boolean = offset < str.length + def hasNext: Boolean = offset < s.length def next: Int = { - val c = str.codePointAt(offset) + val c = s.codePointAt(offset) offset += Character.charCount(c) c } } - def string(c: Int): String = new String(Array(c), 0, 1) + def length(s: String): Int = iterator(s).length + object Length extends Line.Length { def apply(s: String): Int = Codepoint.length(s) } } diff -r 88211daacf93 -r fd0d6de380c6 src/Pure/General/symbol.scala --- a/src/Pure/General/symbol.scala Tue Dec 20 10:06:18 2016 +0100 +++ b/src/Pure/General/symbol.scala Tue Dec 20 10:44:36 2016 +0100 @@ -128,6 +128,9 @@ def explode(text: CharSequence): List[Symbol] = iterator(text).toList + def length(text: CharSequence): Int = iterator(text).length + object Length extends Line.Length { def apply(text: String): Int = length(text) } + /* decoding offsets */ diff -r 88211daacf93 -r fd0d6de380c6 src/Pure/PIDE/line.scala --- a/src/Pure/PIDE/line.scala Tue Dec 20 10:06:18 2016 +0100 +++ b/src/Pure/PIDE/line.scala Tue Dec 20 10:44:36 2016 +0100 @@ -12,6 +12,12 @@ object Line { + /* length wrt. encoding */ + + trait Length { def apply(text: String): Int } + object Length extends Length { def apply(text: String): Int = text.length } + + /* position */ object Position diff -r 88211daacf93 -r fd0d6de380c6 src/Pure/System/utf8.scala --- a/src/Pure/System/utf8.scala Tue Dec 20 10:06:18 2016 +0100 +++ b/src/Pure/System/utf8.scala Tue Dec 20 10:44:36 2016 +0100 @@ -21,6 +21,17 @@ def bytes(s: String): Array[Byte] = s.getBytes(charset) + def bytes_length(s: String): Int = + (0 /: Codepoint.iterator(s)) { + case (n, i) => + if (i < 0x80) n + 1 + else if (i < 0x800) n + 2 + else if (i < 0x10000) n + 3 + else n + 4 + } + + object Length extends Line.Length { def apply(s: String): Int = bytes_length(s) } + /* permissive UTF-8 decoding */