# HG changeset patch # User wenzelm # Date 1482246482 -3600 # Node ID 01e50039edc94fbffe919463e04bdd1da56e1971 # Parent dc3ec40fe41b2771cac81fd3c67c4a450ab0a940 more systematic text length wrt. encoding; diff -r dc3ec40fe41b -r 01e50039edc9 src/Pure/General/codepoint.scala --- a/src/Pure/General/codepoint.scala Tue Dec 20 10:45:20 2016 +0100 +++ b/src/Pure/General/codepoint.scala Tue Dec 20 16:08:02 2016 +0100 @@ -24,5 +24,30 @@ } def length(s: String): Int = iterator(s).length - object Length extends Line.Length { def apply(s: String): Int = Codepoint.length(s) } + + trait Length extends isabelle.Length + { + protected def codepoint_length(c: Int): Int = 1 + + def apply(s: String): Int = + (0 /: iterator(s)) { case (n, c) => n + codepoint_length(c) } + + def offset(s: String, i: Int): Option[Text.Offset] = + { + if (i <= 0 || s.forall(_ < 0x80)) isabelle.Length.offset(s, i) + else { + val length = s.length + var offset = 0 + var j = 0 + while (offset < length && j < i) { + val c = s.codePointAt(offset) + offset += Character.charCount(c) + j += codepoint_length(c) + } + if (j >= i) Some(offset) else None + } + } + } + + object Length extends Length } diff -r dc3ec40fe41b -r 01e50039edc9 src/Pure/General/length.scala --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Pure/General/length.scala Tue Dec 20 16:08:02 2016 +0100 @@ -0,0 +1,31 @@ +/* Title: Pure/General/length.scala + Author: Makarius + +Text length wrt. encoding. +*/ + +package isabelle + + +trait Length +{ + def apply(text: String): Int + def offset(text: String, i: Int): Option[Text.Offset] +} + +object Length extends Length +{ + def apply(text: String): Int = text.length + def offset(text: String, i: Int): Option[Text.Offset] = + if (0 <= i && i <= text.length) Some(i) else None + + def encoding(name: String): Length = + name match { + case "UTF-8" => UTF8.Length + case "UTF-16" => Length + case "codepoints" => Codepoint.Length + case "symbols" => Symbol.Length + case _ => + error("Bad text encoding: " + name + " (expected UTF-8, UTF-16, codepoints, symbols)") + } +} diff -r dc3ec40fe41b -r 01e50039edc9 src/Pure/General/symbol.scala --- a/src/Pure/General/symbol.scala Tue Dec 20 10:45:20 2016 +0100 +++ b/src/Pure/General/symbol.scala Tue Dec 20 16:08:02 2016 +0100 @@ -129,7 +129,15 @@ def explode(text: CharSequence): List[Symbol] = iterator(text).toList def length(text: CharSequence): Int = iterator(text).length - object Length extends Line.Length { def apply(text: String): Int = length(text) } + + object Length extends isabelle.Length + { + def apply(text: String): Int = length(text) + def offset(text: String, i: Int): Option[Text.Offset] = + if (i <= 0 || iterator(text).forall(s => s.length == 1)) isabelle.Length.offset(text, i) + else if (i <= length(text)) Some(iterator(text).take(i).mkString.length) + else None + } /* decoding offsets */ diff -r dc3ec40fe41b -r 01e50039edc9 src/Pure/PIDE/line.scala --- a/src/Pure/PIDE/line.scala Tue Dec 20 10:45:20 2016 +0100 +++ b/src/Pure/PIDE/line.scala Tue Dec 20 16:08:02 2016 +0100 @@ -12,12 +12,6 @@ object Line { - /* length wrt. encoding */ - - trait Length { def apply(text: String): Int } - object Length extends Length { def apply(text: String): Int = text.length } - - /* position */ object Position @@ -37,21 +31,12 @@ case i => i } - private def advance[A](iterator: Iterator[A], is_newline: A => Boolean): Position = - { - var l = line - var c = column - for (x <- iterator) { - if (is_newline(x)) { l += 1; c = 0 } else c += 1 + def advance(text: String, length: Length = Length): Position = + if (text.isEmpty) this + else { + val lines = Library.split_lines(text) + Position(line + lines.length - 1, column + length(Library.trim_line(lines.last))) } - Position(l, c) - } - - def advance(text: String): Position = - if (text.isEmpty) this else advance[Char](text.iterator, _ == '\n') - - def advance_codepoints(text: String): Position = - if (text.isEmpty) this else advance[Int](Codepoint.iterator(text), _ == 10) } @@ -94,7 +79,7 @@ } override def hashCode(): Int = lines.hashCode - private def position(advance: (Position, String) => Position, offset: Text.Offset): Position = + def position(offset: Text.Offset, length: Length = Length): Position = { @tailrec def move(i: Text.Offset, lines_count: Int, lines_rest: List[Line]): Position = { @@ -102,26 +87,23 @@ case Nil => require(i == 0); Position(lines_count, 0) case line :: ls => val n = line.text.length - if (ls.isEmpty || i <= n) advance(Position(lines_count, 0), line.text.substring(n - i)) + if (ls.isEmpty || i <= n) + Position(lines_count, 0).advance(line.text.substring(n - i), length) else move(i - (n + 1), lines_count + 1, ls) } } move(offset, 0, lines) } - def position(i: Text.Offset): Position = position(_.advance(_), i) - def position_codepoints(i: Text.Offset): Position = position(_.advance_codepoints(_), i) - - // FIXME codepoints - def offset(pos: Position): Option[Text.Offset] = + def offset(pos: Position, length: Length = Length): Option[Text.Offset] = { val l = pos.line val c = pos.column if (0 <= l && l < lines.length) { val line_offset = if (l == 0) 0 - else (0 /: lines.take(l - 1))({ case (n, line) => n + line.text.length + 1 }) - if (c <= lines(l).text.length) Some(line_offset + c) else None + else (0 /: lines.iterator.take(l - 1))({ case (n, line) => n + length(line.text) + 1 }) + length.offset(lines(l).text, c).map(line_offset + _) } else None } diff -r dc3ec40fe41b -r 01e50039edc9 src/Pure/System/utf8.scala --- a/src/Pure/System/utf8.scala Tue Dec 20 10:45:20 2016 +0100 +++ b/src/Pure/System/utf8.scala Tue Dec 20 16:08:02 2016 +0100 @@ -21,16 +21,14 @@ def bytes(s: String): Array[Byte] = s.getBytes(charset) - def bytes_length(s: String): Int = - (0 /: Codepoint.iterator(s)) { - case (n, i) => - if (i < 0x80) n + 1 - else if (i < 0x800) n + 2 - else if (i < 0x10000) n + 3 - else n + 4 - } - - object Length extends Line.Length { def apply(s: String): Int = bytes_length(s) } + object Length extends Codepoint.Length + { + override def codepoint_length(c: Int): Int = + if (c < 0x80) 1 + else if (c < 0x800) 2 + else if (c < 0x10000) 3 + else 4 + } /* permissive UTF-8 decoding */ @@ -99,4 +97,3 @@ new Decode_Chars(decode, buffer, start, end) } } - diff -r dc3ec40fe41b -r 01e50039edc9 src/Pure/build-jars --- a/src/Pure/build-jars Tue Dec 20 10:45:20 2016 +0100 +++ b/src/Pure/build-jars Tue Dec 20 16:08:02 2016 +0100 @@ -49,6 +49,7 @@ General/graphics_file.scala General/http_server.scala General/json.scala + General/length.scala General/linear_set.scala General/logger.scala General/long_name.scala