author | wenzelm |
Mon, 24 Oct 2016 12:16:12 +0200 | |
changeset 64370 | 865b39487b5d |
parent 63450 | afd657fffdf9 |
child 64610 | 1b89608974e9 |
permissions | -rw-r--r-- |
56599 | 1 |
/* Title: Pure/General/word.scala |
2 |
Author: Makarius |
|
3 |
||
56747 | 4 |
Support for words within Unicode text. |
56599 | 5 |
*/ |
6 |
||
7 |
package isabelle |
|
8 |
||
62812
ce22e5c3d4ce
more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents:
59319
diff
changeset
|
9 |
import java.text.Bidi |
56599 | 10 |
import java.util.Locale |
11 |
||
12 |
||
13 |
object Word |
|
14 |
{ |
|
56601 | 15 |
/* codepoints */ |
16 |
||
17 |
def codepoint_iterator(str: String): Iterator[Int] = |
|
18 |
new Iterator[Int] { |
|
19 |
var offset = 0 |
|
20 |
def hasNext: Boolean = offset < str.length |
|
21 |
def next: Int = |
|
22 |
{ |
|
23 |
val c = str.codePointAt(offset) |
|
24 |
offset += Character.charCount(c) |
|
25 |
c |
|
26 |
} |
|
27 |
} |
|
28 |
||
56792 | 29 |
def codepoint(c: Int): String = new String(Array(c), 0, 1) |
30 |
||
56601 | 31 |
|
62812
ce22e5c3d4ce
more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents:
59319
diff
changeset
|
32 |
/* directionality */ |
ce22e5c3d4ce
more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents:
59319
diff
changeset
|
33 |
|
ce22e5c3d4ce
more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents:
59319
diff
changeset
|
34 |
def bidi_detect(str: String): Boolean = |
ce22e5c3d4ce
more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents:
59319
diff
changeset
|
35 |
str.exists(c => c >= 0x590) && Bidi.requiresBidi(str.toArray, 0, str.length) |
ce22e5c3d4ce
more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents:
59319
diff
changeset
|
36 |
|
ce22e5c3d4ce
more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents:
59319
diff
changeset
|
37 |
def bidi_override(str: String): String = |
ce22e5c3d4ce
more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents:
59319
diff
changeset
|
38 |
if (bidi_detect(str)) "\u200E\u202D" + str + "\u202C" else str |
ce22e5c3d4ce
more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents:
59319
diff
changeset
|
39 |
|
ce22e5c3d4ce
more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents:
59319
diff
changeset
|
40 |
|
56600 | 41 |
/* case */ |
42 |
||
56599 | 43 |
def lowercase(str: String): String = str.toLowerCase(Locale.ROOT) |
44 |
def uppercase(str: String): String = str.toUpperCase(Locale.ROOT) |
|
45 |
||
46 |
def capitalize(str: String): String = |
|
47 |
if (str.length == 0) str |
|
56601 | 48 |
else { |
49 |
val n = Character.charCount(str.codePointAt(0)) |
|
56602 | 50 |
uppercase(str.substring(0, n)) + lowercase(str.substring(n)) |
56601 | 51 |
} |
52 |
||
56609
5ac67041ccf8
capitalize more carefully, e.g. relevant for option "ML_exception_trace";
wenzelm
parents:
56602
diff
changeset
|
53 |
def perhaps_capitalize(str: String): String = |
57087
16536c15d749
capitalize even more carefully (see 5ac67041ccf8), e.g. relevant for option "z3_non_commercial" and prospective "MaSh";
wenzelm
parents:
56792
diff
changeset
|
54 |
if (codepoint_iterator(str).forall(c => Character.isLowerCase(c) || Character.isDigit(c))) |
16536c15d749
capitalize even more carefully (see 5ac67041ccf8), e.g. relevant for option "z3_non_commercial" and prospective "MaSh";
wenzelm
parents:
56792
diff
changeset
|
55 |
capitalize(str) |
16536c15d749
capitalize even more carefully (see 5ac67041ccf8), e.g. relevant for option "z3_non_commercial" and prospective "MaSh";
wenzelm
parents:
56792
diff
changeset
|
56 |
else str |
56609
5ac67041ccf8
capitalize more carefully, e.g. relevant for option "ML_exception_trace";
wenzelm
parents:
56602
diff
changeset
|
57 |
|
56601 | 58 |
sealed abstract class Case |
59 |
case object Lowercase extends Case |
|
60 |
case object Uppercase extends Case |
|
61 |
case object Capitalized extends Case |
|
62 |
||
63 |
object Case |
|
64 |
{ |
|
65 |
def apply(c: Case, str: String): String = |
|
66 |
c match { |
|
67 |
case Lowercase => lowercase(str) |
|
68 |
case Uppercase => uppercase(str) |
|
69 |
case Capitalized => capitalize(str) |
|
70 |
} |
|
71 |
def unapply(str: String): Option[Case] = |
|
59319 | 72 |
if (str.nonEmpty) { |
56601 | 73 |
if (codepoint_iterator(str).forall(Character.isLowerCase(_))) Some(Lowercase) |
74 |
else if (codepoint_iterator(str).forall(Character.isUpperCase(_))) Some(Uppercase) |
|
75 |
else { |
|
76 |
val it = codepoint_iterator(str) |
|
77 |
if (Character.isUpperCase(it.next) && it.forall(Character.isLowerCase(_))) |
|
78 |
Some(Capitalized) |
|
79 |
else None |
|
80 |
} |
|
81 |
} |
|
82 |
else None |
|
83 |
} |
|
56599 | 84 |
|
56600 | 85 |
|
86 |
/* sequence of words */ |
|
87 |
||
88 |
def implode(words: Iterable[String]): String = words.iterator.mkString(" ") |
|
89 |
||
90 |
def explode(sep: Char => Boolean, text: String): List[String] = |
|
91 |
Library.separated_chunks(sep, text).map(_.toString).filter(_ != "").toList |
|
92 |
||
93 |
def explode(sep: Char, text: String): List[String] = |
|
94 |
explode(_ == sep, text) |
|
95 |
||
96 |
def explode(text: String): List[String] = |
|
56747 | 97 |
explode(Character.isWhitespace(_), text) |
63450 | 98 |
|
99 |
||
100 |
/* brackets */ |
|
101 |
||
102 |
val open_brackets = "([{«‹⟨⌈⌊⦇⟦⦃" |
|
103 |
val close_brackets = ")]}»›⟩⌉⌋⦈⟧⦄" |
|
56599 | 104 |
} |