src/Pure/General/word.scala
author wenzelm
Sun, 27 Oct 2024 11:02:21 +0100
changeset 81266 8300511f4c45
parent 80477 d32748570069
child 81647 ae670d860912
permissions -rw-r--r--
clarified signature;
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
56599
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
     1
/*  Title:      Pure/General/word.scala
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
     2
    Author:     Makarius
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
     3
56747
f87e3be0de9a clarified;
wenzelm
parents: 56744
diff changeset
     4
Support for words within Unicode text.
56599
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
     5
*/
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
     6
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
     7
package isabelle
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
     8
62812
ce22e5c3d4ce more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents: 59319
diff changeset
     9
import java.text.Bidi
56599
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    10
import java.util.Locale
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    11
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    12
75393
87ebf5a50283 clarified formatting, for the sake of scala3;
wenzelm
parents: 73344
diff changeset
    13
object Word {
62812
ce22e5c3d4ce more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents: 59319
diff changeset
    14
  /* directionality */
ce22e5c3d4ce more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents: 59319
diff changeset
    15
ce22e5c3d4ce more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents: 59319
diff changeset
    16
  /* Check whether the text needs bidirectional treatment: a cheap scan for
     any char at or above 0x590 comes first, and only then is the text handed
     to java.text.Bidi for the actual decision. */
  def bidi_detect(str: String): Boolean = {
    def has_high_char: Boolean = str.exists(_ >= 0x590)
    has_high_char && Bidi.requiresBidi(str.toCharArray, 0, str.length)
  }
ce22e5c3d4ce more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents: 59319
diff changeset
    18
ce22e5c3d4ce more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents: 59319
diff changeset
    19
  /* Enforce left-to-right display: text for which bidi_detect holds is
     wrapped as U+200E U+202D ... U+202C; any other text is returned as is. */
  def bidi_override(str: String): String =
    if (!bidi_detect(str)) str
    else "\u200E\u202D" + str + "\u202C"
ce22e5c3d4ce more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents: 59319
diff changeset
    21
ce22e5c3d4ce more robust display of bidirectional Unicode text: enforce left-to-right;
wenzelm
parents: 59319
diff changeset
    22
56600
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    23
  /* case */
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    24
56599
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    25
  /* Lower-case conversion under Locale.ROOT, i.e. independent of the default locale. */
  def lowercase(str: String): String = {
    val locale = Locale.ROOT
    str.toLowerCase(locale)
  }
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    26
  /* Upper-case conversion under Locale.ROOT, i.e. independent of the default locale. */
  def uppercase(str: String): String = {
    val locale = Locale.ROOT
    str.toUpperCase(locale)
  }
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    27
78614
4da5cdaa4dcd clarified signature;
wenzelm
parents: 78613
diff changeset
    28
  /* Capitalize fully: the first code point (one or two chars) is turned to
     upper case, all remaining text to lower case.  Empty text is unchanged. */
  def capitalized(str: String): String =
    if (str.isEmpty) str
    else {
      // width of the first code point in chars, so a surrogate pair is not split
      val width = Character.charCount(str.codePointAt(0))
      val (first, rest) = str.splitAt(width)
      uppercase(first) + lowercase(rest)
    }
8f80a243857d clarified word case;
wenzelm
parents: 56600
diff changeset
    34
78614
4da5cdaa4dcd clarified signature;
wenzelm
parents: 78613
diff changeset
    35
  /* Capitalize only text that consists entirely of lower-case letters and
     digits; as soon as some other code point occurs, the text is returned
     unchanged. */
  def perhaps_capitalized(str: String): String =
    if (Codepoint.iterator(str).exists(c => !Character.isLowerCase(c) && !Character.isDigit(c)))
      str
    else capitalized(str)
56609
5ac67041ccf8 capitalize more carefully, e.g. relevant for option "ML_exception_trace";
wenzelm
parents: 56602
diff changeset
    39
75393
87ebf5a50283 clarified formatting, for the sake of scala3;
wenzelm
parents: 73344
diff changeset
    40
  object Case {
    /* Convert str according to the given case. */
    def apply(c: Case, str: String): String = {
      val convert: String => String =
        c match {
          case Case.lowercase => Word.lowercase
          case Case.uppercase => Word.uppercase
          case Case.capitalized => Word.capitalized
        }
      convert(str)
    }

    /* Recognize the case of str, checked over its code points:
       all lower-case, all upper-case, or an upper-case first code point
       followed by lower-case ones only.  Empty or mixed text yields None. */
    def unapply(str: String): Option[Case] =
      if (str.isEmpty) None
      else if (Codepoint.iterator(str).forall(Character.isLowerCase)) Some(Case.lowercase)
      else if (Codepoint.iterator(str).forall(Character.isUpperCase)) Some(Case.uppercase)
      else {
        // str is non-empty here, so the first code point exists
        val rest = Codepoint.iterator(str)
        val first = rest.next()
        if (Character.isUpperCase(first) && rest.forall(Character.isLowerCase))
          Some(Case.capitalized)
        else None
      }
  }
56599
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    60
78613
60561d28569b clarified signature: prefer enum types;
wenzelm
parents: 75393
diff changeset
    61
  /* The word cases that can be applied and recognized. */
  enum Case {
    case lowercase, uppercase, capitalized
  }
60561d28569b clarified signature: prefer enum types;
wenzelm
parents: 75393
diff changeset
    62
56600
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    63
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    64
  /* sequence of words */
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    65
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    66
  /* Join words into one text, separated by a single space. */
  def implode(words: Iterable[String]): String = {
    val space = " "
    words.mkString(space)
  }
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    67
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    68
  /* Split text into the chunks produced by Library.separated_chunks for the
     given separator predicate, dropping empty chunks and returning the rest
     as strings. */
  def explode(sep: Char => Boolean, text: String): List[String] =
    Library.separated_chunks(sep, text)
      .filterNot(_.isEmpty)
      .map(_.toString)
      .toList
56600
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    72
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    73
  /* Variant of explode for a single separator character. */
  def explode(sep: Char, text: String): List[String] =
    explode((c: Char) => c == sep, text)
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    75
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    76
  /* Variant of explode that treats every whitespace char
     (according to Character.isWhitespace) as separator. */
  def explode(text: String): List[String] =
    explode((c: Char) => Character.isWhitespace(c), text)
63450
afd657fffdf9 indentation of brackets;
wenzelm
parents: 62812
diff changeset
    78
afd657fffdf9 indentation of brackets;
wenzelm
parents: 62812
diff changeset
    79
afd657fffdf9 indentation of brackets;
wenzelm
parents: 62812
diff changeset
    80
  /* brackets */
afd657fffdf9 indentation of brackets;
wenzelm
parents: 62812
diff changeset
    81
71866
081fdd53003a more brackets (see 2e8af171887f);
wenzelm
parents: 71601
diff changeset
    82
  // opening bracket characters, listed in the same order as close_brackets
  val open_brackets: String = "([{«‹⟨⌈⌊⦇⟦⦃⟪"
081fdd53003a more brackets (see 2e8af171887f);
wenzelm
parents: 71601
diff changeset
    83
  // closing bracket characters, listed in the same order as open_brackets
  val close_brackets: String = ")]}»›⟩⌉⌋⦈⟧⦄⟫"
56599
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    84
}