src/Pure/General/word.scala
author wenzelm
Wed, 16 Apr 2014 12:26:12 +0200
changeset 56601 8f80a243857d
parent 56600 628e039cc34d
child 56602 e7e20d72756a
permissions -rw-r--r--
clarified word case; more robust treatment of codepoints;
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
56599
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
     1
/*  Title:      Pure/General/word.scala
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
     2
    Module:     PIDE
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
     3
    Author:     Makarius
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
     4
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
     5
Support for plain text words.
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
     6
*/
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
     7
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
     8
package isabelle
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
     9
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    10
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    11
import java.util.Locale
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    12
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    13
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    14
object Word
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    15
{
56601
8f80a243857d clarified word case;
wenzelm
parents: 56600
diff changeset
    16
  /* codepoints */
8f80a243857d clarified word case;
wenzelm
parents: 56600
diff changeset
    17
8f80a243857d clarified word case;
wenzelm
parents: 56600
diff changeset
    18
  /* Iterate over the Unicode code points of str, as opposed to its UTF-16
     chars: each step reads one code point via String.codePointAt and
     advances by the number of chars that code point occupies. */
  def codepoint_iterator(str: String): Iterator[Int] =
  {
    val end = str.length
    new Iterator[Int] {
      // index of the next unread UTF-16 char
      private var pos = 0

      override def hasNext: Boolean = pos < end

      override def next(): Int =
      {
        val cp = str.codePointAt(pos)
        pos += Character.charCount(cp)
        cp
      }
    }
  }
8f80a243857d clarified word case;
wenzelm
parents: 56600
diff changeset
    29
8f80a243857d clarified word case;
wenzelm
parents: 56600
diff changeset
    30
56600
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    31
  /* case */
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    32
56599
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    33
  /* Lower-case conversion with the fixed Locale.ROOT, i.e. independent of
     the default locale of the running JVM. */
  def lowercase(str: String): String =
  {
    val locale = Locale.ROOT
    str.toLowerCase(locale)
  }
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    34
  /* Upper-case conversion with the fixed Locale.ROOT, i.e. independent of
     the default locale of the running JVM. */
  def uppercase(str: String): String =
  {
    val locale = Locale.ROOT
    str.toUpperCase(locale)
  }
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    35
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    36
  /* Upper-case the first code point of str (which may occupy two UTF-16
     chars); the remainder is appended unchanged.  The empty string is
     returned as is. */
  def capitalize(str: String): String =
    if (str.nonEmpty) {
      val (head, tail) = str.splitAt(Character.charCount(str.codePointAt(0)))
      uppercase(head) + tail
    }
    else str
8f80a243857d clarified word case;
wenzelm
parents: 56600
diff changeset
    42
8f80a243857d clarified word case;
wenzelm
parents: 56600
diff changeset
    43
  /* Word case: the three shapes that Case.unapply can recognize and that
     Case.apply can produce. */
  sealed abstract class Case
  case object Lowercase extends Case
  case object Uppercase extends Case
  case object Capitalized extends Case

  object Case
  {
    /* Convert str to the given case. */
    def apply(c: Case, str: String): String =
      c match {
        case Lowercase => lowercase(str)
        case Uppercase => uppercase(str)
        // Lower-case first: capitalize alone keeps the tail as it is, so
        // mixed-case input such as "hELLO" would become "HELLO", which
        // unapply classifies as Uppercase instead of Capitalized.
        case Capitalized => capitalize(lowercase(str))
      }

    /* Classify str by inspecting its code points:
         - all lower-case                      => Lowercase
         - all upper-case                      => Uppercase
         - first upper-case, rest lower-case   => Capitalized
       The empty string and any other mixture yield None.  A single
       upper-case letter counts as Uppercase, since that test comes first. */
    def unapply(str: String): Option[Case] =
      if (str.isEmpty) None
      else if (codepoint_iterator(str).forall(Character.isLowerCase(_))) Some(Lowercase)
      else if (codepoint_iterator(str).forall(Character.isUpperCase(_))) Some(Uppercase)
      else {
        // str is non-empty here, so the iterator has a first element
        val it = codepoint_iterator(str)
        if (Character.isUpperCase(it.next()) && it.forall(Character.isLowerCase(_)))
          Some(Capitalized)
        else None
      }
  }
56599
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    69
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    70
  /* True iff str is non-empty, its first code point is upper-case and all
     remaining code points are lower-case.

     The test works on Unicode code points instead of UTF-16 chars, so
     letters outside the Basic Multilingual Plane (encoded as surrogate
     pairs) are classified properly: Character.isUpperCase/isLowerCase on an
     isolated surrogate char is always false. */
  def is_capitalized(str: String): Boolean =
    str.length > 0 && {
      val first = str.codePointAt(0)
      // char offsets of the code points after the first one
      val rest_offsets =
        Iterator.iterate(Character.charCount(first))(i =>
          i + Character.charCount(str.codePointAt(i))).takeWhile(_ < str.length)
      Character.isUpperCase(first) &&
        rest_offsets.forall(i => Character.isLowerCase(str.codePointAt(i)))
    }
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    73
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    74
  /* True iff str is non-empty and every code point is upper-case.

     The test works on Unicode code points instead of UTF-16 chars, so
     upper-case letters outside the Basic Multilingual Plane (encoded as
     surrogate pairs) are accepted: Character.isUpperCase on an isolated
     surrogate char is always false. */
  def is_all_caps(str: String): Boolean =
  {
    // char offsets at which the successive code points of str start
    val offsets =
      Iterator.iterate(0)(i => i + Character.charCount(str.codePointAt(i))).
        takeWhile(_ < str.length)
    str.length > 0 && offsets.forall(i => Character.isUpperCase(str.codePointAt(i)))
  }
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    76
56600
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    77
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    78
  /* sequence of words */
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    79
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    80
  /* Join words into one string, separated by single spaces. */
  def implode(words: Iterable[String]): String =
  {
    val separator = " "
    words.mkString(separator)
  }
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    81
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    82
  /* Split text with Library.separated_chunks, using sep as the separator
     predicate, and return the non-empty chunks as a list of strings. */
  def explode(sep: Char => Boolean, text: String): List[String] =
  {
    val chunks = Library.separated_chunks(sep, text).map(_.toString)
    chunks.filterNot(_.isEmpty).toList
  }
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    84
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    85
  /* Variant of explode with a single separator character. */
  def explode(sep: Char, text: String): List[String] =
  {
    val is_sep = (c: Char) => c == sep
    explode(is_sep, text)
  }
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    87
628e039cc34d more specific support for sequence of words;
wenzelm
parents: 56599
diff changeset
    88
  /* Variant of explode that separates at characters accepted by
     Symbol.is_ascii_blank. */
  def explode(text: String): List[String] =
  {
    val is_blank = (c: Char) => Symbol.is_ascii_blank(c)
    explode(is_blank, text)
  }
56599
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    90
}
c4424d8c890f tuned signature -- separate module Word;
wenzelm
parents:
diff changeset
    91