clarified strings of symbols, including ML string literals;
authorwenzelm
Fri, 21 Jun 2013 13:36:10 +0200
changeset 52421 6d93140a206c
parent 52420 81d5072935ac
child 52422 93f3f9a2ae91
clarified strings of symbols, including ML string literals;
src/Doc/IsarImplementation/ML.thy
src/Doc/IsarImplementation/Prelim.thy
src/HOL/Tools/reflection.ML
--- a/src/Doc/IsarImplementation/ML.thy	Thu Jun 20 16:54:05 2013 +0200
+++ b/src/Doc/IsarImplementation/ML.thy	Fri Jun 21 13:36:10 2013 +0200
@@ -1274,6 +1274,122 @@
 *}
 
 
+section {* Strings of symbols \label{sec:symbols} *}
+
+text {* A \emph{symbol} constitutes the smallest textual unit in
+  Isabelle/ML --- raw ML characters are normally not encountered at
+  all!  Isabelle strings consist of a sequence of symbols, represented
+  as a packed string or an exploded list of strings.  Each symbol is
+  in itself a small string, which has either one of the following
+  forms:
+
+  \begin{enumerate}
+
+  \item a single ASCII character ``@{text "c"}'', for example
+  ``\verb,a,'',
+
+  \item a codepoint according to UTF8 (non-ASCII byte sequence),
+
+  \item a regular symbol ``\verb,\,\verb,<,@{text "ident"}\verb,>,'',
+  for example ``\verb,\,\verb,<alpha>,'',
+
+  \item a control symbol ``\verb,\,\verb,<^,@{text "ident"}\verb,>,'',
+  for example ``\verb,\,\verb,<^bold>,'',
+
+  \item a raw symbol ``\verb,\,\verb,<^raw:,@{text text}\verb,>,''
+  where @{text text} consists of printable characters excluding
+  ``\verb,.,'' and ``\verb,>,'', for example
+  ``\verb,\,\verb,<^raw:$\sum_{i = 1}^n$>,'',
+
+  \item a numbered raw control symbol ``\verb,\,\verb,<^raw,@{text
+  n}\verb,>, where @{text n} consists of digits, for example
+  ``\verb,\,\verb,<^raw42>,''.
+
+  \end{enumerate}
+
+  The @{text "ident"} syntax for symbol names is @{text "letter
+  (letter | digit)\<^sup>*"}, where @{text "letter = A..Za..z"} and @{text
+  "digit = 0..9"}.  There are infinitely many regular symbols and
+  control symbols, but a fixed collection of standard symbols is
+  treated specifically.  For example, ``\verb,\,\verb,<alpha>,'' is
+  classified as a letter, which means it may occur within regular
+  Isabelle identifiers.
+
+  The character set underlying Isabelle symbols is 7-bit ASCII, but
+  8-bit character sequences are passed-through unchanged.  Unicode/UCS
+  data in UTF-8 encoding is processed in a non-strict fashion, such
+  that well-formed code sequences are recognized
+  accordingly.\footnote{Note that ISO-Latin-1 differs from UTF-8 only
+  in some special punctuation characters that even have replacements
+  within the standard collection of Isabelle symbols.  Text consisting
+  of ASCII plus accented letters can be processed in either encoding.}
+  Unicode provides its own collection of mathematical symbols, but
+  within the core Isabelle/ML world there is no link to the standard
+  collection of Isabelle regular symbols.
+
+  \medskip Output of Isabelle symbols depends on the print mode
+  \cite{isabelle-isar-ref}.  For example, the standard {\LaTeX}
+  setup of the Isabelle document preparation system would present
+  ``\verb,\,\verb,<alpha>,'' as @{text "\<alpha>"}, and
+  ``\verb,\,\verb,<^bold>,\verb,\,\verb,<alpha>,'' as @{text "\<^bold>\<alpha>"}.
+  On-screen rendering usually works by mapping a finite subset of
+  Isabelle symbols to suitable Unicode characters.
+*}
+
+text %mlref {*
+  \begin{mldecls}
+  @{index_ML_type "Symbol.symbol": string} \\
+  @{index_ML Symbol.explode: "string -> Symbol.symbol list"} \\
+  @{index_ML Symbol.is_letter: "Symbol.symbol -> bool"} \\
+  @{index_ML Symbol.is_digit: "Symbol.symbol -> bool"} \\
+  @{index_ML Symbol.is_quasi: "Symbol.symbol -> bool"} \\
+  @{index_ML Symbol.is_blank: "Symbol.symbol -> bool"} \\
+  \end{mldecls}
+  \begin{mldecls}
+  @{index_ML_type "Symbol.sym"} \\
+  @{index_ML Symbol.decode: "Symbol.symbol -> Symbol.sym"} \\
+  \end{mldecls}
+
+  \begin{description}
+
+  \item Type @{ML_type "Symbol.symbol"} represents individual Isabelle
+  symbols.
+
+  \item @{ML "Symbol.explode"}~@{text "str"} produces a symbol list
+  from the packed form.  This function supersedes @{ML
+  "String.explode"} for virtually all purposes of manipulating text in
+  Isabelle!\footnote{The runtime overhead for exploded strings is
+  mainly that of the list structure: individual symbols that happen to
+  be a singleton string do not require extra memory in Poly/ML.}
+
+  \item @{ML "Symbol.is_letter"}, @{ML "Symbol.is_digit"}, @{ML
+  "Symbol.is_quasi"}, @{ML "Symbol.is_blank"} classify standard
+  symbols according to fixed syntactic conventions of Isabelle, cf.\
+  \cite{isabelle-isar-ref}.
+
+  \item Type @{ML_type "Symbol.sym"} is a concrete datatype that
+  represents the different kinds of symbols explicitly, with
+  constructors @{ML "Symbol.Char"}, @{ML "Symbol.Sym"}, @{ML
+  "Symbol.UTF8"}, @{ML "Symbol.Ctrl"}, @{ML "Symbol.Raw"}.
+
+  \item @{ML "Symbol.decode"} converts the string representation of a
+  symbol into the datatype version.
+
+  \end{description}
+
+  \paragraph{Historical note.} In the original SML90 standard the
+  primitive ML type @{ML_type char} did not exists, and @{ML_text
+  "explode: string -> string list"} produced a list of singleton
+  strings like @{ML "raw_explode: string -> string list"} in
+  Isabelle/ML today.  When SML97 came out, Isabelle did not adopt its
+  somewhat anachronistic 8-bit or 16-bit characters, but the idea of
+  exploding a string into a list of small strings was extended to
+  ``symbols'' as explained above.  Thus Isabelle sources can refer to
+  an infinite store of user-defined symbols, without having to worry
+  about the multitude of Unicode encodings that have emerged over the
+  years.  *}
+
+
 section {* Basic data types *}
 
 text {* The basis library proposal of SML97 needs to be treated with
@@ -1305,6 +1421,61 @@
 *}
 
 
+subsection {* Strings *}
+
+text %mlref {*
+  \begin{mldecls}
+  @{index_ML_type string} \\
+  \end{mldecls}
+
+  \begin{description}
+
+  \item Type @{ML_type string} represents immutable vectors of 8-bit
+  characters.  There are operations in SML to convert back and forth
+  to actual byte vectors, which are seldom used.
+
+  This historically important raw text representation is used for
+  Isabelle-specific purposes with the following implicit substructures
+  packed into the string content:
+
+  \begin{enumerate}
+
+  \item sequence of Isabelle symbols (see also \secref{sec:symbols}),
+  with @{ML Symbol.explode} as key operation;
+
+  \item XML tree structure via YXML (see also \cite{isabelle-sys}),
+  with @{ML YXML.parse_body} as key operation.
+
+  \end{enumerate}
+
+  Note that Isabelle/ML string literals may refer Isabelle symbols
+  like ``\verb,\,\verb,<alpha>,'' natively, \emph{without} escaping
+  the backslash.  This is a consequence of Isabelle treating all
+  source text as strings of symbols, instead of raw characters.
+
+  \end{description}
+*}
+
+text %mlex {* The subsequent example illustrates the difference of
+  physical addressing of bytes versus logical addressing of symbols in
+  Isabelle strings.
+*}
+
+ML_val {*
+  val s = "\<A>";
+
+  @{assert} (length (Symbol.explode s) = 1);
+  @{assert} (size s = 4);
+*}
+
+text {* Note that in Unicode renderings of the symbol @{text "\<A>"},
+  variations of encodings like UTF-8 or UTF-16 pose delicate questions
+  about the multi-byte representations its codepoint, which is outside
+  of the 16-bit address space of the original Unicode standard from
+  the 1990-ies.  In Isabelle/ML it is just ``\verb,\,\verb,<A>,''
+  literally, using plain ASCII characters beyond any doubts. *}
+
+
 subsection {* Integers *}
 
 text %mlref {*
--- a/src/Doc/IsarImplementation/Prelim.thy	Thu Jun 20 16:54:05 2013 +0200
+++ b/src/Doc/IsarImplementation/Prelim.thy	Fri Jun 21 13:36:10 2013 +0200
@@ -681,7 +681,7 @@
   qualifier @{text "Foo.bar"} and base name @{text "baz"}.  The
   individual constituents of a name may have further substructure,
   e.g.\ the string ``\verb,\,\verb,<alpha>,'' encodes as a single
-  symbol.
+  symbol (\secref{sec:symbols}).
 
   \medskip Subsequently, we shall introduce specific categories of
   names.  Roughly speaking these correspond to logical entities as
@@ -702,120 +702,6 @@
 *}
 
 
-subsection {* Strings of symbols \label{sec:symbols} *}
-
-text {* A \emph{symbol} constitutes the smallest textual unit in
-  Isabelle --- raw ML characters are normally not encountered at all!
-  Isabelle strings consist of a sequence of symbols, represented as a
-  packed string or an exploded list of strings.  Each symbol is in
-  itself a small string, which has either one of the following forms:
-
-  \begin{enumerate}
-
-  \item a single ASCII character ``@{text "c"}'', for example
-  ``\verb,a,'',
-
-  \item a codepoint according to UTF8 (non-ASCII byte sequence),
-
-  \item a regular symbol ``\verb,\,\verb,<,@{text "ident"}\verb,>,'',
-  for example ``\verb,\,\verb,<alpha>,'',
-
-  \item a control symbol ``\verb,\,\verb,<^,@{text "ident"}\verb,>,'',
-  for example ``\verb,\,\verb,<^bold>,'',
-
-  \item a raw symbol ``\verb,\,\verb,<^raw:,@{text text}\verb,>,''
-  where @{text text} consists of printable characters excluding
-  ``\verb,.,'' and ``\verb,>,'', for example
-  ``\verb,\,\verb,<^raw:$\sum_{i = 1}^n$>,'',
-
-  \item a numbered raw control symbol ``\verb,\,\verb,<^raw,@{text
-  n}\verb,>, where @{text n} consists of digits, for example
-  ``\verb,\,\verb,<^raw42>,''.
-
-  \end{enumerate}
-
-  The @{text "ident"} syntax for symbol names is @{text "letter
-  (letter | digit)\<^sup>*"}, where @{text "letter = A..Za..z"} and @{text
-  "digit = 0..9"}.  There are infinitely many regular symbols and
-  control symbols, but a fixed collection of standard symbols is
-  treated specifically.  For example, ``\verb,\,\verb,<alpha>,'' is
-  classified as a letter, which means it may occur within regular
-  Isabelle identifiers.
-
-  The character set underlying Isabelle symbols is 7-bit ASCII, but
-  8-bit character sequences are passed-through unchanged.  Unicode/UCS
-  data in UTF-8 encoding is processed in a non-strict fashion, such
-  that well-formed code sequences are recognized
-  accordingly.\footnote{Note that ISO-Latin-1 differs from UTF-8 only
-  in some special punctuation characters that even have replacements
-  within the standard collection of Isabelle symbols.  Text consisting
-  of ASCII plus accented letters can be processed in either encoding.}
-  Unicode provides its own collection of mathematical symbols, but
-  within the core Isabelle/ML world there is no link to the standard
-  collection of Isabelle regular symbols.
-
-  \medskip Output of Isabelle symbols depends on the print mode
-  (\cite{isabelle-isar-ref}).  For example, the standard {\LaTeX}
-  setup of the Isabelle document preparation system would present
-  ``\verb,\,\verb,<alpha>,'' as @{text "\<alpha>"}, and
-  ``\verb,\,\verb,<^bold>,\verb,\,\verb,<alpha>,'' as @{text "\<^bold>\<alpha>"}.
-  On-screen rendering usually works by mapping a finite subset of
-  Isabelle symbols to suitable Unicode characters.
-*}
-
-text %mlref {*
-  \begin{mldecls}
-  @{index_ML_type "Symbol.symbol": string} \\
-  @{index_ML Symbol.explode: "string -> Symbol.symbol list"} \\
-  @{index_ML Symbol.is_letter: "Symbol.symbol -> bool"} \\
-  @{index_ML Symbol.is_digit: "Symbol.symbol -> bool"} \\
-  @{index_ML Symbol.is_quasi: "Symbol.symbol -> bool"} \\
-  @{index_ML Symbol.is_blank: "Symbol.symbol -> bool"} \\
-  \end{mldecls}
-  \begin{mldecls}
-  @{index_ML_type "Symbol.sym"} \\
-  @{index_ML Symbol.decode: "Symbol.symbol -> Symbol.sym"} \\
-  \end{mldecls}
-
-  \begin{description}
-
-  \item Type @{ML_type "Symbol.symbol"} represents individual Isabelle
-  symbols.
-
-  \item @{ML "Symbol.explode"}~@{text "str"} produces a symbol list
-  from the packed form.  This function supersedes @{ML
-  "String.explode"} for virtually all purposes of manipulating text in
-  Isabelle!\footnote{The runtime overhead for exploded strings is
-  mainly that of the list structure: individual symbols that happen to
-  be a singleton string do not require extra memory in Poly/ML.}
-
-  \item @{ML "Symbol.is_letter"}, @{ML "Symbol.is_digit"}, @{ML
-  "Symbol.is_quasi"}, @{ML "Symbol.is_blank"} classify standard
-  symbols according to fixed syntactic conventions of Isabelle, cf.\
-  \cite{isabelle-isar-ref}.
-
-  \item Type @{ML_type "Symbol.sym"} is a concrete datatype that
-  represents the different kinds of symbols explicitly, with
-  constructors @{ML "Symbol.Char"}, @{ML "Symbol.Sym"}, @{ML
-  "Symbol.UTF8"}, @{ML "Symbol.Ctrl"}, @{ML "Symbol.Raw"}.
-
-  \item @{ML "Symbol.decode"} converts the string representation of a
-  symbol into the datatype version.
-
-  \end{description}
-
-  \paragraph{Historical note.} In the original SML90 standard the
-  primitive ML type @{ML_type char} did not exists, and the @{ML_text
-  "explode: string -> string list"} operation would produce a list of
-  singleton strings as does @{ML "raw_explode: string -> string list"}
-  in Isabelle/ML today.  When SML97 came out, Isabelle did not adopt
-  its slightly anachronistic 8-bit characters, but the idea of
-  exploding a string into a list of small strings was extended to
-  ``symbols'' as explained above.  Thus Isabelle sources can refer to
-  an infinite store of user-defined symbols, without having to worry
-  about the multitude of Unicode encodings.  *}
-
-
 subsection {* Basic names \label{sec:basic-name} *}
 
 text {*