# HG changeset patch # User wenzelm # Date 1371814570 -7200 # Node ID 6d93140a206cb1c4800d0d4305abf731149558ca # Parent 81d5072935acf3ce28b04f3ce8f64f90a81cd29c clarified strings of symbols, including ML string literals; diff -r 81d5072935ac -r 6d93140a206c src/Doc/IsarImplementation/ML.thy --- a/src/Doc/IsarImplementation/ML.thy Thu Jun 20 16:54:05 2013 +0200 +++ b/src/Doc/IsarImplementation/ML.thy Fri Jun 21 13:36:10 2013 +0200 @@ -1274,6 +1274,122 @@ *} +section {* Strings of symbols \label{sec:symbols} *} + +text {* A \emph{symbol} constitutes the smallest textual unit in + Isabelle/ML --- raw ML characters are normally not encountered at + all! Isabelle strings consist of a sequence of symbols, represented + as a packed string or an exploded list of strings. Each symbol is + in itself a small string, which has either one of the following + forms: + + \begin{enumerate} + + \item a single ASCII character ``@{text "c"}'', for example + ``\verb,a,'', + + \item a codepoint according to UTF8 (non-ASCII byte sequence), + + \item a regular symbol ``\verb,\,\verb,<,@{text "ident"}\verb,>,'', + for example ``\verb,\,\verb,,'', + + \item a control symbol ``\verb,\,\verb,<^,@{text "ident"}\verb,>,'', + for example ``\verb,\,\verb,<^bold>,'', + + \item a raw symbol ``\verb,\,\verb,<^raw:,@{text text}\verb,>,'' + where @{text text} consists of printable characters excluding + ``\verb,.,'' and ``\verb,>,'', for example + ``\verb,\,\verb,<^raw:$\sum_{i = 1}^n$>,'', + + \item a numbered raw control symbol ``\verb,\,\verb,<^raw,@{text + n}\verb,>, where @{text n} consists of digits, for example + ``\verb,\,\verb,<^raw42>,''. + + \end{enumerate} + + The @{text "ident"} syntax for symbol names is @{text "letter + (letter | digit)\<^sup>*"}, where @{text "letter = A..Za..z"} and @{text + "digit = 0..9"}. There are infinitely many regular symbols and + control symbols, but a fixed collection of standard symbols is + treated specifically. For example, ``\verb,\,\verb,,'' is + classified as a letter, which means it may occur within regular + Isabelle identifiers. + + The character set underlying Isabelle symbols is 7-bit ASCII, but + 8-bit character sequences are passed-through unchanged. Unicode/UCS + data in UTF-8 encoding is processed in a non-strict fashion, such + that well-formed code sequences are recognized + accordingly.\footnote{Note that ISO-Latin-1 differs from UTF-8 only + in some special punctuation characters that even have replacements + within the standard collection of Isabelle symbols. Text consisting + of ASCII plus accented letters can be processed in either encoding.} + Unicode provides its own collection of mathematical symbols, but + within the core Isabelle/ML world there is no link to the standard + collection of Isabelle regular symbols. + + \medskip Output of Isabelle symbols depends on the print mode + \cite{isabelle-isar-ref}. For example, the standard {\LaTeX} + setup of the Isabelle document preparation system would present + ``\verb,\,\verb,,'' as @{text "\"}, and + ``\verb,\,\verb,<^bold>,\verb,\,\verb,,'' as @{text "\<^bold>\"}. + On-screen rendering usually works by mapping a finite subset of + Isabelle symbols to suitable Unicode characters. +*} + +text %mlref {* + \begin{mldecls} + @{index_ML_type "Symbol.symbol": string} \\ + @{index_ML Symbol.explode: "string -> Symbol.symbol list"} \\ + @{index_ML Symbol.is_letter: "Symbol.symbol -> bool"} \\ + @{index_ML Symbol.is_digit: "Symbol.symbol -> bool"} \\ + @{index_ML Symbol.is_quasi: "Symbol.symbol -> bool"} \\ + @{index_ML Symbol.is_blank: "Symbol.symbol -> bool"} \\ + \end{mldecls} + \begin{mldecls} + @{index_ML_type "Symbol.sym"} \\ + @{index_ML Symbol.decode: "Symbol.symbol -> Symbol.sym"} \\ + \end{mldecls} + + \begin{description} + + \item Type @{ML_type "Symbol.symbol"} represents individual Isabelle + symbols. + + \item @{ML "Symbol.explode"}~@{text "str"} produces a symbol list + from the packed form. This function supersedes @{ML + "String.explode"} for virtually all purposes of manipulating text in + Isabelle!\footnote{The runtime overhead for exploded strings is + mainly that of the list structure: individual symbols that happen to + be a singleton string do not require extra memory in Poly/ML.} + + \item @{ML "Symbol.is_letter"}, @{ML "Symbol.is_digit"}, @{ML + "Symbol.is_quasi"}, @{ML "Symbol.is_blank"} classify standard + symbols according to fixed syntactic conventions of Isabelle, cf.\ + \cite{isabelle-isar-ref}. + + \item Type @{ML_type "Symbol.sym"} is a concrete datatype that + represents the different kinds of symbols explicitly, with + constructors @{ML "Symbol.Char"}, @{ML "Symbol.Sym"}, @{ML + "Symbol.UTF8"}, @{ML "Symbol.Ctrl"}, @{ML "Symbol.Raw"}. + + \item @{ML "Symbol.decode"} converts the string representation of a + symbol into the datatype version. + + \end{description} + + \paragraph{Historical note.} In the original SML90 standard the + primitive ML type @{ML_type char} did not exists, and @{ML_text + "explode: string -> string list"} produced a list of singleton + strings like @{ML "raw_explode: string -> string list"} in + Isabelle/ML today. When SML97 came out, Isabelle did not adopt its + somewhat anachronistic 8-bit or 16-bit characters, but the idea of + exploding a string into a list of small strings was extended to + ``symbols'' as explained above. Thus Isabelle sources can refer to + an infinite store of user-defined symbols, without having to worry + about the multitude of Unicode encodings that have emerged over the + years. *} + + section {* Basic data types *} text {* The basis library proposal of SML97 needs to be treated with @@ -1305,6 +1421,61 @@ *} +subsection {* Strings *} + +text %mlref {* + \begin{mldecls} + @{index_ML_type string} \\ + \end{mldecls} + + \begin{description} + + \item Type @{ML_type string} represents immutable vectors of 8-bit + characters. There are operations in SML to convert back and forth + to actual byte vectors, which are seldom used. + + This historically important raw text representation is used for + Isabelle-specific purposes with the following implicit substructures + packed into the string content: + + \begin{enumerate} + + \item sequence of Isabelle symbols (see also \secref{sec:symbols}), + with @{ML Symbol.explode} as key operation; + + \item XML tree structure via YXML (see also \cite{isabelle-sys}), + with @{ML YXML.parse_body} as key operation. + + \end{enumerate} + + Note that Isabelle/ML string literals may refer Isabelle symbols + like ``\verb,\,\verb,,'' natively, \emph{without} escaping + the backslash. This is a consequence of Isabelle treating all + source text as strings of symbols, instead of raw characters. + + \end{description} +*} + +text %mlex {* The subsequent example illustrates the difference of + physical addressing of bytes versus logical addressing of symbols in + Isabelle strings. +*} + +ML_val {* + val s = "\"; + + @{assert} (length (Symbol.explode s) = 1); + @{assert} (size s = 4); +*} + +text {* Note that in Unicode renderings of the symbol @{text "\"}, + variations of encodings like UTF-8 or UTF-16 pose delicate questions + about the multi-byte representations its codepoint, which is outside + of the 16-bit address space of the original Unicode standard from + the 1990-ies. In Isabelle/ML it is just ``\verb,\,\verb,,'' + literally, using plain ASCII characters beyond any doubts. *} + + subsection {* Integers *} text %mlref {* diff -r 81d5072935ac -r 6d93140a206c src/Doc/IsarImplementation/Prelim.thy --- a/src/Doc/IsarImplementation/Prelim.thy Thu Jun 20 16:54:05 2013 +0200 +++ b/src/Doc/IsarImplementation/Prelim.thy Fri Jun 21 13:36:10 2013 +0200 @@ -681,7 +681,7 @@ qualifier @{text "Foo.bar"} and base name @{text "baz"}. The individual constituents of a name may have further substructure, e.g.\ the string ``\verb,\,\verb,,'' encodes as a single - symbol. + symbol (\secref{sec:symbols}). \medskip Subsequently, we shall introduce specific categories of names. Roughly speaking these correspond to logical entities as @@ -702,120 +702,6 @@ *} -subsection {* Strings of symbols \label{sec:symbols} *} - -text {* A \emph{symbol} constitutes the smallest textual unit in - Isabelle --- raw ML characters are normally not encountered at all! - Isabelle strings consist of a sequence of symbols, represented as a - packed string or an exploded list of strings. Each symbol is in - itself a small string, which has either one of the following forms: - - \begin{enumerate} - - \item a single ASCII character ``@{text "c"}'', for example - ``\verb,a,'', - - \item a codepoint according to UTF8 (non-ASCII byte sequence), - - \item a regular symbol ``\verb,\,\verb,<,@{text "ident"}\verb,>,'', - for example ``\verb,\,\verb,,'', - - \item a control symbol ``\verb,\,\verb,<^,@{text "ident"}\verb,>,'', - for example ``\verb,\,\verb,<^bold>,'', - - \item a raw symbol ``\verb,\,\verb,<^raw:,@{text text}\verb,>,'' - where @{text text} consists of printable characters excluding - ``\verb,.,'' and ``\verb,>,'', for example - ``\verb,\,\verb,<^raw:$\sum_{i = 1}^n$>,'', - - \item a numbered raw control symbol ``\verb,\,\verb,<^raw,@{text - n}\verb,>, where @{text n} consists of digits, for example - ``\verb,\,\verb,<^raw42>,''. - - \end{enumerate} - - The @{text "ident"} syntax for symbol names is @{text "letter - (letter | digit)\<^sup>*"}, where @{text "letter = A..Za..z"} and @{text - "digit = 0..9"}. There are infinitely many regular symbols and - control symbols, but a fixed collection of standard symbols is - treated specifically. For example, ``\verb,\,\verb,,'' is - classified as a letter, which means it may occur within regular - Isabelle identifiers. - - The character set underlying Isabelle symbols is 7-bit ASCII, but - 8-bit character sequences are passed-through unchanged. Unicode/UCS - data in UTF-8 encoding is processed in a non-strict fashion, such - that well-formed code sequences are recognized - accordingly.\footnote{Note that ISO-Latin-1 differs from UTF-8 only - in some special punctuation characters that even have replacements - within the standard collection of Isabelle symbols. Text consisting - of ASCII plus accented letters can be processed in either encoding.} - Unicode provides its own collection of mathematical symbols, but - within the core Isabelle/ML world there is no link to the standard - collection of Isabelle regular symbols. - - \medskip Output of Isabelle symbols depends on the print mode - (\cite{isabelle-isar-ref}). For example, the standard {\LaTeX} - setup of the Isabelle document preparation system would present - ``\verb,\,\verb,,'' as @{text "\"}, and - ``\verb,\,\verb,<^bold>,\verb,\,\verb,,'' as @{text "\<^bold>\"}. - On-screen rendering usually works by mapping a finite subset of - Isabelle symbols to suitable Unicode characters. -*} - -text %mlref {* - \begin{mldecls} - @{index_ML_type "Symbol.symbol": string} \\ - @{index_ML Symbol.explode: "string -> Symbol.symbol list"} \\ - @{index_ML Symbol.is_letter: "Symbol.symbol -> bool"} \\ - @{index_ML Symbol.is_digit: "Symbol.symbol -> bool"} \\ - @{index_ML Symbol.is_quasi: "Symbol.symbol -> bool"} \\ - @{index_ML Symbol.is_blank: "Symbol.symbol -> bool"} \\ - \end{mldecls} - \begin{mldecls} - @{index_ML_type "Symbol.sym"} \\ - @{index_ML Symbol.decode: "Symbol.symbol -> Symbol.sym"} \\ - \end{mldecls} - - \begin{description} - - \item Type @{ML_type "Symbol.symbol"} represents individual Isabelle - symbols. - - \item @{ML "Symbol.explode"}~@{text "str"} produces a symbol list - from the packed form. This function supersedes @{ML - "String.explode"} for virtually all purposes of manipulating text in - Isabelle!\footnote{The runtime overhead for exploded strings is - mainly that of the list structure: individual symbols that happen to - be a singleton string do not require extra memory in Poly/ML.} - - \item @{ML "Symbol.is_letter"}, @{ML "Symbol.is_digit"}, @{ML - "Symbol.is_quasi"}, @{ML "Symbol.is_blank"} classify standard - symbols according to fixed syntactic conventions of Isabelle, cf.\ - \cite{isabelle-isar-ref}. - - \item Type @{ML_type "Symbol.sym"} is a concrete datatype that - represents the different kinds of symbols explicitly, with - constructors @{ML "Symbol.Char"}, @{ML "Symbol.Sym"}, @{ML - "Symbol.UTF8"}, @{ML "Symbol.Ctrl"}, @{ML "Symbol.Raw"}. - - \item @{ML "Symbol.decode"} converts the string representation of a - symbol into the datatype version. - - \end{description} - - \paragraph{Historical note.} In the original SML90 standard the - primitive ML type @{ML_type char} did not exists, and the @{ML_text - "explode: string -> string list"} operation would produce a list of - singleton strings as does @{ML "raw_explode: string -> string list"} - in Isabelle/ML today. When SML97 came out, Isabelle did not adopt - its slightly anachronistic 8-bit characters, but the idea of - exploding a string into a list of small strings was extended to - ``symbols'' as explained above. Thus Isabelle sources can refer to - an infinite store of user-defined symbols, without having to worry - about the multitude of Unicode encodings. *} - - subsection {* Basic names \label{sec:basic-name} *} text {*