src/Doc/Codegen/Foundations.thy
author blanchet
Tue Nov 07 15:16:42 2017 +0100 (20 months ago)
changeset 67022 49309fe530fd
parent 61781 e1e6bb36b27a
child 68254 3a7f257dcac7
permissions -rw-r--r--
more robust parsing for THF proofs (esp. polymorphic Leo-III proofs)
haftmann@38405
     1
theory Foundations
haftmann@28419
     2
imports Introduction
haftmann@28213
     3
begin
haftmann@28213
     4
haftmann@59377
     5
section \<open>Code generation foundations \label{sec:foundations}\<close>
haftmann@28419
     6
haftmann@59377
     7
subsection \<open>Code generator architecture \label{sec:architecture}\<close>
haftmann@28419
     8
haftmann@59377
     9
text \<open>
haftmann@38437
    10
  The code generator is actually a framework consisting of different
haftmann@38437
    11
  components which can be customised individually.
haftmann@38437
    12
haftmann@38437
    13
  Conceptually all components operate on Isabelle's logic framework
haftmann@38437
    14
  @{theory Pure}.  Practically, the object logic @{theory HOL}
haftmann@38437
    15
  provides the necessary facilities to make use of the code generator,
haftmann@38437
    16
  mainly since it is an extension of @{theory Pure}.
haftmann@38437
    17
haftmann@38437
    18
  The constellation of the different components is visualized in the
haftmann@38437
    19
  following picture.
haftmann@38437
    20
haftmann@38437
    21
  \begin{figure}[h]
wenzelm@52742
    22
    \def\sys#1{\emph{#1}}
wenzelm@52742
    23
    \begin{tikzpicture}[x = 4cm, y = 1cm]
wenzelm@52742
    24
      \tikzstyle positive=[color = black, fill = white];
wenzelm@52742
    25
      \tikzstyle negative=[color = white, fill = black];
wenzelm@52742
    26
      \tikzstyle entity=[rounded corners, draw, thick];
wenzelm@52742
    27
      \tikzstyle process=[ellipse, draw, thick];
wenzelm@52742
    28
      \tikzstyle arrow=[-stealth, semithick];
wenzelm@52742
    29
      \node (spec) at (0, 3) [entity, positive] {specification tools};
wenzelm@52742
    30
      \node (user) at (1, 3) [entity, positive] {user proofs};
wenzelm@52742
    31
      \node (spec_user_join) at (0.5, 3) [shape=coordinate] {};
wenzelm@52742
    32
      \node (raw) at (0.5, 4) [entity, positive] {raw code equations};
wenzelm@52742
    33
      \node (pre) at (1.5, 4) [process, positive] {preprocessing};
wenzelm@52742
    34
      \node (eqn) at (2.5, 4) [entity, positive] {code equations};
wenzelm@52742
    35
      \node (iml) at (0.5, 0) [entity, positive] {intermediate program};
wenzelm@52742
    36
      \node (seri) at (1.5, 0) [process, positive] {serialisation};
wenzelm@52742
    37
      \node (SML) at (2.5, 3) [entity, positive] {\sys{SML}};
wenzelm@52742
    38
      \node (OCaml) at (2.5, 2) [entity, positive] {\sys{OCaml}};
wenzelm@52742
    39
      \node (Haskell) at (2.5, 1) [entity, positive] {\sys{Haskell}};
wenzelm@52742
    40
      \node (Scala) at (2.5, 0) [entity, positive] {\sys{Scala}};
wenzelm@52742
    41
      \draw [semithick] (spec) -- (spec_user_join);
wenzelm@52742
    42
      \draw [semithick] (user) -- (spec_user_join);
wenzelm@52742
    43
      \draw [-diamond, semithick] (spec_user_join) -- (raw);
wenzelm@52742
    44
      \draw [arrow] (raw) -- (pre);
wenzelm@52742
    45
      \draw [arrow] (pre) -- (eqn);
wenzelm@52742
    46
      \draw [arrow] (eqn) -- node (transl) [process, positive] {translation} (iml);
wenzelm@52742
    47
      \draw [arrow] (iml) -- (seri);
wenzelm@52742
    48
      \draw [arrow] (seri) -- (SML);
wenzelm@52742
    49
      \draw [arrow] (seri) -- (OCaml);
wenzelm@52742
    50
      \draw [arrow] (seri) -- (Haskell);
wenzelm@52742
    51
      \draw [arrow] (seri) -- (Scala);
wenzelm@52742
    52
    \end{tikzpicture}
haftmann@38437
    53
    \caption{Code generator architecture}
haftmann@38437
    54
    \label{fig:arch}
haftmann@38437
    55
  \end{figure}
haftmann@38437
    56
haftmann@38437
    57
  \noindent Central to code generation is the notion of \emph{code
haftmann@38437
    58
  equations}.  A code equation as a first approximation is a theorem
wenzelm@53015
    59
  of the form @{text "f t\<^sub>1 t\<^sub>2 \<dots> t\<^sub>n \<equiv> t"} (an equation headed by a
wenzelm@53015
    60
  constant @{text f} with arguments @{text "t\<^sub>1 t\<^sub>2 \<dots> t\<^sub>n"} and right
haftmann@38437
    61
  hand side @{text t}).
haftmann@38437
    62
haftmann@38437
    63
  \begin{itemize}
haftmann@38437
    64
haftmann@38437
    65
    \item Starting point of code generation is a collection of (raw)
haftmann@38437
    66
      code equations in a theory. It is not relevant where they stem
haftmann@38437
    67
      from, but typically they were either produced by specification
haftmann@38437
    68
      tools or proved explicitly by the user.
haftmann@38437
    69
      
haftmann@38437
    70
    \item These raw code equations can be subjected to theorem
haftmann@38437
    71
      transformations.  This \qn{preprocessor} (see
haftmann@38437
    72
      \secref{sec:preproc}) can apply the full expressiveness of
haftmann@38437
    73
      ML-based theorem transformations to code generation.  The result
haftmann@38437
    74
      of preprocessing is a structured collection of code equations.
haftmann@38437
    75
haftmann@38437
    76
    \item These code equations are \qn{translated} to a program in an
haftmann@38437
    77
      abstract intermediate language.  Think of it as a kind of
haftmann@38437
    78
      \qt{Mini-Haskell} with four \qn{statements}: @{text data} (for
haftmann@38437
    79
      datatypes), @{text fun} (stemming from code equations), also
haftmann@38437
    80
      @{text class} and @{text inst} (for type classes).
haftmann@38437
    81
haftmann@38437
    82
    \item Finally, the abstract program is \qn{serialised} into
haftmann@38437
    83
      concrete source code of a target language.  This step only
haftmann@38437
    84
      produces concrete syntax but does not change the program in
haftmann@38437
    85
      essence; all conceptual transformations occur in the translation
haftmann@38437
    86
      step.
haftmann@38437
    87
haftmann@38437
    88
  \end{itemize}
haftmann@38437
    89
haftmann@38437
    90
  \noindent From these steps, only the last two are carried out
haftmann@38437
    91
  outside the logic; by keeping this layer as thin as possible, the
haftmann@38437
    92
  amount of code to trust is kept to a minimum.
haftmann@59377
    93
\<close>
haftmann@28419
    94
haftmann@28419
    95
haftmann@59377
    96
subsection \<open>The pre- and postprocessor \label{sec:preproc}\<close>
haftmann@28419
    97
haftmann@59377
    98
text \<open>
haftmann@38437
    99
  Before selected function theorems are turned into abstract code, a
haftmann@38437
   100
  chain of definitional transformation steps is carried out:
haftmann@38437
   101
  \emph{preprocessing}.  The preprocessor consists of two
haftmann@38437
   102
  components: a \emph{simpset} and \emph{function transformers}.
haftmann@28419
   103
haftmann@54721
   104
  The preprocessor simpset has a disparate brother, the
haftmann@54721
   105
  \emph{postprocessor simpset}.
haftmann@54721
   106
  In the theory-to-code scenario depicted in the picture 
haftmann@54721
   107
  above, it plays no role.  But if generated code is used
haftmann@54721
   108
  to evaluate expressions (cf.~\secref{sec:evaluation}), the
haftmann@54721
   109
  postprocessor simpset is applied to the resulting expression before this
haftmann@54721
   110
  is turned back.
haftmann@54721
   111
haftmann@54721
   112
  The pre- and postprocessor \emph{simpsets} can apply the
haftmann@54721
   113
  full generality of the Isabelle
haftmann@38437
   114
  simplifier.  Due to the interpretation of theorems as code
haftmann@32000
   115
  equations, rewrites are applied to the right hand side and the
haftmann@32000
   116
  arguments of the left hand side of an equation, but never to the
haftmann@54721
   117
  constant heading the left hand side.
haftmann@28213
   118
haftmann@61781
   119
  Pre- and postprocessor can be setup to transfer between
haftmann@54721
   120
  expressions suitable for logical reasoning and expressions 
haftmann@54721
   121
  suitable for execution.  As example, take list membership; logically
haftmann@61781
   122
  it is expressed as @{term "x \<in> set xs"}.  But for execution
haftmann@54721
   123
  the intermediate set is not desirable.  Hence the following
haftmann@54721
   124
  specification:
haftmann@59377
   125
\<close>
haftmann@28419
   126
haftmann@54721
   127
definition %quote member :: "'a list \<Rightarrow> 'a \<Rightarrow> bool"
haftmann@54721
   128
where
haftmann@54721
   129
  [code_abbrev]: "member xs x \<longleftrightarrow> x \<in> set xs"
haftmann@28419
   130
haftmann@59377
   131
text \<open>
haftmann@54721
   132
  \noindent The \emph{@{attribute code_abbrev} attribute} declares
haftmann@54721
   133
  its theorem a rewrite rule for the postprocessor and the symmetric
haftmann@54721
   134
  of its theorem as rewrite rule for the preprocessor.  Together,
haftmann@54721
   135
  this has the effect that expressions @{thm (rhs) member_def [no_vars]}
haftmann@54721
   136
  are replaced by @{thm (lhs) member_def [no_vars]} in generated code, but
haftmann@54721
   137
  are turned back into @{thm (rhs) member_def [no_vars]} if generated
haftmann@54721
   138
  code is used for evaluation.
haftmann@38437
   139
haftmann@54721
   140
  Rewrite rules for pre- or postprocessor may be
haftmann@54721
   141
  declared independently using \emph{@{attribute code_unfold}}
haftmann@54721
   142
  or \emph{@{attribute code_post}} respectively.
haftmann@38437
   143
haftmann@54721
   144
  \emph{Function transformers} provide a very general
haftmann@38437
   145
  interface, transforming a list of function theorems to another list
haftmann@38437
   146
  of function theorems, provided that neither the heading constant nor
wenzelm@61076
   147
  its type change.  The @{term "0::nat"} / @{const Suc} pattern
haftmann@51171
   148
  used in theory @{text Code_Abstract_Nat} (see \secref{abstract_nat})
haftmann@51171
   149
  uses this interface.
haftmann@28419
   150
haftmann@54721
   151
  \noindent The current setup of the pre- and postprocessor may be inspected
haftmann@38505
   152
  using the @{command_def print_codeproc} command.  @{command_def
haftmann@38505
   153
  code_thms} (see \secref{sec:equations}) provides a convenient
haftmann@38505
   154
  mechanism to inspect the impact of a preprocessor setup on code
haftmann@38505
   155
  equations.
haftmann@59377
   156
\<close>
haftmann@28419
   157
haftmann@38437
   158
haftmann@59377
   159
subsection \<open>Understanding code equations \label{sec:equations}\<close>
haftmann@28419
   160
haftmann@59377
   161
text \<open>
haftmann@38437
   162
  As told in \secref{sec:principle}, the notion of code equations is
haftmann@38437
   163
  vital to code generation.  Indeed most problems which occur in
haftmann@38437
   164
  practice can be resolved by an inspection of the underlying code
haftmann@38437
   165
  equations.
haftmann@28419
   166
haftmann@38437
   167
  It is possible to exchange the default code equations for constants
haftmann@38437
   168
  by explicitly proving alternative ones:
haftmann@59377
   169
\<close>
haftmann@28419
   170
haftmann@38437
   171
lemma %quote [code]:
haftmann@38437
   172
  "dequeue (AQueue xs []) =
haftmann@38437
   173
     (if xs = [] then (None, AQueue [] [])
haftmann@38437
   174
       else dequeue (AQueue [] (rev xs)))"
haftmann@38437
   175
  "dequeue (AQueue xs (y # ys)) =
haftmann@38437
   176
     (Some y, AQueue xs ys)"
haftmann@38437
   177
  by (cases xs, simp_all) (cases "rev xs", simp_all)
haftmann@28213
   178
haftmann@59377
   179
text \<open>
haftmann@38437
   180
  \noindent The annotation @{text "[code]"} is an @{text attribute}
haftmann@38437
   181
  which states that the given theorems should be considered as code
haftmann@38437
   182
  equations for a @{text fun} statement -- the corresponding constant
haftmann@38437
   183
  is determined syntactically.  The resulting code:
haftmann@59377
   184
\<close>
haftmann@29794
   185
haftmann@59377
   186
text %quotetypewriter \<open>
haftmann@39683
   187
  @{code_stmts dequeue (consts) dequeue (Haskell)}
haftmann@59377
   188
\<close>
haftmann@29794
   189
haftmann@59377
   190
text \<open>
haftmann@38437
   191
  \noindent You may note that the equality test @{term "xs = []"} has
haftmann@38437
   192
  been replaced by the predicate @{term "List.null xs"}.  This is due
haftmann@38437
   193
  to the default setup of the \qn{preprocessor}.
haftmann@38437
   194
haftmann@38437
   195
  This possibility to select arbitrary code equations is the key
haftmann@38437
   196
  technique for program and datatype refinement (see
haftmann@39677
   197
  \secref{sec:refinement}).
haftmann@38437
   198
haftmann@38437
   199
  Due to the preprocessor, there is the distinction of raw code
haftmann@38437
   200
  equations (before preprocessing) and code equations (after
haftmann@38437
   201
  preprocessing).
haftmann@38437
   202
haftmann@38505
   203
  The first can be listed (among other data) using the @{command_def
haftmann@38505
   204
  print_codesetup} command.
haftmann@38437
   205
haftmann@38437
   206
  The code equations after preprocessing are already are blueprint of
haftmann@38437
   207
  the generated program and can be inspected using the @{command
haftmann@38437
   208
  code_thms} command:
haftmann@59377
   209
\<close>
haftmann@29794
   210
haftmann@38437
   211
code_thms %quote dequeue
haftmann@28419
   212
haftmann@59377
   213
text \<open>
haftmann@38437
   214
  \noindent This prints a table with the code equations for @{const
haftmann@38437
   215
  dequeue}, including \emph{all} code equations those equations depend
haftmann@38437
   216
  on recursively.  These dependencies themselves can be visualized using
haftmann@38505
   217
  the @{command_def code_deps} command.
haftmann@59377
   218
\<close>
haftmann@28419
   219
haftmann@28213
   220
haftmann@59377
   221
subsection \<open>Equality\<close>
haftmann@28213
   222
haftmann@59377
   223
text \<open>
haftmann@38437
   224
  Implementation of equality deserves some attention.  Here an example
haftmann@38437
   225
  function involving polymorphic equality:
haftmann@59377
   226
\<close>
haftmann@28419
   227
haftmann@28564
   228
primrec %quote collect_duplicates :: "'a list \<Rightarrow> 'a list \<Rightarrow> 'a list \<Rightarrow> 'a list" where
haftmann@28447
   229
  "collect_duplicates xs ys [] = xs"
haftmann@38437
   230
| "collect_duplicates xs ys (z#zs) = (if z \<in> set xs
haftmann@38437
   231
    then if z \<in> set ys
haftmann@38437
   232
      then collect_duplicates xs ys zs
haftmann@38437
   233
      else collect_duplicates xs (z#ys) zs
haftmann@38437
   234
    else collect_duplicates (z#xs) (z#ys) zs)"
haftmann@28419
   235
haftmann@59377
   236
text \<open>
haftmann@37612
   237
  \noindent During preprocessing, the membership test is rewritten,
haftmann@38437
   238
  resulting in @{const List.member}, which itself performs an explicit
haftmann@38437
   239
  equality check, as can be seen in the corresponding @{text SML} code:
haftmann@59377
   240
\<close>
haftmann@28419
   241
haftmann@59377
   242
text %quotetypewriter \<open>
haftmann@39683
   243
  @{code_stmts collect_duplicates (SML)}
haftmann@59377
   244
\<close>
haftmann@28419
   245
haftmann@59377
   246
text \<open>
haftmann@28419
   247
  \noindent Obviously, polymorphic equality is implemented the Haskell
haftmann@38437
   248
  way using a type class.  How is this achieved?  HOL introduces an
haftmann@38857
   249
  explicit class @{class equal} with a corresponding operation @{const
haftmann@38857
   250
  HOL.equal} such that @{thm equal [no_vars]}.  The preprocessing
haftmann@38857
   251
  framework does the rest by propagating the @{class equal} constraints
haftmann@38437
   252
  through all dependent code equations.  For datatypes, instances of
haftmann@38857
   253
  @{class equal} are implicitly derived when possible.  For other types,
haftmann@38857
   254
  you may instantiate @{text equal} manually like any other type class.
haftmann@59377
   255
\<close>
haftmann@28419
   256
haftmann@28419
   257
haftmann@59377
   258
subsection \<open>Explicit partiality \label{sec:partiality}\<close>
haftmann@28462
   259
haftmann@59377
   260
text \<open>
haftmann@28462
   261
  Partiality usually enters the game by partial patterns, as
haftmann@28462
   262
  in the following example, again for amortised queues:
haftmann@59377
   263
\<close>
haftmann@28462
   264
haftmann@29798
   265
definition %quote strict_dequeue :: "'a queue \<Rightarrow> 'a \<times> 'a queue" where
haftmann@29798
   266
  "strict_dequeue q = (case dequeue q
haftmann@29798
   267
    of (Some x, q') \<Rightarrow> (x, q'))"
haftmann@29798
   268
haftmann@29798
   269
lemma %quote strict_dequeue_AQueue [code]:
haftmann@29798
   270
  "strict_dequeue (AQueue xs (y # ys)) = (y, AQueue xs ys)"
haftmann@29798
   271
  "strict_dequeue (AQueue xs []) =
haftmann@29798
   272
    (case rev xs of y # ys \<Rightarrow> (y, AQueue [] ys))"
haftmann@38437
   273
  by (simp_all add: strict_dequeue_def) (cases xs, simp_all split: list.split)
haftmann@28462
   274
haftmann@59377
   275
text \<open>
haftmann@28462
   276
  \noindent In the corresponding code, there is no equation
haftmann@29798
   277
  for the pattern @{term "AQueue [] []"}:
haftmann@59377
   278
\<close>
haftmann@28462
   279
haftmann@59377
   280
text %quotetypewriter \<open>
haftmann@39683
   281
  @{code_stmts strict_dequeue (consts) strict_dequeue (Haskell)}
haftmann@59377
   282
\<close>
haftmann@28462
   283
haftmann@59377
   284
text \<open>
haftmann@28462
   285
  \noindent In some cases it is desirable to have this
haftmann@28462
   286
  pseudo-\qt{partiality} more explicitly, e.g.~as follows:
haftmann@59377
   287
\<close>
haftmann@28462
   288
haftmann@28564
   289
axiomatization %quote empty_queue :: 'a
haftmann@28462
   290
haftmann@29798
   291
definition %quote strict_dequeue' :: "'a queue \<Rightarrow> 'a \<times> 'a queue" where
haftmann@29798
   292
  "strict_dequeue' q = (case dequeue q of (Some x, q') \<Rightarrow> (x, q') | _ \<Rightarrow> empty_queue)"
haftmann@28213
   293
haftmann@29798
   294
lemma %quote strict_dequeue'_AQueue [code]:
haftmann@29798
   295
  "strict_dequeue' (AQueue xs []) = (if xs = [] then empty_queue
haftmann@29798
   296
     else strict_dequeue' (AQueue [] (rev xs)))"
haftmann@29798
   297
  "strict_dequeue' (AQueue xs (y # ys)) =
haftmann@29798
   298
     (y, AQueue xs ys)"
haftmann@38437
   299
  by (simp_all add: strict_dequeue'_def split: list.splits)
haftmann@28462
   300
haftmann@59377
   301
text \<open>
haftmann@29798
   302
  Observe that on the right hand side of the definition of @{const
paulson@34155
   303
  "strict_dequeue'"}, the unspecified constant @{const empty_queue} occurs.
haftmann@28462
   304
haftmann@29798
   305
  Normally, if constants without any code equations occur in a
haftmann@29798
   306
  program, the code generator complains (since in most cases this is
paulson@34155
   307
  indeed an error).  But such constants can also be thought
paulson@34155
   308
  of as function definitions which always fail,
haftmann@29798
   309
  since there is never a successful pattern match on the left hand
haftmann@29798
   310
  side.  In order to categorise a constant into that category
haftmann@54890
   311
  explicitly, use the @{attribute code} attribute with
haftmann@54890
   312
  @{text abort}:
haftmann@59377
   313
\<close>
haftmann@28462
   314
haftmann@54890
   315
declare %quote [[code abort: empty_queue]]
haftmann@28462
   316
haftmann@59377
   317
text \<open>
haftmann@28462
   318
  \noindent Then the code generator will just insert an error or
haftmann@28462
   319
  exception at the appropriate position:
haftmann@59377
   320
\<close>
haftmann@28462
   321
haftmann@59377
   322
text %quotetypewriter \<open>
haftmann@39683
   323
  @{code_stmts strict_dequeue' (consts) empty_queue strict_dequeue' (Haskell)}
haftmann@59377
   324
\<close>
haftmann@28462
   325
haftmann@59377
   326
text \<open>
haftmann@38437
   327
  \noindent This feature however is rarely needed in practice.  Note
haftmann@54890
   328
  also that the HOL default setup already declares @{const undefined},
haftmann@54890
   329
  which is most likely to be used in such situations, as
haftmann@54890
   330
  @{text "code abort"}.
haftmann@59377
   331
\<close>
haftmann@38437
   332
haftmann@38437
   333
haftmann@59377
   334
subsection \<open>If something goes utterly wrong \label{sec:utterly_wrong}\<close>
haftmann@38437
   335
haftmann@59377
   336
text \<open>
haftmann@38437
   337
  Under certain circumstances, the code generator fails to produce
haftmann@38440
   338
  code entirely.  To debug these, the following hints may prove
haftmann@38440
   339
  helpful:
haftmann@38437
   340
haftmann@38437
   341
  \begin{description}
haftmann@38437
   342
haftmann@38440
   343
    \ditem{\emph{Check with a different target language}.}  Sometimes
haftmann@38440
   344
      the situation gets more clear if you switch to another target
haftmann@38440
   345
      language; the code generated there might give some hints what
haftmann@38440
   346
      prevents the code generator to produce code for the desired
haftmann@38440
   347
      language.
haftmann@38437
   348
haftmann@38440
   349
    \ditem{\emph{Inspect code equations}.}  Code equations are the central
haftmann@43410
   350
      carrier of code generation.  Most problems occurring while generating
haftmann@38440
   351
      code can be traced to single equations which are printed as part of
haftmann@38440
   352
      the error message.  A closer inspection of those may offer the key
haftmann@38440
   353
      for solving issues (cf.~\secref{sec:equations}).
haftmann@38437
   354
haftmann@38440
   355
    \ditem{\emph{Inspect preprocessor setup}.}  The preprocessor might
haftmann@38440
   356
      transform code equations unexpectedly; to understand an
haftmann@38440
   357
      inspection of its setup is necessary (cf.~\secref{sec:preproc}).
haftmann@38437
   358
haftmann@38440
   359
    \ditem{\emph{Generate exceptions}.}  If the code generator
haftmann@38440
   360
      complains about missing code equations, in can be helpful to
haftmann@38440
   361
      implement the offending constants as exceptions
haftmann@38440
   362
      (cf.~\secref{sec:partiality}); this allows at least for a formal
haftmann@38440
   363
      generation of code, whose inspection may then give clues what is
haftmann@38440
   364
      wrong.
haftmann@38437
   365
haftmann@38440
   366
    \ditem{\emph{Remove offending code equations}.}  If code
haftmann@38440
   367
      generation is prevented by just a single equation, this can be
haftmann@38440
   368
      removed (cf.~\secref{sec:equations}) to allow formal code
haftmann@38440
   369
      generation, whose result in turn can be used to trace the
haftmann@38440
   370
      problem.  The most prominent case here are mismatches in type
haftmann@38440
   371
      class signatures (\qt{wellsortedness error}).
haftmann@38437
   372
haftmann@38437
   373
  \end{description}
haftmann@59377
   374
\<close>
haftmann@28213
   375
haftmann@28213
   376
end