author | blanchet |
Thu, 22 Aug 2013 12:16:56 +0200 | |
changeset 53141 | d27e99a6a679 |
parent 53015 | a1119cf551e8 |
child 54721 | 22b888402278 |
permissions | -rw-r--r-- |
38405 | 1 |
theory Foundations |
28419 | 2 |
imports Introduction |
28213 | 3 |
begin |
4 |
||
38437 | 5 |
section {* Code generation foundations \label{sec:foundations} *} |
28419 | 6 |
|
38437 | 7 |
subsection {* Code generator architecture \label{sec:architecture} *} |
28419 | 8 |
|
9 |
text {* |
|
38437 | 10 |
The code generator is actually a framework consisting of different |
11 |
components which can be customised individually. |
|
12 |
||
13 |
Conceptually all components operate on Isabelle's logic framework |
|
14 |
@{theory Pure}. Practically, the object logic @{theory HOL} |
|
15 |
provides the necessary facilities to make use of the code generator, |
|
16 |
mainly since it is an extension of @{theory Pure}. |
|
17 |
||
18 |
The constellation of the different components is visualized in the |
|
19 |
following picture. |
|
20 |
||
21 |
\begin{figure}[h] |
|
52742 | 22 |
\def\sys#1{\emph{#1}} |
23 |
\begin{tikzpicture}[x = 4cm, y = 1cm] |
|
24 |
\tikzstyle positive=[color = black, fill = white]; |
|
25 |
\tikzstyle negative=[color = white, fill = black]; |
|
26 |
\tikzstyle entity=[rounded corners, draw, thick]; |
|
27 |
\tikzstyle process=[ellipse, draw, thick]; |
|
28 |
\tikzstyle arrow=[-stealth, semithick]; |
|
29 |
\node (spec) at (0, 3) [entity, positive] {specification tools}; |
|
30 |
\node (user) at (1, 3) [entity, positive] {user proofs}; |
|
31 |
\node (spec_user_join) at (0.5, 3) [shape=coordinate] {}; |
|
32 |
\node (raw) at (0.5, 4) [entity, positive] {raw code equations}; |
|
33 |
\node (pre) at (1.5, 4) [process, positive] {preprocessing}; |
|
34 |
\node (eqn) at (2.5, 4) [entity, positive] {code equations}; |
|
35 |
\node (iml) at (0.5, 0) [entity, positive] {intermediate program}; |
|
36 |
\node (seri) at (1.5, 0) [process, positive] {serialisation}; |
|
37 |
\node (SML) at (2.5, 3) [entity, positive] {\sys{SML}}; |
|
38 |
\node (OCaml) at (2.5, 2) [entity, positive] {\sys{OCaml}}; |
|
39 |
\node (Haskell) at (2.5, 1) [entity, positive] {\sys{Haskell}}; |
|
40 |
\node (Scala) at (2.5, 0) [entity, positive] {\sys{Scala}}; |
|
41 |
\draw [semithick] (spec) -- (spec_user_join); |
|
42 |
\draw [semithick] (user) -- (spec_user_join); |
|
43 |
\draw [-diamond, semithick] (spec_user_join) -- (raw); |
|
44 |
\draw [arrow] (raw) -- (pre); |
|
45 |
\draw [arrow] (pre) -- (eqn); |
|
46 |
\draw [arrow] (eqn) -- node (transl) [process, positive] {translation} (iml); |
|
47 |
\draw [arrow] (iml) -- (seri); |
|
48 |
\draw [arrow] (seri) -- (SML); |
|
49 |
\draw [arrow] (seri) -- (OCaml); |
|
50 |
\draw [arrow] (seri) -- (Haskell); |
|
51 |
\draw [arrow] (seri) -- (Scala); |
|
52 |
\end{tikzpicture} |
|
38437 | 53 |
\caption{Code generator architecture} |
54 |
\label{fig:arch} |
|
55 |
\end{figure} |
|
56 |
||
57 |
\noindent Central to code generation is the notion of \emph{code |
|
58 |
equations}. A code equation as a first approximation is a theorem |
|
53015
a1119cf551e8
standardized symbols via "isabelle update_sub_sup", excluding src/Pure and src/Tools/WWW_Find;
wenzelm
parents:
52742
diff
changeset
|
59 |
of the form @{text "f t\<^sub>1 t\<^sub>2 \<dots> t\<^sub>n \<equiv> t"} (an equation headed by a |
a1119cf551e8
standardized symbols via "isabelle update_sub_sup", excluding src/Pure and src/Tools/WWW_Find;
wenzelm
parents:
52742
diff
changeset
|
60 |
constant @{text f} with arguments @{text "t\<^sub>1 t\<^sub>2 \<dots> t\<^sub>n"} and right |
38437 | 61 |
hand side @{text t}). |
62 |
||
63 |
\begin{itemize} |
|
64 |
||
65 |
\item Starting point of code generation is a collection of (raw) |
|
66 |
code equations in a theory. It is not relevant where they stem |
|
67 |
from, but typically they were either produced by specification |
|
68 |
tools or proved explicitly by the user. |
|
69 |
||
70 |
\item These raw code equations can be subjected to theorem |
|
71 |
transformations. This \qn{preprocessor} (see |
|
72 |
\secref{sec:preproc}) can apply the full expressiveness of |
|
73 |
ML-based theorem transformations to code generation. The result |
|
74 |
of preprocessing is a structured collection of code equations. |
|
75 |
||
76 |
\item These code equations are \qn{translated} to a program in an |
|
77 |
abstract intermediate language. Think of it as a kind of |
|
78 |
\qt{Mini-Haskell} with four \qn{statements}: @{text data} (for |
|
79 |
datatypes), @{text fun} (stemming from code equations), also |
|
80 |
@{text class} and @{text inst} (for type classes). |
|
81 |
||
82 |
\item Finally, the abstract program is \qn{serialised} into |
|
83 |
concrete source code of a target language. This step only |
|
84 |
produces concrete syntax but does not change the program in |
|
85 |
essence; all conceptual transformations occur in the translation |
|
86 |
step. |
|
87 |
||
88 |
\end{itemize} |
|
89 |
||
90 |
\noindent Of these steps, only the last two are carried out |
|
91 |
outside the logic; by keeping this layer as thin as possible, the |
|
92 |
amount of code to trust is kept to a minimum. |
|
28419 | 93 |
*} |
94 |
||
95 |
||
96 |
subsection {* The preprocessor \label{sec:preproc} *} |
|
97 |
||
98 |
text {* |
|
38437 | 99 |
Before selected function theorems are turned into abstract code, a |
100 |
chain of definitional transformation steps is carried out: |
|
101 |
\emph{preprocessing}. The preprocessor consists of two |
|
102 |
components: a \emph{simpset} and \emph{function transformers}. |
|
28419 | 103 |
|
38437 | 104 |
The \emph{simpset} can apply the full generality of the Isabelle |
105 |
simplifier. Due to the interpretation of theorems as code |
|
32000 | 106 |
equations, rewrites are applied to the right hand side and the |
107 |
arguments of the left hand side of an equation, but never to the |
|
108 |
constant heading the left hand side. An important special case are |
|
38437 | 109 |
\emph{unfold theorems}, which may be declared and removed using the |
110 |
@{attribute code_unfold} or \emph{@{attribute code_unfold} del} |
|
34155 | 111 |
attribute, respectively. |
28213 | 112 |
|
28419 | 113 |
Some common applications: |
114 |
*} |
|
115 |
||
116 |
text_raw {* |
|
117 |
\begin{itemize} |
|
118 |
*} |
|
119 |
||
120 |
text {* |
|
121 |
\item replacing non-executable constructs by executable ones: |
|
122 |
*} |
|
123 |
||
37211 | 124 |
lemma %quote [code_unfold]: |
37612
48fed6598be9
adapted to reorganization of auxiliary list operations; split off predicate compiler into separate theory
haftmann
parents:
37427
diff
changeset
|
125 |
"x \<in> set xs \<longleftrightarrow> List.member xs x" by (fact in_set_member) |
28419 | 126 |
|
127 |
text {* |
|
128 |
\item replacing executable but inconvenient constructs: |
|
129 |
*} |
|
130 |
||
37211 | 131 |
lemma %quote [code_unfold]: |
37612
48fed6598be9
adapted to reorganization of auxiliary list operations; split off predicate compiler into separate theory
haftmann
parents:
37427
diff
changeset
|
132 |
"xs = [] \<longleftrightarrow> List.null xs" by (fact eq_Nil_null) |
28419 | 133 |
|
38437 | 134 |
text {* |
135 |
\item eliminating disturbing expressions: |
|
136 |
*} |
|
137 |
||
138 |
lemma %quote [code_unfold]: |
|
139 |
"1 = Suc 0" by (fact One_nat_def) |
|
140 |
||
28419 | 141 |
text_raw {* |
142 |
\end{itemize} |
|
143 |
*} |
|
144 |
||
145 |
text {* |
|
38437 | 146 |
\noindent \emph{Function transformers} provide a very general |
147 |
interface, transforming a list of function theorems to another list |
|
148 |
of function theorems, provided that neither the heading constant nor |
|
149 |
its type change. The @{term "0\<Colon>nat"} / @{const Suc} pattern |
|
51171
e8b2d90da499
corrected and clarified Code_Binary_Nat vs. Code_Target_Nat
haftmann
parents:
51143
diff
changeset
|
150 |
used in theory @{text Code_Abstract_Nat} (see \secref{abstract_nat}) |
e8b2d90da499
corrected and clarified Code_Binary_Nat vs. Code_Target_Nat
haftmann
parents:
51143
diff
changeset
|
151 |
uses this interface. |
28419 | 152 |
|
38437 | 153 |
\noindent The current setup of the preprocessor may be inspected |
38505 | 154 |
using the @{command_def print_codeproc} command. @{command_def |
155 |
code_thms} (see \secref{sec:equations}) provides a convenient |
|
156 |
mechanism to inspect the impact of a preprocessor setup on code |
|
157 |
equations. |
|
28419 | 158 |
*} |
159 |
||
38437 | 160 |
|
161 |
subsection {* Understanding code equations \label{sec:equations} *} |
|
28419 | 162 |
|
163 |
text {* |
|
38437 | 164 |
As explained in \secref{sec:principle}, the notion of code equations is
165 |
vital to code generation. Indeed most problems which occur in |
|
166 |
practice can be resolved by an inspection of the underlying code |
|
167 |
equations. |
|
28419 | 168 |
|
38437 | 169 |
It is possible to exchange the default code equations for constants |
170 |
by explicitly proving alternative ones: |
|
28419 | 171 |
*} |
172 |
||
38437 | 173 |
lemma %quote [code]: |
174 |
"dequeue (AQueue xs []) = |
|
175 |
(if xs = [] then (None, AQueue [] []) |
|
176 |
else dequeue (AQueue [] (rev xs)))" |
|
177 |
"dequeue (AQueue xs (y # ys)) = |
|
178 |
(Some y, AQueue xs ys)" |
|
179 |
by (cases xs, simp_all) (cases "rev xs", simp_all) |
|
28213 | 180 |
|
28419 | 181 |
text {* |
38437 | 182 |
\noindent The annotation @{text "[code]"} is an @{text attribute} |
183 |
which states that the given theorems should be considered as code |
|
184 |
equations for a @{text fun} statement -- the corresponding constant |
|
185 |
is determined syntactically. The resulting code: |
|
29798 | 186 |
*} |
29794 | 187 |
|
39745 | 188 |
text %quotetypewriter {* |
39683 | 189 |
@{code_stmts dequeue (consts) dequeue (Haskell)} |
39664
0afaf89ab591
more canonical type setting of type writer code examples
haftmann
parents:
38857
diff
changeset
|
190 |
*} |
29794 | 191 |
|
29798 | 192 |
text {* |
38437 | 193 |
\noindent You may note that the equality test @{term "xs = []"} has |
194 |
been replaced by the predicate @{term "List.null xs"}. This is due |
|
195 |
to the default setup of the \qn{preprocessor}. |
|
196 |
||
197 |
This possibility to select arbitrary code equations is the key |
|
198 |
technique for program and datatype refinement (see |
|
39677 | 199 |
\secref{sec:refinement}). |
38437 | 200 |
|
201 |
Due to the preprocessor, there is the distinction of raw code |
|
202 |
equations (before preprocessing) and code equations (after |
|
203 |
preprocessing). |
|
204 |
||
38505 | 205 |
The first can be listed (among other data) using the @{command_def |
206 |
print_codesetup} command. |
|
38437 | 207 |
|
208 |
The code equations after preprocessing are already a blueprint of |
|
209 |
the generated program and can be inspected using the @{command |
|
210 |
code_thms} command: |
|
29798 | 211 |
*} |
29794 | 212 |
|
38437 | 213 |
code_thms %quote dequeue |
28419 | 214 |
|
215 |
text {* |
|
38437 | 216 |
\noindent This prints a table with the code equations for @{const |
217 |
dequeue}, including \emph{all} code equations those equations depend |
|
218 |
on recursively. These dependencies themselves can be visualized using |
|
38505 | 219 |
the @{command_def code_deps} command. |
28419 | 220 |
*} |
221 |
||
28213 | 222 |
|
30938
c6c9359e474c
wellsortedness is no issue for a user manual any more
haftmann
parents:
30227
diff
changeset
|
223 |
subsection {* Equality *} |
28213 | 224 |
|
28419 | 225 |
text {* |
38437 | 226 |
Implementation of equality deserves some attention. Here is an example
227 |
function involving polymorphic equality: |
|
28419 | 228 |
*} |
229 |
||
28564 | 230 |
primrec %quote collect_duplicates :: "'a list \<Rightarrow> 'a list \<Rightarrow> 'a list \<Rightarrow> 'a list" where |
28447 | 231 |
"collect_duplicates xs ys [] = xs" |
38437 | 232 |
| "collect_duplicates xs ys (z#zs) = (if z \<in> set xs |
233 |
then if z \<in> set ys |
|
234 |
then collect_duplicates xs ys zs |
|
235 |
else collect_duplicates xs (z#ys) zs |
|
236 |
else collect_duplicates (z#xs) (z#ys) zs)" |
|
28419 | 237 |
|
238 |
text {* |
|
37612
48fed6598be9
adapted to reorganization of auxiliary list operations; split off predicate compiler into separate theory
haftmann
parents:
37427
diff
changeset
|
239 |
\noindent During preprocessing, the membership test is rewritten, |
38437 | 240 |
resulting in @{const List.member}, which itself performs an explicit |
241 |
equality check, as can be seen in the corresponding @{text SML} code: |
|
28419 | 242 |
*} |
243 |
||
39745 | 244 |
text %quotetypewriter {* |
39683 | 245 |
@{code_stmts collect_duplicates (SML)} |
39664
0afaf89ab591
more canonical type setting of type writer code examples
haftmann
parents:
38857
diff
changeset
|
246 |
*} |
28419 | 247 |
|
248 |
text {* |
|
249 |
\noindent Obviously, polymorphic equality is implemented the Haskell |
|
38437 | 250 |
way using a type class. How is this achieved? HOL introduces an |
38857
97775f3e8722
renamed class/constant eq to equal; tuned some instantiations
haftmann
parents:
38505
diff
changeset
|
251 |
explicit class @{class equal} with a corresponding operation @{const |
97775f3e8722
renamed class/constant eq to equal; tuned some instantiations
haftmann
parents:
38505
diff
changeset
|
252 |
HOL.equal} such that @{thm equal [no_vars]}. The preprocessing |
97775f3e8722
renamed class/constant eq to equal; tuned some instantiations
haftmann
parents:
38505
diff
changeset
|
253 |
framework does the rest by propagating the @{class equal} constraints |
38437 | 254 |
through all dependent code equations. For datatypes, instances of |
38857
97775f3e8722
renamed class/constant eq to equal; tuned some instantiations
haftmann
parents:
38505
diff
changeset
|
255 |
@{class equal} are implicitly derived when possible. For other types, |
97775f3e8722
renamed class/constant eq to equal; tuned some instantiations
haftmann
parents:
38505
diff
changeset
|
256 |
you may instantiate @{text equal} manually like any other type class. |
28419 | 257 |
*} |
258 |
||
259 |
||
38440 | 260 |
subsection {* Explicit partiality \label{sec:partiality} *} |
28462 | 261 |
|
262 |
text {* |
|
263 |
Partiality usually enters the game by partial patterns, as |
|
264 |
in the following example, again for amortised queues: |
|
265 |
*} |
|
266 |
||
29798 | 267 |
definition %quote strict_dequeue :: "'a queue \<Rightarrow> 'a \<times> 'a queue" where |
268 |
"strict_dequeue q = (case dequeue q |
|
269 |
of (Some x, q') \<Rightarrow> (x, q'))" |
|
270 |
||
271 |
lemma %quote strict_dequeue_AQueue [code]: |
|
272 |
"strict_dequeue (AQueue xs (y # ys)) = (y, AQueue xs ys)" |
|
273 |
"strict_dequeue (AQueue xs []) = |
|
274 |
(case rev xs of y # ys \<Rightarrow> (y, AQueue [] ys))" |
|
38437 | 275 |
by (simp_all add: strict_dequeue_def) (cases xs, simp_all split: list.split) |
28462 | 276 |
|
277 |
text {* |
|
278 |
\noindent In the corresponding code, there is no equation |
|
29798 | 279 |
for the pattern @{term "AQueue [] []"}: |
28462 | 280 |
*} |
281 |
||
39745 | 282 |
text %quotetypewriter {* |
39683 | 283 |
@{code_stmts strict_dequeue (consts) strict_dequeue (Haskell)} |
39664
0afaf89ab591
more canonical type setting of type writer code examples
haftmann
parents:
38857
diff
changeset
|
284 |
*} |
28462 | 285 |
|
286 |
text {* |
|
287 |
\noindent In some cases it is desirable to have this |
|
288 |
pseudo-\qt{partiality} more explicitly, e.g.~as follows: |
|
289 |
*} |
|
290 |
||
28564 | 291 |
axiomatization %quote empty_queue :: 'a |
28462 | 292 |
|
29798 | 293 |
definition %quote strict_dequeue' :: "'a queue \<Rightarrow> 'a \<times> 'a queue" where |
294 |
"strict_dequeue' q = (case dequeue q of (Some x, q') \<Rightarrow> (x, q') | _ \<Rightarrow> empty_queue)" |
|
28213 | 295 |
|
29798 | 296 |
lemma %quote strict_dequeue'_AQueue [code]: |
297 |
"strict_dequeue' (AQueue xs []) = (if xs = [] then empty_queue |
|
298 |
else strict_dequeue' (AQueue [] (rev xs)))" |
|
299 |
"strict_dequeue' (AQueue xs (y # ys)) = |
|
300 |
(y, AQueue xs ys)" |
|
38437 | 301 |
by (simp_all add: strict_dequeue'_def split: list.splits) |
28462 | 302 |
|
303 |
text {* |
|
29798 | 304 |
Observe that on the right hand side of the definition of @{const |
34155 | 305 |
"strict_dequeue'"}, the unspecified constant @{const empty_queue} occurs. |
28462 | 306 |
|
29798 | 307 |
Normally, if constants without any code equations occur in a |
308 |
program, the code generator complains (since in most cases this is |
|
34155 | 309 |
indeed an error). But such constants can also be thought |
310 |
of as function definitions which always fail, |
|
29798 | 311 |
since there is never a successful pattern match on the left hand |
312 |
side. In order to categorise a constant into that category |
|
38505 | 313 |
explicitly, use @{command_def "code_abort"}: |
28462 | 314 |
*} |
315 |
||
28564 | 316 |
code_abort %quote empty_queue |
28462 | 317 |
|
318 |
text {* |
|
319 |
\noindent Then the code generator will just insert an error or |
|
320 |
exception at the appropriate position: |
|
321 |
*} |
|
322 |
||
39745 | 323 |
text %quotetypewriter {* |
39683 | 324 |
@{code_stmts strict_dequeue' (consts) empty_queue strict_dequeue' (Haskell)} |
39664
0afaf89ab591
more canonical type setting of type writer code examples
haftmann
parents:
38857
diff
changeset
|
325 |
*} |
28462 | 326 |
|
327 |
text {* |
|
38437 | 328 |
\noindent This feature however is rarely needed in practice. Note |
329 |
also that the HOL default setup already declares @{const undefined} |
|
330 |
as @{command "code_abort"}, which is most likely to be used in such |
|
331 |
situations. |
|
332 |
*} |
|
333 |
||
334 |
||
335 |
subsection {* If something goes utterly wrong \label{sec:utterly_wrong} *} |
|
336 |
||
337 |
text {* |
|
338 |
Under certain circumstances, the code generator fails to produce |
|
38440 | 339 |
code entirely. To debug these, the following hints may prove |
340 |
helpful: |
|
38437 | 341 |
|
342 |
\begin{description} |
|
343 |
||
38440 | 344 |
\ditem{\emph{Check with a different target language}.} Sometimes |
345 |
the situation becomes clearer if you switch to another target |
|
346 |
language; the code generated there might give some hints what |
|
347 |
prevents the code generator from producing code for the desired |
|
348 |
language. |
|
38437 | 349 |
|
38440 | 350 |
\ditem{\emph{Inspect code equations}.} Code equations are the central |
43410 | 351 |
carrier of code generation. Most problems occurring while generating |
38440 | 352 |
code can be traced to single equations which are printed as part of |
353 |
the error message. A closer inspection of those may offer the key |
|
354 |
for solving issues (cf.~\secref{sec:equations}). |
|
38437 | 355 |
|
38440 | 356 |
\ditem{\emph{Inspect preprocessor setup}.} The preprocessor might |
357 |
transform code equations unexpectedly; to understand this, an |
|
358 |
inspection of its setup is necessary (cf.~\secref{sec:preproc}). |
|
38437 | 359 |
|
38440 | 360 |
\ditem{\emph{Generate exceptions}.} If the code generator |
361 |
complains about missing code equations, it can be helpful to |
|
362 |
implement the offending constants as exceptions |
|
363 |
(cf.~\secref{sec:partiality}); this allows at least for a formal |
|
364 |
generation of code, whose inspection may then give clues what is |
|
365 |
wrong. |
|
38437 | 366 |
|
38440 | 367 |
\ditem{\emph{Remove offending code equations}.} If code |
368 |
generation is prevented by just a single equation, this can be |
|
369 |
removed (cf.~\secref{sec:equations}) to allow formal code |
|
370 |
generation, whose result in turn can be used to trace the |
|
371 |
problem. The most prominent case here are mismatches in type |
|
372 |
class signatures (\qt{wellsortedness error}). |
|
38437 | 373 |
|
374 |
\end{description} |
|
28462 | 375 |
*} |
28213 | 376 |
|
377 |
end |