src/HOL/Probability/Information.thy
 author haftmann Fri Jun 19 07:53:35 2015 +0200 (2015-06-19) changeset 60517 f16e4fb20652 parent 60017 b785d6d06430 child 60580 7e741e22d7fc permissions -rw-r--r--
separate class for notions specific for integral (semi)domains, in contrast to fields where these are trivial
```     1 (*  Title:      HOL/Probability/Information.thy
```
```     2     Author:     Johannes Hölzl, TU München
```
```     3     Author:     Armin Heller, TU München
```
```     4 *)
```
```     5
```
```     6 section {*Information theory*}
```
```     7
```
```     8 theory Information
```
```     9 imports
```
```    10   Independent_Family
```
```    11   "~~/src/HOL/Library/Convex"
```
```    12 begin
```
```    13
```
```    14 lemma log_le: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log a x \<le> log a y"
```
```    15   by (subst log_le_cancel_iff) auto
```
```    16
```
```    17 lemma log_less: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x < y \<Longrightarrow> log a x < log a y"
```
```    18   by (subst log_less_cancel_iff) auto
```
```    19
```
```    20 lemma setsum_cartesian_product':
```
```    21   "(\<Sum>x\<in>A \<times> B. f x) = (\<Sum>x\<in>A. setsum (\<lambda>y. f (x, y)) B)"
```
```    22   unfolding setsum.cartesian_product by simp
```
```    23
```
```    24 lemma split_pairs:
```
```    25   "((A, B) = X) \<longleftrightarrow> (fst X = A \<and> snd X = B)" and
```
```    26   "(X = (A, B)) \<longleftrightarrow> (fst X = A \<and> snd X = B)" by auto
```
```    27
```
```    28 subsection "Information theory"
```
```    29
```
```    30 locale information_space = prob_space +
```
```    31   fixes b :: real assumes b_gt_1: "1 < b"
```
```    32
```
```    33 context information_space
```
```    34 begin
```
```    35
```
```    36 text {* Introduce some simplification rules for logarithm of base @{term b}. *}
```
```    37
```
```    38 lemma log_neg_const:
```
```    39   assumes "x \<le> 0"
```
```    40   shows "log b x = log b 0"
```
```    41 proof -
```
```    42   { fix u :: real
```
```    43     have "x \<le> 0" by fact
```
```    44     also have "0 < exp u"
```
```    45       using exp_gt_zero .
```
```    46     finally have "exp u \<noteq> x"
```
```    47       by auto }
```
```    48   then show "log b x = log b 0"
```
```    49     by (simp add: log_def ln_real_def)
```
```    50 qed
```
```    51
```
```    52 lemma log_mult_eq:
```
```    53   "log b (A * B) = (if 0 < A * B then log b \<bar>A\<bar> + log b \<bar>B\<bar> else log b 0)"
```
```    54   using log_mult[of b "\<bar>A\<bar>" "\<bar>B\<bar>"] b_gt_1 log_neg_const[of "A * B"]
```
```    55   by (auto simp: zero_less_mult_iff mult_le_0_iff)
```
```    56
```
```    57 lemma log_inverse_eq:
```
```    58   "log b (inverse B) = (if 0 < B then - log b B else log b 0)"
```
```    59   using log_inverse[of b B] log_neg_const[of "inverse B"] b_gt_1 by simp
```
```    60
```
```    61 lemma log_divide_eq:
```
```    62   "log b (A / B) = (if 0 < A * B then log b \<bar>A\<bar> - log b \<bar>B\<bar> else log b 0)"
```
```    63   unfolding divide_inverse log_mult_eq log_inverse_eq abs_inverse
```
```    64   by (auto simp: zero_less_mult_iff mult_le_0_iff)
```
```    65
```
```    66 lemmas log_simps = log_mult_eq log_inverse_eq log_divide_eq
```
```    67
```
```    68 end
```
```    69
```
```    70 subsection "Kullback\$-\$Leibler divergence"
```
```    71
```
```    72 text {* The Kullback\$-\$Leibler divergence is also known as relative entropy or
```
```    73 Kullback\$-\$Leibler distance. *}
```
```    74
```
```    75 definition
```
```    76   "entropy_density b M N = log b \<circ> real \<circ> RN_deriv M N"
```
```    77
```
```    78 definition
```
```    79   "KL_divergence b M N = integral\<^sup>L N (entropy_density b M N)"
```
```    80
```
```    81 lemma measurable_entropy_density[measurable]: "entropy_density b M N \<in> borel_measurable M"
```
```    82   unfolding entropy_density_def by auto
```
```    83
```
```    84 lemma (in sigma_finite_measure) KL_density:
```
```    85   fixes f :: "'a \<Rightarrow> real"
```
```    86   assumes "1 < b"
```
```    87   assumes f[measurable]: "f \<in> borel_measurable M" and nn: "AE x in M. 0 \<le> f x"
```
```    88   shows "KL_divergence b M (density M f) = (\<integral>x. f x * log b (f x) \<partial>M)"
```
```    89   unfolding KL_divergence_def
```
```    90 proof (subst integral_real_density)
```
```    91   show [measurable]: "entropy_density b M (density M (\<lambda>x. ereal (f x))) \<in> borel_measurable M"
```
```    92     using f
```
```    93     by (auto simp: comp_def entropy_density_def)
```
```    94   have "density M (RN_deriv M (density M f)) = density M f"
```
```    95     using f nn by (intro density_RN_deriv_density) auto
```
```    96   then have eq: "AE x in M. RN_deriv M (density M f) x = f x"
```
```    97     using f nn by (intro density_unique) (auto simp: RN_deriv_nonneg)
```
```    98   show "(\<integral>x. f x * entropy_density b M (density M (\<lambda>x. ereal (f x))) x \<partial>M) = (\<integral>x. f x * log b (f x) \<partial>M)"
```
```    99     apply (intro integral_cong_AE)
```
```   100     apply measurable
```
```   101     using eq
```
```   102     apply eventually_elim
```
```   103     apply (auto simp: entropy_density_def)
```
```   104     done
```
```   105 qed fact+
```
```   106
```
```   107 lemma (in sigma_finite_measure) KL_density_density:
```
```   108   fixes f g :: "'a \<Rightarrow> real"
```
```   109   assumes "1 < b"
```
```   110   assumes f: "f \<in> borel_measurable M" "AE x in M. 0 \<le> f x"
```
```   111   assumes g: "g \<in> borel_measurable M" "AE x in M. 0 \<le> g x"
```
```   112   assumes ac: "AE x in M. f x = 0 \<longrightarrow> g x = 0"
```
```   113   shows "KL_divergence b (density M f) (density M g) = (\<integral>x. g x * log b (g x / f x) \<partial>M)"
```
```   114 proof -
```
```   115   interpret Mf: sigma_finite_measure "density M f"
```
```   116     using f by (subst sigma_finite_iff_density_finite) auto
```
```   117   have "KL_divergence b (density M f) (density M g) =
```
```   118     KL_divergence b (density M f) (density (density M f) (\<lambda>x. g x / f x))"
```
```   119     using f g ac by (subst density_density_divide) simp_all
```
```   120   also have "\<dots> = (\<integral>x. (g x / f x) * log b (g x / f x) \<partial>density M f)"
```
```   121     using f g `1 < b` by (intro Mf.KL_density) (auto simp: AE_density)
```
```   122   also have "\<dots> = (\<integral>x. g x * log b (g x / f x) \<partial>M)"
```
```   123     using ac f g `1 < b` by (subst integral_density) (auto intro!: integral_cong_AE)
```
```   124   finally show ?thesis .
```
```   125 qed
```
```   126
```
```   127 lemma (in information_space) KL_gt_0:
```
```   128   fixes D :: "'a \<Rightarrow> real"
```
```   129   assumes "prob_space (density M D)"
```
```   130   assumes D: "D \<in> borel_measurable M" "AE x in M. 0 \<le> D x"
```
```   131   assumes int: "integrable M (\<lambda>x. D x * log b (D x))"
```
```   132   assumes A: "density M D \<noteq> M"
```
```   133   shows "0 < KL_divergence b M (density M D)"
```
```   134 proof -
```
```   135   interpret N: prob_space "density M D" by fact
```
```   136
```
```   137   obtain A where "A \<in> sets M" "emeasure (density M D) A \<noteq> emeasure M A"
```
```   138     using measure_eqI[of "density M D" M] `density M D \<noteq> M` by auto
```
```   139
```
```   140   let ?D_set = "{x\<in>space M. D x \<noteq> 0}"
```
```   141   have [simp, intro]: "?D_set \<in> sets M"
```
```   142     using D by auto
```
```   143
```
```   144   have D_neg: "(\<integral>\<^sup>+ x. ereal (- D x) \<partial>M) = 0"
```
```   145     using D by (subst nn_integral_0_iff_AE) auto
```
```   146
```
```   147   have "(\<integral>\<^sup>+ x. ereal (D x) \<partial>M) = emeasure (density M D) (space M)"
```
```   148     using D by (simp add: emeasure_density cong: nn_integral_cong)
```
```   149   then have D_pos: "(\<integral>\<^sup>+ x. ereal (D x) \<partial>M) = 1"
```
```   150     using N.emeasure_space_1 by simp
```
```   151
```
```   152   have "integrable M D"
```
```   153     using D D_pos D_neg unfolding real_integrable_def real_lebesgue_integral_def by simp_all
```
```   154   then have "integral\<^sup>L M D = 1"
```
```   155     using D D_pos D_neg by (simp add: real_lebesgue_integral_def)
```
```   156
```
```   157   have "0 \<le> 1 - measure M ?D_set"
```
```   158     using prob_le_1 by (auto simp: field_simps)
```
```   159   also have "\<dots> = (\<integral> x. D x - indicator ?D_set x \<partial>M)"
```
```   160     using `integrable M D` `integral\<^sup>L M D = 1`
```
```   161     by (simp add: emeasure_eq_measure)
```
```   162   also have "\<dots> < (\<integral> x. D x * (ln b * log b (D x)) \<partial>M)"
```
```   163   proof (rule integral_less_AE)
```
```   164     show "integrable M (\<lambda>x. D x - indicator ?D_set x)"
```
```   165       using `integrable M D` by auto
```
```   166   next
```
```   167     from integrable_mult_left(1)[OF int, of "ln b"]
```
```   168     show "integrable M (\<lambda>x. D x * (ln b * log b (D x)))"
```
```   169       by (simp add: ac_simps)
```
```   170   next
```
```   171     show "emeasure M {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<noteq> 0"
```
```   172     proof
```
```   173       assume eq_0: "emeasure M {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} = 0"
```
```   174       then have disj: "AE x in M. D x = 1 \<or> D x = 0"
```
```   175         using D(1) by (auto intro!: AE_I[OF subset_refl] sets.sets_Collect)
```
```   176
```
```   177       have "emeasure M {x\<in>space M. D x = 1} = (\<integral>\<^sup>+ x. indicator {x\<in>space M. D x = 1} x \<partial>M)"
```
```   178         using D(1) by auto
```
```   179       also have "\<dots> = (\<integral>\<^sup>+ x. ereal (D x) \<partial>M)"
```
```   180         using disj by (auto intro!: nn_integral_cong_AE simp: indicator_def one_ereal_def)
```
```   181       finally have "AE x in M. D x = 1"
```
```   182         using D D_pos by (intro AE_I_eq_1) auto
```
```   183       then have "(\<integral>\<^sup>+x. indicator A x\<partial>M) = (\<integral>\<^sup>+x. ereal (D x) * indicator A x\<partial>M)"
```
```   184         by (intro nn_integral_cong_AE) (auto simp: one_ereal_def[symmetric])
```
```   185       also have "\<dots> = density M D A"
```
```   186         using `A \<in> sets M` D by (simp add: emeasure_density)
```
```   187       finally show False using `A \<in> sets M` `emeasure (density M D) A \<noteq> emeasure M A` by simp
```
```   188     qed
```
```   189     show "{x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<in> sets M"
```
```   190       using D(1) by (auto intro: sets.sets_Collect_conj)
```
```   191
```
```   192     show "AE t in M. t \<in> {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<longrightarrow>
```
```   193       D t - indicator ?D_set t \<noteq> D t * (ln b * log b (D t))"
```
```   194       using D(2)
```
```   195     proof (eventually_elim, safe)
```
```   196       fix t assume Dt: "t \<in> space M" "D t \<noteq> 1" "D t \<noteq> 0" "0 \<le> D t"
```
```   197         and eq: "D t - indicator ?D_set t = D t * (ln b * log b (D t))"
```
```   198
```
```   199       have "D t - 1 = D t - indicator ?D_set t"
```
```   200         using Dt by simp
```
```   201       also note eq
```
```   202       also have "D t * (ln b * log b (D t)) = - D t * ln (1 / D t)"
```
```   203         using b_gt_1 `D t \<noteq> 0` `0 \<le> D t`
```
```   204         by (simp add: log_def ln_div less_le)
```
```   205       finally have "ln (1 / D t) = 1 / D t - 1"
```
```   206         using `D t \<noteq> 0` by (auto simp: field_simps)
```
```   207       from ln_eq_minus_one[OF _ this] `D t \<noteq> 0` `0 \<le> D t` `D t \<noteq> 1`
```
```   208       show False by auto
```
```   209     qed
```
```   210
```
```   211     show "AE t in M. D t - indicator ?D_set t \<le> D t * (ln b * log b (D t))"
```
```   212       using D(2) AE_space
```
```   213     proof eventually_elim
```
```   214       fix t assume "t \<in> space M" "0 \<le> D t"
```
```   215       show "D t - indicator ?D_set t \<le> D t * (ln b * log b (D t))"
```
```   216       proof cases
```
```   217         assume asm: "D t \<noteq> 0"
```
```   218         then have "0 < D t" using `0 \<le> D t` by auto
```
```   219         then have "0 < 1 / D t" by auto
```
```   220         have "D t - indicator ?D_set t \<le> - D t * (1 / D t - 1)"
```
```   221           using asm `t \<in> space M` by (simp add: field_simps)
```
```   222         also have "- D t * (1 / D t - 1) \<le> - D t * ln (1 / D t)"
```
```   223           using ln_le_minus_one `0 < 1 / D t` by (intro mult_left_mono_neg) auto
```
```   224         also have "\<dots> = D t * (ln b * log b (D t))"
```
```   225           using `0 < D t` b_gt_1
```
```   226           by (simp_all add: log_def ln_div)
```
```   227         finally show ?thesis by simp
```
```   228       qed simp
```
```   229     qed
```
```   230   qed
```
```   231   also have "\<dots> = (\<integral> x. ln b * (D x * log b (D x)) \<partial>M)"
```
```   232     by (simp add: ac_simps)
```
```   233   also have "\<dots> = ln b * (\<integral> x. D x * log b (D x) \<partial>M)"
```
```   234     using int by simp
```
```   235   finally show ?thesis
```
```   236     using b_gt_1 D by (subst KL_density) (auto simp: zero_less_mult_iff)
```
```   237 qed
```
```   238
```
```   239 lemma (in sigma_finite_measure) KL_same_eq_0: "KL_divergence b M M = 0"
```
```   240 proof -
```
```   241   have "AE x in M. 1 = RN_deriv M M x"
```
```   242   proof (rule RN_deriv_unique)
```
```   243     show "(\<lambda>x. 1) \<in> borel_measurable M" "AE x in M. 0 \<le> (1 :: ereal)" by auto
```
```   244     show "density M (\<lambda>x. 1) = M"
```
```   245       apply (auto intro!: measure_eqI emeasure_density)
```
```   246       apply (subst emeasure_density)
```
```   247       apply auto
```
```   248       done
```
```   249   qed
```
```   250   then have "AE x in M. log b (real (RN_deriv M M x)) = 0"
```
```   251     by (elim AE_mp) simp
```
```   252   from integral_cong_AE[OF _ _ this]
```
```   253   have "integral\<^sup>L M (entropy_density b M M) = 0"
```
```   254     by (simp add: entropy_density_def comp_def)
```
```   255   then show "KL_divergence b M M = 0"
```
```   256     unfolding KL_divergence_def
```
```   257     by auto
```
```   258 qed
```
```   259
```
```   260 lemma (in information_space) KL_eq_0_iff_eq:
```
```   261   fixes D :: "'a \<Rightarrow> real"
```
```   262   assumes "prob_space (density M D)"
```
```   263   assumes D: "D \<in> borel_measurable M" "AE x in M. 0 \<le> D x"
```
```   264   assumes int: "integrable M (\<lambda>x. D x * log b (D x))"
```
```   265   shows "KL_divergence b M (density M D) = 0 \<longleftrightarrow> density M D = M"
```
```   266   using KL_same_eq_0[of b] KL_gt_0[OF assms]
```
```   267   by (auto simp: less_le)
```
```   268
```
```   269 lemma (in information_space) KL_eq_0_iff_eq_ac:
```
```   270   fixes D :: "'a \<Rightarrow> real"
```
```   271   assumes "prob_space N"
```
```   272   assumes ac: "absolutely_continuous M N" "sets N = sets M"
```
```   273   assumes int: "integrable N (entropy_density b M N)"
```
```   274   shows "KL_divergence b M N = 0 \<longleftrightarrow> N = M"
```
```   275 proof -
```
```   276   interpret N: prob_space N by fact
```
```   277   have "finite_measure N" by unfold_locales
```
```   278   from real_RN_deriv[OF this ac] guess D . note D = this
```
```   279
```
```   280   have "N = density M (RN_deriv M N)"
```
```   281     using ac by (rule density_RN_deriv[symmetric])
```
```   282   also have "\<dots> = density M D"
```
```   283     using D by (auto intro!: density_cong)
```
```   284   finally have N: "N = density M D" .
```
```   285
```
```   286   from absolutely_continuous_AE[OF ac(2,1) D(2)] D b_gt_1 ac measurable_entropy_density
```
```   287   have "integrable N (\<lambda>x. log b (D x))"
```
```   288     by (intro integrable_cong_AE[THEN iffD2, OF _ _ _ int])
```
```   289        (auto simp: N entropy_density_def)
```
```   290   with D b_gt_1 have "integrable M (\<lambda>x. D x * log b (D x))"
```
```   291     by (subst integrable_real_density[symmetric]) (auto simp: N[symmetric] comp_def)
```
```   292   with `prob_space N` D show ?thesis
```
```   293     unfolding N
```
```   294     by (intro KL_eq_0_iff_eq) auto
```
```   295 qed
```
```   296
```
```   297 lemma (in information_space) KL_nonneg:
```
```   298   assumes "prob_space (density M D)"
```
```   299   assumes D: "D \<in> borel_measurable M" "AE x in M. 0 \<le> D x"
```
```   300   assumes int: "integrable M (\<lambda>x. D x * log b (D x))"
```
```   301   shows "0 \<le> KL_divergence b M (density M D)"
```
```   302   using KL_gt_0[OF assms] by (cases "density M D = M") (auto simp: KL_same_eq_0)
```
```   303
```
```   304 lemma (in sigma_finite_measure) KL_density_density_nonneg:
```
```   305   fixes f g :: "'a \<Rightarrow> real"
```
```   306   assumes "1 < b"
```
```   307   assumes f: "f \<in> borel_measurable M" "AE x in M. 0 \<le> f x" "prob_space (density M f)"
```
```   308   assumes g: "g \<in> borel_measurable M" "AE x in M. 0 \<le> g x" "prob_space (density M g)"
```
```   309   assumes ac: "AE x in M. f x = 0 \<longrightarrow> g x = 0"
```
```   310   assumes int: "integrable M (\<lambda>x. g x * log b (g x / f x))"
```
```   311   shows "0 \<le> KL_divergence b (density M f) (density M g)"
```
```   312 proof -
```
```   313   interpret Mf: prob_space "density M f" by fact
```
```   314   interpret Mf: information_space "density M f" b by default fact
```
```   315   have eq: "density (density M f) (\<lambda>x. g x / f x) = density M g" (is "?DD = _")
```
```   316     using f g ac by (subst density_density_divide) simp_all
```
```   317
```
```   318   have "0 \<le> KL_divergence b (density M f) (density (density M f) (\<lambda>x. g x / f x))"
```
```   319   proof (rule Mf.KL_nonneg)
```
```   320     show "prob_space ?DD" unfolding eq by fact
```
```   321     from f g show "(\<lambda>x. g x / f x) \<in> borel_measurable (density M f)"
```
```   322       by auto
```
```   323     show "AE x in density M f. 0 \<le> g x / f x"
```
```   324       using f g by (auto simp: AE_density)
```
```   325     show "integrable (density M f) (\<lambda>x. g x / f x * log b (g x / f x))"
```
```   326       using `1 < b` f g ac
```
```   327       by (subst integrable_density)
```
```   328          (auto intro!: integrable_cong_AE[THEN iffD2, OF _ _ _ int] measurable_If)
```
```   329   qed
```
```   330   also have "\<dots> = KL_divergence b (density M f) (density M g)"
```
```   331     using f g ac by (subst density_density_divide) simp_all
```
```   332   finally show ?thesis .
```
```   333 qed
```
```   334
```
```   335 subsection {* Finite Entropy *}
```
```   336
```
```   337 definition (in information_space)
```
```   338   "finite_entropy S X f \<longleftrightarrow> distributed M S X f \<and> integrable S (\<lambda>x. f x * log b (f x))"
```
```   339
```
```   340 lemma (in information_space) finite_entropy_simple_function:
```
```   341   assumes X: "simple_function M X"
```
```   342   shows "finite_entropy (count_space (X`space M)) X (\<lambda>a. measure M {x \<in> space M. X x = a})"
```
```   343   unfolding finite_entropy_def
```
```   344 proof
```
```   345   have [simp]: "finite (X ` space M)"
```
```   346     using X by (auto simp: simple_function_def)
```
```   347   then show "integrable (count_space (X ` space M))
```
```   348      (\<lambda>x. prob {xa \<in> space M. X xa = x} * log b (prob {xa \<in> space M. X xa = x}))"
```
```   349     by (rule integrable_count_space)
```
```   350   have d: "distributed M (count_space (X ` space M)) X (\<lambda>x. ereal (if x \<in> X`space M then prob {xa \<in> space M. X xa = x} else 0))"
```
```   351     by (rule distributed_simple_function_superset[OF X]) (auto intro!: arg_cong[where f=prob])
```
```   352   show "distributed M (count_space (X ` space M)) X (\<lambda>x. ereal (prob {xa \<in> space M. X xa = x}))"
```
```   353     by (rule distributed_cong_density[THEN iffD1, OF _ _ _ d]) auto
```
```   354 qed
```
```   355
```
```   356 lemma distributed_transform_AE:
```
```   357   assumes T: "T \<in> measurable P Q" "absolutely_continuous Q (distr P Q T)"
```
```   358   assumes g: "distributed M Q Y g"
```
```   359   shows "AE x in P. 0 \<le> g (T x)"
```
```   360   using g
```
```   361   apply (subst AE_distr_iff[symmetric, OF T(1)])
```
```   362   apply simp
```
```   363   apply (rule absolutely_continuous_AE[OF _ T(2)])
```
```   364   apply simp
```
```   365   apply (simp add: distributed_AE)
```
```   366   done
```
```   367
```
```   368 lemma ac_fst:
```
```   369   assumes "sigma_finite_measure T"
```
```   370   shows "absolutely_continuous S (distr (S \<Otimes>\<^sub>M T) S fst)"
```
```   371 proof -
```
```   372   interpret sigma_finite_measure T by fact
```
```   373   { fix A assume A: "A \<in> sets S" "emeasure S A = 0"
```
```   374     then have "fst -` A \<inter> space (S \<Otimes>\<^sub>M T) = A \<times> space T"
```
```   375       by (auto simp: space_pair_measure dest!: sets.sets_into_space)
```
```   376     with A have "emeasure (S \<Otimes>\<^sub>M T) (fst -` A \<inter> space (S \<Otimes>\<^sub>M T)) = 0"
```
```   377       by (simp add: emeasure_pair_measure_Times) }
```
```   378   then show ?thesis
```
```   379     unfolding absolutely_continuous_def
```
```   380     apply (auto simp: null_sets_distr_iff)
```
```   381     apply (auto simp: null_sets_def intro!: measurable_sets)
```
```   382     done
```
```   383 qed
```
```   384
```
```   385 lemma ac_snd:
```
```   386   assumes "sigma_finite_measure T"
```
```   387   shows "absolutely_continuous T (distr (S \<Otimes>\<^sub>M T) T snd)"
```
```   388 proof -
```
```   389   interpret sigma_finite_measure T by fact
```
```   390   { fix A assume A: "A \<in> sets T" "emeasure T A = 0"
```
```   391     then have "snd -` A \<inter> space (S \<Otimes>\<^sub>M T) = space S \<times> A"
```
```   392       by (auto simp: space_pair_measure dest!: sets.sets_into_space)
```
```   393     with A have "emeasure (S \<Otimes>\<^sub>M T) (snd -` A \<inter> space (S \<Otimes>\<^sub>M T)) = 0"
```
```   394       by (simp add: emeasure_pair_measure_Times) }
```
```   395   then show ?thesis
```
```   396     unfolding absolutely_continuous_def
```
```   397     apply (auto simp: null_sets_distr_iff)
```
```   398     apply (auto simp: null_sets_def intro!: measurable_sets)
```
```   399     done
```
```   400 qed
```
```   401
```
```   402 lemma integrable_cong_AE_imp:
```
```   403   "integrable M g \<Longrightarrow> f \<in> borel_measurable M \<Longrightarrow> (AE x in M. g x = f x) \<Longrightarrow> integrable M f"
```
```   404   using integrable_cong_AE[of f M g] by (auto simp: eq_commute)
```
```   405
```
```   406 lemma (in information_space) finite_entropy_integrable:
```
```   407   "finite_entropy S X Px \<Longrightarrow> integrable S (\<lambda>x. Px x * log b (Px x))"
```
```   408   unfolding finite_entropy_def by auto
```
```   409
```
```   410 lemma (in information_space) finite_entropy_distributed:
```
```   411   "finite_entropy S X Px \<Longrightarrow> distributed M S X Px"
```
```   412   unfolding finite_entropy_def by auto
```
```   413
```
```   414 lemma (in information_space) finite_entropy_integrable_transform:
```
```   415   assumes Fx: "finite_entropy S X Px"
```
```   416   assumes Fy: "distributed M T Y Py"
```
```   417     and "X = (\<lambda>x. f (Y x))"
```
```   418     and "f \<in> measurable T S"
```
```   419   shows "integrable T (\<lambda>x. Py x * log b (Px (f x)))"
```
```   420   using assms unfolding finite_entropy_def
```
```   421   using distributed_transform_integrable[of M T Y Py S X Px f "\<lambda>x. log b (Px x)"]
```
```   422   by auto
```
```   423
```
```   424 subsection {* Mutual Information *}
```
```   425
```
```   426 definition (in prob_space)
```
```   427   "mutual_information b S T X Y =
```
```   428     KL_divergence b (distr M S X \<Otimes>\<^sub>M distr M T Y) (distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)))"
```
```   429
```
```   430 lemma (in information_space) mutual_information_indep_vars:
```
```   431   fixes S T X Y
```
```   432   defines "P \<equiv> distr M S X \<Otimes>\<^sub>M distr M T Y"
```
```   433   defines "Q \<equiv> distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))"
```
```   434   shows "indep_var S X T Y \<longleftrightarrow>
```
```   435     (random_variable S X \<and> random_variable T Y \<and>
```
```   436       absolutely_continuous P Q \<and> integrable Q (entropy_density b P Q) \<and>
```
```   437       mutual_information b S T X Y = 0)"
```
```   438   unfolding indep_var_distribution_eq
```
```   439 proof safe
```
```   440   assume rv[measurable]: "random_variable S X" "random_variable T Y"
```
```   441
```
```   442   interpret X: prob_space "distr M S X"
```
```   443     by (rule prob_space_distr) fact
```
```   444   interpret Y: prob_space "distr M T Y"
```
```   445     by (rule prob_space_distr) fact
```
```   446   interpret XY: pair_prob_space "distr M S X" "distr M T Y" by default
```
```   447   interpret P: information_space P b unfolding P_def by default (rule b_gt_1)
```
```   448
```
```   449   interpret Q: prob_space Q unfolding Q_def
```
```   450     by (rule prob_space_distr) simp
```
```   451
```
```   452   { assume "distr M S X \<Otimes>\<^sub>M distr M T Y = distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))"
```
```   453     then have [simp]: "Q = P"  unfolding Q_def P_def by simp
```
```   454
```
```   455     show ac: "absolutely_continuous P Q" by (simp add: absolutely_continuous_def)
```
```   456     then have ed: "entropy_density b P Q \<in> borel_measurable P"
```
```   457       by simp
```
```   458
```
```   459     have "AE x in P. 1 = RN_deriv P Q x"
```
```   460     proof (rule P.RN_deriv_unique)
```
```   461       show "density P (\<lambda>x. 1) = Q"
```
```   462         unfolding `Q = P` by (intro measure_eqI) (auto simp: emeasure_density)
```
```   463     qed auto
```
```   464     then have ae_0: "AE x in P. entropy_density b P Q x = 0"
```
```   465       by eventually_elim (auto simp: entropy_density_def)
```
```   466     then have "integrable P (entropy_density b P Q) \<longleftrightarrow> integrable Q (\<lambda>x. 0::real)"
```
```   467       using ed unfolding `Q = P` by (intro integrable_cong_AE) auto
```
```   468     then show "integrable Q (entropy_density b P Q)" by simp
```
```   469
```
```   470     from ae_0 have "mutual_information b S T X Y = (\<integral>x. 0 \<partial>P)"
```
```   471       unfolding mutual_information_def KL_divergence_def P_def[symmetric] Q_def[symmetric] `Q = P`
```
```   472       by (intro integral_cong_AE) auto
```
```   473     then show "mutual_information b S T X Y = 0"
```
```   474       by simp }
```
```   475
```
```   476   { assume ac: "absolutely_continuous P Q"
```
```   477     assume int: "integrable Q (entropy_density b P Q)"
```
```   478     assume I_eq_0: "mutual_information b S T X Y = 0"
```
```   479
```
```   480     have eq: "Q = P"
```
```   481     proof (rule P.KL_eq_0_iff_eq_ac[THEN iffD1])
```
```   482       show "prob_space Q" by unfold_locales
```
```   483       show "absolutely_continuous P Q" by fact
```
```   484       show "integrable Q (entropy_density b P Q)" by fact
```
```   485       show "sets Q = sets P" by (simp add: P_def Q_def sets_pair_measure)
```
```   486       show "KL_divergence b P Q = 0"
```
```   487         using I_eq_0 unfolding mutual_information_def by (simp add: P_def Q_def)
```
```   488     qed
```
```   489     then show "distr M S X \<Otimes>\<^sub>M distr M T Y = distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))"
```
```   490       unfolding P_def Q_def .. }
```
```   491 qed
```
```   492
```
```   493 abbreviation (in information_space)
```
```   494   mutual_information_Pow ("\<I>'(_ ; _')") where
```
```   495   "\<I>(X ; Y) \<equiv> mutual_information b (count_space (X`space M)) (count_space (Y`space M)) X Y"
```
```   496
```
```   497 lemma (in information_space)
```
```   498   fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
```
```   499   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
```
```   500   assumes Fx: "finite_entropy S X Px" and Fy: "finite_entropy T Y Py"
```
```   501   assumes Fxy: "finite_entropy (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
```
```   502   defines "f \<equiv> \<lambda>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))"
```
```   503   shows mutual_information_distr': "mutual_information b S T X Y = integral\<^sup>L (S \<Otimes>\<^sub>M T) f" (is "?M = ?R")
```
```   504     and mutual_information_nonneg': "0 \<le> mutual_information b S T X Y"
```
```   505 proof -
```
```   506   have Px: "distributed M S X Px"
```
```   507     using Fx by (auto simp: finite_entropy_def)
```
```   508   have Py: "distributed M T Y Py"
```
```   509     using Fy by (auto simp: finite_entropy_def)
```
```   510   have Pxy: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
```
```   511     using Fxy by (auto simp: finite_entropy_def)
```
```   512
```
```   513   have X: "random_variable S X"
```
```   514     using Px by auto
```
```   515   have Y: "random_variable T Y"
```
```   516     using Py by auto
```
```   517   interpret S: sigma_finite_measure S by fact
```
```   518   interpret T: sigma_finite_measure T by fact
```
```   519   interpret ST: pair_sigma_finite S T ..
```
```   520   interpret X: prob_space "distr M S X" using X by (rule prob_space_distr)
```
```   521   interpret Y: prob_space "distr M T Y" using Y by (rule prob_space_distr)
```
```   522   interpret XY: pair_prob_space "distr M S X" "distr M T Y" ..
```
```   523   let ?P = "S \<Otimes>\<^sub>M T"
```
```   524   let ?D = "distr M ?P (\<lambda>x. (X x, Y x))"
```
```   525
```
```   526   { fix A assume "A \<in> sets S"
```
```   527     with X Y have "emeasure (distr M S X) A = emeasure ?D (A \<times> space T)"
```
```   528       by (auto simp: emeasure_distr measurable_Pair measurable_space
```
```   529                intro!: arg_cong[where f="emeasure M"]) }
```
```   530   note marginal_eq1 = this
```
```   531   { fix A assume "A \<in> sets T"
```
```   532     with X Y have "emeasure (distr M T Y) A = emeasure ?D (space S \<times> A)"
```
```   533       by (auto simp: emeasure_distr measurable_Pair measurable_space
```
```   534                intro!: arg_cong[where f="emeasure M"]) }
```
```   535   note marginal_eq2 = this
```
```   536
```
```   537   have eq: "(\<lambda>x. ereal (Px (fst x) * Py (snd x))) = (\<lambda>(x, y). ereal (Px x) * ereal (Py y))"
```
```   538     by auto
```
```   539
```
```   540   have distr_eq: "distr M S X \<Otimes>\<^sub>M distr M T Y = density ?P (\<lambda>x. ereal (Px (fst x) * Py (snd x)))"
```
```   541     unfolding Px(1)[THEN distributed_distr_eq_density] Py(1)[THEN distributed_distr_eq_density] eq
```
```   542   proof (subst pair_measure_density)
```
```   543     show "(\<lambda>x. ereal (Px x)) \<in> borel_measurable S" "(\<lambda>y. ereal (Py y)) \<in> borel_measurable T"
```
```   544       "AE x in S. 0 \<le> ereal (Px x)" "AE y in T. 0 \<le> ereal (Py y)"
```
```   545       using Px Py by (auto simp: distributed_def)
```
```   546     show "sigma_finite_measure (density T Py)" unfolding Py(1)[THEN distributed_distr_eq_density, symmetric] ..
```
```   547   qed (fact | simp)+
```
```   548
```
```   549   have M: "?M = KL_divergence b (density ?P (\<lambda>x. ereal (Px (fst x) * Py (snd x)))) (density ?P (\<lambda>x. ereal (Pxy x)))"
```
```   550     unfolding mutual_information_def distr_eq Pxy(1)[THEN distributed_distr_eq_density] ..
```
```   551
```
```   552   from Px Py have f: "(\<lambda>x. Px (fst x) * Py (snd x)) \<in> borel_measurable ?P"
```
```   553     by (intro borel_measurable_times) (auto intro: distributed_real_measurable measurable_fst'' measurable_snd'')
```
```   554   have PxPy_nonneg: "AE x in ?P. 0 \<le> Px (fst x) * Py (snd x)"
```
```   555   proof (rule ST.AE_pair_measure)
```
```   556     show "{x \<in> space ?P. 0 \<le> Px (fst x) * Py (snd x)} \<in> sets ?P"
```
```   557       using f by auto
```
```   558     show "AE x in S. AE y in T. 0 \<le> Px (fst (x, y)) * Py (snd (x, y))"
```
```   559       using Px Py by (auto simp: zero_le_mult_iff dest!: distributed_real_AE)
```
```   560   qed
```
```   561
```
```   562   have "(AE x in ?P. Px (fst x) = 0 \<longrightarrow> Pxy x = 0)"
```
```   563     by (rule subdensity_real[OF measurable_fst Pxy Px]) auto
```
```   564   moreover
```
```   565   have "(AE x in ?P. Py (snd x) = 0 \<longrightarrow> Pxy x = 0)"
```
```   566     by (rule subdensity_real[OF measurable_snd Pxy Py]) auto
```
```   567   ultimately have ac: "AE x in ?P. Px (fst x) * Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
```
```   568     by eventually_elim auto
```
```   569
```
```   570   show "?M = ?R"
```
```   571     unfolding M f_def
```
```   572     using b_gt_1 f PxPy_nonneg Pxy[THEN distributed_real_measurable] Pxy[THEN distributed_real_AE] ac
```
```   573     by (rule ST.KL_density_density)
```
```   574
```
```   575   have X: "X = fst \<circ> (\<lambda>x. (X x, Y x))" and Y: "Y = snd \<circ> (\<lambda>x. (X x, Y x))"
```
```   576     by auto
```
```   577
```
```   578   have "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x) - Pxy x * log b (Px (fst x)) - Pxy x * log b (Py (snd x)))"
```
```   579     using finite_entropy_integrable[OF Fxy]
```
```   580     using finite_entropy_integrable_transform[OF Fx Pxy, of fst]
```
```   581     using finite_entropy_integrable_transform[OF Fy Pxy, of snd]
```
```   582     by simp
```
```   583   moreover have "f \<in> borel_measurable (S \<Otimes>\<^sub>M T)"
```
```   584     unfolding f_def using Px Py Pxy
```
```   585     by (auto intro: distributed_real_measurable measurable_fst'' measurable_snd''
```
```   586       intro!: borel_measurable_times borel_measurable_log borel_measurable_divide)
```
```   587   ultimately have int: "integrable (S \<Otimes>\<^sub>M T) f"
```
```   588     apply (rule integrable_cong_AE_imp)
```
```   589     using
```
```   590       distributed_transform_AE[OF measurable_fst ac_fst, of T, OF T Px]
```
```   591       distributed_transform_AE[OF measurable_snd ac_snd, of _ _ _ _ S, OF T Py]
```
```   592       subdensity_real[OF measurable_fst Pxy Px X]
```
```   593       subdensity_real[OF measurable_snd Pxy Py Y]
```
```   594       distributed_real_AE[OF Pxy]
```
```   595     by eventually_elim
```
```   596        (auto simp: f_def log_divide_eq log_mult_eq field_simps zero_less_mult_iff)
```
```   597
```
```   598   show "0 \<le> ?M" unfolding M
```
```   599   proof (rule ST.KL_density_density_nonneg
```
```   600     [OF b_gt_1 f PxPy_nonneg _ Pxy[THEN distributed_real_measurable] Pxy[THEN distributed_real_AE] _ ac int[unfolded f_def]])
```
```   601     show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Pxy x))) "
```
```   602       unfolding distributed_distr_eq_density[OF Pxy, symmetric]
```
```   603       using distributed_measurable[OF Pxy] by (rule prob_space_distr)
```
```   604     show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Px (fst x) * Py (snd x))))"
```
```   605       unfolding distr_eq[symmetric] by unfold_locales
```
```   606   qed
```
```   607 qed
```
```   608
```
```   609
```
```   610 lemma (in information_space)
```
```   611   fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
```
```   612   assumes "sigma_finite_measure S" "sigma_finite_measure T"
```
```   613   assumes Px: "distributed M S X Px" and Py: "distributed M T Y Py"
```
```   614   assumes Pxy: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
```
```   615   defines "f \<equiv> \<lambda>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))"
```
```   616   shows mutual_information_distr: "mutual_information b S T X Y = integral\<^sup>L (S \<Otimes>\<^sub>M T) f" (is "?M = ?R")
```
```   617     and mutual_information_nonneg: "integrable (S \<Otimes>\<^sub>M T) f \<Longrightarrow> 0 \<le> mutual_information b S T X Y"
```
```   618 proof -
```
```   619   have X: "random_variable S X"
```
```   620     using Px by (auto simp: distributed_def)
```
```   621   have Y: "random_variable T Y"
```
```   622     using Py by (auto simp: distributed_def)
```
```   623   interpret S: sigma_finite_measure S by fact
```
```   624   interpret T: sigma_finite_measure T by fact
```
```   625   interpret ST: pair_sigma_finite S T ..
```
```   626   interpret X: prob_space "distr M S X" using X by (rule prob_space_distr)
```
```   627   interpret Y: prob_space "distr M T Y" using Y by (rule prob_space_distr)
```
```   628   interpret XY: pair_prob_space "distr M S X" "distr M T Y" ..
```
```   629   let ?P = "S \<Otimes>\<^sub>M T"
```
```   630   let ?D = "distr M ?P (\<lambda>x. (X x, Y x))"
```
```   631
```
```   632   { fix A assume "A \<in> sets S"
```
```   633     with X Y have "emeasure (distr M S X) A = emeasure ?D (A \<times> space T)"
```
```   634       by (auto simp: emeasure_distr measurable_Pair measurable_space
```
```   635                intro!: arg_cong[where f="emeasure M"]) }
```
```   636   note marginal_eq1 = this
```
```   637   { fix A assume "A \<in> sets T"
```
```   638     with X Y have "emeasure (distr M T Y) A = emeasure ?D (space S \<times> A)"
```
```   639       by (auto simp: emeasure_distr measurable_Pair measurable_space
```
```   640                intro!: arg_cong[where f="emeasure M"]) }
```
```   641   note marginal_eq2 = this
```
```   642
```
```   643   have eq: "(\<lambda>x. ereal (Px (fst x) * Py (snd x))) = (\<lambda>(x, y). ereal (Px x) * ereal (Py y))"
```
```   644     by auto
```
```   645
```
```   646   have distr_eq: "distr M S X \<Otimes>\<^sub>M distr M T Y = density ?P (\<lambda>x. ereal (Px (fst x) * Py (snd x)))"
```
```   647     unfolding Px(1)[THEN distributed_distr_eq_density] Py(1)[THEN distributed_distr_eq_density] eq
```
```   648   proof (subst pair_measure_density)
```
```   649     show "(\<lambda>x. ereal (Px x)) \<in> borel_measurable S" "(\<lambda>y. ereal (Py y)) \<in> borel_measurable T"
```
```   650       "AE x in S. 0 \<le> ereal (Px x)" "AE y in T. 0 \<le> ereal (Py y)"
```
```   651       using Px Py by (auto simp: distributed_def)
```
```   652     show "sigma_finite_measure (density T Py)" unfolding Py(1)[THEN distributed_distr_eq_density, symmetric] ..
```
```   653   qed (fact | simp)+
```
```   654
```
```   655   have M: "?M = KL_divergence b (density ?P (\<lambda>x. ereal (Px (fst x) * Py (snd x)))) (density ?P (\<lambda>x. ereal (Pxy x)))"
```
```   656     unfolding mutual_information_def distr_eq Pxy(1)[THEN distributed_distr_eq_density] ..
```
```   657
```
```   658   from Px Py have f: "(\<lambda>x. Px (fst x) * Py (snd x)) \<in> borel_measurable ?P"
```
```   659     by (intro borel_measurable_times) (auto intro: distributed_real_measurable measurable_fst'' measurable_snd'')
```
```   660   have PxPy_nonneg: "AE x in ?P. 0 \<le> Px (fst x) * Py (snd x)"
```
```   661   proof (rule ST.AE_pair_measure)
```
```   662     show "{x \<in> space ?P. 0 \<le> Px (fst x) * Py (snd x)} \<in> sets ?P"
```
```   663       using f by auto
```
```   664     show "AE x in S. AE y in T. 0 \<le> Px (fst (x, y)) * Py (snd (x, y))"
```
```   665       using Px Py by (auto simp: zero_le_mult_iff dest!: distributed_real_AE)
```
```   666   qed
```
```   667
```
```   668   have "(AE x in ?P. Px (fst x) = 0 \<longrightarrow> Pxy x = 0)"
```
```   669     by (rule subdensity_real[OF measurable_fst Pxy Px]) auto
```
```   670   moreover
```
```   671   have "(AE x in ?P. Py (snd x) = 0 \<longrightarrow> Pxy x = 0)"
```
```   672     by (rule subdensity_real[OF measurable_snd Pxy Py]) auto
```
```   673   ultimately have ac: "AE x in ?P. Px (fst x) * Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
```
```   674     by eventually_elim auto
```
```   675
```
```   676   show "?M = ?R"
```
```   677     unfolding M f_def
```
```   678     using b_gt_1 f PxPy_nonneg Pxy[THEN distributed_real_measurable] Pxy[THEN distributed_real_AE] ac
```
```   679     by (rule ST.KL_density_density)
```
```   680
```
```   681   assume int: "integrable (S \<Otimes>\<^sub>M T) f"
```
```   682   show "0 \<le> ?M" unfolding M
```
```   683   proof (rule ST.KL_density_density_nonneg
```
```   684     [OF b_gt_1 f PxPy_nonneg _ Pxy[THEN distributed_real_measurable] Pxy[THEN distributed_real_AE] _ ac int[unfolded f_def]])
```
```   685     show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Pxy x))) "
```
```   686       unfolding distributed_distr_eq_density[OF Pxy, symmetric]
```
```   687       using distributed_measurable[OF Pxy] by (rule prob_space_distr)
```
```   688     show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Px (fst x) * Py (snd x))))"
```
```   689       unfolding distr_eq[symmetric] by unfold_locales
```
```   690   qed
```
```   691 qed
```
```   692
```
```   693 lemma (in information_space)
```
```   694   fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
```
```   695   assumes "sigma_finite_measure S" "sigma_finite_measure T"
```
```   696   assumes Px[measurable]: "distributed M S X Px" and Py[measurable]: "distributed M T Y Py"
```
```   697   assumes Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
```
```   698   assumes ae: "AE x in S. AE y in T. Pxy (x, y) = Px x * Py y"
```
```   699   shows mutual_information_eq_0: "mutual_information b S T X Y = 0"
```
```   700 proof -
```
```   701   interpret S: sigma_finite_measure S by fact
```
```   702   interpret T: sigma_finite_measure T by fact
```
```   703   interpret ST: pair_sigma_finite S T ..
```
```   704
```
```   705   have "AE x in S \<Otimes>\<^sub>M T. Px (fst x) = 0 \<longrightarrow> Pxy x = 0"
```
```   706     by (rule subdensity_real[OF measurable_fst Pxy Px]) auto
```
```   707   moreover
```
```   708   have "AE x in S \<Otimes>\<^sub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
```
```   709     by (rule subdensity_real[OF measurable_snd Pxy Py]) auto
```
```   710   moreover
```
```   711   have "AE x in S \<Otimes>\<^sub>M T. Pxy x = Px (fst x) * Py (snd x)"
```
```   712     using distributed_real_measurable[OF Px] distributed_real_measurable[OF Py] distributed_real_measurable[OF Pxy]
```
```   713     by (intro ST.AE_pair_measure) (auto simp: ae intro!: measurable_snd'' measurable_fst'')
```
```   714   ultimately have "AE x in S \<Otimes>\<^sub>M T. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) = 0"
```
```   715     by eventually_elim simp
```
```   716   then have "(\<integral>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) \<partial>(S \<Otimes>\<^sub>M T)) = (\<integral>x. 0 \<partial>(S \<Otimes>\<^sub>M T))"
```
```   717     by (intro integral_cong_AE) auto
```
```   718   then show ?thesis
```
```   719     by (subst mutual_information_distr[OF assms(1-5)]) simp
```
```   720 qed
```
```   721
```
```   722 lemma (in information_space) mutual_information_simple_distributed:
```
```   723   assumes X: "simple_distributed M X Px" and Y: "simple_distributed M Y Py"
```
```   724   assumes XY: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy"
```
```   725   shows "\<I>(X ; Y) = (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x))`space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y)))"
```
```   726 proof (subst mutual_information_distr[OF _ _ simple_distributed[OF X] simple_distributed[OF Y] simple_distributed_joint[OF XY]])
```
```   727   note fin = simple_distributed_joint_finite[OF XY, simp]
```
```   728   show "sigma_finite_measure (count_space (X ` space M))"
```
```   729     by (simp add: sigma_finite_measure_count_space_finite)
```
```   730   show "sigma_finite_measure (count_space (Y ` space M))"
```
```   731     by (simp add: sigma_finite_measure_count_space_finite)
```
```   732   let ?Pxy = "\<lambda>x. (if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x else 0)"
```
```   733   let ?f = "\<lambda>x. ?Pxy x * log b (?Pxy x / (Px (fst x) * Py (snd x)))"
```
```   734   have "\<And>x. ?f x = (if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) else 0)"
```
```   735     by auto
```
```   736   with fin show "(\<integral> x. ?f x \<partial>(count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M))) =
```
```   737     (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y)))"
```
```   738     by (auto simp add: pair_measure_count_space lebesgue_integral_count_space_finite setsum.If_cases split_beta'
```
```   739              intro!: setsum.cong)
```
```   740 qed
```
```   741
```
```   742 lemma (in information_space)
```
```   743   fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
```
```   744   assumes Px: "simple_distributed M X Px" and Py: "simple_distributed M Y Py"
```
```   745   assumes Pxy: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy"
```
```   746   assumes ae: "\<forall>x\<in>space M. Pxy (X x, Y x) = Px (X x) * Py (Y x)"
```
```   747   shows mutual_information_eq_0_simple: "\<I>(X ; Y) = 0"
```
```   748 proof (subst mutual_information_simple_distributed[OF Px Py Pxy])
```
```   749   have "(\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y))) =
```
```   750     (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. 0)"
```
```   751     by (intro setsum.cong) (auto simp: ae)
```
```   752   then show "(\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M.
```
```   753     Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y))) = 0" by simp
```
```   754 qed
```
```   755
```
```   756 subsection {* Entropy *}
```
```   757
```
```   758 definition (in prob_space) entropy :: "real \<Rightarrow> 'b measure \<Rightarrow> ('a \<Rightarrow> 'b) \<Rightarrow> real" where
```
```   759   "entropy b S X = - KL_divergence b S (distr M S X)"
```
```   760
```
```   761 abbreviation (in information_space)
```
```   762   entropy_Pow ("\<H>'(_')") where
```
```   763   "\<H>(X) \<equiv> entropy b (count_space (X`space M)) X"
```
```   764
```
```   765 lemma (in prob_space) distributed_RN_deriv:
```
```   766   assumes X: "distributed M S X Px"
```
```   767   shows "AE x in S. RN_deriv S (density S Px) x = Px x"
```
```   768 proof -
```
```   769   note D = distributed_measurable[OF X] distributed_borel_measurable[OF X] distributed_AE[OF X]
```
```   770   interpret X: prob_space "distr M S X"
```
```   771     using D(1) by (rule prob_space_distr)
```
```   772
```
```   773   have sf: "sigma_finite_measure (distr M S X)" by default
```
```   774   show ?thesis
```
```   775     using D
```
```   776     apply (subst eq_commute)
```
```   777     apply (intro RN_deriv_unique_sigma_finite)
```
```   778     apply (auto simp: distributed_distr_eq_density[symmetric, OF X] sf measure_nonneg)
```
```   779     done
```
```   780 qed
```
```   781
```
```   782 lemma (in information_space)
```
```   783   fixes X :: "'a \<Rightarrow> 'b"
```
```   784   assumes X[measurable]: "distributed M MX X f"
```
```   785   shows entropy_distr: "entropy b MX X = - (\<integral>x. f x * log b (f x) \<partial>MX)" (is ?eq)
```
```   786 proof -
```
```   787   note D = distributed_measurable[OF X] distributed_borel_measurable[OF X] distributed_AE[OF X]
```
```   788   note ae = distributed_RN_deriv[OF X]
```
```   789
```
```   790   have ae_eq: "AE x in distr M MX X. log b (real (RN_deriv MX (distr M MX X) x)) =
```
```   791     log b (f x)"
```
```   792     unfolding distributed_distr_eq_density[OF X]
```
```   793     apply (subst AE_density)
```
```   794     using D apply simp
```
```   795     using ae apply eventually_elim
```
```   796     apply auto
```
```   797     done
```
```   798
```
```   799   have int_eq: "(\<integral> x. f x * log b (f x) \<partial>MX) = (\<integral> x. log b (f x) \<partial>distr M MX X)"
```
```   800     unfolding distributed_distr_eq_density[OF X]
```
```   801     using D
```
```   802     by (subst integral_density)
```
```   803        (auto simp: borel_measurable_ereal_iff)
```
```   804
```
```   805   show ?eq
```
```   806     unfolding entropy_def KL_divergence_def entropy_density_def comp_def int_eq neg_equal_iff_equal
```
```   807     using ae_eq by (intro integral_cong_AE) auto
```
```   808 qed
```
```   809
```
```   810 lemma (in prob_space) distributed_imp_emeasure_nonzero:
```
```   811   assumes X: "distributed M MX X Px"
```
```   812   shows "emeasure MX {x \<in> space MX. Px x \<noteq> 0} \<noteq> 0"
```
```   813 proof
```
```   814   note Px = distributed_borel_measurable[OF X] distributed_AE[OF X]
```
```   815   interpret X: prob_space "distr M MX X"
```
```   816     using distributed_measurable[OF X] by (rule prob_space_distr)
```
```   817
```
```   818   assume "emeasure MX {x \<in> space MX. Px x \<noteq> 0} = 0"
```
```   819   with Px have "AE x in MX. Px x = 0"
```
```   820     by (intro AE_I[OF subset_refl]) (auto simp: borel_measurable_ereal_iff)
```
```   821   moreover
```
```   822   from X.emeasure_space_1 have "(\<integral>\<^sup>+x. Px x \<partial>MX) = 1"
```
```   823     unfolding distributed_distr_eq_density[OF X] using Px
```
```   824     by (subst (asm) emeasure_density)
```
```   825        (auto simp: borel_measurable_ereal_iff intro!: integral_cong cong: nn_integral_cong)
```
```   826   ultimately show False
```
```   827     by (simp add: nn_integral_cong_AE)
```
```   828 qed
```
```   829
```
```   830 lemma (in information_space) entropy_le:
```
```   831   fixes Px :: "'b \<Rightarrow> real" and MX :: "'b measure"
```
```   832   assumes X[measurable]: "distributed M MX X Px"
```
```   833   and fin: "emeasure MX {x \<in> space MX. Px x \<noteq> 0} \<noteq> \<infinity>"
```
```   834   and int: "integrable MX (\<lambda>x. - Px x * log b (Px x))"
```
```   835   shows "entropy b MX X \<le> log b (measure MX {x \<in> space MX. Px x \<noteq> 0})"
```
```   836 proof -
```
```   837   note Px = distributed_borel_measurable[OF X] distributed_AE[OF X]
```
```   838   interpret X: prob_space "distr M MX X"
```
```   839     using distributed_measurable[OF X] by (rule prob_space_distr)
```
```   840
```
```   841   have " - log b (measure MX {x \<in> space MX. Px x \<noteq> 0}) =
```
```   842     - log b (\<integral> x. indicator {x \<in> space MX. Px x \<noteq> 0} x \<partial>MX)"
```
```   843     using Px fin
```
```   844     by (auto simp: measure_def borel_measurable_ereal_iff)
```
```   845   also have "- log b (\<integral> x. indicator {x \<in> space MX. Px x \<noteq> 0} x \<partial>MX) = - log b (\<integral> x. 1 / Px x \<partial>distr M MX X)"
```
```   846     unfolding distributed_distr_eq_density[OF X] using Px
```
```   847     apply (intro arg_cong[where f="log b"] arg_cong[where f=uminus])
```
```   848     by (subst integral_density) (auto simp: borel_measurable_ereal_iff simp del: integral_indicator intro!: integral_cong)
```
```   849   also have "\<dots> \<le> (\<integral> x. - log b (1 / Px x) \<partial>distr M MX X)"
```
```   850   proof (rule X.jensens_inequality[of "\<lambda>x. 1 / Px x" "{0<..}" 0 1 "\<lambda>x. - log b x"])
```
```   851     show "AE x in distr M MX X. 1 / Px x \<in> {0<..}"
```
```   852       unfolding distributed_distr_eq_density[OF X]
```
```   853       using Px by (auto simp: AE_density)
```
```   854     have [simp]: "\<And>x. x \<in> space MX \<Longrightarrow> ereal (if Px x = 0 then 0 else 1) = indicator {x \<in> space MX. Px x \<noteq> 0} x"
```
```   855       by (auto simp: one_ereal_def)
```
```   856     have "(\<integral>\<^sup>+ x. max 0 (ereal (- (if Px x = 0 then 0 else 1))) \<partial>MX) = (\<integral>\<^sup>+ x. 0 \<partial>MX)"
```
```   857       by (intro nn_integral_cong) (auto split: split_max)
```
```   858     then show "integrable (distr M MX X) (\<lambda>x. 1 / Px x)"
```
```   859       unfolding distributed_distr_eq_density[OF X] using Px
```
```   860       by (auto simp: nn_integral_density real_integrable_def borel_measurable_ereal_iff fin nn_integral_max_0
```
```   861               cong: nn_integral_cong)
```
```   862     have "integrable MX (\<lambda>x. Px x * log b (1 / Px x)) =
```
```   863       integrable MX (\<lambda>x. - Px x * log b (Px x))"
```
```   864       using Px
```
```   865       by (intro integrable_cong_AE)
```
```   866          (auto simp: borel_measurable_ereal_iff log_divide_eq
```
```   867                   intro!: measurable_If)
```
```   868     then show "integrable (distr M MX X) (\<lambda>x. - log b (1 / Px x))"
```
```   869       unfolding distributed_distr_eq_density[OF X]
```
```   870       using Px int
```
```   871       by (subst integrable_real_density) (auto simp: borel_measurable_ereal_iff)
```
```   872   qed (auto simp: minus_log_convex[OF b_gt_1])
```
```   873   also have "\<dots> = (\<integral> x. log b (Px x) \<partial>distr M MX X)"
```
```   874     unfolding distributed_distr_eq_density[OF X] using Px
```
```   875     by (intro integral_cong_AE) (auto simp: AE_density log_divide_eq)
```
```   876   also have "\<dots> = - entropy b MX X"
```
```   877     unfolding distributed_distr_eq_density[OF X] using Px
```
```   878     by (subst entropy_distr[OF X]) (auto simp: borel_measurable_ereal_iff integral_density)
```
```   879   finally show ?thesis
```
```   880     by simp
```
```   881 qed
```
```   882
```
```   883 lemma (in information_space) entropy_le_space:
```
```   884   fixes Px :: "'b \<Rightarrow> real" and MX :: "'b measure"
```
```   885   assumes X: "distributed M MX X Px"
```
```   886   and fin: "finite_measure MX"
```
```   887   and int: "integrable MX (\<lambda>x. - Px x * log b (Px x))"
```
```   888   shows "entropy b MX X \<le> log b (measure MX (space MX))"
```
```   889 proof -
```
```   890   note Px = distributed_borel_measurable[OF X] distributed_AE[OF X]
```
```   891   interpret finite_measure MX by fact
```
```   892   have "entropy b MX X \<le> log b (measure MX {x \<in> space MX. Px x \<noteq> 0})"
```
```   893     using int X by (intro entropy_le) auto
```
```   894   also have "\<dots> \<le> log b (measure MX (space MX))"
```
```   895     using Px distributed_imp_emeasure_nonzero[OF X]
```
```   896     by (intro log_le)
```
```   897        (auto intro!: borel_measurable_ereal_iff finite_measure_mono b_gt_1
```
```   898                      less_le[THEN iffD2] measure_nonneg simp: emeasure_eq_measure)
```
```   899   finally show ?thesis .
```
```   900 qed
```
```   901
```
```   902 lemma (in information_space) entropy_uniform:
```
```   903   assumes X: "distributed M MX X (\<lambda>x. indicator A x / measure MX A)" (is "distributed _ _ _ ?f")
```
```   904   shows "entropy b MX X = log b (measure MX A)"
```
```   905 proof (subst entropy_distr[OF X])
```
```   906   have [simp]: "emeasure MX A \<noteq> \<infinity>"
```
```   907     using uniform_distributed_params[OF X] by (auto simp add: measure_def)
```
```   908   have eq: "(\<integral> x. indicator A x / measure MX A * log b (indicator A x / measure MX A) \<partial>MX) =
```
```   909     (\<integral> x. (- log b (measure MX A) / measure MX A) * indicator A x \<partial>MX)"
```
```   910     using measure_nonneg[of MX A] uniform_distributed_params[OF X]
```
```   911     by (intro integral_cong) (auto split: split_indicator simp: log_divide_eq)
```
```   912   show "- (\<integral> x. indicator A x / measure MX A * log b (indicator A x / measure MX A) \<partial>MX) =
```
```   913     log b (measure MX A)"
```
```   914     unfolding eq using uniform_distributed_params[OF X]
```
```   915     by (subst integral_mult_right) (auto simp: measure_def)
```
```   916 qed
```
```   917
```
```   918 lemma (in information_space) entropy_simple_distributed:
```
```   919   "simple_distributed M X f \<Longrightarrow> \<H>(X) = - (\<Sum>x\<in>X`space M. f x * log b (f x))"
```
```   920   by (subst entropy_distr[OF simple_distributed])
```
```   921      (auto simp add: lebesgue_integral_count_space_finite)
```
```   922
```
```   923 lemma (in information_space) entropy_le_card_not_0:
```
```   924   assumes X: "simple_distributed M X f"
```
```   925   shows "\<H>(X) \<le> log b (card (X ` space M \<inter> {x. f x \<noteq> 0}))"
```
```   926 proof -
```
```   927   let ?X = "count_space (X`space M)"
```
```   928   have "\<H>(X) \<le> log b (measure ?X {x \<in> space ?X. f x \<noteq> 0})"
```
```   929     by (rule entropy_le[OF simple_distributed[OF X]])
```
```   930        (simp_all add: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space)
```
```   931   also have "measure ?X {x \<in> space ?X. f x \<noteq> 0} = card (X ` space M \<inter> {x. f x \<noteq> 0})"
```
```   932     by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def Int_def)
```
```   933   finally show ?thesis .
```
```   934 qed
```
```   935
```
```   936 lemma (in information_space) entropy_le_card:
```
```   937   assumes X: "simple_distributed M X f"
```
```   938   shows "\<H>(X) \<le> log b (real (card (X ` space M)))"
```
```   939 proof -
```
```   940   let ?X = "count_space (X`space M)"
```
```   941   have "\<H>(X) \<le> log b (measure ?X (space ?X))"
```
```   942     by (rule entropy_le_space[OF simple_distributed[OF X]])
```
```   943        (simp_all add: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space finite_measure_count_space)
```
```   944   also have "measure ?X (space ?X) = card (X ` space M)"
```
```   945     by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def)
```
```   946   finally show ?thesis .
```
```   947 qed
```
```   948
```
```   949 subsection {* Conditional Mutual Information *}
```
```   950
```
```   951 definition (in prob_space)
```
```   952   "conditional_mutual_information b MX MY MZ X Y Z \<equiv>
```
```   953     mutual_information b MX (MY \<Otimes>\<^sub>M MZ) X (\<lambda>x. (Y x, Z x)) -
```
```   954     mutual_information b MX MZ X Z"
```
```   955
```
```   956 abbreviation (in information_space)
```
```   957   conditional_mutual_information_Pow ("\<I>'( _ ; _ | _ ')") where
```
```   958   "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b
```
```   959     (count_space (X ` space M)) (count_space (Y ` space M)) (count_space (Z ` space M)) X Y Z"
```
```   960
```
```   961 lemma (in information_space)
```
```   962   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" and P: "sigma_finite_measure P"
```
```   963   assumes Px[measurable]: "distributed M S X Px"
```
```   964   assumes Pz[measurable]: "distributed M P Z Pz"
```
```   965   assumes Pyz[measurable]: "distributed M (T \<Otimes>\<^sub>M P) (\<lambda>x. (Y x, Z x)) Pyz"
```
```   966   assumes Pxz[measurable]: "distributed M (S \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Z x)) Pxz"
```
```   967   assumes Pxyz[measurable]: "distributed M (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x)) Pxyz"
```
```   968   assumes I1: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z))))"
```
```   969   assumes I2: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z)))"
```
```   970   shows conditional_mutual_information_generic_eq: "conditional_mutual_information b S T P X Y Z
```
```   971     = (\<integral>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" (is "?eq")
```
```   972     and conditional_mutual_information_generic_nonneg: "0 \<le> conditional_mutual_information b S T P X Y Z" (is "?nonneg")
```
```   973 proof -
```
```   974   interpret S: sigma_finite_measure S by fact
```
```   975   interpret T: sigma_finite_measure T by fact
```
```   976   interpret P: sigma_finite_measure P by fact
```
```   977   interpret TP: pair_sigma_finite T P ..
```
```   978   interpret SP: pair_sigma_finite S P ..
```
```   979   interpret ST: pair_sigma_finite S T ..
```
```   980   interpret SPT: pair_sigma_finite "S \<Otimes>\<^sub>M P" T ..
```
```   981   interpret STP: pair_sigma_finite S "T \<Otimes>\<^sub>M P" ..
```
```   982   interpret TPS: pair_sigma_finite "T \<Otimes>\<^sub>M P" S ..
```
```   983   have TP: "sigma_finite_measure (T \<Otimes>\<^sub>M P)" ..
```
```   984   have SP: "sigma_finite_measure (S \<Otimes>\<^sub>M P)" ..
```
```   985   have YZ: "random_variable (T \<Otimes>\<^sub>M P) (\<lambda>x. (Y x, Z x))"
```
```   986     using Pyz by (simp add: distributed_measurable)
```
```   987
```
```   988   from Pxz Pxyz have distr_eq: "distr M (S \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Z x)) =
```
```   989     distr (distr M (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x))) (S \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). (x, z))"
```
```   990     by (simp add: comp_def distr_distr)
```
```   991
```
```   992   have "mutual_information b S P X Z =
```
```   993     (\<integral>x. Pxz x * log b (Pxz x / (Px (fst x) * Pz (snd x))) \<partial>(S \<Otimes>\<^sub>M P))"
```
```   994     by (rule mutual_information_distr[OF S P Px Pz Pxz])
```
```   995   also have "\<dots> = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))"
```
```   996     using b_gt_1 Pxz Px Pz
```
```   997     by (subst distributed_transform_integral[OF Pxyz Pxz, where T="\<lambda>(x, y, z). (x, z)"]) (auto simp: split_beta')
```
```   998   finally have mi_eq:
```
```   999     "mutual_information b S P X Z = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" .
```
```  1000
```
```  1001   have ae1: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Px (fst x) = 0 \<longrightarrow> Pxyz x = 0"
```
```  1002     by (intro subdensity_real[of fst, OF _ Pxyz Px]) auto
```
```  1003   moreover have ae2: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
```
```  1004     by (intro subdensity_real[of "\<lambda>x. snd (snd x)", OF _ Pxyz Pz]) auto
```
```  1005   moreover have ae3: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
```
```  1006     by (intro subdensity_real[of "\<lambda>x. (fst x, snd (snd x))", OF _ Pxyz Pxz]) auto
```
```  1007   moreover have ae4: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
```
```  1008     by (intro subdensity_real[of snd, OF _ Pxyz Pyz]) auto
```
```  1009   moreover have ae5: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Px (fst x)"
```
```  1010     using Px by (intro STP.AE_pair_measure) (auto simp: comp_def dest: distributed_real_AE)
```
```  1011   moreover have ae6: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pyz (snd x)"
```
```  1012     using Pyz by (intro STP.AE_pair_measure) (auto simp: comp_def dest: distributed_real_AE)
```
```  1013   moreover have ae7: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pz (snd (snd x))"
```
```  1014     using Pz Pz[THEN distributed_real_measurable]
```
```  1015     by (auto intro!: TP.AE_pair_measure STP.AE_pair_measure AE_I2[of S] dest: distributed_real_AE)
```
```  1016   moreover have ae8: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pxz (fst x, snd (snd x))"
```
```  1017     using Pxz[THEN distributed_real_AE, THEN SP.AE_pair]
```
```  1018     by (auto intro!: TP.AE_pair_measure STP.AE_pair_measure)
```
```  1019   moreover note Pxyz[THEN distributed_real_AE]
```
```  1020   ultimately have ae: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P.
```
```  1021     Pxyz x * log b (Pxyz x / (Px (fst x) * Pyz (snd x))) -
```
```  1022     Pxyz x * log b (Pxz (fst x, snd (snd x)) / (Px (fst x) * Pz (snd (snd x)))) =
```
```  1023     Pxyz x * log b (Pxyz x * Pz (snd (snd x)) / (Pxz (fst x, snd (snd x)) * Pyz (snd x))) "
```
```  1024   proof eventually_elim
```
```  1025     case (goal1 x)
```
```  1026     show ?case
```
```  1027     proof cases
```
```  1028       assume "Pxyz x \<noteq> 0"
```
```  1029       with goal1 have "0 < Px (fst x)" "0 < Pz (snd (snd x))" "0 < Pxz (fst x, snd (snd x))" "0 < Pyz (snd x)" "0 < Pxyz x"
```
```  1030         by auto
```
```  1031       then show ?thesis
```
```  1032         using b_gt_1 by (simp add: log_simps less_imp_le field_simps)
```
```  1033     qed simp
```
```  1034   qed
```
```  1035   with I1 I2 show ?eq
```
```  1036     unfolding conditional_mutual_information_def
```
```  1037     apply (subst mi_eq)
```
```  1038     apply (subst mutual_information_distr[OF S TP Px Pyz Pxyz])
```
```  1039     apply (subst integral_diff[symmetric])
```
```  1040     apply (auto intro!: integral_cong_AE simp: split_beta' simp del: integral_diff)
```
```  1041     done
```
```  1042
```
```  1043   let ?P = "density (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) Pxyz"
```
```  1044   interpret P: prob_space ?P
```
```  1045     unfolding distributed_distr_eq_density[OF Pxyz, symmetric]
```
```  1046     by (rule prob_space_distr) simp
```
```  1047
```
```  1048   let ?Q = "density (T \<Otimes>\<^sub>M P) Pyz"
```
```  1049   interpret Q: prob_space ?Q
```
```  1050     unfolding distributed_distr_eq_density[OF Pyz, symmetric]
```
```  1051     by (rule prob_space_distr) simp
```
```  1052
```
```  1053   let ?f = "\<lambda>(x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) / Pxyz (x, y, z)"
```
```  1054
```
```  1055   from subdensity_real[of snd, OF _ Pyz Pz]
```
```  1056   have aeX1: "AE x in T \<Otimes>\<^sub>M P. Pz (snd x) = 0 \<longrightarrow> Pyz x = 0" by (auto simp: comp_def)
```
```  1057   have aeX2: "AE x in T \<Otimes>\<^sub>M P. 0 \<le> Pz (snd x)"
```
```  1058     using Pz by (intro TP.AE_pair_measure) (auto simp: comp_def dest: distributed_real_AE)
```
```  1059
```
```  1060   have aeX3: "AE y in T \<Otimes>\<^sub>M P. (\<integral>\<^sup>+ x. ereal (Pxz (x, snd y)) \<partial>S) = ereal (Pz (snd y))"
```
```  1061     using Pz distributed_marginal_eq_joint2[OF P S Pz Pxz]
```
```  1062     by (intro TP.AE_pair_measure) (auto dest: distributed_real_AE)
```
```  1063
```
```  1064   have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> (\<integral>\<^sup>+ (x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))"
```
```  1065     apply (subst nn_integral_density)
```
```  1066     apply simp
```
```  1067     apply (rule distributed_AE[OF Pxyz])
```
```  1068     apply auto []
```
```  1069     apply (rule nn_integral_mono_AE)
```
```  1070     using ae5 ae6 ae7 ae8
```
```  1071     apply eventually_elim
```
```  1072     apply auto
```
```  1073     done
```
```  1074   also have "\<dots> = (\<integral>\<^sup>+(y, z). \<integral>\<^sup>+ x. ereal (Pxz (x, z)) * ereal (Pyz (y, z) / Pz z) \<partial>S \<partial>T \<Otimes>\<^sub>M P)"
```
```  1075     by (subst STP.nn_integral_snd[symmetric]) (auto simp add: split_beta')
```
```  1076   also have "\<dots> = (\<integral>\<^sup>+x. ereal (Pyz x) * 1 \<partial>T \<Otimes>\<^sub>M P)"
```
```  1077     apply (rule nn_integral_cong_AE)
```
```  1078     using aeX1 aeX2 aeX3 distributed_AE[OF Pyz] AE_space
```
```  1079     apply eventually_elim
```
```  1080   proof (case_tac x, simp del: times_ereal.simps add: space_pair_measure)
```
```  1081     fix a b assume "Pz b = 0 \<longrightarrow> Pyz (a, b) = 0" "0 \<le> Pz b" "a \<in> space T \<and> b \<in> space P"
```
```  1082       "(\<integral>\<^sup>+ x. ereal (Pxz (x, b)) \<partial>S) = ereal (Pz b)" "0 \<le> Pyz (a, b)"
```
```  1083     then show "(\<integral>\<^sup>+ x. ereal (Pxz (x, b)) * ereal (Pyz (a, b) / Pz b) \<partial>S) = ereal (Pyz (a, b))"
```
```  1084       by (subst nn_integral_multc)
```
```  1085          (auto split: prod.split)
```
```  1086   qed
```
```  1087   also have "\<dots> = 1"
```
```  1088     using Q.emeasure_space_1 distributed_AE[OF Pyz] distributed_distr_eq_density[OF Pyz]
```
```  1089     by (subst nn_integral_density[symmetric]) auto
```
```  1090   finally have le1: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> 1" .
```
```  1091   also have "\<dots> < \<infinity>" by simp
```
```  1092   finally have fin: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>" by simp
```
```  1093
```
```  1094   have pos: "(\<integral>\<^sup>+x. ?f x \<partial>?P) \<noteq> 0"
```
```  1095     apply (subst nn_integral_density)
```
```  1096     apply simp
```
```  1097     apply (rule distributed_AE[OF Pxyz])
```
```  1098     apply auto []
```
```  1099     apply (simp add: split_beta')
```
```  1100   proof
```
```  1101     let ?g = "\<lambda>x. ereal (if Pxyz x = 0 then 0 else Pxz (fst x, snd (snd x)) * Pyz (snd x) / Pz (snd (snd x)))"
```
```  1102     assume "(\<integral>\<^sup>+x. ?g x \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)) = 0"
```
```  1103     then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. ?g x \<le> 0"
```
```  1104       by (intro nn_integral_0_iff_AE[THEN iffD1]) auto
```
```  1105     then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxyz x = 0"
```
```  1106       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
```
```  1107       by eventually_elim (auto split: split_if_asm simp: mult_le_0_iff divide_le_0_iff)
```
```  1108     then have "(\<integral>\<^sup>+ x. ereal (Pxyz x) \<partial>S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) = 0"
```
```  1109       by (subst nn_integral_cong_AE[of _ "\<lambda>x. 0"]) auto
```
```  1110     with P.emeasure_space_1 show False
```
```  1111       by (subst (asm) emeasure_density) (auto cong: nn_integral_cong)
```
```  1112   qed
```
```  1113
```
```  1114   have neg: "(\<integral>\<^sup>+ x. - ?f x \<partial>?P) = 0"
```
```  1115     apply (rule nn_integral_0_iff_AE[THEN iffD2])
```
```  1116     apply simp
```
```  1117     apply (subst AE_density)
```
```  1118     apply simp
```
```  1119     using ae5 ae6 ae7 ae8
```
```  1120     apply eventually_elim
```
```  1121     apply auto
```
```  1122     done
```
```  1123
```
```  1124   have I3: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
```
```  1125     apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ integrable_diff[OF I1 I2]])
```
```  1126     using ae
```
```  1127     apply (auto simp: split_beta')
```
```  1128     done
```
```  1129
```
```  1130   have "- log b 1 \<le> - log b (integral\<^sup>L ?P ?f)"
```
```  1131   proof (intro le_imp_neg_le log_le[OF b_gt_1])
```
```  1132     have If: "integrable ?P ?f"
```
```  1133       unfolding real_integrable_def
```
```  1134     proof (intro conjI)
```
```  1135       from neg show "(\<integral>\<^sup>+ x. - ?f x \<partial>?P) \<noteq> \<infinity>"
```
```  1136         by simp
```
```  1137       from fin show "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>"
```
```  1138         by simp
```
```  1139     qed simp
```
```  1140     then have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) = (\<integral>x. ?f x \<partial>?P)"
```
```  1141       apply (rule nn_integral_eq_integral)
```
```  1142       apply (subst AE_density)
```
```  1143       apply simp
```
```  1144       using ae5 ae6 ae7 ae8
```
```  1145       apply eventually_elim
```
```  1146       apply auto
```
```  1147       done
```
```  1148     with nn_integral_nonneg[of ?P ?f] pos le1
```
```  1149     show "0 < (\<integral>x. ?f x \<partial>?P)" "(\<integral>x. ?f x \<partial>?P) \<le> 1"
```
```  1150       by (simp_all add: one_ereal_def)
```
```  1151   qed
```
```  1152   also have "- log b (integral\<^sup>L ?P ?f) \<le> (\<integral> x. - log b (?f x) \<partial>?P)"
```
```  1153   proof (rule P.jensens_inequality[where a=0 and b=1 and I="{0<..}"])
```
```  1154     show "AE x in ?P. ?f x \<in> {0<..}"
```
```  1155       unfolding AE_density[OF distributed_borel_measurable[OF Pxyz]]
```
```  1156       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
```
```  1157       by eventually_elim (auto)
```
```  1158     show "integrable ?P ?f"
```
```  1159       unfolding real_integrable_def
```
```  1160       using fin neg by (auto simp: split_beta')
```
```  1161     show "integrable ?P (\<lambda>x. - log b (?f x))"
```
```  1162       apply (subst integrable_real_density)
```
```  1163       apply simp
```
```  1164       apply (auto intro!: distributed_real_AE[OF Pxyz]) []
```
```  1165       apply simp
```
```  1166       apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ I3])
```
```  1167       apply simp
```
```  1168       apply simp
```
```  1169       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
```
```  1170       apply eventually_elim
```
```  1171       apply (auto simp: log_divide_eq log_mult_eq zero_le_mult_iff zero_less_mult_iff zero_less_divide_iff field_simps)
```
```  1172       done
```
```  1173   qed (auto simp: b_gt_1 minus_log_convex)
```
```  1174   also have "\<dots> = conditional_mutual_information b S T P X Y Z"
```
```  1175     unfolding `?eq`
```
```  1176     apply (subst integral_real_density)
```
```  1177     apply simp
```
```  1178     apply (auto intro!: distributed_real_AE[OF Pxyz]) []
```
```  1179     apply simp
```
```  1180     apply (intro integral_cong_AE)
```
```  1181     using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
```
```  1182     apply (auto simp: log_divide_eq zero_less_mult_iff zero_less_divide_iff field_simps)
```
```  1183     done
```
```  1184   finally show ?nonneg
```
```  1185     by simp
```
```  1186 qed
```
```  1187
```
```  1188 lemma (in information_space)
```
```  1189   fixes Px :: "_ \<Rightarrow> real"
```
```  1190   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" and P: "sigma_finite_measure P"
```
```  1191   assumes Fx: "finite_entropy S X Px"
```
```  1192   assumes Fz: "finite_entropy P Z Pz"
```
```  1193   assumes Fyz: "finite_entropy (T \<Otimes>\<^sub>M P) (\<lambda>x. (Y x, Z x)) Pyz"
```
```  1194   assumes Fxz: "finite_entropy (S \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Z x)) Pxz"
```
```  1195   assumes Fxyz: "finite_entropy (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x)) Pxyz"
```
```  1196   shows conditional_mutual_information_generic_eq': "conditional_mutual_information b S T P X Y Z
```
```  1197     = (\<integral>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" (is "?eq")
```
```  1198     and conditional_mutual_information_generic_nonneg': "0 \<le> conditional_mutual_information b S T P X Y Z" (is "?nonneg")
```
```  1199 proof -
```
```  1200   note Px = Fx[THEN finite_entropy_distributed, measurable]
```
```  1201   note Pz = Fz[THEN finite_entropy_distributed, measurable]
```
```  1202   note Pyz = Fyz[THEN finite_entropy_distributed, measurable]
```
```  1203   note Pxz = Fxz[THEN finite_entropy_distributed, measurable]
```
```  1204   note Pxyz = Fxyz[THEN finite_entropy_distributed, measurable]
```
```  1205
```
```  1206   interpret S: sigma_finite_measure S by fact
```
```  1207   interpret T: sigma_finite_measure T by fact
```
```  1208   interpret P: sigma_finite_measure P by fact
```
```  1209   interpret TP: pair_sigma_finite T P ..
```
```  1210   interpret SP: pair_sigma_finite S P ..
```
```  1211   interpret ST: pair_sigma_finite S T ..
```
```  1212   interpret SPT: pair_sigma_finite "S \<Otimes>\<^sub>M P" T ..
```
```  1213   interpret STP: pair_sigma_finite S "T \<Otimes>\<^sub>M P" ..
```
```  1214   interpret TPS: pair_sigma_finite "T \<Otimes>\<^sub>M P" S ..
```
```  1215   have TP: "sigma_finite_measure (T \<Otimes>\<^sub>M P)" ..
```
```  1216   have SP: "sigma_finite_measure (S \<Otimes>\<^sub>M P)" ..
```
```  1217
```
```  1218   from Pxz Pxyz have distr_eq: "distr M (S \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Z x)) =
```
```  1219     distr (distr M (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x))) (S \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). (x, z))"
```
```  1220     by (simp add: distr_distr comp_def)
```
```  1221
```
```  1222   have "mutual_information b S P X Z =
```
```  1223     (\<integral>x. Pxz x * log b (Pxz x / (Px (fst x) * Pz (snd x))) \<partial>(S \<Otimes>\<^sub>M P))"
```
```  1224     by (rule mutual_information_distr[OF S P Px Pz Pxz])
```
```  1225   also have "\<dots> = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))"
```
```  1226     using b_gt_1 Pxz Px Pz
```
```  1227     by (subst distributed_transform_integral[OF Pxyz Pxz, where T="\<lambda>(x, y, z). (x, z)"])
```
```  1228        (auto simp: split_beta')
```
```  1229   finally have mi_eq:
```
```  1230     "mutual_information b S P X Z = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" .
```
```  1231
```
```  1232   have ae1: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Px (fst x) = 0 \<longrightarrow> Pxyz x = 0"
```
```  1233     by (intro subdensity_real[of fst, OF _ Pxyz Px]) auto
```
```  1234   moreover have ae2: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
```
```  1235     by (intro subdensity_real[of "\<lambda>x. snd (snd x)", OF _ Pxyz Pz]) auto
```
```  1236   moreover have ae3: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
```
```  1237     by (intro subdensity_real[of "\<lambda>x. (fst x, snd (snd x))", OF _ Pxyz Pxz]) auto
```
```  1238   moreover have ae4: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
```
```  1239     by (intro subdensity_real[of snd, OF _ Pxyz Pyz]) auto
```
```  1240   moreover have ae5: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Px (fst x)"
```
```  1241     using Px by (intro STP.AE_pair_measure) (auto dest: distributed_real_AE)
```
```  1242   moreover have ae6: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pyz (snd x)"
```
```  1243     using Pyz by (intro STP.AE_pair_measure) (auto dest: distributed_real_AE)
```
```  1244   moreover have ae7: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pz (snd (snd x))"
```
```  1245     using Pz Pz[THEN distributed_real_measurable] by (auto intro!: TP.AE_pair_measure STP.AE_pair_measure AE_I2[of S] dest: distributed_real_AE)
```
```  1246   moreover have ae8: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pxz (fst x, snd (snd x))"
```
```  1247     using Pxz[THEN distributed_real_AE, THEN SP.AE_pair]
```
```  1248     by (auto intro!: TP.AE_pair_measure STP.AE_pair_measure simp: comp_def)
```
```  1249   moreover note ae9 = Pxyz[THEN distributed_real_AE]
```
```  1250   ultimately have ae: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P.
```
```  1251     Pxyz x * log b (Pxyz x / (Px (fst x) * Pyz (snd x))) -
```
```  1252     Pxyz x * log b (Pxz (fst x, snd (snd x)) / (Px (fst x) * Pz (snd (snd x)))) =
```
```  1253     Pxyz x * log b (Pxyz x * Pz (snd (snd x)) / (Pxz (fst x, snd (snd x)) * Pyz (snd x))) "
```
```  1254   proof eventually_elim
```
```  1255     case (goal1 x)
```
```  1256     show ?case
```
```  1257     proof cases
```
```  1258       assume "Pxyz x \<noteq> 0"
```
```  1259       with goal1 have "0 < Px (fst x)" "0 < Pz (snd (snd x))" "0 < Pxz (fst x, snd (snd x))" "0 < Pyz (snd x)" "0 < Pxyz x"
```
```  1260         by auto
```
```  1261       then show ?thesis
```
```  1262         using b_gt_1 by (simp add: log_simps less_imp_le field_simps)
```
```  1263     qed simp
```
```  1264   qed
```
```  1265
```
```  1266   have "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)
```
```  1267     (\<lambda>x. Pxyz x * log b (Pxyz x) - Pxyz x * log b (Px (fst x)) - Pxyz x * log b (Pyz (snd x)))"
```
```  1268     using finite_entropy_integrable[OF Fxyz]
```
```  1269     using finite_entropy_integrable_transform[OF Fx Pxyz, of fst]
```
```  1270     using finite_entropy_integrable_transform[OF Fyz Pxyz, of snd]
```
```  1271     by simp
```
```  1272   moreover have "(\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z)))) \<in> borel_measurable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)"
```
```  1273     using Pxyz Px Pyz by simp
```
```  1274   ultimately have I1: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z))))"
```
```  1275     apply (rule integrable_cong_AE_imp)
```
```  1276     using ae1 ae4 ae5 ae6 ae9
```
```  1277     by eventually_elim
```
```  1278        (auto simp: log_divide_eq log_mult_eq field_simps zero_less_mult_iff)
```
```  1279
```
```  1280   have "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)
```
```  1281     (\<lambda>x. Pxyz x * log b (Pxz (fst x, snd (snd x))) - Pxyz x * log b (Px (fst x)) - Pxyz x * log b (Pz (snd (snd x))))"
```
```  1282     using finite_entropy_integrable_transform[OF Fxz Pxyz, of "\<lambda>x. (fst x, snd (snd x))"]
```
```  1283     using finite_entropy_integrable_transform[OF Fx Pxyz, of fst]
```
```  1284     using finite_entropy_integrable_transform[OF Fz Pxyz, of "snd \<circ> snd"]
```
```  1285     by simp
```
```  1286   moreover have "(\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z))) \<in> borel_measurable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)"
```
```  1287     using Pxyz Px Pz
```
```  1288     by auto
```
```  1289   ultimately have I2: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z)))"
```
```  1290     apply (rule integrable_cong_AE_imp)
```
```  1291     using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 ae9
```
```  1292     by eventually_elim
```
```  1293        (auto simp: log_divide_eq log_mult_eq field_simps zero_less_mult_iff)
```
```  1294
```
```  1295   from ae I1 I2 show ?eq
```
```  1296     unfolding conditional_mutual_information_def
```
```  1297     apply (subst mi_eq)
```
```  1298     apply (subst mutual_information_distr[OF S TP Px Pyz Pxyz])
```
```  1299     apply (subst integral_diff[symmetric])
```
```  1300     apply (auto intro!: integral_cong_AE simp: split_beta' simp del: integral_diff)
```
```  1301     done
```
```  1302
```
```  1303   let ?P = "density (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) Pxyz"
```
```  1304   interpret P: prob_space ?P
```
```  1305     unfolding distributed_distr_eq_density[OF Pxyz, symmetric] by (rule prob_space_distr) simp
```
```  1306
```
```  1307   let ?Q = "density (T \<Otimes>\<^sub>M P) Pyz"
```
```  1308   interpret Q: prob_space ?Q
```
```  1309     unfolding distributed_distr_eq_density[OF Pyz, symmetric] by (rule prob_space_distr) simp
```
```  1310
```
```  1311   let ?f = "\<lambda>(x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) / Pxyz (x, y, z)"
```
```  1312
```
```  1313   from subdensity_real[of snd, OF _ Pyz Pz]
```
```  1314   have aeX1: "AE x in T \<Otimes>\<^sub>M P. Pz (snd x) = 0 \<longrightarrow> Pyz x = 0" by (auto simp: comp_def)
```
```  1315   have aeX2: "AE x in T \<Otimes>\<^sub>M P. 0 \<le> Pz (snd x)"
```
```  1316     using Pz by (intro TP.AE_pair_measure) (auto dest: distributed_real_AE)
```
```  1317
```
```  1318   have aeX3: "AE y in T \<Otimes>\<^sub>M P. (\<integral>\<^sup>+ x. ereal (Pxz (x, snd y)) \<partial>S) = ereal (Pz (snd y))"
```
```  1319     using Pz distributed_marginal_eq_joint2[OF P S Pz Pxz]
```
```  1320     by (intro TP.AE_pair_measure) (auto dest: distributed_real_AE)
```
```  1321   have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> (\<integral>\<^sup>+ (x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))"
```
```  1322     apply (subst nn_integral_density)
```
```  1323     apply (rule distributed_borel_measurable[OF Pxyz])
```
```  1324     apply (rule distributed_AE[OF Pxyz])
```
```  1325     apply simp
```
```  1326     apply (rule nn_integral_mono_AE)
```
```  1327     using ae5 ae6 ae7 ae8
```
```  1328     apply eventually_elim
```
```  1329     apply auto
```
```  1330     done
```
```  1331   also have "\<dots> = (\<integral>\<^sup>+(y, z). \<integral>\<^sup>+ x. ereal (Pxz (x, z)) * ereal (Pyz (y, z) / Pz z) \<partial>S \<partial>T \<Otimes>\<^sub>M P)"
```
```  1332     by (subst STP.nn_integral_snd[symmetric]) (auto simp add: split_beta')
```
```  1333   also have "\<dots> = (\<integral>\<^sup>+x. ereal (Pyz x) * 1 \<partial>T \<Otimes>\<^sub>M P)"
```
```  1334     apply (rule nn_integral_cong_AE)
```
```  1335     using aeX1 aeX2 aeX3 distributed_AE[OF Pyz] AE_space
```
```  1336     apply eventually_elim
```
```  1337   proof (case_tac x, simp del: times_ereal.simps add: space_pair_measure)
```
```  1338     fix a b assume "Pz b = 0 \<longrightarrow> Pyz (a, b) = 0" "0 \<le> Pz b" "a \<in> space T \<and> b \<in> space P"
```
```  1339       "(\<integral>\<^sup>+ x. ereal (Pxz (x, b)) \<partial>S) = ereal (Pz b)" "0 \<le> Pyz (a, b)"
```
```  1340     then show "(\<integral>\<^sup>+ x. ereal (Pxz (x, b)) * ereal (Pyz (a, b) / Pz b) \<partial>S) = ereal (Pyz (a, b))"
```
```  1341       by (subst nn_integral_multc) auto
```
```  1342   qed
```
```  1343   also have "\<dots> = 1"
```
```  1344     using Q.emeasure_space_1 distributed_AE[OF Pyz] distributed_distr_eq_density[OF Pyz]
```
```  1345     by (subst nn_integral_density[symmetric]) auto
```
```  1346   finally have le1: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> 1" .
```
```  1347   also have "\<dots> < \<infinity>" by simp
```
```  1348   finally have fin: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>" by simp
```
```  1349
```
```  1350   have pos: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> 0"
```
```  1351     apply (subst nn_integral_density)
```
```  1352     apply simp
```
```  1353     apply (rule distributed_AE[OF Pxyz])
```
```  1354     apply simp
```
```  1355     apply (simp add: split_beta')
```
```  1356   proof
```
```  1357     let ?g = "\<lambda>x. ereal (if Pxyz x = 0 then 0 else Pxz (fst x, snd (snd x)) * Pyz (snd x) / Pz (snd (snd x)))"
```
```  1358     assume "(\<integral>\<^sup>+ x. ?g x \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)) = 0"
```
```  1359     then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. ?g x \<le> 0"
```
```  1360       by (intro nn_integral_0_iff_AE[THEN iffD1]) (auto intro!: borel_measurable_ereal measurable_If)
```
```  1361     then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxyz x = 0"
```
```  1362       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
```
```  1363       by eventually_elim (auto split: split_if_asm simp: mult_le_0_iff divide_le_0_iff)
```
```  1364     then have "(\<integral>\<^sup>+ x. ereal (Pxyz x) \<partial>S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) = 0"
```
```  1365       by (subst nn_integral_cong_AE[of _ "\<lambda>x. 0"]) auto
```
```  1366     with P.emeasure_space_1 show False
```
```  1367       by (subst (asm) emeasure_density) (auto cong: nn_integral_cong)
```
```  1368   qed
```
```  1369
```
```  1370   have neg: "(\<integral>\<^sup>+ x. - ?f x \<partial>?P) = 0"
```
```  1371     apply (rule nn_integral_0_iff_AE[THEN iffD2])
```
```  1372     apply (auto simp: split_beta') []
```
```  1373     apply (subst AE_density)
```
```  1374     apply (auto simp: split_beta') []
```
```  1375     using ae5 ae6 ae7 ae8
```
```  1376     apply eventually_elim
```
```  1377     apply auto
```
```  1378     done
```
```  1379
```
```  1380   have I3: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
```
```  1381     apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ integrable_diff[OF I1 I2]])
```
```  1382     using ae
```
```  1383     apply (auto simp: split_beta')
```
```  1384     done
```
```  1385
```
```  1386   have "- log b 1 \<le> - log b (integral\<^sup>L ?P ?f)"
```
```  1387   proof (intro le_imp_neg_le log_le[OF b_gt_1])
```
```  1388     have If: "integrable ?P ?f"
```
```  1389       unfolding real_integrable_def
```
```  1390     proof (intro conjI)
```
```  1391       from neg show "(\<integral>\<^sup>+ x. - ?f x \<partial>?P) \<noteq> \<infinity>"
```
```  1392         by simp
```
```  1393       from fin show "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>"
```
```  1394         by simp
```
```  1395     qed simp
```
```  1396     then have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) = (\<integral>x. ?f x \<partial>?P)"
```
```  1397       apply (rule nn_integral_eq_integral)
```
```  1398       apply (subst AE_density)
```
```  1399       apply simp
```
```  1400       using ae5 ae6 ae7 ae8
```
```  1401       apply eventually_elim
```
```  1402       apply auto
```
```  1403       done
```
```  1404     with nn_integral_nonneg[of ?P ?f] pos le1
```
```  1405     show "0 < (\<integral>x. ?f x \<partial>?P)" "(\<integral>x. ?f x \<partial>?P) \<le> 1"
```
```  1406       by (simp_all add: one_ereal_def)
```
```  1407   qed
```
```  1408   also have "- log b (integral\<^sup>L ?P ?f) \<le> (\<integral> x. - log b (?f x) \<partial>?P)"
```
```  1409   proof (rule P.jensens_inequality[where a=0 and b=1 and I="{0<..}"])
```
```  1410     show "AE x in ?P. ?f x \<in> {0<..}"
```
```  1411       unfolding AE_density[OF distributed_borel_measurable[OF Pxyz]]
```
```  1412       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
```
```  1413       by eventually_elim (auto)
```
```  1414     show "integrable ?P ?f"
```
```  1415       unfolding real_integrable_def
```
```  1416       using fin neg by (auto simp: split_beta')
```
```  1417     show "integrable ?P (\<lambda>x. - log b (?f x))"
```
```  1418       apply (subst integrable_real_density)
```
```  1419       apply simp
```
```  1420       apply (auto intro!: distributed_real_AE[OF Pxyz]) []
```
```  1421       apply simp
```
```  1422       apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ I3])
```
```  1423       apply simp
```
```  1424       apply simp
```
```  1425       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
```
```  1426       apply eventually_elim
```
```  1427       apply (auto simp: log_divide_eq log_mult_eq zero_le_mult_iff zero_less_mult_iff zero_less_divide_iff field_simps)
```
```  1428       done
```
```  1429   qed (auto simp: b_gt_1 minus_log_convex)
```
```  1430   also have "\<dots> = conditional_mutual_information b S T P X Y Z"
```
```  1431     unfolding `?eq`
```
```  1432     apply (subst integral_real_density)
```
```  1433     apply simp
```
```  1434     apply (auto intro!: distributed_real_AE[OF Pxyz]) []
```
```  1435     apply simp
```
```  1436     apply (intro integral_cong_AE)
```
```  1437     using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
```
```  1438     apply (auto simp: log_divide_eq zero_less_mult_iff zero_less_divide_iff field_simps)
```
```  1439     done
```
```  1440   finally show ?nonneg
```
```  1441     by simp
```
```  1442 qed
```
```  1443
```
```  1444 lemma (in information_space) conditional_mutual_information_eq:
```
```  1445   assumes Pz: "simple_distributed M Z Pz"
```
```  1446   assumes Pyz: "simple_distributed M (\<lambda>x. (Y x, Z x)) Pyz"
```
```  1447   assumes Pxz: "simple_distributed M (\<lambda>x. (X x, Z x)) Pxz"
```
```  1448   assumes Pxyz: "simple_distributed M (\<lambda>x. (X x, Y x, Z x)) Pxyz"
```
```  1449   shows "\<I>(X ; Y | Z) =
```
```  1450    (\<Sum>(x, y, z)\<in>(\<lambda>x. (X x, Y x, Z x))`space M. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
```
```  1451 proof (subst conditional_mutual_information_generic_eq[OF _ _ _ _
```
```  1452     simple_distributed[OF Pz] simple_distributed_joint[OF Pyz] simple_distributed_joint[OF Pxz]
```
```  1453     simple_distributed_joint2[OF Pxyz]])
```
```  1454   note simple_distributed_joint2_finite[OF Pxyz, simp]
```
```  1455   show "sigma_finite_measure (count_space (X ` space M))"
```
```  1456     by (simp add: sigma_finite_measure_count_space_finite)
```
```  1457   show "sigma_finite_measure (count_space (Y ` space M))"
```
```  1458     by (simp add: sigma_finite_measure_count_space_finite)
```
```  1459   show "sigma_finite_measure (count_space (Z ` space M))"
```
```  1460     by (simp add: sigma_finite_measure_count_space_finite)
```
```  1461   have "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) \<Otimes>\<^sub>M count_space (Z ` space M) =
```
```  1462       count_space (X`space M \<times> Y`space M \<times> Z`space M)"
```
```  1463     (is "?P = ?C")
```
```  1464     by (simp add: pair_measure_count_space)
```
```  1465
```
```  1466   let ?Px = "\<lambda>x. measure M (X -` {x} \<inter> space M)"
```
```  1467   have "(\<lambda>x. (X x, Z x)) \<in> measurable M (count_space (X ` space M) \<Otimes>\<^sub>M count_space (Z ` space M))"
```
```  1468     using simple_distributed_joint[OF Pxz] by (rule distributed_measurable)
```
```  1469   from measurable_comp[OF this measurable_fst]
```
```  1470   have "random_variable (count_space (X ` space M)) X"
```
```  1471     by (simp add: comp_def)
```
```  1472   then have "simple_function M X"
```
```  1473     unfolding simple_function_def by (auto simp: measurable_count_space_eq2)
```
```  1474   then have "simple_distributed M X ?Px"
```
```  1475     by (rule simple_distributedI) auto
```
```  1476   then show "distributed M (count_space (X ` space M)) X ?Px"
```
```  1477     by (rule simple_distributed)
```
```  1478
```
```  1479   let ?f = "(\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x, Z x)) ` space M then Pxyz x else 0)"
```
```  1480   let ?g = "(\<lambda>x. if x \<in> (\<lambda>x. (Y x, Z x)) ` space M then Pyz x else 0)"
```
```  1481   let ?h = "(\<lambda>x. if x \<in> (\<lambda>x. (X x, Z x)) ` space M then Pxz x else 0)"
```
```  1482   show
```
```  1483       "integrable ?P (\<lambda>(x, y, z). ?f (x, y, z) * log b (?f (x, y, z) / (?Px x * ?g (y, z))))"
```
```  1484       "integrable ?P (\<lambda>(x, y, z). ?f (x, y, z) * log b (?h (x, z) / (?Px x * Pz z)))"
```
```  1485     by (auto intro!: integrable_count_space simp: pair_measure_count_space)
```
```  1486   let ?i = "\<lambda>x y z. ?f (x, y, z) * log b (?f (x, y, z) / (?h (x, z) * (?g (y, z) / Pz z)))"
```
```  1487   let ?j = "\<lambda>x y z. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z)))"
```
```  1488   have "(\<lambda>(x, y, z). ?i x y z) = (\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x, Z x)) ` space M then ?j (fst x) (fst (snd x)) (snd (snd x)) else 0)"
```
```  1489     by (auto intro!: ext)
```
```  1490   then show "(\<integral> (x, y, z). ?i x y z \<partial>?P) = (\<Sum>(x, y, z)\<in>(\<lambda>x. (X x, Y x, Z x)) ` space M. ?j x y z)"
```
```  1491     by (auto intro!: setsum.cong simp add: `?P = ?C` lebesgue_integral_count_space_finite simple_distributed_finite setsum.If_cases split_beta')
```
```  1492 qed
```
```  1493
```
```  1494 lemma (in information_space) conditional_mutual_information_nonneg:
```
```  1495   assumes X: "simple_function M X" and Y: "simple_function M Y" and Z: "simple_function M Z"
```
```  1496   shows "0 \<le> \<I>(X ; Y | Z)"
```
```  1497 proof -
```
```  1498   have [simp]: "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) \<Otimes>\<^sub>M count_space (Z ` space M) =
```
```  1499       count_space (X`space M \<times> Y`space M \<times> Z`space M)"
```
```  1500     by (simp add: pair_measure_count_space X Y Z simple_functionD)
```
```  1501   note sf = sigma_finite_measure_count_space_finite[OF simple_functionD(1)]
```
```  1502   note sd = simple_distributedI[OF _ refl]
```
```  1503   note sp = simple_function_Pair
```
```  1504   show ?thesis
```
```  1505    apply (rule conditional_mutual_information_generic_nonneg[OF sf[OF X] sf[OF Y] sf[OF Z]])
```
```  1506    apply (rule simple_distributed[OF sd[OF X]])
```
```  1507    apply (rule simple_distributed[OF sd[OF Z]])
```
```  1508    apply (rule simple_distributed_joint[OF sd[OF sp[OF Y Z]]])
```
```  1509    apply (rule simple_distributed_joint[OF sd[OF sp[OF X Z]]])
```
```  1510    apply (rule simple_distributed_joint2[OF sd[OF sp[OF X sp[OF Y Z]]]])
```
```  1511    apply (auto intro!: integrable_count_space simp: X Y Z simple_functionD)
```
```  1512    done
```
```  1513 qed
```
```  1514
```
```  1515 subsection {* Conditional Entropy *}
```
```  1516
```
```  1517 definition (in prob_space)
```
```  1518   "conditional_entropy b S T X Y = - (\<integral>(x, y). log b (real (RN_deriv (S \<Otimes>\<^sub>M T) (distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))) (x, y)) /
```
```  1519     real (RN_deriv T (distr M T Y) y)) \<partial>distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)))"
```
```  1520
```
```  1521 abbreviation (in information_space)
```
```  1522   conditional_entropy_Pow ("\<H>'(_ | _')") where
```
```  1523   "\<H>(X | Y) \<equiv> conditional_entropy b (count_space (X`space M)) (count_space (Y`space M)) X Y"
```
```  1524
```
```  1525 lemma (in information_space) conditional_entropy_generic_eq:
```
```  1526   fixes Pxy :: "_ \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
```
```  1527   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
```
```  1528   assumes Py[measurable]: "distributed M T Y Py"
```
```  1529   assumes Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
```
```  1530   shows "conditional_entropy b S T X Y = - (\<integral>(x, y). Pxy (x, y) * log b (Pxy (x, y) / Py y) \<partial>(S \<Otimes>\<^sub>M T))"
```
```  1531 proof -
```
```  1532   interpret S: sigma_finite_measure S by fact
```
```  1533   interpret T: sigma_finite_measure T by fact
```
```  1534   interpret ST: pair_sigma_finite S T ..
```
```  1535
```
```  1536   have "AE x in density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Pxy x)). Pxy x = real (RN_deriv (S \<Otimes>\<^sub>M T) (distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))) x)"
```
```  1537     unfolding AE_density[OF distributed_borel_measurable, OF Pxy]
```
```  1538     unfolding distributed_distr_eq_density[OF Pxy]
```
```  1539     using distributed_RN_deriv[OF Pxy]
```
```  1540     by auto
```
```  1541   moreover
```
```  1542   have "AE x in density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Pxy x)). Py (snd x) = real (RN_deriv T (distr M T Y) (snd x))"
```
```  1543     unfolding AE_density[OF distributed_borel_measurable, OF Pxy]
```
```  1544     unfolding distributed_distr_eq_density[OF Py]
```
```  1545     apply (rule ST.AE_pair_measure)
```
```  1546     apply auto
```
```  1547     using distributed_RN_deriv[OF Py]
```
```  1548     apply auto
```
```  1549     done
```
```  1550   ultimately
```
```  1551   have "conditional_entropy b S T X Y = - (\<integral>x. Pxy x * log b (Pxy x / Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))"
```
```  1552     unfolding conditional_entropy_def neg_equal_iff_equal
```
```  1553     apply (subst integral_real_density[symmetric])
```
```  1554     apply (auto simp: distributed_real_AE[OF Pxy] distributed_distr_eq_density[OF Pxy]
```
```  1555                 intro!: integral_cong_AE)
```
```  1556     done
```
```  1557   then show ?thesis by (simp add: split_beta')
```
```  1558 qed
```
```  1559
```
```  1560 lemma (in information_space) conditional_entropy_eq_entropy:
```
```  1561   fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
```
```  1562   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
```
```  1563   assumes Py[measurable]: "distributed M T Y Py"
```
```  1564   assumes Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
```
```  1565   assumes I1: "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x))"
```
```  1566   assumes I2: "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Py (snd x)))"
```
```  1567   shows "conditional_entropy b S T X Y = entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) - entropy b T Y"
```
```  1568 proof -
```
```  1569   interpret S: sigma_finite_measure S by fact
```
```  1570   interpret T: sigma_finite_measure T by fact
```
```  1571   interpret ST: pair_sigma_finite S T ..
```
```  1572
```
```  1573   have "entropy b T Y = - (\<integral>y. Py y * log b (Py y) \<partial>T)"
```
```  1574     by (rule entropy_distr[OF Py])
```
```  1575   also have "\<dots> = - (\<integral>(x,y). Pxy (x,y) * log b (Py y) \<partial>(S \<Otimes>\<^sub>M T))"
```
```  1576     using b_gt_1 Py[THEN distributed_real_measurable]
```
```  1577     by (subst distributed_transform_integral[OF Pxy Py, where T=snd]) (auto intro!: integral_cong)
```
```  1578   finally have e_eq: "entropy b T Y = - (\<integral>(x,y). Pxy (x,y) * log b (Py y) \<partial>(S \<Otimes>\<^sub>M T))" .
```
```  1579
```
```  1580   have ae2: "AE x in S \<Otimes>\<^sub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
```
```  1581     by (intro subdensity_real[of snd, OF _ Pxy Py]) (auto intro: measurable_Pair)
```
```  1582   moreover have ae4: "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Py (snd x)"
```
```  1583     using Py by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'' dest: distributed_real_AE distributed_real_measurable)
```
```  1584   moreover note ae5 = Pxy[THEN distributed_real_AE]
```
```  1585   ultimately have "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Pxy x \<and> 0 \<le> Py (snd x) \<and>
```
```  1586     (Pxy x = 0 \<or> (Pxy x \<noteq> 0 \<longrightarrow> 0 < Pxy x \<and> 0 < Py (snd x)))"
```
```  1587     by eventually_elim auto
```
```  1588   then have ae: "AE x in S \<Otimes>\<^sub>M T.
```
```  1589      Pxy x * log b (Pxy x) - Pxy x * log b (Py (snd x)) = Pxy x * log b (Pxy x / Py (snd x))"
```
```  1590     by eventually_elim (auto simp: log_simps field_simps b_gt_1)
```
```  1591   have "conditional_entropy b S T X Y =
```
```  1592     - (\<integral>x. Pxy x * log b (Pxy x) - Pxy x * log b (Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))"
```
```  1593     unfolding conditional_entropy_generic_eq[OF S T Py Pxy] neg_equal_iff_equal
```
```  1594     apply (intro integral_cong_AE)
```
```  1595     using ae
```
```  1596     apply auto
```
```  1597     done
```
```  1598   also have "\<dots> = - (\<integral>x. Pxy x * log b (Pxy x) \<partial>(S \<Otimes>\<^sub>M T)) - - (\<integral>x.  Pxy x * log b (Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))"
```
```  1599     by (simp add: integral_diff[OF I1 I2])
```
```  1600   finally show ?thesis
```
```  1601     unfolding conditional_entropy_generic_eq[OF S T Py Pxy] entropy_distr[OF Pxy] e_eq
```
```  1602     by (simp add: split_beta')
```
```  1603 qed
```
```  1604
```
```  1605 lemma (in information_space) conditional_entropy_eq_entropy_simple:
```
```  1606   assumes X: "simple_function M X" and Y: "simple_function M Y"
```
```  1607   shows "\<H>(X | Y) = entropy b (count_space (X`space M) \<Otimes>\<^sub>M count_space (Y`space M)) (\<lambda>x. (X x, Y x)) - \<H>(Y)"
```
```  1608 proof -
```
```  1609   have "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X`space M \<times> Y`space M)"
```
```  1610     (is "?P = ?C") using X Y by (simp add: simple_functionD pair_measure_count_space)
```
```  1611   show ?thesis
```
```  1612     by (rule conditional_entropy_eq_entropy sigma_finite_measure_count_space_finite
```
```  1613                  simple_functionD  X Y simple_distributed simple_distributedI[OF _ refl]
```
```  1614                  simple_distributed_joint simple_function_Pair integrable_count_space)+
```
```  1615        (auto simp: `?P = ?C` intro!: integrable_count_space simple_functionD  X Y)
```
```  1616 qed
```
```  1617
```
```  1618 lemma (in information_space) conditional_entropy_eq:
```
```  1619   assumes Y: "simple_distributed M Y Py"
```
```  1620   assumes XY: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy"
```
```  1621     shows "\<H>(X | Y) = - (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / Py y))"
```
```  1622 proof (subst conditional_entropy_generic_eq[OF _ _
```
```  1623   simple_distributed[OF Y] simple_distributed_joint[OF XY]])
```
```  1624   have "finite ((\<lambda>x. (X x, Y x))`space M)"
```
```  1625     using XY unfolding simple_distributed_def by auto
```
```  1626   from finite_imageI[OF this, of fst]
```
```  1627   have [simp]: "finite (X`space M)"
```
```  1628     by (simp add: image_comp comp_def)
```
```  1629   note Y[THEN simple_distributed_finite, simp]
```
```  1630   show "sigma_finite_measure (count_space (X ` space M))"
```
```  1631     by (simp add: sigma_finite_measure_count_space_finite)
```
```  1632   show "sigma_finite_measure (count_space (Y ` space M))"
```
```  1633     by (simp add: sigma_finite_measure_count_space_finite)
```
```  1634   let ?f = "(\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x else 0)"
```
```  1635   have "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X`space M \<times> Y`space M)"
```
```  1636     (is "?P = ?C")
```
```  1637     using Y by (simp add: simple_distributed_finite pair_measure_count_space)
```
```  1638   have eq: "(\<lambda>(x, y). ?f (x, y) * log b (?f (x, y) / Py y)) =
```
```  1639     (\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x * log b (Pxy x / Py (snd x)) else 0)"
```
```  1640     by auto
```
```  1641   from Y show "- (\<integral> (x, y). ?f (x, y) * log b (?f (x, y) / Py y) \<partial>?P) =
```
```  1642     - (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / Py y))"
```
```  1643     by (auto intro!: setsum.cong simp add: `?P = ?C` lebesgue_integral_count_space_finite simple_distributed_finite eq setsum.If_cases split_beta')
```
```  1644 qed
```
```  1645
```
```  1646 lemma (in information_space) conditional_mutual_information_eq_conditional_entropy:
```
```  1647   assumes X: "simple_function M X" and Y: "simple_function M Y"
```
```  1648   shows "\<I>(X ; X | Y) = \<H>(X | Y)"
```
```  1649 proof -
```
```  1650   def Py \<equiv> "\<lambda>x. if x \<in> Y`space M then measure M (Y -` {x} \<inter> space M) else 0"
```
```  1651   def Pxy \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x))`space M then measure M ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M) else 0"
```
```  1652   def Pxxy \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (X x, X x, Y x))`space M then measure M ((\<lambda>x. (X x, X x, Y x)) -` {x} \<inter> space M) else 0"
```
```  1653   let ?M = "X`space M \<times> X`space M \<times> Y`space M"
```
```  1654
```
```  1655   note XY = simple_function_Pair[OF X Y]
```
```  1656   note XXY = simple_function_Pair[OF X XY]
```
```  1657   have Py: "simple_distributed M Y Py"
```
```  1658     using Y by (rule simple_distributedI) (auto simp: Py_def)
```
```  1659   have Pxy: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy"
```
```  1660     using XY by (rule simple_distributedI) (auto simp: Pxy_def)
```
```  1661   have Pxxy: "simple_distributed M (\<lambda>x. (X x, X x, Y x)) Pxxy"
```
```  1662     using XXY by (rule simple_distributedI) (auto simp: Pxxy_def)
```
```  1663   have eq: "(\<lambda>x. (X x, X x, Y x)) ` space M = (\<lambda>(x, y). (x, x, y)) ` (\<lambda>x. (X x, Y x)) ` space M"
```
```  1664     by auto
```
```  1665   have inj: "\<And>A. inj_on (\<lambda>(x, y). (x, x, y)) A"
```
```  1666     by (auto simp: inj_on_def)
```
```  1667   have Pxxy_eq: "\<And>x y. Pxxy (x, x, y) = Pxy (x, y)"
```
```  1668     by (auto simp: Pxxy_def Pxy_def intro!: arg_cong[where f=prob])
```
```  1669   have "AE x in count_space ((\<lambda>x. (X x, Y x))`space M). Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
```
```  1670     by (intro subdensity_real[of snd, OF _ Pxy[THEN simple_distributed] Py[THEN simple_distributed]]) (auto intro: measurable_Pair)
```
```  1671   then show ?thesis
```
```  1672     apply (subst conditional_mutual_information_eq[OF Py Pxy Pxy Pxxy])
```
```  1673     apply (subst conditional_entropy_eq[OF Py Pxy])
```
```  1674     apply (auto intro!: setsum.cong simp: Pxxy_eq setsum_negf[symmetric] eq setsum.reindex[OF inj]
```
```  1675                 log_simps zero_less_mult_iff zero_le_mult_iff field_simps mult_less_0_iff AE_count_space)
```
```  1676     using Py[THEN simple_distributed, THEN distributed_real_AE] Pxy[THEN simple_distributed, THEN distributed_real_AE]
```
```  1677   apply (auto simp add: not_le[symmetric] AE_count_space)
```
```  1678     done
```
```  1679 qed
```
```  1680
```
```  1681 lemma (in information_space) conditional_entropy_nonneg:
```
```  1682   assumes X: "simple_function M X" and Y: "simple_function M Y" shows "0 \<le> \<H>(X | Y)"
```
```  1683   using conditional_mutual_information_eq_conditional_entropy[OF X Y] conditional_mutual_information_nonneg[OF X X Y]
```
```  1684   by simp
```
```  1685
```
```  1686 subsection {* Equalities *}
```
```  1687
```
```  1688 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy_distr:
```
```  1689   fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" and Pxy :: "('b \<times> 'c) \<Rightarrow> real"
```
```  1690   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
```
```  1691   assumes Px[measurable]: "distributed M S X Px" and Py[measurable]: "distributed M T Y Py"
```
```  1692   assumes Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
```
```  1693   assumes Ix: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Px (fst x)))"
```
```  1694   assumes Iy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Py (snd x)))"
```
```  1695   assumes Ixy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x))"
```
```  1696   shows  "mutual_information b S T X Y = entropy b S X + entropy b T Y - entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))"
```
```  1697 proof -
```
```  1698   have X: "entropy b S X = - (\<integral>x. Pxy x * log b (Px (fst x)) \<partial>(S \<Otimes>\<^sub>M T))"
```
```  1699     using b_gt_1 Px[THEN distributed_real_measurable]
```
```  1700     apply (subst entropy_distr[OF Px])
```
```  1701     apply (subst distributed_transform_integral[OF Pxy Px, where T=fst])
```
```  1702     apply (auto intro!: integral_cong)
```
```  1703     done
```
```  1704
```
```  1705   have Y: "entropy b T Y = - (\<integral>x. Pxy x * log b (Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))"
```
```  1706     using b_gt_1 Py[THEN distributed_real_measurable]
```
```  1707     apply (subst entropy_distr[OF Py])
```
```  1708     apply (subst distributed_transform_integral[OF Pxy Py, where T=snd])
```
```  1709     apply (auto intro!: integral_cong)
```
```  1710     done
```
```  1711
```
```  1712   interpret S: sigma_finite_measure S by fact
```
```  1713   interpret T: sigma_finite_measure T by fact
```
```  1714   interpret ST: pair_sigma_finite S T ..
```
```  1715   have ST: "sigma_finite_measure (S \<Otimes>\<^sub>M T)" ..
```
```  1716
```
```  1717   have XY: "entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) = - (\<integral>x. Pxy x * log b (Pxy x) \<partial>(S \<Otimes>\<^sub>M T))"
```
```  1718     by (subst entropy_distr[OF Pxy]) (auto intro!: integral_cong)
```
```  1719
```
```  1720   have "AE x in S \<Otimes>\<^sub>M T. Px (fst x) = 0 \<longrightarrow> Pxy x = 0"
```
```  1721     by (intro subdensity_real[of fst, OF _ Pxy Px]) (auto intro: measurable_Pair)
```
```  1722   moreover have "AE x in S \<Otimes>\<^sub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
```
```  1723     by (intro subdensity_real[of snd, OF _ Pxy Py]) (auto intro: measurable_Pair)
```
```  1724   moreover have "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Px (fst x)"
```
```  1725     using Px by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_fst'' dest: distributed_real_AE distributed_real_measurable)
```
```  1726   moreover have "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Py (snd x)"
```
```  1727     using Py by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'' dest: distributed_real_AE distributed_real_measurable)
```
```  1728   moreover note Pxy[THEN distributed_real_AE]
```
```  1729   ultimately have "AE x in S \<Otimes>\<^sub>M T. Pxy x * log b (Pxy x) - Pxy x * log b (Px (fst x)) - Pxy x * log b (Py (snd x)) =
```
```  1730     Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))"
```
```  1731     (is "AE x in _. ?f x = ?g x")
```
```  1732   proof eventually_elim
```
```  1733     case (goal1 x)
```
```  1734     show ?case
```
```  1735     proof cases
```
```  1736       assume "Pxy x \<noteq> 0"
```
```  1737       with goal1 have "0 < Px (fst x)" "0 < Py (snd x)" "0 < Pxy x"
```
```  1738         by auto
```
```  1739       then show ?thesis
```
```  1740         using b_gt_1 by (simp add: log_simps less_imp_le field_simps)
```
```  1741     qed simp
```
```  1742   qed
```
```  1743
```
```  1744   have "entropy b S X + entropy b T Y - entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) = integral\<^sup>L (S \<Otimes>\<^sub>M T) ?f"
```
```  1745     unfolding X Y XY
```
```  1746     apply (subst integral_diff)
```
```  1747     apply (intro integrable_diff Ixy Ix Iy)+
```
```  1748     apply (subst integral_diff)
```
```  1749     apply (intro Ixy Ix Iy)+
```
```  1750     apply (simp add: field_simps)
```
```  1751     done
```
```  1752   also have "\<dots> = integral\<^sup>L (S \<Otimes>\<^sub>M T) ?g"
```
```  1753     using `AE x in _. ?f x = ?g x` by (intro integral_cong_AE) auto
```
```  1754   also have "\<dots> = mutual_information b S T X Y"
```
```  1755     by (rule mutual_information_distr[OF S T Px Py Pxy, symmetric])
```
```  1756   finally show ?thesis ..
```
```  1757 qed
```
```  1758
```
```  1759 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy':
```
```  1760   fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" and Pxy :: "('b \<times> 'c) \<Rightarrow> real"
```
```  1761   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
```
```  1762   assumes Px: "distributed M S X Px" and Py: "distributed M T Y Py"
```
```  1763   assumes Pxy: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
```
```  1764   assumes Ix: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Px (fst x)))"
```
```  1765   assumes Iy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Py (snd x)))"
```
```  1766   assumes Ixy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x))"
```
```  1767   shows  "mutual_information b S T X Y = entropy b S X - conditional_entropy b S T X Y"
```
```  1768   using
```
```  1769     mutual_information_eq_entropy_conditional_entropy_distr[OF S T Px Py Pxy Ix Iy Ixy]
```
```  1770     conditional_entropy_eq_entropy[OF S T Py Pxy Ixy Iy]
```
```  1771   by simp
```
```  1772
```
```  1773 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy:
```
```  1774   assumes sf_X: "simple_function M X" and sf_Y: "simple_function M Y"
```
```  1775   shows  "\<I>(X ; Y) = \<H>(X) - \<H>(X | Y)"
```
```  1776 proof -
```
```  1777   have X: "simple_distributed M X (\<lambda>x. measure M (X -` {x} \<inter> space M))"
```
```  1778     using sf_X by (rule simple_distributedI) auto
```
```  1779   have Y: "simple_distributed M Y (\<lambda>x. measure M (Y -` {x} \<inter> space M))"
```
```  1780     using sf_Y by (rule simple_distributedI) auto
```
```  1781   have sf_XY: "simple_function M (\<lambda>x. (X x, Y x))"
```
```  1782     using sf_X sf_Y by (rule simple_function_Pair)
```
```  1783   then have XY: "simple_distributed M (\<lambda>x. (X x, Y x)) (\<lambda>x. measure M ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M))"
```
```  1784     by (rule simple_distributedI) auto
```
```  1785   from simple_distributed_joint_finite[OF this, simp]
```
```  1786   have eq: "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X ` space M \<times> Y ` space M)"
```
```  1787     by (simp add: pair_measure_count_space)
```
```  1788
```
```  1789   have "\<I>(X ; Y) = \<H>(X) + \<H>(Y) - entropy b (count_space (X`space M) \<Otimes>\<^sub>M count_space (Y`space M)) (\<lambda>x. (X x, Y x))"
```
```  1790     using sigma_finite_measure_count_space_finite sigma_finite_measure_count_space_finite simple_distributed[OF X] simple_distributed[OF Y] simple_distributed_joint[OF XY]
```
```  1791     by (rule mutual_information_eq_entropy_conditional_entropy_distr) (auto simp: eq integrable_count_space)
```
```  1792   then show ?thesis
```
```  1793     unfolding conditional_entropy_eq_entropy_simple[OF sf_X sf_Y] by simp
```
```  1794 qed
```
```  1795
```
```  1796 lemma (in information_space) mutual_information_nonneg_simple:
```
```  1797   assumes sf_X: "simple_function M X" and sf_Y: "simple_function M Y"
```
```  1798   shows  "0 \<le> \<I>(X ; Y)"
```
```  1799 proof -
```
```  1800   have X: "simple_distributed M X (\<lambda>x. measure M (X -` {x} \<inter> space M))"
```
```  1801     using sf_X by (rule simple_distributedI) auto
```
```  1802   have Y: "simple_distributed M Y (\<lambda>x. measure M (Y -` {x} \<inter> space M))"
```
```  1803     using sf_Y by (rule simple_distributedI) auto
```
```  1804
```
```  1805   have sf_XY: "simple_function M (\<lambda>x. (X x, Y x))"
```
```  1806     using sf_X sf_Y by (rule simple_function_Pair)
```
```  1807   then have XY: "simple_distributed M (\<lambda>x. (X x, Y x)) (\<lambda>x. measure M ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M))"
```
```  1808     by (rule simple_distributedI) auto
```
```  1809
```
```  1810   from simple_distributed_joint_finite[OF this, simp]
```
```  1811   have eq: "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X ` space M \<times> Y ` space M)"
```
```  1812     by (simp add: pair_measure_count_space)
```
```  1813
```
```  1814   show ?thesis
```
```  1815     by (rule mutual_information_nonneg[OF _ _ simple_distributed[OF X] simple_distributed[OF Y] simple_distributed_joint[OF XY]])
```
```  1816        (simp_all add: eq integrable_count_space sigma_finite_measure_count_space_finite)
```
```  1817 qed
```
```  1818
```
```  1819 lemma (in information_space) conditional_entropy_less_eq_entropy:
```
```  1820   assumes X: "simple_function M X" and Z: "simple_function M Z"
```
```  1821   shows "\<H>(X | Z) \<le> \<H>(X)"
```
```  1822 proof -
```
```  1823   have "0 \<le> \<I>(X ; Z)" using X Z by (rule mutual_information_nonneg_simple)
```
```  1824   also have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy[OF assms] .
```
```  1825   finally show ?thesis by auto
```
```  1826 qed
```
```  1827
```
```  1828 lemma (in information_space)
```
```  1829   fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" and Pxy :: "('b \<times> 'c) \<Rightarrow> real"
```
```  1830   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
```
```  1831   assumes Px: "finite_entropy S X Px" and Py: "finite_entropy T Y Py"
```
```  1832   assumes Pxy: "finite_entropy (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
```
```  1833   shows "conditional_entropy b S T X Y \<le> entropy b S X"
```
```  1834 proof -
```
```  1835
```
```  1836   have "0 \<le> mutual_information b S T X Y"
```
```  1837     by (rule mutual_information_nonneg') fact+
```
```  1838   also have "\<dots> = entropy b S X - conditional_entropy b S T X Y"
```
```  1839     apply (rule mutual_information_eq_entropy_conditional_entropy')
```
```  1840     using assms
```
```  1841     by (auto intro!: finite_entropy_integrable finite_entropy_distributed
```
```  1842       finite_entropy_integrable_transform[OF Px]
```
```  1843       finite_entropy_integrable_transform[OF Py])
```
```  1844   finally show ?thesis by auto
```
```  1845 qed
```
```  1846
```
```  1847 lemma (in information_space) entropy_chain_rule:
```
```  1848   assumes X: "simple_function M X" and Y: "simple_function M Y"
```
```  1849   shows  "\<H>(\<lambda>x. (X x, Y x)) = \<H>(X) + \<H>(Y|X)"
```
```  1850 proof -
```
```  1851   note XY = simple_distributedI[OF simple_function_Pair[OF X Y] refl]
```
```  1852   note YX = simple_distributedI[OF simple_function_Pair[OF Y X] refl]
```
```  1853   note simple_distributed_joint_finite[OF this, simp]
```
```  1854   let ?f = "\<lambda>x. prob ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M)"
```
```  1855   let ?g = "\<lambda>x. prob ((\<lambda>x. (Y x, X x)) -` {x} \<inter> space M)"
```
```  1856   let ?h = "\<lambda>x. if x \<in> (\<lambda>x. (Y x, X x)) ` space M then prob ((\<lambda>x. (Y x, X x)) -` {x} \<inter> space M) else 0"
```
```  1857   have "\<H>(\<lambda>x. (X x, Y x)) = - (\<Sum>x\<in>(\<lambda>x. (X x, Y x)) ` space M. ?f x * log b (?f x))"
```
```  1858     using XY by (rule entropy_simple_distributed)
```
```  1859   also have "\<dots> = - (\<Sum>x\<in>(\<lambda>(x, y). (y, x)) ` (\<lambda>x. (X x, Y x)) ` space M. ?g x * log b (?g x))"
```
```  1860     by (subst (2) setsum.reindex) (auto simp: inj_on_def intro!: setsum.cong arg_cong[where f="\<lambda>A. prob A * log b (prob A)"])
```
```  1861   also have "\<dots> = - (\<Sum>x\<in>(\<lambda>x. (Y x, X x)) ` space M. ?h x * log b (?h x))"
```
```  1862     by (auto intro!: setsum.cong)
```
```  1863   also have "\<dots> = entropy b (count_space (Y ` space M) \<Otimes>\<^sub>M count_space (X ` space M)) (\<lambda>x. (Y x, X x))"
```
```  1864     by (subst entropy_distr[OF simple_distributed_joint[OF YX]])
```
```  1865        (auto simp: pair_measure_count_space sigma_finite_measure_count_space_finite lebesgue_integral_count_space_finite
```
```  1866              cong del: setsum.cong  intro!: setsum.mono_neutral_left)
```
```  1867   finally have "\<H>(\<lambda>x. (X x, Y x)) = entropy b (count_space (Y ` space M) \<Otimes>\<^sub>M count_space (X ` space M)) (\<lambda>x. (Y x, X x))" .
```
```  1868   then show ?thesis
```
```  1869     unfolding conditional_entropy_eq_entropy_simple[OF Y X] by simp
```
```  1870 qed
```
```  1871
```
```  1872 lemma (in information_space) entropy_partition:
```
```  1873   assumes X: "simple_function M X"
```
```  1874   shows "\<H>(X) = \<H>(f \<circ> X) + \<H>(X|f \<circ> X)"
```
```  1875 proof -
```
```  1876   note fX = simple_function_compose[OF X, of f]
```
```  1877   have eq: "(\<lambda>x. ((f \<circ> X) x, X x)) ` space M = (\<lambda>x. (f x, x)) ` X ` space M" by auto
```
```  1878   have inj: "\<And>A. inj_on (\<lambda>x. (f x, x)) A"
```
```  1879     by (auto simp: inj_on_def)
```
```  1880   show ?thesis
```
```  1881     apply (subst entropy_chain_rule[symmetric, OF fX X])
```
```  1882     apply (subst entropy_simple_distributed[OF simple_distributedI[OF simple_function_Pair[OF fX X] refl]])
```
```  1883     apply (subst entropy_simple_distributed[OF simple_distributedI[OF X refl]])
```
```  1884     unfolding eq
```
```  1885     apply (subst setsum.reindex[OF inj])
```
```  1886     apply (auto intro!: setsum.cong arg_cong[where f="\<lambda>A. prob A * log b (prob A)"])
```
```  1887     done
```
```  1888 qed
```
```  1889
```
```  1890 corollary (in information_space) entropy_data_processing:
```
```  1891   assumes X: "simple_function M X" shows "\<H>(f \<circ> X) \<le> \<H>(X)"
```
```  1892 proof -
```
```  1893   note fX = simple_function_compose[OF X, of f]
```
```  1894   from X have "\<H>(X) = \<H>(f\<circ>X) + \<H>(X|f\<circ>X)" by (rule entropy_partition)
```
```  1895   then show "\<H>(f \<circ> X) \<le> \<H>(X)"
```
```  1896     by (auto intro: conditional_entropy_nonneg[OF X fX])
```
```  1897 qed
```
```  1898
```
```  1899 corollary (in information_space) entropy_of_inj:
```
```  1900   assumes X: "simple_function M X" and inj: "inj_on f (X`space M)"
```
```  1901   shows "\<H>(f \<circ> X) = \<H>(X)"
```
```  1902 proof (rule antisym)
```
```  1903   show "\<H>(f \<circ> X) \<le> \<H>(X)" using entropy_data_processing[OF X] .
```
```  1904 next
```
```  1905   have sf: "simple_function M (f \<circ> X)"
```
```  1906     using X by auto
```
```  1907   have "\<H>(X) = \<H>(the_inv_into (X`space M) f \<circ> (f \<circ> X))"
```
```  1908     unfolding o_assoc
```
```  1909     apply (subst entropy_simple_distributed[OF simple_distributedI[OF X refl]])
```
```  1910     apply (subst entropy_simple_distributed[OF simple_distributedI[OF simple_function_compose[OF X]], where f="\<lambda>x. prob (X -` {x} \<inter> space M)"])
```
```  1911     apply (auto intro!: setsum.cong arg_cong[where f=prob] image_eqI simp: the_inv_into_f_f[OF inj] comp_def)
```
```  1912     done
```
```  1913   also have "... \<le> \<H>(f \<circ> X)"
```
```  1914     using entropy_data_processing[OF sf] .
```
```  1915   finally show "\<H>(X) \<le> \<H>(f \<circ> X)" .
```
```  1916 qed
```
```  1917
```
```  1918 end
```