src/HOL/Probability/Information.thy
author paulson <lp15@cam.ac.uk>
Sat Apr 11 11:56:40 2015 +0100 (2015-04-11)
changeset 60017 b785d6d06430
parent 58876 1888e3cb8048
child 60580 7e741e22d7fc
permissions -rw-r--r--
Overloading of ln and powr, but "approximation" no longer works for powr. Code generation also fails due to type ambiguity in scala.
     1 (*  Title:      HOL/Probability/Information.thy
     2     Author:     Johannes Hölzl, TU München
     3     Author:     Armin Heller, TU München
     4 *)
     5 
     6 section {*Information theory*}
     7 
     8 theory Information
     9 imports
    10   Independent_Family
    11   "~~/src/HOL/Library/Convex"
    12 begin
    13 
    14 lemma log_le: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log a x \<le> log a y"
    15   by (subst log_le_cancel_iff) auto
    16 
    17 lemma log_less: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x < y \<Longrightarrow> log a x < log a y"
    18   by (subst log_less_cancel_iff) auto
    19 
    20 lemma setsum_cartesian_product':
    21   "(\<Sum>x\<in>A \<times> B. f x) = (\<Sum>x\<in>A. setsum (\<lambda>y. f (x, y)) B)"
    22   unfolding setsum.cartesian_product by simp
    23 
    24 lemma split_pairs:
    25   "((A, B) = X) \<longleftrightarrow> (fst X = A \<and> snd X = B)" and
    26   "(X = (A, B)) \<longleftrightarrow> (fst X = A \<and> snd X = B)" by auto
    27 
    28 subsection "Information theory"
    29 
    30 locale information_space = prob_space +
    31   fixes b :: real assumes b_gt_1: "1 < b"
    32 
    33 context information_space
    34 begin
    35 
    36 text {* Introduce some simplification rules for logarithm of base @{term b}. *}
    37 
    38 lemma log_neg_const:
    39   assumes "x \<le> 0"
    40   shows "log b x = log b 0"
    41 proof -
    42   { fix u :: real
    43     have "x \<le> 0" by fact
    44     also have "0 < exp u"
    45       using exp_gt_zero .
    46     finally have "exp u \<noteq> x"
    47       by auto }
    48   then show "log b x = log b 0"
    49     by (simp add: log_def ln_real_def)
    50 qed
    51 
    52 lemma log_mult_eq:
    53   "log b (A * B) = (if 0 < A * B then log b \<bar>A\<bar> + log b \<bar>B\<bar> else log b 0)"
    54   using log_mult[of b "\<bar>A\<bar>" "\<bar>B\<bar>"] b_gt_1 log_neg_const[of "A * B"]
    55   by (auto simp: zero_less_mult_iff mult_le_0_iff)
    56 
    57 lemma log_inverse_eq:
    58   "log b (inverse B) = (if 0 < B then - log b B else log b 0)"
    59   using log_inverse[of b B] log_neg_const[of "inverse B"] b_gt_1 by simp
    60 
    61 lemma log_divide_eq:
    62   "log b (A / B) = (if 0 < A * B then log b \<bar>A\<bar> - log b \<bar>B\<bar> else log b 0)"
    63   unfolding divide_inverse log_mult_eq log_inverse_eq abs_inverse
    64   by (auto simp: zero_less_mult_iff mult_le_0_iff)
    65 
    66 lemmas log_simps = log_mult_eq log_inverse_eq log_divide_eq
    67 
    68 end
    69 
    70 subsection "Kullback$-$Leibler divergence"
    71 
    72 text {* The Kullback$-$Leibler divergence is also known as relative entropy or
    73 Kullback$-$Leibler distance. *}
    74 
    75 definition
    76   "entropy_density b M N = log b \<circ> real \<circ> RN_deriv M N"
    77 
    78 definition
    79   "KL_divergence b M N = integral\<^sup>L N (entropy_density b M N)"
    80 
    81 lemma measurable_entropy_density[measurable]: "entropy_density b M N \<in> borel_measurable M"
    82   unfolding entropy_density_def by auto
    83 
    84 lemma (in sigma_finite_measure) KL_density:
    85   fixes f :: "'a \<Rightarrow> real"
    86   assumes "1 < b"
    87   assumes f[measurable]: "f \<in> borel_measurable M" and nn: "AE x in M. 0 \<le> f x"
    88   shows "KL_divergence b M (density M f) = (\<integral>x. f x * log b (f x) \<partial>M)"
    89   unfolding KL_divergence_def
    90 proof (subst integral_real_density)
    91   show [measurable]: "entropy_density b M (density M (\<lambda>x. ereal (f x))) \<in> borel_measurable M"
    92     using f
    93     by (auto simp: comp_def entropy_density_def)
    94   have "density M (RN_deriv M (density M f)) = density M f"
    95     using f nn by (intro density_RN_deriv_density) auto
    96   then have eq: "AE x in M. RN_deriv M (density M f) x = f x"
    97     using f nn by (intro density_unique) (auto simp: RN_deriv_nonneg)
    98   show "(\<integral>x. f x * entropy_density b M (density M (\<lambda>x. ereal (f x))) x \<partial>M) = (\<integral>x. f x * log b (f x) \<partial>M)"
    99     apply (intro integral_cong_AE)
   100     apply measurable
   101     using eq
   102     apply eventually_elim
   103     apply (auto simp: entropy_density_def)
   104     done
   105 qed fact+
   106 
   107 lemma (in sigma_finite_measure) KL_density_density:
   108   fixes f g :: "'a \<Rightarrow> real"
   109   assumes "1 < b"
   110   assumes f: "f \<in> borel_measurable M" "AE x in M. 0 \<le> f x"
   111   assumes g: "g \<in> borel_measurable M" "AE x in M. 0 \<le> g x"
   112   assumes ac: "AE x in M. f x = 0 \<longrightarrow> g x = 0"
   113   shows "KL_divergence b (density M f) (density M g) = (\<integral>x. g x * log b (g x / f x) \<partial>M)"
   114 proof -
   115   interpret Mf: sigma_finite_measure "density M f"
   116     using f by (subst sigma_finite_iff_density_finite) auto
   117   have "KL_divergence b (density M f) (density M g) =
   118     KL_divergence b (density M f) (density (density M f) (\<lambda>x. g x / f x))"
   119     using f g ac by (subst density_density_divide) simp_all
   120   also have "\<dots> = (\<integral>x. (g x / f x) * log b (g x / f x) \<partial>density M f)"
   121     using f g `1 < b` by (intro Mf.KL_density) (auto simp: AE_density)
   122   also have "\<dots> = (\<integral>x. g x * log b (g x / f x) \<partial>M)"
   123     using ac f g `1 < b` by (subst integral_density) (auto intro!: integral_cong_AE)
   124   finally show ?thesis .
   125 qed
   126 
   127 lemma (in information_space) KL_gt_0:
   128   fixes D :: "'a \<Rightarrow> real"
   129   assumes "prob_space (density M D)"
   130   assumes D: "D \<in> borel_measurable M" "AE x in M. 0 \<le> D x"
   131   assumes int: "integrable M (\<lambda>x. D x * log b (D x))"
   132   assumes A: "density M D \<noteq> M"
   133   shows "0 < KL_divergence b M (density M D)"
   134 proof -
   135   interpret N: prob_space "density M D" by fact
   136 
   137   obtain A where "A \<in> sets M" "emeasure (density M D) A \<noteq> emeasure M A"
   138     using measure_eqI[of "density M D" M] `density M D \<noteq> M` by auto
   139 
   140   let ?D_set = "{x\<in>space M. D x \<noteq> 0}"
   141   have [simp, intro]: "?D_set \<in> sets M"
   142     using D by auto
   143 
   144   have D_neg: "(\<integral>\<^sup>+ x. ereal (- D x) \<partial>M) = 0"
   145     using D by (subst nn_integral_0_iff_AE) auto
   146 
   147   have "(\<integral>\<^sup>+ x. ereal (D x) \<partial>M) = emeasure (density M D) (space M)"
   148     using D by (simp add: emeasure_density cong: nn_integral_cong)
   149   then have D_pos: "(\<integral>\<^sup>+ x. ereal (D x) \<partial>M) = 1"
   150     using N.emeasure_space_1 by simp
   151 
   152   have "integrable M D"
   153     using D D_pos D_neg unfolding real_integrable_def real_lebesgue_integral_def by simp_all
   154   then have "integral\<^sup>L M D = 1"
   155     using D D_pos D_neg by (simp add: real_lebesgue_integral_def)
   156 
   157   have "0 \<le> 1 - measure M ?D_set"
   158     using prob_le_1 by (auto simp: field_simps)
   159   also have "\<dots> = (\<integral> x. D x - indicator ?D_set x \<partial>M)"
   160     using `integrable M D` `integral\<^sup>L M D = 1`
   161     by (simp add: emeasure_eq_measure)
   162   also have "\<dots> < (\<integral> x. D x * (ln b * log b (D x)) \<partial>M)"
   163   proof (rule integral_less_AE)
   164     show "integrable M (\<lambda>x. D x - indicator ?D_set x)"
   165       using `integrable M D` by auto
   166   next
   167     from integrable_mult_left(1)[OF int, of "ln b"]
   168     show "integrable M (\<lambda>x. D x * (ln b * log b (D x)))" 
   169       by (simp add: ac_simps)
   170   next
   171     show "emeasure M {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<noteq> 0"
   172     proof
   173       assume eq_0: "emeasure M {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} = 0"
   174       then have disj: "AE x in M. D x = 1 \<or> D x = 0"
   175         using D(1) by (auto intro!: AE_I[OF subset_refl] sets.sets_Collect)
   176 
   177       have "emeasure M {x\<in>space M. D x = 1} = (\<integral>\<^sup>+ x. indicator {x\<in>space M. D x = 1} x \<partial>M)"
   178         using D(1) by auto
   179       also have "\<dots> = (\<integral>\<^sup>+ x. ereal (D x) \<partial>M)"
   180         using disj by (auto intro!: nn_integral_cong_AE simp: indicator_def one_ereal_def)
   181       finally have "AE x in M. D x = 1"
   182         using D D_pos by (intro AE_I_eq_1) auto
   183       then have "(\<integral>\<^sup>+x. indicator A x\<partial>M) = (\<integral>\<^sup>+x. ereal (D x) * indicator A x\<partial>M)"
   184         by (intro nn_integral_cong_AE) (auto simp: one_ereal_def[symmetric])
   185       also have "\<dots> = density M D A"
   186         using `A \<in> sets M` D by (simp add: emeasure_density)
   187       finally show False using `A \<in> sets M` `emeasure (density M D) A \<noteq> emeasure M A` by simp
   188     qed
   189     show "{x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<in> sets M"
   190       using D(1) by (auto intro: sets.sets_Collect_conj)
   191 
   192     show "AE t in M. t \<in> {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<longrightarrow>
   193       D t - indicator ?D_set t \<noteq> D t * (ln b * log b (D t))"
   194       using D(2)
   195     proof (eventually_elim, safe)
   196       fix t assume Dt: "t \<in> space M" "D t \<noteq> 1" "D t \<noteq> 0" "0 \<le> D t"
   197         and eq: "D t - indicator ?D_set t = D t * (ln b * log b (D t))"
   198 
   199       have "D t - 1 = D t - indicator ?D_set t"
   200         using Dt by simp
   201       also note eq
   202       also have "D t * (ln b * log b (D t)) = - D t * ln (1 / D t)"
   203         using b_gt_1 `D t \<noteq> 0` `0 \<le> D t`
   204         by (simp add: log_def ln_div less_le)
   205       finally have "ln (1 / D t) = 1 / D t - 1"
   206         using `D t \<noteq> 0` by (auto simp: field_simps)
   207       from ln_eq_minus_one[OF _ this] `D t \<noteq> 0` `0 \<le> D t` `D t \<noteq> 1`
   208       show False by auto
   209     qed
   210 
   211     show "AE t in M. D t - indicator ?D_set t \<le> D t * (ln b * log b (D t))"
   212       using D(2) AE_space
   213     proof eventually_elim
   214       fix t assume "t \<in> space M" "0 \<le> D t"
   215       show "D t - indicator ?D_set t \<le> D t * (ln b * log b (D t))"
   216       proof cases
   217         assume asm: "D t \<noteq> 0"
   218         then have "0 < D t" using `0 \<le> D t` by auto
   219         then have "0 < 1 / D t" by auto
   220         have "D t - indicator ?D_set t \<le> - D t * (1 / D t - 1)"
   221           using asm `t \<in> space M` by (simp add: field_simps)
   222         also have "- D t * (1 / D t - 1) \<le> - D t * ln (1 / D t)"
   223           using ln_le_minus_one `0 < 1 / D t` by (intro mult_left_mono_neg) auto
   224         also have "\<dots> = D t * (ln b * log b (D t))"
   225           using `0 < D t` b_gt_1
   226           by (simp_all add: log_def ln_div)
   227         finally show ?thesis by simp
   228       qed simp
   229     qed
   230   qed
   231   also have "\<dots> = (\<integral> x. ln b * (D x * log b (D x)) \<partial>M)"
   232     by (simp add: ac_simps)
   233   also have "\<dots> = ln b * (\<integral> x. D x * log b (D x) \<partial>M)"
   234     using int by simp
   235   finally show ?thesis
   236     using b_gt_1 D by (subst KL_density) (auto simp: zero_less_mult_iff)
   237 qed
   238 
   239 lemma (in sigma_finite_measure) KL_same_eq_0: "KL_divergence b M M = 0"
   240 proof -
   241   have "AE x in M. 1 = RN_deriv M M x"
   242   proof (rule RN_deriv_unique)
   243     show "(\<lambda>x. 1) \<in> borel_measurable M" "AE x in M. 0 \<le> (1 :: ereal)" by auto
   244     show "density M (\<lambda>x. 1) = M"
   245       apply (auto intro!: measure_eqI emeasure_density)
   246       apply (subst emeasure_density)
   247       apply auto
   248       done
   249   qed
   250   then have "AE x in M. log b (real (RN_deriv M M x)) = 0"
   251     by (elim AE_mp) simp
   252   from integral_cong_AE[OF _ _ this]
   253   have "integral\<^sup>L M (entropy_density b M M) = 0"
   254     by (simp add: entropy_density_def comp_def)
   255   then show "KL_divergence b M M = 0"
   256     unfolding KL_divergence_def
   257     by auto
   258 qed
   259 
   260 lemma (in information_space) KL_eq_0_iff_eq:
   261   fixes D :: "'a \<Rightarrow> real"
   262   assumes "prob_space (density M D)"
   263   assumes D: "D \<in> borel_measurable M" "AE x in M. 0 \<le> D x"
   264   assumes int: "integrable M (\<lambda>x. D x * log b (D x))"
   265   shows "KL_divergence b M (density M D) = 0 \<longleftrightarrow> density M D = M"
   266   using KL_same_eq_0[of b] KL_gt_0[OF assms]
   267   by (auto simp: less_le)
   268 
   269 lemma (in information_space) KL_eq_0_iff_eq_ac:
   270   fixes D :: "'a \<Rightarrow> real"
   271   assumes "prob_space N"
   272   assumes ac: "absolutely_continuous M N" "sets N = sets M"
   273   assumes int: "integrable N (entropy_density b M N)"
   274   shows "KL_divergence b M N = 0 \<longleftrightarrow> N = M"
   275 proof -
   276   interpret N: prob_space N by fact
   277   have "finite_measure N" by unfold_locales
   278   from real_RN_deriv[OF this ac] guess D . note D = this
   279   
   280   have "N = density M (RN_deriv M N)"
   281     using ac by (rule density_RN_deriv[symmetric])
   282   also have "\<dots> = density M D"
   283     using D by (auto intro!: density_cong)
   284   finally have N: "N = density M D" .
   285 
   286   from absolutely_continuous_AE[OF ac(2,1) D(2)] D b_gt_1 ac measurable_entropy_density
   287   have "integrable N (\<lambda>x. log b (D x))"
   288     by (intro integrable_cong_AE[THEN iffD2, OF _ _ _ int])
   289        (auto simp: N entropy_density_def)
   290   with D b_gt_1 have "integrable M (\<lambda>x. D x * log b (D x))"
   291     by (subst integrable_real_density[symmetric]) (auto simp: N[symmetric] comp_def)
   292   with `prob_space N` D show ?thesis
   293     unfolding N
   294     by (intro KL_eq_0_iff_eq) auto
   295 qed
   296 
   297 lemma (in information_space) KL_nonneg:
   298   assumes "prob_space (density M D)"
   299   assumes D: "D \<in> borel_measurable M" "AE x in M. 0 \<le> D x"
   300   assumes int: "integrable M (\<lambda>x. D x * log b (D x))"
   301   shows "0 \<le> KL_divergence b M (density M D)"
   302   using KL_gt_0[OF assms] by (cases "density M D = M") (auto simp: KL_same_eq_0)
   303 
   304 lemma (in sigma_finite_measure) KL_density_density_nonneg:
   305   fixes f g :: "'a \<Rightarrow> real"
   306   assumes "1 < b"
   307   assumes f: "f \<in> borel_measurable M" "AE x in M. 0 \<le> f x" "prob_space (density M f)"
   308   assumes g: "g \<in> borel_measurable M" "AE x in M. 0 \<le> g x" "prob_space (density M g)"
   309   assumes ac: "AE x in M. f x = 0 \<longrightarrow> g x = 0"
   310   assumes int: "integrable M (\<lambda>x. g x * log b (g x / f x))"
   311   shows "0 \<le> KL_divergence b (density M f) (density M g)"
   312 proof -
   313   interpret Mf: prob_space "density M f" by fact
   314   interpret Mf: information_space "density M f" b by default fact
   315   have eq: "density (density M f) (\<lambda>x. g x / f x) = density M g" (is "?DD = _")
   316     using f g ac by (subst density_density_divide) simp_all
   317 
   318   have "0 \<le> KL_divergence b (density M f) (density (density M f) (\<lambda>x. g x / f x))"
   319   proof (rule Mf.KL_nonneg)
   320     show "prob_space ?DD" unfolding eq by fact
   321     from f g show "(\<lambda>x. g x / f x) \<in> borel_measurable (density M f)"
   322       by auto
   323     show "AE x in density M f. 0 \<le> g x / f x"
   324       using f g by (auto simp: AE_density)
   325     show "integrable (density M f) (\<lambda>x. g x / f x * log b (g x / f x))"
   326       using `1 < b` f g ac
   327       by (subst integrable_density)
   328          (auto intro!: integrable_cong_AE[THEN iffD2, OF _ _ _ int] measurable_If)
   329   qed
   330   also have "\<dots> = KL_divergence b (density M f) (density M g)"
   331     using f g ac by (subst density_density_divide) simp_all
   332   finally show ?thesis .
   333 qed
   334 
   335 subsection {* Finite Entropy *}
   336 
   337 definition (in information_space) 
   338   "finite_entropy S X f \<longleftrightarrow> distributed M S X f \<and> integrable S (\<lambda>x. f x * log b (f x))"
   339 
   340 lemma (in information_space) finite_entropy_simple_function:
   341   assumes X: "simple_function M X"
   342   shows "finite_entropy (count_space (X`space M)) X (\<lambda>a. measure M {x \<in> space M. X x = a})"
   343   unfolding finite_entropy_def
   344 proof
   345   have [simp]: "finite (X ` space M)"
   346     using X by (auto simp: simple_function_def)
   347   then show "integrable (count_space (X ` space M))
   348      (\<lambda>x. prob {xa \<in> space M. X xa = x} * log b (prob {xa \<in> space M. X xa = x}))"
   349     by (rule integrable_count_space)
   350   have d: "distributed M (count_space (X ` space M)) X (\<lambda>x. ereal (if x \<in> X`space M then prob {xa \<in> space M. X xa = x} else 0))"
   351     by (rule distributed_simple_function_superset[OF X]) (auto intro!: arg_cong[where f=prob])
   352   show "distributed M (count_space (X ` space M)) X (\<lambda>x. ereal (prob {xa \<in> space M. X xa = x}))"
   353     by (rule distributed_cong_density[THEN iffD1, OF _ _ _ d]) auto
   354 qed
   355 
   356 lemma distributed_transform_AE:
   357   assumes T: "T \<in> measurable P Q" "absolutely_continuous Q (distr P Q T)"
   358   assumes g: "distributed M Q Y g"
   359   shows "AE x in P. 0 \<le> g (T x)"
   360   using g
   361   apply (subst AE_distr_iff[symmetric, OF T(1)])
   362   apply simp
   363   apply (rule absolutely_continuous_AE[OF _ T(2)])
   364   apply simp
   365   apply (simp add: distributed_AE)
   366   done
   367 
   368 lemma ac_fst:
   369   assumes "sigma_finite_measure T"
   370   shows "absolutely_continuous S (distr (S \<Otimes>\<^sub>M T) S fst)"
   371 proof -
   372   interpret sigma_finite_measure T by fact
   373   { fix A assume A: "A \<in> sets S" "emeasure S A = 0"
   374     then have "fst -` A \<inter> space (S \<Otimes>\<^sub>M T) = A \<times> space T"
   375       by (auto simp: space_pair_measure dest!: sets.sets_into_space)
   376     with A have "emeasure (S \<Otimes>\<^sub>M T) (fst -` A \<inter> space (S \<Otimes>\<^sub>M T)) = 0"
   377       by (simp add: emeasure_pair_measure_Times) }
   378   then show ?thesis
   379     unfolding absolutely_continuous_def
   380     apply (auto simp: null_sets_distr_iff)
   381     apply (auto simp: null_sets_def intro!: measurable_sets)
   382     done
   383 qed
   384 
   385 lemma ac_snd:
   386   assumes "sigma_finite_measure T"
   387   shows "absolutely_continuous T (distr (S \<Otimes>\<^sub>M T) T snd)"
   388 proof -
   389   interpret sigma_finite_measure T by fact
   390   { fix A assume A: "A \<in> sets T" "emeasure T A = 0"
   391     then have "snd -` A \<inter> space (S \<Otimes>\<^sub>M T) = space S \<times> A"
   392       by (auto simp: space_pair_measure dest!: sets.sets_into_space)
   393     with A have "emeasure (S \<Otimes>\<^sub>M T) (snd -` A \<inter> space (S \<Otimes>\<^sub>M T)) = 0"
   394       by (simp add: emeasure_pair_measure_Times) }
   395   then show ?thesis
   396     unfolding absolutely_continuous_def
   397     apply (auto simp: null_sets_distr_iff)
   398     apply (auto simp: null_sets_def intro!: measurable_sets)
   399     done
   400 qed
   401 
   402 lemma integrable_cong_AE_imp:
   403   "integrable M g \<Longrightarrow> f \<in> borel_measurable M \<Longrightarrow> (AE x in M. g x = f x) \<Longrightarrow> integrable M f"
   404   using integrable_cong_AE[of f M g] by (auto simp: eq_commute)
   405 
   406 lemma (in information_space) finite_entropy_integrable:
   407   "finite_entropy S X Px \<Longrightarrow> integrable S (\<lambda>x. Px x * log b (Px x))"
   408   unfolding finite_entropy_def by auto
   409 
   410 lemma (in information_space) finite_entropy_distributed:
   411   "finite_entropy S X Px \<Longrightarrow> distributed M S X Px"
   412   unfolding finite_entropy_def by auto
   413 
   414 lemma (in information_space) finite_entropy_integrable_transform:
   415   assumes Fx: "finite_entropy S X Px"
   416   assumes Fy: "distributed M T Y Py"
   417     and "X = (\<lambda>x. f (Y x))"
   418     and "f \<in> measurable T S"
   419   shows "integrable T (\<lambda>x. Py x * log b (Px (f x)))"
   420   using assms unfolding finite_entropy_def
   421   using distributed_transform_integrable[of M T Y Py S X Px f "\<lambda>x. log b (Px x)"]
   422   by auto
   423 
   424 subsection {* Mutual Information *}
   425 
   426 definition (in prob_space)
   427   "mutual_information b S T X Y =
   428     KL_divergence b (distr M S X \<Otimes>\<^sub>M distr M T Y) (distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)))"
   429 
   430 lemma (in information_space) mutual_information_indep_vars:
   431   fixes S T X Y
   432   defines "P \<equiv> distr M S X \<Otimes>\<^sub>M distr M T Y"
   433   defines "Q \<equiv> distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))"
   434   shows "indep_var S X T Y \<longleftrightarrow>
   435     (random_variable S X \<and> random_variable T Y \<and>
   436       absolutely_continuous P Q \<and> integrable Q (entropy_density b P Q) \<and>
   437       mutual_information b S T X Y = 0)"
   438   unfolding indep_var_distribution_eq
   439 proof safe
   440   assume rv[measurable]: "random_variable S X" "random_variable T Y"
   441 
   442   interpret X: prob_space "distr M S X"
   443     by (rule prob_space_distr) fact
   444   interpret Y: prob_space "distr M T Y"
   445     by (rule prob_space_distr) fact
   446   interpret XY: pair_prob_space "distr M S X" "distr M T Y" by default
   447   interpret P: information_space P b unfolding P_def by default (rule b_gt_1)
   448 
   449   interpret Q: prob_space Q unfolding Q_def
   450     by (rule prob_space_distr) simp
   451 
   452   { assume "distr M S X \<Otimes>\<^sub>M distr M T Y = distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))"
   453     then have [simp]: "Q = P"  unfolding Q_def P_def by simp
   454 
   455     show ac: "absolutely_continuous P Q" by (simp add: absolutely_continuous_def)
   456     then have ed: "entropy_density b P Q \<in> borel_measurable P"
   457       by simp
   458 
   459     have "AE x in P. 1 = RN_deriv P Q x"
   460     proof (rule P.RN_deriv_unique)
   461       show "density P (\<lambda>x. 1) = Q"
   462         unfolding `Q = P` by (intro measure_eqI) (auto simp: emeasure_density)
   463     qed auto
   464     then have ae_0: "AE x in P. entropy_density b P Q x = 0"
   465       by eventually_elim (auto simp: entropy_density_def)
   466     then have "integrable P (entropy_density b P Q) \<longleftrightarrow> integrable Q (\<lambda>x. 0::real)"
   467       using ed unfolding `Q = P` by (intro integrable_cong_AE) auto
   468     then show "integrable Q (entropy_density b P Q)" by simp
   469 
   470     from ae_0 have "mutual_information b S T X Y = (\<integral>x. 0 \<partial>P)"
   471       unfolding mutual_information_def KL_divergence_def P_def[symmetric] Q_def[symmetric] `Q = P`
   472       by (intro integral_cong_AE) auto
   473     then show "mutual_information b S T X Y = 0"
   474       by simp }
   475 
   476   { assume ac: "absolutely_continuous P Q"
   477     assume int: "integrable Q (entropy_density b P Q)"
   478     assume I_eq_0: "mutual_information b S T X Y = 0"
   479 
   480     have eq: "Q = P"
   481     proof (rule P.KL_eq_0_iff_eq_ac[THEN iffD1])
   482       show "prob_space Q" by unfold_locales
   483       show "absolutely_continuous P Q" by fact
   484       show "integrable Q (entropy_density b P Q)" by fact
   485       show "sets Q = sets P" by (simp add: P_def Q_def sets_pair_measure)
   486       show "KL_divergence b P Q = 0"
   487         using I_eq_0 unfolding mutual_information_def by (simp add: P_def Q_def)
   488     qed
   489     then show "distr M S X \<Otimes>\<^sub>M distr M T Y = distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))"
   490       unfolding P_def Q_def .. }
   491 qed
   492 
   493 abbreviation (in information_space)
   494   mutual_information_Pow ("\<I>'(_ ; _')") where
   495   "\<I>(X ; Y) \<equiv> mutual_information b (count_space (X`space M)) (count_space (Y`space M)) X Y"
   496 
   497 lemma (in information_space)
   498   fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
   499   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
   500   assumes Fx: "finite_entropy S X Px" and Fy: "finite_entropy T Y Py"
   501   assumes Fxy: "finite_entropy (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
   502   defines "f \<equiv> \<lambda>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))"
   503   shows mutual_information_distr': "mutual_information b S T X Y = integral\<^sup>L (S \<Otimes>\<^sub>M T) f" (is "?M = ?R")
   504     and mutual_information_nonneg': "0 \<le> mutual_information b S T X Y"
   505 proof -
   506   have Px: "distributed M S X Px"
   507     using Fx by (auto simp: finite_entropy_def)
   508   have Py: "distributed M T Y Py"
   509     using Fy by (auto simp: finite_entropy_def)
   510   have Pxy: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
   511     using Fxy by (auto simp: finite_entropy_def)
   512 
   513   have X: "random_variable S X"
   514     using Px by auto
   515   have Y: "random_variable T Y"
   516     using Py by auto
   517   interpret S: sigma_finite_measure S by fact
   518   interpret T: sigma_finite_measure T by fact
   519   interpret ST: pair_sigma_finite S T ..
   520   interpret X: prob_space "distr M S X" using X by (rule prob_space_distr)
   521   interpret Y: prob_space "distr M T Y" using Y by (rule prob_space_distr)
   522   interpret XY: pair_prob_space "distr M S X" "distr M T Y" ..
   523   let ?P = "S \<Otimes>\<^sub>M T"
   524   let ?D = "distr M ?P (\<lambda>x. (X x, Y x))"
   525 
   526   { fix A assume "A \<in> sets S"
   527     with X Y have "emeasure (distr M S X) A = emeasure ?D (A \<times> space T)"
   528       by (auto simp: emeasure_distr measurable_Pair measurable_space
   529                intro!: arg_cong[where f="emeasure M"]) }
   530   note marginal_eq1 = this
   531   { fix A assume "A \<in> sets T"
   532     with X Y have "emeasure (distr M T Y) A = emeasure ?D (space S \<times> A)"
   533       by (auto simp: emeasure_distr measurable_Pair measurable_space
   534                intro!: arg_cong[where f="emeasure M"]) }
   535   note marginal_eq2 = this
   536 
   537   have eq: "(\<lambda>x. ereal (Px (fst x) * Py (snd x))) = (\<lambda>(x, y). ereal (Px x) * ereal (Py y))"
   538     by auto
   539 
   540   have distr_eq: "distr M S X \<Otimes>\<^sub>M distr M T Y = density ?P (\<lambda>x. ereal (Px (fst x) * Py (snd x)))"
   541     unfolding Px(1)[THEN distributed_distr_eq_density] Py(1)[THEN distributed_distr_eq_density] eq
   542   proof (subst pair_measure_density)
   543     show "(\<lambda>x. ereal (Px x)) \<in> borel_measurable S" "(\<lambda>y. ereal (Py y)) \<in> borel_measurable T"
   544       "AE x in S. 0 \<le> ereal (Px x)" "AE y in T. 0 \<le> ereal (Py y)"
   545       using Px Py by (auto simp: distributed_def)
   546     show "sigma_finite_measure (density T Py)" unfolding Py(1)[THEN distributed_distr_eq_density, symmetric] ..
   547   qed (fact | simp)+
   548   
   549   have M: "?M = KL_divergence b (density ?P (\<lambda>x. ereal (Px (fst x) * Py (snd x)))) (density ?P (\<lambda>x. ereal (Pxy x)))"
   550     unfolding mutual_information_def distr_eq Pxy(1)[THEN distributed_distr_eq_density] ..
   551 
   552   from Px Py have f: "(\<lambda>x. Px (fst x) * Py (snd x)) \<in> borel_measurable ?P"
   553     by (intro borel_measurable_times) (auto intro: distributed_real_measurable measurable_fst'' measurable_snd'')
   554   have PxPy_nonneg: "AE x in ?P. 0 \<le> Px (fst x) * Py (snd x)"
   555   proof (rule ST.AE_pair_measure)
   556     show "{x \<in> space ?P. 0 \<le> Px (fst x) * Py (snd x)} \<in> sets ?P"
   557       using f by auto
   558     show "AE x in S. AE y in T. 0 \<le> Px (fst (x, y)) * Py (snd (x, y))"
   559       using Px Py by (auto simp: zero_le_mult_iff dest!: distributed_real_AE)
   560   qed
   561 
   562   have "(AE x in ?P. Px (fst x) = 0 \<longrightarrow> Pxy x = 0)"
   563     by (rule subdensity_real[OF measurable_fst Pxy Px]) auto
   564   moreover
   565   have "(AE x in ?P. Py (snd x) = 0 \<longrightarrow> Pxy x = 0)"
   566     by (rule subdensity_real[OF measurable_snd Pxy Py]) auto
   567   ultimately have ac: "AE x in ?P. Px (fst x) * Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
   568     by eventually_elim auto
   569 
   570   show "?M = ?R"
   571     unfolding M f_def
   572     using b_gt_1 f PxPy_nonneg Pxy[THEN distributed_real_measurable] Pxy[THEN distributed_real_AE] ac
   573     by (rule ST.KL_density_density)
   574 
   575   have X: "X = fst \<circ> (\<lambda>x. (X x, Y x))" and Y: "Y = snd \<circ> (\<lambda>x. (X x, Y x))"
   576     by auto
   577 
   578   have "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x) - Pxy x * log b (Px (fst x)) - Pxy x * log b (Py (snd x)))"
   579     using finite_entropy_integrable[OF Fxy]
   580     using finite_entropy_integrable_transform[OF Fx Pxy, of fst]
   581     using finite_entropy_integrable_transform[OF Fy Pxy, of snd]
   582     by simp
   583   moreover have "f \<in> borel_measurable (S \<Otimes>\<^sub>M T)"
   584     unfolding f_def using Px Py Pxy
   585     by (auto intro: distributed_real_measurable measurable_fst'' measurable_snd''
   586       intro!: borel_measurable_times borel_measurable_log borel_measurable_divide)
   587   ultimately have int: "integrable (S \<Otimes>\<^sub>M T) f"
   588     apply (rule integrable_cong_AE_imp)
   589     using
   590       distributed_transform_AE[OF measurable_fst ac_fst, of T, OF T Px]
   591       distributed_transform_AE[OF measurable_snd ac_snd, of _ _ _ _ S, OF T Py]
   592       subdensity_real[OF measurable_fst Pxy Px X]
   593       subdensity_real[OF measurable_snd Pxy Py Y]
   594       distributed_real_AE[OF Pxy]
   595     by eventually_elim
   596        (auto simp: f_def log_divide_eq log_mult_eq field_simps zero_less_mult_iff)
   597 
   598   show "0 \<le> ?M" unfolding M
   599   proof (rule ST.KL_density_density_nonneg
   600     [OF b_gt_1 f PxPy_nonneg _ Pxy[THEN distributed_real_measurable] Pxy[THEN distributed_real_AE] _ ac int[unfolded f_def]])
   601     show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Pxy x))) "
   602       unfolding distributed_distr_eq_density[OF Pxy, symmetric]
   603       using distributed_measurable[OF Pxy] by (rule prob_space_distr)
   604     show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Px (fst x) * Py (snd x))))"
   605       unfolding distr_eq[symmetric] by unfold_locales
   606   qed
   607 qed
   608 
   609 
   610 lemma (in information_space)
   611   fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
   612   assumes "sigma_finite_measure S" "sigma_finite_measure T"
   613   assumes Px: "distributed M S X Px" and Py: "distributed M T Y Py"
   614   assumes Pxy: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
   615   defines "f \<equiv> \<lambda>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))"
   616   shows mutual_information_distr: "mutual_information b S T X Y = integral\<^sup>L (S \<Otimes>\<^sub>M T) f" (is "?M = ?R")
   617     and mutual_information_nonneg: "integrable (S \<Otimes>\<^sub>M T) f \<Longrightarrow> 0 \<le> mutual_information b S T X Y"
   618 proof -
   619   have X: "random_variable S X"
   620     using Px by (auto simp: distributed_def)
   621   have Y: "random_variable T Y"
   622     using Py by (auto simp: distributed_def)
   623   interpret S: sigma_finite_measure S by fact
   624   interpret T: sigma_finite_measure T by fact
   625   interpret ST: pair_sigma_finite S T ..
   626   interpret X: prob_space "distr M S X" using X by (rule prob_space_distr)
   627   interpret Y: prob_space "distr M T Y" using Y by (rule prob_space_distr)
   628   interpret XY: pair_prob_space "distr M S X" "distr M T Y" ..
   629   let ?P = "S \<Otimes>\<^sub>M T"
   630   let ?D = "distr M ?P (\<lambda>x. (X x, Y x))"
   631 
   632   { fix A assume "A \<in> sets S"
   633     with X Y have "emeasure (distr M S X) A = emeasure ?D (A \<times> space T)"
   634       by (auto simp: emeasure_distr measurable_Pair measurable_space
   635                intro!: arg_cong[where f="emeasure M"]) }
   636   note marginal_eq1 = this
   637   { fix A assume "A \<in> sets T"
   638     with X Y have "emeasure (distr M T Y) A = emeasure ?D (space S \<times> A)"
   639       by (auto simp: emeasure_distr measurable_Pair measurable_space
   640                intro!: arg_cong[where f="emeasure M"]) }
   641   note marginal_eq2 = this
   642 
   643   have eq: "(\<lambda>x. ereal (Px (fst x) * Py (snd x))) = (\<lambda>(x, y). ereal (Px x) * ereal (Py y))"
   644     by auto
   645 
   646   have distr_eq: "distr M S X \<Otimes>\<^sub>M distr M T Y = density ?P (\<lambda>x. ereal (Px (fst x) * Py (snd x)))"
   647     unfolding Px(1)[THEN distributed_distr_eq_density] Py(1)[THEN distributed_distr_eq_density] eq
   648   proof (subst pair_measure_density)
   649     show "(\<lambda>x. ereal (Px x)) \<in> borel_measurable S" "(\<lambda>y. ereal (Py y)) \<in> borel_measurable T"
   650       "AE x in S. 0 \<le> ereal (Px x)" "AE y in T. 0 \<le> ereal (Py y)"
   651       using Px Py by (auto simp: distributed_def)
   652     show "sigma_finite_measure (density T Py)" unfolding Py(1)[THEN distributed_distr_eq_density, symmetric] ..
   653   qed (fact | simp)+
   654   
   655   have M: "?M = KL_divergence b (density ?P (\<lambda>x. ereal (Px (fst x) * Py (snd x)))) (density ?P (\<lambda>x. ereal (Pxy x)))"
   656     unfolding mutual_information_def distr_eq Pxy(1)[THEN distributed_distr_eq_density] ..
   657 
   658   from Px Py have f: "(\<lambda>x. Px (fst x) * Py (snd x)) \<in> borel_measurable ?P"
   659     by (intro borel_measurable_times) (auto intro: distributed_real_measurable measurable_fst'' measurable_snd'')
   660   have PxPy_nonneg: "AE x in ?P. 0 \<le> Px (fst x) * Py (snd x)"
   661   proof (rule ST.AE_pair_measure)
   662     show "{x \<in> space ?P. 0 \<le> Px (fst x) * Py (snd x)} \<in> sets ?P"
   663       using f by auto
   664     show "AE x in S. AE y in T. 0 \<le> Px (fst (x, y)) * Py (snd (x, y))"
   665       using Px Py by (auto simp: zero_le_mult_iff dest!: distributed_real_AE)
   666   qed
   667 
   668   have "(AE x in ?P. Px (fst x) = 0 \<longrightarrow> Pxy x = 0)"
   669     by (rule subdensity_real[OF measurable_fst Pxy Px]) auto
   670   moreover
   671   have "(AE x in ?P. Py (snd x) = 0 \<longrightarrow> Pxy x = 0)"
   672     by (rule subdensity_real[OF measurable_snd Pxy Py]) auto
   673   ultimately have ac: "AE x in ?P. Px (fst x) * Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
   674     by eventually_elim auto
   675 
   676   show "?M = ?R"
   677     unfolding M f_def
   678     using b_gt_1 f PxPy_nonneg Pxy[THEN distributed_real_measurable] Pxy[THEN distributed_real_AE] ac
   679     by (rule ST.KL_density_density)
   680 
   681   assume int: "integrable (S \<Otimes>\<^sub>M T) f"
   682   show "0 \<le> ?M" unfolding M
   683   proof (rule ST.KL_density_density_nonneg
   684     [OF b_gt_1 f PxPy_nonneg _ Pxy[THEN distributed_real_measurable] Pxy[THEN distributed_real_AE] _ ac int[unfolded f_def]])
   685     show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Pxy x))) "
   686       unfolding distributed_distr_eq_density[OF Pxy, symmetric]
   687       using distributed_measurable[OF Pxy] by (rule prob_space_distr)
   688     show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Px (fst x) * Py (snd x))))"
   689       unfolding distr_eq[symmetric] by unfold_locales
   690   qed
   691 qed
   692 
   693 lemma (in information_space)
   694   fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
   695   assumes "sigma_finite_measure S" "sigma_finite_measure T"
   696   assumes Px[measurable]: "distributed M S X Px" and Py[measurable]: "distributed M T Y Py"
   697   assumes Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
   698   assumes ae: "AE x in S. AE y in T. Pxy (x, y) = Px x * Py y"
   699   shows mutual_information_eq_0: "mutual_information b S T X Y = 0"
   700 proof -
   701   interpret S: sigma_finite_measure S by fact
   702   interpret T: sigma_finite_measure T by fact
   703   interpret ST: pair_sigma_finite S T ..
   704 
   705   have "AE x in S \<Otimes>\<^sub>M T. Px (fst x) = 0 \<longrightarrow> Pxy x = 0"
   706     by (rule subdensity_real[OF measurable_fst Pxy Px]) auto
   707   moreover
   708   have "AE x in S \<Otimes>\<^sub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
   709     by (rule subdensity_real[OF measurable_snd Pxy Py]) auto
   710   moreover 
   711   have "AE x in S \<Otimes>\<^sub>M T. Pxy x = Px (fst x) * Py (snd x)"
   712     using distributed_real_measurable[OF Px] distributed_real_measurable[OF Py] distributed_real_measurable[OF Pxy]
   713     by (intro ST.AE_pair_measure) (auto simp: ae intro!: measurable_snd'' measurable_fst'')
   714   ultimately have "AE x in S \<Otimes>\<^sub>M T. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) = 0"
   715     by eventually_elim simp
   716   then have "(\<integral>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) \<partial>(S \<Otimes>\<^sub>M T)) = (\<integral>x. 0 \<partial>(S \<Otimes>\<^sub>M T))"
   717     by (intro integral_cong_AE) auto
   718   then show ?thesis
   719     by (subst mutual_information_distr[OF assms(1-5)]) simp
   720 qed
   721 
   722 lemma (in information_space) mutual_information_simple_distributed:
   723   assumes X: "simple_distributed M X Px" and Y: "simple_distributed M Y Py"
   724   assumes XY: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy"
   725   shows "\<I>(X ; Y) = (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x))`space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y)))"
   726 proof (subst mutual_information_distr[OF _ _ simple_distributed[OF X] simple_distributed[OF Y] simple_distributed_joint[OF XY]])
   727   note fin = simple_distributed_joint_finite[OF XY, simp]
   728   show "sigma_finite_measure (count_space (X ` space M))"
   729     by (simp add: sigma_finite_measure_count_space_finite)
   730   show "sigma_finite_measure (count_space (Y ` space M))"
   731     by (simp add: sigma_finite_measure_count_space_finite)
   732   let ?Pxy = "\<lambda>x. (if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x else 0)"
   733   let ?f = "\<lambda>x. ?Pxy x * log b (?Pxy x / (Px (fst x) * Py (snd x)))"
   734   have "\<And>x. ?f x = (if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) else 0)"
   735     by auto
   736   with fin show "(\<integral> x. ?f x \<partial>(count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M))) =
   737     (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y)))"
   738     by (auto simp add: pair_measure_count_space lebesgue_integral_count_space_finite setsum.If_cases split_beta'
   739              intro!: setsum.cong)
   740 qed
   741 
   742 lemma (in information_space)
   743   fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
   744   assumes Px: "simple_distributed M X Px" and Py: "simple_distributed M Y Py"
   745   assumes Pxy: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy"
   746   assumes ae: "\<forall>x\<in>space M. Pxy (X x, Y x) = Px (X x) * Py (Y x)"
   747   shows mutual_information_eq_0_simple: "\<I>(X ; Y) = 0"
   748 proof (subst mutual_information_simple_distributed[OF Px Py Pxy])
   749   have "(\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y))) =
   750     (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. 0)"
   751     by (intro setsum.cong) (auto simp: ae)
   752   then show "(\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M.
   753     Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y))) = 0" by simp
   754 qed
   755 
   756 subsection {* Entropy *}
   757 
   758 definition (in prob_space) entropy :: "real \<Rightarrow> 'b measure \<Rightarrow> ('a \<Rightarrow> 'b) \<Rightarrow> real" where
   759   "entropy b S X = - KL_divergence b S (distr M S X)"
   760 
   761 abbreviation (in information_space)
   762   entropy_Pow ("\<H>'(_')") where
   763   "\<H>(X) \<equiv> entropy b (count_space (X`space M)) X"
   764 
   765 lemma (in prob_space) distributed_RN_deriv:
   766   assumes X: "distributed M S X Px"
   767   shows "AE x in S. RN_deriv S (density S Px) x = Px x"
   768 proof -
   769   note D = distributed_measurable[OF X] distributed_borel_measurable[OF X] distributed_AE[OF X]
   770   interpret X: prob_space "distr M S X"
   771     using D(1) by (rule prob_space_distr)
   772 
   773   have sf: "sigma_finite_measure (distr M S X)" by default
   774   show ?thesis
   775     using D
   776     apply (subst eq_commute)
   777     apply (intro RN_deriv_unique_sigma_finite)
   778     apply (auto simp: distributed_distr_eq_density[symmetric, OF X] sf measure_nonneg)
   779     done
   780 qed
   781 
   782 lemma (in information_space)
   783   fixes X :: "'a \<Rightarrow> 'b"
   784   assumes X[measurable]: "distributed M MX X f"
   785   shows entropy_distr: "entropy b MX X = - (\<integral>x. f x * log b (f x) \<partial>MX)" (is ?eq)
   786 proof -
   787   note D = distributed_measurable[OF X] distributed_borel_measurable[OF X] distributed_AE[OF X]
   788   note ae = distributed_RN_deriv[OF X]
   789 
   790   have ae_eq: "AE x in distr M MX X. log b (real (RN_deriv MX (distr M MX X) x)) =
   791     log b (f x)"
   792     unfolding distributed_distr_eq_density[OF X]
   793     apply (subst AE_density)
   794     using D apply simp
   795     using ae apply eventually_elim
   796     apply auto
   797     done
   798 
   799   have int_eq: "(\<integral> x. f x * log b (f x) \<partial>MX) = (\<integral> x. log b (f x) \<partial>distr M MX X)"
   800     unfolding distributed_distr_eq_density[OF X]
   801     using D
   802     by (subst integral_density)
   803        (auto simp: borel_measurable_ereal_iff)
   804 
   805   show ?eq
   806     unfolding entropy_def KL_divergence_def entropy_density_def comp_def int_eq neg_equal_iff_equal
   807     using ae_eq by (intro integral_cong_AE) auto
   808 qed
   809 
   810 lemma (in prob_space) distributed_imp_emeasure_nonzero:
   811   assumes X: "distributed M MX X Px"
   812   shows "emeasure MX {x \<in> space MX. Px x \<noteq> 0} \<noteq> 0"
   813 proof
   814   note Px = distributed_borel_measurable[OF X] distributed_AE[OF X]
   815   interpret X: prob_space "distr M MX X"
   816     using distributed_measurable[OF X] by (rule prob_space_distr)
   817 
   818   assume "emeasure MX {x \<in> space MX. Px x \<noteq> 0} = 0"
   819   with Px have "AE x in MX. Px x = 0"
   820     by (intro AE_I[OF subset_refl]) (auto simp: borel_measurable_ereal_iff)
   821   moreover
   822   from X.emeasure_space_1 have "(\<integral>\<^sup>+x. Px x \<partial>MX) = 1"
   823     unfolding distributed_distr_eq_density[OF X] using Px
   824     by (subst (asm) emeasure_density)
   825        (auto simp: borel_measurable_ereal_iff intro!: integral_cong cong: nn_integral_cong)
   826   ultimately show False
   827     by (simp add: nn_integral_cong_AE)
   828 qed
   829 
   830 lemma (in information_space) entropy_le:
   831   fixes Px :: "'b \<Rightarrow> real" and MX :: "'b measure"
   832   assumes X[measurable]: "distributed M MX X Px"
   833   and fin: "emeasure MX {x \<in> space MX. Px x \<noteq> 0} \<noteq> \<infinity>"
   834   and int: "integrable MX (\<lambda>x. - Px x * log b (Px x))"
   835   shows "entropy b MX X \<le> log b (measure MX {x \<in> space MX. Px x \<noteq> 0})"
   836 proof -
   837   note Px = distributed_borel_measurable[OF X] distributed_AE[OF X]
   838   interpret X: prob_space "distr M MX X"
   839     using distributed_measurable[OF X] by (rule prob_space_distr)
   840 
   841   have " - log b (measure MX {x \<in> space MX. Px x \<noteq> 0}) = 
   842     - log b (\<integral> x. indicator {x \<in> space MX. Px x \<noteq> 0} x \<partial>MX)"
   843     using Px fin
   844     by (auto simp: measure_def borel_measurable_ereal_iff)
   845   also have "- log b (\<integral> x. indicator {x \<in> space MX. Px x \<noteq> 0} x \<partial>MX) = - log b (\<integral> x. 1 / Px x \<partial>distr M MX X)"
   846     unfolding distributed_distr_eq_density[OF X] using Px
   847     apply (intro arg_cong[where f="log b"] arg_cong[where f=uminus])
   848     by (subst integral_density) (auto simp: borel_measurable_ereal_iff simp del: integral_indicator intro!: integral_cong)
   849   also have "\<dots> \<le> (\<integral> x. - log b (1 / Px x) \<partial>distr M MX X)"
   850   proof (rule X.jensens_inequality[of "\<lambda>x. 1 / Px x" "{0<..}" 0 1 "\<lambda>x. - log b x"])
   851     show "AE x in distr M MX X. 1 / Px x \<in> {0<..}"
   852       unfolding distributed_distr_eq_density[OF X]
   853       using Px by (auto simp: AE_density)
   854     have [simp]: "\<And>x. x \<in> space MX \<Longrightarrow> ereal (if Px x = 0 then 0 else 1) = indicator {x \<in> space MX. Px x \<noteq> 0} x"
   855       by (auto simp: one_ereal_def)
   856     have "(\<integral>\<^sup>+ x. max 0 (ereal (- (if Px x = 0 then 0 else 1))) \<partial>MX) = (\<integral>\<^sup>+ x. 0 \<partial>MX)"
   857       by (intro nn_integral_cong) (auto split: split_max)
   858     then show "integrable (distr M MX X) (\<lambda>x. 1 / Px x)"
   859       unfolding distributed_distr_eq_density[OF X] using Px
   860       by (auto simp: nn_integral_density real_integrable_def borel_measurable_ereal_iff fin nn_integral_max_0
   861               cong: nn_integral_cong)
   862     have "integrable MX (\<lambda>x. Px x * log b (1 / Px x)) =
   863       integrable MX (\<lambda>x. - Px x * log b (Px x))"
   864       using Px
   865       by (intro integrable_cong_AE)
   866          (auto simp: borel_measurable_ereal_iff log_divide_eq
   867                   intro!: measurable_If)
   868     then show "integrable (distr M MX X) (\<lambda>x. - log b (1 / Px x))"
   869       unfolding distributed_distr_eq_density[OF X]
   870       using Px int
   871       by (subst integrable_real_density) (auto simp: borel_measurable_ereal_iff)
   872   qed (auto simp: minus_log_convex[OF b_gt_1])
   873   also have "\<dots> = (\<integral> x. log b (Px x) \<partial>distr M MX X)"
   874     unfolding distributed_distr_eq_density[OF X] using Px
   875     by (intro integral_cong_AE) (auto simp: AE_density log_divide_eq)
   876   also have "\<dots> = - entropy b MX X"
   877     unfolding distributed_distr_eq_density[OF X] using Px
   878     by (subst entropy_distr[OF X]) (auto simp: borel_measurable_ereal_iff integral_density)
   879   finally show ?thesis
   880     by simp
   881 qed
   882 
   883 lemma (in information_space) entropy_le_space:
   884   fixes Px :: "'b \<Rightarrow> real" and MX :: "'b measure"
   885   assumes X: "distributed M MX X Px"
   886   and fin: "finite_measure MX"
   887   and int: "integrable MX (\<lambda>x. - Px x * log b (Px x))"
   888   shows "entropy b MX X \<le> log b (measure MX (space MX))"
   889 proof -
   890   note Px = distributed_borel_measurable[OF X] distributed_AE[OF X]
   891   interpret finite_measure MX by fact
   892   have "entropy b MX X \<le> log b (measure MX {x \<in> space MX. Px x \<noteq> 0})"
   893     using int X by (intro entropy_le) auto
   894   also have "\<dots> \<le> log b (measure MX (space MX))"
   895     using Px distributed_imp_emeasure_nonzero[OF X]
   896     by (intro log_le)
   897        (auto intro!: borel_measurable_ereal_iff finite_measure_mono b_gt_1
   898                      less_le[THEN iffD2] measure_nonneg simp: emeasure_eq_measure)
   899   finally show ?thesis .
   900 qed
   901 
   902 lemma (in information_space) entropy_uniform:
   903   assumes X: "distributed M MX X (\<lambda>x. indicator A x / measure MX A)" (is "distributed _ _ _ ?f")
   904   shows "entropy b MX X = log b (measure MX A)"
   905 proof (subst entropy_distr[OF X])
   906   have [simp]: "emeasure MX A \<noteq> \<infinity>"
   907     using uniform_distributed_params[OF X] by (auto simp add: measure_def)
   908   have eq: "(\<integral> x. indicator A x / measure MX A * log b (indicator A x / measure MX A) \<partial>MX) =
   909     (\<integral> x. (- log b (measure MX A) / measure MX A) * indicator A x \<partial>MX)"
   910     using measure_nonneg[of MX A] uniform_distributed_params[OF X]
   911     by (intro integral_cong) (auto split: split_indicator simp: log_divide_eq)
   912   show "- (\<integral> x. indicator A x / measure MX A * log b (indicator A x / measure MX A) \<partial>MX) =
   913     log b (measure MX A)"
   914     unfolding eq using uniform_distributed_params[OF X]
   915     by (subst integral_mult_right) (auto simp: measure_def)
   916 qed
   917 
   918 lemma (in information_space) entropy_simple_distributed:
   919   "simple_distributed M X f \<Longrightarrow> \<H>(X) = - (\<Sum>x\<in>X`space M. f x * log b (f x))"
   920   by (subst entropy_distr[OF simple_distributed])
   921      (auto simp add: lebesgue_integral_count_space_finite)
   922 
   923 lemma (in information_space) entropy_le_card_not_0:
   924   assumes X: "simple_distributed M X f"
   925   shows "\<H>(X) \<le> log b (card (X ` space M \<inter> {x. f x \<noteq> 0}))"
   926 proof -
   927   let ?X = "count_space (X`space M)"
   928   have "\<H>(X) \<le> log b (measure ?X {x \<in> space ?X. f x \<noteq> 0})"
   929     by (rule entropy_le[OF simple_distributed[OF X]])
   930        (simp_all add: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space)
   931   also have "measure ?X {x \<in> space ?X. f x \<noteq> 0} = card (X ` space M \<inter> {x. f x \<noteq> 0})"
   932     by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def Int_def)
   933   finally show ?thesis .
   934 qed
   935 
   936 lemma (in information_space) entropy_le_card:
   937   assumes X: "simple_distributed M X f"
   938   shows "\<H>(X) \<le> log b (real (card (X ` space M)))"
   939 proof -
   940   let ?X = "count_space (X`space M)"
   941   have "\<H>(X) \<le> log b (measure ?X (space ?X))"
   942     by (rule entropy_le_space[OF simple_distributed[OF X]])
   943        (simp_all add: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space finite_measure_count_space)
   944   also have "measure ?X (space ?X) = card (X ` space M)"
   945     by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def)
   946   finally show ?thesis .
   947 qed
   948 
   949 subsection {* Conditional Mutual Information *}
   950 
   951 definition (in prob_space)
   952   "conditional_mutual_information b MX MY MZ X Y Z \<equiv>
   953     mutual_information b MX (MY \<Otimes>\<^sub>M MZ) X (\<lambda>x. (Y x, Z x)) -
   954     mutual_information b MX MZ X Z"
   955 
   956 abbreviation (in information_space)
   957   conditional_mutual_information_Pow ("\<I>'( _ ; _ | _ ')") where
   958   "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b
   959     (count_space (X ` space M)) (count_space (Y ` space M)) (count_space (Z ` space M)) X Y Z"
   960 
   961 lemma (in information_space)
   962   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" and P: "sigma_finite_measure P"
   963   assumes Px[measurable]: "distributed M S X Px"
   964   assumes Pz[measurable]: "distributed M P Z Pz"
   965   assumes Pyz[measurable]: "distributed M (T \<Otimes>\<^sub>M P) (\<lambda>x. (Y x, Z x)) Pyz"
   966   assumes Pxz[measurable]: "distributed M (S \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Z x)) Pxz"
   967   assumes Pxyz[measurable]: "distributed M (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x)) Pxyz"
   968   assumes I1: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z))))"
   969   assumes I2: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z)))"
   970   shows conditional_mutual_information_generic_eq: "conditional_mutual_information b S T P X Y Z
   971     = (\<integral>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" (is "?eq")
   972     and conditional_mutual_information_generic_nonneg: "0 \<le> conditional_mutual_information b S T P X Y Z" (is "?nonneg")
   973 proof -
   974   interpret S: sigma_finite_measure S by fact
   975   interpret T: sigma_finite_measure T by fact
   976   interpret P: sigma_finite_measure P by fact
   977   interpret TP: pair_sigma_finite T P ..
   978   interpret SP: pair_sigma_finite S P ..
   979   interpret ST: pair_sigma_finite S T ..
   980   interpret SPT: pair_sigma_finite "S \<Otimes>\<^sub>M P" T ..
   981   interpret STP: pair_sigma_finite S "T \<Otimes>\<^sub>M P" ..
   982   interpret TPS: pair_sigma_finite "T \<Otimes>\<^sub>M P" S ..
   983   have TP: "sigma_finite_measure (T \<Otimes>\<^sub>M P)" ..
   984   have SP: "sigma_finite_measure (S \<Otimes>\<^sub>M P)" ..
   985   have YZ: "random_variable (T \<Otimes>\<^sub>M P) (\<lambda>x. (Y x, Z x))"
   986     using Pyz by (simp add: distributed_measurable)
   987   
   988   from Pxz Pxyz have distr_eq: "distr M (S \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Z x)) =
   989     distr (distr M (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x))) (S \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). (x, z))"
   990     by (simp add: comp_def distr_distr)
   991 
   992   have "mutual_information b S P X Z =
   993     (\<integral>x. Pxz x * log b (Pxz x / (Px (fst x) * Pz (snd x))) \<partial>(S \<Otimes>\<^sub>M P))"
   994     by (rule mutual_information_distr[OF S P Px Pz Pxz])
   995   also have "\<dots> = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))"
   996     using b_gt_1 Pxz Px Pz
   997     by (subst distributed_transform_integral[OF Pxyz Pxz, where T="\<lambda>(x, y, z). (x, z)"]) (auto simp: split_beta')
   998   finally have mi_eq:
   999     "mutual_information b S P X Z = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" .
  1000   
  1001   have ae1: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Px (fst x) = 0 \<longrightarrow> Pxyz x = 0"
  1002     by (intro subdensity_real[of fst, OF _ Pxyz Px]) auto
  1003   moreover have ae2: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
  1004     by (intro subdensity_real[of "\<lambda>x. snd (snd x)", OF _ Pxyz Pz]) auto
  1005   moreover have ae3: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
  1006     by (intro subdensity_real[of "\<lambda>x. (fst x, snd (snd x))", OF _ Pxyz Pxz]) auto
  1007   moreover have ae4: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
  1008     by (intro subdensity_real[of snd, OF _ Pxyz Pyz]) auto
  1009   moreover have ae5: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Px (fst x)"
  1010     using Px by (intro STP.AE_pair_measure) (auto simp: comp_def dest: distributed_real_AE)
  1011   moreover have ae6: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pyz (snd x)"
  1012     using Pyz by (intro STP.AE_pair_measure) (auto simp: comp_def dest: distributed_real_AE)
  1013   moreover have ae7: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pz (snd (snd x))"
  1014     using Pz Pz[THEN distributed_real_measurable]
  1015     by (auto intro!: TP.AE_pair_measure STP.AE_pair_measure AE_I2[of S] dest: distributed_real_AE)
  1016   moreover have ae8: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pxz (fst x, snd (snd x))"
  1017     using Pxz[THEN distributed_real_AE, THEN SP.AE_pair]
  1018     by (auto intro!: TP.AE_pair_measure STP.AE_pair_measure)
  1019   moreover note Pxyz[THEN distributed_real_AE]
  1020   ultimately have ae: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P.
  1021     Pxyz x * log b (Pxyz x / (Px (fst x) * Pyz (snd x))) -
  1022     Pxyz x * log b (Pxz (fst x, snd (snd x)) / (Px (fst x) * Pz (snd (snd x)))) =
  1023     Pxyz x * log b (Pxyz x * Pz (snd (snd x)) / (Pxz (fst x, snd (snd x)) * Pyz (snd x))) "
  1024   proof eventually_elim
  1025     case (goal1 x)
  1026     show ?case
  1027     proof cases
  1028       assume "Pxyz x \<noteq> 0"
  1029       with goal1 have "0 < Px (fst x)" "0 < Pz (snd (snd x))" "0 < Pxz (fst x, snd (snd x))" "0 < Pyz (snd x)" "0 < Pxyz x"
  1030         by auto
  1031       then show ?thesis
  1032         using b_gt_1 by (simp add: log_simps less_imp_le field_simps)
  1033     qed simp
  1034   qed
  1035   with I1 I2 show ?eq
  1036     unfolding conditional_mutual_information_def
  1037     apply (subst mi_eq)
  1038     apply (subst mutual_information_distr[OF S TP Px Pyz Pxyz])
  1039     apply (subst integral_diff[symmetric])
  1040     apply (auto intro!: integral_cong_AE simp: split_beta' simp del: integral_diff)
  1041     done
  1042 
  1043   let ?P = "density (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) Pxyz"
  1044   interpret P: prob_space ?P
  1045     unfolding distributed_distr_eq_density[OF Pxyz, symmetric]
  1046     by (rule prob_space_distr) simp
  1047 
  1048   let ?Q = "density (T \<Otimes>\<^sub>M P) Pyz"
  1049   interpret Q: prob_space ?Q
  1050     unfolding distributed_distr_eq_density[OF Pyz, symmetric]
  1051     by (rule prob_space_distr) simp
  1052 
  1053   let ?f = "\<lambda>(x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) / Pxyz (x, y, z)"
  1054 
  1055   from subdensity_real[of snd, OF _ Pyz Pz]
  1056   have aeX1: "AE x in T \<Otimes>\<^sub>M P. Pz (snd x) = 0 \<longrightarrow> Pyz x = 0" by (auto simp: comp_def)
  1057   have aeX2: "AE x in T \<Otimes>\<^sub>M P. 0 \<le> Pz (snd x)"
  1058     using Pz by (intro TP.AE_pair_measure) (auto simp: comp_def dest: distributed_real_AE)
  1059 
  1060   have aeX3: "AE y in T \<Otimes>\<^sub>M P. (\<integral>\<^sup>+ x. ereal (Pxz (x, snd y)) \<partial>S) = ereal (Pz (snd y))"
  1061     using Pz distributed_marginal_eq_joint2[OF P S Pz Pxz]
  1062     by (intro TP.AE_pair_measure) (auto dest: distributed_real_AE)
  1063 
  1064   have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> (\<integral>\<^sup>+ (x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))"
  1065     apply (subst nn_integral_density)
  1066     apply simp
  1067     apply (rule distributed_AE[OF Pxyz])
  1068     apply auto []
  1069     apply (rule nn_integral_mono_AE)
  1070     using ae5 ae6 ae7 ae8
  1071     apply eventually_elim
  1072     apply auto
  1073     done
  1074   also have "\<dots> = (\<integral>\<^sup>+(y, z). \<integral>\<^sup>+ x. ereal (Pxz (x, z)) * ereal (Pyz (y, z) / Pz z) \<partial>S \<partial>T \<Otimes>\<^sub>M P)"
  1075     by (subst STP.nn_integral_snd[symmetric]) (auto simp add: split_beta')
  1076   also have "\<dots> = (\<integral>\<^sup>+x. ereal (Pyz x) * 1 \<partial>T \<Otimes>\<^sub>M P)"
  1077     apply (rule nn_integral_cong_AE)
  1078     using aeX1 aeX2 aeX3 distributed_AE[OF Pyz] AE_space
  1079     apply eventually_elim
  1080   proof (case_tac x, simp del: times_ereal.simps add: space_pair_measure)
  1081     fix a b assume "Pz b = 0 \<longrightarrow> Pyz (a, b) = 0" "0 \<le> Pz b" "a \<in> space T \<and> b \<in> space P"
  1082       "(\<integral>\<^sup>+ x. ereal (Pxz (x, b)) \<partial>S) = ereal (Pz b)" "0 \<le> Pyz (a, b)" 
  1083     then show "(\<integral>\<^sup>+ x. ereal (Pxz (x, b)) * ereal (Pyz (a, b) / Pz b) \<partial>S) = ereal (Pyz (a, b))"
  1084       by (subst nn_integral_multc)
  1085          (auto split: prod.split)
  1086   qed
  1087   also have "\<dots> = 1"
  1088     using Q.emeasure_space_1 distributed_AE[OF Pyz] distributed_distr_eq_density[OF Pyz]
  1089     by (subst nn_integral_density[symmetric]) auto
  1090   finally have le1: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> 1" .
  1091   also have "\<dots> < \<infinity>" by simp
  1092   finally have fin: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>" by simp
  1093 
  1094   have pos: "(\<integral>\<^sup>+x. ?f x \<partial>?P) \<noteq> 0"
  1095     apply (subst nn_integral_density)
  1096     apply simp
  1097     apply (rule distributed_AE[OF Pxyz])
  1098     apply auto []
  1099     apply (simp add: split_beta')
  1100   proof
  1101     let ?g = "\<lambda>x. ereal (if Pxyz x = 0 then 0 else Pxz (fst x, snd (snd x)) * Pyz (snd x) / Pz (snd (snd x)))"
  1102     assume "(\<integral>\<^sup>+x. ?g x \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)) = 0"
  1103     then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. ?g x \<le> 0"
  1104       by (intro nn_integral_0_iff_AE[THEN iffD1]) auto
  1105     then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxyz x = 0"
  1106       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
  1107       by eventually_elim (auto split: split_if_asm simp: mult_le_0_iff divide_le_0_iff)
  1108     then have "(\<integral>\<^sup>+ x. ereal (Pxyz x) \<partial>S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) = 0"
  1109       by (subst nn_integral_cong_AE[of _ "\<lambda>x. 0"]) auto
  1110     with P.emeasure_space_1 show False
  1111       by (subst (asm) emeasure_density) (auto cong: nn_integral_cong)
  1112   qed
  1113 
  1114   have neg: "(\<integral>\<^sup>+ x. - ?f x \<partial>?P) = 0"
  1115     apply (rule nn_integral_0_iff_AE[THEN iffD2])
  1116     apply simp
  1117     apply (subst AE_density)
  1118     apply simp
  1119     using ae5 ae6 ae7 ae8
  1120     apply eventually_elim
  1121     apply auto
  1122     done
  1123 
  1124   have I3: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
  1125     apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ integrable_diff[OF I1 I2]])
  1126     using ae
  1127     apply (auto simp: split_beta')
  1128     done
  1129 
  1130   have "- log b 1 \<le> - log b (integral\<^sup>L ?P ?f)"
  1131   proof (intro le_imp_neg_le log_le[OF b_gt_1])
  1132     have If: "integrable ?P ?f"
  1133       unfolding real_integrable_def
  1134     proof (intro conjI)
  1135       from neg show "(\<integral>\<^sup>+ x. - ?f x \<partial>?P) \<noteq> \<infinity>"
  1136         by simp
  1137       from fin show "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>"
  1138         by simp
  1139     qed simp
  1140     then have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) = (\<integral>x. ?f x \<partial>?P)"
  1141       apply (rule nn_integral_eq_integral)
  1142       apply (subst AE_density)
  1143       apply simp
  1144       using ae5 ae6 ae7 ae8
  1145       apply eventually_elim
  1146       apply auto
  1147       done
  1148     with nn_integral_nonneg[of ?P ?f] pos le1
  1149     show "0 < (\<integral>x. ?f x \<partial>?P)" "(\<integral>x. ?f x \<partial>?P) \<le> 1"
  1150       by (simp_all add: one_ereal_def)
  1151   qed
  1152   also have "- log b (integral\<^sup>L ?P ?f) \<le> (\<integral> x. - log b (?f x) \<partial>?P)"
  1153   proof (rule P.jensens_inequality[where a=0 and b=1 and I="{0<..}"])
  1154     show "AE x in ?P. ?f x \<in> {0<..}"
  1155       unfolding AE_density[OF distributed_borel_measurable[OF Pxyz]]
  1156       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
  1157       by eventually_elim (auto)
  1158     show "integrable ?P ?f"
  1159       unfolding real_integrable_def 
  1160       using fin neg by (auto simp: split_beta')
  1161     show "integrable ?P (\<lambda>x. - log b (?f x))"
  1162       apply (subst integrable_real_density)
  1163       apply simp
  1164       apply (auto intro!: distributed_real_AE[OF Pxyz]) []
  1165       apply simp
  1166       apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ I3])
  1167       apply simp
  1168       apply simp
  1169       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
  1170       apply eventually_elim
  1171       apply (auto simp: log_divide_eq log_mult_eq zero_le_mult_iff zero_less_mult_iff zero_less_divide_iff field_simps)
  1172       done
  1173   qed (auto simp: b_gt_1 minus_log_convex)
  1174   also have "\<dots> = conditional_mutual_information b S T P X Y Z"
  1175     unfolding `?eq`
  1176     apply (subst integral_real_density)
  1177     apply simp
  1178     apply (auto intro!: distributed_real_AE[OF Pxyz]) []
  1179     apply simp
  1180     apply (intro integral_cong_AE)
  1181     using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
  1182     apply (auto simp: log_divide_eq zero_less_mult_iff zero_less_divide_iff field_simps)
  1183     done
  1184   finally show ?nonneg
  1185     by simp
  1186 qed
  1187 
  1188 lemma (in information_space)
  1189   fixes Px :: "_ \<Rightarrow> real"
  1190   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" and P: "sigma_finite_measure P"
  1191   assumes Fx: "finite_entropy S X Px"
  1192   assumes Fz: "finite_entropy P Z Pz"
  1193   assumes Fyz: "finite_entropy (T \<Otimes>\<^sub>M P) (\<lambda>x. (Y x, Z x)) Pyz"
  1194   assumes Fxz: "finite_entropy (S \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Z x)) Pxz"
  1195   assumes Fxyz: "finite_entropy (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x)) Pxyz"
  1196   shows conditional_mutual_information_generic_eq': "conditional_mutual_information b S T P X Y Z
  1197     = (\<integral>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" (is "?eq")
  1198     and conditional_mutual_information_generic_nonneg': "0 \<le> conditional_mutual_information b S T P X Y Z" (is "?nonneg")
  1199 proof -
  1200   note Px = Fx[THEN finite_entropy_distributed, measurable]
  1201   note Pz = Fz[THEN finite_entropy_distributed, measurable]
  1202   note Pyz = Fyz[THEN finite_entropy_distributed, measurable]
  1203   note Pxz = Fxz[THEN finite_entropy_distributed, measurable]
  1204   note Pxyz = Fxyz[THEN finite_entropy_distributed, measurable]
  1205 
  1206   interpret S: sigma_finite_measure S by fact
  1207   interpret T: sigma_finite_measure T by fact
  1208   interpret P: sigma_finite_measure P by fact
  1209   interpret TP: pair_sigma_finite T P ..
  1210   interpret SP: pair_sigma_finite S P ..
  1211   interpret ST: pair_sigma_finite S T ..
  1212   interpret SPT: pair_sigma_finite "S \<Otimes>\<^sub>M P" T ..
  1213   interpret STP: pair_sigma_finite S "T \<Otimes>\<^sub>M P" ..
  1214   interpret TPS: pair_sigma_finite "T \<Otimes>\<^sub>M P" S ..
  1215   have TP: "sigma_finite_measure (T \<Otimes>\<^sub>M P)" ..
  1216   have SP: "sigma_finite_measure (S \<Otimes>\<^sub>M P)" ..
  1217 
  1218   from Pxz Pxyz have distr_eq: "distr M (S \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Z x)) =
  1219     distr (distr M (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x))) (S \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). (x, z))"
  1220     by (simp add: distr_distr comp_def)
  1221 
  1222   have "mutual_information b S P X Z =
  1223     (\<integral>x. Pxz x * log b (Pxz x / (Px (fst x) * Pz (snd x))) \<partial>(S \<Otimes>\<^sub>M P))"
  1224     by (rule mutual_information_distr[OF S P Px Pz Pxz])
  1225   also have "\<dots> = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))"
  1226     using b_gt_1 Pxz Px Pz
  1227     by (subst distributed_transform_integral[OF Pxyz Pxz, where T="\<lambda>(x, y, z). (x, z)"])
  1228        (auto simp: split_beta')
  1229   finally have mi_eq:
  1230     "mutual_information b S P X Z = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" .
  1231   
  1232   have ae1: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Px (fst x) = 0 \<longrightarrow> Pxyz x = 0"
  1233     by (intro subdensity_real[of fst, OF _ Pxyz Px]) auto
  1234   moreover have ae2: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
  1235     by (intro subdensity_real[of "\<lambda>x. snd (snd x)", OF _ Pxyz Pz]) auto
  1236   moreover have ae3: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
  1237     by (intro subdensity_real[of "\<lambda>x. (fst x, snd (snd x))", OF _ Pxyz Pxz]) auto
  1238   moreover have ae4: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
  1239     by (intro subdensity_real[of snd, OF _ Pxyz Pyz]) auto
  1240   moreover have ae5: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Px (fst x)"
  1241     using Px by (intro STP.AE_pair_measure) (auto dest: distributed_real_AE)
  1242   moreover have ae6: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pyz (snd x)"
  1243     using Pyz by (intro STP.AE_pair_measure) (auto dest: distributed_real_AE)
  1244   moreover have ae7: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pz (snd (snd x))"
  1245     using Pz Pz[THEN distributed_real_measurable] by (auto intro!: TP.AE_pair_measure STP.AE_pair_measure AE_I2[of S] dest: distributed_real_AE)
  1246   moreover have ae8: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pxz (fst x, snd (snd x))"
  1247     using Pxz[THEN distributed_real_AE, THEN SP.AE_pair]
  1248     by (auto intro!: TP.AE_pair_measure STP.AE_pair_measure simp: comp_def)
  1249   moreover note ae9 = Pxyz[THEN distributed_real_AE]
  1250   ultimately have ae: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P.
  1251     Pxyz x * log b (Pxyz x / (Px (fst x) * Pyz (snd x))) -
  1252     Pxyz x * log b (Pxz (fst x, snd (snd x)) / (Px (fst x) * Pz (snd (snd x)))) =
  1253     Pxyz x * log b (Pxyz x * Pz (snd (snd x)) / (Pxz (fst x, snd (snd x)) * Pyz (snd x))) "
  1254   proof eventually_elim
  1255     case (goal1 x)
  1256     show ?case
  1257     proof cases
  1258       assume "Pxyz x \<noteq> 0"
  1259       with goal1 have "0 < Px (fst x)" "0 < Pz (snd (snd x))" "0 < Pxz (fst x, snd (snd x))" "0 < Pyz (snd x)" "0 < Pxyz x"
  1260         by auto
  1261       then show ?thesis
  1262         using b_gt_1 by (simp add: log_simps less_imp_le field_simps)
  1263     qed simp
  1264   qed
  1265 
  1266   have "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)
  1267     (\<lambda>x. Pxyz x * log b (Pxyz x) - Pxyz x * log b (Px (fst x)) - Pxyz x * log b (Pyz (snd x)))"
  1268     using finite_entropy_integrable[OF Fxyz]
  1269     using finite_entropy_integrable_transform[OF Fx Pxyz, of fst]
  1270     using finite_entropy_integrable_transform[OF Fyz Pxyz, of snd]
  1271     by simp
  1272   moreover have "(\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z)))) \<in> borel_measurable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)"
  1273     using Pxyz Px Pyz by simp
  1274   ultimately have I1: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z))))"
  1275     apply (rule integrable_cong_AE_imp)
  1276     using ae1 ae4 ae5 ae6 ae9
  1277     by eventually_elim
  1278        (auto simp: log_divide_eq log_mult_eq field_simps zero_less_mult_iff)
  1279 
  1280   have "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)
  1281     (\<lambda>x. Pxyz x * log b (Pxz (fst x, snd (snd x))) - Pxyz x * log b (Px (fst x)) - Pxyz x * log b (Pz (snd (snd x))))"
  1282     using finite_entropy_integrable_transform[OF Fxz Pxyz, of "\<lambda>x. (fst x, snd (snd x))"]
  1283     using finite_entropy_integrable_transform[OF Fx Pxyz, of fst]
  1284     using finite_entropy_integrable_transform[OF Fz Pxyz, of "snd \<circ> snd"]
  1285     by simp
  1286   moreover have "(\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z))) \<in> borel_measurable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)"
  1287     using Pxyz Px Pz
  1288     by auto
  1289   ultimately have I2: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z)))"
  1290     apply (rule integrable_cong_AE_imp)
  1291     using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 ae9
  1292     by eventually_elim
  1293        (auto simp: log_divide_eq log_mult_eq field_simps zero_less_mult_iff)
  1294 
  1295   from ae I1 I2 show ?eq
  1296     unfolding conditional_mutual_information_def
  1297     apply (subst mi_eq)
  1298     apply (subst mutual_information_distr[OF S TP Px Pyz Pxyz])
  1299     apply (subst integral_diff[symmetric])
  1300     apply (auto intro!: integral_cong_AE simp: split_beta' simp del: integral_diff)
  1301     done
  1302 
  1303   let ?P = "density (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) Pxyz"
  1304   interpret P: prob_space ?P
  1305     unfolding distributed_distr_eq_density[OF Pxyz, symmetric] by (rule prob_space_distr) simp
  1306 
  1307   let ?Q = "density (T \<Otimes>\<^sub>M P) Pyz"
  1308   interpret Q: prob_space ?Q
  1309     unfolding distributed_distr_eq_density[OF Pyz, symmetric] by (rule prob_space_distr) simp
  1310 
  1311   let ?f = "\<lambda>(x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) / Pxyz (x, y, z)"
  1312 
  1313   from subdensity_real[of snd, OF _ Pyz Pz]
  1314   have aeX1: "AE x in T \<Otimes>\<^sub>M P. Pz (snd x) = 0 \<longrightarrow> Pyz x = 0" by (auto simp: comp_def)
  1315   have aeX2: "AE x in T \<Otimes>\<^sub>M P. 0 \<le> Pz (snd x)"
  1316     using Pz by (intro TP.AE_pair_measure) (auto dest: distributed_real_AE)
  1317 
  1318   have aeX3: "AE y in T \<Otimes>\<^sub>M P. (\<integral>\<^sup>+ x. ereal (Pxz (x, snd y)) \<partial>S) = ereal (Pz (snd y))"
  1319     using Pz distributed_marginal_eq_joint2[OF P S Pz Pxz]
  1320     by (intro TP.AE_pair_measure) (auto dest: distributed_real_AE)
  1321   have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> (\<integral>\<^sup>+ (x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))"
  1322     apply (subst nn_integral_density)
  1323     apply (rule distributed_borel_measurable[OF Pxyz])
  1324     apply (rule distributed_AE[OF Pxyz])
  1325     apply simp
  1326     apply (rule nn_integral_mono_AE)
  1327     using ae5 ae6 ae7 ae8
  1328     apply eventually_elim
  1329     apply auto
  1330     done
  1331   also have "\<dots> = (\<integral>\<^sup>+(y, z). \<integral>\<^sup>+ x. ereal (Pxz (x, z)) * ereal (Pyz (y, z) / Pz z) \<partial>S \<partial>T \<Otimes>\<^sub>M P)"
  1332     by (subst STP.nn_integral_snd[symmetric]) (auto simp add: split_beta')
  1333   also have "\<dots> = (\<integral>\<^sup>+x. ereal (Pyz x) * 1 \<partial>T \<Otimes>\<^sub>M P)"
  1334     apply (rule nn_integral_cong_AE)
  1335     using aeX1 aeX2 aeX3 distributed_AE[OF Pyz] AE_space
  1336     apply eventually_elim
  1337   proof (case_tac x, simp del: times_ereal.simps add: space_pair_measure)
  1338     fix a b assume "Pz b = 0 \<longrightarrow> Pyz (a, b) = 0" "0 \<le> Pz b" "a \<in> space T \<and> b \<in> space P"
  1339       "(\<integral>\<^sup>+ x. ereal (Pxz (x, b)) \<partial>S) = ereal (Pz b)" "0 \<le> Pyz (a, b)" 
  1340     then show "(\<integral>\<^sup>+ x. ereal (Pxz (x, b)) * ereal (Pyz (a, b) / Pz b) \<partial>S) = ereal (Pyz (a, b))"
  1341       by (subst nn_integral_multc) auto
  1342   qed
  1343   also have "\<dots> = 1"
  1344     using Q.emeasure_space_1 distributed_AE[OF Pyz] distributed_distr_eq_density[OF Pyz]
  1345     by (subst nn_integral_density[symmetric]) auto
  1346   finally have le1: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> 1" .
  1347   also have "\<dots> < \<infinity>" by simp
  1348   finally have fin: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>" by simp
  1349 
  1350   have pos: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> 0"
  1351     apply (subst nn_integral_density)
  1352     apply simp
  1353     apply (rule distributed_AE[OF Pxyz])
  1354     apply simp
  1355     apply (simp add: split_beta')
  1356   proof
  1357     let ?g = "\<lambda>x. ereal (if Pxyz x = 0 then 0 else Pxz (fst x, snd (snd x)) * Pyz (snd x) / Pz (snd (snd x)))"
  1358     assume "(\<integral>\<^sup>+ x. ?g x \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)) = 0"
  1359     then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. ?g x \<le> 0"
  1360       by (intro nn_integral_0_iff_AE[THEN iffD1]) (auto intro!: borel_measurable_ereal measurable_If)
  1361     then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxyz x = 0"
  1362       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
  1363       by eventually_elim (auto split: split_if_asm simp: mult_le_0_iff divide_le_0_iff)
  1364     then have "(\<integral>\<^sup>+ x. ereal (Pxyz x) \<partial>S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) = 0"
  1365       by (subst nn_integral_cong_AE[of _ "\<lambda>x. 0"]) auto
  1366     with P.emeasure_space_1 show False
  1367       by (subst (asm) emeasure_density) (auto cong: nn_integral_cong)
  1368   qed
  1369 
  1370   have neg: "(\<integral>\<^sup>+ x. - ?f x \<partial>?P) = 0"
  1371     apply (rule nn_integral_0_iff_AE[THEN iffD2])
  1372     apply (auto simp: split_beta') []
  1373     apply (subst AE_density)
  1374     apply (auto simp: split_beta') []
  1375     using ae5 ae6 ae7 ae8
  1376     apply eventually_elim
  1377     apply auto
  1378     done
  1379 
  1380   have I3: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
  1381     apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ integrable_diff[OF I1 I2]])
  1382     using ae
  1383     apply (auto simp: split_beta')
  1384     done
  1385 
  1386   have "- log b 1 \<le> - log b (integral\<^sup>L ?P ?f)"
  1387   proof (intro le_imp_neg_le log_le[OF b_gt_1])
  1388     have If: "integrable ?P ?f"
  1389       unfolding real_integrable_def
  1390     proof (intro conjI)
  1391       from neg show "(\<integral>\<^sup>+ x. - ?f x \<partial>?P) \<noteq> \<infinity>"
  1392         by simp
  1393       from fin show "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>"
  1394         by simp
  1395     qed simp
  1396     then have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) = (\<integral>x. ?f x \<partial>?P)"
  1397       apply (rule nn_integral_eq_integral)
  1398       apply (subst AE_density)
  1399       apply simp
  1400       using ae5 ae6 ae7 ae8
  1401       apply eventually_elim
  1402       apply auto
  1403       done
  1404     with nn_integral_nonneg[of ?P ?f] pos le1
  1405     show "0 < (\<integral>x. ?f x \<partial>?P)" "(\<integral>x. ?f x \<partial>?P) \<le> 1"
  1406       by (simp_all add: one_ereal_def)
  1407   qed
  1408   also have "- log b (integral\<^sup>L ?P ?f) \<le> (\<integral> x. - log b (?f x) \<partial>?P)"
  1409   proof (rule P.jensens_inequality[where a=0 and b=1 and I="{0<..}"])
  1410     show "AE x in ?P. ?f x \<in> {0<..}"
  1411       unfolding AE_density[OF distributed_borel_measurable[OF Pxyz]]
  1412       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
  1413       by eventually_elim (auto)
  1414     show "integrable ?P ?f"
  1415       unfolding real_integrable_def
  1416       using fin neg by (auto simp: split_beta')
  1417     show "integrable ?P (\<lambda>x. - log b (?f x))"
  1418       apply (subst integrable_real_density)
  1419       apply simp
  1420       apply (auto intro!: distributed_real_AE[OF Pxyz]) []
  1421       apply simp
  1422       apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ I3])
  1423       apply simp
  1424       apply simp
  1425       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
  1426       apply eventually_elim
  1427       apply (auto simp: log_divide_eq log_mult_eq zero_le_mult_iff zero_less_mult_iff zero_less_divide_iff field_simps)
  1428       done
  1429   qed (auto simp: b_gt_1 minus_log_convex)
  1430   also have "\<dots> = conditional_mutual_information b S T P X Y Z"
  1431     unfolding `?eq`
  1432     apply (subst integral_real_density)
  1433     apply simp
  1434     apply (auto intro!: distributed_real_AE[OF Pxyz]) []
  1435     apply simp
  1436     apply (intro integral_cong_AE)
  1437     using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
  1438     apply (auto simp: log_divide_eq zero_less_mult_iff zero_less_divide_iff field_simps)
  1439     done
  1440   finally show ?nonneg
  1441     by simp
  1442 qed
  1443 
  1444 lemma (in information_space) conditional_mutual_information_eq:
  1445   assumes Pz: "simple_distributed M Z Pz"
  1446   assumes Pyz: "simple_distributed M (\<lambda>x. (Y x, Z x)) Pyz"
  1447   assumes Pxz: "simple_distributed M (\<lambda>x. (X x, Z x)) Pxz"
  1448   assumes Pxyz: "simple_distributed M (\<lambda>x. (X x, Y x, Z x)) Pxyz"
  1449   shows "\<I>(X ; Y | Z) =
  1450    (\<Sum>(x, y, z)\<in>(\<lambda>x. (X x, Y x, Z x))`space M. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
  1451 proof (subst conditional_mutual_information_generic_eq[OF _ _ _ _
  1452     simple_distributed[OF Pz] simple_distributed_joint[OF Pyz] simple_distributed_joint[OF Pxz]
  1453     simple_distributed_joint2[OF Pxyz]])
  1454   note simple_distributed_joint2_finite[OF Pxyz, simp]
  1455   show "sigma_finite_measure (count_space (X ` space M))"
  1456     by (simp add: sigma_finite_measure_count_space_finite)
  1457   show "sigma_finite_measure (count_space (Y ` space M))"
  1458     by (simp add: sigma_finite_measure_count_space_finite)
  1459   show "sigma_finite_measure (count_space (Z ` space M))"
  1460     by (simp add: sigma_finite_measure_count_space_finite)
  1461   have "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) \<Otimes>\<^sub>M count_space (Z ` space M) =
  1462       count_space (X`space M \<times> Y`space M \<times> Z`space M)"
  1463     (is "?P = ?C")
  1464     by (simp add: pair_measure_count_space)
  1465 
  1466   let ?Px = "\<lambda>x. measure M (X -` {x} \<inter> space M)"
  1467   have "(\<lambda>x. (X x, Z x)) \<in> measurable M (count_space (X ` space M) \<Otimes>\<^sub>M count_space (Z ` space M))"
  1468     using simple_distributed_joint[OF Pxz] by (rule distributed_measurable)
  1469   from measurable_comp[OF this measurable_fst]
  1470   have "random_variable (count_space (X ` space M)) X"
  1471     by (simp add: comp_def)
  1472   then have "simple_function M X"    
  1473     unfolding simple_function_def by (auto simp: measurable_count_space_eq2)
  1474   then have "simple_distributed M X ?Px"
  1475     by (rule simple_distributedI) auto
  1476   then show "distributed M (count_space (X ` space M)) X ?Px"
  1477     by (rule simple_distributed)
  1478 
  1479   let ?f = "(\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x, Z x)) ` space M then Pxyz x else 0)"
  1480   let ?g = "(\<lambda>x. if x \<in> (\<lambda>x. (Y x, Z x)) ` space M then Pyz x else 0)"
  1481   let ?h = "(\<lambda>x. if x \<in> (\<lambda>x. (X x, Z x)) ` space M then Pxz x else 0)"
  1482   show
  1483       "integrable ?P (\<lambda>(x, y, z). ?f (x, y, z) * log b (?f (x, y, z) / (?Px x * ?g (y, z))))"
  1484       "integrable ?P (\<lambda>(x, y, z). ?f (x, y, z) * log b (?h (x, z) / (?Px x * Pz z)))"
  1485     by (auto intro!: integrable_count_space simp: pair_measure_count_space)
  1486   let ?i = "\<lambda>x y z. ?f (x, y, z) * log b (?f (x, y, z) / (?h (x, z) * (?g (y, z) / Pz z)))"
  1487   let ?j = "\<lambda>x y z. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z)))"
  1488   have "(\<lambda>(x, y, z). ?i x y z) = (\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x, Z x)) ` space M then ?j (fst x) (fst (snd x)) (snd (snd x)) else 0)"
  1489     by (auto intro!: ext)
  1490   then show "(\<integral> (x, y, z). ?i x y z \<partial>?P) = (\<Sum>(x, y, z)\<in>(\<lambda>x. (X x, Y x, Z x)) ` space M. ?j x y z)"
  1491     by (auto intro!: setsum.cong simp add: `?P = ?C` lebesgue_integral_count_space_finite simple_distributed_finite setsum.If_cases split_beta')
  1492 qed
  1493 
  1494 lemma (in information_space) conditional_mutual_information_nonneg:
  1495   assumes X: "simple_function M X" and Y: "simple_function M Y" and Z: "simple_function M Z"
  1496   shows "0 \<le> \<I>(X ; Y | Z)"
  1497 proof -
  1498   have [simp]: "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) \<Otimes>\<^sub>M count_space (Z ` space M) =
  1499       count_space (X`space M \<times> Y`space M \<times> Z`space M)"
  1500     by (simp add: pair_measure_count_space X Y Z simple_functionD)
  1501   note sf = sigma_finite_measure_count_space_finite[OF simple_functionD(1)]
  1502   note sd = simple_distributedI[OF _ refl]
  1503   note sp = simple_function_Pair
  1504   show ?thesis
  1505    apply (rule conditional_mutual_information_generic_nonneg[OF sf[OF X] sf[OF Y] sf[OF Z]])
  1506    apply (rule simple_distributed[OF sd[OF X]])
  1507    apply (rule simple_distributed[OF sd[OF Z]])
  1508    apply (rule simple_distributed_joint[OF sd[OF sp[OF Y Z]]])
  1509    apply (rule simple_distributed_joint[OF sd[OF sp[OF X Z]]])
  1510    apply (rule simple_distributed_joint2[OF sd[OF sp[OF X sp[OF Y Z]]]])
  1511    apply (auto intro!: integrable_count_space simp: X Y Z simple_functionD)
  1512    done
  1513 qed
  1514 
  1515 subsection {* Conditional Entropy *}
  1516 
  1517 definition (in prob_space)
  1518   "conditional_entropy b S T X Y = - (\<integral>(x, y). log b (real (RN_deriv (S \<Otimes>\<^sub>M T) (distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))) (x, y)) / 
  1519     real (RN_deriv T (distr M T Y) y)) \<partial>distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)))"
  1520 
  1521 abbreviation (in information_space)
  1522   conditional_entropy_Pow ("\<H>'(_ | _')") where
  1523   "\<H>(X | Y) \<equiv> conditional_entropy b (count_space (X`space M)) (count_space (Y`space M)) X Y"
  1524 
  1525 lemma (in information_space) conditional_entropy_generic_eq:
  1526   fixes Pxy :: "_ \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
  1527   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  1528   assumes Py[measurable]: "distributed M T Y Py"
  1529   assumes Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
  1530   shows "conditional_entropy b S T X Y = - (\<integral>(x, y). Pxy (x, y) * log b (Pxy (x, y) / Py y) \<partial>(S \<Otimes>\<^sub>M T))"
  1531 proof -
  1532   interpret S: sigma_finite_measure S by fact
  1533   interpret T: sigma_finite_measure T by fact
  1534   interpret ST: pair_sigma_finite S T ..
  1535 
  1536   have "AE x in density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Pxy x)). Pxy x = real (RN_deriv (S \<Otimes>\<^sub>M T) (distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))) x)"
  1537     unfolding AE_density[OF distributed_borel_measurable, OF Pxy]
  1538     unfolding distributed_distr_eq_density[OF Pxy]
  1539     using distributed_RN_deriv[OF Pxy]
  1540     by auto
  1541   moreover
  1542   have "AE x in density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Pxy x)). Py (snd x) = real (RN_deriv T (distr M T Y) (snd x))"
  1543     unfolding AE_density[OF distributed_borel_measurable, OF Pxy]
  1544     unfolding distributed_distr_eq_density[OF Py]
  1545     apply (rule ST.AE_pair_measure)
  1546     apply auto
  1547     using distributed_RN_deriv[OF Py]
  1548     apply auto
  1549     done    
  1550   ultimately
  1551   have "conditional_entropy b S T X Y = - (\<integral>x. Pxy x * log b (Pxy x / Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))"
  1552     unfolding conditional_entropy_def neg_equal_iff_equal
  1553     apply (subst integral_real_density[symmetric])
  1554     apply (auto simp: distributed_real_AE[OF Pxy] distributed_distr_eq_density[OF Pxy]
  1555                 intro!: integral_cong_AE)
  1556     done
  1557   then show ?thesis by (simp add: split_beta')
  1558 qed
  1559 
  1560 lemma (in information_space) conditional_entropy_eq_entropy:
  1561   fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
  1562   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  1563   assumes Py[measurable]: "distributed M T Y Py"
  1564   assumes Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
  1565   assumes I1: "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x))"
  1566   assumes I2: "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Py (snd x)))"
  1567   shows "conditional_entropy b S T X Y = entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) - entropy b T Y"
  1568 proof -
  1569   interpret S: sigma_finite_measure S by fact
  1570   interpret T: sigma_finite_measure T by fact
  1571   interpret ST: pair_sigma_finite S T ..
  1572 
  1573   have "entropy b T Y = - (\<integral>y. Py y * log b (Py y) \<partial>T)"
  1574     by (rule entropy_distr[OF Py])
  1575   also have "\<dots> = - (\<integral>(x,y). Pxy (x,y) * log b (Py y) \<partial>(S \<Otimes>\<^sub>M T))"
  1576     using b_gt_1 Py[THEN distributed_real_measurable]
  1577     by (subst distributed_transform_integral[OF Pxy Py, where T=snd]) (auto intro!: integral_cong)
  1578   finally have e_eq: "entropy b T Y = - (\<integral>(x,y). Pxy (x,y) * log b (Py y) \<partial>(S \<Otimes>\<^sub>M T))" .
  1579 
  1580   have ae2: "AE x in S \<Otimes>\<^sub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
  1581     by (intro subdensity_real[of snd, OF _ Pxy Py]) (auto intro: measurable_Pair)
  1582   moreover have ae4: "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Py (snd x)"
  1583     using Py by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'' dest: distributed_real_AE distributed_real_measurable)
  1584   moreover note ae5 = Pxy[THEN distributed_real_AE]
  1585   ultimately have "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Pxy x \<and> 0 \<le> Py (snd x) \<and>
  1586     (Pxy x = 0 \<or> (Pxy x \<noteq> 0 \<longrightarrow> 0 < Pxy x \<and> 0 < Py (snd x)))"
  1587     by eventually_elim auto
  1588   then have ae: "AE x in S \<Otimes>\<^sub>M T.
  1589      Pxy x * log b (Pxy x) - Pxy x * log b (Py (snd x)) = Pxy x * log b (Pxy x / Py (snd x))"
  1590     by eventually_elim (auto simp: log_simps field_simps b_gt_1)
  1591   have "conditional_entropy b S T X Y = 
  1592     - (\<integral>x. Pxy x * log b (Pxy x) - Pxy x * log b (Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))"
  1593     unfolding conditional_entropy_generic_eq[OF S T Py Pxy] neg_equal_iff_equal
  1594     apply (intro integral_cong_AE)
  1595     using ae
  1596     apply auto
  1597     done
  1598   also have "\<dots> = - (\<integral>x. Pxy x * log b (Pxy x) \<partial>(S \<Otimes>\<^sub>M T)) - - (\<integral>x.  Pxy x * log b (Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))"
  1599     by (simp add: integral_diff[OF I1 I2])
  1600   finally show ?thesis 
  1601     unfolding conditional_entropy_generic_eq[OF S T Py Pxy] entropy_distr[OF Pxy] e_eq
  1602     by (simp add: split_beta')
  1603 qed
  1604 
  1605 lemma (in information_space) conditional_entropy_eq_entropy_simple:
  1606   assumes X: "simple_function M X" and Y: "simple_function M Y"
  1607   shows "\<H>(X | Y) = entropy b (count_space (X`space M) \<Otimes>\<^sub>M count_space (Y`space M)) (\<lambda>x. (X x, Y x)) - \<H>(Y)"
  1608 proof -
  1609   have "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X`space M \<times> Y`space M)"
  1610     (is "?P = ?C") using X Y by (simp add: simple_functionD pair_measure_count_space)
  1611   show ?thesis
  1612     by (rule conditional_entropy_eq_entropy sigma_finite_measure_count_space_finite
  1613                  simple_functionD  X Y simple_distributed simple_distributedI[OF _ refl]
  1614                  simple_distributed_joint simple_function_Pair integrable_count_space)+
  1615        (auto simp: `?P = ?C` intro!: integrable_count_space simple_functionD  X Y)
  1616 qed
  1617 
  1618 lemma (in information_space) conditional_entropy_eq:
  1619   assumes Y: "simple_distributed M Y Py"
  1620   assumes XY: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy"
  1621     shows "\<H>(X | Y) = - (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / Py y))"
  1622 proof (subst conditional_entropy_generic_eq[OF _ _
  1623   simple_distributed[OF Y] simple_distributed_joint[OF XY]])
  1624   have "finite ((\<lambda>x. (X x, Y x))`space M)"
  1625     using XY unfolding simple_distributed_def by auto
  1626   from finite_imageI[OF this, of fst]
  1627   have [simp]: "finite (X`space M)"
  1628     by (simp add: image_comp comp_def)
  1629   note Y[THEN simple_distributed_finite, simp]
  1630   show "sigma_finite_measure (count_space (X ` space M))"
  1631     by (simp add: sigma_finite_measure_count_space_finite)
  1632   show "sigma_finite_measure (count_space (Y ` space M))"
  1633     by (simp add: sigma_finite_measure_count_space_finite)
  1634   let ?f = "(\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x else 0)"
  1635   have "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X`space M \<times> Y`space M)"
  1636     (is "?P = ?C")
  1637     using Y by (simp add: simple_distributed_finite pair_measure_count_space)
  1638   have eq: "(\<lambda>(x, y). ?f (x, y) * log b (?f (x, y) / Py y)) =
  1639     (\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x * log b (Pxy x / Py (snd x)) else 0)"
  1640     by auto
  1641   from Y show "- (\<integral> (x, y). ?f (x, y) * log b (?f (x, y) / Py y) \<partial>?P) =
  1642     - (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / Py y))"
  1643     by (auto intro!: setsum.cong simp add: `?P = ?C` lebesgue_integral_count_space_finite simple_distributed_finite eq setsum.If_cases split_beta')
  1644 qed
  1645 
  1646 lemma (in information_space) conditional_mutual_information_eq_conditional_entropy:
  1647   assumes X: "simple_function M X" and Y: "simple_function M Y"
  1648   shows "\<I>(X ; X | Y) = \<H>(X | Y)"
  1649 proof -
  1650   def Py \<equiv> "\<lambda>x. if x \<in> Y`space M then measure M (Y -` {x} \<inter> space M) else 0"
  1651   def Pxy \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x))`space M then measure M ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M) else 0"
  1652   def Pxxy \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (X x, X x, Y x))`space M then measure M ((\<lambda>x. (X x, X x, Y x)) -` {x} \<inter> space M) else 0"
  1653   let ?M = "X`space M \<times> X`space M \<times> Y`space M"
  1654 
  1655   note XY = simple_function_Pair[OF X Y]
  1656   note XXY = simple_function_Pair[OF X XY]
  1657   have Py: "simple_distributed M Y Py"
  1658     using Y by (rule simple_distributedI) (auto simp: Py_def)
  1659   have Pxy: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy"
  1660     using XY by (rule simple_distributedI) (auto simp: Pxy_def)
  1661   have Pxxy: "simple_distributed M (\<lambda>x. (X x, X x, Y x)) Pxxy"
  1662     using XXY by (rule simple_distributedI) (auto simp: Pxxy_def)
  1663   have eq: "(\<lambda>x. (X x, X x, Y x)) ` space M = (\<lambda>(x, y). (x, x, y)) ` (\<lambda>x. (X x, Y x)) ` space M"
  1664     by auto
  1665   have inj: "\<And>A. inj_on (\<lambda>(x, y). (x, x, y)) A"
  1666     by (auto simp: inj_on_def)
  1667   have Pxxy_eq: "\<And>x y. Pxxy (x, x, y) = Pxy (x, y)"
  1668     by (auto simp: Pxxy_def Pxy_def intro!: arg_cong[where f=prob])
  1669   have "AE x in count_space ((\<lambda>x. (X x, Y x))`space M). Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
  1670     by (intro subdensity_real[of snd, OF _ Pxy[THEN simple_distributed] Py[THEN simple_distributed]]) (auto intro: measurable_Pair)
  1671   then show ?thesis
  1672     apply (subst conditional_mutual_information_eq[OF Py Pxy Pxy Pxxy])
  1673     apply (subst conditional_entropy_eq[OF Py Pxy])
  1674     apply (auto intro!: setsum.cong simp: Pxxy_eq setsum_negf[symmetric] eq setsum.reindex[OF inj]
  1675                 log_simps zero_less_mult_iff zero_le_mult_iff field_simps mult_less_0_iff AE_count_space)
  1676     using Py[THEN simple_distributed, THEN distributed_real_AE] Pxy[THEN simple_distributed, THEN distributed_real_AE]
  1677   apply (auto simp add: not_le[symmetric] AE_count_space)
  1678     done
  1679 qed
  1680 
  1681 lemma (in information_space) conditional_entropy_nonneg:
  1682   assumes X: "simple_function M X" and Y: "simple_function M Y" shows "0 \<le> \<H>(X | Y)"
  1683   using conditional_mutual_information_eq_conditional_entropy[OF X Y] conditional_mutual_information_nonneg[OF X X Y]
  1684   by simp
  1685 
  1686 subsection {* Equalities *}
  1687 
  1688 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy_distr:
  1689   fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" and Pxy :: "('b \<times> 'c) \<Rightarrow> real"
  1690   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  1691   assumes Px[measurable]: "distributed M S X Px" and Py[measurable]: "distributed M T Y Py"
  1692   assumes Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
  1693   assumes Ix: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Px (fst x)))"
  1694   assumes Iy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Py (snd x)))"
  1695   assumes Ixy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x))"
  1696   shows  "mutual_information b S T X Y = entropy b S X + entropy b T Y - entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))"
  1697 proof -
  1698   have X: "entropy b S X = - (\<integral>x. Pxy x * log b (Px (fst x)) \<partial>(S \<Otimes>\<^sub>M T))"
  1699     using b_gt_1 Px[THEN distributed_real_measurable]
  1700     apply (subst entropy_distr[OF Px])
  1701     apply (subst distributed_transform_integral[OF Pxy Px, where T=fst])
  1702     apply (auto intro!: integral_cong)
  1703     done
  1704 
  1705   have Y: "entropy b T Y = - (\<integral>x. Pxy x * log b (Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))"
  1706     using b_gt_1 Py[THEN distributed_real_measurable]
  1707     apply (subst entropy_distr[OF Py])
  1708     apply (subst distributed_transform_integral[OF Pxy Py, where T=snd])
  1709     apply (auto intro!: integral_cong)
  1710     done
  1711 
  1712   interpret S: sigma_finite_measure S by fact
  1713   interpret T: sigma_finite_measure T by fact
  1714   interpret ST: pair_sigma_finite S T ..
  1715   have ST: "sigma_finite_measure (S \<Otimes>\<^sub>M T)" ..
  1716 
  1717   have XY: "entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) = - (\<integral>x. Pxy x * log b (Pxy x) \<partial>(S \<Otimes>\<^sub>M T))"
  1718     by (subst entropy_distr[OF Pxy]) (auto intro!: integral_cong)
  1719   
  1720   have "AE x in S \<Otimes>\<^sub>M T. Px (fst x) = 0 \<longrightarrow> Pxy x = 0"
  1721     by (intro subdensity_real[of fst, OF _ Pxy Px]) (auto intro: measurable_Pair)
  1722   moreover have "AE x in S \<Otimes>\<^sub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
  1723     by (intro subdensity_real[of snd, OF _ Pxy Py]) (auto intro: measurable_Pair)
  1724   moreover have "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Px (fst x)"
  1725     using Px by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_fst'' dest: distributed_real_AE distributed_real_measurable)
  1726   moreover have "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Py (snd x)"
  1727     using Py by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'' dest: distributed_real_AE distributed_real_measurable)
  1728   moreover note Pxy[THEN distributed_real_AE]
  1729   ultimately have "AE x in S \<Otimes>\<^sub>M T. Pxy x * log b (Pxy x) - Pxy x * log b (Px (fst x)) - Pxy x * log b (Py (snd x)) = 
  1730     Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))"
  1731     (is "AE x in _. ?f x = ?g x")
  1732   proof eventually_elim
  1733     case (goal1 x)
  1734     show ?case
  1735     proof cases
  1736       assume "Pxy x \<noteq> 0"
  1737       with goal1 have "0 < Px (fst x)" "0 < Py (snd x)" "0 < Pxy x"
  1738         by auto
  1739       then show ?thesis
  1740         using b_gt_1 by (simp add: log_simps less_imp_le field_simps)
  1741     qed simp
  1742   qed
  1743 
  1744   have "entropy b S X + entropy b T Y - entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) = integral\<^sup>L (S \<Otimes>\<^sub>M T) ?f"
  1745     unfolding X Y XY
  1746     apply (subst integral_diff)
  1747     apply (intro integrable_diff Ixy Ix Iy)+
  1748     apply (subst integral_diff)
  1749     apply (intro Ixy Ix Iy)+
  1750     apply (simp add: field_simps)
  1751     done
  1752   also have "\<dots> = integral\<^sup>L (S \<Otimes>\<^sub>M T) ?g"
  1753     using `AE x in _. ?f x = ?g x` by (intro integral_cong_AE) auto
  1754   also have "\<dots> = mutual_information b S T X Y"
  1755     by (rule mutual_information_distr[OF S T Px Py Pxy, symmetric])
  1756   finally show ?thesis ..
  1757 qed
  1758 
  1759 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy':
  1760   fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" and Pxy :: "('b \<times> 'c) \<Rightarrow> real"
  1761   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  1762   assumes Px: "distributed M S X Px" and Py: "distributed M T Y Py"
  1763   assumes Pxy: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
  1764   assumes Ix: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Px (fst x)))"
  1765   assumes Iy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Py (snd x)))"
  1766   assumes Ixy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x))"
  1767   shows  "mutual_information b S T X Y = entropy b S X - conditional_entropy b S T X Y"
  1768   using
  1769     mutual_information_eq_entropy_conditional_entropy_distr[OF S T Px Py Pxy Ix Iy Ixy]
  1770     conditional_entropy_eq_entropy[OF S T Py Pxy Ixy Iy]
  1771   by simp
  1772 
  1773 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy:
  1774   assumes sf_X: "simple_function M X" and sf_Y: "simple_function M Y"
  1775   shows  "\<I>(X ; Y) = \<H>(X) - \<H>(X | Y)"
  1776 proof -
  1777   have X: "simple_distributed M X (\<lambda>x. measure M (X -` {x} \<inter> space M))"
  1778     using sf_X by (rule simple_distributedI) auto
  1779   have Y: "simple_distributed M Y (\<lambda>x. measure M (Y -` {x} \<inter> space M))"
  1780     using sf_Y by (rule simple_distributedI) auto
  1781   have sf_XY: "simple_function M (\<lambda>x. (X x, Y x))"
  1782     using sf_X sf_Y by (rule simple_function_Pair)
  1783   then have XY: "simple_distributed M (\<lambda>x. (X x, Y x)) (\<lambda>x. measure M ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M))"
  1784     by (rule simple_distributedI) auto
  1785   from simple_distributed_joint_finite[OF this, simp]
  1786   have eq: "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X ` space M \<times> Y ` space M)"
  1787     by (simp add: pair_measure_count_space)
  1788 
  1789   have "\<I>(X ; Y) = \<H>(X) + \<H>(Y) - entropy b (count_space (X`space M) \<Otimes>\<^sub>M count_space (Y`space M)) (\<lambda>x. (X x, Y x))"
  1790     using sigma_finite_measure_count_space_finite sigma_finite_measure_count_space_finite simple_distributed[OF X] simple_distributed[OF Y] simple_distributed_joint[OF XY]
  1791     by (rule mutual_information_eq_entropy_conditional_entropy_distr) (auto simp: eq integrable_count_space)
  1792   then show ?thesis
  1793     unfolding conditional_entropy_eq_entropy_simple[OF sf_X sf_Y] by simp
  1794 qed
  1795 
  1796 lemma (in information_space) mutual_information_nonneg_simple:
  1797   assumes sf_X: "simple_function M X" and sf_Y: "simple_function M Y"
  1798   shows  "0 \<le> \<I>(X ; Y)"
  1799 proof -
  1800   have X: "simple_distributed M X (\<lambda>x. measure M (X -` {x} \<inter> space M))"
  1801     using sf_X by (rule simple_distributedI) auto
  1802   have Y: "simple_distributed M Y (\<lambda>x. measure M (Y -` {x} \<inter> space M))"
  1803     using sf_Y by (rule simple_distributedI) auto
  1804 
  1805   have sf_XY: "simple_function M (\<lambda>x. (X x, Y x))"
  1806     using sf_X sf_Y by (rule simple_function_Pair)
  1807   then have XY: "simple_distributed M (\<lambda>x. (X x, Y x)) (\<lambda>x. measure M ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M))"
  1808     by (rule simple_distributedI) auto
  1809 
  1810   from simple_distributed_joint_finite[OF this, simp]
  1811   have eq: "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X ` space M \<times> Y ` space M)"
  1812     by (simp add: pair_measure_count_space)
  1813 
  1814   show ?thesis
  1815     by (rule mutual_information_nonneg[OF _ _ simple_distributed[OF X] simple_distributed[OF Y] simple_distributed_joint[OF XY]])
  1816        (simp_all add: eq integrable_count_space sigma_finite_measure_count_space_finite)
  1817 qed
  1818 
  1819 lemma (in information_space) conditional_entropy_less_eq_entropy:
  1820   assumes X: "simple_function M X" and Z: "simple_function M Z"
  1821   shows "\<H>(X | Z) \<le> \<H>(X)"
  1822 proof -
  1823   have "0 \<le> \<I>(X ; Z)" using X Z by (rule mutual_information_nonneg_simple)
  1824   also have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy[OF assms] .
  1825   finally show ?thesis by auto
  1826 qed
  1827 
  1828 lemma (in information_space) 
  1829   fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" and Pxy :: "('b \<times> 'c) \<Rightarrow> real"
  1830   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  1831   assumes Px: "finite_entropy S X Px" and Py: "finite_entropy T Y Py"
  1832   assumes Pxy: "finite_entropy (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
  1833   shows "conditional_entropy b S T X Y \<le> entropy b S X"
  1834 proof -
  1835 
  1836   have "0 \<le> mutual_information b S T X Y" 
  1837     by (rule mutual_information_nonneg') fact+
  1838   also have "\<dots> = entropy b S X - conditional_entropy b S T X Y"
  1839     apply (rule mutual_information_eq_entropy_conditional_entropy')
  1840     using assms
  1841     by (auto intro!: finite_entropy_integrable finite_entropy_distributed
  1842       finite_entropy_integrable_transform[OF Px]
  1843       finite_entropy_integrable_transform[OF Py])
  1844   finally show ?thesis by auto
  1845 qed
  1846 
  1847 lemma (in information_space) entropy_chain_rule:
  1848   assumes X: "simple_function M X" and Y: "simple_function M Y"
  1849   shows  "\<H>(\<lambda>x. (X x, Y x)) = \<H>(X) + \<H>(Y|X)"
  1850 proof -
  1851   note XY = simple_distributedI[OF simple_function_Pair[OF X Y] refl]
  1852   note YX = simple_distributedI[OF simple_function_Pair[OF Y X] refl]
  1853   note simple_distributed_joint_finite[OF this, simp]
  1854   let ?f = "\<lambda>x. prob ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M)"
  1855   let ?g = "\<lambda>x. prob ((\<lambda>x. (Y x, X x)) -` {x} \<inter> space M)"
  1856   let ?h = "\<lambda>x. if x \<in> (\<lambda>x. (Y x, X x)) ` space M then prob ((\<lambda>x. (Y x, X x)) -` {x} \<inter> space M) else 0"
  1857   have "\<H>(\<lambda>x. (X x, Y x)) = - (\<Sum>x\<in>(\<lambda>x. (X x, Y x)) ` space M. ?f x * log b (?f x))"
  1858     using XY by (rule entropy_simple_distributed)
  1859   also have "\<dots> = - (\<Sum>x\<in>(\<lambda>(x, y). (y, x)) ` (\<lambda>x. (X x, Y x)) ` space M. ?g x * log b (?g x))"
  1860     by (subst (2) setsum.reindex) (auto simp: inj_on_def intro!: setsum.cong arg_cong[where f="\<lambda>A. prob A * log b (prob A)"])
  1861   also have "\<dots> = - (\<Sum>x\<in>(\<lambda>x. (Y x, X x)) ` space M. ?h x * log b (?h x))"
  1862     by (auto intro!: setsum.cong)
  1863   also have "\<dots> = entropy b (count_space (Y ` space M) \<Otimes>\<^sub>M count_space (X ` space M)) (\<lambda>x. (Y x, X x))"
  1864     by (subst entropy_distr[OF simple_distributed_joint[OF YX]])
  1865        (auto simp: pair_measure_count_space sigma_finite_measure_count_space_finite lebesgue_integral_count_space_finite
  1866              cong del: setsum.cong  intro!: setsum.mono_neutral_left)
  1867   finally have "\<H>(\<lambda>x. (X x, Y x)) = entropy b (count_space (Y ` space M) \<Otimes>\<^sub>M count_space (X ` space M)) (\<lambda>x. (Y x, X x))" .
  1868   then show ?thesis
  1869     unfolding conditional_entropy_eq_entropy_simple[OF Y X] by simp
  1870 qed
  1871 
  1872 lemma (in information_space) entropy_partition:
  1873   assumes X: "simple_function M X"
  1874   shows "\<H>(X) = \<H>(f \<circ> X) + \<H>(X|f \<circ> X)"
  1875 proof -
  1876   note fX = simple_function_compose[OF X, of f]  
  1877   have eq: "(\<lambda>x. ((f \<circ> X) x, X x)) ` space M = (\<lambda>x. (f x, x)) ` X ` space M" by auto
  1878   have inj: "\<And>A. inj_on (\<lambda>x. (f x, x)) A"
  1879     by (auto simp: inj_on_def)
  1880   show ?thesis
  1881     apply (subst entropy_chain_rule[symmetric, OF fX X])
  1882     apply (subst entropy_simple_distributed[OF simple_distributedI[OF simple_function_Pair[OF fX X] refl]])
  1883     apply (subst entropy_simple_distributed[OF simple_distributedI[OF X refl]])
  1884     unfolding eq
  1885     apply (subst setsum.reindex[OF inj])
  1886     apply (auto intro!: setsum.cong arg_cong[where f="\<lambda>A. prob A * log b (prob A)"])
  1887     done
  1888 qed
  1889 
  1890 corollary (in information_space) entropy_data_processing:
  1891   assumes X: "simple_function M X" shows "\<H>(f \<circ> X) \<le> \<H>(X)"
  1892 proof -
  1893   note fX = simple_function_compose[OF X, of f]
  1894   from X have "\<H>(X) = \<H>(f\<circ>X) + \<H>(X|f\<circ>X)" by (rule entropy_partition)
  1895   then show "\<H>(f \<circ> X) \<le> \<H>(X)"
  1896     by (auto intro: conditional_entropy_nonneg[OF X fX])
  1897 qed
  1898 
  1899 corollary (in information_space) entropy_of_inj:
  1900   assumes X: "simple_function M X" and inj: "inj_on f (X`space M)"
  1901   shows "\<H>(f \<circ> X) = \<H>(X)"
  1902 proof (rule antisym)
  1903   show "\<H>(f \<circ> X) \<le> \<H>(X)" using entropy_data_processing[OF X] .
  1904 next
  1905   have sf: "simple_function M (f \<circ> X)"
  1906     using X by auto
  1907   have "\<H>(X) = \<H>(the_inv_into (X`space M) f \<circ> (f \<circ> X))"
  1908     unfolding o_assoc
  1909     apply (subst entropy_simple_distributed[OF simple_distributedI[OF X refl]])
  1910     apply (subst entropy_simple_distributed[OF simple_distributedI[OF simple_function_compose[OF X]], where f="\<lambda>x. prob (X -` {x} \<inter> space M)"])
  1911     apply (auto intro!: setsum.cong arg_cong[where f=prob] image_eqI simp: the_inv_into_f_f[OF inj] comp_def)
  1912     done
  1913   also have "... \<le> \<H>(f \<circ> X)"
  1914     using entropy_data_processing[OF sf] .
  1915   finally show "\<H>(X) \<le> \<H>(f \<circ> X)" .
  1916 qed
  1917 
  1918 end