src/HOL/Probability/Information.thy
changeset 62975 1d066f6ab25d
parent 62390 842917225d56
child 63040 eb4ddd18d635
equal deleted inserted replaced
62974:f17602cbf76a 62975:1d066f6ab25d
    71 
    71 
    72 text \<open>The Kullback$-$Leibler divergence is also known as relative entropy or
    72 text \<open>The Kullback$-$Leibler divergence is also known as relative entropy or
    73 Kullback$-$Leibler distance.\<close>
    73 Kullback$-$Leibler distance.\<close>
    74 
    74 
    75 definition
    75 definition
    76   "entropy_density b M N = log b \<circ> real_of_ereal \<circ> RN_deriv M N"
    76   "entropy_density b M N = log b \<circ> enn2real \<circ> RN_deriv M N"
    77 
    77 
    78 definition
    78 definition
    79   "KL_divergence b M N = integral\<^sup>L N (entropy_density b M N)"
    79   "KL_divergence b M N = integral\<^sup>L N (entropy_density b M N)"
    80 
    80 
    81 lemma measurable_entropy_density[measurable]: "entropy_density b M N \<in> borel_measurable M"
    81 lemma measurable_entropy_density[measurable]: "entropy_density b M N \<in> borel_measurable M"
    86   assumes "1 < b"
    86   assumes "1 < b"
    87   assumes f[measurable]: "f \<in> borel_measurable M" and nn: "AE x in M. 0 \<le> f x"
    87   assumes f[measurable]: "f \<in> borel_measurable M" and nn: "AE x in M. 0 \<le> f x"
    88   shows "KL_divergence b M (density M f) = (\<integral>x. f x * log b (f x) \<partial>M)"
    88   shows "KL_divergence b M (density M f) = (\<integral>x. f x * log b (f x) \<partial>M)"
    89   unfolding KL_divergence_def
    89   unfolding KL_divergence_def
    90 proof (subst integral_real_density)
    90 proof (subst integral_real_density)
    91   show [measurable]: "entropy_density b M (density M (\<lambda>x. ereal (f x))) \<in> borel_measurable M"
    91   show [measurable]: "entropy_density b M (density M (\<lambda>x. ennreal (f x))) \<in> borel_measurable M"
    92     using f
    92     using f
    93     by (auto simp: comp_def entropy_density_def)
    93     by (auto simp: comp_def entropy_density_def)
    94   have "density M (RN_deriv M (density M f)) = density M f"
    94   have "density M (RN_deriv M (density M f)) = density M f"
    95     using f nn by (intro density_RN_deriv_density) auto
    95     using f nn by (intro density_RN_deriv_density) auto
    96   then have eq: "AE x in M. RN_deriv M (density M f) x = f x"
    96   then have eq: "AE x in M. RN_deriv M (density M f) x = f x"
    97     using f nn by (intro density_unique) (auto simp: RN_deriv_nonneg)
    97     using f nn by (intro density_unique) auto
    98   show "(\<integral>x. f x * entropy_density b M (density M (\<lambda>x. ereal (f x))) x \<partial>M) = (\<integral>x. f x * log b (f x) \<partial>M)"
    98   show "(\<integral>x. f x * entropy_density b M (density M (\<lambda>x. ennreal (f x))) x \<partial>M) = (\<integral>x. f x * log b (f x) \<partial>M)"
    99     apply (intro integral_cong_AE)
    99     apply (intro integral_cong_AE)
   100     apply measurable
   100     apply measurable
   101     using eq
   101     using eq nn
   102     apply eventually_elim
   102     apply eventually_elim
   103     apply (auto simp: entropy_density_def)
   103     apply (auto simp: entropy_density_def)
   104     done
   104     done
   105 qed fact+
   105 qed fact+
   106 
   106 
   139 
   139 
   140   let ?D_set = "{x\<in>space M. D x \<noteq> 0}"
   140   let ?D_set = "{x\<in>space M. D x \<noteq> 0}"
   141   have [simp, intro]: "?D_set \<in> sets M"
   141   have [simp, intro]: "?D_set \<in> sets M"
   142     using D by auto
   142     using D by auto
   143 
   143 
   144   have D_neg: "(\<integral>\<^sup>+ x. ereal (- D x) \<partial>M) = 0"
   144   have D_neg: "(\<integral>\<^sup>+ x. ennreal (- D x) \<partial>M) = 0"
   145     using D by (subst nn_integral_0_iff_AE) auto
   145     using D by (subst nn_integral_0_iff_AE) (auto simp: ennreal_neg)
   146 
   146 
   147   have "(\<integral>\<^sup>+ x. ereal (D x) \<partial>M) = emeasure (density M D) (space M)"
   147   have "(\<integral>\<^sup>+ x. ennreal (D x) \<partial>M) = emeasure (density M D) (space M)"
   148     using D by (simp add: emeasure_density cong: nn_integral_cong)
   148     using D by (simp add: emeasure_density cong: nn_integral_cong)
   149   then have D_pos: "(\<integral>\<^sup>+ x. ereal (D x) \<partial>M) = 1"
   149   then have D_pos: "(\<integral>\<^sup>+ x. ennreal (D x) \<partial>M) = 1"
   150     using N.emeasure_space_1 by simp
   150     using N.emeasure_space_1 by simp
   151 
   151 
   152   have "integrable M D"
   152   have "integrable M D"
   153     using D D_pos D_neg unfolding real_integrable_def real_lebesgue_integral_def by simp_all
   153     using D D_pos D_neg unfolding real_integrable_def real_lebesgue_integral_def by simp_all
   154   then have "integral\<^sup>L M D = 1"
   154   then have "integral\<^sup>L M D = 1"
   160     using \<open>integrable M D\<close> \<open>integral\<^sup>L M D = 1\<close>
   160     using \<open>integrable M D\<close> \<open>integral\<^sup>L M D = 1\<close>
   161     by (simp add: emeasure_eq_measure)
   161     by (simp add: emeasure_eq_measure)
   162   also have "\<dots> < (\<integral> x. D x * (ln b * log b (D x)) \<partial>M)"
   162   also have "\<dots> < (\<integral> x. D x * (ln b * log b (D x)) \<partial>M)"
   163   proof (rule integral_less_AE)
   163   proof (rule integral_less_AE)
   164     show "integrable M (\<lambda>x. D x - indicator ?D_set x)"
   164     show "integrable M (\<lambda>x. D x - indicator ?D_set x)"
   165       using \<open>integrable M D\<close> by auto
   165       using \<open>integrable M D\<close> by (auto simp: less_top[symmetric])
   166   next
   166   next
   167     from integrable_mult_left(1)[OF int, of "ln b"]
   167     from integrable_mult_left(1)[OF int, of "ln b"]
   168     show "integrable M (\<lambda>x. D x * (ln b * log b (D x)))" 
   168     show "integrable M (\<lambda>x. D x * (ln b * log b (D x)))"
   169       by (simp add: ac_simps)
   169       by (simp add: ac_simps)
   170   next
   170   next
   171     show "emeasure M {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<noteq> 0"
   171     show "emeasure M {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<noteq> 0"
   172     proof
   172     proof
   173       assume eq_0: "emeasure M {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} = 0"
   173       assume eq_0: "emeasure M {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} = 0"
   174       then have disj: "AE x in M. D x = 1 \<or> D x = 0"
   174       then have disj: "AE x in M. D x = 1 \<or> D x = 0"
   175         using D(1) by (auto intro!: AE_I[OF subset_refl] sets.sets_Collect)
   175         using D(1) by (auto intro!: AE_I[OF subset_refl] sets.sets_Collect)
   176 
   176 
   177       have "emeasure M {x\<in>space M. D x = 1} = (\<integral>\<^sup>+ x. indicator {x\<in>space M. D x = 1} x \<partial>M)"
   177       have "emeasure M {x\<in>space M. D x = 1} = (\<integral>\<^sup>+ x. indicator {x\<in>space M. D x = 1} x \<partial>M)"
   178         using D(1) by auto
   178         using D(1) by auto
   179       also have "\<dots> = (\<integral>\<^sup>+ x. ereal (D x) \<partial>M)"
   179       also have "\<dots> = (\<integral>\<^sup>+ x. ennreal (D x) \<partial>M)"
   180         using disj by (auto intro!: nn_integral_cong_AE simp: indicator_def one_ereal_def)
   180         using disj by (auto intro!: nn_integral_cong_AE simp: indicator_def one_ennreal_def)
   181       finally have "AE x in M. D x = 1"
   181       finally have "AE x in M. D x = 1"
   182         using D D_pos by (intro AE_I_eq_1) auto
   182         using D D_pos by (intro AE_I_eq_1) auto
   183       then have "(\<integral>\<^sup>+x. indicator A x\<partial>M) = (\<integral>\<^sup>+x. ereal (D x) * indicator A x\<partial>M)"
   183       then have "(\<integral>\<^sup>+x. indicator A x\<partial>M) = (\<integral>\<^sup>+x. ennreal (D x) * indicator A x\<partial>M)"
   184         by (intro nn_integral_cong_AE) (auto simp: one_ereal_def[symmetric])
   184         by (intro nn_integral_cong_AE) (auto simp: one_ennreal_def[symmetric])
   185       also have "\<dots> = density M D A"
   185       also have "\<dots> = density M D A"
   186         using \<open>A \<in> sets M\<close> D by (simp add: emeasure_density)
   186         using \<open>A \<in> sets M\<close> D by (simp add: emeasure_density)
   187       finally show False using \<open>A \<in> sets M\<close> \<open>emeasure (density M D) A \<noteq> emeasure M A\<close> by simp
   187       finally show False using \<open>A \<in> sets M\<close> \<open>emeasure (density M D) A \<noteq> emeasure M A\<close> by simp
   188     qed
   188     qed
   189     show "{x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<in> sets M"
   189     show "{x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<in> sets M"
   238 
   238 
   239 lemma (in sigma_finite_measure) KL_same_eq_0: "KL_divergence b M M = 0"
   239 lemma (in sigma_finite_measure) KL_same_eq_0: "KL_divergence b M M = 0"
   240 proof -
   240 proof -
   241   have "AE x in M. 1 = RN_deriv M M x"
   241   have "AE x in M. 1 = RN_deriv M M x"
   242   proof (rule RN_deriv_unique)
   242   proof (rule RN_deriv_unique)
   243     show "(\<lambda>x. 1) \<in> borel_measurable M" "AE x in M. 0 \<le> (1 :: ereal)" by auto
       
   244     show "density M (\<lambda>x. 1) = M"
   243     show "density M (\<lambda>x. 1) = M"
   245       apply (auto intro!: measure_eqI emeasure_density)
   244       apply (auto intro!: measure_eqI emeasure_density)
   246       apply (subst emeasure_density)
   245       apply (subst emeasure_density)
   247       apply auto
   246       apply auto
   248       done
   247       done
   249   qed
   248   qed auto
   250   then have "AE x in M. log b (real_of_ereal (RN_deriv M M x)) = 0"
   249   then have "AE x in M. log b (enn2real (RN_deriv M M x)) = 0"
   251     by (elim AE_mp) simp
   250     by (elim AE_mp) simp
   252   from integral_cong_AE[OF _ _ this]
   251   from integral_cong_AE[OF _ _ this]
   253   have "integral\<^sup>L M (entropy_density b M M) = 0"
   252   have "integral\<^sup>L M (entropy_density b M M) = 0"
   254     by (simp add: entropy_density_def comp_def)
   253     by (simp add: entropy_density_def comp_def)
   255   then show "KL_divergence b M M = 0"
   254   then show "KL_divergence b M M = 0"
   274   shows "KL_divergence b M N = 0 \<longleftrightarrow> N = M"
   273   shows "KL_divergence b M N = 0 \<longleftrightarrow> N = M"
   275 proof -
   274 proof -
   276   interpret N: prob_space N by fact
   275   interpret N: prob_space N by fact
   277   have "finite_measure N" by unfold_locales
   276   have "finite_measure N" by unfold_locales
   278   from real_RN_deriv[OF this ac] guess D . note D = this
   277   from real_RN_deriv[OF this ac] guess D . note D = this
   279   
   278 
   280   have "N = density M (RN_deriv M N)"
   279   have "N = density M (RN_deriv M N)"
   281     using ac by (rule density_RN_deriv[symmetric])
   280     using ac by (rule density_RN_deriv[symmetric])
   282   also have "\<dots> = density M D"
   281   also have "\<dots> = density M D"
   283     using D by (auto intro!: density_cong)
   282     using D by (auto intro!: density_cong)
   284   finally have N: "N = density M D" .
   283   finally have N: "N = density M D" .
   332   finally show ?thesis .
   331   finally show ?thesis .
   333 qed
   332 qed
   334 
   333 
   335 subsection \<open>Finite Entropy\<close>
   334 subsection \<open>Finite Entropy\<close>
   336 
   335 
   337 definition (in information_space) 
   336 definition (in information_space) finite_entropy :: "'b measure \<Rightarrow> ('a \<Rightarrow> 'b) \<Rightarrow> ('b \<Rightarrow> real) \<Rightarrow> bool"
   338   "finite_entropy S X f \<longleftrightarrow> distributed M S X f \<and> integrable S (\<lambda>x. f x * log b (f x))"
   337 where
       
   338   "finite_entropy S X f \<longleftrightarrow>
       
   339     distributed M S X f \<and>
       
   340     integrable S (\<lambda>x. f x * log b (f x)) \<and>
       
   341     (\<forall>x\<in>space S. 0 \<le> f x)"
   339 
   342 
   340 lemma (in information_space) finite_entropy_simple_function:
   343 lemma (in information_space) finite_entropy_simple_function:
   341   assumes X: "simple_function M X"
   344   assumes X: "simple_function M X"
   342   shows "finite_entropy (count_space (X`space M)) X (\<lambda>a. measure M {x \<in> space M. X x = a})"
   345   shows "finite_entropy (count_space (X`space M)) X (\<lambda>a. measure M {x \<in> space M. X x = a})"
   343   unfolding finite_entropy_def
   346   unfolding finite_entropy_def
   344 proof
   347 proof safe
   345   have [simp]: "finite (X ` space M)"
   348   have [simp]: "finite (X ` space M)"
   346     using X by (auto simp: simple_function_def)
   349     using X by (auto simp: simple_function_def)
   347   then show "integrable (count_space (X ` space M))
   350   then show "integrable (count_space (X ` space M))
   348      (\<lambda>x. prob {xa \<in> space M. X xa = x} * log b (prob {xa \<in> space M. X xa = x}))"
   351      (\<lambda>x. prob {xa \<in> space M. X xa = x} * log b (prob {xa \<in> space M. X xa = x}))"
   349     by (rule integrable_count_space)
   352     by (rule integrable_count_space)
   350   have d: "distributed M (count_space (X ` space M)) X (\<lambda>x. ereal (if x \<in> X`space M then prob {xa \<in> space M. X xa = x} else 0))"
   353   have d: "distributed M (count_space (X ` space M)) X (\<lambda>x. ennreal (if x \<in> X`space M then prob {xa \<in> space M. X xa = x} else 0))"
   351     by (rule distributed_simple_function_superset[OF X]) (auto intro!: arg_cong[where f=prob])
   354     by (rule distributed_simple_function_superset[OF X]) (auto intro!: arg_cong[where f=prob])
   352   show "distributed M (count_space (X ` space M)) X (\<lambda>x. ereal (prob {xa \<in> space M. X xa = x}))"
   355   show "distributed M (count_space (X ` space M)) X (\<lambda>x. ennreal (prob {xa \<in> space M. X xa = x}))"
   353     by (rule distributed_cong_density[THEN iffD1, OF _ _ _ d]) auto
   356     by (rule distributed_cong_density[THEN iffD1, OF _ _ _ d]) auto
   354 qed
   357 qed (rule measure_nonneg)
   355 
       
   356 lemma distributed_transform_AE:
       
   357   assumes T: "T \<in> measurable P Q" "absolutely_continuous Q (distr P Q T)"
       
   358   assumes g: "distributed M Q Y g"
       
   359   shows "AE x in P. 0 \<le> g (T x)"
       
   360   using g
       
   361   apply (subst AE_distr_iff[symmetric, OF T(1)])
       
   362   apply simp
       
   363   apply (rule absolutely_continuous_AE[OF _ T(2)])
       
   364   apply simp
       
   365   apply (simp add: distributed_AE)
       
   366   done
       
   367 
   358 
   368 lemma ac_fst:
   359 lemma ac_fst:
   369   assumes "sigma_finite_measure T"
   360   assumes "sigma_finite_measure T"
   370   shows "absolutely_continuous S (distr (S \<Otimes>\<^sub>M T) S fst)"
   361   shows "absolutely_continuous S (distr (S \<Otimes>\<^sub>M T) S fst)"
   371 proof -
   362 proof -
   409 
   400 
   410 lemma (in information_space) finite_entropy_distributed:
   401 lemma (in information_space) finite_entropy_distributed:
   411   "finite_entropy S X Px \<Longrightarrow> distributed M S X Px"
   402   "finite_entropy S X Px \<Longrightarrow> distributed M S X Px"
   412   unfolding finite_entropy_def by auto
   403   unfolding finite_entropy_def by auto
   413 
   404 
       
   405 lemma (in information_space) finite_entropy_nn:
       
   406   "finite_entropy S X Px \<Longrightarrow> x \<in> space S \<Longrightarrow> 0 \<le> Px x"
       
   407   by (auto simp: finite_entropy_def)
       
   408 
       
   409 lemma (in information_space) finite_entropy_measurable:
       
   410   "finite_entropy S X Px \<Longrightarrow> Px \<in> S \<rightarrow>\<^sub>M borel"
       
   411   using distributed_real_measurable[of S Px M X]
       
   412     finite_entropy_nn[of S X Px] finite_entropy_distributed[of S X Px] by auto
       
   413 
       
   414 lemma (in information_space) subdensity_finite_entropy:
       
   415   fixes g :: "'b \<Rightarrow> real" and f :: "'c \<Rightarrow> real"
       
   416   assumes T: "T \<in> measurable P Q"
       
   417   assumes f: "finite_entropy P X f"
       
   418   assumes g: "finite_entropy Q Y g"
       
   419   assumes Y: "Y = T \<circ> X"
       
   420   shows "AE x in P. g (T x) = 0 \<longrightarrow> f x = 0"
       
   421   using subdensity[OF T, of M X "\<lambda>x. ennreal (f x)" Y "\<lambda>x. ennreal (g x)"]
       
   422     finite_entropy_distributed[OF f] finite_entropy_distributed[OF g]
       
   423     finite_entropy_nn[OF f] finite_entropy_nn[OF g]
       
   424     assms
       
   425   by auto
       
   426 
   414 lemma (in information_space) finite_entropy_integrable_transform:
   427 lemma (in information_space) finite_entropy_integrable_transform:
   415   assumes Fx: "finite_entropy S X Px"
   428   "finite_entropy S X Px \<Longrightarrow> distributed M T Y Py \<Longrightarrow> (\<And>x. x \<in> space T \<Longrightarrow> 0 \<le> Py x) \<Longrightarrow>
   416   assumes Fy: "distributed M T Y Py"
   429     X = (\<lambda>x. f (Y x)) \<Longrightarrow> f \<in> measurable T S \<Longrightarrow> integrable T (\<lambda>x. Py x * log b (Px (f x)))"
   417     and "X = (\<lambda>x. f (Y x))"
       
   418     and "f \<in> measurable T S"
       
   419   shows "integrable T (\<lambda>x. Py x * log b (Px (f x)))"
       
   420   using assms unfolding finite_entropy_def
       
   421   using distributed_transform_integrable[of M T Y Py S X Px f "\<lambda>x. log b (Px x)"]
   430   using distributed_transform_integrable[of M T Y Py S X Px f "\<lambda>x. log b (Px x)"]
   422   by auto
   431   using distributed_real_measurable[of S Px M X]
       
   432   by (auto simp: finite_entropy_def)
   423 
   433 
   424 subsection \<open>Mutual Information\<close>
   434 subsection \<open>Mutual Information\<close>
   425 
   435 
   426 definition (in prob_space)
   436 definition (in prob_space)
   427   "mutual_information b S T X Y =
   437   "mutual_information b S T X Y =
   501   assumes Fxy: "finite_entropy (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
   511   assumes Fxy: "finite_entropy (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
   502   defines "f \<equiv> \<lambda>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))"
   512   defines "f \<equiv> \<lambda>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))"
   503   shows mutual_information_distr': "mutual_information b S T X Y = integral\<^sup>L (S \<Otimes>\<^sub>M T) f" (is "?M = ?R")
   513   shows mutual_information_distr': "mutual_information b S T X Y = integral\<^sup>L (S \<Otimes>\<^sub>M T) f" (is "?M = ?R")
   504     and mutual_information_nonneg': "0 \<le> mutual_information b S T X Y"
   514     and mutual_information_nonneg': "0 \<le> mutual_information b S T X Y"
   505 proof -
   515 proof -
   506   have Px: "distributed M S X Px"
   516   have Px: "distributed M S X Px" and Px_nn: "\<And>x. x \<in> space S \<Longrightarrow> 0 \<le> Px x"
   507     using Fx by (auto simp: finite_entropy_def)
   517     using Fx by (auto simp: finite_entropy_def)
   508   have Py: "distributed M T Y Py"
   518   have Py: "distributed M T Y Py" and Py_nn: "\<And>x. x \<in> space T \<Longrightarrow> 0 \<le> Py x"
   509     using Fy by (auto simp: finite_entropy_def)
   519     using Fy by (auto simp: finite_entropy_def)
   510   have Pxy: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
   520   have Pxy: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
   511     using Fxy by (auto simp: finite_entropy_def)
   521     and Pxy_nn: "\<And>x. x \<in> space (S \<Otimes>\<^sub>M T) \<Longrightarrow> 0 \<le> Pxy x"
   512 
   522       "\<And>x y. x \<in> space S \<Longrightarrow> y \<in> space T \<Longrightarrow> 0 \<le> Pxy (x, y)"
   513   have X: "random_variable S X"
   523     using Fxy by (auto simp: finite_entropy_def space_pair_measure)
       
   524 
       
   525   have [measurable]: "Px \<in> S \<rightarrow>\<^sub>M borel"
       
   526     using Px Px_nn by (intro distributed_real_measurable)
       
   527   have [measurable]: "Py \<in> T \<rightarrow>\<^sub>M borel"
       
   528     using Py Py_nn by (intro distributed_real_measurable)
       
   529   have measurable_Pxy[measurable]: "Pxy \<in> (S \<Otimes>\<^sub>M T) \<rightarrow>\<^sub>M borel"
       
   530     using Pxy Pxy_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure)
       
   531 
       
   532   have X[measurable]: "random_variable S X"
   514     using Px by auto
   533     using Px by auto
   515   have Y: "random_variable T Y"
   534   have Y[measurable]: "random_variable T Y"
   516     using Py by auto
   535     using Py by auto
   517   interpret S: sigma_finite_measure S by fact
   536   interpret S: sigma_finite_measure S by fact
   518   interpret T: sigma_finite_measure T by fact
   537   interpret T: sigma_finite_measure T by fact
   519   interpret ST: pair_sigma_finite S T ..
   538   interpret ST: pair_sigma_finite S T ..
   520   interpret X: prob_space "distr M S X" using X by (rule prob_space_distr)
   539   interpret X: prob_space "distr M S X" using X by (rule prob_space_distr)
   522   interpret XY: pair_prob_space "distr M S X" "distr M T Y" ..
   541   interpret XY: pair_prob_space "distr M S X" "distr M T Y" ..
   523   let ?P = "S \<Otimes>\<^sub>M T"
   542   let ?P = "S \<Otimes>\<^sub>M T"
   524   let ?D = "distr M ?P (\<lambda>x. (X x, Y x))"
   543   let ?D = "distr M ?P (\<lambda>x. (X x, Y x))"
   525 
   544 
   526   { fix A assume "A \<in> sets S"
   545   { fix A assume "A \<in> sets S"
   527     with X Y have "emeasure (distr M S X) A = emeasure ?D (A \<times> space T)"
   546     with X[THEN measurable_space] Y[THEN measurable_space]
   528       by (auto simp: emeasure_distr measurable_Pair measurable_space
   547     have "emeasure (distr M S X) A = emeasure ?D (A \<times> space T)"
   529                intro!: arg_cong[where f="emeasure M"]) }
   548       by (auto simp: emeasure_distr intro!: arg_cong[where f="emeasure M"]) }
   530   note marginal_eq1 = this
   549   note marginal_eq1 = this
   531   { fix A assume "A \<in> sets T"
   550   { fix A assume "A \<in> sets T"
   532     with X Y have "emeasure (distr M T Y) A = emeasure ?D (space S \<times> A)"
   551     with X[THEN measurable_space] Y[THEN measurable_space]
   533       by (auto simp: emeasure_distr measurable_Pair measurable_space
   552     have "emeasure (distr M T Y) A = emeasure ?D (space S \<times> A)"
   534                intro!: arg_cong[where f="emeasure M"]) }
   553       by (auto simp: emeasure_distr intro!: arg_cong[where f="emeasure M"]) }
   535   note marginal_eq2 = this
   554   note marginal_eq2 = this
   536 
   555 
   537   have eq: "(\<lambda>x. ereal (Px (fst x) * Py (snd x))) = (\<lambda>(x, y). ereal (Px x) * ereal (Py y))"
   556   have distr_eq: "distr M S X \<Otimes>\<^sub>M distr M T Y = density ?P (\<lambda>x. ennreal (Px (fst x) * Py (snd x)))"
   538     by auto
   557     unfolding Px(1)[THEN distributed_distr_eq_density] Py(1)[THEN distributed_distr_eq_density]
   539 
       
   540   have distr_eq: "distr M S X \<Otimes>\<^sub>M distr M T Y = density ?P (\<lambda>x. ereal (Px (fst x) * Py (snd x)))"
       
   541     unfolding Px(1)[THEN distributed_distr_eq_density] Py(1)[THEN distributed_distr_eq_density] eq
       
   542   proof (subst pair_measure_density)
   558   proof (subst pair_measure_density)
   543     show "(\<lambda>x. ereal (Px x)) \<in> borel_measurable S" "(\<lambda>y. ereal (Py y)) \<in> borel_measurable T"
   559     show "(\<lambda>x. ennreal (Px x)) \<in> borel_measurable S" "(\<lambda>y. ennreal (Py y)) \<in> borel_measurable T"
   544       "AE x in S. 0 \<le> ereal (Px x)" "AE y in T. 0 \<le> ereal (Py y)"
       
   545       using Px Py by (auto simp: distributed_def)
   560       using Px Py by (auto simp: distributed_def)
   546     show "sigma_finite_measure (density T Py)" unfolding Py(1)[THEN distributed_distr_eq_density, symmetric] ..
   561     show "sigma_finite_measure (density T Py)" unfolding Py(1)[THEN distributed_distr_eq_density, symmetric] ..
   547   qed (fact | simp)+
   562     show "density (S \<Otimes>\<^sub>M T) (\<lambda>(x, y). ennreal (Px x) * ennreal (Py y)) =
   548   
   563       density (S \<Otimes>\<^sub>M T) (\<lambda>x. ennreal (Px (fst x) * Py (snd x)))"
   549   have M: "?M = KL_divergence b (density ?P (\<lambda>x. ereal (Px (fst x) * Py (snd x)))) (density ?P (\<lambda>x. ereal (Pxy x)))"
   564       using Px_nn Py_nn by (auto intro!: density_cong simp: distributed_def ennreal_mult space_pair_measure)
       
   565   qed fact
       
   566 
       
   567   have M: "?M = KL_divergence b (density ?P (\<lambda>x. ennreal (Px (fst x) * Py (snd x)))) (density ?P (\<lambda>x. ennreal (Pxy x)))"
   550     unfolding mutual_information_def distr_eq Pxy(1)[THEN distributed_distr_eq_density] ..
   568     unfolding mutual_information_def distr_eq Pxy(1)[THEN distributed_distr_eq_density] ..
   551 
   569 
   552   from Px Py have f: "(\<lambda>x. Px (fst x) * Py (snd x)) \<in> borel_measurable ?P"
   570   from Px Py have f: "(\<lambda>x. Px (fst x) * Py (snd x)) \<in> borel_measurable ?P"
   553     by (intro borel_measurable_times) (auto intro: distributed_real_measurable measurable_fst'' measurable_snd'')
   571     by (intro borel_measurable_times) (auto intro: distributed_real_measurable measurable_fst'' measurable_snd'')
   554   have PxPy_nonneg: "AE x in ?P. 0 \<le> Px (fst x) * Py (snd x)"
   572   have PxPy_nonneg: "AE x in ?P. 0 \<le> Px (fst x) * Py (snd x)"
   555   proof (rule ST.AE_pair_measure)
   573     using Px_nn Py_nn by (auto simp: space_pair_measure)
   556     show "{x \<in> space ?P. 0 \<le> Px (fst x) * Py (snd x)} \<in> sets ?P"
   574 
   557       using f by auto
   575   have A: "(AE x in ?P. Px (fst x) = 0 \<longrightarrow> Pxy x = 0)"
   558     show "AE x in S. AE y in T. 0 \<le> Px (fst (x, y)) * Py (snd (x, y))"
   576     by (rule subdensity_real[OF measurable_fst Pxy Px]) (insert Px_nn Pxy_nn, auto simp: space_pair_measure)
   559       using Px Py by (auto simp: zero_le_mult_iff dest!: distributed_real_AE)
       
   560   qed
       
   561 
       
   562   have "(AE x in ?P. Px (fst x) = 0 \<longrightarrow> Pxy x = 0)"
       
   563     by (rule subdensity_real[OF measurable_fst Pxy Px]) auto
       
   564   moreover
   577   moreover
   565   have "(AE x in ?P. Py (snd x) = 0 \<longrightarrow> Pxy x = 0)"
   578   have B: "(AE x in ?P. Py (snd x) = 0 \<longrightarrow> Pxy x = 0)"
   566     by (rule subdensity_real[OF measurable_snd Pxy Py]) auto
   579     by (rule subdensity_real[OF measurable_snd Pxy Py]) (insert Py_nn Pxy_nn, auto simp: space_pair_measure)
   567   ultimately have ac: "AE x in ?P. Px (fst x) * Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
   580   ultimately have ac: "AE x in ?P. Px (fst x) * Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
   568     by eventually_elim auto
   581     by eventually_elim auto
   569 
   582 
   570   show "?M = ?R"
   583   show "?M = ?R"
   571     unfolding M f_def
   584     unfolding M f_def using Pxy_nn Px_nn Py_nn
   572     using b_gt_1 f PxPy_nonneg Pxy[THEN distributed_real_measurable] Pxy[THEN distributed_real_AE] ac
   585     by (intro ST.KL_density_density b_gt_1 f PxPy_nonneg ac) (auto simp: space_pair_measure)
   573     by (rule ST.KL_density_density)
       
   574 
   586 
   575   have X: "X = fst \<circ> (\<lambda>x. (X x, Y x))" and Y: "Y = snd \<circ> (\<lambda>x. (X x, Y x))"
   587   have X: "X = fst \<circ> (\<lambda>x. (X x, Y x))" and Y: "Y = snd \<circ> (\<lambda>x. (X x, Y x))"
   576     by auto
   588     by auto
   577 
   589 
   578   have "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x) - Pxy x * log b (Px (fst x)) - Pxy x * log b (Py (snd x)))"
   590   have "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x) - Pxy x * log b (Px (fst x)) - Pxy x * log b (Py (snd x)))"
   579     using finite_entropy_integrable[OF Fxy]
   591     using finite_entropy_integrable[OF Fxy]
   580     using finite_entropy_integrable_transform[OF Fx Pxy, of fst]
   592     using finite_entropy_integrable_transform[OF Fx Pxy, of fst]
   581     using finite_entropy_integrable_transform[OF Fy Pxy, of snd]
   593     using finite_entropy_integrable_transform[OF Fy Pxy, of snd]
   582     by simp
   594     by (simp add: Pxy_nn)
   583   moreover have "f \<in> borel_measurable (S \<Otimes>\<^sub>M T)"
   595   moreover have "f \<in> borel_measurable (S \<Otimes>\<^sub>M T)"
   584     unfolding f_def using Px Py Pxy
   596     unfolding f_def using Px Py Pxy
   585     by (auto intro: distributed_real_measurable measurable_fst'' measurable_snd''
   597     by (auto intro: distributed_real_measurable measurable_fst'' measurable_snd''
   586       intro!: borel_measurable_times borel_measurable_log borel_measurable_divide)
   598       intro!: borel_measurable_times borel_measurable_log borel_measurable_divide)
   587   ultimately have int: "integrable (S \<Otimes>\<^sub>M T) f"
   599   ultimately have int: "integrable (S \<Otimes>\<^sub>M T) f"
   588     apply (rule integrable_cong_AE_imp)
   600     apply (rule integrable_cong_AE_imp)
   589     using
   601     using A B AE_space
   590       distributed_transform_AE[OF measurable_fst ac_fst, of T, OF T Px]
       
   591       distributed_transform_AE[OF measurable_snd ac_snd, of _ _ _ _ S, OF T Py]
       
   592       subdensity_real[OF measurable_fst Pxy Px X]
       
   593       subdensity_real[OF measurable_snd Pxy Py Y]
       
   594       distributed_real_AE[OF Pxy]
       
   595     by eventually_elim
   602     by eventually_elim
   596        (auto simp: f_def log_divide_eq log_mult_eq field_simps zero_less_mult_iff)
   603        (auto simp: f_def log_divide_eq log_mult_eq field_simps space_pair_measure Px_nn Py_nn Pxy_nn
       
   604                   less_le)
   597 
   605 
   598   show "0 \<le> ?M" unfolding M
   606   show "0 \<le> ?M" unfolding M
   599   proof (rule ST.KL_density_density_nonneg
   607   proof (intro ST.KL_density_density_nonneg)
   600     [OF b_gt_1 f PxPy_nonneg _ Pxy[THEN distributed_real_measurable] Pxy[THEN distributed_real_AE] _ ac int[unfolded f_def]])
   608     show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ennreal (Pxy x))) "
   601     show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Pxy x))) "
       
   602       unfolding distributed_distr_eq_density[OF Pxy, symmetric]
   609       unfolding distributed_distr_eq_density[OF Pxy, symmetric]
   603       using distributed_measurable[OF Pxy] by (rule prob_space_distr)
   610       using distributed_measurable[OF Pxy] by (rule prob_space_distr)
   604     show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Px (fst x) * Py (snd x))))"
   611     show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ennreal (Px (fst x) * Py (snd x))))"
   605       unfolding distr_eq[symmetric] by unfold_locales
   612       unfolding distr_eq[symmetric] by unfold_locales
   606   qed
   613     show "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))))"
   607 qed
   614       using int unfolding f_def .
   608 
   615   qed (insert ac, auto simp: b_gt_1 Px_nn Py_nn Pxy_nn space_pair_measure)
       
   616 qed
   609 
   617 
   610 lemma (in information_space)
   618 lemma (in information_space)
   611   fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
   619   fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
   612   assumes "sigma_finite_measure S" "sigma_finite_measure T"
   620   assumes "sigma_finite_measure S" "sigma_finite_measure T"
   613   assumes Px: "distributed M S X Px" and Py: "distributed M T Y Py"
   621   assumes Px: "distributed M S X Px" and Px_nn: "\<And>x. x \<in> space S \<Longrightarrow> 0 \<le> Px x"
   614   assumes Pxy: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
   622     and Py: "distributed M T Y Py" and Py_nn: "\<And>y. y \<in> space T \<Longrightarrow> 0 \<le> Py y"
       
   623     and Pxy: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
       
   624     and Pxy_nn: "\<And>x y. x \<in> space S \<Longrightarrow> y \<in> space T \<Longrightarrow> 0 \<le> Pxy (x, y)"
   615   defines "f \<equiv> \<lambda>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))"
   625   defines "f \<equiv> \<lambda>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))"
   616   shows mutual_information_distr: "mutual_information b S T X Y = integral\<^sup>L (S \<Otimes>\<^sub>M T) f" (is "?M = ?R")
   626   shows mutual_information_distr: "mutual_information b S T X Y = integral\<^sup>L (S \<Otimes>\<^sub>M T) f" (is "?M = ?R")
   617     and mutual_information_nonneg: "integrable (S \<Otimes>\<^sub>M T) f \<Longrightarrow> 0 \<le> mutual_information b S T X Y"
   627     and mutual_information_nonneg: "integrable (S \<Otimes>\<^sub>M T) f \<Longrightarrow> 0 \<le> mutual_information b S T X Y"
   618 proof -
   628 proof -
   619   have X: "random_variable S X"
   629   have X[measurable]: "random_variable S X"
   620     using Px by (auto simp: distributed_def)
   630     using Px by (auto simp: distributed_def)
   621   have Y: "random_variable T Y"
   631   have Y[measurable]: "random_variable T Y"
   622     using Py by (auto simp: distributed_def)
   632     using Py by (auto simp: distributed_def)
       
   633   have [measurable]: "Px \<in> S \<rightarrow>\<^sub>M borel"
       
   634     using Px Px_nn by (intro distributed_real_measurable)
       
   635   have [measurable]: "Py \<in> T \<rightarrow>\<^sub>M borel"
       
   636     using Py Py_nn by (intro distributed_real_measurable)
       
   637   have measurable_Pxy[measurable]: "Pxy \<in> (S \<Otimes>\<^sub>M T) \<rightarrow>\<^sub>M borel"
       
   638     using Pxy Pxy_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure)
       
   639 
   623   interpret S: sigma_finite_measure S by fact
   640   interpret S: sigma_finite_measure S by fact
   624   interpret T: sigma_finite_measure T by fact
   641   interpret T: sigma_finite_measure T by fact
   625   interpret ST: pair_sigma_finite S T ..
   642   interpret ST: pair_sigma_finite S T ..
   626   interpret X: prob_space "distr M S X" using X by (rule prob_space_distr)
   643   interpret X: prob_space "distr M S X" using X by (rule prob_space_distr)
   627   interpret Y: prob_space "distr M T Y" using Y by (rule prob_space_distr)
   644   interpret Y: prob_space "distr M T Y" using Y by (rule prob_space_distr)
   628   interpret XY: pair_prob_space "distr M S X" "distr M T Y" ..
   645   interpret XY: pair_prob_space "distr M S X" "distr M T Y" ..
   629   let ?P = "S \<Otimes>\<^sub>M T"
   646   let ?P = "S \<Otimes>\<^sub>M T"
   630   let ?D = "distr M ?P (\<lambda>x. (X x, Y x))"
   647   let ?D = "distr M ?P (\<lambda>x. (X x, Y x))"
   631 
   648 
   632   { fix A assume "A \<in> sets S"
   649   { fix A assume "A \<in> sets S"
   633     with X Y have "emeasure (distr M S X) A = emeasure ?D (A \<times> space T)"
   650     with X[THEN measurable_space] Y[THEN measurable_space]
   634       by (auto simp: emeasure_distr measurable_Pair measurable_space
   651     have "emeasure (distr M S X) A = emeasure ?D (A \<times> space T)"
   635                intro!: arg_cong[where f="emeasure M"]) }
   652       by (auto simp: emeasure_distr intro!: arg_cong[where f="emeasure M"]) }
   636   note marginal_eq1 = this
   653   note marginal_eq1 = this
   637   { fix A assume "A \<in> sets T"
   654   { fix A assume "A \<in> sets T"
   638     with X Y have "emeasure (distr M T Y) A = emeasure ?D (space S \<times> A)"
   655     with X[THEN measurable_space] Y[THEN measurable_space]
   639       by (auto simp: emeasure_distr measurable_Pair measurable_space
   656     have "emeasure (distr M T Y) A = emeasure ?D (space S \<times> A)"
   640                intro!: arg_cong[where f="emeasure M"]) }
   657       by (auto simp: emeasure_distr intro!: arg_cong[where f="emeasure M"]) }
   641   note marginal_eq2 = this
   658   note marginal_eq2 = this
   642 
   659 
   643   have eq: "(\<lambda>x. ereal (Px (fst x) * Py (snd x))) = (\<lambda>(x, y). ereal (Px x) * ereal (Py y))"
   660   have distr_eq: "distr M S X \<Otimes>\<^sub>M distr M T Y = density ?P (\<lambda>x. ennreal (Px (fst x) * Py (snd x)))"
   644     by auto
   661     unfolding Px(1)[THEN distributed_distr_eq_density] Py(1)[THEN distributed_distr_eq_density]
   645 
       
   646   have distr_eq: "distr M S X \<Otimes>\<^sub>M distr M T Y = density ?P (\<lambda>x. ereal (Px (fst x) * Py (snd x)))"
       
   647     unfolding Px(1)[THEN distributed_distr_eq_density] Py(1)[THEN distributed_distr_eq_density] eq
       
   648   proof (subst pair_measure_density)
   662   proof (subst pair_measure_density)
   649     show "(\<lambda>x. ereal (Px x)) \<in> borel_measurable S" "(\<lambda>y. ereal (Py y)) \<in> borel_measurable T"
   663     show "(\<lambda>x. ennreal (Px x)) \<in> borel_measurable S" "(\<lambda>y. ennreal (Py y)) \<in> borel_measurable T"
   650       "AE x in S. 0 \<le> ereal (Px x)" "AE y in T. 0 \<le> ereal (Py y)"
       
   651       using Px Py by (auto simp: distributed_def)
   664       using Px Py by (auto simp: distributed_def)
   652     show "sigma_finite_measure (density T Py)" unfolding Py(1)[THEN distributed_distr_eq_density, symmetric] ..
   665     show "sigma_finite_measure (density T Py)" unfolding Py(1)[THEN distributed_distr_eq_density, symmetric] ..
   653   qed (fact | simp)+
   666     show "density (S \<Otimes>\<^sub>M T) (\<lambda>(x, y). ennreal (Px x) * ennreal (Py y)) =
   654   
   667       density (S \<Otimes>\<^sub>M T) (\<lambda>x. ennreal (Px (fst x) * Py (snd x)))"
   655   have M: "?M = KL_divergence b (density ?P (\<lambda>x. ereal (Px (fst x) * Py (snd x)))) (density ?P (\<lambda>x. ereal (Pxy x)))"
   668       using Px_nn Py_nn by (auto intro!: density_cong simp: distributed_def ennreal_mult space_pair_measure)
       
   669   qed fact
       
   670 
       
   671   have M: "?M = KL_divergence b (density ?P (\<lambda>x. ennreal (Px (fst x) * Py (snd x)))) (density ?P (\<lambda>x. ennreal (Pxy x)))"
   656     unfolding mutual_information_def distr_eq Pxy(1)[THEN distributed_distr_eq_density] ..
   672     unfolding mutual_information_def distr_eq Pxy(1)[THEN distributed_distr_eq_density] ..
   657 
   673 
   658   from Px Py have f: "(\<lambda>x. Px (fst x) * Py (snd x)) \<in> borel_measurable ?P"
   674   from Px Py have f: "(\<lambda>x. Px (fst x) * Py (snd x)) \<in> borel_measurable ?P"
   659     by (intro borel_measurable_times) (auto intro: distributed_real_measurable measurable_fst'' measurable_snd'')
   675     by (intro borel_measurable_times) (auto intro: distributed_real_measurable measurable_fst'' measurable_snd'')
   660   have PxPy_nonneg: "AE x in ?P. 0 \<le> Px (fst x) * Py (snd x)"
   676   have PxPy_nonneg: "AE x in ?P. 0 \<le> Px (fst x) * Py (snd x)"
   661   proof (rule ST.AE_pair_measure)
   677     using Px_nn Py_nn by (auto simp: space_pair_measure)
   662     show "{x \<in> space ?P. 0 \<le> Px (fst x) * Py (snd x)} \<in> sets ?P"
       
   663       using f by auto
       
   664     show "AE x in S. AE y in T. 0 \<le> Px (fst (x, y)) * Py (snd (x, y))"
       
   665       using Px Py by (auto simp: zero_le_mult_iff dest!: distributed_real_AE)
       
   666   qed
       
   667 
   678 
   668   have "(AE x in ?P. Px (fst x) = 0 \<longrightarrow> Pxy x = 0)"
   679   have "(AE x in ?P. Px (fst x) = 0 \<longrightarrow> Pxy x = 0)"
   669     by (rule subdensity_real[OF measurable_fst Pxy Px]) auto
   680     by (rule subdensity_real[OF measurable_fst Pxy Px]) (insert Px_nn Pxy_nn, auto simp: space_pair_measure)
   670   moreover
   681   moreover
   671   have "(AE x in ?P. Py (snd x) = 0 \<longrightarrow> Pxy x = 0)"
   682   have "(AE x in ?P. Py (snd x) = 0 \<longrightarrow> Pxy x = 0)"
   672     by (rule subdensity_real[OF measurable_snd Pxy Py]) auto
   683     by (rule subdensity_real[OF measurable_snd Pxy Py]) (insert Py_nn Pxy_nn, auto simp: space_pair_measure)
   673   ultimately have ac: "AE x in ?P. Px (fst x) * Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
   684   ultimately have ac: "AE x in ?P. Px (fst x) * Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
   674     by eventually_elim auto
   685     by eventually_elim auto
   675 
   686 
   676   show "?M = ?R"
   687   show "?M = ?R"
   677     unfolding M f_def
   688     unfolding M f_def
   678     using b_gt_1 f PxPy_nonneg Pxy[THEN distributed_real_measurable] Pxy[THEN distributed_real_AE] ac
   689     using b_gt_1 f PxPy_nonneg ac Pxy_nn
   679     by (rule ST.KL_density_density)
   690     by (intro ST.KL_density_density) (auto simp: space_pair_measure)
   680 
   691 
   681   assume int: "integrable (S \<Otimes>\<^sub>M T) f"
   692   assume int: "integrable (S \<Otimes>\<^sub>M T) f"
   682   show "0 \<le> ?M" unfolding M
   693   show "0 \<le> ?M" unfolding M
   683   proof (rule ST.KL_density_density_nonneg
   694   proof (intro ST.KL_density_density_nonneg)
   684     [OF b_gt_1 f PxPy_nonneg _ Pxy[THEN distributed_real_measurable] Pxy[THEN distributed_real_AE] _ ac int[unfolded f_def]])
   695     show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ennreal (Pxy x))) "
   685     show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Pxy x))) "
       
   686       unfolding distributed_distr_eq_density[OF Pxy, symmetric]
   696       unfolding distributed_distr_eq_density[OF Pxy, symmetric]
   687       using distributed_measurable[OF Pxy] by (rule prob_space_distr)
   697       using distributed_measurable[OF Pxy] by (rule prob_space_distr)
   688     show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Px (fst x) * Py (snd x))))"
   698     show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ennreal (Px (fst x) * Py (snd x))))"
   689       unfolding distr_eq[symmetric] by unfold_locales
   699       unfolding distr_eq[symmetric] by unfold_locales
   690   qed
   700     show "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))))"
       
   701       using int unfolding f_def .
       
   702   qed (insert ac, auto simp: b_gt_1 Px_nn Py_nn Pxy_nn space_pair_measure)
   691 qed
   703 qed
   692 
   704 
   693 lemma (in information_space)
   705 lemma (in information_space)
   694   fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
   706   fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
   695   assumes "sigma_finite_measure S" "sigma_finite_measure T"
   707   assumes "sigma_finite_measure S" "sigma_finite_measure T"
   696   assumes Px[measurable]: "distributed M S X Px" and Py[measurable]: "distributed M T Y Py"
   708   assumes Px[measurable]: "distributed M S X Px" and Px_nn: "\<And>x. x \<in> space S \<Longrightarrow> 0 \<le> Px x"
   697   assumes Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
   709     and Py[measurable]: "distributed M T Y Py" and Py_nn:  "\<And>x. x \<in> space T \<Longrightarrow> 0 \<le> Py x"
       
   710     and Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
       
   711     and Pxy_nn: "\<And>x. x \<in> space (S \<Otimes>\<^sub>M T) \<Longrightarrow> 0 \<le> Pxy x"
   698   assumes ae: "AE x in S. AE y in T. Pxy (x, y) = Px x * Py y"
   712   assumes ae: "AE x in S. AE y in T. Pxy (x, y) = Px x * Py y"
   699   shows mutual_information_eq_0: "mutual_information b S T X Y = 0"
   713   shows mutual_information_eq_0: "mutual_information b S T X Y = 0"
   700 proof -
   714 proof -
   701   interpret S: sigma_finite_measure S by fact
   715   interpret S: sigma_finite_measure S by fact
   702   interpret T: sigma_finite_measure T by fact
   716   interpret T: sigma_finite_measure T by fact
   703   interpret ST: pair_sigma_finite S T ..
   717   interpret ST: pair_sigma_finite S T ..
       
   718   note
       
   719     distributed_real_measurable[OF Px_nn Px, measurable]
       
   720     distributed_real_measurable[OF Py_nn Py, measurable]
       
   721     distributed_real_measurable[OF Pxy_nn Pxy, measurable]
   704 
   722 
   705   have "AE x in S \<Otimes>\<^sub>M T. Px (fst x) = 0 \<longrightarrow> Pxy x = 0"
   723   have "AE x in S \<Otimes>\<^sub>M T. Px (fst x) = 0 \<longrightarrow> Pxy x = 0"
   706     by (rule subdensity_real[OF measurable_fst Pxy Px]) auto
   724     by (rule subdensity_real[OF measurable_fst Pxy Px]) (auto simp: Px_nn Pxy_nn space_pair_measure)
   707   moreover
   725   moreover
   708   have "AE x in S \<Otimes>\<^sub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
   726   have "AE x in S \<Otimes>\<^sub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
   709     by (rule subdensity_real[OF measurable_snd Pxy Py]) auto
   727     by (rule subdensity_real[OF measurable_snd Pxy Py]) (auto simp: Py_nn Pxy_nn space_pair_measure)
   710   moreover 
   728   moreover
   711   have "AE x in S \<Otimes>\<^sub>M T. Pxy x = Px (fst x) * Py (snd x)"
   729   have "AE x in S \<Otimes>\<^sub>M T. Pxy x = Px (fst x) * Py (snd x)"
   712     using distributed_real_measurable[OF Px] distributed_real_measurable[OF Py] distributed_real_measurable[OF Pxy]
       
   713     by (intro ST.AE_pair_measure) (auto simp: ae intro!: measurable_snd'' measurable_fst'')
   730     by (intro ST.AE_pair_measure) (auto simp: ae intro!: measurable_snd'' measurable_fst'')
   714   ultimately have "AE x in S \<Otimes>\<^sub>M T. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) = 0"
   731   ultimately have "AE x in S \<Otimes>\<^sub>M T. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) = 0"
   715     by eventually_elim simp
   732     by eventually_elim simp
   716   then have "(\<integral>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) \<partial>(S \<Otimes>\<^sub>M T)) = (\<integral>x. 0 \<partial>(S \<Otimes>\<^sub>M T))"
   733   then have "(\<integral>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) \<partial>(S \<Otimes>\<^sub>M T)) = (\<integral>x. 0 \<partial>(S \<Otimes>\<^sub>M T))"
   717     by (intro integral_cong_AE) auto
   734     by (intro integral_cong_AE) auto
   718   then show ?thesis
   735   then show ?thesis
   719     by (subst mutual_information_distr[OF assms(1-5)]) simp
   736     by (subst mutual_information_distr[OF assms(1-8)]) (auto simp add: space_pair_measure)
   720 qed
   737 qed
   721 
   738 
   722 lemma (in information_space) mutual_information_simple_distributed:
   739 lemma (in information_space) mutual_information_simple_distributed:
   723   assumes X: "simple_distributed M X Px" and Y: "simple_distributed M Y Py"
   740   assumes X: "simple_distributed M X Px" and Y: "simple_distributed M Y Py"
   724   assumes XY: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy"
   741   assumes XY: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy"
   725   shows "\<I>(X ; Y) = (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x))`space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y)))"
   742   shows "\<I>(X ; Y) = (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x))`space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y)))"
   726 proof (subst mutual_information_distr[OF _ _ simple_distributed[OF X] simple_distributed[OF Y] simple_distributed_joint[OF XY]])
   743 proof (subst mutual_information_distr[OF _ _ simple_distributed[OF X] _ simple_distributed[OF Y] _ simple_distributed_joint[OF XY]])
   727   note fin = simple_distributed_joint_finite[OF XY, simp]
   744   note fin = simple_distributed_joint_finite[OF XY, simp]
   728   show "sigma_finite_measure (count_space (X ` space M))"
   745   show "sigma_finite_measure (count_space (X ` space M))"
   729     by (simp add: sigma_finite_measure_count_space_finite)
   746     by (simp add: sigma_finite_measure_count_space_finite)
   730   show "sigma_finite_measure (count_space (Y ` space M))"
   747   show "sigma_finite_measure (count_space (Y ` space M))"
   731     by (simp add: sigma_finite_measure_count_space_finite)
   748     by (simp add: sigma_finite_measure_count_space_finite)
   735     by auto
   752     by auto
   736   with fin show "(\<integral> x. ?f x \<partial>(count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M))) =
   753   with fin show "(\<integral> x. ?f x \<partial>(count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M))) =
   737     (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y)))"
   754     (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y)))"
   738     by (auto simp add: pair_measure_count_space lebesgue_integral_count_space_finite setsum.If_cases split_beta'
   755     by (auto simp add: pair_measure_count_space lebesgue_integral_count_space_finite setsum.If_cases split_beta'
   739              intro!: setsum.cong)
   756              intro!: setsum.cong)
   740 qed
   757 qed (insert X Y XY, auto simp: simple_distributed_def)
   741 
   758 
   742 lemma (in information_space)
   759 lemma (in information_space)
   743   fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
   760   fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
   744   assumes Px: "simple_distributed M X Px" and Py: "simple_distributed M Y Py"
   761   assumes Px: "simple_distributed M X Px" and Py: "simple_distributed M Y Py"
   745   assumes Pxy: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy"
   762   assumes Pxy: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy"
   764 
   781 
   765 lemma (in prob_space) distributed_RN_deriv:
   782 lemma (in prob_space) distributed_RN_deriv:
   766   assumes X: "distributed M S X Px"
   783   assumes X: "distributed M S X Px"
   767   shows "AE x in S. RN_deriv S (density S Px) x = Px x"
   784   shows "AE x in S. RN_deriv S (density S Px) x = Px x"
   768 proof -
   785 proof -
   769   note D = distributed_measurable[OF X] distributed_borel_measurable[OF X] distributed_AE[OF X]
   786   note D = distributed_measurable[OF X] distributed_borel_measurable[OF X]
   770   interpret X: prob_space "distr M S X"
   787   interpret X: prob_space "distr M S X"
   771     using D(1) by (rule prob_space_distr)
   788     using D(1) by (rule prob_space_distr)
   772 
   789 
   773   have sf: "sigma_finite_measure (distr M S X)" by standard
   790   have sf: "sigma_finite_measure (distr M S X)" by standard
   774   show ?thesis
   791   show ?thesis
   775     using D
   792     using D
   776     apply (subst eq_commute)
   793     apply (subst eq_commute)
   777     apply (intro RN_deriv_unique_sigma_finite)
   794     apply (intro RN_deriv_unique_sigma_finite)
   778     apply (auto simp: distributed_distr_eq_density[symmetric, OF X] sf measure_nonneg)
   795     apply (auto simp: distributed_distr_eq_density[symmetric, OF X] sf)
   779     done
   796     done
   780 qed
   797 qed
   781 
   798 
   782 lemma (in information_space)
   799 lemma (in information_space)
   783   fixes X :: "'a \<Rightarrow> 'b"
   800   fixes X :: "'a \<Rightarrow> 'b"
   784   assumes X[measurable]: "distributed M MX X f"
   801   assumes X[measurable]: "distributed M MX X f" and nn: "\<And>x. x \<in> space MX \<Longrightarrow> 0 \<le> f x"
   785   shows entropy_distr: "entropy b MX X = - (\<integral>x. f x * log b (f x) \<partial>MX)" (is ?eq)
   802   shows entropy_distr: "entropy b MX X = - (\<integral>x. f x * log b (f x) \<partial>MX)" (is ?eq)
   786 proof -
   803 proof -
   787   note D = distributed_measurable[OF X] distributed_borel_measurable[OF X] distributed_AE[OF X]
   804   note D = distributed_measurable[OF X] distributed_borel_measurable[OF X]
   788   note ae = distributed_RN_deriv[OF X]
   805   note ae = distributed_RN_deriv[OF X]
   789 
   806   note distributed_real_measurable[OF nn X, measurable]
   790   have ae_eq: "AE x in distr M MX X. log b (real_of_ereal (RN_deriv MX (distr M MX X) x)) =
   807 
       
   808   have ae_eq: "AE x in distr M MX X. log b (enn2real (RN_deriv MX (distr M MX X) x)) =
   791     log b (f x)"
   809     log b (f x)"
   792     unfolding distributed_distr_eq_density[OF X]
   810     unfolding distributed_distr_eq_density[OF X]
   793     apply (subst AE_density)
   811     apply (subst AE_density)
   794     using D apply simp
   812     using D apply simp
   795     using ae apply eventually_elim
   813     using ae apply eventually_elim
   797     done
   815     done
   798 
   816 
   799   have int_eq: "(\<integral> x. f x * log b (f x) \<partial>MX) = (\<integral> x. log b (f x) \<partial>distr M MX X)"
   817   have int_eq: "(\<integral> x. f x * log b (f x) \<partial>MX) = (\<integral> x. log b (f x) \<partial>distr M MX X)"
   800     unfolding distributed_distr_eq_density[OF X]
   818     unfolding distributed_distr_eq_density[OF X]
   801     using D
   819     using D
   802     by (subst integral_density)
   820     by (subst integral_density) (auto simp: nn)
   803        (auto simp: borel_measurable_ereal_iff)
       
   804 
   821 
   805   show ?eq
   822   show ?eq
   806     unfolding entropy_def KL_divergence_def entropy_density_def comp_def int_eq neg_equal_iff_equal
   823     unfolding entropy_def KL_divergence_def entropy_density_def comp_def int_eq neg_equal_iff_equal
   807     using ae_eq by (intro integral_cong_AE) auto
   824     using ae_eq by (intro integral_cong_AE) (auto simp: nn)
   808 qed
   825 qed
   809 
   826 
   810 lemma (in prob_space) distributed_imp_emeasure_nonzero:
   827 lemma (in information_space) entropy_le:
   811   assumes X: "distributed M MX X Px"
   828   fixes Px :: "'b \<Rightarrow> real" and MX :: "'b measure"
   812   shows "emeasure MX {x \<in> space MX. Px x \<noteq> 0} \<noteq> 0"
   829   assumes X[measurable]: "distributed M MX X Px" and Px_nn[simp]: "\<And>x. x \<in> space MX \<Longrightarrow> 0 \<le> Px x"
   813 proof
   830   and fin: "emeasure MX {x \<in> space MX. Px x \<noteq> 0} \<noteq> top"
   814   note Px = distributed_borel_measurable[OF X] distributed_AE[OF X]
   831   and int: "integrable MX (\<lambda>x. - Px x * log b (Px x))"
       
   832   shows "entropy b MX X \<le> log b (measure MX {x \<in> space MX. Px x \<noteq> 0})"
       
   833 proof -
       
   834   note Px = distributed_borel_measurable[OF X]
   815   interpret X: prob_space "distr M MX X"
   835   interpret X: prob_space "distr M MX X"
   816     using distributed_measurable[OF X] by (rule prob_space_distr)
   836     using distributed_measurable[OF X] by (rule prob_space_distr)
   817 
   837 
   818   assume "emeasure MX {x \<in> space MX. Px x \<noteq> 0} = 0"
   838   have " - log b (measure MX {x \<in> space MX. Px x \<noteq> 0}) =
   819   with Px have "AE x in MX. Px x = 0"
       
   820     by (intro AE_I[OF subset_refl]) (auto simp: borel_measurable_ereal_iff)
       
   821   moreover
       
   822   from X.emeasure_space_1 have "(\<integral>\<^sup>+x. Px x \<partial>MX) = 1"
       
   823     unfolding distributed_distr_eq_density[OF X] using Px
       
   824     by (subst (asm) emeasure_density)
       
   825        (auto simp: borel_measurable_ereal_iff intro!: integral_cong cong: nn_integral_cong)
       
   826   ultimately show False
       
   827     by (simp add: nn_integral_cong_AE)
       
   828 qed
       
   829 
       
   830 lemma (in information_space) entropy_le:
       
   831   fixes Px :: "'b \<Rightarrow> real" and MX :: "'b measure"
       
   832   assumes X[measurable]: "distributed M MX X Px"
       
   833   and fin: "emeasure MX {x \<in> space MX. Px x \<noteq> 0} \<noteq> \<infinity>"
       
   834   and int: "integrable MX (\<lambda>x. - Px x * log b (Px x))"
       
   835   shows "entropy b MX X \<le> log b (measure MX {x \<in> space MX. Px x \<noteq> 0})"
       
   836 proof -
       
   837   note Px = distributed_borel_measurable[OF X] distributed_AE[OF X]
       
   838   interpret X: prob_space "distr M MX X"
       
   839     using distributed_measurable[OF X] by (rule prob_space_distr)
       
   840 
       
   841   have " - log b (measure MX {x \<in> space MX. Px x \<noteq> 0}) = 
       
   842     - log b (\<integral> x. indicator {x \<in> space MX. Px x \<noteq> 0} x \<partial>MX)"
   839     - log b (\<integral> x. indicator {x \<in> space MX. Px x \<noteq> 0} x \<partial>MX)"
   843     using Px fin
   840     using Px Px_nn fin by (auto simp: measure_def)
   844     by (auto simp: measure_def borel_measurable_ereal_iff)
       
   845   also have "- log b (\<integral> x. indicator {x \<in> space MX. Px x \<noteq> 0} x \<partial>MX) = - log b (\<integral> x. 1 / Px x \<partial>distr M MX X)"
   841   also have "- log b (\<integral> x. indicator {x \<in> space MX. Px x \<noteq> 0} x \<partial>MX) = - log b (\<integral> x. 1 / Px x \<partial>distr M MX X)"
   846     unfolding distributed_distr_eq_density[OF X] using Px
   842     unfolding distributed_distr_eq_density[OF X] using Px Px_nn
   847     apply (intro arg_cong[where f="log b"] arg_cong[where f=uminus])
   843     apply (intro arg_cong[where f="log b"] arg_cong[where f=uminus])
   848     by (subst integral_density) (auto simp: borel_measurable_ereal_iff simp del: integral_indicator intro!: integral_cong)
   844     by (subst integral_density) (auto simp del: integral_indicator intro!: integral_cong)
   849   also have "\<dots> \<le> (\<integral> x. - log b (1 / Px x) \<partial>distr M MX X)"
   845   also have "\<dots> \<le> (\<integral> x. - log b (1 / Px x) \<partial>distr M MX X)"
   850   proof (rule X.jensens_inequality[of "\<lambda>x. 1 / Px x" "{0<..}" 0 1 "\<lambda>x. - log b x"])
   846   proof (rule X.jensens_inequality[of "\<lambda>x. 1 / Px x" "{0<..}" 0 1 "\<lambda>x. - log b x"])
   851     show "AE x in distr M MX X. 1 / Px x \<in> {0<..}"
   847     show "AE x in distr M MX X. 1 / Px x \<in> {0<..}"
   852       unfolding distributed_distr_eq_density[OF X]
   848       unfolding distributed_distr_eq_density[OF X]
   853       using Px by (auto simp: AE_density)
   849       using Px by (auto simp: AE_density)
   854     have [simp]: "\<And>x. x \<in> space MX \<Longrightarrow> ereal (if Px x = 0 then 0 else 1) = indicator {x \<in> space MX. Px x \<noteq> 0} x"
   850     have [simp]: "\<And>x. x \<in> space MX \<Longrightarrow> ennreal (if Px x = 0 then 0 else 1) = indicator {x \<in> space MX. Px x \<noteq> 0} x"
   855       by (auto simp: one_ereal_def)
   851       by (auto simp: one_ennreal_def)
   856     have "(\<integral>\<^sup>+ x. max 0 (ereal (- (if Px x = 0 then 0 else 1))) \<partial>MX) = (\<integral>\<^sup>+ x. 0 \<partial>MX)"
   852     have "(\<integral>\<^sup>+ x. ennreal (- (if Px x = 0 then 0 else 1)) \<partial>MX) = (\<integral>\<^sup>+ x. 0 \<partial>MX)"
   857       by (intro nn_integral_cong) (auto split: split_max)
   853       by (intro nn_integral_cong) (auto simp: ennreal_neg)
   858     then show "integrable (distr M MX X) (\<lambda>x. 1 / Px x)"
   854     then show "integrable (distr M MX X) (\<lambda>x. 1 / Px x)"
   859       unfolding distributed_distr_eq_density[OF X] using Px
   855       unfolding distributed_distr_eq_density[OF X] using Px
   860       by (auto simp: nn_integral_density real_integrable_def borel_measurable_ereal_iff fin nn_integral_max_0
   856       by (auto simp: nn_integral_density real_integrable_def fin ennreal_neg ennreal_mult[symmetric]
   861               cong: nn_integral_cong)
   857               cong: nn_integral_cong)
   862     have "integrable MX (\<lambda>x. Px x * log b (1 / Px x)) =
   858     have "integrable MX (\<lambda>x. Px x * log b (1 / Px x)) =
   863       integrable MX (\<lambda>x. - Px x * log b (Px x))"
   859       integrable MX (\<lambda>x. - Px x * log b (Px x))"
   864       using Px
   860       using Px
   865       by (intro integrable_cong_AE)
   861       by (intro integrable_cong_AE) (auto simp: log_divide_eq less_le)
   866          (auto simp: borel_measurable_ereal_iff log_divide_eq
       
   867                   intro!: measurable_If)
       
   868     then show "integrable (distr M MX X) (\<lambda>x. - log b (1 / Px x))"
   862     then show "integrable (distr M MX X) (\<lambda>x. - log b (1 / Px x))"
   869       unfolding distributed_distr_eq_density[OF X]
   863       unfolding distributed_distr_eq_density[OF X]
   870       using Px int
   864       using Px int
   871       by (subst integrable_real_density) (auto simp: borel_measurable_ereal_iff)
   865       by (subst integrable_real_density) auto
   872   qed (auto simp: minus_log_convex[OF b_gt_1])
   866   qed (auto simp: minus_log_convex[OF b_gt_1])
   873   also have "\<dots> = (\<integral> x. log b (Px x) \<partial>distr M MX X)"
   867   also have "\<dots> = (\<integral> x. log b (Px x) \<partial>distr M MX X)"
   874     unfolding distributed_distr_eq_density[OF X] using Px
   868     unfolding distributed_distr_eq_density[OF X] using Px
   875     by (intro integral_cong_AE) (auto simp: AE_density log_divide_eq)
   869     by (intro integral_cong_AE) (auto simp: AE_density log_divide_eq)
   876   also have "\<dots> = - entropy b MX X"
   870   also have "\<dots> = - entropy b MX X"
   877     unfolding distributed_distr_eq_density[OF X] using Px
   871     unfolding distributed_distr_eq_density[OF X] using Px
   878     by (subst entropy_distr[OF X]) (auto simp: borel_measurable_ereal_iff integral_density)
   872     by (subst entropy_distr[OF X]) (auto simp: integral_density)
   879   finally show ?thesis
   873   finally show ?thesis
   880     by simp
   874     by simp
   881 qed
   875 qed
   882 
   876 
   883 lemma (in information_space) entropy_le_space:
   877 lemma (in information_space) entropy_le_space:
   884   fixes Px :: "'b \<Rightarrow> real" and MX :: "'b measure"
   878   fixes Px :: "'b \<Rightarrow> real" and MX :: "'b measure"
   885   assumes X: "distributed M MX X Px"
   879   assumes X: "distributed M MX X Px" and Px_nn[simp]: "\<And>x. x \<in> space MX \<Longrightarrow> 0 \<le> Px x"
   886   and fin: "finite_measure MX"
   880   and fin: "finite_measure MX"
   887   and int: "integrable MX (\<lambda>x. - Px x * log b (Px x))"
   881   and int: "integrable MX (\<lambda>x. - Px x * log b (Px x))"
   888   shows "entropy b MX X \<le> log b (measure MX (space MX))"
   882   shows "entropy b MX X \<le> log b (measure MX (space MX))"
   889 proof -
   883 proof -
   890   note Px = distributed_borel_measurable[OF X] distributed_AE[OF X]
   884   note Px = distributed_borel_measurable[OF X]
   891   interpret finite_measure MX by fact
   885   interpret finite_measure MX by fact
   892   have "entropy b MX X \<le> log b (measure MX {x \<in> space MX. Px x \<noteq> 0})"
   886   have "entropy b MX X \<le> log b (measure MX {x \<in> space MX. Px x \<noteq> 0})"
   893     using int X by (intro entropy_le) auto
   887     using int X by (intro entropy_le) auto
   894   also have "\<dots> \<le> log b (measure MX (space MX))"
   888   also have "\<dots> \<le> log b (measure MX (space MX))"
   895     using Px distributed_imp_emeasure_nonzero[OF X]
   889     using Px distributed_imp_emeasure_nonzero[OF X]
   896     by (intro log_le)
   890     by (intro log_le)
   897        (auto intro!: borel_measurable_ereal_iff finite_measure_mono b_gt_1
   891        (auto intro!: finite_measure_mono b_gt_1 less_le[THEN iffD2]
   898                      less_le[THEN iffD2] measure_nonneg simp: emeasure_eq_measure)
   892              simp: emeasure_eq_measure cong: conj_cong)
   899   finally show ?thesis .
   893   finally show ?thesis .
   900 qed
   894 qed
   901 
   895 
   902 lemma (in information_space) entropy_uniform:
   896 lemma (in information_space) entropy_uniform:
   903   assumes X: "distributed M MX X (\<lambda>x. indicator A x / measure MX A)" (is "distributed _ _ _ ?f")
   897   assumes X: "distributed M MX X (\<lambda>x. indicator A x / measure MX A)" (is "distributed _ _ _ ?f")
   905 proof (subst entropy_distr[OF X])
   899 proof (subst entropy_distr[OF X])
   906   have [simp]: "emeasure MX A \<noteq> \<infinity>"
   900   have [simp]: "emeasure MX A \<noteq> \<infinity>"
   907     using uniform_distributed_params[OF X] by (auto simp add: measure_def)
   901     using uniform_distributed_params[OF X] by (auto simp add: measure_def)
   908   have eq: "(\<integral> x. indicator A x / measure MX A * log b (indicator A x / measure MX A) \<partial>MX) =
   902   have eq: "(\<integral> x. indicator A x / measure MX A * log b (indicator A x / measure MX A) \<partial>MX) =
   909     (\<integral> x. (- log b (measure MX A) / measure MX A) * indicator A x \<partial>MX)"
   903     (\<integral> x. (- log b (measure MX A) / measure MX A) * indicator A x \<partial>MX)"
   910     using measure_nonneg[of MX A] uniform_distributed_params[OF X]
   904     using uniform_distributed_params[OF X]
   911     by (intro integral_cong) (auto split: split_indicator simp: log_divide_eq)
   905     by (intro integral_cong) (auto split: split_indicator simp: log_divide_eq zero_less_measure_iff)
   912   show "- (\<integral> x. indicator A x / measure MX A * log b (indicator A x / measure MX A) \<partial>MX) =
   906   show "- (\<integral> x. indicator A x / measure MX A * log b (indicator A x / measure MX A) \<partial>MX) =
   913     log b (measure MX A)"
   907     log b (measure MX A)"
   914     unfolding eq using uniform_distributed_params[OF X]
   908     unfolding eq using uniform_distributed_params[OF X]
   915     by (subst integral_mult_right) (auto simp: measure_def)
   909     by (subst integral_mult_right) (auto simp: measure_def less_top[symmetric] intro!: integrable_real_indicator)
   916 qed
   910 qed simp
   917 
   911 
   918 lemma (in information_space) entropy_simple_distributed:
   912 lemma (in information_space) entropy_simple_distributed:
   919   "simple_distributed M X f \<Longrightarrow> \<H>(X) = - (\<Sum>x\<in>X`space M. f x * log b (f x))"
   913   "simple_distributed M X f \<Longrightarrow> \<H>(X) = - (\<Sum>x\<in>X`space M. f x * log b (f x))"
   920   by (subst entropy_distr[OF simple_distributed])
   914   by (subst entropy_distr[OF simple_distributed])
   921      (auto simp add: lebesgue_integral_count_space_finite)
   915      (auto simp add: lebesgue_integral_count_space_finite)
   925   shows "\<H>(X) \<le> log b (card (X ` space M \<inter> {x. f x \<noteq> 0}))"
   919   shows "\<H>(X) \<le> log b (card (X ` space M \<inter> {x. f x \<noteq> 0}))"
   926 proof -
   920 proof -
   927   let ?X = "count_space (X`space M)"
   921   let ?X = "count_space (X`space M)"
   928   have "\<H>(X) \<le> log b (measure ?X {x \<in> space ?X. f x \<noteq> 0})"
   922   have "\<H>(X) \<le> log b (measure ?X {x \<in> space ?X. f x \<noteq> 0})"
   929     by (rule entropy_le[OF simple_distributed[OF X]])
   923     by (rule entropy_le[OF simple_distributed[OF X]])
   930        (simp_all add: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space)
   924        (insert X, auto simp: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space)
   931   also have "measure ?X {x \<in> space ?X. f x \<noteq> 0} = card (X ` space M \<inter> {x. f x \<noteq> 0})"
   925   also have "measure ?X {x \<in> space ?X. f x \<noteq> 0} = card (X ` space M \<inter> {x. f x \<noteq> 0})"
   932     by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def Int_def)
   926     by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def Int_def)
   933   finally show ?thesis .
   927   finally show ?thesis .
   934 qed
   928 qed
   935 
   929 
   938   shows "\<H>(X) \<le> log b (real (card (X ` space M)))"
   932   shows "\<H>(X) \<le> log b (real (card (X ` space M)))"
   939 proof -
   933 proof -
   940   let ?X = "count_space (X`space M)"
   934   let ?X = "count_space (X`space M)"
   941   have "\<H>(X) \<le> log b (measure ?X (space ?X))"
   935   have "\<H>(X) \<le> log b (measure ?X (space ?X))"
   942     by (rule entropy_le_space[OF simple_distributed[OF X]])
   936     by (rule entropy_le_space[OF simple_distributed[OF X]])
   943        (simp_all add: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space finite_measure_count_space)
   937        (insert X, auto simp: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space finite_measure_count_space)
   944   also have "measure ?X (space ?X) = card (X ` space M)"
   938   also have "measure ?X (space ?X) = card (X ` space M)"
   945     by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def)
   939     by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def)
   946   finally show ?thesis .
   940   finally show ?thesis .
   947 qed
   941 qed
   948 
   942 
   959     (count_space (X ` space M)) (count_space (Y ` space M)) (count_space (Z ` space M)) X Y Z"
   953     (count_space (X ` space M)) (count_space (Y ` space M)) (count_space (Z ` space M)) X Y Z"
   960 
   954 
   961 lemma (in information_space)
   955 lemma (in information_space)
   962   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" and P: "sigma_finite_measure P"
   956   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" and P: "sigma_finite_measure P"
   963   assumes Px[measurable]: "distributed M S X Px"
   957   assumes Px[measurable]: "distributed M S X Px"
       
   958     and Px_nn[simp]: "\<And>x. x \<in> space S \<Longrightarrow> 0 \<le> Px x"
   964   assumes Pz[measurable]: "distributed M P Z Pz"
   959   assumes Pz[measurable]: "distributed M P Z Pz"
       
   960     and Pz_nn[simp]: "\<And>z. z \<in> space P \<Longrightarrow> 0 \<le> Pz z"
   965   assumes Pyz[measurable]: "distributed M (T \<Otimes>\<^sub>M P) (\<lambda>x. (Y x, Z x)) Pyz"
   961   assumes Pyz[measurable]: "distributed M (T \<Otimes>\<^sub>M P) (\<lambda>x. (Y x, Z x)) Pyz"
       
   962     and Pyz_nn[simp]: "\<And>y z. y \<in> space T \<Longrightarrow> z \<in> space P \<Longrightarrow> 0 \<le> Pyz (y, z)"
   966   assumes Pxz[measurable]: "distributed M (S \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Z x)) Pxz"
   963   assumes Pxz[measurable]: "distributed M (S \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Z x)) Pxz"
       
   964     and Pxz_nn[simp]: "\<And>x z. x \<in> space S \<Longrightarrow> z \<in> space P \<Longrightarrow> 0 \<le> Pxz (x, z)"
   967   assumes Pxyz[measurable]: "distributed M (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x)) Pxyz"
   965   assumes Pxyz[measurable]: "distributed M (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x)) Pxyz"
       
   966     and Pxyz_nn[simp]: "\<And>x y z. x \<in> space S \<Longrightarrow> y \<in> space T \<Longrightarrow> z \<in> space P \<Longrightarrow> 0 \<le> Pxyz (x, y, z)"
   968   assumes I1: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z))))"
   967   assumes I1: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z))))"
   969   assumes I2: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z)))"
   968   assumes I2: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z)))"
   970   shows conditional_mutual_information_generic_eq: "conditional_mutual_information b S T P X Y Z
   969   shows conditional_mutual_information_generic_eq: "conditional_mutual_information b S T P X Y Z
   971     = (\<integral>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" (is "?eq")
   970     = (\<integral>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" (is "?eq")
   972     and conditional_mutual_information_generic_nonneg: "0 \<le> conditional_mutual_information b S T P X Y Z" (is "?nonneg")
   971     and conditional_mutual_information_generic_nonneg: "0 \<le> conditional_mutual_information b S T P X Y Z" (is "?nonneg")
   973 proof -
   972 proof -
       
   973   have [measurable]: "Px \<in> S \<rightarrow>\<^sub>M borel"
       
   974     using Px Px_nn by (intro distributed_real_measurable)
       
   975   have [measurable]: "Pz \<in> P \<rightarrow>\<^sub>M borel"
       
   976     using Pz Pz_nn by (intro distributed_real_measurable)
       
   977   have measurable_Pyz[measurable]: "Pyz \<in> (T \<Otimes>\<^sub>M P) \<rightarrow>\<^sub>M borel"
       
   978     using Pyz Pyz_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure)
       
   979   have measurable_Pxz[measurable]: "Pxz \<in> (S \<Otimes>\<^sub>M P) \<rightarrow>\<^sub>M borel"
       
   980     using Pxz Pxz_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure)
       
   981   have measurable_Pxyz[measurable]: "Pxyz \<in> (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) \<rightarrow>\<^sub>M borel"
       
   982     using Pxyz Pxyz_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure)
       
   983 
   974   interpret S: sigma_finite_measure S by fact
   984   interpret S: sigma_finite_measure S by fact
   975   interpret T: sigma_finite_measure T by fact
   985   interpret T: sigma_finite_measure T by fact
   976   interpret P: sigma_finite_measure P by fact
   986   interpret P: sigma_finite_measure P by fact
   977   interpret TP: pair_sigma_finite T P ..
   987   interpret TP: pair_sigma_finite T P ..
   978   interpret SP: pair_sigma_finite S P ..
   988   interpret SP: pair_sigma_finite S P ..
   982   interpret TPS: pair_sigma_finite "T \<Otimes>\<^sub>M P" S ..
   992   interpret TPS: pair_sigma_finite "T \<Otimes>\<^sub>M P" S ..
   983   have TP: "sigma_finite_measure (T \<Otimes>\<^sub>M P)" ..
   993   have TP: "sigma_finite_measure (T \<Otimes>\<^sub>M P)" ..
   984   have SP: "sigma_finite_measure (S \<Otimes>\<^sub>M P)" ..
   994   have SP: "sigma_finite_measure (S \<Otimes>\<^sub>M P)" ..
   985   have YZ: "random_variable (T \<Otimes>\<^sub>M P) (\<lambda>x. (Y x, Z x))"
   995   have YZ: "random_variable (T \<Otimes>\<^sub>M P) (\<lambda>x. (Y x, Z x))"
   986     using Pyz by (simp add: distributed_measurable)
   996     using Pyz by (simp add: distributed_measurable)
   987   
   997 
   988   from Pxz Pxyz have distr_eq: "distr M (S \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Z x)) =
   998   from Pxz Pxyz have distr_eq: "distr M (S \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Z x)) =
   989     distr (distr M (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x))) (S \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). (x, z))"
   999     distr (distr M (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x))) (S \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). (x, z))"
   990     by (simp add: comp_def distr_distr)
  1000     by (simp add: comp_def distr_distr)
   991 
  1001 
   992   have "mutual_information b S P X Z =
  1002   have "mutual_information b S P X Z =
   993     (\<integral>x. Pxz x * log b (Pxz x / (Px (fst x) * Pz (snd x))) \<partial>(S \<Otimes>\<^sub>M P))"
  1003     (\<integral>x. Pxz x * log b (Pxz x / (Px (fst x) * Pz (snd x))) \<partial>(S \<Otimes>\<^sub>M P))"
   994     by (rule mutual_information_distr[OF S P Px Pz Pxz])
  1004     by (rule mutual_information_distr[OF S P Px Px_nn Pz Pz_nn Pxz Pxz_nn])
   995   also have "\<dots> = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))"
  1005   also have "\<dots> = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))"
   996     using b_gt_1 Pxz Px Pz
  1006     using b_gt_1 Pxz Px Pz
   997     by (subst distributed_transform_integral[OF Pxyz Pxz, where T="\<lambda>(x, y, z). (x, z)"]) (auto simp: split_beta')
  1007     by (subst distributed_transform_integral[OF Pxyz _ Pxz _, where T="\<lambda>(x, y, z). (x, z)"])
       
  1008        (auto simp: split_beta' space_pair_measure)
   998   finally have mi_eq:
  1009   finally have mi_eq:
   999     "mutual_information b S P X Z = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" .
  1010     "mutual_information b S P X Z = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" .
  1000   
  1011 
  1001   have ae1: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Px (fst x) = 0 \<longrightarrow> Pxyz x = 0"
  1012   have ae1: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Px (fst x) = 0 \<longrightarrow> Pxyz x = 0"
  1002     by (intro subdensity_real[of fst, OF _ Pxyz Px]) auto
  1013     by (intro subdensity_real[of fst, OF _ Pxyz Px]) (auto simp: space_pair_measure)
  1003   moreover have ae2: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
  1014   moreover have ae2: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
  1004     by (intro subdensity_real[of "\<lambda>x. snd (snd x)", OF _ Pxyz Pz]) auto
  1015     by (intro subdensity_real[of "\<lambda>x. snd (snd x)", OF _ Pxyz Pz]) (auto simp: space_pair_measure)
  1005   moreover have ae3: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
  1016   moreover have ae3: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
  1006     by (intro subdensity_real[of "\<lambda>x. (fst x, snd (snd x))", OF _ Pxyz Pxz]) auto
  1017     by (intro subdensity_real[of "\<lambda>x. (fst x, snd (snd x))", OF _ Pxyz Pxz]) (auto simp: space_pair_measure)
  1007   moreover have ae4: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
  1018   moreover have ae4: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
  1008     by (intro subdensity_real[of snd, OF _ Pxyz Pyz]) auto
  1019     by (intro subdensity_real[of snd, OF _ Pxyz Pyz]) (auto simp: space_pair_measure)
  1009   moreover have ae5: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Px (fst x)"
       
  1010     using Px by (intro STP.AE_pair_measure) (auto simp: comp_def dest: distributed_real_AE)
       
  1011   moreover have ae6: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pyz (snd x)"
       
  1012     using Pyz by (intro STP.AE_pair_measure) (auto simp: comp_def dest: distributed_real_AE)
       
  1013   moreover have ae7: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pz (snd (snd x))"
       
  1014     using Pz Pz[THEN distributed_real_measurable]
       
  1015     by (auto intro!: TP.AE_pair_measure STP.AE_pair_measure AE_I2[of S] dest: distributed_real_AE)
       
  1016   moreover have ae8: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pxz (fst x, snd (snd x))"
       
  1017     using Pxz[THEN distributed_real_AE, THEN SP.AE_pair]
       
  1018     by (auto intro!: TP.AE_pair_measure STP.AE_pair_measure)
       
  1019   moreover note Pxyz[THEN distributed_real_AE]
       
  1020   ultimately have ae: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P.
  1020   ultimately have ae: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P.
  1021     Pxyz x * log b (Pxyz x / (Px (fst x) * Pyz (snd x))) -
  1021     Pxyz x * log b (Pxyz x / (Px (fst x) * Pyz (snd x))) -
  1022     Pxyz x * log b (Pxz (fst x, snd (snd x)) / (Px (fst x) * Pz (snd (snd x)))) =
  1022     Pxyz x * log b (Pxz (fst x, snd (snd x)) / (Px (fst x) * Pz (snd (snd x)))) =
  1023     Pxyz x * log b (Pxyz x * Pz (snd (snd x)) / (Pxz (fst x, snd (snd x)) * Pyz (snd x))) "
  1023     Pxyz x * log b (Pxyz x * Pz (snd (snd x)) / (Pxz (fst x, snd (snd x)) * Pyz (snd x))) "
       
  1024     using AE_space
  1024   proof eventually_elim
  1025   proof eventually_elim
  1025     case (elim x)
  1026     case (elim x)
  1026     show ?case
  1027     show ?case
  1027     proof cases
  1028     proof cases
  1028       assume "Pxyz x \<noteq> 0"
  1029       assume "Pxyz x \<noteq> 0"
  1029       with elim have "0 < Px (fst x)" "0 < Pz (snd (snd x))" "0 < Pxz (fst x, snd (snd x))"
  1030       with elim have "0 < Px (fst x)" "0 < Pz (snd (snd x))" "0 < Pxz (fst x, snd (snd x))"
  1030         "0 < Pyz (snd x)" "0 < Pxyz x"
  1031         "0 < Pyz (snd x)" "0 < Pxyz x"
  1031         by auto
  1032         by (auto simp: space_pair_measure less_le)
  1032       then show ?thesis
  1033       then show ?thesis
  1033         using b_gt_1 by (simp add: log_simps less_imp_le field_simps)
  1034         using b_gt_1 by (simp add: log_simps less_imp_le field_simps)
  1034     qed simp
  1035     qed simp
  1035   qed
  1036   qed
  1036   with I1 I2 show ?eq
  1037   with I1 I2 show ?eq
  1037     unfolding conditional_mutual_information_def
  1038     unfolding conditional_mutual_information_def
  1038     apply (subst mi_eq)
  1039     apply (subst mi_eq)
  1039     apply (subst mutual_information_distr[OF S TP Px Pyz Pxyz])
  1040     apply (subst mutual_information_distr[OF S TP Px Px_nn Pyz _ Pxyz])
       
  1041     apply (auto simp: space_pair_measure)
  1040     apply (subst integral_diff[symmetric])
  1042     apply (subst integral_diff[symmetric])
  1041     apply (auto intro!: integral_cong_AE simp: split_beta' simp del: integral_diff)
  1043     apply (auto intro!: integral_cong_AE simp: split_beta' simp del: integral_diff)
  1042     done
  1044     done
  1043 
  1045 
  1044   let ?P = "density (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) Pxyz"
  1046   let ?P = "density (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) Pxyz"
  1051     unfolding distributed_distr_eq_density[OF Pyz, symmetric]
  1053     unfolding distributed_distr_eq_density[OF Pyz, symmetric]
  1052     by (rule prob_space_distr) simp
  1054     by (rule prob_space_distr) simp
  1053 
  1055 
  1054   let ?f = "\<lambda>(x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) / Pxyz (x, y, z)"
  1056   let ?f = "\<lambda>(x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) / Pxyz (x, y, z)"
  1055 
  1057 
  1056   from subdensity_real[of snd, OF _ Pyz Pz]
  1058   from subdensity_real[of snd, OF _ Pyz Pz _ AE_I2 AE_I2]
  1057   have aeX1: "AE x in T \<Otimes>\<^sub>M P. Pz (snd x) = 0 \<longrightarrow> Pyz x = 0" by (auto simp: comp_def)
  1059   have aeX1: "AE x in T \<Otimes>\<^sub>M P. Pz (snd x) = 0 \<longrightarrow> Pyz x = 0"
       
  1060     by (auto simp: comp_def space_pair_measure)
  1058   have aeX2: "AE x in T \<Otimes>\<^sub>M P. 0 \<le> Pz (snd x)"
  1061   have aeX2: "AE x in T \<Otimes>\<^sub>M P. 0 \<le> Pz (snd x)"
  1059     using Pz by (intro TP.AE_pair_measure) (auto simp: comp_def dest: distributed_real_AE)
  1062     using Pz by (intro TP.AE_pair_measure) (auto simp: comp_def)
  1060 
  1063 
  1061   have aeX3: "AE y in T \<Otimes>\<^sub>M P. (\<integral>\<^sup>+ x. ereal (Pxz (x, snd y)) \<partial>S) = ereal (Pz (snd y))"
  1064   have aeX3: "AE y in T \<Otimes>\<^sub>M P. (\<integral>\<^sup>+ x. ennreal (Pxz (x, snd y)) \<partial>S) = ennreal (Pz (snd y))"
  1062     using Pz distributed_marginal_eq_joint2[OF P S Pz Pxz]
  1065     using Pz distributed_marginal_eq_joint2[OF P S Pz Pxz]
  1063     by (intro TP.AE_pair_measure) (auto dest: distributed_real_AE)
  1066     by (intro TP.AE_pair_measure) auto
  1064 
  1067 
  1065   have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> (\<integral>\<^sup>+ (x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))"
  1068   have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> (\<integral>\<^sup>+ (x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))"
  1066     apply (subst nn_integral_density)
  1069     by (subst nn_integral_density)
  1067     apply simp
  1070        (auto intro!: nn_integral_mono simp: space_pair_measure ennreal_mult[symmetric])
  1068     apply (rule distributed_AE[OF Pxyz])
  1071   also have "\<dots> = (\<integral>\<^sup>+(y, z). (\<integral>\<^sup>+ x. ennreal (Pxz (x, z)) * ennreal (Pyz (y, z) / Pz z) \<partial>S) \<partial>(T \<Otimes>\<^sub>M P))"
  1069     apply auto []
  1072     by (subst STP.nn_integral_snd[symmetric])
  1070     apply (rule nn_integral_mono_AE)
  1073        (auto simp add: split_beta' ennreal_mult[symmetric] space_pair_measure intro!: nn_integral_cong)
  1071     using ae5 ae6 ae7 ae8
  1074   also have "\<dots> = (\<integral>\<^sup>+x. ennreal (Pyz x) * 1 \<partial>T \<Otimes>\<^sub>M P)"
       
  1075     apply (rule nn_integral_cong_AE)
       
  1076     using aeX1 aeX2 aeX3 AE_space
  1072     apply eventually_elim
  1077     apply eventually_elim
  1073     apply auto
  1078   proof (case_tac x, simp add: space_pair_measure)
  1074     done
  1079     fix a b assume "Pz b = 0 \<longrightarrow> Pyz (a, b) = 0" "a \<in> space T \<and> b \<in> space P"
  1075   also have "\<dots> = (\<integral>\<^sup>+(y, z). \<integral>\<^sup>+ x. ereal (Pxz (x, z)) * ereal (Pyz (y, z) / Pz z) \<partial>S \<partial>T \<Otimes>\<^sub>M P)"
  1080       "(\<integral>\<^sup>+ x. ennreal (Pxz (x, b)) \<partial>S) = ennreal (Pz b)"
  1076     by (subst STP.nn_integral_snd[symmetric]) (auto simp add: split_beta')
  1081     then show "(\<integral>\<^sup>+ x. ennreal (Pxz (x, b)) * ennreal (Pyz (a, b) / Pz b) \<partial>S) = ennreal (Pyz (a, b))"
  1077   also have "\<dots> = (\<integral>\<^sup>+x. ereal (Pyz x) * 1 \<partial>T \<Otimes>\<^sub>M P)"
  1082       by (subst nn_integral_multc) (auto split: prod.split simp: ennreal_mult[symmetric])
  1078     apply (rule nn_integral_cong_AE)
       
  1079     using aeX1 aeX2 aeX3 distributed_AE[OF Pyz] AE_space
       
  1080     apply eventually_elim
       
  1081   proof (case_tac x, simp del: times_ereal.simps add: space_pair_measure)
       
  1082     fix a b assume "Pz b = 0 \<longrightarrow> Pyz (a, b) = 0" "0 \<le> Pz b" "a \<in> space T \<and> b \<in> space P"
       
  1083       "(\<integral>\<^sup>+ x. ereal (Pxz (x, b)) \<partial>S) = ereal (Pz b)" "0 \<le> Pyz (a, b)" 
       
  1084     then show "(\<integral>\<^sup>+ x. ereal (Pxz (x, b)) * ereal (Pyz (a, b) / Pz b) \<partial>S) = ereal (Pyz (a, b))"
       
  1085       by (subst nn_integral_multc)
       
  1086          (auto split: prod.split)
       
  1087   qed
  1083   qed
  1088   also have "\<dots> = 1"
  1084   also have "\<dots> = 1"
  1089     using Q.emeasure_space_1 distributed_AE[OF Pyz] distributed_distr_eq_density[OF Pyz]
  1085     using Q.emeasure_space_1 distributed_distr_eq_density[OF Pyz]
  1090     by (subst nn_integral_density[symmetric]) auto
  1086     by (subst nn_integral_density[symmetric]) auto
  1091   finally have le1: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> 1" .
  1087   finally have le1: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> 1" .
  1092   also have "\<dots> < \<infinity>" by simp
  1088   also have "\<dots> < \<infinity>" by simp
  1093   finally have fin: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>" by simp
  1089   finally have fin: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>" by simp
  1094 
  1090 
  1095   have pos: "(\<integral>\<^sup>+x. ?f x \<partial>?P) \<noteq> 0"
  1091   have pos: "(\<integral>\<^sup>+x. ?f x \<partial>?P) \<noteq> 0"
  1096     apply (subst nn_integral_density)
  1092     apply (subst nn_integral_density)
  1097     apply simp
  1093     apply (simp_all add: split_beta')
  1098     apply (rule distributed_AE[OF Pxyz])
       
  1099     apply auto []
       
  1100     apply (simp add: split_beta')
       
  1101   proof
  1094   proof
  1102     let ?g = "\<lambda>x. ereal (if Pxyz x = 0 then 0 else Pxz (fst x, snd (snd x)) * Pyz (snd x) / Pz (snd (snd x)))"
  1095     let ?g = "\<lambda>x. ennreal (Pxyz x) * (Pxz (fst x, snd (snd x)) * Pyz (snd x) / (Pz (snd (snd x)) * Pxyz x))"
  1103     assume "(\<integral>\<^sup>+x. ?g x \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)) = 0"
  1096     assume "(\<integral>\<^sup>+x. ?g x \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)) = 0"
  1104     then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. ?g x \<le> 0"
  1097     then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. ?g x = 0"
  1105       by (intro nn_integral_0_iff_AE[THEN iffD1]) auto
  1098       by (intro nn_integral_0_iff_AE[THEN iffD1]) auto
  1106     then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxyz x = 0"
  1099     then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxyz x = 0"
  1107       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
  1100       using ae1 ae2 ae3 ae4 AE_space
  1108       by eventually_elim (auto split: if_split_asm simp: mult_le_0_iff divide_le_0_iff)
  1101       by eventually_elim (auto split: if_split_asm simp: mult_le_0_iff divide_le_0_iff space_pair_measure)
  1109     then have "(\<integral>\<^sup>+ x. ereal (Pxyz x) \<partial>S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) = 0"
  1102     then have "(\<integral>\<^sup>+ x. ennreal (Pxyz x) \<partial>S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) = 0"
  1110       by (subst nn_integral_cong_AE[of _ "\<lambda>x. 0"]) auto
  1103       by (subst nn_integral_cong_AE[of _ "\<lambda>x. 0"]) auto
  1111     with P.emeasure_space_1 show False
  1104     with P.emeasure_space_1 show False
  1112       by (subst (asm) emeasure_density) (auto cong: nn_integral_cong)
  1105       by (subst (asm) emeasure_density) (auto cong: nn_integral_cong)
  1113   qed
  1106   qed
  1114 
  1107 
  1115   have neg: "(\<integral>\<^sup>+ x. - ?f x \<partial>?P) = 0"
  1108   have neg: "(\<integral>\<^sup>+ x. - ?f x \<partial>?P) = 0"
  1116     apply (rule nn_integral_0_iff_AE[THEN iffD2])
  1109     apply (rule nn_integral_0_iff_AE[THEN iffD2])
  1117     apply simp
  1110     apply simp
  1118     apply (subst AE_density)
  1111     apply (subst AE_density)
  1119     apply simp
  1112     apply (auto simp: space_pair_measure ennreal_neg)
  1120     using ae5 ae6 ae7 ae8
       
  1121     apply eventually_elim
       
  1122     apply auto
       
  1123     done
  1113     done
  1124 
  1114 
  1125   have I3: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
  1115   have I3: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
  1126     apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ integrable_diff[OF I1 I2]])
  1116     apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ integrable_diff[OF I1 I2]])
  1127     using ae
  1117     using ae
  1140     qed simp
  1130     qed simp
  1141     then have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) = (\<integral>x. ?f x \<partial>?P)"
  1131     then have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) = (\<integral>x. ?f x \<partial>?P)"
  1142       apply (rule nn_integral_eq_integral)
  1132       apply (rule nn_integral_eq_integral)
  1143       apply (subst AE_density)
  1133       apply (subst AE_density)
  1144       apply simp
  1134       apply simp
  1145       using ae5 ae6 ae7 ae8
  1135       apply (auto simp: space_pair_measure ennreal_neg)
  1146       apply eventually_elim
       
  1147       apply auto
       
  1148       done
  1136       done
  1149     with nn_integral_nonneg[of ?P ?f] pos le1
  1137     with pos le1
  1150     show "0 < (\<integral>x. ?f x \<partial>?P)" "(\<integral>x. ?f x \<partial>?P) \<le> 1"
  1138     show "0 < (\<integral>x. ?f x \<partial>?P)" "(\<integral>x. ?f x \<partial>?P) \<le> 1"
  1151       by (simp_all add: one_ereal_def)
  1139       by (simp_all add: one_ennreal.rep_eq zero_less_iff_neq_zero[symmetric])
  1152   qed
  1140   qed
  1153   also have "- log b (integral\<^sup>L ?P ?f) \<le> (\<integral> x. - log b (?f x) \<partial>?P)"
  1141   also have "- log b (integral\<^sup>L ?P ?f) \<le> (\<integral> x. - log b (?f x) \<partial>?P)"
  1154   proof (rule P.jensens_inequality[where a=0 and b=1 and I="{0<..}"])
  1142   proof (rule P.jensens_inequality[where a=0 and b=1 and I="{0<..}"])
  1155     show "AE x in ?P. ?f x \<in> {0<..}"
  1143     show "AE x in ?P. ?f x \<in> {0<..}"
  1156       unfolding AE_density[OF distributed_borel_measurable[OF Pxyz]]
  1144       unfolding AE_density[OF distributed_borel_measurable[OF Pxyz]]
  1157       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
  1145       using ae1 ae2 ae3 ae4 AE_space
  1158       by eventually_elim (auto)
  1146       by eventually_elim (auto simp: space_pair_measure less_le)
  1159     show "integrable ?P ?f"
  1147     show "integrable ?P ?f"
  1160       unfolding real_integrable_def 
  1148       unfolding real_integrable_def
  1161       using fin neg by (auto simp: split_beta')
  1149       using fin neg by (auto simp: split_beta')
  1162     show "integrable ?P (\<lambda>x. - log b (?f x))"
  1150     show "integrable ?P (\<lambda>x. - log b (?f x))"
  1163       apply (subst integrable_real_density)
  1151       apply (subst integrable_real_density)
  1164       apply simp
  1152       apply simp
  1165       apply (auto intro!: distributed_real_AE[OF Pxyz]) []
  1153       apply (auto simp: space_pair_measure) []
  1166       apply simp
  1154       apply simp
  1167       apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ I3])
  1155       apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ I3])
  1168       apply simp
  1156       apply simp
  1169       apply simp
  1157       apply simp
  1170       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
  1158       using ae1 ae2 ae3 ae4 AE_space
  1171       apply eventually_elim
  1159       apply eventually_elim
  1172       apply (auto simp: log_divide_eq log_mult_eq zero_le_mult_iff zero_less_mult_iff zero_less_divide_iff field_simps)
  1160       apply (auto simp: log_divide_eq log_mult_eq zero_le_mult_iff zero_less_mult_iff zero_less_divide_iff field_simps
       
  1161         less_le space_pair_measure)
  1173       done
  1162       done
  1174   qed (auto simp: b_gt_1 minus_log_convex)
  1163   qed (auto simp: b_gt_1 minus_log_convex)
  1175   also have "\<dots> = conditional_mutual_information b S T P X Y Z"
  1164   also have "\<dots> = conditional_mutual_information b S T P X Y Z"
  1176     unfolding \<open>?eq\<close>
  1165     unfolding \<open>?eq\<close>
  1177     apply (subst integral_real_density)
  1166     apply (subst integral_real_density)
  1178     apply simp
  1167     apply simp
  1179     apply (auto intro!: distributed_real_AE[OF Pxyz]) []
  1168     apply (auto simp: space_pair_measure) []
  1180     apply simp
  1169     apply simp
  1181     apply (intro integral_cong_AE)
  1170     apply (intro integral_cong_AE)
  1182     using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
  1171     using ae1 ae2 ae3 ae4
  1183     apply (auto simp: log_divide_eq zero_less_mult_iff zero_less_divide_iff field_simps)
  1172     apply (auto simp: log_divide_eq zero_less_mult_iff zero_less_divide_iff field_simps
       
  1173       space_pair_measure less_le)
  1184     done
  1174     done
  1185   finally show ?nonneg
  1175   finally show ?nonneg
  1186     by simp
  1176     by simp
  1187 qed
  1177 qed
  1188 
  1178 
  1202   note Pz = Fz[THEN finite_entropy_distributed, measurable]
  1192   note Pz = Fz[THEN finite_entropy_distributed, measurable]
  1203   note Pyz = Fyz[THEN finite_entropy_distributed, measurable]
  1193   note Pyz = Fyz[THEN finite_entropy_distributed, measurable]
  1204   note Pxz = Fxz[THEN finite_entropy_distributed, measurable]
  1194   note Pxz = Fxz[THEN finite_entropy_distributed, measurable]
  1205   note Pxyz = Fxyz[THEN finite_entropy_distributed, measurable]
  1195   note Pxyz = Fxyz[THEN finite_entropy_distributed, measurable]
  1206 
  1196 
       
  1197   note Px_nn = Fx[THEN finite_entropy_nn]
       
  1198   note Pz_nn = Fz[THEN finite_entropy_nn]
       
  1199   note Pyz_nn = Fyz[THEN finite_entropy_nn]
       
  1200   note Pxz_nn = Fxz[THEN finite_entropy_nn]
       
  1201   note Pxyz_nn = Fxyz[THEN finite_entropy_nn]
       
  1202 
       
  1203   note Px' = Fx[THEN finite_entropy_measurable, measurable]
       
  1204   note Pz' = Fz[THEN finite_entropy_measurable, measurable]
       
  1205   note Pyz' = Fyz[THEN finite_entropy_measurable, measurable]
       
  1206   note Pxz' = Fxz[THEN finite_entropy_measurable, measurable]
       
  1207   note Pxyz' = Fxyz[THEN finite_entropy_measurable, measurable]
       
  1208 
  1207   interpret S: sigma_finite_measure S by fact
  1209   interpret S: sigma_finite_measure S by fact
  1208   interpret T: sigma_finite_measure T by fact
  1210   interpret T: sigma_finite_measure T by fact
  1209   interpret P: sigma_finite_measure P by fact
  1211   interpret P: sigma_finite_measure P by fact
  1210   interpret TP: pair_sigma_finite T P ..
  1212   interpret TP: pair_sigma_finite T P ..
  1211   interpret SP: pair_sigma_finite S P ..
  1213   interpret SP: pair_sigma_finite S P ..
  1220     distr (distr M (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x))) (S \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). (x, z))"
  1222     distr (distr M (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x))) (S \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). (x, z))"
  1221     by (simp add: distr_distr comp_def)
  1223     by (simp add: distr_distr comp_def)
  1222 
  1224 
  1223   have "mutual_information b S P X Z =
  1225   have "mutual_information b S P X Z =
  1224     (\<integral>x. Pxz x * log b (Pxz x / (Px (fst x) * Pz (snd x))) \<partial>(S \<Otimes>\<^sub>M P))"
  1226     (\<integral>x. Pxz x * log b (Pxz x / (Px (fst x) * Pz (snd x))) \<partial>(S \<Otimes>\<^sub>M P))"
  1225     by (rule mutual_information_distr[OF S P Px Pz Pxz])
  1227     using Px Px_nn Pz Pz_nn Pxz Pxz_nn
       
  1228     by (rule mutual_information_distr[OF S P]) (auto simp: space_pair_measure)
  1226   also have "\<dots> = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))"
  1229   also have "\<dots> = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))"
  1227     using b_gt_1 Pxz Px Pz
  1230     using b_gt_1 Pxz Pxz_nn Pxyz Pxyz_nn
  1228     by (subst distributed_transform_integral[OF Pxyz Pxz, where T="\<lambda>(x, y, z). (x, z)"])
  1231     by (subst distributed_transform_integral[OF Pxyz _ Pxz, where T="\<lambda>(x, y, z). (x, z)"])
  1229        (auto simp: split_beta')
  1232        (auto simp: split_beta')
  1230   finally have mi_eq:
  1233   finally have mi_eq:
  1231     "mutual_information b S P X Z = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" .
  1234     "mutual_information b S P X Z = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" .
  1232   
  1235 
  1233   have ae1: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Px (fst x) = 0 \<longrightarrow> Pxyz x = 0"
  1236   have ae1: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Px (fst x) = 0 \<longrightarrow> Pxyz x = 0"
  1234     by (intro subdensity_real[of fst, OF _ Pxyz Px]) auto
  1237     by (intro subdensity_finite_entropy[of fst, OF _ Fxyz Fx]) auto
  1235   moreover have ae2: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
  1238   moreover have ae2: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
  1236     by (intro subdensity_real[of "\<lambda>x. snd (snd x)", OF _ Pxyz Pz]) auto
  1239     by (intro subdensity_finite_entropy[of "\<lambda>x. snd (snd x)", OF _ Fxyz Fz]) auto
  1237   moreover have ae3: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
  1240   moreover have ae3: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
  1238     by (intro subdensity_real[of "\<lambda>x. (fst x, snd (snd x))", OF _ Pxyz Pxz]) auto
  1241     by (intro subdensity_finite_entropy[of "\<lambda>x. (fst x, snd (snd x))", OF _ Fxyz Fxz]) auto
  1239   moreover have ae4: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
  1242   moreover have ae4: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
  1240     by (intro subdensity_real[of snd, OF _ Pxyz Pyz]) auto
  1243     by (intro subdensity_finite_entropy[of snd, OF _ Fxyz Fyz]) auto
  1241   moreover have ae5: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Px (fst x)"
       
  1242     using Px by (intro STP.AE_pair_measure) (auto dest: distributed_real_AE)
       
  1243   moreover have ae6: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pyz (snd x)"
       
  1244     using Pyz by (intro STP.AE_pair_measure) (auto dest: distributed_real_AE)
       
  1245   moreover have ae7: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pz (snd (snd x))"
       
  1246     using Pz Pz[THEN distributed_real_measurable] by (auto intro!: TP.AE_pair_measure STP.AE_pair_measure AE_I2[of S] dest: distributed_real_AE)
       
  1247   moreover have ae8: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 0 \<le> Pxz (fst x, snd (snd x))"
       
  1248     using Pxz[THEN distributed_real_AE, THEN SP.AE_pair]
       
  1249     by (auto intro!: TP.AE_pair_measure STP.AE_pair_measure simp: comp_def)
       
  1250   moreover note ae9 = Pxyz[THEN distributed_real_AE]
       
  1251   ultimately have ae: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P.
  1244   ultimately have ae: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P.
  1252     Pxyz x * log b (Pxyz x / (Px (fst x) * Pyz (snd x))) -
  1245     Pxyz x * log b (Pxyz x / (Px (fst x) * Pyz (snd x))) -
  1253     Pxyz x * log b (Pxz (fst x, snd (snd x)) / (Px (fst x) * Pz (snd (snd x)))) =
  1246     Pxyz x * log b (Pxz (fst x, snd (snd x)) / (Px (fst x) * Pz (snd (snd x)))) =
  1254     Pxyz x * log b (Pxyz x * Pz (snd (snd x)) / (Pxz (fst x, snd (snd x)) * Pyz (snd x))) "
  1247     Pxyz x * log b (Pxyz x * Pz (snd (snd x)) / (Pxz (fst x, snd (snd x)) * Pyz (snd x))) "
       
  1248     using AE_space
  1255   proof eventually_elim
  1249   proof eventually_elim
  1256     case (elim x)
  1250     case (elim x)
  1257     show ?case
  1251     show ?case
  1258     proof cases
  1252     proof cases
  1259       assume "Pxyz x \<noteq> 0"
  1253       assume "Pxyz x \<noteq> 0"
  1260       with elim have "0 < Px (fst x)" "0 < Pz (snd (snd x))" "0 < Pxz (fst x, snd (snd x))"
  1254       with elim have "0 < Px (fst x)" "0 < Pz (snd (snd x))" "0 < Pxz (fst x, snd (snd x))"
  1261         "0 < Pyz (snd x)" "0 < Pxyz x"
  1255         "0 < Pyz (snd x)" "0 < Pxyz x"
  1262         by auto
  1256         using Px_nn Pz_nn Pxz_nn Pyz_nn Pxyz_nn
       
  1257         by (auto simp: space_pair_measure less_le)
  1263       then show ?thesis
  1258       then show ?thesis
  1264         using b_gt_1 by (simp add: log_simps less_imp_le field_simps)
  1259         using b_gt_1 by (simp add: log_simps less_imp_le field_simps)
  1265     qed simp
  1260     qed simp
  1266   qed
  1261   qed
  1267 
  1262 
  1268   have "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)
  1263   have "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)
  1269     (\<lambda>x. Pxyz x * log b (Pxyz x) - Pxyz x * log b (Px (fst x)) - Pxyz x * log b (Pyz (snd x)))"
  1264     (\<lambda>x. Pxyz x * log b (Pxyz x) - Pxyz x * log b (Px (fst x)) - Pxyz x * log b (Pyz (snd x)))"
  1270     using finite_entropy_integrable[OF Fxyz]
  1265     using finite_entropy_integrable[OF Fxyz]
  1271     using finite_entropy_integrable_transform[OF Fx Pxyz, of fst]
  1266     using finite_entropy_integrable_transform[OF Fx Pxyz Pxyz_nn, of fst]
  1272     using finite_entropy_integrable_transform[OF Fyz Pxyz, of snd]
  1267     using finite_entropy_integrable_transform[OF Fyz Pxyz Pxyz_nn, of snd]
  1273     by simp
  1268     by simp
  1274   moreover have "(\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z)))) \<in> borel_measurable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)"
  1269   moreover have "(\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z)))) \<in> borel_measurable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)"
  1275     using Pxyz Px Pyz by simp
  1270     using Pxyz Px Pyz by simp
  1276   ultimately have I1: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z))))"
  1271   ultimately have I1: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z))))"
  1277     apply (rule integrable_cong_AE_imp)
  1272     apply (rule integrable_cong_AE_imp)
  1278     using ae1 ae4 ae5 ae6 ae9
  1273     using ae1 ae4 AE_space
  1279     by eventually_elim
  1274     by eventually_elim
  1280        (auto simp: log_divide_eq log_mult_eq field_simps zero_less_mult_iff)
  1275        (insert Px_nn Pyz_nn Pxyz_nn,
       
  1276         auto simp: log_divide_eq log_mult_eq field_simps zero_less_mult_iff space_pair_measure less_le)
  1281 
  1277 
  1282   have "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)
  1278   have "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)
  1283     (\<lambda>x. Pxyz x * log b (Pxz (fst x, snd (snd x))) - Pxyz x * log b (Px (fst x)) - Pxyz x * log b (Pz (snd (snd x))))"
  1279     (\<lambda>x. Pxyz x * log b (Pxz (fst x, snd (snd x))) - Pxyz x * log b (Px (fst x)) - Pxyz x * log b (Pz (snd (snd x))))"
  1284     using finite_entropy_integrable_transform[OF Fxz Pxyz, of "\<lambda>x. (fst x, snd (snd x))"]
  1280     using finite_entropy_integrable_transform[OF Fxz Pxyz Pxyz_nn, of "\<lambda>x. (fst x, snd (snd x))"]
  1285     using finite_entropy_integrable_transform[OF Fx Pxyz, of fst]
  1281     using finite_entropy_integrable_transform[OF Fx Pxyz Pxyz_nn, of fst]
  1286     using finite_entropy_integrable_transform[OF Fz Pxyz, of "snd \<circ> snd"]
  1282     using finite_entropy_integrable_transform[OF Fz Pxyz Pxyz_nn, of "snd \<circ> snd"]
  1287     by simp
  1283     by simp
  1288   moreover have "(\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z))) \<in> borel_measurable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)"
  1284   moreover have "(\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z))) \<in> borel_measurable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)"
  1289     using Pxyz Px Pz
  1285     using Pxyz Px Pz
  1290     by auto
  1286     by auto
  1291   ultimately have I2: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z)))"
  1287   ultimately have I2: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z)))"
  1292     apply (rule integrable_cong_AE_imp)
  1288     apply (rule integrable_cong_AE_imp)
  1293     using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 ae9
  1289     using ae1 ae2 ae3 ae4 AE_space
  1294     by eventually_elim
  1290     by eventually_elim
  1295        (auto simp: log_divide_eq log_mult_eq field_simps zero_less_mult_iff)
  1291        (insert Px_nn Pz_nn Pxz_nn Pyz_nn Pxyz_nn,
       
  1292          auto simp: log_divide_eq log_mult_eq field_simps zero_less_mult_iff less_le space_pair_measure)
  1296 
  1293 
  1297   from ae I1 I2 show ?eq
  1294   from ae I1 I2 show ?eq
  1298     unfolding conditional_mutual_information_def
  1295     unfolding conditional_mutual_information_def
  1299     apply (subst mi_eq)
  1296     apply (subst mi_eq)
  1300     apply (subst mutual_information_distr[OF S TP Px Pyz Pxyz])
  1297     apply (subst mutual_information_distr[OF S TP Px Px_nn Pyz Pyz_nn Pxyz Pxyz_nn])
       
  1298     apply simp
       
  1299     apply simp
       
  1300     apply (simp add: space_pair_measure)
  1301     apply (subst integral_diff[symmetric])
  1301     apply (subst integral_diff[symmetric])
  1302     apply (auto intro!: integral_cong_AE simp: split_beta' simp del: integral_diff)
  1302     apply (auto intro!: integral_cong_AE simp: split_beta' simp del: integral_diff)
  1303     done
  1303     done
  1304 
  1304 
  1305   let ?P = "density (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) Pxyz"
  1305   let ?P = "density (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) Pxyz"
  1310   interpret Q: prob_space ?Q
  1310   interpret Q: prob_space ?Q
  1311     unfolding distributed_distr_eq_density[OF Pyz, symmetric] by (rule prob_space_distr) simp
  1311     unfolding distributed_distr_eq_density[OF Pyz, symmetric] by (rule prob_space_distr) simp
  1312 
  1312 
  1313   let ?f = "\<lambda>(x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) / Pxyz (x, y, z)"
  1313   let ?f = "\<lambda>(x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) / Pxyz (x, y, z)"
  1314 
  1314 
  1315   from subdensity_real[of snd, OF _ Pyz Pz]
  1315   from subdensity_finite_entropy[of snd, OF _ Fyz Fz]
  1316   have aeX1: "AE x in T \<Otimes>\<^sub>M P. Pz (snd x) = 0 \<longrightarrow> Pyz x = 0" by (auto simp: comp_def)
  1316   have aeX1: "AE x in T \<Otimes>\<^sub>M P. Pz (snd x) = 0 \<longrightarrow> Pyz x = 0" by (auto simp: comp_def)
  1317   have aeX2: "AE x in T \<Otimes>\<^sub>M P. 0 \<le> Pz (snd x)"
  1317   have aeX2: "AE x in T \<Otimes>\<^sub>M P. 0 \<le> Pz (snd x)"
  1318     using Pz by (intro TP.AE_pair_measure) (auto dest: distributed_real_AE)
  1318     using Pz by (intro TP.AE_pair_measure) (auto intro: Pz_nn)
  1319 
  1319 
  1320   have aeX3: "AE y in T \<Otimes>\<^sub>M P. (\<integral>\<^sup>+ x. ereal (Pxz (x, snd y)) \<partial>S) = ereal (Pz (snd y))"
  1320   have aeX3: "AE y in T \<Otimes>\<^sub>M P. (\<integral>\<^sup>+ x. ennreal (Pxz (x, snd y)) \<partial>S) = ennreal (Pz (snd y))"
  1321     using Pz distributed_marginal_eq_joint2[OF P S Pz Pxz]
  1321     using Pz distributed_marginal_eq_joint2[OF P S Pz Pxz]
  1322     by (intro TP.AE_pair_measure) (auto dest: distributed_real_AE)
  1322     by (intro TP.AE_pair_measure) (auto )
  1323   have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> (\<integral>\<^sup>+ (x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))"
  1323   have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> (\<integral>\<^sup>+ (x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))"
  1324     apply (subst nn_integral_density)
  1324     using Px_nn Pz_nn Pxz_nn Pyz_nn Pxyz_nn
  1325     apply (rule distributed_borel_measurable[OF Pxyz])
  1325     by (subst nn_integral_density)
  1326     apply (rule distributed_AE[OF Pxyz])
  1326        (auto intro!: nn_integral_mono simp: space_pair_measure ennreal_mult[symmetric])
  1327     apply simp
  1327   also have "\<dots> = (\<integral>\<^sup>+(y, z). \<integral>\<^sup>+ x. ennreal (Pxz (x, z)) * ennreal (Pyz (y, z) / Pz z) \<partial>S \<partial>T \<Otimes>\<^sub>M P)"
  1328     apply (rule nn_integral_mono_AE)
  1328     using Px_nn Pz_nn Pxz_nn Pyz_nn Pxyz_nn
  1329     using ae5 ae6 ae7 ae8
  1329     by (subst STP.nn_integral_snd[symmetric])
       
  1330        (auto simp add: split_beta' ennreal_mult[symmetric] space_pair_measure intro!: nn_integral_cong)
       
  1331   also have "\<dots> = (\<integral>\<^sup>+x. ennreal (Pyz x) * 1 \<partial>T \<Otimes>\<^sub>M P)"
       
  1332     apply (rule nn_integral_cong_AE)
       
  1333     using aeX1 aeX2 aeX3 AE_space
  1330     apply eventually_elim
  1334     apply eventually_elim
  1331     apply auto
  1335   proof (case_tac x, simp add: space_pair_measure)
  1332     done
       
  1333   also have "\<dots> = (\<integral>\<^sup>+(y, z). \<integral>\<^sup>+ x. ereal (Pxz (x, z)) * ereal (Pyz (y, z) / Pz z) \<partial>S \<partial>T \<Otimes>\<^sub>M P)"
       
  1334     by (subst STP.nn_integral_snd[symmetric]) (auto simp add: split_beta')
       
  1335   also have "\<dots> = (\<integral>\<^sup>+x. ereal (Pyz x) * 1 \<partial>T \<Otimes>\<^sub>M P)"
       
  1336     apply (rule nn_integral_cong_AE)
       
  1337     using aeX1 aeX2 aeX3 distributed_AE[OF Pyz] AE_space
       
  1338     apply eventually_elim
       
  1339   proof (case_tac x, simp del: times_ereal.simps add: space_pair_measure)
       
  1340     fix a b assume "Pz b = 0 \<longrightarrow> Pyz (a, b) = 0" "0 \<le> Pz b" "a \<in> space T \<and> b \<in> space P"
  1336     fix a b assume "Pz b = 0 \<longrightarrow> Pyz (a, b) = 0" "0 \<le> Pz b" "a \<in> space T \<and> b \<in> space P"
  1341       "(\<integral>\<^sup>+ x. ereal (Pxz (x, b)) \<partial>S) = ereal (Pz b)" "0 \<le> Pyz (a, b)" 
  1337       "(\<integral>\<^sup>+ x. ennreal (Pxz (x, b)) \<partial>S) = ennreal (Pz b)"
  1342     then show "(\<integral>\<^sup>+ x. ereal (Pxz (x, b)) * ereal (Pyz (a, b) / Pz b) \<partial>S) = ereal (Pyz (a, b))"
  1338     then show "(\<integral>\<^sup>+ x. ennreal (Pxz (x, b)) * ennreal (Pyz (a, b) / Pz b) \<partial>S) = ennreal (Pyz (a, b))"
  1343       by (subst nn_integral_multc) auto
  1339       using Pyz_nn[of "(a,b)"]
       
  1340       by (subst nn_integral_multc) (auto simp: space_pair_measure ennreal_mult[symmetric])
  1344   qed
  1341   qed
  1345   also have "\<dots> = 1"
  1342   also have "\<dots> = 1"
  1346     using Q.emeasure_space_1 distributed_AE[OF Pyz] distributed_distr_eq_density[OF Pyz]
  1343     using Q.emeasure_space_1 Pyz_nn distributed_distr_eq_density[OF Pyz]
  1347     by (subst nn_integral_density[symmetric]) auto
  1344     by (subst nn_integral_density[symmetric]) auto
  1348   finally have le1: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> 1" .
  1345   finally have le1: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> 1" .
  1349   also have "\<dots> < \<infinity>" by simp
  1346   also have "\<dots> < \<infinity>" by simp
  1350   finally have fin: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>" by simp
  1347   finally have fin: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>" by simp
  1351 
  1348 
  1352   have pos: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> 0"
  1349   have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> 0"
       
  1350     using Pxyz_nn
  1353     apply (subst nn_integral_density)
  1351     apply (subst nn_integral_density)
  1354     apply simp
  1352     apply (simp_all add: split_beta'  ennreal_mult'[symmetric] cong: nn_integral_cong)
  1355     apply (rule distributed_AE[OF Pxyz])
       
  1356     apply simp
       
  1357     apply (simp add: split_beta')
       
  1358   proof
  1353   proof
  1359     let ?g = "\<lambda>x. ereal (if Pxyz x = 0 then 0 else Pxz (fst x, snd (snd x)) * Pyz (snd x) / Pz (snd (snd x)))"
  1354     let ?g = "\<lambda>x. ennreal (if Pxyz x = 0 then 0 else Pxz (fst x, snd (snd x)) * Pyz (snd x) / Pz (snd (snd x)))"
  1360     assume "(\<integral>\<^sup>+ x. ?g x \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)) = 0"
  1355     assume "(\<integral>\<^sup>+ x. ?g x \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)) = 0"
  1361     then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. ?g x \<le> 0"
  1356     then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. ?g x = 0"
  1362       by (intro nn_integral_0_iff_AE[THEN iffD1]) (auto intro!: borel_measurable_ereal measurable_If)
  1357       by (intro nn_integral_0_iff_AE[THEN iffD1]) auto
  1363     then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxyz x = 0"
  1358     then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxyz x = 0"
  1364       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
  1359       using ae1 ae2 ae3 ae4 AE_space
  1365       by eventually_elim (auto split: if_split_asm simp: mult_le_0_iff divide_le_0_iff)
  1360       by eventually_elim
  1366     then have "(\<integral>\<^sup>+ x. ereal (Pxyz x) \<partial>S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) = 0"
  1361          (insert Px_nn Pz_nn Pxz_nn Pyz_nn,
       
  1362            auto split: if_split_asm simp: mult_le_0_iff divide_le_0_iff space_pair_measure)
       
  1363     then have "(\<integral>\<^sup>+ x. ennreal (Pxyz x) \<partial>S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) = 0"
  1367       by (subst nn_integral_cong_AE[of _ "\<lambda>x. 0"]) auto
  1364       by (subst nn_integral_cong_AE[of _ "\<lambda>x. 0"]) auto
  1368     with P.emeasure_space_1 show False
  1365     with P.emeasure_space_1 show False
  1369       by (subst (asm) emeasure_density) (auto cong: nn_integral_cong)
  1366       by (subst (asm) emeasure_density) (auto cong: nn_integral_cong)
  1370   qed
  1367   qed
       
  1368   then have pos: "0 < (\<integral>\<^sup>+ x. ?f x \<partial>?P)"
       
  1369     by (simp add: zero_less_iff_neq_zero)
  1371 
  1370 
  1372   have neg: "(\<integral>\<^sup>+ x. - ?f x \<partial>?P) = 0"
  1371   have neg: "(\<integral>\<^sup>+ x. - ?f x \<partial>?P) = 0"
  1373     apply (rule nn_integral_0_iff_AE[THEN iffD2])
  1372     using Pz_nn Pxz_nn Pyz_nn Pxyz_nn
  1374     apply (auto simp: split_beta') []
  1373     by (intro nn_integral_0_iff_AE[THEN iffD2])
  1375     apply (subst AE_density)
  1374        (auto simp: split_beta' AE_density space_pair_measure intro!: AE_I2 ennreal_neg)
  1376     apply (auto simp: split_beta') []
       
  1377     using ae5 ae6 ae7 ae8
       
  1378     apply eventually_elim
       
  1379     apply auto
       
  1380     done
       
  1381 
  1375 
  1382   have I3: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
  1376   have I3: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
  1383     apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ integrable_diff[OF I1 I2]])
  1377     apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ integrable_diff[OF I1 I2]])
  1384     using ae
  1378     using ae
  1385     apply (auto simp: split_beta')
  1379     apply (auto simp: split_beta')
  1394         by simp
  1388         by simp
  1395       from fin show "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>"
  1389       from fin show "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>"
  1396         by simp
  1390         by simp
  1397     qed simp
  1391     qed simp
  1398     then have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) = (\<integral>x. ?f x \<partial>?P)"
  1392     then have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) = (\<integral>x. ?f x \<partial>?P)"
  1399       apply (rule nn_integral_eq_integral)
  1393       using Pz_nn Pxz_nn Pyz_nn Pxyz_nn
  1400       apply (subst AE_density)
  1394       by (intro nn_integral_eq_integral)
  1401       apply simp
  1395          (auto simp: AE_density space_pair_measure)
  1402       using ae5 ae6 ae7 ae8
  1396     with pos le1
  1403       apply eventually_elim
       
  1404       apply auto
       
  1405       done
       
  1406     with nn_integral_nonneg[of ?P ?f] pos le1
       
  1407     show "0 < (\<integral>x. ?f x \<partial>?P)" "(\<integral>x. ?f x \<partial>?P) \<le> 1"
  1397     show "0 < (\<integral>x. ?f x \<partial>?P)" "(\<integral>x. ?f x \<partial>?P) \<le> 1"
  1408       by (simp_all add: one_ereal_def)
  1398       by (simp_all add: )
  1409   qed
  1399   qed
  1410   also have "- log b (integral\<^sup>L ?P ?f) \<le> (\<integral> x. - log b (?f x) \<partial>?P)"
  1400   also have "- log b (integral\<^sup>L ?P ?f) \<le> (\<integral> x. - log b (?f x) \<partial>?P)"
  1411   proof (rule P.jensens_inequality[where a=0 and b=1 and I="{0<..}"])
  1401   proof (rule P.jensens_inequality[where a=0 and b=1 and I="{0<..}"])
  1412     show "AE x in ?P. ?f x \<in> {0<..}"
  1402     show "AE x in ?P. ?f x \<in> {0<..}"
  1413       unfolding AE_density[OF distributed_borel_measurable[OF Pxyz]]
  1403       unfolding AE_density[OF distributed_borel_measurable[OF Pxyz]]
  1414       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
  1404       using ae1 ae2 ae3 ae4 AE_space
  1415       by eventually_elim (auto)
  1405       by eventually_elim (insert Pxyz_nn Pyz_nn Pz_nn Pxz_nn, auto simp: space_pair_measure less_le)
  1416     show "integrable ?P ?f"
  1406     show "integrable ?P ?f"
  1417       unfolding real_integrable_def
  1407       unfolding real_integrable_def
  1418       using fin neg by (auto simp: split_beta')
  1408       using fin neg by (auto simp: split_beta')
  1419     show "integrable ?P (\<lambda>x. - log b (?f x))"
  1409     show "integrable ?P (\<lambda>x. - log b (?f x))"
       
  1410       using Pz_nn Pxz_nn Pyz_nn Pxyz_nn
  1420       apply (subst integrable_real_density)
  1411       apply (subst integrable_real_density)
  1421       apply simp
  1412       apply simp
  1422       apply (auto intro!: distributed_real_AE[OF Pxyz]) []
  1413       apply simp
  1423       apply simp
  1414       apply simp
  1424       apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ I3])
  1415       apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ I3])
  1425       apply simp
  1416       apply simp
  1426       apply simp
  1417       apply simp
  1427       using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
  1418       using ae1 ae2 ae3 ae4 AE_space
  1428       apply eventually_elim
  1419       apply eventually_elim
  1429       apply (auto simp: log_divide_eq log_mult_eq zero_le_mult_iff zero_less_mult_iff zero_less_divide_iff field_simps)
  1420       apply (auto simp: log_divide_eq log_mult_eq zero_le_mult_iff zero_less_mult_iff
       
  1421                         zero_less_divide_iff field_simps space_pair_measure less_le)
  1430       done
  1422       done
  1431   qed (auto simp: b_gt_1 minus_log_convex)
  1423   qed (auto simp: b_gt_1 minus_log_convex)
  1432   also have "\<dots> = conditional_mutual_information b S T P X Y Z"
  1424   also have "\<dots> = conditional_mutual_information b S T P X Y Z"
  1433     unfolding \<open>?eq\<close>
  1425     unfolding \<open>?eq\<close>
       
  1426     using Pz_nn Pxz_nn Pyz_nn Pxyz_nn
  1434     apply (subst integral_real_density)
  1427     apply (subst integral_real_density)
  1435     apply simp
  1428     apply simp
  1436     apply (auto intro!: distributed_real_AE[OF Pxyz]) []
  1429     apply simp
  1437     apply simp
  1430     apply simp
  1438     apply (intro integral_cong_AE)
  1431     apply (intro integral_cong_AE)
  1439     using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
  1432     using ae1 ae2 ae3 ae4 AE_space
  1440     apply (auto simp: log_divide_eq zero_less_mult_iff zero_less_divide_iff field_simps)
  1433     apply (auto simp: log_divide_eq zero_less_mult_iff zero_less_divide_iff
       
  1434                       field_simps space_pair_measure less_le)
  1441     done
  1435     done
  1442   finally show ?nonneg
  1436   finally show ?nonneg
  1443     by simp
  1437     by simp
  1444 qed
  1438 qed
  1445 
  1439 
  1448   assumes Pyz: "simple_distributed M (\<lambda>x. (Y x, Z x)) Pyz"
  1442   assumes Pyz: "simple_distributed M (\<lambda>x. (Y x, Z x)) Pyz"
  1449   assumes Pxz: "simple_distributed M (\<lambda>x. (X x, Z x)) Pxz"
  1443   assumes Pxz: "simple_distributed M (\<lambda>x. (X x, Z x)) Pxz"
  1450   assumes Pxyz: "simple_distributed M (\<lambda>x. (X x, Y x, Z x)) Pxyz"
  1444   assumes Pxyz: "simple_distributed M (\<lambda>x. (X x, Y x, Z x)) Pxyz"
  1451   shows "\<I>(X ; Y | Z) =
  1445   shows "\<I>(X ; Y | Z) =
  1452    (\<Sum>(x, y, z)\<in>(\<lambda>x. (X x, Y x, Z x))`space M. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
  1446    (\<Sum>(x, y, z)\<in>(\<lambda>x. (X x, Y x, Z x))`space M. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
  1453 proof (subst conditional_mutual_information_generic_eq[OF _ _ _ _
  1447 proof (subst conditional_mutual_information_generic_eq[OF _ _ _ _ _
  1454     simple_distributed[OF Pz] simple_distributed_joint[OF Pyz] simple_distributed_joint[OF Pxz]
  1448     simple_distributed[OF Pz] _ simple_distributed_joint[OF Pyz] _ simple_distributed_joint[OF Pxz] _
  1455     simple_distributed_joint2[OF Pxyz]])
  1449     simple_distributed_joint2[OF Pxyz]])
  1456   note simple_distributed_joint2_finite[OF Pxyz, simp]
  1450   note simple_distributed_joint2_finite[OF Pxyz, simp]
  1457   show "sigma_finite_measure (count_space (X ` space M))"
  1451   show "sigma_finite_measure (count_space (X ` space M))"
  1458     by (simp add: sigma_finite_measure_count_space_finite)
  1452     by (simp add: sigma_finite_measure_count_space_finite)
  1459   show "sigma_finite_measure (count_space (Y ` space M))"
  1453   show "sigma_finite_measure (count_space (Y ` space M))"
  1469   have "(\<lambda>x. (X x, Z x)) \<in> measurable M (count_space (X ` space M) \<Otimes>\<^sub>M count_space (Z ` space M))"
  1463   have "(\<lambda>x. (X x, Z x)) \<in> measurable M (count_space (X ` space M) \<Otimes>\<^sub>M count_space (Z ` space M))"
  1470     using simple_distributed_joint[OF Pxz] by (rule distributed_measurable)
  1464     using simple_distributed_joint[OF Pxz] by (rule distributed_measurable)
  1471   from measurable_comp[OF this measurable_fst]
  1465   from measurable_comp[OF this measurable_fst]
  1472   have "random_variable (count_space (X ` space M)) X"
  1466   have "random_variable (count_space (X ` space M)) X"
  1473     by (simp add: comp_def)
  1467     by (simp add: comp_def)
  1474   then have "simple_function M X"    
  1468   then have "simple_function M X"
  1475     unfolding simple_function_def by (auto simp: measurable_count_space_eq2)
  1469     unfolding simple_function_def by (auto simp: measurable_count_space_eq2)
  1476   then have "simple_distributed M X ?Px"
  1470   then have "simple_distributed M X ?Px"
  1477     by (rule simple_distributedI) auto
  1471     by (rule simple_distributedI) (auto simp: measure_nonneg)
  1478   then show "distributed M (count_space (X ` space M)) X ?Px"
  1472   then show "distributed M (count_space (X ` space M)) X ?Px"
  1479     by (rule simple_distributed)
  1473     by (rule simple_distributed)
  1480 
  1474 
  1481   let ?f = "(\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x, Z x)) ` space M then Pxyz x else 0)"
  1475   let ?f = "(\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x, Z x)) ` space M then Pxyz x else 0)"
  1482   let ?g = "(\<lambda>x. if x \<in> (\<lambda>x. (Y x, Z x)) ` space M then Pyz x else 0)"
  1476   let ?g = "(\<lambda>x. if x \<in> (\<lambda>x. (Y x, Z x)) ` space M then Pyz x else 0)"
  1489   let ?j = "\<lambda>x y z. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z)))"
  1483   let ?j = "\<lambda>x y z. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z)))"
  1490   have "(\<lambda>(x, y, z). ?i x y z) = (\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x, Z x)) ` space M then ?j (fst x) (fst (snd x)) (snd (snd x)) else 0)"
  1484   have "(\<lambda>(x, y, z). ?i x y z) = (\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x, Z x)) ` space M then ?j (fst x) (fst (snd x)) (snd (snd x)) else 0)"
  1491     by (auto intro!: ext)
  1485     by (auto intro!: ext)
  1492   then show "(\<integral> (x, y, z). ?i x y z \<partial>?P) = (\<Sum>(x, y, z)\<in>(\<lambda>x. (X x, Y x, Z x)) ` space M. ?j x y z)"
  1486   then show "(\<integral> (x, y, z). ?i x y z \<partial>?P) = (\<Sum>(x, y, z)\<in>(\<lambda>x. (X x, Y x, Z x)) ` space M. ?j x y z)"
  1493     by (auto intro!: setsum.cong simp add: \<open>?P = ?C\<close> lebesgue_integral_count_space_finite simple_distributed_finite setsum.If_cases split_beta')
  1487     by (auto intro!: setsum.cong simp add: \<open>?P = ?C\<close> lebesgue_integral_count_space_finite simple_distributed_finite setsum.If_cases split_beta')
  1494 qed
  1488 qed (insert Pz Pyz Pxz Pxyz, auto intro: measure_nonneg)
  1495 
  1489 
  1496 lemma (in information_space) conditional_mutual_information_nonneg:
  1490 lemma (in information_space) conditional_mutual_information_nonneg:
  1497   assumes X: "simple_function M X" and Y: "simple_function M Y" and Z: "simple_function M Z"
  1491   assumes X: "simple_function M X" and Y: "simple_function M Y" and Z: "simple_function M Z"
  1498   shows "0 \<le> \<I>(X ; Y | Z)"
  1492   shows "0 \<le> \<I>(X ; Y | Z)"
  1499 proof -
  1493 proof -
  1500   have [simp]: "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) \<Otimes>\<^sub>M count_space (Z ` space M) =
  1494   have [simp]: "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) \<Otimes>\<^sub>M count_space (Z ` space M) =
  1501       count_space (X`space M \<times> Y`space M \<times> Z`space M)"
  1495       count_space (X`space M \<times> Y`space M \<times> Z`space M)"
  1502     by (simp add: pair_measure_count_space X Y Z simple_functionD)
  1496     by (simp add: pair_measure_count_space X Y Z simple_functionD)
  1503   note sf = sigma_finite_measure_count_space_finite[OF simple_functionD(1)]
  1497   note sf = sigma_finite_measure_count_space_finite[OF simple_functionD(1)]
  1504   note sd = simple_distributedI[OF _ refl]
  1498   note sd = simple_distributedI[OF _ _ refl]
  1505   note sp = simple_function_Pair
  1499   note sp = simple_function_Pair
  1506   show ?thesis
  1500   show ?thesis
  1507    apply (rule conditional_mutual_information_generic_nonneg[OF sf[OF X] sf[OF Y] sf[OF Z]])
  1501    apply (rule conditional_mutual_information_generic_nonneg[OF sf[OF X] sf[OF Y] sf[OF Z]])
  1508    apply (rule simple_distributed[OF sd[OF X]])
  1502    apply (rule simple_distributed[OF sd[OF X]])
       
  1503    apply simp
       
  1504    apply simp
  1509    apply (rule simple_distributed[OF sd[OF Z]])
  1505    apply (rule simple_distributed[OF sd[OF Z]])
       
  1506    apply simp
       
  1507    apply simp
  1510    apply (rule simple_distributed_joint[OF sd[OF sp[OF Y Z]]])
  1508    apply (rule simple_distributed_joint[OF sd[OF sp[OF Y Z]]])
       
  1509    apply simp
       
  1510    apply simp
  1511    apply (rule simple_distributed_joint[OF sd[OF sp[OF X Z]]])
  1511    apply (rule simple_distributed_joint[OF sd[OF sp[OF X Z]]])
       
  1512    apply simp
       
  1513    apply simp
  1512    apply (rule simple_distributed_joint2[OF sd[OF sp[OF X sp[OF Y Z]]]])
  1514    apply (rule simple_distributed_joint2[OF sd[OF sp[OF X sp[OF Y Z]]]])
       
  1515    apply simp
       
  1516    apply simp
  1513    apply (auto intro!: integrable_count_space simp: X Y Z simple_functionD)
  1517    apply (auto intro!: integrable_count_space simp: X Y Z simple_functionD)
  1514    done
  1518    done
  1515 qed
  1519 qed
  1516 
  1520 
  1517 subsection \<open>Conditional Entropy\<close>
  1521 subsection \<open>Conditional Entropy\<close>
  1518 
  1522 
  1519 definition (in prob_space)
  1523 definition (in prob_space)
  1520   "conditional_entropy b S T X Y = - (\<integral>(x, y). log b (real_of_ereal (RN_deriv (S \<Otimes>\<^sub>M T) (distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))) (x, y)) / 
  1524   "conditional_entropy b S T X Y = - (\<integral>(x, y). log b (enn2real (RN_deriv (S \<Otimes>\<^sub>M T) (distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))) (x, y)) /
  1521     real_of_ereal (RN_deriv T (distr M T Y) y)) \<partial>distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)))"
  1525     enn2real (RN_deriv T (distr M T Y) y)) \<partial>distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)))"
  1522 
  1526 
  1523 abbreviation (in information_space)
  1527 abbreviation (in information_space)
  1524   conditional_entropy_Pow ("\<H>'(_ | _')") where
  1528   conditional_entropy_Pow ("\<H>'(_ | _')") where
  1525   "\<H>(X | Y) \<equiv> conditional_entropy b (count_space (X`space M)) (count_space (Y`space M)) X Y"
  1529   "\<H>(X | Y) \<equiv> conditional_entropy b (count_space (X`space M)) (count_space (Y`space M)) X Y"
  1526 
  1530 
  1527 lemma (in information_space) conditional_entropy_generic_eq:
  1531 lemma (in information_space) conditional_entropy_generic_eq:
  1528   fixes Pxy :: "_ \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
  1532   fixes Pxy :: "_ \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
  1529   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  1533   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  1530   assumes Py[measurable]: "distributed M T Y Py"
  1534   assumes Py[measurable]: "distributed M T Y Py" and Py_nn[simp]: "\<And>x. x \<in> space T \<Longrightarrow> 0 \<le> Py x"
  1531   assumes Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
  1535   assumes Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
       
  1536     and Pxy_nn[simp]: "\<And>x y. x \<in> space S \<Longrightarrow> y \<in> space T \<Longrightarrow> 0 \<le> Pxy (x, y)"
  1532   shows "conditional_entropy b S T X Y = - (\<integral>(x, y). Pxy (x, y) * log b (Pxy (x, y) / Py y) \<partial>(S \<Otimes>\<^sub>M T))"
  1537   shows "conditional_entropy b S T X Y = - (\<integral>(x, y). Pxy (x, y) * log b (Pxy (x, y) / Py y) \<partial>(S \<Otimes>\<^sub>M T))"
  1533 proof -
  1538 proof -
  1534   interpret S: sigma_finite_measure S by fact
  1539   interpret S: sigma_finite_measure S by fact
  1535   interpret T: sigma_finite_measure T by fact
  1540   interpret T: sigma_finite_measure T by fact
  1536   interpret ST: pair_sigma_finite S T ..
  1541   interpret ST: pair_sigma_finite S T ..
  1537 
  1542 
  1538   have "AE x in density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Pxy x)). Pxy x = real_of_ereal (RN_deriv (S \<Otimes>\<^sub>M T) (distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))) x)"
  1543   have [measurable]: "Py \<in> T \<rightarrow>\<^sub>M borel"
       
  1544     using Py Py_nn by (intro distributed_real_measurable)
       
  1545   have measurable_Pxy[measurable]: "Pxy \<in> (S \<Otimes>\<^sub>M T) \<rightarrow>\<^sub>M borel"
       
  1546     using Pxy Pxy_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure)
       
  1547 
       
  1548   have "AE x in density (S \<Otimes>\<^sub>M T) (\<lambda>x. ennreal (Pxy x)). Pxy x = enn2real (RN_deriv (S \<Otimes>\<^sub>M T) (distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))) x)"
  1539     unfolding AE_density[OF distributed_borel_measurable, OF Pxy]
  1549     unfolding AE_density[OF distributed_borel_measurable, OF Pxy]
  1540     unfolding distributed_distr_eq_density[OF Pxy]
  1550     unfolding distributed_distr_eq_density[OF Pxy]
  1541     using distributed_RN_deriv[OF Pxy]
  1551     using distributed_RN_deriv[OF Pxy]
  1542     by auto
  1552     by auto
  1543   moreover
  1553   moreover
  1544   have "AE x in density (S \<Otimes>\<^sub>M T) (\<lambda>x. ereal (Pxy x)). Py (snd x) = real_of_ereal (RN_deriv T (distr M T Y) (snd x))"
  1554   have "AE x in density (S \<Otimes>\<^sub>M T) (\<lambda>x. ennreal (Pxy x)). Py (snd x) = enn2real (RN_deriv T (distr M T Y) (snd x))"
  1545     unfolding AE_density[OF distributed_borel_measurable, OF Pxy]
  1555     unfolding AE_density[OF distributed_borel_measurable, OF Pxy]
  1546     unfolding distributed_distr_eq_density[OF Py]
  1556     unfolding distributed_distr_eq_density[OF Py]
  1547     apply (rule ST.AE_pair_measure)
  1557     apply (rule ST.AE_pair_measure)
  1548     apply auto
  1558     apply auto
  1549     using distributed_RN_deriv[OF Py]
  1559     using distributed_RN_deriv[OF Py]
  1550     apply auto
  1560     apply auto
  1551     done    
  1561     done
  1552   ultimately
  1562   ultimately
  1553   have "conditional_entropy b S T X Y = - (\<integral>x. Pxy x * log b (Pxy x / Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))"
  1563   have "conditional_entropy b S T X Y = - (\<integral>x. Pxy x * log b (Pxy x / Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))"
  1554     unfolding conditional_entropy_def neg_equal_iff_equal
  1564     unfolding conditional_entropy_def neg_equal_iff_equal
  1555     apply (subst integral_real_density[symmetric])
  1565     apply (subst integral_real_density[symmetric])
  1556     apply (auto simp: distributed_real_AE[OF Pxy] distributed_distr_eq_density[OF Pxy]
  1566     apply (auto simp: distributed_distr_eq_density[OF Pxy] space_pair_measure
  1557                 intro!: integral_cong_AE)
  1567                 intro!: integral_cong_AE)
  1558     done
  1568     done
  1559   then show ?thesis by (simp add: split_beta')
  1569   then show ?thesis by (simp add: split_beta')
  1560 qed
  1570 qed
  1561 
  1571 
  1562 lemma (in information_space) conditional_entropy_eq_entropy:
  1572 lemma (in information_space) conditional_entropy_eq_entropy:
  1563   fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
  1573   fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real"
  1564   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  1574   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  1565   assumes Py[measurable]: "distributed M T Y Py"
  1575   assumes Py[measurable]: "distributed M T Y Py"
       
  1576     and Py_nn[simp]: "\<And>x. x \<in> space T \<Longrightarrow> 0 \<le> Py x"
  1566   assumes Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
  1577   assumes Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
       
  1578     and Pxy_nn[simp]: "\<And>x y. x \<in> space S \<Longrightarrow> y \<in> space T \<Longrightarrow> 0 \<le> Pxy (x, y)"
  1567   assumes I1: "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x))"
  1579   assumes I1: "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x))"
  1568   assumes I2: "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Py (snd x)))"
  1580   assumes I2: "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Py (snd x)))"
  1569   shows "conditional_entropy b S T X Y = entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) - entropy b T Y"
  1581   shows "conditional_entropy b S T X Y = entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) - entropy b T Y"
  1570 proof -
  1582 proof -
  1571   interpret S: sigma_finite_measure S by fact
  1583   interpret S: sigma_finite_measure S by fact
  1572   interpret T: sigma_finite_measure T by fact
  1584   interpret T: sigma_finite_measure T by fact
  1573   interpret ST: pair_sigma_finite S T ..
  1585   interpret ST: pair_sigma_finite S T ..
  1574 
  1586 
       
  1587   have [measurable]: "Py \<in> T \<rightarrow>\<^sub>M borel"
       
  1588     using Py Py_nn by (intro distributed_real_measurable)
       
  1589   have measurable_Pxy[measurable]: "Pxy \<in> (S \<Otimes>\<^sub>M T) \<rightarrow>\<^sub>M borel"
       
  1590     using Pxy Pxy_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure)
       
  1591 
  1575   have "entropy b T Y = - (\<integral>y. Py y * log b (Py y) \<partial>T)"
  1592   have "entropy b T Y = - (\<integral>y. Py y * log b (Py y) \<partial>T)"
  1576     by (rule entropy_distr[OF Py])
  1593     by (rule entropy_distr[OF Py Py_nn])
  1577   also have "\<dots> = - (\<integral>(x,y). Pxy (x,y) * log b (Py y) \<partial>(S \<Otimes>\<^sub>M T))"
  1594   also have "\<dots> = - (\<integral>(x,y). Pxy (x,y) * log b (Py y) \<partial>(S \<Otimes>\<^sub>M T))"
  1578     using b_gt_1 Py[THEN distributed_real_measurable]
  1595     using b_gt_1
  1579     by (subst distributed_transform_integral[OF Pxy Py, where T=snd]) (auto intro!: integral_cong)
  1596     by (subst distributed_transform_integral[OF Pxy _ Py, where T=snd])
       
  1597        (auto intro!: integral_cong simp: space_pair_measure)
  1580   finally have e_eq: "entropy b T Y = - (\<integral>(x,y). Pxy (x,y) * log b (Py y) \<partial>(S \<Otimes>\<^sub>M T))" .
  1598   finally have e_eq: "entropy b T Y = - (\<integral>(x,y). Pxy (x,y) * log b (Py y) \<partial>(S \<Otimes>\<^sub>M T))" .
  1581 
  1599 
       
  1600   have **: "\<And>x. x \<in> space (S \<Otimes>\<^sub>M T) \<Longrightarrow> 0 \<le> Pxy x"
       
  1601     by (auto simp: space_pair_measure)
       
  1602 
  1582   have ae2: "AE x in S \<Otimes>\<^sub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
  1603   have ae2: "AE x in S \<Otimes>\<^sub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
  1583     by (intro subdensity_real[of snd, OF _ Pxy Py]) (auto intro: measurable_Pair)
  1604     by (intro subdensity_real[of snd, OF _ Pxy Py])
       
  1605        (auto intro: measurable_Pair simp: space_pair_measure)
  1584   moreover have ae4: "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Py (snd x)"
  1606   moreover have ae4: "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Py (snd x)"
  1585     using Py by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'' dest: distributed_real_AE distributed_real_measurable)
  1607     using Py by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'')
  1586   moreover note ae5 = Pxy[THEN distributed_real_AE]
       
  1587   ultimately have "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Pxy x \<and> 0 \<le> Py (snd x) \<and>
  1608   ultimately have "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Pxy x \<and> 0 \<le> Py (snd x) \<and>
  1588     (Pxy x = 0 \<or> (Pxy x \<noteq> 0 \<longrightarrow> 0 < Pxy x \<and> 0 < Py (snd x)))"
  1609     (Pxy x = 0 \<or> (Pxy x \<noteq> 0 \<longrightarrow> 0 < Pxy x \<and> 0 < Py (snd x)))"
  1589     by eventually_elim auto
  1610     using AE_space by eventually_elim (auto simp: space_pair_measure less_le)
  1590   then have ae: "AE x in S \<Otimes>\<^sub>M T.
  1611   then have ae: "AE x in S \<Otimes>\<^sub>M T.
  1591      Pxy x * log b (Pxy x) - Pxy x * log b (Py (snd x)) = Pxy x * log b (Pxy x / Py (snd x))"
  1612      Pxy x * log b (Pxy x) - Pxy x * log b (Py (snd x)) = Pxy x * log b (Pxy x / Py (snd x))"
  1592     by eventually_elim (auto simp: log_simps field_simps b_gt_1)
  1613     by eventually_elim (auto simp: log_simps field_simps b_gt_1)
  1593   have "conditional_entropy b S T X Y = 
  1614   have "conditional_entropy b S T X Y =
  1594     - (\<integral>x. Pxy x * log b (Pxy x) - Pxy x * log b (Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))"
  1615     - (\<integral>x. Pxy x * log b (Pxy x) - Pxy x * log b (Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))"
  1595     unfolding conditional_entropy_generic_eq[OF S T Py Pxy] neg_equal_iff_equal
  1616     unfolding conditional_entropy_generic_eq[OF S T Py Py_nn Pxy Pxy_nn, simplified] neg_equal_iff_equal
  1596     apply (intro integral_cong_AE)
  1617     apply (intro integral_cong_AE)
  1597     using ae
  1618     using ae
  1598     apply auto
  1619     apply auto
  1599     done
  1620     done
  1600   also have "\<dots> = - (\<integral>x. Pxy x * log b (Pxy x) \<partial>(S \<Otimes>\<^sub>M T)) - - (\<integral>x.  Pxy x * log b (Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))"
  1621   also have "\<dots> = - (\<integral>x. Pxy x * log b (Pxy x) \<partial>(S \<Otimes>\<^sub>M T)) - - (\<integral>x.  Pxy x * log b (Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))"
  1601     by (simp add: integral_diff[OF I1 I2])
  1622     by (simp add: integral_diff[OF I1 I2])
  1602   finally show ?thesis 
  1623   finally show ?thesis
  1603     unfolding conditional_entropy_generic_eq[OF S T Py Pxy] entropy_distr[OF Pxy] e_eq
  1624     using conditional_entropy_generic_eq[OF S T Py Py_nn Pxy Pxy_nn, simplified]
       
  1625       entropy_distr[OF Pxy **, simplified] e_eq
  1604     by (simp add: split_beta')
  1626     by (simp add: split_beta')
  1605 qed
  1627 qed
  1606 
  1628 
  1607 lemma (in information_space) conditional_entropy_eq_entropy_simple:
  1629 lemma (in information_space) conditional_entropy_eq_entropy_simple:
  1608   assumes X: "simple_function M X" and Y: "simple_function M Y"
  1630   assumes X: "simple_function M X" and Y: "simple_function M Y"
  1610 proof -
  1632 proof -
  1611   have "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X`space M \<times> Y`space M)"
  1633   have "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X`space M \<times> Y`space M)"
  1612     (is "?P = ?C") using X Y by (simp add: simple_functionD pair_measure_count_space)
  1634     (is "?P = ?C") using X Y by (simp add: simple_functionD pair_measure_count_space)
  1613   show ?thesis
  1635   show ?thesis
  1614     by (rule conditional_entropy_eq_entropy sigma_finite_measure_count_space_finite
  1636     by (rule conditional_entropy_eq_entropy sigma_finite_measure_count_space_finite
  1615                  simple_functionD  X Y simple_distributed simple_distributedI[OF _ refl]
  1637              simple_functionD  X Y simple_distributed simple_distributedI[OF _ _ refl]
  1616                  simple_distributed_joint simple_function_Pair integrable_count_space)+
  1638              simple_distributed_joint simple_function_Pair integrable_count_space measure_nonneg)+
  1617        (auto simp: \<open>?P = ?C\<close> intro!: integrable_count_space simple_functionD  X Y)
  1639        (auto simp: \<open>?P = ?C\<close> measure_nonneg intro!: integrable_count_space simple_functionD  X Y)
  1618 qed
  1640 qed
  1619 
  1641 
  1620 lemma (in information_space) conditional_entropy_eq:
  1642 lemma (in information_space) conditional_entropy_eq:
  1621   assumes Y: "simple_distributed M Y Py"
  1643   assumes Y: "simple_distributed M Y Py"
  1622   assumes XY: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy"
  1644   assumes XY: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy"
  1623     shows "\<H>(X | Y) = - (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / Py y))"
  1645     shows "\<H>(X | Y) = - (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / Py y))"
  1624 proof (subst conditional_entropy_generic_eq[OF _ _
  1646 proof (subst conditional_entropy_generic_eq[OF _ _
  1625   simple_distributed[OF Y] simple_distributed_joint[OF XY]])
  1647   simple_distributed[OF Y] _ simple_distributed_joint[OF XY]])
  1626   have "finite ((\<lambda>x. (X x, Y x))`space M)"
  1648   have "finite ((\<lambda>x. (X x, Y x))`space M)"
  1627     using XY unfolding simple_distributed_def by auto
  1649     using XY unfolding simple_distributed_def by auto
  1628   from finite_imageI[OF this, of fst]
  1650   from finite_imageI[OF this, of fst]
  1629   have [simp]: "finite (X`space M)"
  1651   have [simp]: "finite (X`space M)"
  1630     by (simp add: image_comp comp_def)
  1652     by (simp add: image_comp comp_def)
  1641     (\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x * log b (Pxy x / Py (snd x)) else 0)"
  1663     (\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x * log b (Pxy x / Py (snd x)) else 0)"
  1642     by auto
  1664     by auto
  1643   from Y show "- (\<integral> (x, y). ?f (x, y) * log b (?f (x, y) / Py y) \<partial>?P) =
  1665   from Y show "- (\<integral> (x, y). ?f (x, y) * log b (?f (x, y) / Py y) \<partial>?P) =
  1644     - (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / Py y))"
  1666     - (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / Py y))"
  1645     by (auto intro!: setsum.cong simp add: \<open>?P = ?C\<close> lebesgue_integral_count_space_finite simple_distributed_finite eq setsum.If_cases split_beta')
  1667     by (auto intro!: setsum.cong simp add: \<open>?P = ?C\<close> lebesgue_integral_count_space_finite simple_distributed_finite eq setsum.If_cases split_beta')
  1646 qed
  1668 qed (insert Y XY, auto)
  1647 
  1669 
  1648 lemma (in information_space) conditional_mutual_information_eq_conditional_entropy:
  1670 lemma (in information_space) conditional_mutual_information_eq_conditional_entropy:
  1649   assumes X: "simple_function M X" and Y: "simple_function M Y"
  1671   assumes X: "simple_function M X" and Y: "simple_function M Y"
  1650   shows "\<I>(X ; X | Y) = \<H>(X | Y)"
  1672   shows "\<I>(X ; X | Y) = \<H>(X | Y)"
  1651 proof -
  1673 proof -
  1655   let ?M = "X`space M \<times> X`space M \<times> Y`space M"
  1677   let ?M = "X`space M \<times> X`space M \<times> Y`space M"
  1656 
  1678 
  1657   note XY = simple_function_Pair[OF X Y]
  1679   note XY = simple_function_Pair[OF X Y]
  1658   note XXY = simple_function_Pair[OF X XY]
  1680   note XXY = simple_function_Pair[OF X XY]
  1659   have Py: "simple_distributed M Y Py"
  1681   have Py: "simple_distributed M Y Py"
  1660     using Y by (rule simple_distributedI) (auto simp: Py_def)
  1682     using Y by (rule simple_distributedI) (auto simp: Py_def measure_nonneg)
  1661   have Pxy: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy"
  1683   have Pxy: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy"
  1662     using XY by (rule simple_distributedI) (auto simp: Pxy_def)
  1684     using XY by (rule simple_distributedI) (auto simp: Pxy_def measure_nonneg)
  1663   have Pxxy: "simple_distributed M (\<lambda>x. (X x, X x, Y x)) Pxxy"
  1685   have Pxxy: "simple_distributed M (\<lambda>x. (X x, X x, Y x)) Pxxy"
  1664     using XXY by (rule simple_distributedI) (auto simp: Pxxy_def)
  1686     using XXY by (rule simple_distributedI) (auto simp: Pxxy_def measure_nonneg)
  1665   have eq: "(\<lambda>x. (X x, X x, Y x)) ` space M = (\<lambda>(x, y). (x, x, y)) ` (\<lambda>x. (X x, Y x)) ` space M"
  1687   have eq: "(\<lambda>x. (X x, X x, Y x)) ` space M = (\<lambda>(x, y). (x, x, y)) ` (\<lambda>x. (X x, Y x)) ` space M"
  1666     by auto
  1688     by auto
  1667   have inj: "\<And>A. inj_on (\<lambda>(x, y). (x, x, y)) A"
  1689   have inj: "\<And>A. inj_on (\<lambda>(x, y). (x, x, y)) A"
  1668     by (auto simp: inj_on_def)
  1690     by (auto simp: inj_on_def)
  1669   have Pxxy_eq: "\<And>x y. Pxxy (x, x, y) = Pxy (x, y)"
  1691   have Pxxy_eq: "\<And>x y. Pxxy (x, x, y) = Pxy (x, y)"
  1670     by (auto simp: Pxxy_def Pxy_def intro!: arg_cong[where f=prob])
  1692     by (auto simp: Pxxy_def Pxy_def intro!: arg_cong[where f=prob])
  1671   have "AE x in count_space ((\<lambda>x. (X x, Y x))`space M). Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
  1693   have "AE x in count_space ((\<lambda>x. (X x, Y x))`space M). Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
  1672     by (intro subdensity_real[of snd, OF _ Pxy[THEN simple_distributed] Py[THEN simple_distributed]]) (auto intro: measurable_Pair)
  1694     using Py Pxy
       
  1695     by (intro subdensity_real[of snd, OF _ Pxy[THEN simple_distributed] Py[THEN simple_distributed]])
       
  1696        (auto intro: measurable_Pair simp: AE_count_space)
  1673   then show ?thesis
  1697   then show ?thesis
  1674     apply (subst conditional_mutual_information_eq[OF Py Pxy Pxy Pxxy])
  1698     apply (subst conditional_mutual_information_eq[OF Py Pxy Pxy Pxxy])
  1675     apply (subst conditional_entropy_eq[OF Py Pxy])
  1699     apply (subst conditional_entropy_eq[OF Py Pxy])
  1676     apply (auto intro!: setsum.cong simp: Pxxy_eq setsum_negf[symmetric] eq setsum.reindex[OF inj]
  1700     apply (auto intro!: setsum.cong simp: Pxxy_eq setsum_negf[symmetric] eq setsum.reindex[OF inj]
  1677                 log_simps zero_less_mult_iff zero_le_mult_iff field_simps mult_less_0_iff AE_count_space)
  1701                 log_simps zero_less_mult_iff zero_le_mult_iff field_simps mult_less_0_iff AE_count_space)
  1678     using Py[THEN simple_distributed, THEN distributed_real_AE] Pxy[THEN simple_distributed, THEN distributed_real_AE]
  1702     using Py[THEN simple_distributed] Pxy[THEN simple_distributed]
  1679   apply (auto simp add: not_le[symmetric] AE_count_space)
  1703     apply (auto simp add: not_le AE_count_space less_le antisym
       
  1704       simple_distributed_nonneg[OF Py] simple_distributed_nonneg[OF Pxy])
  1680     done
  1705     done
  1681 qed
  1706 qed
  1682 
  1707 
  1683 lemma (in information_space) conditional_entropy_nonneg:
  1708 lemma (in information_space) conditional_entropy_nonneg:
  1684   assumes X: "simple_function M X" and Y: "simple_function M Y" shows "0 \<le> \<H>(X | Y)"
  1709   assumes X: "simple_function M X" and Y: "simple_function M Y" shows "0 \<le> \<H>(X | Y)"
  1688 subsection \<open>Equalities\<close>
  1713 subsection \<open>Equalities\<close>
  1689 
  1714 
  1690 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy_distr:
  1715 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy_distr:
  1691   fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" and Pxy :: "('b \<times> 'c) \<Rightarrow> real"
  1716   fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" and Pxy :: "('b \<times> 'c) \<Rightarrow> real"
  1692   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  1717   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  1693   assumes Px[measurable]: "distributed M S X Px" and Py[measurable]: "distributed M T Y Py"
  1718   assumes Px[measurable]: "distributed M S X Px"
  1694   assumes Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
  1719     and Px_nn[simp]: "\<And>x. x \<in> space S \<Longrightarrow> 0 \<le> Px x"
       
  1720     and Py[measurable]: "distributed M T Y Py"
       
  1721     and Py_nn[simp]: "\<And>x. x \<in> space T \<Longrightarrow> 0 \<le> Py x"
       
  1722     and Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
       
  1723     and Pxy_nn[simp]: "\<And>x y. x \<in> space S \<Longrightarrow> y \<in> space T \<Longrightarrow> 0 \<le> Pxy (x, y)"
  1695   assumes Ix: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Px (fst x)))"
  1724   assumes Ix: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Px (fst x)))"
  1696   assumes Iy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Py (snd x)))"
  1725   assumes Iy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Py (snd x)))"
  1697   assumes Ixy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x))"
  1726   assumes Ixy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x))"
  1698   shows  "mutual_information b S T X Y = entropy b S X + entropy b T Y - entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))"
  1727   shows  "mutual_information b S T X Y = entropy b S X + entropy b T Y - entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))"
  1699 proof -
  1728 proof -
       
  1729   have [measurable]: "Px \<in> S \<rightarrow>\<^sub>M borel"
       
  1730     using Px Px_nn by (intro distributed_real_measurable)
       
  1731   have [measurable]: "Py \<in> T \<rightarrow>\<^sub>M borel"
       
  1732     using Py Py_nn by (intro distributed_real_measurable)
       
  1733   have measurable_Pxy[measurable]: "Pxy \<in> (S \<Otimes>\<^sub>M T) \<rightarrow>\<^sub>M borel"
       
  1734     using Pxy Pxy_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure)
       
  1735 
  1700   have X: "entropy b S X = - (\<integral>x. Pxy x * log b (Px (fst x)) \<partial>(S \<Otimes>\<^sub>M T))"
  1736   have X: "entropy b S X = - (\<integral>x. Pxy x * log b (Px (fst x)) \<partial>(S \<Otimes>\<^sub>M T))"
  1701     using b_gt_1 Px[THEN distributed_real_measurable]
  1737     using b_gt_1
  1702     apply (subst entropy_distr[OF Px])
  1738     apply (subst entropy_distr[OF Px Px_nn], simp)
  1703     apply (subst distributed_transform_integral[OF Pxy Px, where T=fst])
  1739     apply (subst distributed_transform_integral[OF Pxy _ Px, where T=fst])
  1704     apply (auto intro!: integral_cong)
  1740     apply (auto intro!: integral_cong simp: space_pair_measure)
  1705     done
  1741     done
  1706 
  1742 
  1707   have Y: "entropy b T Y = - (\<integral>x. Pxy x * log b (Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))"
  1743   have Y: "entropy b T Y = - (\<integral>x. Pxy x * log b (Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))"
  1708     using b_gt_1 Py[THEN distributed_real_measurable]
  1744     using b_gt_1
  1709     apply (subst entropy_distr[OF Py])
  1745     apply (subst entropy_distr[OF Py Py_nn], simp)
  1710     apply (subst distributed_transform_integral[OF Pxy Py, where T=snd])
  1746     apply (subst distributed_transform_integral[OF Pxy _ Py, where T=snd])
  1711     apply (auto intro!: integral_cong)
  1747     apply (auto intro!: integral_cong simp: space_pair_measure)
  1712     done
  1748     done
  1713 
  1749 
  1714   interpret S: sigma_finite_measure S by fact
  1750   interpret S: sigma_finite_measure S by fact
  1715   interpret T: sigma_finite_measure T by fact
  1751   interpret T: sigma_finite_measure T by fact
  1716   interpret ST: pair_sigma_finite S T ..
  1752   interpret ST: pair_sigma_finite S T ..
  1717   have ST: "sigma_finite_measure (S \<Otimes>\<^sub>M T)" ..
  1753   have ST: "sigma_finite_measure (S \<Otimes>\<^sub>M T)" ..
  1718 
  1754 
  1719   have XY: "entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) = - (\<integral>x. Pxy x * log b (Pxy x) \<partial>(S \<Otimes>\<^sub>M T))"
  1755   have XY: "entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) = - (\<integral>x. Pxy x * log b (Pxy x) \<partial>(S \<Otimes>\<^sub>M T))"
  1720     by (subst entropy_distr[OF Pxy]) (auto intro!: integral_cong)
  1756     by (subst entropy_distr[OF Pxy]) (auto intro!: integral_cong simp: space_pair_measure)
  1721   
  1757 
  1722   have "AE x in S \<Otimes>\<^sub>M T. Px (fst x) = 0 \<longrightarrow> Pxy x = 0"
  1758   have "AE x in S \<Otimes>\<^sub>M T. Px (fst x) = 0 \<longrightarrow> Pxy x = 0"
  1723     by (intro subdensity_real[of fst, OF _ Pxy Px]) (auto intro: measurable_Pair)
  1759     by (intro subdensity_real[of fst, OF _ Pxy Px]) (auto intro: measurable_Pair simp: space_pair_measure)
  1724   moreover have "AE x in S \<Otimes>\<^sub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
  1760   moreover have "AE x in S \<Otimes>\<^sub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0"
  1725     by (intro subdensity_real[of snd, OF _ Pxy Py]) (auto intro: measurable_Pair)
  1761     by (intro subdensity_real[of snd, OF _ Pxy Py]) (auto intro: measurable_Pair simp: space_pair_measure)
  1726   moreover have "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Px (fst x)"
  1762   moreover have "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Px (fst x)"
  1727     using Px by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_fst'' dest: distributed_real_AE distributed_real_measurable)
  1763     using Px by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_fst'')
  1728   moreover have "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Py (snd x)"
  1764   moreover have "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Py (snd x)"
  1729     using Py by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'' dest: distributed_real_AE distributed_real_measurable)
  1765     using Py by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'')
  1730   moreover note Pxy[THEN distributed_real_AE]
  1766   ultimately have "AE x in S \<Otimes>\<^sub>M T. Pxy x * log b (Pxy x) - Pxy x * log b (Px (fst x)) - Pxy x * log b (Py (snd x)) =
  1731   ultimately have "AE x in S \<Otimes>\<^sub>M T. Pxy x * log b (Pxy x) - Pxy x * log b (Px (fst x)) - Pxy x * log b (Py (snd x)) = 
       
  1732     Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))"
  1767     Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))"
  1733     (is "AE x in _. ?f x = ?g x")
  1768     (is "AE x in _. ?f x = ?g x")
       
  1769     using AE_space
  1734   proof eventually_elim
  1770   proof eventually_elim
  1735     case (elim x)
  1771     case (elim x)
  1736     show ?case
  1772     show ?case
  1737     proof cases
  1773     proof cases
  1738       assume "Pxy x \<noteq> 0"
  1774       assume "Pxy x \<noteq> 0"
  1739       with elim have "0 < Px (fst x)" "0 < Py (snd x)" "0 < Pxy x"
  1775       with elim have "0 < Px (fst x)" "0 < Py (snd x)" "0 < Pxy x"
  1740         by auto
  1776         by (auto simp: space_pair_measure less_le)
  1741       then show ?thesis
  1777       then show ?thesis
  1742         using b_gt_1 by (simp add: log_simps less_imp_le field_simps)
  1778         using b_gt_1 by (simp add: log_simps less_imp_le field_simps)
  1743     qed simp
  1779     qed simp
  1744   qed
  1780   qed
  1745 
  1781 
  1752     apply (simp add: field_simps)
  1788     apply (simp add: field_simps)
  1753     done
  1789     done
  1754   also have "\<dots> = integral\<^sup>L (S \<Otimes>\<^sub>M T) ?g"
  1790   also have "\<dots> = integral\<^sup>L (S \<Otimes>\<^sub>M T) ?g"
  1755     using \<open>AE x in _. ?f x = ?g x\<close> by (intro integral_cong_AE) auto
  1791     using \<open>AE x in _. ?f x = ?g x\<close> by (intro integral_cong_AE) auto
  1756   also have "\<dots> = mutual_information b S T X Y"
  1792   also have "\<dots> = mutual_information b S T X Y"
  1757     by (rule mutual_information_distr[OF S T Px Py Pxy, symmetric])
  1793     by (rule mutual_information_distr[OF S T Px _ Py _ Pxy _ , symmetric])
       
  1794        (auto simp: space_pair_measure)
  1758   finally show ?thesis ..
  1795   finally show ?thesis ..
  1759 qed
  1796 qed
  1760 
  1797 
  1761 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy':
  1798 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy':
  1762   fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" and Pxy :: "('b \<times> 'c) \<Rightarrow> real"
  1799   fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" and Pxy :: "('b \<times> 'c) \<Rightarrow> real"
  1763   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  1800   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  1764   assumes Px: "distributed M S X Px" and Py: "distributed M T Y Py"
  1801   assumes Px: "distributed M S X Px" "\<And>x. x \<in> space S \<Longrightarrow> 0 \<le> Px x"
       
  1802     and Py: "distributed M T Y Py" "\<And>x. x \<in> space T \<Longrightarrow> 0 \<le> Py x"
  1765   assumes Pxy: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
  1803   assumes Pxy: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
       
  1804     "\<And>x. x \<in> space (S \<Otimes>\<^sub>M T) \<Longrightarrow> 0 \<le> Pxy x"
  1766   assumes Ix: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Px (fst x)))"
  1805   assumes Ix: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Px (fst x)))"
  1767   assumes Iy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Py (snd x)))"
  1806   assumes Iy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Py (snd x)))"
  1768   assumes Ixy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x))"
  1807   assumes Ixy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x))"
  1769   shows  "mutual_information b S T X Y = entropy b S X - conditional_entropy b S T X Y"
  1808   shows  "mutual_information b S T X Y = entropy b S X - conditional_entropy b S T X Y"
  1770   using
  1809   using
  1771     mutual_information_eq_entropy_conditional_entropy_distr[OF S T Px Py Pxy Ix Iy Ixy]
  1810     mutual_information_eq_entropy_conditional_entropy_distr[OF S T Px Py Pxy Ix Iy Ixy]
  1772     conditional_entropy_eq_entropy[OF S T Py Pxy Ixy Iy]
  1811     conditional_entropy_eq_entropy[OF S T Py Pxy Ixy Iy]
  1773   by simp
  1812   by (simp add: space_pair_measure)
  1774 
  1813 
  1775 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy:
  1814 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy:
  1776   assumes sf_X: "simple_function M X" and sf_Y: "simple_function M Y"
  1815   assumes sf_X: "simple_function M X" and sf_Y: "simple_function M Y"
  1777   shows  "\<I>(X ; Y) = \<H>(X) - \<H>(X | Y)"
  1816   shows  "\<I>(X ; Y) = \<H>(X) - \<H>(X | Y)"
  1778 proof -
  1817 proof -
  1779   have X: "simple_distributed M X (\<lambda>x. measure M (X -` {x} \<inter> space M))"
  1818   have X: "simple_distributed M X (\<lambda>x. measure M (X -` {x} \<inter> space M))"
  1780     using sf_X by (rule simple_distributedI) auto
  1819     using sf_X by (rule simple_distributedI) (auto simp: measure_nonneg)
  1781   have Y: "simple_distributed M Y (\<lambda>x. measure M (Y -` {x} \<inter> space M))"
  1820   have Y: "simple_distributed M Y (\<lambda>x. measure M (Y -` {x} \<inter> space M))"
  1782     using sf_Y by (rule simple_distributedI) auto
  1821     using sf_Y by (rule simple_distributedI) (auto simp: measure_nonneg)
  1783   have sf_XY: "simple_function M (\<lambda>x. (X x, Y x))"
  1822   have sf_XY: "simple_function M (\<lambda>x. (X x, Y x))"
  1784     using sf_X sf_Y by (rule simple_function_Pair)
  1823     using sf_X sf_Y by (rule simple_function_Pair)
  1785   then have XY: "simple_distributed M (\<lambda>x. (X x, Y x)) (\<lambda>x. measure M ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M))"
  1824   then have XY: "simple_distributed M (\<lambda>x. (X x, Y x)) (\<lambda>x. measure M ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M))"
  1786     by (rule simple_distributedI) auto
  1825     by (rule simple_distributedI) (auto simp: measure_nonneg)
  1787   from simple_distributed_joint_finite[OF this, simp]
  1826   from simple_distributed_joint_finite[OF this, simp]
  1788   have eq: "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X ` space M \<times> Y ` space M)"
  1827   have eq: "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X ` space M \<times> Y ` space M)"
  1789     by (simp add: pair_measure_count_space)
  1828     by (simp add: pair_measure_count_space)
  1790 
  1829 
  1791   have "\<I>(X ; Y) = \<H>(X) + \<H>(Y) - entropy b (count_space (X`space M) \<Otimes>\<^sub>M count_space (Y`space M)) (\<lambda>x. (X x, Y x))"
  1830   have "\<I>(X ; Y) = \<H>(X) + \<H>(Y) - entropy b (count_space (X`space M) \<Otimes>\<^sub>M count_space (Y`space M)) (\<lambda>x. (X x, Y x))"
  1792     using sigma_finite_measure_count_space_finite sigma_finite_measure_count_space_finite simple_distributed[OF X] simple_distributed[OF Y] simple_distributed_joint[OF XY]
  1831     using sigma_finite_measure_count_space_finite
  1793     by (rule mutual_information_eq_entropy_conditional_entropy_distr) (auto simp: eq integrable_count_space)
  1832       sigma_finite_measure_count_space_finite
       
  1833       simple_distributed[OF X] measure_nonneg simple_distributed[OF Y] measure_nonneg simple_distributed_joint[OF XY]
       
  1834     by (rule mutual_information_eq_entropy_conditional_entropy_distr)
       
  1835        (auto simp: eq integrable_count_space measure_nonneg)
  1794   then show ?thesis
  1836   then show ?thesis
  1795     unfolding conditional_entropy_eq_entropy_simple[OF sf_X sf_Y] by simp
  1837     unfolding conditional_entropy_eq_entropy_simple[OF sf_X sf_Y] by simp
  1796 qed
  1838 qed
  1797 
  1839 
  1798 lemma (in information_space) mutual_information_nonneg_simple:
  1840 lemma (in information_space) mutual_information_nonneg_simple:
  1799   assumes sf_X: "simple_function M X" and sf_Y: "simple_function M Y"
  1841   assumes sf_X: "simple_function M X" and sf_Y: "simple_function M Y"
  1800   shows  "0 \<le> \<I>(X ; Y)"
  1842   shows  "0 \<le> \<I>(X ; Y)"
  1801 proof -
  1843 proof -
  1802   have X: "simple_distributed M X (\<lambda>x. measure M (X -` {x} \<inter> space M))"
  1844   have X: "simple_distributed M X (\<lambda>x. measure M (X -` {x} \<inter> space M))"
  1803     using sf_X by (rule simple_distributedI) auto
  1845     using sf_X by (rule simple_distributedI) (auto simp: measure_nonneg)
  1804   have Y: "simple_distributed M Y (\<lambda>x. measure M (Y -` {x} \<inter> space M))"
  1846   have Y: "simple_distributed M Y (\<lambda>x. measure M (Y -` {x} \<inter> space M))"
  1805     using sf_Y by (rule simple_distributedI) auto
  1847     using sf_Y by (rule simple_distributedI) (auto simp: measure_nonneg)
  1806 
  1848 
  1807   have sf_XY: "simple_function M (\<lambda>x. (X x, Y x))"
  1849   have sf_XY: "simple_function M (\<lambda>x. (X x, Y x))"
  1808     using sf_X sf_Y by (rule simple_function_Pair)
  1850     using sf_X sf_Y by (rule simple_function_Pair)
  1809   then have XY: "simple_distributed M (\<lambda>x. (X x, Y x)) (\<lambda>x. measure M ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M))"
  1851   then have XY: "simple_distributed M (\<lambda>x. (X x, Y x)) (\<lambda>x. measure M ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M))"
  1810     by (rule simple_distributedI) auto
  1852     by (rule simple_distributedI) (auto simp: measure_nonneg)
  1811 
  1853 
  1812   from simple_distributed_joint_finite[OF this, simp]
  1854   from simple_distributed_joint_finite[OF this, simp]
  1813   have eq: "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X ` space M \<times> Y ` space M)"
  1855   have eq: "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X ` space M \<times> Y ` space M)"
  1814     by (simp add: pair_measure_count_space)
  1856     by (simp add: pair_measure_count_space)
  1815 
  1857 
  1816   show ?thesis
  1858   show ?thesis
  1817     by (rule mutual_information_nonneg[OF _ _ simple_distributed[OF X] simple_distributed[OF Y] simple_distributed_joint[OF XY]])
  1859     by (rule mutual_information_nonneg[OF _ _ simple_distributed[OF X] _ simple_distributed[OF Y] _ simple_distributed_joint[OF XY]])
  1818        (simp_all add: eq integrable_count_space sigma_finite_measure_count_space_finite)
  1860        (simp_all add: eq integrable_count_space sigma_finite_measure_count_space_finite measure_nonneg)
  1819 qed
  1861 qed
  1820 
  1862 
  1821 lemma (in information_space) conditional_entropy_less_eq_entropy:
  1863 lemma (in information_space) conditional_entropy_less_eq_entropy:
  1822   assumes X: "simple_function M X" and Z: "simple_function M Z"
  1864   assumes X: "simple_function M X" and Z: "simple_function M Z"
  1823   shows "\<H>(X | Z) \<le> \<H>(X)"
  1865   shows "\<H>(X | Z) \<le> \<H>(X)"
  1825   have "0 \<le> \<I>(X ; Z)" using X Z by (rule mutual_information_nonneg_simple)
  1867   have "0 \<le> \<I>(X ; Z)" using X Z by (rule mutual_information_nonneg_simple)
  1826   also have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy[OF assms] .
  1868   also have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy[OF assms] .
  1827   finally show ?thesis by auto
  1869   finally show ?thesis by auto
  1828 qed
  1870 qed
  1829 
  1871 
  1830 lemma (in information_space) 
  1872 lemma (in information_space)
  1831   fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" and Pxy :: "('b \<times> 'c) \<Rightarrow> real"
  1873   fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" and Pxy :: "('b \<times> 'c) \<Rightarrow> real"
  1832   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  1874   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  1833   assumes Px: "finite_entropy S X Px" and Py: "finite_entropy T Y Py"
  1875   assumes Px: "finite_entropy S X Px" and Py: "finite_entropy T Y Py"
  1834   assumes Pxy: "finite_entropy (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
  1876   assumes Pxy: "finite_entropy (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy"
  1835   shows "conditional_entropy b S T X Y \<le> entropy b S X"
  1877   shows "conditional_entropy b S T X Y \<le> entropy b S X"
  1836 proof -
  1878 proof -
  1837 
  1879 
  1838   have "0 \<le> mutual_information b S T X Y" 
  1880   have "0 \<le> mutual_information b S T X Y"
  1839     by (rule mutual_information_nonneg') fact+
  1881     by (rule mutual_information_nonneg') fact+
  1840   also have "\<dots> = entropy b S X - conditional_entropy b S T X Y"
  1882   also have "\<dots> = entropy b S X - conditional_entropy b S T X Y"
  1841     apply (rule mutual_information_eq_entropy_conditional_entropy')
  1883     apply (rule mutual_information_eq_entropy_conditional_entropy')
  1842     using assms
  1884     using assms
  1843     by (auto intro!: finite_entropy_integrable finite_entropy_distributed
  1885     by (auto intro!: finite_entropy_integrable finite_entropy_distributed
  1844       finite_entropy_integrable_transform[OF Px]
  1886       finite_entropy_integrable_transform[OF Px]
  1845       finite_entropy_integrable_transform[OF Py])
  1887       finite_entropy_integrable_transform[OF Py]
       
  1888       intro: finite_entropy_nn)
  1846   finally show ?thesis by auto
  1889   finally show ?thesis by auto
  1847 qed
  1890 qed
  1848 
  1891 
  1849 lemma (in information_space) entropy_chain_rule:
  1892 lemma (in information_space) entropy_chain_rule:
  1850   assumes X: "simple_function M X" and Y: "simple_function M Y"
  1893   assumes X: "simple_function M X" and Y: "simple_function M Y"
  1851   shows  "\<H>(\<lambda>x. (X x, Y x)) = \<H>(X) + \<H>(Y|X)"
  1894   shows  "\<H>(\<lambda>x. (X x, Y x)) = \<H>(X) + \<H>(Y|X)"
  1852 proof -
  1895 proof -
  1853   note XY = simple_distributedI[OF simple_function_Pair[OF X Y] refl]
  1896   note XY = simple_distributedI[OF simple_function_Pair[OF X Y] measure_nonneg refl]
  1854   note YX = simple_distributedI[OF simple_function_Pair[OF Y X] refl]
  1897   note YX = simple_distributedI[OF simple_function_Pair[OF Y X] measure_nonneg refl]
  1855   note simple_distributed_joint_finite[OF this, simp]
  1898   note simple_distributed_joint_finite[OF this, simp]
  1856   let ?f = "\<lambda>x. prob ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M)"
  1899   let ?f = "\<lambda>x. prob ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M)"
  1857   let ?g = "\<lambda>x. prob ((\<lambda>x. (Y x, X x)) -` {x} \<inter> space M)"
  1900   let ?g = "\<lambda>x. prob ((\<lambda>x. (Y x, X x)) -` {x} \<inter> space M)"
  1858   let ?h = "\<lambda>x. if x \<in> (\<lambda>x. (Y x, X x)) ` space M then prob ((\<lambda>x. (Y x, X x)) -` {x} \<inter> space M) else 0"
  1901   let ?h = "\<lambda>x. if x \<in> (\<lambda>x. (Y x, X x)) ` space M then prob ((\<lambda>x. (Y x, X x)) -` {x} \<inter> space M) else 0"
  1859   have "\<H>(\<lambda>x. (X x, Y x)) = - (\<Sum>x\<in>(\<lambda>x. (X x, Y x)) ` space M. ?f x * log b (?f x))"
  1902   have "\<H>(\<lambda>x. (X x, Y x)) = - (\<Sum>x\<in>(\<lambda>x. (X x, Y x)) ` space M. ?f x * log b (?f x))"
  1863   also have "\<dots> = - (\<Sum>x\<in>(\<lambda>x. (Y x, X x)) ` space M. ?h x * log b (?h x))"
  1906   also have "\<dots> = - (\<Sum>x\<in>(\<lambda>x. (Y x, X x)) ` space M. ?h x * log b (?h x))"
  1864     by (auto intro!: setsum.cong)
  1907     by (auto intro!: setsum.cong)
  1865   also have "\<dots> = entropy b (count_space (Y ` space M) \<Otimes>\<^sub>M count_space (X ` space M)) (\<lambda>x. (Y x, X x))"
  1908   also have "\<dots> = entropy b (count_space (Y ` space M) \<Otimes>\<^sub>M count_space (X ` space M)) (\<lambda>x. (Y x, X x))"
  1866     by (subst entropy_distr[OF simple_distributed_joint[OF YX]])
  1909     by (subst entropy_distr[OF simple_distributed_joint[OF YX]])
  1867        (auto simp: pair_measure_count_space sigma_finite_measure_count_space_finite lebesgue_integral_count_space_finite
  1910        (auto simp: pair_measure_count_space sigma_finite_measure_count_space_finite lebesgue_integral_count_space_finite
  1868              cong del: setsum.cong  intro!: setsum.mono_neutral_left)
  1911              cong del: setsum.cong  intro!: setsum.mono_neutral_left measure_nonneg)
  1869   finally have "\<H>(\<lambda>x. (X x, Y x)) = entropy b (count_space (Y ` space M) \<Otimes>\<^sub>M count_space (X ` space M)) (\<lambda>x. (Y x, X x))" .
  1912   finally have "\<H>(\<lambda>x. (X x, Y x)) = entropy b (count_space (Y ` space M) \<Otimes>\<^sub>M count_space (X ` space M)) (\<lambda>x. (Y x, X x))" .
  1870   then show ?thesis
  1913   then show ?thesis
  1871     unfolding conditional_entropy_eq_entropy_simple[OF Y X] by simp
  1914     unfolding conditional_entropy_eq_entropy_simple[OF Y X] by simp
  1872 qed
  1915 qed
  1873 
  1916 
  1874 lemma (in information_space) entropy_partition:
  1917 lemma (in information_space) entropy_partition:
  1875   assumes X: "simple_function M X"
  1918   assumes X: "simple_function M X"
  1876   shows "\<H>(X) = \<H>(f \<circ> X) + \<H>(X|f \<circ> X)"
  1919   shows "\<H>(X) = \<H>(f \<circ> X) + \<H>(X|f \<circ> X)"
  1877 proof -
  1920 proof -
  1878   note fX = simple_function_compose[OF X, of f]  
  1921   note fX = simple_function_compose[OF X, of f]
  1879   have eq: "(\<lambda>x. ((f \<circ> X) x, X x)) ` space M = (\<lambda>x. (f x, x)) ` X ` space M" by auto
  1922   have eq: "(\<lambda>x. ((f \<circ> X) x, X x)) ` space M = (\<lambda>x. (f x, x)) ` X ` space M" by auto
  1880   have inj: "\<And>A. inj_on (\<lambda>x. (f x, x)) A"
  1923   have inj: "\<And>A. inj_on (\<lambda>x. (f x, x)) A"
  1881     by (auto simp: inj_on_def)
  1924     by (auto simp: inj_on_def)
  1882   show ?thesis
  1925   show ?thesis
  1883     apply (subst entropy_chain_rule[symmetric, OF fX X])
  1926     apply (subst entropy_chain_rule[symmetric, OF fX X])
  1884     apply (subst entropy_simple_distributed[OF simple_distributedI[OF simple_function_Pair[OF fX X] refl]])
  1927     apply (subst entropy_simple_distributed[OF simple_distributedI[OF simple_function_Pair[OF fX X] measure_nonneg refl]])
  1885     apply (subst entropy_simple_distributed[OF simple_distributedI[OF X refl]])
  1928     apply (subst entropy_simple_distributed[OF simple_distributedI[OF X measure_nonneg refl]])
  1886     unfolding eq
  1929     unfolding eq
  1887     apply (subst setsum.reindex[OF inj])
  1930     apply (subst setsum.reindex[OF inj])
  1888     apply (auto intro!: setsum.cong arg_cong[where f="\<lambda>A. prob A * log b (prob A)"])
  1931     apply (auto intro!: setsum.cong arg_cong[where f="\<lambda>A. prob A * log b (prob A)"])
  1889     done
  1932     done
  1890 qed
  1933 qed
  1906 next
  1949 next
  1907   have sf: "simple_function M (f \<circ> X)"
  1950   have sf: "simple_function M (f \<circ> X)"
  1908     using X by auto
  1951     using X by auto
  1909   have "\<H>(X) = \<H>(the_inv_into (X`space M) f \<circ> (f \<circ> X))"
  1952   have "\<H>(X) = \<H>(the_inv_into (X`space M) f \<circ> (f \<circ> X))"
  1910     unfolding o_assoc
  1953     unfolding o_assoc
  1911     apply (subst entropy_simple_distributed[OF simple_distributedI[OF X refl]])
  1954     apply (subst entropy_simple_distributed[OF simple_distributedI[OF X measure_nonneg refl]])
  1912     apply (subst entropy_simple_distributed[OF simple_distributedI[OF simple_function_compose[OF X]], where f="\<lambda>x. prob (X -` {x} \<inter> space M)"])
  1955     apply (subst entropy_simple_distributed[OF simple_distributedI[OF simple_function_compose[OF X]], where f="\<lambda>x. prob (X -` {x} \<inter> space M)"])
  1913     apply (auto intro!: setsum.cong arg_cong[where f=prob] image_eqI simp: the_inv_into_f_f[OF inj] comp_def)
  1956     apply (auto intro!: setsum.cong arg_cong[where f=prob] image_eqI simp: the_inv_into_f_f[OF inj] comp_def measure_nonneg)
  1914     done
  1957     done
  1915   also have "... \<le> \<H>(f \<circ> X)"
  1958   also have "... \<le> \<H>(f \<circ> X)"
  1916     using entropy_data_processing[OF sf] .
  1959     using entropy_data_processing[OF sf] .
  1917   finally show "\<H>(X) \<le> \<H>(f \<circ> X)" .
  1960   finally show "\<H>(X) \<le> \<H>(f \<circ> X)" .
  1918 qed
  1961 qed