src/HOL/Probability/Information.thy
author wenzelm
Wed, 29 Dec 2010 17:34:41 +0100
changeset 41413 64cd30d6b0b8
parent 41095 c335d880ff82
child 41661 baf1964bc468
permissions -rw-r--r--
explicit file specifications -- avoid secondary load path;

theory Information
imports
  Probability_Space
  "~~/src/HOL/Library/Convex"
  Lebesgue_Measure
begin

lemma log_le: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log a x \<le> log a y"
  by (subst log_le_cancel_iff) auto

lemma log_less: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x < y \<Longrightarrow> log a x < log a y"
  by (subst log_less_cancel_iff) auto

lemma setsum_cartesian_product':
  "(\<Sum>x\<in>A \<times> B. f x) = (\<Sum>x\<in>A. setsum (\<lambda>y. f (x, y)) B)"
  unfolding setsum_cartesian_product by simp

section "Convex theory"

lemma log_setsum:
  assumes "finite s" "s \<noteq> {}"
  assumes "b > 1"
  assumes "(\<Sum> i \<in> s. a i) = 1"
  assumes "\<And> i. i \<in> s \<Longrightarrow> a i \<ge> 0"
  assumes "\<And> i. i \<in> s \<Longrightarrow> y i \<in> {0 <..}"
  shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
proof -
  have "convex_on {0 <..} (\<lambda> x. - log b x)"
    by (rule minus_log_convex[OF `b > 1`])
  hence "- log b (\<Sum> i \<in> s. a i * y i) \<le> (\<Sum> i \<in> s. a i * - log b (y i))"
    using convex_on_setsum[of _ _ "\<lambda> x. - log b x"] assms pos_is_convex by fastsimp
  thus ?thesis by (auto simp add:setsum_negf le_imp_neg_le)
qed

lemma log_setsum':
  assumes "finite s" "s \<noteq> {}"
  assumes "b > 1"
  assumes "(\<Sum> i \<in> s. a i) = 1"
  assumes pos: "\<And> i. i \<in> s \<Longrightarrow> 0 \<le> a i"
          "\<And> i. \<lbrakk> i \<in> s ; 0 < a i \<rbrakk> \<Longrightarrow> 0 < y i"
  shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
proof -
  have "\<And>y. (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) = (\<Sum> i \<in> s. a i * y i)"
    using assms by (auto intro!: setsum_mono_zero_cong_left)
  moreover have "log b (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) \<ge> (\<Sum> i \<in> s - {i. a i = 0}. a i * log b (y i))"
  proof (rule log_setsum)
    have "setsum a (s - {i. a i = 0}) = setsum a s"
      using assms(1) by (rule setsum_mono_zero_cong_left) auto
    thus sum_1: "setsum a (s - {i. a i = 0}) = 1"
      "finite (s - {i. a i = 0})" using assms by simp_all

    show "s - {i. a i = 0} \<noteq> {}"
    proof
      assume *: "s - {i. a i = 0} = {}"
      hence "setsum a (s - {i. a i = 0}) = 0" by (simp add: * setsum_empty)
      with sum_1 show False by simp
    qed

    fix i assume "i \<in> s - {i. a i = 0}"
    hence "i \<in> s" "a i \<noteq> 0" by simp_all
    thus "0 \<le> a i" "y i \<in> {0<..}" using pos[of i] by auto
  qed fact+
  ultimately show ?thesis by simp
qed

lemma log_setsum_divide:
  assumes "finite S" and "S \<noteq> {}" and "1 < b"
  assumes "(\<Sum>x\<in>S. g x) = 1"
  assumes pos: "\<And>x. x \<in> S \<Longrightarrow> g x \<ge> 0" "\<And>x. x \<in> S \<Longrightarrow> f x \<ge> 0"
  assumes g_pos: "\<And>x. \<lbrakk> x \<in> S ; 0 < g x \<rbrakk> \<Longrightarrow> 0 < f x"
  shows "- (\<Sum>x\<in>S. g x * log b (g x / f x)) \<le> log b (\<Sum>x\<in>S. f x)"
proof -
  have log_mono: "\<And>x y. 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log b x \<le> log b y"
    using `1 < b` by (subst log_le_cancel_iff) auto

  have "- (\<Sum>x\<in>S. g x * log b (g x / f x)) = (\<Sum>x\<in>S. g x * log b (f x / g x))"
  proof (unfold setsum_negf[symmetric], rule setsum_cong)
    fix x assume x: "x \<in> S"
    show "- (g x * log b (g x / f x)) = g x * log b (f x / g x)"
    proof (cases "g x = 0")
      case False
      with pos[OF x] g_pos[OF x] have "0 < f x" "0 < g x" by simp_all
      thus ?thesis using `1 < b` by (simp add: log_divide field_simps)
    qed simp
  qed rule
  also have "... \<le> log b (\<Sum>x\<in>S. g x * (f x / g x))"
  proof (rule log_setsum')
    fix x assume x: "x \<in> S" "0 < g x"
    with g_pos[OF x] show "0 < f x / g x" by (safe intro!: divide_pos_pos)
  qed fact+
  also have "... = log b (\<Sum>x\<in>S - {x. g x = 0}. f x)" using `finite S`
    by (auto intro!: setsum_mono_zero_cong_right arg_cong[where f="log b"]
        split: split_if_asm)
  also have "... \<le> log b (\<Sum>x\<in>S. f x)"
  proof (rule log_mono)
    have "0 = (\<Sum>x\<in>S - {x. g x = 0}. 0)" by simp
    also have "... < (\<Sum>x\<in>S - {x. g x = 0}. f x)" (is "_ < ?sum")
    proof (rule setsum_strict_mono)
      show "finite (S - {x. g x = 0})" using `finite S` by simp
      show "S - {x. g x = 0} \<noteq> {}"
      proof
        assume "S - {x. g x = 0} = {}"
        hence "(\<Sum>x\<in>S. g x) = 0" by (subst setsum_0') auto
        with `(\<Sum>x\<in>S. g x) = 1` show False by simp
      qed
      fix x assume "x \<in> S - {x. g x = 0}"
      thus "0 < f x" using g_pos[of x] pos(1)[of x] by auto
    qed
    finally show "0 < ?sum" .
    show "(\<Sum>x\<in>S - {x. g x = 0}. f x) \<le> (\<Sum>x\<in>S. f x)"
      using `finite S` pos by (auto intro!: setsum_mono2)
  qed
  finally show ?thesis .
qed

lemma split_pairs:
  "((A, B) = X) \<longleftrightarrow> (fst X = A \<and> snd X = B)" and
  "(X = (A, B)) \<longleftrightarrow> (fst X = A \<and> snd X = B)" by auto

section "Information theory"

locale information_space = prob_space +
  fixes b :: real assumes b_gt_1: "1 < b"

context information_space
begin

text {* Introduce some simplification rules for logarithm of base @{term b}. *}

lemma log_neg_const:
  assumes "x \<le> 0"
  shows "log b x = log b 0"
proof -
  { fix u :: real
    have "x \<le> 0" by fact
    also have "0 < exp u"
      using exp_gt_zero .
    finally have "exp u \<noteq> x"
      by auto }
  then show "log b x = log b 0"
    by (simp add: log_def ln_def)
qed

lemma log_mult_eq:
  "log b (A * B) = (if 0 < A * B then log b \<bar>A\<bar> + log b \<bar>B\<bar> else log b 0)"
  using log_mult[of b "\<bar>A\<bar>" "\<bar>B\<bar>"] b_gt_1 log_neg_const[of "A * B"]
  by (auto simp: zero_less_mult_iff mult_le_0_iff)

lemma log_inverse_eq:
  "log b (inverse B) = (if 0 < B then - log b B else log b 0)"
  using log_inverse[of b B] log_neg_const[of "inverse B"] b_gt_1 by simp

lemma log_divide_eq:
  "log b (A / B) = (if 0 < A * B then log b \<bar>A\<bar> - log b \<bar>B\<bar> else log b 0)"
  unfolding divide_inverse log_mult_eq log_inverse_eq abs_inverse
  by (auto simp: zero_less_mult_iff mult_le_0_iff)

lemmas log_simps = log_mult_eq log_inverse_eq log_divide_eq

end

subsection "Kullback$-$Leibler divergence"

text {* The Kullback$-$Leibler divergence is also known as relative entropy or
Kullback$-$Leibler distance. *}

definition
  "KL_divergence b M \<mu> \<nu> =
    measure_space.integral M \<mu> (\<lambda>x. log b (real (sigma_finite_measure.RN_deriv M \<nu> \<mu> x)))"

lemma (in sigma_finite_measure) KL_divergence_cong:
  assumes "measure_space M \<nu>"
  and cong: "\<And>A. A \<in> sets M \<Longrightarrow> \<mu>' A = \<mu> A" "\<And>A. A \<in> sets M \<Longrightarrow> \<nu>' A = \<nu> A"
  shows "KL_divergence b M \<nu>' \<mu>' = KL_divergence b M \<nu> \<mu>"
proof -
  interpret \<nu>: measure_space M \<nu> by fact
  show ?thesis
    unfolding KL_divergence_def
    using RN_deriv_cong[OF cong, of "\<lambda>A. A"]
    by (simp add: cong \<nu>.integral_cong_measure[OF cong(2)])
qed

lemma (in sigma_finite_measure) KL_divergence_vimage:
  assumes f: "bij_betw f S (space M)"
  assumes \<nu>: "measure_space M \<nu>" "absolutely_continuous \<nu>"
  shows "KL_divergence b (vimage_algebra S f) (\<lambda>A. \<nu> (f ` A)) (\<lambda>A. \<mu> (f ` A)) = KL_divergence b M \<nu> \<mu>"
    (is "KL_divergence b ?M ?\<nu> ?\<mu> = _")
proof -
  interpret \<nu>: measure_space M \<nu> by fact
  interpret v: measure_space ?M ?\<nu>
    using f by (rule \<nu>.measure_space_isomorphic)

  let ?RN = "sigma_finite_measure.RN_deriv ?M ?\<mu> ?\<nu>"
  from RN_deriv_vimage[OF f[THEN bij_inv_the_inv_into] \<nu>]
  have *: "\<nu>.almost_everywhere (\<lambda>x. ?RN (the_inv_into S f x) = RN_deriv \<nu> x)"
    by (rule absolutely_continuous_AE[OF \<nu>])

  show ?thesis
    unfolding KL_divergence_def \<nu>.integral_vimage_inv[OF f]
    apply (rule \<nu>.integral_cong_AE)
    apply (rule \<nu>.AE_mp[OF *])
    apply (rule \<nu>.AE_cong)
    apply simp
    done
qed

lemma (in finite_measure_space) KL_divergence_eq_finite:
  assumes v: "finite_measure_space M \<nu>"
  assumes ac: "absolutely_continuous \<nu>"
  shows "KL_divergence b M \<nu> \<mu> = (\<Sum>x\<in>space M. real (\<nu> {x}) * log b (real (\<nu> {x}) / real (\<mu> {x})))" (is "_ = ?sum")
proof (simp add: KL_divergence_def finite_measure_space.integral_finite_singleton[OF v])
  interpret v: finite_measure_space M \<nu> by fact
  have ms: "measure_space M \<nu>" by fact
  show "(\<Sum>x \<in> space M. log b (real (RN_deriv \<nu> x)) * real (\<nu> {x})) = ?sum"
    using RN_deriv_finite_measure[OF ms ac]
    by (auto intro!: setsum_cong simp: field_simps real_of_pextreal_mult[symmetric])
qed

lemma (in finite_prob_space) KL_divergence_positive_finite:
  assumes v: "finite_prob_space M \<nu>"
  assumes ac: "absolutely_continuous \<nu>"
  and "1 < b"
  shows "0 \<le> KL_divergence b M \<nu> \<mu>"
proof -
  interpret v: finite_prob_space M \<nu> using v .
  have ms: "finite_measure_space M \<nu>" by default

  have "- (KL_divergence b M \<nu> \<mu>) \<le> log b (\<Sum>x\<in>space M. real (\<mu> {x}))"
  proof (subst KL_divergence_eq_finite[OF ms ac], safe intro!: log_setsum_divide not_empty)
    show "finite (space M)" using finite_space by simp
    show "1 < b" by fact
    show "(\<Sum>x\<in>space M. real (\<nu> {x})) = 1" using v.finite_sum_over_space_eq_1 by simp

    fix x assume "x \<in> space M"
    then have x: "{x} \<in> sets M" unfolding sets_eq_Pow by auto
    { assume "0 < real (\<nu> {x})"
      then have "\<nu> {x} \<noteq> 0" by auto
      then have "\<mu> {x} \<noteq> 0"
        using ac[unfolded absolutely_continuous_def, THEN bspec, of "{x}"] x by auto
      thus "0 < prob {x}" using finite_measure[of "{x}"] x by auto }
  qed auto
  thus "0 \<le> KL_divergence b M \<nu> \<mu>" using finite_sum_over_space_eq_1 by simp
qed

subsection {* Mutual Information *}

definition (in prob_space)
  "mutual_information b S T X Y =
    KL_divergence b (sigma (pair_algebra S T))
      (joint_distribution X Y)
      (pair_sigma_finite.pair_measure S (distribution X) T (distribution Y))"

definition (in prob_space)
  "entropy b s X = mutual_information b s s X X"

abbreviation (in information_space)
  mutual_information_Pow ("\<I>'(_ ; _')") where
  "\<I>(X ; Y) \<equiv> mutual_information b
    \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
    \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"

lemma (in information_space) mutual_information_commute_generic:
  assumes X: "random_variable S X" and Y: "random_variable T Y"
  assumes ac: "measure_space.absolutely_continuous (sigma (pair_algebra S T))
   (pair_sigma_finite.pair_measure S (distribution X) T (distribution Y)) (joint_distribution X Y)"
  shows "mutual_information b S T X Y = mutual_information b T S Y X"
proof -
  interpret P: prob_space "sigma (pair_algebra S T)" "joint_distribution X Y"
    using random_variable_pairI[OF X Y] by (rule distribution_prob_space)
  interpret Q: prob_space "sigma (pair_algebra T S)" "joint_distribution Y X"
    using random_variable_pairI[OF Y X] by (rule distribution_prob_space)
  interpret X: prob_space S "distribution X" using X by (rule distribution_prob_space)
  interpret Y: prob_space T "distribution Y" using Y by (rule distribution_prob_space)
  interpret ST: pair_sigma_finite S "distribution X" T "distribution Y" by default
  interpret TS: pair_sigma_finite T "distribution Y" S "distribution X" by default

  have ST: "measure_space (sigma (pair_algebra S T)) (joint_distribution X Y)" by default
  have TS: "measure_space (sigma (pair_algebra T S)) (joint_distribution Y X)" by default

  have bij_ST: "bij_betw (\<lambda>(x, y). (y, x)) (space (sigma (pair_algebra S T))) (space (sigma (pair_algebra T S)))"
    by (auto intro!: inj_onI simp: space_pair_algebra bij_betw_def)
  have bij_TS: "bij_betw (\<lambda>(x, y). (y, x)) (space (sigma (pair_algebra T S))) (space (sigma (pair_algebra S T)))"
    by (auto intro!: inj_onI simp: space_pair_algebra bij_betw_def)

  { fix A
    have "joint_distribution X Y ((\<lambda>(x, y). (y, x)) ` A) = joint_distribution Y X A"
      unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>]) }
  note jd_commute = this

  { fix A assume A: "A \<in> sets (sigma (pair_algebra T S))"
    have *: "\<And>x y. indicator ((\<lambda>(x, y). (y, x)) ` A) (x, y) = (indicator A (y, x) :: pextreal)"
      unfolding indicator_def by auto
    have "ST.pair_measure ((\<lambda>(x, y). (y, x)) ` A) = TS.pair_measure A"
      unfolding ST.pair_measure_def TS.pair_measure_def
      using A by (auto simp add: TS.Fubini[symmetric] *) }
  note pair_measure_commute = this

  show ?thesis
    unfolding mutual_information_def
    unfolding ST.KL_divergence_vimage[OF bij_TS ST ac, symmetric]
    unfolding space_sigma space_pair_algebra jd_commute
    unfolding ST.pair_sigma_algebra_swap[symmetric]
    by (simp cong: TS.KL_divergence_cong[OF TS] add: pair_measure_commute)
qed

lemma (in prob_space) finite_variables_absolutely_continuous:
  assumes X: "finite_random_variable S X" and Y: "finite_random_variable T Y"
  shows "measure_space.absolutely_continuous (sigma (pair_algebra S T))
   (pair_sigma_finite.pair_measure S (distribution X) T (distribution Y)) (joint_distribution X Y)"
proof -
  interpret X: finite_prob_space S "distribution X" using X by (rule distribution_finite_prob_space)
  interpret Y: finite_prob_space T "distribution Y" using Y by (rule distribution_finite_prob_space)
  interpret XY: pair_finite_prob_space S "distribution X" T "distribution Y" by default
  interpret P: finite_prob_space XY.P "joint_distribution X Y"
    using assms by (intro joint_distribution_finite_prob_space)
  show "XY.absolutely_continuous (joint_distribution X Y)"
  proof (rule XY.absolutely_continuousI)
    show "finite_measure_space XY.P (joint_distribution X Y)" by default
    fix x assume "x \<in> space XY.P" and "XY.pair_measure {x} = 0"
    then obtain a b where "(a, b) = x" and "a \<in> space S" "b \<in> space T"
      and distr: "distribution X {a} * distribution Y {b} = 0"
      by (cases x) (auto simp: pair_algebra_def)
    with assms[THEN finite_random_variableD]
      joint_distribution_Times_le_fst[of S X T Y "{a}" "{b}"]
      joint_distribution_Times_le_snd[of S X T Y "{a}" "{b}"]
    have "joint_distribution X Y {x} \<le> distribution Y {b}"
         "joint_distribution X Y {x} \<le> distribution X {a}"
      by auto
    with distr show "joint_distribution X Y {x} = 0" by auto
  qed
qed

lemma (in information_space) mutual_information_commute:
  assumes X: "finite_random_variable S X" and Y: "finite_random_variable T Y"
  shows "mutual_information b S T X Y = mutual_information b T S Y X"
  by (intro finite_random_variableD X Y mutual_information_commute_generic finite_variables_absolutely_continuous)

lemma (in information_space) mutual_information_commute_simple:
  assumes X: "simple_function X" and Y: "simple_function Y"
  shows "\<I>(X;Y) = \<I>(Y;X)"
  by (intro X Y simple_function_imp_finite_random_variable mutual_information_commute)

lemma (in information_space)
  assumes MX: "finite_random_variable MX X"
  assumes MY: "finite_random_variable MY Y"
  shows mutual_information_generic_eq:
    "mutual_information b MX MY X Y = (\<Sum> (x,y) \<in> space MX \<times> space MY.
      real (joint_distribution X Y {(x,y)}) *
      log b (real (joint_distribution X Y {(x,y)}) /
      (real (distribution X {x}) * real (distribution Y {y}))))"
    (is ?sum)
  and mutual_information_positive_generic:
     "0 \<le> mutual_information b MX MY X Y" (is ?positive)
proof -
  interpret X: finite_prob_space MX "distribution X" using MX by (rule distribution_finite_prob_space)
  interpret Y: finite_prob_space MY "distribution Y" using MY by (rule distribution_finite_prob_space)
  interpret XY: pair_finite_prob_space MX "distribution X" MY "distribution Y" by default
  interpret P: finite_prob_space XY.P "joint_distribution X Y"
    using assms by (intro joint_distribution_finite_prob_space)

  have P_ms: "finite_measure_space XY.P (joint_distribution X Y)" by default
  have P_ps: "finite_prob_space XY.P (joint_distribution X Y)" by default

  show ?sum
    unfolding Let_def mutual_information_def
    by (subst XY.KL_divergence_eq_finite[OF P_ms finite_variables_absolutely_continuous[OF MX MY]])
       (auto simp add: pair_algebra_def setsum_cartesian_product' real_of_pextreal_mult[symmetric])

  show ?positive
    using XY.KL_divergence_positive_finite[OF P_ps finite_variables_absolutely_continuous[OF MX MY] b_gt_1]
    unfolding mutual_information_def .
qed

lemma (in information_space) mutual_information_eq:
  assumes "simple_function X" "simple_function Y"
  shows "\<I>(X;Y) = (\<Sum> (x,y) \<in> X ` space M \<times> Y ` space M.
    real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) * log b (real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) /
                                                   (real (distribution X {x}) * real (distribution Y {y}))))"
  using assms by (simp add: mutual_information_generic_eq)

lemma (in information_space) mutual_information_generic_cong:
  assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
  assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
  shows "mutual_information b MX MY X Y = mutual_information b MX MY X' Y'"
  unfolding mutual_information_def using X Y
  by (simp cong: distribution_cong)

lemma (in information_space) mutual_information_cong:
  assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
  assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
  shows "\<I>(X; Y) = \<I>(X'; Y')"
  unfolding mutual_information_def using X Y
  by (simp cong: distribution_cong image_cong)

lemma (in information_space) mutual_information_positive:
  assumes "simple_function X" "simple_function Y"
  shows "0 \<le> \<I>(X;Y)"
  using assms by (simp add: mutual_information_positive_generic)

subsection {* Entropy *}

abbreviation (in information_space)
  entropy_Pow ("\<H>'(_')") where
  "\<H>(X) \<equiv> entropy b \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr> X"

lemma (in information_space) entropy_generic_eq:
  assumes MX: "finite_random_variable MX X"
  shows "entropy b MX X = -(\<Sum> x \<in> space MX. real (distribution X {x}) * log b (real (distribution X {x})))"
proof -
  interpret MX: finite_prob_space MX "distribution X" using MX by (rule distribution_finite_prob_space)
  let "?X x" = "real (distribution X {x})"
  let "?XX x y" = "real (joint_distribution X X {(x, y)})"
  { fix x y
    have "(\<lambda>x. (X x, X x)) -` {(x, y)} = (if x = y then X -` {x} else {})" by auto
    then have "?XX x y * log b (?XX x y / (?X x * ?X y)) =
        (if x = y then - ?X y * log b (?X y) else 0)"
      unfolding distribution_def by (auto simp: log_simps zero_less_mult_iff) }
  note remove_XX = this
  show ?thesis
    unfolding entropy_def mutual_information_generic_eq[OF MX MX]
    unfolding setsum_cartesian_product[symmetric] setsum_negf[symmetric] remove_XX
    by (auto simp: setsum_cases MX.finite_space)
qed

lemma (in information_space) entropy_eq:
  assumes "simple_function X"
  shows "\<H>(X) = -(\<Sum> x \<in> X ` space M. real (distribution X {x}) * log b (real (distribution X {x})))"
  using assms by (simp add: entropy_generic_eq)

lemma (in information_space) entropy_positive:
  "simple_function X \<Longrightarrow> 0 \<le> \<H>(X)"
  unfolding entropy_def by (simp add: mutual_information_positive)

lemma (in information_space) entropy_certainty_eq_0:
  assumes "simple_function X" and "x \<in> X ` space M" and "distribution X {x} = 1"
  shows "\<H>(X) = 0"
proof -
  interpret X: finite_prob_space "\<lparr> space = X ` space M, sets = Pow (X ` space M) \<rparr>" "distribution X"
    using simple_function_imp_finite_random_variable[OF `simple_function X`]
    by (rule distribution_finite_prob_space)
  have "distribution X (X ` space M - {x}) = distribution X (X ` space M) - distribution X {x}"
    using X.measure_compl[of "{x}"] assms by auto
  also have "\<dots> = 0" using X.prob_space assms by auto
  finally have X0: "distribution X (X ` space M - {x}) = 0" by auto
  { fix y assume asm: "y \<noteq> x" "y \<in> X ` space M"
    hence "{y} \<subseteq> X ` space M - {x}" by auto
    from X.measure_mono[OF this] X0 asm
    have "distribution X {y} = 0" by auto }
  hence fi: "\<And> y. y \<in> X ` space M \<Longrightarrow> real (distribution X {y}) = (if x = y then 1 else 0)"
    using assms by auto
  have y: "\<And>y. (if x = y then 1 else 0) * log b (if x = y then 1 else 0) = 0" by simp
  show ?thesis unfolding entropy_eq[OF `simple_function X`] by (auto simp: y fi)
qed

lemma (in information_space) entropy_le_card_not_0:
  assumes "simple_function X"
  shows "\<H>(X) \<le> log b (real (card (X ` space M \<inter> {x . distribution X {x} \<noteq> 0})))"
proof -
  let "?d x" = "distribution X {x}"
  let "?p x" = "real (?d x)"
  have "\<H>(X) = (\<Sum>x\<in>X`space M. ?p x * log b (1 / ?p x))"
    by (auto intro!: setsum_cong simp: entropy_eq[OF `simple_function X`] setsum_negf[symmetric] log_simps not_less)
  also have "\<dots> \<le> log b (\<Sum>x\<in>X`space M. ?p x * (1 / ?p x))"
    apply (rule log_setsum')
    using not_empty b_gt_1 `simple_function X` sum_over_space_real_distribution
    by (auto simp: simple_function_def)
  also have "\<dots> = log b (\<Sum>x\<in>X`space M. if ?d x \<noteq> 0 then 1 else 0)"
    using distribution_finite[OF `simple_function X`[THEN simple_function_imp_random_variable], simplified]
    by (intro arg_cong[where f="\<lambda>X. log b X"] setsum_cong) (auto simp: real_of_pextreal_eq_0)
  finally show ?thesis
    using `simple_function X` by (auto simp: setsum_cases real_eq_of_nat simple_function_def)
qed

lemma (in information_space) entropy_uniform_max:
  assumes "simple_function X"
  assumes "\<And>x y. \<lbrakk> x \<in> X ` space M ; y \<in> X ` space M \<rbrakk> \<Longrightarrow> distribution X {x} = distribution X {y}"
  shows "\<H>(X) = log b (real (card (X ` space M)))"
proof -
  interpret X: finite_prob_space "\<lparr> space = X ` space M, sets = Pow (X ` space M) \<rparr>" "distribution X"
    using simple_function_imp_finite_random_variable[OF `simple_function X`]
    by (rule distribution_finite_prob_space)
  have card_gt0: "0 < card (X ` space M)" unfolding card_gt_0_iff
    using `simple_function X` not_empty by (auto simp: simple_function_def)
  { fix x assume "x \<in> X ` space M"
    hence "real (distribution X {x}) = 1 / real (card (X ` space M))"
    proof (rule X.uniform_prob[simplified])
      fix x y assume "x \<in> X`space M" "y \<in> X`space M"
      from assms(2)[OF this] show "real (distribution X {x}) = real (distribution X {y})" by simp
    qed }
  thus ?thesis
    using not_empty X.finite_space b_gt_1 card_gt0
    by (simp add: entropy_eq[OF `simple_function X`] real_eq_of_nat[symmetric] log_simps)
qed

lemma (in information_space) entropy_le_card:
  assumes "simple_function X"
  shows "\<H>(X) \<le> log b (real (card (X ` space M)))"
proof cases
  assume "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} = {}"
  then have "\<And>x. x\<in>X`space M \<Longrightarrow> distribution X {x} = 0" by auto
  moreover
  have "0 < card (X`space M)"
    using `simple_function X` not_empty
    by (auto simp: card_gt_0_iff simple_function_def)
  then have "log b 1 \<le> log b (real (card (X`space M)))"
    using b_gt_1 by (intro log_le) auto
  ultimately show ?thesis using assms by (simp add: entropy_eq)
next
  assume False: "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} \<noteq> {}"
  have "card (X ` space M \<inter> {x. distribution X {x} \<noteq> 0}) \<le> card (X ` space M)"
    (is "?A \<le> ?B") using assms not_empty by (auto intro!: card_mono simp: simple_function_def)
  note entropy_le_card_not_0[OF assms]
  also have "log b (real ?A) \<le> log b (real ?B)"
    using b_gt_1 False not_empty `?A \<le> ?B` assms
    by (auto intro!: log_le simp: card_gt_0_iff simp: simple_function_def)
  finally show ?thesis .
qed

lemma (in information_space) entropy_commute:
  assumes "simple_function X" "simple_function Y"
  shows "\<H>(\<lambda>x. (X x, Y x)) = \<H>(\<lambda>x. (Y x, X x))"
proof -
  have sf: "simple_function (\<lambda>x. (X x, Y x))" "simple_function (\<lambda>x. (Y x, X x))"
    using assms by (auto intro: simple_function_Pair)
  have *: "(\<lambda>x. (Y x, X x))`space M = (\<lambda>(a,b). (b,a))`(\<lambda>x. (X x, Y x))`space M"
    by auto
  have inj: "\<And>X. inj_on (\<lambda>(a,b). (b,a)) X"
    by (auto intro!: inj_onI)
  show ?thesis
    unfolding sf[THEN entropy_eq] unfolding * setsum_reindex[OF inj]
    by (simp add: joint_distribution_commute[of Y X] split_beta)
qed

lemma (in information_space) entropy_eq_cartesian_product:
  assumes "simple_function X" "simple_function Y"
  shows "\<H>(\<lambda>x. (X x, Y x)) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M.
    real (joint_distribution X Y {(x,y)}) *
    log b (real (joint_distribution X Y {(x,y)})))"
proof -
  have sf: "simple_function (\<lambda>x. (X x, Y x))"
    using assms by (auto intro: simple_function_Pair)
  { fix x assume "x\<notin>(\<lambda>x. (X x, Y x))`space M"
    then have "(\<lambda>x. (X x, Y x)) -` {x} \<inter> space M = {}" by auto
    then have "joint_distribution X Y {x} = 0"
      unfolding distribution_def by auto }
  then show ?thesis using sf assms
    unfolding entropy_eq[OF sf] neg_equal_iff_equal setsum_cartesian_product
    by (auto intro!: setsum_mono_zero_cong_left simp: simple_function_def)
qed

subsection {* Conditional Mutual Information *}

definition (in prob_space)
  "conditional_mutual_information b M1 M2 M3 X Y Z \<equiv>
    mutual_information b M1 (sigma (pair_algebra M2 M3)) X (\<lambda>x. (Y x, Z x)) -
    mutual_information b M1 M3 X Z"

abbreviation (in information_space)
  conditional_mutual_information_Pow ("\<I>'( _ ; _ | _ ')") where
  "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b
    \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
    \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr>
    \<lparr> space = Z`space M, sets = Pow (Z`space M) \<rparr>
    X Y Z"


lemma (in information_space) conditional_mutual_information_generic_eq:
  assumes MX: "finite_random_variable MX X"
    and MY: "finite_random_variable MY Y"
    and MZ: "finite_random_variable MZ Z"
  shows "conditional_mutual_information b MX MY MZ X Y Z = (\<Sum>(x, y, z) \<in> space MX \<times> space MY \<times> space MZ.
             real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
             log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) /
    (real (joint_distribution X Z {(x, z)}) * real (joint_distribution Y Z {(y,z)} / distribution Z {z}))))"
  (is "_ = (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?XZ x z * ?YZdZ y z)))")
proof -
  let ?YZ = "\<lambda>y z. real (joint_distribution Y Z {(y, z)})"
  let ?X = "\<lambda>x. real (distribution X {x})"
  let ?Z = "\<lambda>z. real (distribution Z {z})"

  txt {* This proof is actually quiet easy, however we need to show that the
    distributions are finite and the joint distributions are zero when one of
    the variables distribution is also zero. *}

  note finite_var = MX MY MZ
  note random_var = finite_var[THEN finite_random_variableD]

  note space_simps = space_pair_algebra space_sigma algebra.simps

  note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
  note XZ = finite_random_variable_pairI[OF finite_var(1,3)]
  note ZX = finite_random_variable_pairI[OF finite_var(3,1)]
  note YZX = finite_random_variable_pairI[OF finite_var(2) ZX]
  note order1 =
    finite_distribution_order(5,6)[OF finite_var(1) YZ, simplified space_simps]
    finite_distribution_order(5,6)[OF finite_var(1,3), simplified space_simps]

  note finite = finite_var(1) YZ finite_var(3) XZ YZX
  note finite[THEN finite_distribution_finite, simplified space_simps, simp]

  have order2: "\<And>x y z. \<lbrakk>x \<in> space MX; y \<in> space MY; z \<in> space MZ; joint_distribution X Z {(x, z)} = 0\<rbrakk>
          \<Longrightarrow> joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)} = 0"
    unfolding joint_distribution_commute_singleton[of X]
    unfolding joint_distribution_assoc_singleton[symmetric]
    using finite_distribution_order(6)[OF finite_var(2) ZX]
    by (auto simp: space_simps)

  have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?XZ x z * ?YZdZ y z))) =
    (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * (log b (?XYZ x y z / (?X x * ?YZ y z)) - log b (?XZ x z / (?X x * ?Z z))))"
    (is "(\<Sum>(x, y, z)\<in>?S. ?L x y z) = (\<Sum>(x, y, z)\<in>?S. ?R x y z)")
  proof (safe intro!: setsum_cong)
    fix x y z assume space: "x \<in> space MX" "y \<in> space MY" "z \<in> space MZ"
    then have *: "?XYZ x y z / (?XZ x z * ?YZdZ y z) =
      (?XYZ x y z / (?X x * ?YZ y z)) / (?XZ x z / (?X x * ?Z z))"
      using order1(3)
      by (auto simp: real_of_pextreal_mult[symmetric] real_of_pextreal_eq_0)
    show "?L x y z = ?R x y z"
    proof cases
      assume "?XYZ x y z \<noteq> 0"
      with space b_gt_1 order1 order2 show ?thesis unfolding *
        by (subst log_divide)
           (auto simp: zero_less_divide_iff zero_less_real_of_pextreal
                       real_of_pextreal_eq_0 zero_less_mult_iff)
    qed simp
  qed
  also have "\<dots> = (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?X x * ?YZ y z))) -
                  (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XZ x z / (?X x * ?Z z)))"
    by (auto simp add: setsum_subtractf[symmetric] field_simps intro!: setsum_cong)
  also have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XZ x z / (?X x * ?Z z))) =
             (\<Sum>(x, z)\<in>space MX \<times> space MZ. ?XZ x z * log b (?XZ x z / (?X x * ?Z z)))"
    unfolding setsum_cartesian_product[symmetric] setsum_commute[of _ _ "space MY"]
              setsum_left_distrib[symmetric]
    unfolding joint_distribution_commute_singleton[of X]
    unfolding joint_distribution_assoc_singleton[symmetric]
    using setsum_real_joint_distribution_singleton[OF finite_var(2) ZX, unfolded space_simps]
    by (intro setsum_cong refl) simp
  also have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?X x * ?YZ y z))) -
             (\<Sum>(x, z)\<in>space MX \<times> space MZ. ?XZ x z * log b (?XZ x z / (?X x * ?Z z))) =
             conditional_mutual_information b MX MY MZ X Y Z"
    unfolding conditional_mutual_information_def
    unfolding mutual_information_generic_eq[OF finite_var(1,3)]
    unfolding mutual_information_generic_eq[OF finite_var(1) YZ]
    by (simp add: space_sigma space_pair_algebra setsum_cartesian_product')
  finally show ?thesis by simp
qed

lemma (in information_space) conditional_mutual_information_eq:
  assumes "simple_function X" "simple_function Y" "simple_function Z"
  shows "\<I>(X;Y|Z) = (\<Sum>(x, y, z) \<in> X`space M \<times> Y`space M \<times> Z`space M.
             real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
             log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) /
    (real (joint_distribution X Z {(x, z)}) * real (joint_distribution Y Z {(y,z)} / distribution Z {z}))))"
  using conditional_mutual_information_generic_eq[OF assms[THEN simple_function_imp_finite_random_variable]]
  by simp

lemma (in information_space) conditional_mutual_information_eq_mutual_information:
  assumes X: "simple_function X" and Y: "simple_function Y"
  shows "\<I>(X ; Y) = \<I>(X ; Y | (\<lambda>x. ()))"
proof -
  have [simp]: "(\<lambda>x. ()) ` space M = {()}" using not_empty by auto
  have C: "simple_function (\<lambda>x. ())" by auto
  show ?thesis
    unfolding conditional_mutual_information_eq[OF X Y C]
    unfolding mutual_information_eq[OF X Y]
    by (simp add: setsum_cartesian_product' distribution_remove_const)
qed

lemma (in prob_space) distribution_unit[simp]: "distribution (\<lambda>x. ()) {()} = 1"
  unfolding distribution_def using measure_space_1 by auto

lemma (in prob_space) joint_distribution_unit[simp]: "distribution (\<lambda>x. (X x, ())) {(a, ())} = distribution X {a}"
  unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>])

lemma (in prob_space) setsum_distribution:
  assumes X: "finite_random_variable MX X" shows "(\<Sum>a\<in>space MX. distribution X {a}) = 1"
  using setsum_joint_distribution[OF assms, of "\<lparr> space = UNIV, sets = Pow UNIV \<rparr>" "\<lambda>x. ()" "{()}"]
  using sigma_algebra_Pow[of "UNIV::unit set"] by simp

lemma (in prob_space) setsum_real_distribution:
  assumes X: "finite_random_variable MX X" shows "(\<Sum>a\<in>space MX. real (distribution X {a})) = 1"
  using setsum_real_joint_distribution[OF assms, of "\<lparr> space = UNIV, sets = Pow UNIV \<rparr>" "\<lambda>x. ()" "{()}"]
  using sigma_algebra_Pow[of "UNIV::unit set"] by simp

lemma (in information_space) conditional_mutual_information_generic_positive:
  assumes "finite_random_variable MX X" and "finite_random_variable MY Y" and "finite_random_variable MZ Z"
  shows "0 \<le> conditional_mutual_information b MX MY MZ X Y Z"
proof (cases "space MX \<times> space MY \<times> space MZ = {}")
  case True show ?thesis
    unfolding conditional_mutual_information_generic_eq[OF assms] True
    by simp
next
  case False
  let "?dXYZ A" = "real (distribution (\<lambda>x. (X x, Y x, Z x)) A)"
  let "?dXZ A" = "real (joint_distribution X Z A)"
  let "?dYZ A" = "real (joint_distribution Y Z A)"
  let "?dX A" = "real (distribution X A)"
  let "?dZ A" = "real (distribution Z A)"
  let ?M = "space MX \<times> space MY \<times> space MZ"

  have split_beta: "\<And>f. split f = (\<lambda>x. f (fst x) (snd x))" by (simp add: fun_eq_iff)

  note space_simps = space_pair_algebra space_sigma algebra.simps

  note finite_var = assms
  note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
  note XZ = finite_random_variable_pairI[OF finite_var(1,3)]
  note ZX = finite_random_variable_pairI[OF finite_var(3,1)]
  note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
  note XYZ = finite_random_variable_pairI[OF finite_var(1) YZ]
  note finite = finite_var(3) YZ XZ XYZ
  note finite = finite[THEN finite_distribution_finite, simplified space_simps]

  have order: "\<And>x y z. \<lbrakk>x \<in> space MX; y \<in> space MY; z \<in> space MZ; joint_distribution X Z {(x, z)} = 0\<rbrakk>
          \<Longrightarrow> joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)} = 0"
    unfolding joint_distribution_commute_singleton[of X]
    unfolding joint_distribution_assoc_singleton[symmetric]
    using finite_distribution_order(6)[OF finite_var(2) ZX]
    by (auto simp: space_simps)

  note order = order
    finite_distribution_order(5,6)[OF finite_var(1) YZ, simplified space_simps]
    finite_distribution_order(5,6)[OF finite_var(2,3), simplified space_simps]

  have "- conditional_mutual_information b MX MY MZ X Y Z = - (\<Sum>(x, y, z) \<in> ?M. ?dXYZ {(x, y, z)} *
    log b (?dXYZ {(x, y, z)} / (?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})))"
    unfolding conditional_mutual_information_generic_eq[OF assms] neg_equal_iff_equal
    by (intro setsum_cong) (auto intro!: arg_cong[where f="log b"] simp: real_of_pextreal_mult[symmetric])
  also have "\<dots> \<le> log b (\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})"
    unfolding split_beta
  proof (rule log_setsum_divide)
    show "?M \<noteq> {}" using False by simp
    show "1 < b" using b_gt_1 .

    show "finite ?M" using assms
      unfolding finite_sigma_algebra_def finite_sigma_algebra_axioms_def by auto

    show "(\<Sum>x\<in>?M. ?dXYZ {(fst x, fst (snd x), snd (snd x))}) = 1"
      unfolding setsum_cartesian_product'
      unfolding setsum_commute[of _ "space MY"]
      unfolding setsum_commute[of _ "space MZ"]
      by (simp_all add: space_pair_algebra
        setsum_real_joint_distribution_singleton[OF `finite_random_variable MX X` YZ]
        setsum_real_joint_distribution_singleton[OF `finite_random_variable MY Y` finite_var(3)]
        setsum_real_distribution[OF `finite_random_variable MZ Z`])

    fix x assume "x \<in> ?M"
    let ?x = "(fst x, fst (snd x), snd (snd x))"

    show "0 \<le> ?dXYZ {?x}" using real_pextreal_nonneg .
    show "0 \<le> ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
     by (simp add: real_pextreal_nonneg mult_nonneg_nonneg divide_nonneg_nonneg)

    assume *: "0 < ?dXYZ {?x}"
    with `x \<in> ?M` show "0 < ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
      using finite order
      by (cases x)
         (auto simp add: zero_less_real_of_pextreal zero_less_mult_iff zero_less_divide_iff)
  qed
  also have "(\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z}) = (\<Sum>z\<in>space MZ. ?dZ {z})"
    apply (simp add: setsum_cartesian_product')
    apply (subst setsum_commute)
    apply (subst (2) setsum_commute)
    by (auto simp: setsum_divide_distrib[symmetric] setsum_product[symmetric]
                   setsum_real_joint_distribution_singleton[OF finite_var(1,3)]
                   setsum_real_joint_distribution_singleton[OF finite_var(2,3)]
          intro!: setsum_cong)
  also have "log b (\<Sum>z\<in>space MZ. ?dZ {z}) = 0"
    unfolding setsum_real_distribution[OF finite_var(3)] by simp
  finally show ?thesis by simp
qed

lemma (in information_space) conditional_mutual_information_positive:
  assumes "simple_function X" and "simple_function Y" and "simple_function Z"
  shows "0 \<le> \<I>(X;Y|Z)"
  using conditional_mutual_information_generic_positive[OF assms[THEN simple_function_imp_finite_random_variable]]
  by simp

subsection {* Conditional Entropy *}

definition (in prob_space)
  "conditional_entropy b S T X Y = conditional_mutual_information b S S T X X Y"

abbreviation (in information_space)
  conditional_entropy_Pow ("\<H>'(_ | _')") where
  "\<H>(X | Y) \<equiv> conditional_entropy b
    \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
    \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"

lemma (in information_space) conditional_entropy_positive:
  "simple_function X \<Longrightarrow> simple_function Y \<Longrightarrow> 0 \<le> \<H>(X | Y)"
  unfolding conditional_entropy_def by (auto intro!: conditional_mutual_information_positive)

lemma (in measure_space) empty_measureI: "A = {} \<Longrightarrow> \<mu> A = 0" by simp

lemma (in information_space) conditional_entropy_generic_eq:
  assumes MX: "finite_random_variable MX X"
  assumes MZ: "finite_random_variable MZ Z"
  shows "conditional_entropy b MX MZ X Z =
     - (\<Sum>(x, z)\<in>space MX \<times> space MZ.
         real (joint_distribution X Z {(x, z)}) *
         log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
proof -
  interpret MX: finite_sigma_algebra MX using MX by simp
  interpret MZ: finite_sigma_algebra MZ using MZ by simp
  let "?XXZ x y z" = "joint_distribution X (\<lambda>x. (X x, Z x)) {(x, y, z)}"
  let "?XZ x z" = "joint_distribution X Z {(x, z)}"
  let "?Z z" = "distribution Z {z}"
  let "?f x y z" = "log b (real (?XXZ x y z) / (real (?XZ x z) * real (?XZ y z / ?Z z)))"
  { fix x z have "?XXZ x x z = ?XZ x z"
      unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>]) }
  note this[simp]
  { fix x x' :: 'b and z assume "x' \<noteq> x"
    then have "?XXZ x x' z = 0"
      by (auto simp: distribution_def intro!: arg_cong[where f=\<mu>] empty_measureI) }
  note this[simp]
  { fix x x' z assume *: "x \<in> space MX" "z \<in> space MZ"
    then have "(\<Sum>x'\<in>space MX. real (?XXZ x x' z) * ?f x x' z)
      = (\<Sum>x'\<in>space MX. if x = x' then real (?XZ x z) * ?f x x z else 0)"
      by (auto intro!: setsum_cong)
    also have "\<dots> = real (?XZ x z) * ?f x x z"
      using `x \<in> space MX` by (simp add: setsum_cases[OF MX.finite_space])
    also have "\<dots> = real (?XZ x z) * log b (real (?Z z) / real (?XZ x z))"
      by (auto simp: real_of_pextreal_mult[symmetric])
    also have "\<dots> = - real (?XZ x z) * log b (real (?XZ x z) / real (?Z z))"
      using assms[THEN finite_distribution_finite]
      using finite_distribution_order(6)[OF MX MZ]
      by (auto simp: log_simps field_simps zero_less_mult_iff zero_less_real_of_pextreal real_of_pextreal_eq_0)
    finally have "(\<Sum>x'\<in>space MX. real (?XXZ x x' z) * ?f x x' z) =
      - real (?XZ x z) * log b (real (?XZ x z) / real (?Z z))" . }
  note * = this

  show ?thesis
    unfolding conditional_entropy_def
    unfolding conditional_mutual_information_generic_eq[OF MX MX MZ]
    by (auto simp: setsum_cartesian_product' setsum_negf[symmetric]
                   setsum_commute[of _ "space MZ"] *   simp del: divide_pextreal_def
             intro!: setsum_cong)
qed

lemma (in information_space) conditional_entropy_eq:
  assumes "simple_function X" "simple_function Z"
  shows "\<H>(X | Z) =
     - (\<Sum>(x, z)\<in>X ` space M \<times> Z ` space M.
         real (joint_distribution X Z {(x, z)}) *
         log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
  using conditional_entropy_generic_eq[OF assms[THEN simple_function_imp_finite_random_variable]]
  by simp

lemma (in information_space) conditional_entropy_eq_ce_with_hypothesis:
  assumes X: "simple_function X" and Y: "simple_function Y"
  shows "\<H>(X | Y) =
    -(\<Sum>y\<in>Y`space M. real (distribution Y {y}) *
      (\<Sum>x\<in>X`space M. real (joint_distribution X Y {(x,y)}) / real (distribution Y {(y)}) *
              log b (real (joint_distribution X Y {(x,y)}) / real (distribution Y {(y)}))))"
  unfolding conditional_entropy_eq[OF assms]
  using finite_distribution_finite[OF finite_random_variable_pairI[OF assms[THEN simple_function_imp_finite_random_variable]]]
  using finite_distribution_order(5,6)[OF assms[THEN simple_function_imp_finite_random_variable]]
  using finite_distribution_finite[OF Y[THEN simple_function_imp_finite_random_variable]]
  by (auto simp: setsum_cartesian_product'  setsum_commute[of _ "Y`space M"] setsum_right_distrib real_of_pextreal_eq_0
           intro!: setsum_cong)

lemma (in information_space) conditional_entropy_eq_cartesian_product:
  assumes "simple_function X" "simple_function Y"
  shows "\<H>(X | Y) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M.
    real (joint_distribution X Y {(x,y)}) *
    log b (real (joint_distribution X Y {(x,y)}) / real (distribution Y {y})))"
  unfolding conditional_entropy_eq[OF assms]
  by (auto intro!: setsum_cong simp: setsum_cartesian_product')

subsection {* Equalities *}

lemma (in information_space) mutual_information_eq_entropy_conditional_entropy:
  assumes X: "simple_function X" and Z: "simple_function Z"
  shows  "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)"
proof -
  let "?XZ x z" = "real (joint_distribution X Z {(x, z)})"
  let "?Z z" = "real (distribution Z {z})"
  let "?X x" = "real (distribution X {x})"
  note fX = X[THEN simple_function_imp_finite_random_variable]
  note fZ = Z[THEN simple_function_imp_finite_random_variable]
  note fX[THEN finite_distribution_finite, simp] and fZ[THEN finite_distribution_finite, simp]
  note finite_distribution_order[OF fX fZ, simp]
  { fix x z assume "x \<in> X`space M" "z \<in> Z`space M"
    have "?XZ x z * log b (?XZ x z / (?X x * ?Z z)) =
          ?XZ x z * log b (?XZ x z / ?Z z) - ?XZ x z * log b (?X x)"
      by (auto simp: log_simps real_of_pextreal_mult[symmetric] zero_less_mult_iff
                     zero_less_real_of_pextreal field_simps real_of_pextreal_eq_0 abs_mult) }
  note * = this
  show ?thesis
    unfolding entropy_eq[OF X] conditional_entropy_eq[OF X Z] mutual_information_eq[OF X Z]
    using setsum_real_joint_distribution_singleton[OF fZ fX, unfolded joint_distribution_commute_singleton[of Z X]]
    by (simp add: * setsum_cartesian_product' setsum_subtractf setsum_left_distrib[symmetric]
                     setsum_real_distribution)
qed

lemma (in information_space) conditional_entropy_less_eq_entropy:
  assumes X: "simple_function X" and Z: "simple_function Z"
  shows "\<H>(X | Z) \<le> \<H>(X)"
proof -
  have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy[OF assms] .
  with mutual_information_positive[OF X Z] entropy_positive[OF X]
  show ?thesis by auto
qed

lemma (in information_space) entropy_chain_rule:
  assumes X: "simple_function X" and Y: "simple_function Y"
  shows  "\<H>(\<lambda>x. (X x, Y x)) = \<H>(X) + \<H>(Y|X)"
proof -
  let "?XY x y" = "real (joint_distribution X Y {(x, y)})"
  let "?Y y" = "real (distribution Y {y})"
  let "?X x" = "real (distribution X {x})"
  note fX = X[THEN simple_function_imp_finite_random_variable]
  note fY = Y[THEN simple_function_imp_finite_random_variable]
  note fX[THEN finite_distribution_finite, simp] and fY[THEN finite_distribution_finite, simp]
  note finite_distribution_order[OF fX fY, simp]
  { fix x y assume "x \<in> X`space M" "y \<in> Y`space M"
    have "?XY x y * log b (?XY x y / ?X x) =
          ?XY x y * log b (?XY x y) - ?XY x y * log b (?X x)"
      by (auto simp: log_simps real_of_pextreal_mult[symmetric] zero_less_mult_iff
                     zero_less_real_of_pextreal field_simps real_of_pextreal_eq_0 abs_mult) }
  note * = this
  show ?thesis
    using setsum_real_joint_distribution_singleton[OF fY fX]
    unfolding entropy_eq[OF X] conditional_entropy_eq_cartesian_product[OF Y X] entropy_eq_cartesian_product[OF X Y]
    unfolding joint_distribution_commute_singleton[of Y X] setsum_commute[of _ "X`space M"]
    by (simp add: * setsum_subtractf setsum_left_distrib[symmetric])
qed

section {* Partitioning *}

definition "subvimage A f g \<longleftrightarrow> (\<forall>x \<in> A. f -` {f x} \<inter> A \<subseteq> g -` {g x} \<inter> A)"

lemma subvimageI:
  assumes "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
  shows "subvimage A f g"
  using assms unfolding subvimage_def by blast

lemma subvimageE[consumes 1]:
  assumes "subvimage A f g"
  obtains "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
  using assms unfolding subvimage_def by blast

lemma subvimageD:
  "\<lbrakk> subvimage A f g ; x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
  using assms unfolding subvimage_def by blast

lemma subvimage_subset:
  "\<lbrakk> subvimage B f g ; A \<subseteq> B \<rbrakk> \<Longrightarrow> subvimage A f g"
  unfolding subvimage_def by auto

lemma subvimage_idem[intro]: "subvimage A g g"
  by (safe intro!: subvimageI)

lemma subvimage_comp_finer[intro]:
  assumes svi: "subvimage A g h"
  shows "subvimage A g (f \<circ> h)"
proof (rule subvimageI, simp)
  fix x y assume "x \<in> A" "y \<in> A" "g x = g y"
  from svi[THEN subvimageD, OF this]
  show "f (h x) = f (h y)" by simp
qed

lemma subvimage_comp_gran:
  assumes svi: "subvimage A g h"
  assumes inj: "inj_on f (g ` A)"
  shows "subvimage A (f \<circ> g) h"
  by (rule subvimageI) (auto intro!: subvimageD[OF svi] simp: inj_on_iff[OF inj])

lemma subvimage_comp:
  assumes svi: "subvimage (f ` A) g h"
  shows "subvimage A (g \<circ> f) (h \<circ> f)"
  by (rule subvimageI) (auto intro!: svi[THEN subvimageD])

lemma subvimage_trans:
  assumes fg: "subvimage A f g"
  assumes gh: "subvimage A g h"
  shows "subvimage A f h"
  by (rule subvimageI) (auto intro!: fg[THEN subvimageD] gh[THEN subvimageD])

lemma subvimage_translator:
  assumes svi: "subvimage A f g"
  shows "\<exists>h. \<forall>x \<in> A. h (f x)  = g x"
proof (safe intro!: exI[of _ "\<lambda>x. (THE z. z \<in> (g ` (f -` {x} \<inter> A)))"])
  fix x assume "x \<in> A"
  show "(THE x'. x' \<in> (g ` (f -` {f x} \<inter> A))) = g x"
    by (rule theI2[of _ "g x"])
      (insert `x \<in> A`, auto intro!: svi[THEN subvimageD])
qed

lemma subvimage_translator_image:
  assumes svi: "subvimage A f g"
  shows "\<exists>h. h ` f ` A = g ` A"
proof -
  from subvimage_translator[OF svi]
  obtain h where "\<And>x. x \<in> A \<Longrightarrow> h (f x) = g x" by auto
  thus ?thesis
    by (auto intro!: exI[of _ h]
      simp: image_compose[symmetric] comp_def cong: image_cong)
qed

lemma subvimage_finite:
  assumes svi: "subvimage A f g" and fin: "finite (f`A)"
  shows "finite (g`A)"
proof -
  from subvimage_translator_image[OF svi]
  obtain h where "g`A = h`f`A" by fastsimp
  with fin show "finite (g`A)" by simp
qed

lemma subvimage_disj:
  assumes svi: "subvimage A f g"
  shows "f -` {x} \<inter> A \<subseteq> g -` {y} \<inter> A \<or>
      f -` {x} \<inter> g -` {y} \<inter> A = {}" (is "?sub \<or> ?dist")
proof (rule disjCI)
  assume "\<not> ?dist"
  then obtain z where "z \<in> A" and "x = f z" and "y = g z" by auto
  thus "?sub" using svi unfolding subvimage_def by auto
qed

lemma setsum_image_split:
  assumes svi: "subvimage A f g" and fin: "finite (f ` A)"
  shows "(\<Sum>x\<in>f`A. h x) = (\<Sum>y\<in>g`A. \<Sum>x\<in>f`(g -` {y} \<inter> A). h x)"
    (is "?lhs = ?rhs")
proof -
  have "f ` A =
      snd ` (SIGMA x : g ` A. f ` (g -` {x} \<inter> A))"
      (is "_ = snd ` ?SIGMA")
    unfolding image_split_eq_Sigma[symmetric]
    by (simp add: image_compose[symmetric] comp_def)
  moreover
  have snd_inj: "inj_on snd ?SIGMA"
    unfolding image_split_eq_Sigma[symmetric]
    by (auto intro!: inj_onI subvimageD[OF svi])
  ultimately
  have "(\<Sum>x\<in>f`A. h x) = (\<Sum>(x,y)\<in>?SIGMA. h y)"
    by (auto simp: setsum_reindex intro: setsum_cong)
  also have "... = ?rhs"
    using subvimage_finite[OF svi fin] fin
    apply (subst setsum_Sigma[symmetric])
    by (auto intro!: finite_subset[of _ "f`A"])
  finally show ?thesis .
qed

lemma (in information_space) entropy_partition:
  assumes sf: "simple_function X" "simple_function P"
  assumes svi: "subvimage (space M) X P"
  shows "\<H>(X) = \<H>(P) + \<H>(X|P)"
proof -
  let "?XP x p" = "real (joint_distribution X P {(x, p)})"
  let "?X x" = "real (distribution X {x})"
  let "?P p" = "real (distribution P {p})"
  note fX = sf(1)[THEN simple_function_imp_finite_random_variable]
  note fP = sf(2)[THEN simple_function_imp_finite_random_variable]
  note fX[THEN finite_distribution_finite, simp] and fP[THEN finite_distribution_finite, simp]
  note finite_distribution_order[OF fX fP, simp]
  have "(\<Sum>x\<in>X ` space M. real (distribution X {x}) * log b (real (distribution X {x}))) =
    (\<Sum>y\<in>P `space M. \<Sum>x\<in>X ` space M.
    real (joint_distribution X P {(x, y)}) * log b (real (joint_distribution X P {(x, y)})))"
  proof (subst setsum_image_split[OF svi],
      safe intro!: setsum_mono_zero_cong_left imageI)
    show "finite (X ` space M)" "finite (X ` space M)" "finite (P ` space M)"
      using sf unfolding simple_function_def by auto
  next
    fix p x assume in_space: "p \<in> space M" "x \<in> space M"
    assume "real (joint_distribution X P {(X x, P p)}) * log b (real (joint_distribution X P {(X x, P p)})) \<noteq> 0"
    hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M \<noteq> {}" by (auto simp: distribution_def)
    with svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
    show "x \<in> P -` {P p}" by auto
  next
    fix p x assume in_space: "p \<in> space M" "x \<in> space M"
    assume "P x = P p"
    from this[symmetric] svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
    have "X -` {X x} \<inter> space M \<subseteq> P -` {P p} \<inter> space M"
      by auto
    hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M = X -` {X x} \<inter> space M"
      by auto
    thus "real (distribution X {X x}) * log b (real (distribution X {X x})) =
          real (joint_distribution X P {(X x, P p)}) *
          log b (real (joint_distribution X P {(X x, P p)}))"
      by (auto simp: distribution_def)
  qed
  moreover have "\<And>x y. real (joint_distribution X P {(x, y)}) *
      log b (real (joint_distribution X P {(x, y)}) / real (distribution P {y})) =
      real (joint_distribution X P {(x, y)}) * log b (real (joint_distribution X P {(x, y)})) -
      real (joint_distribution X P {(x, y)}) * log b (real (distribution P {y}))"
    by (auto simp add: log_simps zero_less_mult_iff field_simps)
  ultimately show ?thesis
    unfolding sf[THEN entropy_eq] conditional_entropy_eq[OF sf]
    using setsum_real_joint_distribution_singleton[OF fX fP]
    by (simp add: setsum_cartesian_product' setsum_subtractf setsum_real_distribution
      setsum_left_distrib[symmetric] setsum_commute[where B="P`space M"])
qed

corollary (in information_space) entropy_data_processing:
  assumes X: "simple_function X" shows "\<H>(f \<circ> X) \<le> \<H>(X)"
proof -
  note X
  moreover have fX: "simple_function (f \<circ> X)" using X by auto
  moreover have "subvimage (space M) X (f \<circ> X)" by auto
  ultimately have "\<H>(X) = \<H>(f\<circ>X) + \<H>(X|f\<circ>X)" by (rule entropy_partition)
  then show "\<H>(f \<circ> X) \<le> \<H>(X)"
    by (auto intro: conditional_entropy_positive[OF X fX])
qed

corollary (in information_space) entropy_of_inj:
  assumes X: "simple_function X" and inj: "inj_on f (X`space M)"
  shows "\<H>(f \<circ> X) = \<H>(X)"
proof (rule antisym)
  show "\<H>(f \<circ> X) \<le> \<H>(X)" using entropy_data_processing[OF X] .
next
  have sf: "simple_function (f \<circ> X)"
    using X by auto
  have "\<H>(X) = \<H>(the_inv_into (X`space M) f \<circ> (f \<circ> X))"
    by (auto intro!: mutual_information_cong simp: entropy_def the_inv_into_f_f[OF inj])
  also have "... \<le> \<H>(f \<circ> X)"
    using entropy_data_processing[OF sf] .
  finally show "\<H>(X) \<le> \<H>(f \<circ> X)" .
qed

end