src/HOL/Probability/Information.thy
author hoelzl
Wed Feb 23 11:40:18 2011 +0100 (2011-02-23)
changeset 41833 563bea92b2c0
parent 41689 3e39b0e730d6
child 41981 cdf7693bbe08
permissions -rw-r--r--
add lemma KL_divergence_vimage, mutual_information_generic
     1 theory Information
     2 imports
     3   Probability_Space
     4   "~~/src/HOL/Library/Convex"
     5   Lebesgue_Measure
     6 begin
     7 
     8 lemma log_le: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log a x \<le> log a y"
     9   by (subst log_le_cancel_iff) auto
    10 
    11 lemma log_less: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x < y \<Longrightarrow> log a x < log a y"
    12   by (subst log_less_cancel_iff) auto
    13 
    14 lemma setsum_cartesian_product':
    15   "(\<Sum>x\<in>A \<times> B. f x) = (\<Sum>x\<in>A. setsum (\<lambda>y. f (x, y)) B)"
    16   unfolding setsum_cartesian_product by simp
    17 
    18 section "Convex theory"
    19 
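       text {* The @{text log_setsum} lemmas below are finite forms of Jensen's inequality
         for the concave function $x \mapsto \log_b x$ with base $b > 1$: for nonnegative
         weights $a_i$ summing to $1$ and positive values $y_i$,
         $\log_b (\sum_i a_i y_i) \ge \sum_i a_i \log_b (y_i)$.
         They are obtained from the convexity of $x \mapsto -\log_b x$ on $(0, \infty)$. *}
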
    20 lemma log_setsum:
    21   assumes "finite s" "s \<noteq> {}"
    22   assumes "b > 1"
    23   assumes "(\<Sum> i \<in> s. a i) = 1"
    24   assumes "\<And> i. i \<in> s \<Longrightarrow> a i \<ge> 0"
    25   assumes "\<And> i. i \<in> s \<Longrightarrow> y i \<in> {0 <..}"
    26   shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
    27 proof -
    28   have "convex_on {0 <..} (\<lambda> x. - log b x)"
    29     by (rule minus_log_convex[OF `b > 1`])
    30   hence "- log b (\<Sum> i \<in> s. a i * y i) \<le> (\<Sum> i \<in> s. a i * - log b (y i))"
    31     using convex_on_setsum[of _ _ "\<lambda> x. - log b x"] assms pos_is_convex by fastsimp
    32   thus ?thesis by (auto simp add:setsum_negf le_imp_neg_le)
    33 qed
    34 
    35 lemma log_setsum':
    36   assumes "finite s" "s \<noteq> {}"
    37   assumes "b > 1"
    38   assumes "(\<Sum> i \<in> s. a i) = 1"
    39   assumes pos: "\<And> i. i \<in> s \<Longrightarrow> 0 \<le> a i"
    40           "\<And> i. \<lbrakk> i \<in> s ; 0 < a i \<rbrakk> \<Longrightarrow> 0 < y i"
    41   shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
    42 proof -
    43   have "\<And>y. (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) = (\<Sum> i \<in> s. a i * y i)"
    44     using assms by (auto intro!: setsum_mono_zero_cong_left)
    45   moreover have "log b (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) \<ge> (\<Sum> i \<in> s - {i. a i = 0}. a i * log b (y i))"
    46   proof (rule log_setsum)
    47     have "setsum a (s - {i. a i = 0}) = setsum a s"
    48       using assms(1) by (rule setsum_mono_zero_cong_left) auto
    49     thus sum_1: "setsum a (s - {i. a i = 0}) = 1"
    50       "finite (s - {i. a i = 0})" using assms by simp_all
    51 
    52     show "s - {i. a i = 0} \<noteq> {}"
    53     proof
    54       assume *: "s - {i. a i = 0} = {}"
    55       hence "setsum a (s - {i. a i = 0}) = 0" by (simp add: * setsum_empty)
    56       with sum_1 show False by simp
    57     qed
    58 
    59     fix i assume "i \<in> s - {i. a i = 0}"
    60     hence "i \<in> s" "a i \<noteq> 0" by simp_all
    61     thus "0 \<le> a i" "y i \<in> {0<..}" using pos[of i] by auto
    62   qed fact+
    63   ultimately show ?thesis by simp
    64 qed
    65 
    66 lemma log_setsum_divide:
    67   assumes "finite S" and "S \<noteq> {}" and "1 < b"
    68   assumes "(\<Sum>x\<in>S. g x) = 1"
    69   assumes pos: "\<And>x. x \<in> S \<Longrightarrow> g x \<ge> 0" "\<And>x. x \<in> S \<Longrightarrow> f x \<ge> 0"
    70   assumes g_pos: "\<And>x. \<lbrakk> x \<in> S ; 0 < g x \<rbrakk> \<Longrightarrow> 0 < f x"
    71   shows "- (\<Sum>x\<in>S. g x * log b (g x / f x)) \<le> log b (\<Sum>x\<in>S. f x)"
    72 proof -
    73   have log_mono: "\<And>x y. 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log b x \<le> log b y"
    74     using `1 < b` by (subst log_le_cancel_iff) auto
    75 
    76   have "- (\<Sum>x\<in>S. g x * log b (g x / f x)) = (\<Sum>x\<in>S. g x * log b (f x / g x))"
    77   proof (unfold setsum_negf[symmetric], rule setsum_cong)
    78     fix x assume x: "x \<in> S"
    79     show "- (g x * log b (g x / f x)) = g x * log b (f x / g x)"
    80     proof (cases "g x = 0")
    81       case False
    82       with pos[OF x] g_pos[OF x] have "0 < f x" "0 < g x" by simp_all
    83       thus ?thesis using `1 < b` by (simp add: log_divide field_simps)
    84     qed simp
    85   qed rule
    86   also have "... \<le> log b (\<Sum>x\<in>S. g x * (f x / g x))"
    87   proof (rule log_setsum')
    88     fix x assume x: "x \<in> S" "0 < g x"
    89     with g_pos[OF x] show "0 < f x / g x" by (safe intro!: divide_pos_pos)
    90   qed fact+
    91   also have "... = log b (\<Sum>x\<in>S - {x. g x = 0}. f x)" using `finite S`
    92     by (auto intro!: setsum_mono_zero_cong_right arg_cong[where f="log b"]
    93         split: split_if_asm)
    94   also have "... \<le> log b (\<Sum>x\<in>S. f x)"
    95   proof (rule log_mono)
    96     have "0 = (\<Sum>x\<in>S - {x. g x = 0}. 0)" by simp
    97     also have "... < (\<Sum>x\<in>S - {x. g x = 0}. f x)" (is "_ < ?sum")
    98     proof (rule setsum_strict_mono)
    99       show "finite (S - {x. g x = 0})" using `finite S` by simp
   100       show "S - {x. g x = 0} \<noteq> {}"
   101       proof
   102         assume "S - {x. g x = 0} = {}"
   103         hence "(\<Sum>x\<in>S. g x) = 0" by (subst setsum_0') auto
   104         with `(\<Sum>x\<in>S. g x) = 1` show False by simp
   105       qed
   106       fix x assume "x \<in> S - {x. g x = 0}"
   107       thus "0 < f x" using g_pos[of x] pos(1)[of x] by auto
   108     qed
   109     finally show "0 < ?sum" .
   110     show "(\<Sum>x\<in>S - {x. g x = 0}. f x) \<le> (\<Sum>x\<in>S. f x)"
   111       using `finite S` pos by (auto intro!: setsum_mono2)
   112   qed
   113   finally show ?thesis .
   114 qed
   115 
   116 lemma split_pairs:
   117   "((A, B) = X) \<longleftrightarrow> (fst X = A \<and> snd X = B)" and
   118   "(X = (A, B)) \<longleftrightarrow> (fst X = A \<and> snd X = B)" by auto
   119 
   120 section "Information theory"
   121 
   122 locale information_space = prob_space +
   123   fixes b :: real assumes b_gt_1: "1 < b"
   124 
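       text {* An information space is a probability space together with a fixed logarithm
         base @{text b} satisfying @{text "1 < b"}; commonly $b = 2$ (entropy measured in
         bits) or $b = e$ (nats).  All information-theoretic quantities below are taken
         with respect to this base. *}
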
   125 context information_space
   126 begin
   127 
   128 text {* Introduce some simplification rules for the logarithm to base @{term b}. *}
   129 
   130 lemma log_neg_const:
   131   assumes "x \<le> 0"
   132   shows "log b x = log b 0"
   133 proof -
   134   { fix u :: real
   135     have "x \<le> 0" by fact
   136     also have "0 < exp u"
   137       using exp_gt_zero .
   138     finally have "exp u \<noteq> x"
   139       by auto }
   140   then show "log b x = log b 0"
   141     by (simp add: log_def ln_def)
   142 qed
   143 
   144 lemma log_mult_eq:
   145   "log b (A * B) = (if 0 < A * B then log b \<bar>A\<bar> + log b \<bar>B\<bar> else log b 0)"
   146   using log_mult[of b "\<bar>A\<bar>" "\<bar>B\<bar>"] b_gt_1 log_neg_const[of "A * B"]
   147   by (auto simp: zero_less_mult_iff mult_le_0_iff)
   148 
   149 lemma log_inverse_eq:
   150   "log b (inverse B) = (if 0 < B then - log b B else log b 0)"
   151   using log_inverse[of b B] log_neg_const[of "inverse B"] b_gt_1 by simp
   152 
   153 lemma log_divide_eq:
   154   "log b (A / B) = (if 0 < A * B then log b \<bar>A\<bar> - log b \<bar>B\<bar> else log b 0)"
   155   unfolding divide_inverse log_mult_eq log_inverse_eq abs_inverse
   156   by (auto simp: zero_less_mult_iff mult_le_0_iff)
   157 
   158 lemmas log_simps = log_mult_eq log_inverse_eq log_divide_eq
   159 
   160 end
   161 
   162 subsection "Kullback$-$Leibler divergence"
   163 
   164 text {* The Kullback$-$Leibler divergence is also known as relative entropy or
   165 Kullback$-$Leibler distance. *}
   166 
   167 definition
   168   "KL_divergence b M \<nu> = \<integral>x. log b (real (RN_deriv M \<nu> x)) \<partial>M\<lparr>measure := \<nu>\<rparr>"
   169 
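       text {* Informally, @{text "KL_divergence b M \<nu>"} integrates the logarithm of the
         Radon$-$Nikodym derivative of @{text \<nu>} with respect to the measure @{text \<mu>} of
         @{text M}, the integral being taken with respect to @{text \<nu>} itself:
         $D_b(\nu\,\|\,\mu) = \int \log_b \frac{d\nu}{d\mu}\, d\nu$.
         On a finite measure space this becomes the sum
         $\sum_x \nu\{x\} \log_b \frac{\nu\{x\}}{\mu\{x\}}$,
         cf.\ @{text KL_divergence_eq_finite} below. *}
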
   170 lemma (in sigma_finite_measure) KL_divergence_vimage:
   171   assumes T: "T \<in> measure_preserving M M'"
   172     and T': "T' \<in> measure_preserving (M'\<lparr> measure := \<nu>' \<rparr>) (M\<lparr> measure := \<nu> \<rparr>)"
   173     and inv: "\<And>x. x \<in> space M \<Longrightarrow> T' (T x) = x"
   174     and inv': "\<And>x. x \<in> space M' \<Longrightarrow> T (T' x) = x"
   175   and \<nu>': "measure_space (M'\<lparr>measure := \<nu>'\<rparr>)" "measure_space.absolutely_continuous M' \<nu>'"
   176   and "1 < b"
   177   shows "KL_divergence b M' \<nu>' = KL_divergence b M \<nu>"
   178 proof -
   179   interpret \<nu>': measure_space "M'\<lparr>measure := \<nu>'\<rparr>" by fact
   180   have M: "measure_space (M\<lparr> measure := \<nu>\<rparr>)"
   181     by (rule \<nu>'.measure_space_vimage[OF _ T'], simp) default
   182   have "sigma_algebra (M'\<lparr> measure := \<nu>'\<rparr>)" by default
   183   then have saM': "sigma_algebra M'" by simp
   184   then interpret M': measure_space M' by (rule measure_space_vimage) fact
   185   have ac: "absolutely_continuous \<nu>" unfolding absolutely_continuous_def
   186   proof safe
   187     fix N assume N: "N \<in> sets M" and N_0: "\<mu> N = 0"
   188     then have N': "T' -` N \<inter> space M' \<in> sets M'"
   189       using T' by (auto simp: measurable_def measure_preserving_def)
   190     have "T -` (T' -` N \<inter> space M') \<inter> space M = N"
   191       using inv T N sets_into_space[OF N] by (auto simp: measurable_def measure_preserving_def)
   192     then have "measure M' (T' -` N \<inter> space M') = 0"
   193       using measure_preservingD[OF T N'] N_0 by auto
   194     with \<nu>'(2) N' show "\<nu> N = 0" using measure_preservingD[OF T', of N] N
   195       unfolding M'.absolutely_continuous_def measurable_def by auto
   196   qed
   197 
   198   have sa: "sigma_algebra (M\<lparr>measure := \<nu>\<rparr>)" by simp default
   199   have AE: "AE x. RN_deriv M' \<nu>' (T x) = RN_deriv M \<nu> x"
   200     by (rule RN_deriv_vimage[OF T T' inv \<nu>'])
   201   show ?thesis
   202     unfolding KL_divergence_def
   203   proof (subst \<nu>'.integral_vimage[OF sa T'])
   204     show "(\<lambda>x. log b (real (RN_deriv M \<nu> x))) \<in> borel_measurable (M\<lparr>measure := \<nu>\<rparr>)"
   205       by (auto intro!: RN_deriv[OF M ac] borel_measurable_log[OF _ `1 < b`])
   206     have "(\<integral> x. log b (real (RN_deriv M' \<nu>' x)) \<partial>M'\<lparr>measure := \<nu>'\<rparr>) =
   207       (\<integral> x. log b (real (RN_deriv M' \<nu>' (T (T' x)))) \<partial>M'\<lparr>measure := \<nu>'\<rparr>)" (is "?l = _")
   208       using inv' by (auto intro!: \<nu>'.integral_cong)
   209     also have "\<dots> = (\<integral> x. log b (real (RN_deriv M \<nu> (T' x))) \<partial>M'\<lparr>measure := \<nu>'\<rparr>)" (is "_ = ?r")
   210       using M ac AE
   211       by (intro \<nu>'.integral_cong_AE \<nu>'.almost_everywhere_vimage[OF sa T'] absolutely_continuous_AE[OF M])
   212          (auto elim!: AE_mp)
   213     finally show "?l = ?r" .
   214   qed
   215 qed
   216 
   217 lemma (in sigma_finite_measure) KL_divergence_cong:
   218   assumes "measure_space (M\<lparr>measure := \<nu>\<rparr>)" (is "measure_space ?\<nu>")
   219   assumes [simp]: "sets N = sets M" "space N = space M"
   220     "\<And>A. A \<in> sets M \<Longrightarrow> measure N A = \<mu> A"
   221     "\<And>A. A \<in> sets M \<Longrightarrow> \<nu> A = \<nu>' A"
   222   shows "KL_divergence b M \<nu> = KL_divergence b N \<nu>'"
   223 proof -
   224   interpret \<nu>: measure_space ?\<nu> by fact
   225   have "KL_divergence b M \<nu> = \<integral>x. log b (real (RN_deriv N \<nu>' x)) \<partial>?\<nu>"
   226     by (simp cong: RN_deriv_cong \<nu>.integral_cong add: KL_divergence_def)
   227   also have "\<dots> = KL_divergence b N \<nu>'"
   228     by (auto intro!: \<nu>.integral_cong_measure[symmetric] simp: KL_divergence_def)
   229   finally show ?thesis .
   230 qed
   231 
   232 lemma (in finite_measure_space) KL_divergence_eq_finite:
   233   assumes v: "finite_measure_space (M\<lparr>measure := \<nu>\<rparr>)"
   234   assumes ac: "absolutely_continuous \<nu>"
   235   shows "KL_divergence b M \<nu> = (\<Sum>x\<in>space M. real (\<nu> {x}) * log b (real (\<nu> {x}) / real (\<mu> {x})))" (is "_ = ?sum")
   236 proof (simp add: KL_divergence_def finite_measure_space.integral_finite_singleton[OF v])
   237   interpret v: finite_measure_space "M\<lparr>measure := \<nu>\<rparr>" by fact
   238   have ms: "measure_space (M\<lparr>measure := \<nu>\<rparr>)" by default
   239   show "(\<Sum>x \<in> space M. log b (real (RN_deriv M \<nu> x)) * real (\<nu> {x})) = ?sum"
   240     using RN_deriv_finite_measure[OF ms ac]
   241     by (auto intro!: setsum_cong simp: field_simps real_of_pextreal_mult[symmetric])
   242 qed
   243 
   244 lemma (in finite_prob_space) KL_divergence_positive_finite:
   245   assumes v: "finite_prob_space (M\<lparr>measure := \<nu>\<rparr>)"
   246   assumes ac: "absolutely_continuous \<nu>"
   247   and "1 < b"
   248   shows "0 \<le> KL_divergence b M \<nu>"
   249 proof -
   250   interpret v: finite_prob_space "M\<lparr>measure := \<nu>\<rparr>" by fact
   251   have ms: "finite_measure_space (M\<lparr>measure := \<nu>\<rparr>)" by default
   252 
   253   have "- (KL_divergence b M \<nu>) \<le> log b (\<Sum>x\<in>space M. real (\<mu> {x}))"
   254   proof (subst KL_divergence_eq_finite[OF ms ac], safe intro!: log_setsum_divide not_empty)
   255     show "finite (space M)" using finite_space by simp
   256     show "1 < b" by fact
   257     show "(\<Sum>x\<in>space M. real (\<nu> {x})) = 1" using v.finite_sum_over_space_eq_1 by simp
   258 
   259     fix x assume "x \<in> space M"
   260     then have x: "{x} \<in> sets M" unfolding sets_eq_Pow by auto
   261     { assume "0 < real (\<nu> {x})"
   262       then have "\<nu> {x} \<noteq> 0" by auto
   263       then have "\<mu> {x} \<noteq> 0"
   264         using ac[unfolded absolutely_continuous_def, THEN bspec, of "{x}"] x by auto
   265       thus "0 < prob {x}" using finite_measure[of "{x}"] x by auto }
   266   qed auto
   267   thus "0 \<le> KL_divergence b M \<nu>" using finite_sum_over_space_eq_1 by simp
   268 qed
   269 
   270 subsection {* Mutual Information *}
   271 
   272 definition (in prob_space)
   273   "mutual_information b S T X Y =
   274     KL_divergence b (S\<lparr>measure := distribution X\<rparr> \<Otimes>\<^isub>M T\<lparr>measure := distribution Y\<rparr>)
   275       (joint_distribution X Y)"
   276 
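       text {* Mutual information is the Kullback$-$Leibler divergence of the joint
         distribution of @{text X} and @{text Y} from the product of their marginal
         distributions.  For finite random variables it is the familiar sum (writing $p$
         informally for the corresponding distributions)
         $I(X; Y) = \sum_{x, y} p(x, y) \log_b \frac{p(x, y)}{p(x)\, p(y)}$,
         cf.\ @{text mutual_information_generic_eq} below. *}
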
   277 definition (in prob_space)
   278   "entropy b s X = mutual_information b s s X X"
   279 
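       text {* Entropy is the mutual information of a random variable with itself.  For a
         finite random variable this reduces to $H(X) = - \sum_x p(x) \log_b p(x)$,
         cf.\ @{text entropy_generic_eq} below. *}
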
   280 abbreviation (in information_space)
   281   mutual_information_Pow ("\<I>'(_ ; _')") where
   282   "\<I>(X ; Y) \<equiv> mutual_information b
   283     \<lparr> space = X`space M, sets = Pow (X`space M), measure = distribution X \<rparr>
   284     \<lparr> space = Y`space M, sets = Pow (Y`space M), measure = distribution Y \<rparr> X Y"
   285 
   286 lemma (in prob_space) finite_variables_absolutely_continuous:
   287   assumes X: "finite_random_variable S X" and Y: "finite_random_variable T Y"
   288   shows "measure_space.absolutely_continuous
   289     (S\<lparr>measure := distribution X\<rparr> \<Otimes>\<^isub>M T\<lparr>measure := distribution Y\<rparr>)
   290     (joint_distribution X Y)"
   291 proof -
   292   interpret X: finite_prob_space "S\<lparr>measure := distribution X\<rparr>"
   293     using X by (rule distribution_finite_prob_space)
   294   interpret Y: finite_prob_space "T\<lparr>measure := distribution Y\<rparr>"
   295     using Y by (rule distribution_finite_prob_space)
   296   interpret XY: pair_finite_prob_space
   297     "S\<lparr>measure := distribution X\<rparr>" "T\<lparr> measure := distribution Y\<rparr>" by default
   298   interpret P: finite_prob_space "XY.P\<lparr> measure := joint_distribution X Y\<rparr>"
   299     using assms by (auto intro!: joint_distribution_finite_prob_space)
   300   note rv = assms[THEN finite_random_variableD]
   301   show "XY.absolutely_continuous (joint_distribution X Y)"
   302   proof (rule XY.absolutely_continuousI)
   303     show "finite_measure_space (XY.P\<lparr> measure := joint_distribution X Y\<rparr>)" by default
   304     fix x assume "x \<in> space XY.P" and "XY.\<mu> {x} = 0"
   305     then obtain a b where "(a, b) = x" and "a \<in> space S" "b \<in> space T"
   306       and distr: "distribution X {a} * distribution Y {b} = 0"
   307       by (cases x) (auto simp: space_pair_measure)
   308     with X.sets_eq_Pow Y.sets_eq_Pow
   309       joint_distribution_Times_le_fst[OF rv, of "{a}" "{b}"]
   310       joint_distribution_Times_le_snd[OF rv, of "{a}" "{b}"]
   311     have "joint_distribution X Y {x} \<le> distribution Y {b}"
   312          "joint_distribution X Y {x} \<le> distribution X {a}"
   313       by (auto simp del: X.sets_eq_Pow Y.sets_eq_Pow)
   314     with distr show "joint_distribution X Y {x} = 0" by auto
   315   qed
   316 qed
   317 
   318 lemma (in information_space)
   319   assumes MX: "finite_random_variable MX X"
   320   assumes MY: "finite_random_variable MY Y"
   321   shows mutual_information_generic_eq:
   322     "mutual_information b MX MY X Y = (\<Sum> (x,y) \<in> space MX \<times> space MY.
   323       real (joint_distribution X Y {(x,y)}) *
   324       log b (real (joint_distribution X Y {(x,y)}) /
   325       (real (distribution X {x}) * real (distribution Y {y}))))"
   326     (is ?sum)
   327   and mutual_information_positive_generic:
   328      "0 \<le> mutual_information b MX MY X Y" (is ?positive)
   329 proof -
   330   interpret X: finite_prob_space "MX\<lparr>measure := distribution X\<rparr>"
   331     using MX by (rule distribution_finite_prob_space)
   332   interpret Y: finite_prob_space "MY\<lparr>measure := distribution Y\<rparr>"
   333     using MY by (rule distribution_finite_prob_space)
   334   interpret XY: pair_finite_prob_space "MX\<lparr>measure := distribution X\<rparr>" "MY\<lparr>measure := distribution Y\<rparr>" by default
   335   interpret P: finite_prob_space "XY.P\<lparr>measure := joint_distribution X Y\<rparr>"
   336     using assms by (auto intro!: joint_distribution_finite_prob_space)
   337 
   338   have P_ms: "finite_measure_space (XY.P\<lparr>measure :=joint_distribution X Y\<rparr>)" by default
   339   have P_ps: "finite_prob_space (XY.P\<lparr>measure := joint_distribution X Y\<rparr>)" by default
   340 
   341   show ?sum
   342     unfolding Let_def mutual_information_def
   343     by (subst XY.KL_divergence_eq_finite[OF P_ms finite_variables_absolutely_continuous[OF MX MY]])
   344        (auto simp add: space_pair_measure setsum_cartesian_product' real_of_pextreal_mult[symmetric])
   345 
   346   show ?positive
   347     using XY.KL_divergence_positive_finite[OF P_ps finite_variables_absolutely_continuous[OF MX MY] b_gt_1]
   348     unfolding mutual_information_def .
   349 qed
   350 
   351 lemma (in information_space) mutual_information_commute_generic:
   352   assumes X: "random_variable S X" and Y: "random_variable T Y"
   353   assumes ac: "measure_space.absolutely_continuous
   354     (S\<lparr>measure := distribution X\<rparr> \<Otimes>\<^isub>M T\<lparr>measure := distribution Y\<rparr>) (joint_distribution X Y)"
   355   shows "mutual_information b S T X Y = mutual_information b T S Y X"
   356 proof -
   357   let ?S = "S\<lparr>measure := distribution X\<rparr>" and ?T = "T\<lparr>measure := distribution Y\<rparr>"
   358   interpret S: prob_space ?S using X by (rule distribution_prob_space)
   359   interpret T: prob_space ?T using Y by (rule distribution_prob_space)
   360   interpret P: pair_prob_space ?S ?T ..
   361   interpret Q: pair_prob_space ?T ?S ..
   362   show ?thesis
   363     unfolding mutual_information_def
   364   proof (intro Q.KL_divergence_vimage[OF Q.measure_preserving_swap _ _ _ _ ac b_gt_1])
   365     show "(\<lambda>(x,y). (y,x)) \<in> measure_preserving
   366       (P.P \<lparr> measure := joint_distribution X Y\<rparr>) (Q.P \<lparr> measure := joint_distribution Y X\<rparr>)"
   367       using X Y unfolding measurable_def
   368       unfolding measure_preserving_def using P.pair_sigma_algebra_swap_measurable
   369       by (auto simp add: space_pair_measure distribution_def intro!: arg_cong[where f=\<mu>])
   370     have "prob_space (P.P\<lparr> measure := joint_distribution X Y\<rparr>)"
   371       using X Y by (auto intro!: distribution_prob_space random_variable_pairI)
   372     then show "measure_space (P.P\<lparr> measure := joint_distribution X Y\<rparr>)"
   373       unfolding prob_space_def by simp
   374   qed auto
   375 qed
   376 
   377 lemma (in information_space) mutual_information_commute:
   378   assumes X: "finite_random_variable S X" and Y: "finite_random_variable T Y"
   379   shows "mutual_information b S T X Y = mutual_information b T S Y X"
   380   unfolding mutual_information_generic_eq[OF X Y] mutual_information_generic_eq[OF Y X]
   381   unfolding joint_distribution_commute_singleton[of X Y]
   382   by (auto simp add: ac_simps intro!: setsum_reindex_cong[OF swap_inj_on])
   383 
   384 lemma (in information_space) mutual_information_commute_simple:
   385   assumes X: "simple_function M X" and Y: "simple_function M Y"
   386   shows "\<I>(X;Y) = \<I>(Y;X)"
   387   by (intro mutual_information_commute X Y simple_function_imp_finite_random_variable)
   388 
   389 lemma (in information_space) mutual_information_eq:
   390   assumes "simple_function M X" "simple_function M Y"
   391   shows "\<I>(X;Y) = (\<Sum> (x,y) \<in> X ` space M \<times> Y ` space M.
   392     real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) * log b (real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) /
   393                                                    (real (distribution X {x}) * real (distribution Y {y}))))"
   394   using assms by (simp add: mutual_information_generic_eq)
   395 
   396 lemma (in information_space) mutual_information_generic_cong:
   397   assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
   398   assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
   399   shows "mutual_information b MX MY X Y = mutual_information b MX MY X' Y'"
   400   unfolding mutual_information_def using X Y
   401   by (simp cong: distribution_cong)
   402 
   403 lemma (in information_space) mutual_information_cong:
   404   assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
   405   assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
   406   shows "\<I>(X; Y) = \<I>(X'; Y')"
   407   unfolding mutual_information_def using X Y
   408   by (simp cong: distribution_cong image_cong)
   409 
   410 lemma (in information_space) mutual_information_positive:
   411   assumes "simple_function M X" "simple_function M Y"
   412   shows "0 \<le> \<I>(X;Y)"
   413   using assms by (simp add: mutual_information_positive_generic)
   414 
   415 subsection {* Entropy *}
   416 
   417 abbreviation (in information_space)
   418   entropy_Pow ("\<H>'(_')") where
   419   "\<H>(X) \<equiv> entropy b \<lparr> space = X`space M, sets = Pow (X`space M), measure = distribution X \<rparr> X"
   420 
   421 lemma (in information_space) entropy_generic_eq:
   422   assumes MX: "finite_random_variable MX X"
   423   shows "entropy b MX X = -(\<Sum> x \<in> space MX. real (distribution X {x}) * log b (real (distribution X {x})))"
   424 proof -
   425   interpret MX: finite_prob_space "MX\<lparr>measure := distribution X\<rparr>"
   426     using MX by (rule distribution_finite_prob_space)
   427   let "?X x" = "real (distribution X {x})"
   428   let "?XX x y" = "real (joint_distribution X X {(x, y)})"
   429   { fix x y
   430     have "(\<lambda>x. (X x, X x)) -` {(x, y)} = (if x = y then X -` {x} else {})" by auto
   431     then have "?XX x y * log b (?XX x y / (?X x * ?X y)) =
   432         (if x = y then - ?X y * log b (?X y) else 0)"
   433       unfolding distribution_def by (auto simp: log_simps zero_less_mult_iff) }
   434   note remove_XX = this
   435   show ?thesis
   436     unfolding entropy_def mutual_information_generic_eq[OF MX MX]
   437     unfolding setsum_cartesian_product[symmetric] setsum_negf[symmetric] remove_XX
   438     using MX.finite_space by (auto simp: setsum_cases)
   439 qed
   440 
   441 lemma (in information_space) entropy_eq:
   442   assumes "simple_function M X"
   443   shows "\<H>(X) = -(\<Sum> x \<in> X ` space M. real (distribution X {x}) * log b (real (distribution X {x})))"
   444   using assms by (simp add: entropy_generic_eq)
   445 
   446 lemma (in information_space) entropy_positive:
   447   "simple_function M X \<Longrightarrow> 0 \<le> \<H>(X)"
   448   unfolding entropy_def by (simp add: mutual_information_positive)
   449 
   450 lemma (in information_space) entropy_certainty_eq_0:
   451   assumes "simple_function M X" and "x \<in> X ` space M" and "distribution X {x} = 1"
   452   shows "\<H>(X) = 0"
   453 proof -
   454   let ?X = "\<lparr> space = X ` space M, sets = Pow (X ` space M), measure = distribution X\<rparr>"
   455   note simple_function_imp_finite_random_variable[OF `simple_function M X`]
   456   from distribution_finite_prob_space[OF this, of "\<lparr> measure = distribution X \<rparr>"]
   457   interpret X: finite_prob_space ?X by simp
   458   have "distribution X (X ` space M - {x}) = distribution X (X ` space M) - distribution X {x}"
   459     using X.measure_compl[of "{x}"] assms by auto
   460   also have "\<dots> = 0" using X.prob_space assms by auto
   461   finally have X0: "distribution X (X ` space M - {x}) = 0" by auto
   462   { fix y assume asm: "y \<noteq> x" "y \<in> X ` space M"
   463     hence "{y} \<subseteq> X ` space M - {x}" by auto
   464     from X.measure_mono[OF this] X0 asm
   465     have "distribution X {y} = 0" by auto }
   466   hence fi: "\<And> y. y \<in> X ` space M \<Longrightarrow> real (distribution X {y}) = (if x = y then 1 else 0)"
   467     using assms by auto
   468   have y: "\<And>y. (if x = y then 1 else 0) * log b (if x = y then 1 else 0) = 0" by simp
   469   show ?thesis unfolding entropy_eq[OF `simple_function M X`] by (auto simp: y fi)
   470 qed
   471 
   472 lemma (in information_space) entropy_le_card_not_0:
   473   assumes "simple_function M X"
   474   shows "\<H>(X) \<le> log b (real (card (X ` space M \<inter> {x . distribution X {x} \<noteq> 0})))"
   475 proof -
   476   let "?d x" = "distribution X {x}"
   477   let "?p x" = "real (?d x)"
   478   have "\<H>(X) = (\<Sum>x\<in>X`space M. ?p x * log b (1 / ?p x))"
   479     by (auto intro!: setsum_cong simp: entropy_eq[OF `simple_function M X`] setsum_negf[symmetric] log_simps not_less)
   480   also have "\<dots> \<le> log b (\<Sum>x\<in>X`space M. ?p x * (1 / ?p x))"
   481     apply (rule log_setsum')
   482     using not_empty b_gt_1 `simple_function M X` sum_over_space_real_distribution
   483     by (auto simp: simple_function_def)
   484   also have "\<dots> = log b (\<Sum>x\<in>X`space M. if ?d x \<noteq> 0 then 1 else 0)"
   485     using distribution_finite[OF `simple_function M X`[THEN simple_function_imp_random_variable], simplified]
   486     by (intro arg_cong[where f="\<lambda>X. log b X"] setsum_cong) (auto simp: real_of_pextreal_eq_0)
   487   finally show ?thesis
   488     using `simple_function M X` by (auto simp: setsum_cases real_eq_of_nat simple_function_def)
   489 qed
   490 
   491 lemma (in information_space) entropy_uniform_max:
   492   assumes "simple_function M X"
   493   assumes "\<And>x y. \<lbrakk> x \<in> X ` space M ; y \<in> X ` space M \<rbrakk> \<Longrightarrow> distribution X {x} = distribution X {y}"
   494   shows "\<H>(X) = log b (real (card (X ` space M)))"
   495 proof -
   496   let ?X = "\<lparr> space = X ` space M, sets = Pow (X ` space M), measure = distribution X\<rparr>"
   497   note simple_function_imp_finite_random_variable[OF `simple_function M X`]
   498   from distribution_finite_prob_space[OF this, of "\<lparr> measure = distribution X \<rparr>"]
   499   interpret X: finite_prob_space ?X by simp
   500   have card_gt0: "0 < card (X ` space M)" unfolding card_gt_0_iff
   501     using `simple_function M X` not_empty by (auto simp: simple_function_def)
   502   { fix x assume "x \<in> X ` space M"
   503     hence "real (distribution X {x}) = 1 / real (card (X ` space M))"
   504     proof (rule X.uniform_prob[simplified])
   505       fix x y assume "x \<in> X`space M" "y \<in> X`space M"
   506       from assms(2)[OF this] show "real (distribution X {x}) = real (distribution X {y})" by simp
   507     qed }
   508   thus ?thesis
   509     using not_empty X.finite_space b_gt_1 card_gt0
   510     by (simp add: entropy_eq[OF `simple_function M X`] real_eq_of_nat[symmetric] log_simps)
   511 qed
   512 
   513 lemma (in information_space) entropy_le_card:
   514   assumes "simple_function M X"
   515   shows "\<H>(X) \<le> log b (real (card (X ` space M)))"
   516 proof cases
   517   assume "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} = {}"
   518   then have "\<And>x. x\<in>X`space M \<Longrightarrow> distribution X {x} = 0" by auto
   519   moreover
   520   have "0 < card (X`space M)"
   521     using `simple_function M X` not_empty
   522     by (auto simp: card_gt_0_iff simple_function_def)
   523   then have "log b 1 \<le> log b (real (card (X`space M)))"
   524     using b_gt_1 by (intro log_le) auto
   525   ultimately show ?thesis using assms by (simp add: entropy_eq)
   526 next
   527   assume False: "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} \<noteq> {}"
   528   have "card (X ` space M \<inter> {x. distribution X {x} \<noteq> 0}) \<le> card (X ` space M)"
   529     (is "?A \<le> ?B") using assms not_empty by (auto intro!: card_mono simp: simple_function_def)
   530   note entropy_le_card_not_0[OF assms]
   531   also have "log b (real ?A) \<le> log b (real ?B)"
   532     using b_gt_1 False not_empty `?A \<le> ?B` assms
   533     by (auto intro!: log_le simp: card_gt_0_iff simp: simple_function_def)
   534   finally show ?thesis .
   535 qed
   536 
   537 lemma (in information_space) entropy_commute:
   538   assumes "simple_function M X" "simple_function M Y"
   539   shows "\<H>(\<lambda>x. (X x, Y x)) = \<H>(\<lambda>x. (Y x, X x))"
   540 proof -
   541   have sf: "simple_function M (\<lambda>x. (X x, Y x))" "simple_function M (\<lambda>x. (Y x, X x))"
   542     using assms by (auto intro: simple_function_Pair)
   543   have *: "(\<lambda>x. (Y x, X x))`space M = (\<lambda>(a,b). (b,a))`(\<lambda>x. (X x, Y x))`space M"
   544     by auto
   545   have inj: "\<And>X. inj_on (\<lambda>(a,b). (b,a)) X"
   546     by (auto intro!: inj_onI)
   547   show ?thesis
   548     unfolding sf[THEN entropy_eq] unfolding * setsum_reindex[OF inj]
   549     by (simp add: joint_distribution_commute[of Y X] split_beta)
   550 qed
   551 
   552 lemma (in information_space) entropy_eq_cartesian_product:
   553   assumes "simple_function M X" "simple_function M Y"
   554   shows "\<H>(\<lambda>x. (X x, Y x)) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M.
   555     real (joint_distribution X Y {(x,y)}) *
   556     log b (real (joint_distribution X Y {(x,y)})))"
   557 proof -
   558   have sf: "simple_function M (\<lambda>x. (X x, Y x))"
   559     using assms by (auto intro: simple_function_Pair)
   560   { fix x assume "x\<notin>(\<lambda>x. (X x, Y x))`space M"
   561     then have "(\<lambda>x. (X x, Y x)) -` {x} \<inter> space M = {}" by auto
   562     then have "joint_distribution X Y {x} = 0"
   563       unfolding distribution_def by auto }
   564   then show ?thesis using sf assms
   565     unfolding entropy_eq[OF sf] neg_equal_iff_equal setsum_cartesian_product
   566     by (auto intro!: setsum_mono_zero_cong_left simp: simple_function_def)
   567 qed
   568 
   569 subsection {* Conditional Mutual Information *}
   570 
   571 definition (in prob_space)
   572   "conditional_mutual_information b MX MY MZ X Y Z \<equiv>
   573     mutual_information b MX (MY \<Otimes>\<^isub>M MZ) X (\<lambda>x. (Y x, Z x)) -
   574     mutual_information b MX MZ X Z"
   575 
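       text {* Conditional mutual information is defined by the chain rule
         $I(X; Y \mid Z) = I(X; (Y, Z)) - I(X; Z)$.  For finite random variables it equals
         $\sum_{x, y, z} p(x, y, z) \log_b \frac{p(x, y, z)}{p(x, z)\, p(y, z) / p(z)}$,
         cf.\ @{text conditional_mutual_information_generic_eq} below. *}
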
   576 abbreviation (in information_space)
   577   conditional_mutual_information_Pow ("\<I>'( _ ; _ | _ ')") where
   578   "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b
   579     \<lparr> space = X`space M, sets = Pow (X`space M), measure = distribution X \<rparr>
   580     \<lparr> space = Y`space M, sets = Pow (Y`space M), measure = distribution Y \<rparr>
   581     \<lparr> space = Z`space M, sets = Pow (Z`space M), measure = distribution Z \<rparr>
   582     X Y Z"
   583 
   584 lemma (in information_space) conditional_mutual_information_generic_eq:
   585   assumes MX: "finite_random_variable MX X"
   586     and MY: "finite_random_variable MY Y"
   587     and MZ: "finite_random_variable MZ Z"
   588   shows "conditional_mutual_information b MX MY MZ X Y Z = (\<Sum>(x, y, z) \<in> space MX \<times> space MY \<times> space MZ.
   589              real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
   590              log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) /
   591     (real (joint_distribution X Z {(x, z)}) * real (joint_distribution Y Z {(y,z)} / distribution Z {z}))))"
   592   (is "_ = (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?XZ x z * ?YZdZ y z)))")
   593 proof -
   594   let ?YZ = "\<lambda>y z. real (joint_distribution Y Z {(y, z)})"
   595   let ?X = "\<lambda>x. real (distribution X {x})"
   596   let ?Z = "\<lambda>z. real (distribution Z {z})"
   597 
   598     txt {* This proof is actually quite easy; however, we need to show that the
   599     distributions are finite and that the joint distributions are zero whenever
   600     one of the variables' distributions is zero. *}
   601 
   602   note finite_var = MX MY MZ
   603   note random_var = finite_var[THEN finite_random_variableD]
   604 
   605   note space_simps = space_pair_measure space_sigma algebra.simps
   606 
   607   note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
   608   note XZ = finite_random_variable_pairI[OF finite_var(1,3)]
   609   note ZX = finite_random_variable_pairI[OF finite_var(3,1)]
   610   note YZX = finite_random_variable_pairI[OF finite_var(2) ZX]
   611   note order1 =
   612     finite_distribution_order(5,6)[OF finite_var(1) YZ, simplified space_simps]
   613     finite_distribution_order(5,6)[OF finite_var(1,3), simplified space_simps]
   614 
   615   note finite = finite_var(1) YZ finite_var(3) XZ YZX
   616   note finite[THEN finite_distribution_finite, simplified space_simps, simp]
   617 
   618   have order2: "\<And>x y z. \<lbrakk>x \<in> space MX; y \<in> space MY; z \<in> space MZ; joint_distribution X Z {(x, z)} = 0\<rbrakk>
   619           \<Longrightarrow> joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)} = 0"
   620     unfolding joint_distribution_commute_singleton[of X]
   621     unfolding joint_distribution_assoc_singleton[symmetric]
   622     using finite_distribution_order(6)[OF finite_var(2) ZX]
   623     by (auto simp: space_simps)
   624 
   625   have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?XZ x z * ?YZdZ y z))) =
   626     (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * (log b (?XYZ x y z / (?X x * ?YZ y z)) - log b (?XZ x z / (?X x * ?Z z))))"
   627     (is "(\<Sum>(x, y, z)\<in>?S. ?L x y z) = (\<Sum>(x, y, z)\<in>?S. ?R x y z)")
   628   proof (safe intro!: setsum_cong)
   629     fix x y z assume space: "x \<in> space MX" "y \<in> space MY" "z \<in> space MZ"
   630     then have *: "?XYZ x y z / (?XZ x z * ?YZdZ y z) =
   631       (?XYZ x y z / (?X x * ?YZ y z)) / (?XZ x z / (?X x * ?Z z))"
   632       using order1(3)
   633       by (auto simp: real_of_pextreal_mult[symmetric] real_of_pextreal_eq_0)
   634     show "?L x y z = ?R x y z"
   635     proof cases
   636       assume "?XYZ x y z \<noteq> 0"
   637       with space b_gt_1 order1 order2 show ?thesis unfolding *
   638         by (subst log_divide)
   639            (auto simp: zero_less_divide_iff zero_less_real_of_pextreal
   640                        real_of_pextreal_eq_0 zero_less_mult_iff)
   641     qed simp
   642   qed
   643   also have "\<dots> = (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?X x * ?YZ y z))) -
   644                   (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XZ x z / (?X x * ?Z z)))"
   645     by (auto simp add: setsum_subtractf[symmetric] field_simps intro!: setsum_cong)
   646   also have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XZ x z / (?X x * ?Z z))) =
   647              (\<Sum>(x, z)\<in>space MX \<times> space MZ. ?XZ x z * log b (?XZ x z / (?X x * ?Z z)))"
   648     unfolding setsum_cartesian_product[symmetric] setsum_commute[of _ _ "space MY"]
   649               setsum_left_distrib[symmetric]
   650     unfolding joint_distribution_commute_singleton[of X]
   651     unfolding joint_distribution_assoc_singleton[symmetric]
   652     using setsum_real_joint_distribution_singleton[OF finite_var(2) ZX, unfolded space_simps]
   653     by (intro setsum_cong refl) simp
   654   also have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?X x * ?YZ y z))) -
   655              (\<Sum>(x, z)\<in>space MX \<times> space MZ. ?XZ x z * log b (?XZ x z / (?X x * ?Z z))) =
   656              conditional_mutual_information b MX MY MZ X Y Z"
   657     unfolding conditional_mutual_information_def
   658     unfolding mutual_information_generic_eq[OF finite_var(1,3)]
   659     unfolding mutual_information_generic_eq[OF finite_var(1) YZ]
   660     by (simp add: space_sigma space_pair_measure setsum_cartesian_product')
   661   finally show ?thesis by simp
   662 qed
   663 
   664 lemma (in information_space) conditional_mutual_information_eq:
   665   assumes "simple_function M X" "simple_function M Y" "simple_function M Z"
   666   shows "\<I>(X;Y|Z) = (\<Sum>(x, y, z) \<in> X`space M \<times> Y`space M \<times> Z`space M.
   667              real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
   668              log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) /
   669     (real (joint_distribution X Z {(x, z)}) * real (joint_distribution Y Z {(y,z)} / distribution Z {z}))))"
   670   using conditional_mutual_information_generic_eq[OF assms[THEN simple_function_imp_finite_random_variable]]
   671   by simp
   672 
   673 lemma (in information_space) conditional_mutual_information_eq_mutual_information:
   674   assumes X: "simple_function M X" and Y: "simple_function M Y"
   675   shows "\<I>(X ; Y) = \<I>(X ; Y | (\<lambda>x. ()))"
   676 proof -
   677   have [simp]: "(\<lambda>x. ()) ` space M = {()}" using not_empty by auto
   678   have C: "simple_function M (\<lambda>x. ())" by auto
   679   show ?thesis
   680     unfolding conditional_mutual_information_eq[OF X Y C]
   681     unfolding mutual_information_eq[OF X Y]
   682     by (simp add: setsum_cartesian_product' distribution_remove_const)
   683 qed
   684 
   685 lemma (in prob_space) distribution_unit[simp]: "distribution (\<lambda>x. ()) {()} = 1"
   686   unfolding distribution_def using measure_space_1 by auto
   687 
   688 lemma (in prob_space) joint_distribution_unit[simp]: "distribution (\<lambda>x. (X x, ())) {(a, ())} = distribution X {a}"
   689   unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>])
   690 
   691 lemma (in prob_space) setsum_distribution:
   692   assumes X: "finite_random_variable MX X" shows "(\<Sum>a\<in>space MX. distribution X {a}) = 1"
   693   using setsum_joint_distribution[OF assms, of "\<lparr> space = UNIV, sets = Pow UNIV \<rparr>" "\<lambda>x. ()" "{()}"]
   694   using sigma_algebra_Pow[of "UNIV::unit set" "()"] by simp
   695 
   696 lemma (in prob_space) setsum_real_distribution:
   697   fixes MX :: "('c, 'd) measure_space_scheme"
   698   assumes X: "finite_random_variable MX X" shows "(\<Sum>a\<in>space MX. real (distribution X {a})) = 1"
   699   using setsum_real_joint_distribution[OF assms, of "\<lparr> space = UNIV, sets = Pow UNIV, measure = undefined \<rparr>" "\<lambda>x. ()" "{()}"]
   700   using sigma_algebra_Pow[of "UNIV::unit set" "\<lparr> measure = undefined \<rparr>"] by simp
   701 
   702 lemma (in information_space) conditional_mutual_information_generic_positive:
   703   assumes "finite_random_variable MX X" and "finite_random_variable MY Y" and "finite_random_variable MZ Z"
   704   shows "0 \<le> conditional_mutual_information b MX MY MZ X Y Z"
   705 proof (cases "space MX \<times> space MY \<times> space MZ = {}")
   706   case True show ?thesis
   707     unfolding conditional_mutual_information_generic_eq[OF assms] True
   708     by simp
   709 next
   710   case False
   711   let "?dXYZ A" = "real (distribution (\<lambda>x. (X x, Y x, Z x)) A)"
   712   let "?dXZ A" = "real (joint_distribution X Z A)"
   713   let "?dYZ A" = "real (joint_distribution Y Z A)"
   714   let "?dX A" = "real (distribution X A)"
   715   let "?dZ A" = "real (distribution Z A)"
   716   let ?M = "space MX \<times> space MY \<times> space MZ"
   717 
   718   have split_beta: "\<And>f. split f = (\<lambda>x. f (fst x) (snd x))" by (simp add: fun_eq_iff)
   719 
   720   note space_simps = space_pair_measure space_sigma algebra.simps
   721 
   722   note finite_var = assms
   723   note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
   724   note XZ = finite_random_variable_pairI[OF finite_var(1,3)]
   725   note ZX = finite_random_variable_pairI[OF finite_var(3,1)]
   726   note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
   727   note XYZ = finite_random_variable_pairI[OF finite_var(1) YZ]
   728   note finite = finite_var(3) YZ XZ XYZ
   729   note finite = finite[THEN finite_distribution_finite, simplified space_simps]
   730 
   731   have order: "\<And>x y z. \<lbrakk>x \<in> space MX; y \<in> space MY; z \<in> space MZ; joint_distribution X Z {(x, z)} = 0\<rbrakk>
   732           \<Longrightarrow> joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)} = 0"
   733     unfolding joint_distribution_commute_singleton[of X]
   734     unfolding joint_distribution_assoc_singleton[symmetric]
   735     using finite_distribution_order(6)[OF finite_var(2) ZX]
   736     by (auto simp: space_simps)
   737 
   738   note order = order
   739     finite_distribution_order(5,6)[OF finite_var(1) YZ, simplified space_simps]
   740     finite_distribution_order(5,6)[OF finite_var(2,3), simplified space_simps]
   741 
   742   have "- conditional_mutual_information b MX MY MZ X Y Z = - (\<Sum>(x, y, z) \<in> ?M. ?dXYZ {(x, y, z)} *
   743     log b (?dXYZ {(x, y, z)} / (?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})))"
   744     unfolding conditional_mutual_information_generic_eq[OF assms] neg_equal_iff_equal
   745     by (intro setsum_cong) (auto intro!: arg_cong[where f="log b"] simp: real_of_pextreal_mult[symmetric])
   746   also have "\<dots> \<le> log b (\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})"
   747     unfolding split_beta
   748   proof (rule log_setsum_divide)
   749     show "?M \<noteq> {}" using False by simp
   750     show "1 < b" using b_gt_1 .
   751 
   752     show "finite ?M" using assms
   753       unfolding finite_sigma_algebra_def finite_sigma_algebra_axioms_def by auto
   754 
   755     show "(\<Sum>x\<in>?M. ?dXYZ {(fst x, fst (snd x), snd (snd x))}) = 1"
   756       unfolding setsum_cartesian_product'
   757       unfolding setsum_commute[of _ "space MY"]
   758       unfolding setsum_commute[of _ "space MZ"]
   759       by (simp_all add: space_pair_measure
   760         setsum_real_joint_distribution_singleton[OF `finite_random_variable MX X` YZ]
   761         setsum_real_joint_distribution_singleton[OF `finite_random_variable MY Y` finite_var(3)]
   762         setsum_real_distribution[OF `finite_random_variable MZ Z`])
   763 
   764     fix x assume "x \<in> ?M"
   765     let ?x = "(fst x, fst (snd x), snd (snd x))"
   766 
   767     show "0 \<le> ?dXYZ {?x}" using real_pextreal_nonneg .
   768     show "0 \<le> ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
   769      by (simp add: real_pextreal_nonneg mult_nonneg_nonneg divide_nonneg_nonneg)
   770 
   771     assume *: "0 < ?dXYZ {?x}"
   772     with `x \<in> ?M` show "0 < ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
   773       using finite order
   774       by (cases x)
   775          (auto simp add: zero_less_real_of_pextreal zero_less_mult_iff zero_less_divide_iff)
   776   qed
   777   also have "(\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z}) = (\<Sum>z\<in>space MZ. ?dZ {z})"
   778     apply (simp add: setsum_cartesian_product')
   779     apply (subst setsum_commute)
   780     apply (subst (2) setsum_commute)
   781     by (auto simp: setsum_divide_distrib[symmetric] setsum_product[symmetric]
   782                    setsum_real_joint_distribution_singleton[OF finite_var(1,3)]
   783                    setsum_real_joint_distribution_singleton[OF finite_var(2,3)]
   784           intro!: setsum_cong)
   785   also have "log b (\<Sum>z\<in>space MZ. ?dZ {z}) = 0"
   786     unfolding setsum_real_distribution[OF finite_var(3)] by simp
   787   finally show ?thesis by simp
   788 qed
   789 
   790 lemma (in information_space) conditional_mutual_information_positive:
   791   assumes "simple_function M X" and "simple_function M Y" and "simple_function M Z"
   792   shows "0 \<le> \<I>(X;Y|Z)"
   793   by (rule conditional_mutual_information_generic_positive[OF assms[THEN simple_function_imp_finite_random_variable]])
   794 
   795 subsection {* Conditional Entropy *}
   796 
   797 definition (in prob_space)
   798   "conditional_entropy b S T X Y = conditional_mutual_information b S S T X X Y"
   799 
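       text {* Conditional entropy is the conditional mutual information of a random
         variable with itself given the conditioning variable.  For finite random variables
         it reduces to $H(X \mid Z) = - \sum_{x, z} p(x, z) \log_b \frac{p(x, z)}{p(z)}$,
         cf.\ @{text conditional_entropy_generic_eq} below. *}
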
   800 abbreviation (in information_space)
   801   conditional_entropy_Pow ("\<H>'(_ | _')") where
   802   "\<H>(X | Y) \<equiv> conditional_entropy b
   803     \<lparr> space = X`space M, sets = Pow (X`space M), measure = distribution X \<rparr>
   804     \<lparr> space = Y`space M, sets = Pow (Y`space M), measure = distribution Y \<rparr> X Y"
   805 
   806 lemma (in information_space) conditional_entropy_positive:
   807   "simple_function M X \<Longrightarrow> simple_function M Y \<Longrightarrow> 0 \<le> \<H>(X | Y)"
   808   unfolding conditional_entropy_def by (auto intro!: conditional_mutual_information_positive)
   809 
   810 lemma (in measure_space) empty_measureI: "A = {} \<Longrightarrow> \<mu> A = 0" by simp
   811 
   812 lemma (in information_space) conditional_entropy_generic_eq:
   813   fixes MX :: "('c, 'd) measure_space_scheme" and MY :: "('e, 'f) measure_space_scheme"
   814   assumes MX: "finite_random_variable MX X"
   815   assumes MZ: "finite_random_variable MZ Z"
   816   shows "conditional_entropy b MX MZ X Z =
   817      - (\<Sum>(x, z)\<in>space MX \<times> space MZ.
   818          real (joint_distribution X Z {(x, z)}) *
   819          log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
   820 proof -
   821   interpret MX: finite_sigma_algebra MX using MX by simp
   822   interpret MZ: finite_sigma_algebra MZ using MZ by simp
   823   let "?XXZ x y z" = "joint_distribution X (\<lambda>x. (X x, Z x)) {(x, y, z)}"
   824   let "?XZ x z" = "joint_distribution X Z {(x, z)}"
   825   let "?Z z" = "distribution Z {z}"
   826   let "?f x y z" = "log b (real (?XXZ x y z) / (real (?XZ x z) * real (?XZ y z / ?Z z)))"
   827   { fix x z have "?XXZ x x z = ?XZ x z"
   828       unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>]) }
   829   note this[simp]
   830   { fix x x' :: 'c and z assume "x' \<noteq> x"
   831     then have "?XXZ x x' z = 0"
   832       by (auto simp: distribution_def intro!: arg_cong[where f=\<mu>] empty_measureI) }
   833   note this[simp]
   834   { fix x x' z assume *: "x \<in> space MX" "z \<in> space MZ"
   835     then have "(\<Sum>x'\<in>space MX. real (?XXZ x x' z) * ?f x x' z)
   836       = (\<Sum>x'\<in>space MX. if x = x' then real (?XZ x z) * ?f x x z else 0)"
   837       by (auto intro!: setsum_cong)
   838     also have "\<dots> = real (?XZ x z) * ?f x x z"
   839       using `x \<in> space MX` by (simp add: setsum_cases[OF MX.finite_space])
   840     also have "\<dots> = real (?XZ x z) * log b (real (?Z z) / real (?XZ x z))"
   841       by (auto simp: real_of_pextreal_mult[symmetric])
   842     also have "\<dots> = - real (?XZ x z) * log b (real (?XZ x z) / real (?Z z))"
   843       using assms[THEN finite_distribution_finite]
   844       using finite_distribution_order(6)[OF MX MZ]
   845       by (auto simp: log_simps field_simps zero_less_mult_iff zero_less_real_of_pextreal real_of_pextreal_eq_0)
   846     finally have "(\<Sum>x'\<in>space MX. real (?XXZ x x' z) * ?f x x' z) =
   847       - real (?XZ x z) * log b (real (?XZ x z) / real (?Z z))" . }
   848   note * = this
   849   show ?thesis
   850     unfolding conditional_entropy_def
   851     unfolding conditional_mutual_information_generic_eq[OF MX MX MZ]
   852     by (auto simp: setsum_cartesian_product' setsum_negf[symmetric]
   853                    setsum_commute[of _ "space MZ"] *   simp del: divide_pextreal_def
   854              intro!: setsum_cong)
   855 qed
   856 
   857 lemma (in information_space) conditional_entropy_eq:
   858   assumes "simple_function M X" "simple_function M Z"
   859   shows "\<H>(X | Z) =
   860      - (\<Sum>(x, z)\<in>X ` space M \<times> Z ` space M.
   861          real (joint_distribution X Z {(x, z)}) *
   862          log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
   863   using conditional_entropy_generic_eq[OF assms[THEN simple_function_imp_finite_random_variable]]
   864   by simp
   865 
   866 lemma (in information_space) conditional_entropy_eq_ce_with_hypothesis:
   867   assumes X: "simple_function M X" and Y: "simple_function M Y"
   868   shows "\<H>(X | Y) =
   869     -(\<Sum>y\<in>Y`space M. real (distribution Y {y}) *
   870       (\<Sum>x\<in>X`space M. real (joint_distribution X Y {(x,y)}) / real (distribution Y {(y)}) *
   871               log b (real (joint_distribution X Y {(x,y)}) / real (distribution Y {(y)}))))"
   872   unfolding conditional_entropy_eq[OF assms]
   873   using finite_distribution_finite[OF finite_random_variable_pairI[OF assms[THEN simple_function_imp_finite_random_variable]]]
   874   using finite_distribution_order(5,6)[OF assms[THEN simple_function_imp_finite_random_variable]]
   875   using finite_distribution_finite[OF Y[THEN simple_function_imp_finite_random_variable]]
   876   by (auto simp: setsum_cartesian_product'  setsum_commute[of _ "Y`space M"] setsum_right_distrib real_of_pextreal_eq_0
   877            intro!: setsum_cong)
   878 
   879 lemma (in information_space) conditional_entropy_eq_cartesian_product:
   880   assumes "simple_function M X" "simple_function M Y"
   881   shows "\<H>(X | Y) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M.
   882     real (joint_distribution X Y {(x,y)}) *
   883     log b (real (joint_distribution X Y {(x,y)}) / real (distribution Y {y})))"
   884   unfolding conditional_entropy_eq[OF assms]
   885   by (auto intro!: setsum_cong simp: setsum_cartesian_product')
   886 
   887 subsection {* Equalities *}
   888 
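       text {* The following lemmas relate the quantities defined above:
         $I(X; Z) = H(X) - H(X \mid Z)$, hence $H(X \mid Z) \le H(X)$, and the chain rule
         $H(X, Y) = H(X) + H(Y \mid X)$. *}
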
   889 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy:
   890   assumes X: "simple_function M X" and Z: "simple_function M Z"
   891   shows  "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)"
   892 proof -
   893   let "?XZ x z" = "real (joint_distribution X Z {(x, z)})"
   894   let "?Z z" = "real (distribution Z {z})"
   895   let "?X x" = "real (distribution X {x})"
   896   note fX = X[THEN simple_function_imp_finite_random_variable]
   897   note fZ = Z[THEN simple_function_imp_finite_random_variable]
   898   note fX[THEN finite_distribution_finite, simp] and fZ[THEN finite_distribution_finite, simp]
   899   note finite_distribution_order[OF fX fZ, simp]
   900   { fix x z assume "x \<in> X`space M" "z \<in> Z`space M"
   901     have "?XZ x z * log b (?XZ x z / (?X x * ?Z z)) =
   902           ?XZ x z * log b (?XZ x z / ?Z z) - ?XZ x z * log b (?X x)"
   903       by (auto simp: log_simps real_of_pextreal_mult[symmetric] zero_less_mult_iff
   904                      zero_less_real_of_pextreal field_simps real_of_pextreal_eq_0 abs_mult) }
   905   note * = this
   906   show ?thesis
   907     unfolding entropy_eq[OF X] conditional_entropy_eq[OF X Z] mutual_information_eq[OF X Z]
   908     using setsum_real_joint_distribution_singleton[OF fZ fX, unfolded joint_distribution_commute_singleton[of Z X]]
   909     by (simp add: * setsum_cartesian_product' setsum_subtractf setsum_left_distrib[symmetric]
   910                      setsum_real_distribution)
   911 qed
   912 
   913 lemma (in information_space) conditional_entropy_less_eq_entropy:
   914   assumes X: "simple_function M X" and Z: "simple_function M Z"
   915   shows "\<H>(X | Z) \<le> \<H>(X)"
   916 proof -
   917   have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy[OF assms] .
   918   with mutual_information_positive[OF X Z] entropy_positive[OF X]
   919   show ?thesis by auto
   920 qed
   921 
   922 lemma (in information_space) entropy_chain_rule:
   923   assumes X: "simple_function M X" and Y: "simple_function M Y"
   924   shows  "\<H>(\<lambda>x. (X x, Y x)) = \<H>(X) + \<H>(Y|X)"
   925 proof -
   926   let "?XY x y" = "real (joint_distribution X Y {(x, y)})"
   927   let "?Y y" = "real (distribution Y {y})"
   928   let "?X x" = "real (distribution X {x})"
   929   note fX = X[THEN simple_function_imp_finite_random_variable]
   930   note fY = Y[THEN simple_function_imp_finite_random_variable]
   931   note fX[THEN finite_distribution_finite, simp] and fY[THEN finite_distribution_finite, simp]
   932   note finite_distribution_order[OF fX fY, simp]
   933   { fix x y assume "x \<in> X`space M" "y \<in> Y`space M"
   934     have "?XY x y * log b (?XY x y / ?X x) =
   935           ?XY x y * log b (?XY x y) - ?XY x y * log b (?X x)"
   936       by (auto simp: log_simps real_of_pextreal_mult[symmetric] zero_less_mult_iff
   937                      zero_less_real_of_pextreal field_simps real_of_pextreal_eq_0 abs_mult) }
   938   note * = this
   939   show ?thesis
   940     using setsum_real_joint_distribution_singleton[OF fY fX]
   941     unfolding entropy_eq[OF X] conditional_entropy_eq_cartesian_product[OF Y X] entropy_eq_cartesian_product[OF X Y]
   942     unfolding joint_distribution_commute_singleton[of Y X] setsum_commute[of _ "X`space M"]
   943     by (simp add: * setsum_subtractf setsum_left_distrib[symmetric])
   944 qed
   945 
   946 section {* Partitioning *}
   947 
   948 definition "subvimage A f g \<longleftrightarrow> (\<forall>x \<in> A. f -` {f x} \<inter> A \<subseteq> g -` {g x} \<inter> A)"
   949 
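       text {* @{text "subvimage A f g"} states that on @{text A} every fibre of @{text f}
         is contained in a fibre of @{text g}; equivalently, @{text g} is constant on the
         fibres of @{text f}, so @{text g} factors through @{text f} on @{text A},
         cf.\ @{text subvimage_translator} below. *}
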
   950 lemma subvimageI:
   951   assumes "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
   952   shows "subvimage A f g"
   953   using assms unfolding subvimage_def by blast
   954 
   955 lemma subvimageE[consumes 1]:
   956   assumes "subvimage A f g"
   957   obtains "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
   958   using assms unfolding subvimage_def by blast
   959 
   960 lemma subvimageD:
   961   "\<lbrakk> subvimage A f g ; x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
   962   using assms unfolding subvimage_def by blast
   963 
   964 lemma subvimage_subset:
   965   "\<lbrakk> subvimage B f g ; A \<subseteq> B \<rbrakk> \<Longrightarrow> subvimage A f g"
   966   unfolding subvimage_def by auto
   967 
   968 lemma subvimage_idem[intro]: "subvimage A g g"
   969   by (safe intro!: subvimageI)
   970 
   971 lemma subvimage_comp_finer[intro]:
   972   assumes svi: "subvimage A g h"
   973   shows "subvimage A g (f \<circ> h)"
   974 proof (rule subvimageI, simp)
   975   fix x y assume "x \<in> A" "y \<in> A" "g x = g y"
   976   from svi[THEN subvimageD, OF this]
   977   show "f (h x) = f (h y)" by simp
   978 qed
   979 
   980 lemma subvimage_comp_gran:
   981   assumes svi: "subvimage A g h"
   982   assumes inj: "inj_on f (g ` A)"
   983   shows "subvimage A (f \<circ> g) h"
   984   by (rule subvimageI) (auto intro!: subvimageD[OF svi] simp: inj_on_iff[OF inj])
   985 
   986 lemma subvimage_comp:
   987   assumes svi: "subvimage (f ` A) g h"
   988   shows "subvimage A (g \<circ> f) (h \<circ> f)"
   989   by (rule subvimageI) (auto intro!: svi[THEN subvimageD])
   990 
   991 lemma subvimage_trans:
   992   assumes fg: "subvimage A f g"
   993   assumes gh: "subvimage A g h"
   994   shows "subvimage A f h"
   995   by (rule subvimageI) (auto intro!: fg[THEN subvimageD] gh[THEN subvimageD])
   996 
   997 lemma subvimage_translator:
   998   assumes svi: "subvimage A f g"
   999   shows "\<exists>h. \<forall>x \<in> A. h (f x)  = g x"
  1000 proof (safe intro!: exI[of _ "\<lambda>x. (THE z. z \<in> (g ` (f -` {x} \<inter> A)))"])
  1001   fix x assume "x \<in> A"
  1002   show "(THE x'. x' \<in> (g ` (f -` {f x} \<inter> A))) = g x"
  1003     by (rule theI2[of _ "g x"])
  1004       (insert `x \<in> A`, auto intro!: svi[THEN subvimageD])
  1005 qed
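
text {*
  The witness used above is the "translator" h y = (THE z. z \<in> g ` (f -` {y} \<inter> A)):
  by the subvimage property this image is the singleton {g x} whenever y = f x
  for some x \<in> A, so the definite description is well behaved there; its
  value outside f ` A is irrelevant.  As an informal instance (not formalised
  here), with A = UNIV, f = (\<lambda>n::nat. n mod 2) and g = even one gets
  h 0 = True and h 1 = False.
*}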
  1006 
  1007 lemma subvimage_translator_image:
  1008   assumes svi: "subvimage A f g"
  1009   shows "\<exists>h. h ` f ` A = g ` A"
  1010 proof -
  1011   from subvimage_translator[OF svi]
  1012   obtain h where "\<And>x. x \<in> A \<Longrightarrow> h (f x) = g x" by auto
  1013   thus ?thesis
  1014     by (auto intro!: exI[of _ h]
  1015       simp: image_compose[symmetric] comp_def cong: image_cong)
  1016 qed
  1017 
  1018 lemma subvimage_finite:
  1019   assumes svi: "subvimage A f g" and fin: "finite (f`A)"
  1020   shows "finite (g`A)"
  1021 proof -
  1022   from subvimage_translator_image[OF svi]
  1023   obtain h where "g`A = h`f`A" by fastsimp
  1024   with fin show "finite (g`A)" by simp
  1025 qed
  1026 
  1027 lemma subvimage_disj:
  1028   assumes svi: "subvimage A f g"
  1029   shows "f -` {x} \<inter> A \<subseteq> g -` {y} \<inter> A \<or>
  1030       f -` {x} \<inter> g -` {y} \<inter> A = {}" (is "?sub \<or> ?dist")
  1031 proof (rule disjCI)
  1032   assume "\<not> ?dist"
  1033   then obtain z where "z \<in> A" and "x = f z" and "y = g z" by auto
  1034   thus "?sub" using svi unfolding subvimage_def by auto
  1035 qed
  1036 
  1037 lemma setsum_image_split:
  1038   assumes svi: "subvimage A f g" and fin: "finite (f ` A)"
  1039   shows "(\<Sum>x\<in>f`A. h x) = (\<Sum>y\<in>g`A. \<Sum>x\<in>f`(g -` {y} \<inter> A). h x)"
  1040     (is "?lhs = ?rhs")
  1041 proof -
  1042   have "f ` A =
  1043       snd ` (SIGMA x : g ` A. f ` (g -` {x} \<inter> A))"
  1044       (is "_ = snd ` ?SIGMA")
  1045     unfolding image_split_eq_Sigma[symmetric]
  1046     by (simp add: image_compose[symmetric] comp_def)
  1047   moreover
  1048   have snd_inj: "inj_on snd ?SIGMA"
  1049     unfolding image_split_eq_Sigma[symmetric]
  1050     by (auto intro!: inj_onI subvimageD[OF svi])
  1051   ultimately
  1052   have "(\<Sum>x\<in>f`A. h x) = (\<Sum>(x,y)\<in>?SIGMA. h y)"
  1053     by (auto simp: setsum_reindex intro: setsum_cong)
  1054   also have "... = ?rhs"
  1055     using subvimage_finite[OF svi fin] fin
  1056     apply (subst setsum_Sigma[symmetric])
  1057     by (auto intro!: finite_subset[of _ "f`A"])
  1058   finally show ?thesis .
  1059 qed
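
text {*
  A purely arithmetic instance of setsum_image_split (informal, not formalised
  here): take A = {0,1,2,3,4,5}, f = id, g = (\<lambda>n. n mod 2) and h = id.
  Then subvimage A f g holds trivially, g ` A = {0, 1}, and the identity reads

    0 + 1 + 2 + 3 + 4 + 5 = (0 + 2 + 4) + (1 + 3 + 5) = 15,

  the inner sums running over the even and the odd elements of A respectively.
*}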
  1060 
  1061 lemma (in information_space) entropy_partition:
  1062   assumes sf: "simple_function M X" "simple_function M P"
  1063   assumes svi: "subvimage (space M) X P"
  1064   shows "\<H>(X) = \<H>(P) + \<H>(X|P)"
  1065 proof -
  1066   let "?XP x p" = "real (joint_distribution X P {(x, p)})"
  1067   let "?X x" = "real (distribution X {x})"
  1068   let "?P p" = "real (distribution P {p})"
  1069   note fX = sf(1)[THEN simple_function_imp_finite_random_variable]
  1070   note fP = sf(2)[THEN simple_function_imp_finite_random_variable]
  1071   note fX[THEN finite_distribution_finite, simp] and fP[THEN finite_distribution_finite, simp]
  1072   note finite_distribution_order[OF fX fP, simp]
  1073   have "(\<Sum>x\<in>X ` space M. real (distribution X {x}) * log b (real (distribution X {x}))) =
  1074     (\<Sum>y\<in>P `space M. \<Sum>x\<in>X ` space M.
  1075     real (joint_distribution X P {(x, y)}) * log b (real (joint_distribution X P {(x, y)})))"
  1076   proof (subst setsum_image_split[OF svi],
  1077       safe intro!: setsum_mono_zero_cong_left imageI)
  1078     show "finite (X ` space M)" "finite (X ` space M)" "finite (P ` space M)"
  1079       using sf unfolding simple_function_def by auto
  1080   next
  1081     fix p x assume in_space: "p \<in> space M" "x \<in> space M"
  1082     assume "real (joint_distribution X P {(X x, P p)}) * log b (real (joint_distribution X P {(X x, P p)})) \<noteq> 0"
  1083     hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M \<noteq> {}" by (auto simp: distribution_def)
  1084     with svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
  1085     show "x \<in> P -` {P p}" by auto
  1086   next
  1087     fix p x assume in_space: "p \<in> space M" "x \<in> space M"
  1088     assume "P x = P p"
  1089     from this[symmetric] svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
  1090     have "X -` {X x} \<inter> space M \<subseteq> P -` {P p} \<inter> space M"
  1091       by auto
  1092     hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M = X -` {X x} \<inter> space M"
  1093       by auto
  1094     thus "real (distribution X {X x}) * log b (real (distribution X {X x})) =
  1095           real (joint_distribution X P {(X x, P p)}) *
  1096           log b (real (joint_distribution X P {(X x, P p)}))"
  1097       by (auto simp: distribution_def)
  1098   qed
  1099   moreover have "\<And>x y. real (joint_distribution X P {(x, y)}) *
  1100       log b (real (joint_distribution X P {(x, y)}) / real (distribution P {y})) =
  1101       real (joint_distribution X P {(x, y)}) * log b (real (joint_distribution X P {(x, y)})) -
  1102       real (joint_distribution X P {(x, y)}) * log b (real (distribution P {y}))"
  1103     by (auto simp add: log_simps zero_less_mult_iff field_simps)
  1104   ultimately show ?thesis
  1105     unfolding sf[THEN entropy_eq] conditional_entropy_eq[OF sf]
  1106     using setsum_real_joint_distribution_singleton[OF fX fP]
  1107     by (simp add: setsum_cartesian_product' setsum_subtractf setsum_real_distribution
  1108       setsum_left_distrib[symmetric] setsum_commute[where B="P`space M"])
  1109 qed
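
text {*
  Informally: subvimage (space M) X P says that P is a coarsening of X, so
  observing X also determines P and \<H>(X) splits as \<H>(P) + \<H>(X|P).  The
  proof combines two observations made above.  First, since X determines P,
  the joint distribution of (X, P) at a pair (X x, P p) with matching fibres
  agrees with the distribution of X at X x; summed up, this identifies \<H>(X)
  with the entropy of the pair (X, P).  Second, pointwise (with p(x,y) and
  p(y) informal shorthand for the joint distribution of X and P and the
  distribution of P),

    p(x,y) * log b (p(x,y) / p(y)) = p(x,y) * log b p(x,y) - p(x,y) * log b p(y)

  so that, after summation, the conditional entropy \<H>(X|P) is the entropy of
  the pair minus \<H>(P).  Combining both gives the stated decomposition.
*}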
  1110 
  1111 corollary (in information_space) entropy_data_processing:
  1112   assumes X: "simple_function M X" shows "\<H>(f \<circ> X) \<le> \<H>(X)"
  1113 proof -
  1114   note X
  1115   moreover have fX: "simple_function M (f \<circ> X)" using X by auto
  1116   moreover have "subvimage (space M) X (f \<circ> X)" by auto
  1117   ultimately have "\<H>(X) = \<H>(f\<circ>X) + \<H>(X|f\<circ>X)" by (rule entropy_partition)
  1118   then show "\<H>(f \<circ> X) \<le> \<H>(X)"
  1119     by (auto intro: conditional_entropy_positive[OF X fX])
  1120 qed
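
text {*
  Data processing in one line: f \<circ> X is a coarsening of X, so by
  entropy_partition  \<H>(X) = \<H>(f \<circ> X) + \<H>(X | f \<circ> X),  and dropping
  the non-negative conditional entropy gives  \<H>(f \<circ> X) \<le> \<H>(X).
*}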
  1121 
  1122 corollary (in information_space) entropy_of_inj:
  1123   assumes X: "simple_function M X" and inj: "inj_on f (X`space M)"
  1124   shows "\<H>(f \<circ> X) = \<H>(X)"
  1125 proof (rule antisym)
  1126   show "\<H>(f \<circ> X) \<le> \<H>(X)" using entropy_data_processing[OF X] .
  1127 next
  1128   have sf: "simple_function M (f \<circ> X)"
  1129     using X by auto
  1130   have "\<H>(X) = \<H>(the_inv_into (X`space M) f \<circ> (f \<circ> X))"
  1131     by (auto intro!: mutual_information_cong simp: entropy_def the_inv_into_f_f[OF inj])
  1132   also have "... \<le> \<H>(f \<circ> X)"
  1133     using entropy_data_processing[OF sf] .
  1134   finally show "\<H>(X) \<le> \<H>(f \<circ> X)" .
  1135 qed
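
text {*
  For the converse direction the data-processing inequality is applied a second
  time, to the left inverse the_inv_into (X`space M) f: composed with f \<circ> X
  it recovers X on space M, hence \<H>(X) \<le> \<H>(f \<circ> X), and antisymmetry
  yields equality.
*}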
  1136 
  1137 end