src/HOL/Probability/Information.thy
author hoelzl
Tue Jan 18 21:37:23 2011 +0100 (2011-01-18)
changeset 41654 32fe42892983
parent 41413 64cd30d6b0b8
child 41661 baf1964bc468
permissions -rw-r--r--
Gauge measure removed
     1 theory Information
     2 imports
     3   Probability_Space
     4   "~~/src/HOL/Library/Convex"
     5   Lebesgue_Measure
     6 begin
     7 
     8 lemma log_le: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log a x \<le> log a y"
     9   by (subst log_le_cancel_iff) auto
    10 
    11 lemma log_less: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x < y \<Longrightarrow> log a x < log a y"
    12   by (subst log_less_cancel_iff) auto
    13 
    14 lemma setsum_cartesian_product':
    15   "(\<Sum>x\<in>A \<times> B. f x) = (\<Sum>x\<in>A. setsum (\<lambda>y. f (x, y)) B)"
    16   unfolding setsum_cartesian_product by simp
    17 
    18 section "Convex theory"
    19 
    20 lemma log_setsum:
    21   assumes "finite s" "s \<noteq> {}"
    22   assumes "b > 1"
    23   assumes "(\<Sum> i \<in> s. a i) = 1"
    24   assumes "\<And> i. i \<in> s \<Longrightarrow> a i \<ge> 0"
    25   assumes "\<And> i. i \<in> s \<Longrightarrow> y i \<in> {0 <..}"
    26   shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
    27 proof -
    28   have "convex_on {0 <..} (\<lambda> x. - log b x)"
    29     by (rule minus_log_convex[OF `b > 1`])
    30   hence "- log b (\<Sum> i \<in> s. a i * y i) \<le> (\<Sum> i \<in> s. a i * - log b (y i))"
    31     using convex_on_setsum[of _ _ "\<lambda> x. - log b x"] assms pos_is_convex by fastsimp
    32   thus ?thesis by (auto simp add:setsum_negf le_imp_neg_le)
    33 qed
    34 
    35 lemma log_setsum':
    36   assumes "finite s" "s \<noteq> {}"
    37   assumes "b > 1"
    38   assumes "(\<Sum> i \<in> s. a i) = 1"
    39   assumes pos: "\<And> i. i \<in> s \<Longrightarrow> 0 \<le> a i"
    40           "\<And> i. \<lbrakk> i \<in> s ; 0 < a i \<rbrakk> \<Longrightarrow> 0 < y i"
    41   shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
    42 proof -
    43   have "\<And>y. (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) = (\<Sum> i \<in> s. a i * y i)"
    44     using assms by (auto intro!: setsum_mono_zero_cong_left)
    45   moreover have "log b (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) \<ge> (\<Sum> i \<in> s - {i. a i = 0}. a i * log b (y i))"
    46   proof (rule log_setsum)
    47     have "setsum a (s - {i. a i = 0}) = setsum a s"
    48       using assms(1) by (rule setsum_mono_zero_cong_left) auto
    49     thus sum_1: "setsum a (s - {i. a i = 0}) = 1"
    50       "finite (s - {i. a i = 0})" using assms by simp_all
    51 
    52     show "s - {i. a i = 0} \<noteq> {}"
    53     proof
    54       assume *: "s - {i. a i = 0} = {}"
    55       hence "setsum a (s - {i. a i = 0}) = 0" by (simp add: * setsum_empty)
    56       with sum_1 show False by simp
    57     qed
    58 
    59     fix i assume "i \<in> s - {i. a i = 0}"
    60     hence "i \<in> s" "a i \<noteq> 0" by simp_all
    61     thus "0 \<le> a i" "y i \<in> {0<..}" using pos[of i] by auto
    62   qed fact+
    63   ultimately show ?thesis by simp
    64 qed
    65 
    66 lemma log_setsum_divide:
    67   assumes "finite S" and "S \<noteq> {}" and "1 < b"
    68   assumes "(\<Sum>x\<in>S. g x) = 1"
    69   assumes pos: "\<And>x. x \<in> S \<Longrightarrow> g x \<ge> 0" "\<And>x. x \<in> S \<Longrightarrow> f x \<ge> 0"
    70   assumes g_pos: "\<And>x. \<lbrakk> x \<in> S ; 0 < g x \<rbrakk> \<Longrightarrow> 0 < f x"
    71   shows "- (\<Sum>x\<in>S. g x * log b (g x / f x)) \<le> log b (\<Sum>x\<in>S. f x)"
    72 proof -
    73   have log_mono: "\<And>x y. 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log b x \<le> log b y"
    74     using `1 < b` by (subst log_le_cancel_iff) auto
    75 
    76   have "- (\<Sum>x\<in>S. g x * log b (g x / f x)) = (\<Sum>x\<in>S. g x * log b (f x / g x))"
    77   proof (unfold setsum_negf[symmetric], rule setsum_cong)
    78     fix x assume x: "x \<in> S"
    79     show "- (g x * log b (g x / f x)) = g x * log b (f x / g x)"
    80     proof (cases "g x = 0")
    81       case False
    82       with pos[OF x] g_pos[OF x] have "0 < f x" "0 < g x" by simp_all
    83       thus ?thesis using `1 < b` by (simp add: log_divide field_simps)
    84     qed simp
    85   qed rule
    86   also have "... \<le> log b (\<Sum>x\<in>S. g x * (f x / g x))"
    87   proof (rule log_setsum')
    88     fix x assume x: "x \<in> S" "0 < g x"
    89     with g_pos[OF x] show "0 < f x / g x" by (safe intro!: divide_pos_pos)
    90   qed fact+
    91   also have "... = log b (\<Sum>x\<in>S - {x. g x = 0}. f x)" using `finite S`
    92     by (auto intro!: setsum_mono_zero_cong_right arg_cong[where f="log b"]
    93         split: split_if_asm)
    94   also have "... \<le> log b (\<Sum>x\<in>S. f x)"
    95   proof (rule log_mono)
    96     have "0 = (\<Sum>x\<in>S - {x. g x = 0}. 0)" by simp
    97     also have "... < (\<Sum>x\<in>S - {x. g x = 0}. f x)" (is "_ < ?sum")
    98     proof (rule setsum_strict_mono)
    99       show "finite (S - {x. g x = 0})" using `finite S` by simp
   100       show "S - {x. g x = 0} \<noteq> {}"
   101       proof
   102         assume "S - {x. g x = 0} = {}"
   103         hence "(\<Sum>x\<in>S. g x) = 0" by (subst setsum_0') auto
   104         with `(\<Sum>x\<in>S. g x) = 1` show False by simp
   105       qed
   106       fix x assume "x \<in> S - {x. g x = 0}"
   107       thus "0 < f x" using g_pos[of x] pos(1)[of x] by auto
   108     qed
   109     finally show "0 < ?sum" .
   110     show "(\<Sum>x\<in>S - {x. g x = 0}. f x) \<le> (\<Sum>x\<in>S. f x)"
   111       using `finite S` pos by (auto intro!: setsum_mono2)
   112   qed
   113   finally show ?thesis .
   114 qed
   115 
   116 lemma split_pairs:
   117   "((A, B) = X) \<longleftrightarrow> (fst X = A \<and> snd X = B)" and
   118   "(X = (A, B)) \<longleftrightarrow> (fst X = A \<and> snd X = B)" by auto
   119 
   120 section "Information theory"
   121 
   122 locale information_space = prob_space +
   123   fixes b :: real assumes b_gt_1: "1 < b"
   124 
   125 context information_space
   126 begin
   127 
   128 text {* Introduce some simplification rules for logarithm of base @{term b}. *}
   129 
   130 lemma log_neg_const:
   131   assumes "x \<le> 0"
   132   shows "log b x = log b 0"
   133 proof -
   134   { fix u :: real
   135     have "x \<le> 0" by fact
   136     also have "0 < exp u"
   137       using exp_gt_zero .
   138     finally have "exp u \<noteq> x"
   139       by auto }
   140   then show "log b x = log b 0"
   141     by (simp add: log_def ln_def)
   142 qed
   143 
   144 lemma log_mult_eq:
   145   "log b (A * B) = (if 0 < A * B then log b \<bar>A\<bar> + log b \<bar>B\<bar> else log b 0)"
   146   using log_mult[of b "\<bar>A\<bar>" "\<bar>B\<bar>"] b_gt_1 log_neg_const[of "A * B"]
   147   by (auto simp: zero_less_mult_iff mult_le_0_iff)
   148 
   149 lemma log_inverse_eq:
   150   "log b (inverse B) = (if 0 < B then - log b B else log b 0)"
   151   using log_inverse[of b B] log_neg_const[of "inverse B"] b_gt_1 by simp
   152 
   153 lemma log_divide_eq:
   154   "log b (A / B) = (if 0 < A * B then log b \<bar>A\<bar> - log b \<bar>B\<bar> else log b 0)"
   155   unfolding divide_inverse log_mult_eq log_inverse_eq abs_inverse
   156   by (auto simp: zero_less_mult_iff mult_le_0_iff)
   157 
   158 lemmas log_simps = log_mult_eq log_inverse_eq log_divide_eq
   159 
   160 end
   161 
   162 subsection "Kullback$-$Leibler divergence"
   163 
   164 text {* The Kullback$-$Leibler divergence is also known as relative entropy or
   165 Kullback$-$Leibler distance. *}
   166 
   167 definition
   168   "KL_divergence b M \<mu> \<nu> =
   169     measure_space.integral M \<mu> (\<lambda>x. log b (real (sigma_finite_measure.RN_deriv M \<nu> \<mu> x)))"
   170 
   171 lemma (in sigma_finite_measure) KL_divergence_cong:
   172   assumes "measure_space M \<nu>"
   173   and cong: "\<And>A. A \<in> sets M \<Longrightarrow> \<mu>' A = \<mu> A" "\<And>A. A \<in> sets M \<Longrightarrow> \<nu>' A = \<nu> A"
   174   shows "KL_divergence b M \<nu>' \<mu>' = KL_divergence b M \<nu> \<mu>"
   175 proof -
   176   interpret \<nu>: measure_space M \<nu> by fact
   177   show ?thesis
   178     unfolding KL_divergence_def
   179     using RN_deriv_cong[OF cong, of "\<lambda>A. A"]
   180     by (simp add: cong \<nu>.integral_cong_measure[OF cong(2)])
   181 qed
   182 
   183 lemma (in sigma_finite_measure) KL_divergence_vimage:
   184   assumes f: "bij_betw f S (space M)"
   185   assumes \<nu>: "measure_space M \<nu>" "absolutely_continuous \<nu>"
   186   shows "KL_divergence b (vimage_algebra S f) (\<lambda>A. \<nu> (f ` A)) (\<lambda>A. \<mu> (f ` A)) = KL_divergence b M \<nu> \<mu>"
   187     (is "KL_divergence b ?M ?\<nu> ?\<mu> = _")
   188 proof -
   189   interpret \<nu>: measure_space M \<nu> by fact
   190   interpret v: measure_space ?M ?\<nu>
   191     using f by (rule \<nu>.measure_space_isomorphic)
   192 
   193   let ?RN = "sigma_finite_measure.RN_deriv ?M ?\<mu> ?\<nu>"
   194   from RN_deriv_vimage[OF f[THEN bij_inv_the_inv_into] \<nu>]
   195   have *: "\<nu>.almost_everywhere (\<lambda>x. ?RN (the_inv_into S f x) = RN_deriv \<nu> x)"
   196     by (rule absolutely_continuous_AE[OF \<nu>])
   197 
   198   show ?thesis
   199     unfolding KL_divergence_def \<nu>.integral_vimage_inv[OF f]
   200     apply (rule \<nu>.integral_cong_AE)
   201     apply (rule \<nu>.AE_mp[OF *])
   202     apply (rule \<nu>.AE_cong)
   203     apply simp
   204     done
   205 qed
   206 
   207 lemma (in finite_measure_space) KL_divergence_eq_finite:
   208   assumes v: "finite_measure_space M \<nu>"
   209   assumes ac: "absolutely_continuous \<nu>"
   210   shows "KL_divergence b M \<nu> \<mu> = (\<Sum>x\<in>space M. real (\<nu> {x}) * log b (real (\<nu> {x}) / real (\<mu> {x})))" (is "_ = ?sum")
   211 proof (simp add: KL_divergence_def finite_measure_space.integral_finite_singleton[OF v])
   212   interpret v: finite_measure_space M \<nu> by fact
   213   have ms: "measure_space M \<nu>" by fact
   214   show "(\<Sum>x \<in> space M. log b (real (RN_deriv \<nu> x)) * real (\<nu> {x})) = ?sum"
   215     using RN_deriv_finite_measure[OF ms ac]
   216     by (auto intro!: setsum_cong simp: field_simps real_of_pextreal_mult[symmetric])
   217 qed
   218 
   219 lemma (in finite_prob_space) KL_divergence_positive_finite:
   220   assumes v: "finite_prob_space M \<nu>"
   221   assumes ac: "absolutely_continuous \<nu>"
   222   and "1 < b"
   223   shows "0 \<le> KL_divergence b M \<nu> \<mu>"
   224 proof -
   225   interpret v: finite_prob_space M \<nu> using v .
   226   have ms: "finite_measure_space M \<nu>" by default
   227 
   228   have "- (KL_divergence b M \<nu> \<mu>) \<le> log b (\<Sum>x\<in>space M. real (\<mu> {x}))"
   229   proof (subst KL_divergence_eq_finite[OF ms ac], safe intro!: log_setsum_divide not_empty)
   230     show "finite (space M)" using finite_space by simp
   231     show "1 < b" by fact
   232     show "(\<Sum>x\<in>space M. real (\<nu> {x})) = 1" using v.finite_sum_over_space_eq_1 by simp
   233 
   234     fix x assume "x \<in> space M"
   235     then have x: "{x} \<in> sets M" unfolding sets_eq_Pow by auto
   236     { assume "0 < real (\<nu> {x})"
   237       then have "\<nu> {x} \<noteq> 0" by auto
   238       then have "\<mu> {x} \<noteq> 0"
   239         using ac[unfolded absolutely_continuous_def, THEN bspec, of "{x}"] x by auto
   240       thus "0 < prob {x}" using finite_measure[of "{x}"] x by auto }
   241   qed auto
   242   thus "0 \<le> KL_divergence b M \<nu> \<mu>" using finite_sum_over_space_eq_1 by simp
   243 qed
   244 
   245 subsection {* Mutual Information *}
   246 
   247 definition (in prob_space)
   248   "mutual_information b S T X Y =
   249     KL_divergence b (sigma (pair_algebra S T))
   250       (joint_distribution X Y)
   251       (pair_sigma_finite.pair_measure S (distribution X) T (distribution Y))"
   252 
   253 definition (in prob_space)
   254   "entropy b s X = mutual_information b s s X X"
   255 
   256 abbreviation (in information_space)
   257   mutual_information_Pow ("\<I>'(_ ; _')") where
   258   "\<I>(X ; Y) \<equiv> mutual_information b
   259     \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
   260     \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"
   261 
   262 lemma (in information_space) mutual_information_commute_generic:
   263   assumes X: "random_variable S X" and Y: "random_variable T Y"
   264   assumes ac: "measure_space.absolutely_continuous (sigma (pair_algebra S T))
   265    (pair_sigma_finite.pair_measure S (distribution X) T (distribution Y)) (joint_distribution X Y)"
   266   shows "mutual_information b S T X Y = mutual_information b T S Y X"
   267 proof -
   268   interpret P: prob_space "sigma (pair_algebra S T)" "joint_distribution X Y"
   269     using random_variable_pairI[OF X Y] by (rule distribution_prob_space)
   270   interpret Q: prob_space "sigma (pair_algebra T S)" "joint_distribution Y X"
   271     using random_variable_pairI[OF Y X] by (rule distribution_prob_space)
   272   interpret X: prob_space S "distribution X" using X by (rule distribution_prob_space)
   273   interpret Y: prob_space T "distribution Y" using Y by (rule distribution_prob_space)
   274   interpret ST: pair_sigma_finite S "distribution X" T "distribution Y" by default
   275   interpret TS: pair_sigma_finite T "distribution Y" S "distribution X" by default
   276 
   277   have ST: "measure_space (sigma (pair_algebra S T)) (joint_distribution X Y)" by default
   278   have TS: "measure_space (sigma (pair_algebra T S)) (joint_distribution Y X)" by default
   279 
   280   have bij_ST: "bij_betw (\<lambda>(x, y). (y, x)) (space (sigma (pair_algebra S T))) (space (sigma (pair_algebra T S)))"
   281     by (auto intro!: inj_onI simp: space_pair_algebra bij_betw_def)
   282   have bij_TS: "bij_betw (\<lambda>(x, y). (y, x)) (space (sigma (pair_algebra T S))) (space (sigma (pair_algebra S T)))"
   283     by (auto intro!: inj_onI simp: space_pair_algebra bij_betw_def)
   284 
   285   { fix A
   286     have "joint_distribution X Y ((\<lambda>(x, y). (y, x)) ` A) = joint_distribution Y X A"
   287       unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>]) }
   288   note jd_commute = this
   289 
   290   { fix A assume A: "A \<in> sets (sigma (pair_algebra T S))"
   291     have *: "\<And>x y. indicator ((\<lambda>(x, y). (y, x)) ` A) (x, y) = (indicator A (y, x) :: pextreal)"
   292       unfolding indicator_def by auto
   293     have "ST.pair_measure ((\<lambda>(x, y). (y, x)) ` A) = TS.pair_measure A"
   294       unfolding ST.pair_measure_def TS.pair_measure_def
   295       using A by (auto simp add: TS.Fubini[symmetric] *) }
   296   note pair_measure_commute = this
   297 
   298   show ?thesis
   299     unfolding mutual_information_def
   300     unfolding ST.KL_divergence_vimage[OF bij_TS ST ac, symmetric]
   301     unfolding space_sigma space_pair_algebra jd_commute
   302     unfolding ST.pair_sigma_algebra_swap[symmetric]
   303     by (simp cong: TS.KL_divergence_cong[OF TS] add: pair_measure_commute)
   304 qed
   305 
   306 lemma (in prob_space) finite_variables_absolutely_continuous:
   307   assumes X: "finite_random_variable S X" and Y: "finite_random_variable T Y"
   308   shows "measure_space.absolutely_continuous (sigma (pair_algebra S T))
   309    (pair_sigma_finite.pair_measure S (distribution X) T (distribution Y)) (joint_distribution X Y)"
   310 proof -
   311   interpret X: finite_prob_space S "distribution X" using X by (rule distribution_finite_prob_space)
   312   interpret Y: finite_prob_space T "distribution Y" using Y by (rule distribution_finite_prob_space)
   313   interpret XY: pair_finite_prob_space S "distribution X" T "distribution Y" by default
   314   interpret P: finite_prob_space XY.P "joint_distribution X Y"
   315     using assms by (intro joint_distribution_finite_prob_space)
   316   show "XY.absolutely_continuous (joint_distribution X Y)"
   317   proof (rule XY.absolutely_continuousI)
   318     show "finite_measure_space XY.P (joint_distribution X Y)" by default
   319     fix x assume "x \<in> space XY.P" and "XY.pair_measure {x} = 0"
   320     then obtain a b where "(a, b) = x" and "a \<in> space S" "b \<in> space T"
   321       and distr: "distribution X {a} * distribution Y {b} = 0"
   322       by (cases x) (auto simp: pair_algebra_def)
   323     with assms[THEN finite_random_variableD]
   324       joint_distribution_Times_le_fst[of S X T Y "{a}" "{b}"]
   325       joint_distribution_Times_le_snd[of S X T Y "{a}" "{b}"]
   326     have "joint_distribution X Y {x} \<le> distribution Y {b}"
   327          "joint_distribution X Y {x} \<le> distribution X {a}"
   328       by auto
   329     with distr show "joint_distribution X Y {x} = 0" by auto
   330   qed
   331 qed
   332 
   333 lemma (in information_space) mutual_information_commute:
   334   assumes X: "finite_random_variable S X" and Y: "finite_random_variable T Y"
   335   shows "mutual_information b S T X Y = mutual_information b T S Y X"
   336   by (intro finite_random_variableD X Y mutual_information_commute_generic finite_variables_absolutely_continuous)
   337 
   338 lemma (in information_space) mutual_information_commute_simple:
   339   assumes X: "simple_function X" and Y: "simple_function Y"
   340   shows "\<I>(X;Y) = \<I>(Y;X)"
   341   by (intro X Y simple_function_imp_finite_random_variable mutual_information_commute)
   342 
   343 lemma (in information_space)
   344   assumes MX: "finite_random_variable MX X"
   345   assumes MY: "finite_random_variable MY Y"
   346   shows mutual_information_generic_eq:
   347     "mutual_information b MX MY X Y = (\<Sum> (x,y) \<in> space MX \<times> space MY.
   348       real (joint_distribution X Y {(x,y)}) *
   349       log b (real (joint_distribution X Y {(x,y)}) /
   350       (real (distribution X {x}) * real (distribution Y {y}))))"
   351     (is ?sum)
   352   and mutual_information_positive_generic:
   353      "0 \<le> mutual_information b MX MY X Y" (is ?positive)
   354 proof -
   355   interpret X: finite_prob_space MX "distribution X" using MX by (rule distribution_finite_prob_space)
   356   interpret Y: finite_prob_space MY "distribution Y" using MY by (rule distribution_finite_prob_space)
   357   interpret XY: pair_finite_prob_space MX "distribution X" MY "distribution Y" by default
   358   interpret P: finite_prob_space XY.P "joint_distribution X Y"
   359     using assms by (intro joint_distribution_finite_prob_space)
   360 
   361   have P_ms: "finite_measure_space XY.P (joint_distribution X Y)" by default
   362   have P_ps: "finite_prob_space XY.P (joint_distribution X Y)" by default
   363 
   364   show ?sum
   365     unfolding Let_def mutual_information_def
   366     by (subst XY.KL_divergence_eq_finite[OF P_ms finite_variables_absolutely_continuous[OF MX MY]])
   367        (auto simp add: pair_algebra_def setsum_cartesian_product' real_of_pextreal_mult[symmetric])
   368 
   369   show ?positive
   370     using XY.KL_divergence_positive_finite[OF P_ps finite_variables_absolutely_continuous[OF MX MY] b_gt_1]
   371     unfolding mutual_information_def .
   372 qed
   373 
   374 lemma (in information_space) mutual_information_eq:
   375   assumes "simple_function X" "simple_function Y"
   376   shows "\<I>(X;Y) = (\<Sum> (x,y) \<in> X ` space M \<times> Y ` space M.
   377     real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) * log b (real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) /
   378                                                    (real (distribution X {x}) * real (distribution Y {y}))))"
   379   using assms by (simp add: mutual_information_generic_eq)
   380 
   381 lemma (in information_space) mutual_information_generic_cong:
   382   assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
   383   assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
   384   shows "mutual_information b MX MY X Y = mutual_information b MX MY X' Y'"
   385   unfolding mutual_information_def using X Y
   386   by (simp cong: distribution_cong)
   387 
   388 lemma (in information_space) mutual_information_cong:
   389   assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
   390   assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
   391   shows "\<I>(X; Y) = \<I>(X'; Y')"
   392   unfolding mutual_information_def using X Y
   393   by (simp cong: distribution_cong image_cong)
   394 
   395 lemma (in information_space) mutual_information_positive:
   396   assumes "simple_function X" "simple_function Y"
   397   shows "0 \<le> \<I>(X;Y)"
   398   using assms by (simp add: mutual_information_positive_generic)
   399 
   400 subsection {* Entropy *}
   401 
   402 abbreviation (in information_space)
   403   entropy_Pow ("\<H>'(_')") where
   404   "\<H>(X) \<equiv> entropy b \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr> X"
   405 
   406 lemma (in information_space) entropy_generic_eq:
   407   assumes MX: "finite_random_variable MX X"
   408   shows "entropy b MX X = -(\<Sum> x \<in> space MX. real (distribution X {x}) * log b (real (distribution X {x})))"
   409 proof -
   410   interpret MX: finite_prob_space MX "distribution X" using MX by (rule distribution_finite_prob_space)
   411   let "?X x" = "real (distribution X {x})"
   412   let "?XX x y" = "real (joint_distribution X X {(x, y)})"
   413   { fix x y
   414     have "(\<lambda>x. (X x, X x)) -` {(x, y)} = (if x = y then X -` {x} else {})" by auto
   415     then have "?XX x y * log b (?XX x y / (?X x * ?X y)) =
   416         (if x = y then - ?X y * log b (?X y) else 0)"
   417       unfolding distribution_def by (auto simp: log_simps zero_less_mult_iff) }
   418   note remove_XX = this
   419   show ?thesis
   420     unfolding entropy_def mutual_information_generic_eq[OF MX MX]
   421     unfolding setsum_cartesian_product[symmetric] setsum_negf[symmetric] remove_XX
   422     by (auto simp: setsum_cases MX.finite_space)
   423 qed
   424 
   425 lemma (in information_space) entropy_eq:
   426   assumes "simple_function X"
   427   shows "\<H>(X) = -(\<Sum> x \<in> X ` space M. real (distribution X {x}) * log b (real (distribution X {x})))"
   428   using assms by (simp add: entropy_generic_eq)
   429 
   430 lemma (in information_space) entropy_positive:
   431   "simple_function X \<Longrightarrow> 0 \<le> \<H>(X)"
   432   unfolding entropy_def by (simp add: mutual_information_positive)
   433 
   434 lemma (in information_space) entropy_certainty_eq_0:
   435   assumes "simple_function X" and "x \<in> X ` space M" and "distribution X {x} = 1"
   436   shows "\<H>(X) = 0"
   437 proof -
   438   interpret X: finite_prob_space "\<lparr> space = X ` space M, sets = Pow (X ` space M) \<rparr>" "distribution X"
   439     using simple_function_imp_finite_random_variable[OF `simple_function X`]
   440     by (rule distribution_finite_prob_space)
   441   have "distribution X (X ` space M - {x}) = distribution X (X ` space M) - distribution X {x}"
   442     using X.measure_compl[of "{x}"] assms by auto
   443   also have "\<dots> = 0" using X.prob_space assms by auto
   444   finally have X0: "distribution X (X ` space M - {x}) = 0" by auto
   445   { fix y assume asm: "y \<noteq> x" "y \<in> X ` space M"
   446     hence "{y} \<subseteq> X ` space M - {x}" by auto
   447     from X.measure_mono[OF this] X0 asm
   448     have "distribution X {y} = 0" by auto }
   449   hence fi: "\<And> y. y \<in> X ` space M \<Longrightarrow> real (distribution X {y}) = (if x = y then 1 else 0)"
   450     using assms by auto
   451   have y: "\<And>y. (if x = y then 1 else 0) * log b (if x = y then 1 else 0) = 0" by simp
   452   show ?thesis unfolding entropy_eq[OF `simple_function X`] by (auto simp: y fi)
   453 qed
   454 
   455 lemma (in information_space) entropy_le_card_not_0:
   456   assumes "simple_function X"
   457   shows "\<H>(X) \<le> log b (real (card (X ` space M \<inter> {x . distribution X {x} \<noteq> 0})))"
   458 proof -
   459   let "?d x" = "distribution X {x}"
   460   let "?p x" = "real (?d x)"
   461   have "\<H>(X) = (\<Sum>x\<in>X`space M. ?p x * log b (1 / ?p x))"
   462     by (auto intro!: setsum_cong simp: entropy_eq[OF `simple_function X`] setsum_negf[symmetric] log_simps not_less)
   463   also have "\<dots> \<le> log b (\<Sum>x\<in>X`space M. ?p x * (1 / ?p x))"
   464     apply (rule log_setsum')
   465     using not_empty b_gt_1 `simple_function X` sum_over_space_real_distribution
   466     by (auto simp: simple_function_def)
   467   also have "\<dots> = log b (\<Sum>x\<in>X`space M. if ?d x \<noteq> 0 then 1 else 0)"
   468     using distribution_finite[OF `simple_function X`[THEN simple_function_imp_random_variable], simplified]
   469     by (intro arg_cong[where f="\<lambda>X. log b X"] setsum_cong) (auto simp: real_of_pextreal_eq_0)
   470   finally show ?thesis
   471     using `simple_function X` by (auto simp: setsum_cases real_eq_of_nat simple_function_def)
   472 qed
   473 
   474 lemma (in information_space) entropy_uniform_max:
   475   assumes "simple_function X"
   476   assumes "\<And>x y. \<lbrakk> x \<in> X ` space M ; y \<in> X ` space M \<rbrakk> \<Longrightarrow> distribution X {x} = distribution X {y}"
   477   shows "\<H>(X) = log b (real (card (X ` space M)))"
   478 proof -
   479   interpret X: finite_prob_space "\<lparr> space = X ` space M, sets = Pow (X ` space M) \<rparr>" "distribution X"
   480     using simple_function_imp_finite_random_variable[OF `simple_function X`]
   481     by (rule distribution_finite_prob_space)
   482   have card_gt0: "0 < card (X ` space M)" unfolding card_gt_0_iff
   483     using `simple_function X` not_empty by (auto simp: simple_function_def)
   484   { fix x assume "x \<in> X ` space M"
   485     hence "real (distribution X {x}) = 1 / real (card (X ` space M))"
   486     proof (rule X.uniform_prob[simplified])
   487       fix x y assume "x \<in> X`space M" "y \<in> X`space M"
   488       from assms(2)[OF this] show "real (distribution X {x}) = real (distribution X {y})" by simp
   489     qed }
   490   thus ?thesis
   491     using not_empty X.finite_space b_gt_1 card_gt0
   492     by (simp add: entropy_eq[OF `simple_function X`] real_eq_of_nat[symmetric] log_simps)
   493 qed
   494 
   495 lemma (in information_space) entropy_le_card:
   496   assumes "simple_function X"
   497   shows "\<H>(X) \<le> log b (real (card (X ` space M)))"
   498 proof cases
   499   assume "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} = {}"
   500   then have "\<And>x. x\<in>X`space M \<Longrightarrow> distribution X {x} = 0" by auto
   501   moreover
   502   have "0 < card (X`space M)"
   503     using `simple_function X` not_empty
   504     by (auto simp: card_gt_0_iff simple_function_def)
   505   then have "log b 1 \<le> log b (real (card (X`space M)))"
   506     using b_gt_1 by (intro log_le) auto
   507   ultimately show ?thesis using assms by (simp add: entropy_eq)
   508 next
   509   assume False: "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} \<noteq> {}"
   510   have "card (X ` space M \<inter> {x. distribution X {x} \<noteq> 0}) \<le> card (X ` space M)"
   511     (is "?A \<le> ?B") using assms not_empty by (auto intro!: card_mono simp: simple_function_def)
   512   note entropy_le_card_not_0[OF assms]
   513   also have "log b (real ?A) \<le> log b (real ?B)"
   514     using b_gt_1 False not_empty `?A \<le> ?B` assms
   515     by (auto intro!: log_le simp: card_gt_0_iff simp: simple_function_def)
   516   finally show ?thesis .
   517 qed
   518 
   519 lemma (in information_space) entropy_commute:
   520   assumes "simple_function X" "simple_function Y"
   521   shows "\<H>(\<lambda>x. (X x, Y x)) = \<H>(\<lambda>x. (Y x, X x))"
   522 proof -
   523   have sf: "simple_function (\<lambda>x. (X x, Y x))" "simple_function (\<lambda>x. (Y x, X x))"
   524     using assms by (auto intro: simple_function_Pair)
   525   have *: "(\<lambda>x. (Y x, X x))`space M = (\<lambda>(a,b). (b,a))`(\<lambda>x. (X x, Y x))`space M"
   526     by auto
   527   have inj: "\<And>X. inj_on (\<lambda>(a,b). (b,a)) X"
   528     by (auto intro!: inj_onI)
   529   show ?thesis
   530     unfolding sf[THEN entropy_eq] unfolding * setsum_reindex[OF inj]
   531     by (simp add: joint_distribution_commute[of Y X] split_beta)
   532 qed
   533 
   534 lemma (in information_space) entropy_eq_cartesian_product:
   535   assumes "simple_function X" "simple_function Y"
   536   shows "\<H>(\<lambda>x. (X x, Y x)) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M.
   537     real (joint_distribution X Y {(x,y)}) *
   538     log b (real (joint_distribution X Y {(x,y)})))"
   539 proof -
   540   have sf: "simple_function (\<lambda>x. (X x, Y x))"
   541     using assms by (auto intro: simple_function_Pair)
   542   { fix x assume "x\<notin>(\<lambda>x. (X x, Y x))`space M"
   543     then have "(\<lambda>x. (X x, Y x)) -` {x} \<inter> space M = {}" by auto
   544     then have "joint_distribution X Y {x} = 0"
   545       unfolding distribution_def by auto }
   546   then show ?thesis using sf assms
   547     unfolding entropy_eq[OF sf] neg_equal_iff_equal setsum_cartesian_product
   548     by (auto intro!: setsum_mono_zero_cong_left simp: simple_function_def)
   549 qed
   550 
   551 subsection {* Conditional Mutual Information *}
   552 
   553 definition (in prob_space)
   554   "conditional_mutual_information b M1 M2 M3 X Y Z \<equiv>
   555     mutual_information b M1 (sigma (pair_algebra M2 M3)) X (\<lambda>x. (Y x, Z x)) -
   556     mutual_information b M1 M3 X Z"
   557 
   558 abbreviation (in information_space)
   559   conditional_mutual_information_Pow ("\<I>'( _ ; _ | _ ')") where
   560   "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b
   561     \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
   562     \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr>
   563     \<lparr> space = Z`space M, sets = Pow (Z`space M) \<rparr>
   564     X Y Z"
   565 
   566 
   567 lemma (in information_space) conditional_mutual_information_generic_eq:
   568   assumes MX: "finite_random_variable MX X"
   569     and MY: "finite_random_variable MY Y"
   570     and MZ: "finite_random_variable MZ Z"
   571   shows "conditional_mutual_information b MX MY MZ X Y Z = (\<Sum>(x, y, z) \<in> space MX \<times> space MY \<times> space MZ.
   572              real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
   573              log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) /
   574     (real (joint_distribution X Z {(x, z)}) * real (joint_distribution Y Z {(y,z)} / distribution Z {z}))))"
   575   (is "_ = (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?XZ x z * ?YZdZ y z)))")
   576 proof -
   577   let ?YZ = "\<lambda>y z. real (joint_distribution Y Z {(y, z)})"
   578   let ?X = "\<lambda>x. real (distribution X {x})"
   579   let ?Z = "\<lambda>z. real (distribution Z {z})"
   580 
   581   txt {* This proof is actually quiet easy, however we need to show that the
   582     distributions are finite and the joint distributions are zero when one of
   583     the variables distribution is also zero. *}
   584 
   585   note finite_var = MX MY MZ
   586   note random_var = finite_var[THEN finite_random_variableD]
   587 
   588   note space_simps = space_pair_algebra space_sigma algebra.simps
   589 
   590   note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
   591   note XZ = finite_random_variable_pairI[OF finite_var(1,3)]
   592   note ZX = finite_random_variable_pairI[OF finite_var(3,1)]
   593   note YZX = finite_random_variable_pairI[OF finite_var(2) ZX]
   594   note order1 =
   595     finite_distribution_order(5,6)[OF finite_var(1) YZ, simplified space_simps]
   596     finite_distribution_order(5,6)[OF finite_var(1,3), simplified space_simps]
   597 
   598   note finite = finite_var(1) YZ finite_var(3) XZ YZX
   599   note finite[THEN finite_distribution_finite, simplified space_simps, simp]
   600 
   601   have order2: "\<And>x y z. \<lbrakk>x \<in> space MX; y \<in> space MY; z \<in> space MZ; joint_distribution X Z {(x, z)} = 0\<rbrakk>
   602           \<Longrightarrow> joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)} = 0"
   603     unfolding joint_distribution_commute_singleton[of X]
   604     unfolding joint_distribution_assoc_singleton[symmetric]
   605     using finite_distribution_order(6)[OF finite_var(2) ZX]
   606     by (auto simp: space_simps)
   607 
   608   have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?XZ x z * ?YZdZ y z))) =
   609     (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * (log b (?XYZ x y z / (?X x * ?YZ y z)) - log b (?XZ x z / (?X x * ?Z z))))"
   610     (is "(\<Sum>(x, y, z)\<in>?S. ?L x y z) = (\<Sum>(x, y, z)\<in>?S. ?R x y z)")
   611   proof (safe intro!: setsum_cong)
   612     fix x y z assume space: "x \<in> space MX" "y \<in> space MY" "z \<in> space MZ"
   613     then have *: "?XYZ x y z / (?XZ x z * ?YZdZ y z) =
   614       (?XYZ x y z / (?X x * ?YZ y z)) / (?XZ x z / (?X x * ?Z z))"
   615       using order1(3)
   616       by (auto simp: real_of_pextreal_mult[symmetric] real_of_pextreal_eq_0)
   617     show "?L x y z = ?R x y z"
   618     proof cases
   619       assume "?XYZ x y z \<noteq> 0"
   620       with space b_gt_1 order1 order2 show ?thesis unfolding *
   621         by (subst log_divide)
   622            (auto simp: zero_less_divide_iff zero_less_real_of_pextreal
   623                        real_of_pextreal_eq_0 zero_less_mult_iff)
   624     qed simp
   625   qed
   626   also have "\<dots> = (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?X x * ?YZ y z))) -
   627                   (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XZ x z / (?X x * ?Z z)))"
   628     by (auto simp add: setsum_subtractf[symmetric] field_simps intro!: setsum_cong)
   629   also have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XZ x z / (?X x * ?Z z))) =
   630              (\<Sum>(x, z)\<in>space MX \<times> space MZ. ?XZ x z * log b (?XZ x z / (?X x * ?Z z)))"
   631     unfolding setsum_cartesian_product[symmetric] setsum_commute[of _ _ "space MY"]
   632               setsum_left_distrib[symmetric]
   633     unfolding joint_distribution_commute_singleton[of X]
   634     unfolding joint_distribution_assoc_singleton[symmetric]
   635     using setsum_real_joint_distribution_singleton[OF finite_var(2) ZX, unfolded space_simps]
   636     by (intro setsum_cong refl) simp
   637   also have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?X x * ?YZ y z))) -
   638              (\<Sum>(x, z)\<in>space MX \<times> space MZ. ?XZ x z * log b (?XZ x z / (?X x * ?Z z))) =
   639              conditional_mutual_information b MX MY MZ X Y Z"
   640     unfolding conditional_mutual_information_def
   641     unfolding mutual_information_generic_eq[OF finite_var(1,3)]
   642     unfolding mutual_information_generic_eq[OF finite_var(1) YZ]
   643     by (simp add: space_sigma space_pair_algebra setsum_cartesian_product')
   644   finally show ?thesis by simp
   645 qed
   646 
   647 lemma (in information_space) conditional_mutual_information_eq:
   648   assumes "simple_function X" "simple_function Y" "simple_function Z"
   649   shows "\<I>(X;Y|Z) = (\<Sum>(x, y, z) \<in> X`space M \<times> Y`space M \<times> Z`space M.
   650              real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
   651              log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) /
   652     (real (joint_distribution X Z {(x, z)}) * real (joint_distribution Y Z {(y,z)} / distribution Z {z}))))"
   653   using conditional_mutual_information_generic_eq[OF assms[THEN simple_function_imp_finite_random_variable]]
   654   by simp
   655 
   656 lemma (in information_space) conditional_mutual_information_eq_mutual_information:
   657   assumes X: "simple_function X" and Y: "simple_function Y"
   658   shows "\<I>(X ; Y) = \<I>(X ; Y | (\<lambda>x. ()))"
   659 proof -
   660   have [simp]: "(\<lambda>x. ()) ` space M = {()}" using not_empty by auto
   661   have C: "simple_function (\<lambda>x. ())" by auto
   662   show ?thesis
   663     unfolding conditional_mutual_information_eq[OF X Y C]
   664     unfolding mutual_information_eq[OF X Y]
   665     by (simp add: setsum_cartesian_product' distribution_remove_const)
   666 qed
   667 
   668 lemma (in prob_space) distribution_unit[simp]: "distribution (\<lambda>x. ()) {()} = 1"
   669   unfolding distribution_def using measure_space_1 by auto
   670 
   671 lemma (in prob_space) joint_distribution_unit[simp]: "distribution (\<lambda>x. (X x, ())) {(a, ())} = distribution X {a}"
   672   unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>])
   673 
   674 lemma (in prob_space) setsum_distribution:
   675   assumes X: "finite_random_variable MX X" shows "(\<Sum>a\<in>space MX. distribution X {a}) = 1"
   676   using setsum_joint_distribution[OF assms, of "\<lparr> space = UNIV, sets = Pow UNIV \<rparr>" "\<lambda>x. ()" "{()}"]
   677   using sigma_algebra_Pow[of "UNIV::unit set"] by simp
   678 
   679 lemma (in prob_space) setsum_real_distribution:
   680   assumes X: "finite_random_variable MX X" shows "(\<Sum>a\<in>space MX. real (distribution X {a})) = 1"
   681   using setsum_real_joint_distribution[OF assms, of "\<lparr> space = UNIV, sets = Pow UNIV \<rparr>" "\<lambda>x. ()" "{()}"]
   682   using sigma_algebra_Pow[of "UNIV::unit set"] by simp
   683 
   684 lemma (in information_space) conditional_mutual_information_generic_positive:
   685   assumes "finite_random_variable MX X" and "finite_random_variable MY Y" and "finite_random_variable MZ Z"
   686   shows "0 \<le> conditional_mutual_information b MX MY MZ X Y Z"
   687 proof (cases "space MX \<times> space MY \<times> space MZ = {}")
   688   case True show ?thesis
   689     unfolding conditional_mutual_information_generic_eq[OF assms] True
   690     by simp
   691 next
   692   case False
   693   let "?dXYZ A" = "real (distribution (\<lambda>x. (X x, Y x, Z x)) A)"
   694   let "?dXZ A" = "real (joint_distribution X Z A)"
   695   let "?dYZ A" = "real (joint_distribution Y Z A)"
   696   let "?dX A" = "real (distribution X A)"
   697   let "?dZ A" = "real (distribution Z A)"
   698   let ?M = "space MX \<times> space MY \<times> space MZ"
   699 
   700   have split_beta: "\<And>f. split f = (\<lambda>x. f (fst x) (snd x))" by (simp add: fun_eq_iff)
   701 
   702   note space_simps = space_pair_algebra space_sigma algebra.simps
   703 
   704   note finite_var = assms
   705   note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
   706   note XZ = finite_random_variable_pairI[OF finite_var(1,3)]
   707   note ZX = finite_random_variable_pairI[OF finite_var(3,1)]
   708   note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
   709   note XYZ = finite_random_variable_pairI[OF finite_var(1) YZ]
   710   note finite = finite_var(3) YZ XZ XYZ
   711   note finite = finite[THEN finite_distribution_finite, simplified space_simps]
   712 
   713   have order: "\<And>x y z. \<lbrakk>x \<in> space MX; y \<in> space MY; z \<in> space MZ; joint_distribution X Z {(x, z)} = 0\<rbrakk>
   714           \<Longrightarrow> joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)} = 0"
   715     unfolding joint_distribution_commute_singleton[of X]
   716     unfolding joint_distribution_assoc_singleton[symmetric]
   717     using finite_distribution_order(6)[OF finite_var(2) ZX]
   718     by (auto simp: space_simps)
   719 
   720   note order = order
   721     finite_distribution_order(5,6)[OF finite_var(1) YZ, simplified space_simps]
   722     finite_distribution_order(5,6)[OF finite_var(2,3), simplified space_simps]
   723 
   724   have "- conditional_mutual_information b MX MY MZ X Y Z = - (\<Sum>(x, y, z) \<in> ?M. ?dXYZ {(x, y, z)} *
   725     log b (?dXYZ {(x, y, z)} / (?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})))"
   726     unfolding conditional_mutual_information_generic_eq[OF assms] neg_equal_iff_equal
   727     by (intro setsum_cong) (auto intro!: arg_cong[where f="log b"] simp: real_of_pextreal_mult[symmetric])
   728   also have "\<dots> \<le> log b (\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})"
   729     unfolding split_beta
   730   proof (rule log_setsum_divide)
   731     show "?M \<noteq> {}" using False by simp
   732     show "1 < b" using b_gt_1 .
   733 
   734     show "finite ?M" using assms
   735       unfolding finite_sigma_algebra_def finite_sigma_algebra_axioms_def by auto
   736 
   737     show "(\<Sum>x\<in>?M. ?dXYZ {(fst x, fst (snd x), snd (snd x))}) = 1"
   738       unfolding setsum_cartesian_product'
   739       unfolding setsum_commute[of _ "space MY"]
   740       unfolding setsum_commute[of _ "space MZ"]
   741       by (simp_all add: space_pair_algebra
   742         setsum_real_joint_distribution_singleton[OF `finite_random_variable MX X` YZ]
   743         setsum_real_joint_distribution_singleton[OF `finite_random_variable MY Y` finite_var(3)]
   744         setsum_real_distribution[OF `finite_random_variable MZ Z`])
   745 
   746     fix x assume "x \<in> ?M"
   747     let ?x = "(fst x, fst (snd x), snd (snd x))"
   748 
   749     show "0 \<le> ?dXYZ {?x}" using real_pextreal_nonneg .
   750     show "0 \<le> ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
   751      by (simp add: real_pextreal_nonneg mult_nonneg_nonneg divide_nonneg_nonneg)
   752 
   753     assume *: "0 < ?dXYZ {?x}"
   754     with `x \<in> ?M` show "0 < ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
   755       using finite order
   756       by (cases x)
   757          (auto simp add: zero_less_real_of_pextreal zero_less_mult_iff zero_less_divide_iff)
   758   qed
   759   also have "(\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z}) = (\<Sum>z\<in>space MZ. ?dZ {z})"
   760     apply (simp add: setsum_cartesian_product')
   761     apply (subst setsum_commute)
   762     apply (subst (2) setsum_commute)
   763     by (auto simp: setsum_divide_distrib[symmetric] setsum_product[symmetric]
   764                    setsum_real_joint_distribution_singleton[OF finite_var(1,3)]
   765                    setsum_real_joint_distribution_singleton[OF finite_var(2,3)]
   766           intro!: setsum_cong)
   767   also have "log b (\<Sum>z\<in>space MZ. ?dZ {z}) = 0"
   768     unfolding setsum_real_distribution[OF finite_var(3)] by simp
   769   finally show ?thesis by simp
   770 qed
   771 
   772 lemma (in information_space) conditional_mutual_information_positive:
   773   assumes "simple_function X" and "simple_function Y" and "simple_function Z"
   774   shows "0 \<le> \<I>(X;Y|Z)"
   775   using conditional_mutual_information_generic_positive[OF assms[THEN simple_function_imp_finite_random_variable]]
   776   by simp
   777 
   778 subsection {* Conditional Entropy *}
   779 
   780 definition (in prob_space)
   781   "conditional_entropy b S T X Y = conditional_mutual_information b S S T X X Y"
   782 
   783 abbreviation (in information_space)
   784   conditional_entropy_Pow ("\<H>'(_ | _')") where
   785   "\<H>(X | Y) \<equiv> conditional_entropy b
   786     \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
   787     \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"
   788 
   789 lemma (in information_space) conditional_entropy_positive:
   790   "simple_function X \<Longrightarrow> simple_function Y \<Longrightarrow> 0 \<le> \<H>(X | Y)"
   791   unfolding conditional_entropy_def by (auto intro!: conditional_mutual_information_positive)
   792 
   793 lemma (in measure_space) empty_measureI: "A = {} \<Longrightarrow> \<mu> A = 0" by simp
   794 
   795 lemma (in information_space) conditional_entropy_generic_eq:
   796   assumes MX: "finite_random_variable MX X"
   797   assumes MZ: "finite_random_variable MZ Z"
   798   shows "conditional_entropy b MX MZ X Z =
   799      - (\<Sum>(x, z)\<in>space MX \<times> space MZ.
   800          real (joint_distribution X Z {(x, z)}) *
   801          log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
   802 proof -
   803   interpret MX: finite_sigma_algebra MX using MX by simp
   804   interpret MZ: finite_sigma_algebra MZ using MZ by simp
   805   let "?XXZ x y z" = "joint_distribution X (\<lambda>x. (X x, Z x)) {(x, y, z)}"
   806   let "?XZ x z" = "joint_distribution X Z {(x, z)}"
   807   let "?Z z" = "distribution Z {z}"
   808   let "?f x y z" = "log b (real (?XXZ x y z) / (real (?XZ x z) * real (?XZ y z / ?Z z)))"
   809   { fix x z have "?XXZ x x z = ?XZ x z"
   810       unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>]) }
   811   note this[simp]
   812   { fix x x' :: 'b and z assume "x' \<noteq> x"
   813     then have "?XXZ x x' z = 0"
   814       by (auto simp: distribution_def intro!: arg_cong[where f=\<mu>] empty_measureI) }
   815   note this[simp]
   816   { fix x x' z assume *: "x \<in> space MX" "z \<in> space MZ"
   817     then have "(\<Sum>x'\<in>space MX. real (?XXZ x x' z) * ?f x x' z)
   818       = (\<Sum>x'\<in>space MX. if x = x' then real (?XZ x z) * ?f x x z else 0)"
   819       by (auto intro!: setsum_cong)
   820     also have "\<dots> = real (?XZ x z) * ?f x x z"
   821       using `x \<in> space MX` by (simp add: setsum_cases[OF MX.finite_space])
   822     also have "\<dots> = real (?XZ x z) * log b (real (?Z z) / real (?XZ x z))"
   823       by (auto simp: real_of_pextreal_mult[symmetric])
   824     also have "\<dots> = - real (?XZ x z) * log b (real (?XZ x z) / real (?Z z))"
   825       using assms[THEN finite_distribution_finite]
   826       using finite_distribution_order(6)[OF MX MZ]
   827       by (auto simp: log_simps field_simps zero_less_mult_iff zero_less_real_of_pextreal real_of_pextreal_eq_0)
   828     finally have "(\<Sum>x'\<in>space MX. real (?XXZ x x' z) * ?f x x' z) =
   829       - real (?XZ x z) * log b (real (?XZ x z) / real (?Z z))" . }
   830   note * = this
   831 
   832   show ?thesis
   833     unfolding conditional_entropy_def
   834     unfolding conditional_mutual_information_generic_eq[OF MX MX MZ]
   835     by (auto simp: setsum_cartesian_product' setsum_negf[symmetric]
   836                    setsum_commute[of _ "space MZ"] *   simp del: divide_pextreal_def
   837              intro!: setsum_cong)
   838 qed
   839 
   840 lemma (in information_space) conditional_entropy_eq:
   841   assumes "simple_function X" "simple_function Z"
   842   shows "\<H>(X | Z) =
   843      - (\<Sum>(x, z)\<in>X ` space M \<times> Z ` space M.
   844          real (joint_distribution X Z {(x, z)}) *
   845          log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
   846   using conditional_entropy_generic_eq[OF assms[THEN simple_function_imp_finite_random_variable]]
   847   by simp
   848 
   849 lemma (in information_space) conditional_entropy_eq_ce_with_hypothesis:
   850   assumes X: "simple_function X" and Y: "simple_function Y"
   851   shows "\<H>(X | Y) =
   852     -(\<Sum>y\<in>Y`space M. real (distribution Y {y}) *
   853       (\<Sum>x\<in>X`space M. real (joint_distribution X Y {(x,y)}) / real (distribution Y {(y)}) *
   854               log b (real (joint_distribution X Y {(x,y)}) / real (distribution Y {(y)}))))"
   855   unfolding conditional_entropy_eq[OF assms]
   856   using finite_distribution_finite[OF finite_random_variable_pairI[OF assms[THEN simple_function_imp_finite_random_variable]]]
   857   using finite_distribution_order(5,6)[OF assms[THEN simple_function_imp_finite_random_variable]]
   858   using finite_distribution_finite[OF Y[THEN simple_function_imp_finite_random_variable]]
   859   by (auto simp: setsum_cartesian_product'  setsum_commute[of _ "Y`space M"] setsum_right_distrib real_of_pextreal_eq_0
   860            intro!: setsum_cong)
   861 
   862 lemma (in information_space) conditional_entropy_eq_cartesian_product:
   863   assumes "simple_function X" "simple_function Y"
   864   shows "\<H>(X | Y) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M.
   865     real (joint_distribution X Y {(x,y)}) *
   866     log b (real (joint_distribution X Y {(x,y)}) / real (distribution Y {y})))"
   867   unfolding conditional_entropy_eq[OF assms]
   868   by (auto intro!: setsum_cong simp: setsum_cartesian_product')
   869 
   870 subsection {* Equalities *}
   871 
   872 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy:
   873   assumes X: "simple_function X" and Z: "simple_function Z"
   874   shows  "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)"
   875 proof -
   876   let "?XZ x z" = "real (joint_distribution X Z {(x, z)})"
   877   let "?Z z" = "real (distribution Z {z})"
   878   let "?X x" = "real (distribution X {x})"
   879   note fX = X[THEN simple_function_imp_finite_random_variable]
   880   note fZ = Z[THEN simple_function_imp_finite_random_variable]
   881   note fX[THEN finite_distribution_finite, simp] and fZ[THEN finite_distribution_finite, simp]
   882   note finite_distribution_order[OF fX fZ, simp]
   883   { fix x z assume "x \<in> X`space M" "z \<in> Z`space M"
   884     have "?XZ x z * log b (?XZ x z / (?X x * ?Z z)) =
   885           ?XZ x z * log b (?XZ x z / ?Z z) - ?XZ x z * log b (?X x)"
   886       by (auto simp: log_simps real_of_pextreal_mult[symmetric] zero_less_mult_iff
   887                      zero_less_real_of_pextreal field_simps real_of_pextreal_eq_0 abs_mult) }
   888   note * = this
   889   show ?thesis
   890     unfolding entropy_eq[OF X] conditional_entropy_eq[OF X Z] mutual_information_eq[OF X Z]
   891     using setsum_real_joint_distribution_singleton[OF fZ fX, unfolded joint_distribution_commute_singleton[of Z X]]
   892     by (simp add: * setsum_cartesian_product' setsum_subtractf setsum_left_distrib[symmetric]
   893                      setsum_real_distribution)
   894 qed
   895 
   896 lemma (in information_space) conditional_entropy_less_eq_entropy:
   897   assumes X: "simple_function X" and Z: "simple_function Z"
   898   shows "\<H>(X | Z) \<le> \<H>(X)"
   899 proof -
   900   have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy[OF assms] .
   901   with mutual_information_positive[OF X Z] entropy_positive[OF X]
   902   show ?thesis by auto
   903 qed
   904 
   905 lemma (in information_space) entropy_chain_rule:
   906   assumes X: "simple_function X" and Y: "simple_function Y"
   907   shows  "\<H>(\<lambda>x. (X x, Y x)) = \<H>(X) + \<H>(Y|X)"
   908 proof -
   909   let "?XY x y" = "real (joint_distribution X Y {(x, y)})"
   910   let "?Y y" = "real (distribution Y {y})"
   911   let "?X x" = "real (distribution X {x})"
   912   note fX = X[THEN simple_function_imp_finite_random_variable]
   913   note fY = Y[THEN simple_function_imp_finite_random_variable]
   914   note fX[THEN finite_distribution_finite, simp] and fY[THEN finite_distribution_finite, simp]
   915   note finite_distribution_order[OF fX fY, simp]
   916   { fix x y assume "x \<in> X`space M" "y \<in> Y`space M"
   917     have "?XY x y * log b (?XY x y / ?X x) =
   918           ?XY x y * log b (?XY x y) - ?XY x y * log b (?X x)"
   919       by (auto simp: log_simps real_of_pextreal_mult[symmetric] zero_less_mult_iff
   920                      zero_less_real_of_pextreal field_simps real_of_pextreal_eq_0 abs_mult) }
   921   note * = this
   922   show ?thesis
   923     using setsum_real_joint_distribution_singleton[OF fY fX]
   924     unfolding entropy_eq[OF X] conditional_entropy_eq_cartesian_product[OF Y X] entropy_eq_cartesian_product[OF X Y]
   925     unfolding joint_distribution_commute_singleton[of Y X] setsum_commute[of _ "X`space M"]
   926     by (simp add: * setsum_subtractf setsum_left_distrib[symmetric])
   927 qed
   928 
   929 section {* Partitioning *}
   930 
   931 definition "subvimage A f g \<longleftrightarrow> (\<forall>x \<in> A. f -` {f x} \<inter> A \<subseteq> g -` {g x} \<inter> A)"
   932 
   933 lemma subvimageI:
   934   assumes "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
   935   shows "subvimage A f g"
   936   using assms unfolding subvimage_def by blast
   937 
   938 lemma subvimageE[consumes 1]:
   939   assumes "subvimage A f g"
   940   obtains "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
   941   using assms unfolding subvimage_def by blast
   942 
   943 lemma subvimageD:
   944   "\<lbrakk> subvimage A f g ; x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
   945   using assms unfolding subvimage_def by blast
   946 
   947 lemma subvimage_subset:
   948   "\<lbrakk> subvimage B f g ; A \<subseteq> B \<rbrakk> \<Longrightarrow> subvimage A f g"
   949   unfolding subvimage_def by auto
   950 
   951 lemma subvimage_idem[intro]: "subvimage A g g"
   952   by (safe intro!: subvimageI)
   953 
   954 lemma subvimage_comp_finer[intro]:
   955   assumes svi: "subvimage A g h"
   956   shows "subvimage A g (f \<circ> h)"
   957 proof (rule subvimageI, simp)
   958   fix x y assume "x \<in> A" "y \<in> A" "g x = g y"
   959   from svi[THEN subvimageD, OF this]
   960   show "f (h x) = f (h y)" by simp
   961 qed
   962 
   963 lemma subvimage_comp_gran:
   964   assumes svi: "subvimage A g h"
   965   assumes inj: "inj_on f (g ` A)"
   966   shows "subvimage A (f \<circ> g) h"
   967   by (rule subvimageI) (auto intro!: subvimageD[OF svi] simp: inj_on_iff[OF inj])
   968 
   969 lemma subvimage_comp:
   970   assumes svi: "subvimage (f ` A) g h"
   971   shows "subvimage A (g \<circ> f) (h \<circ> f)"
   972   by (rule subvimageI) (auto intro!: svi[THEN subvimageD])
   973 
   974 lemma subvimage_trans:
   975   assumes fg: "subvimage A f g"
   976   assumes gh: "subvimage A g h"
   977   shows "subvimage A f h"
   978   by (rule subvimageI) (auto intro!: fg[THEN subvimageD] gh[THEN subvimageD])
   979 
   980 lemma subvimage_translator:
   981   assumes svi: "subvimage A f g"
   982   shows "\<exists>h. \<forall>x \<in> A. h (f x)  = g x"
   983 proof (safe intro!: exI[of _ "\<lambda>x. (THE z. z \<in> (g ` (f -` {x} \<inter> A)))"])
   984   fix x assume "x \<in> A"
   985   show "(THE x'. x' \<in> (g ` (f -` {f x} \<inter> A))) = g x"
   986     by (rule theI2[of _ "g x"])
   987       (insert `x \<in> A`, auto intro!: svi[THEN subvimageD])
   988 qed
   989 
   990 lemma subvimage_translator_image:
   991   assumes svi: "subvimage A f g"
   992   shows "\<exists>h. h ` f ` A = g ` A"
   993 proof -
   994   from subvimage_translator[OF svi]
   995   obtain h where "\<And>x. x \<in> A \<Longrightarrow> h (f x) = g x" by auto
   996   thus ?thesis
   997     by (auto intro!: exI[of _ h]
   998       simp: image_compose[symmetric] comp_def cong: image_cong)
   999 qed
  1000 
  1001 lemma subvimage_finite:
  1002   assumes svi: "subvimage A f g" and fin: "finite (f`A)"
  1003   shows "finite (g`A)"
  1004 proof -
  1005   from subvimage_translator_image[OF svi]
  1006   obtain h where "g`A = h`f`A" by fastsimp
  1007   with fin show "finite (g`A)" by simp
  1008 qed
  1009 
  1010 lemma subvimage_disj:
  1011   assumes svi: "subvimage A f g"
  1012   shows "f -` {x} \<inter> A \<subseteq> g -` {y} \<inter> A \<or>
  1013       f -` {x} \<inter> g -` {y} \<inter> A = {}" (is "?sub \<or> ?dist")
  1014 proof (rule disjCI)
  1015   assume "\<not> ?dist"
  1016   then obtain z where "z \<in> A" and "x = f z" and "y = g z" by auto
  1017   thus "?sub" using svi unfolding subvimage_def by auto
  1018 qed
  1019 
  1020 lemma setsum_image_split:
  1021   assumes svi: "subvimage A f g" and fin: "finite (f ` A)"
  1022   shows "(\<Sum>x\<in>f`A. h x) = (\<Sum>y\<in>g`A. \<Sum>x\<in>f`(g -` {y} \<inter> A). h x)"
  1023     (is "?lhs = ?rhs")
  1024 proof -
  1025   have "f ` A =
  1026       snd ` (SIGMA x : g ` A. f ` (g -` {x} \<inter> A))"
  1027       (is "_ = snd ` ?SIGMA")
  1028     unfolding image_split_eq_Sigma[symmetric]
  1029     by (simp add: image_compose[symmetric] comp_def)
  1030   moreover
  1031   have snd_inj: "inj_on snd ?SIGMA"
  1032     unfolding image_split_eq_Sigma[symmetric]
  1033     by (auto intro!: inj_onI subvimageD[OF svi])
  1034   ultimately
  1035   have "(\<Sum>x\<in>f`A. h x) = (\<Sum>(x,y)\<in>?SIGMA. h y)"
  1036     by (auto simp: setsum_reindex intro: setsum_cong)
  1037   also have "... = ?rhs"
  1038     using subvimage_finite[OF svi fin] fin
  1039     apply (subst setsum_Sigma[symmetric])
  1040     by (auto intro!: finite_subset[of _ "f`A"])
  1041   finally show ?thesis .
  1042 qed
  1043 
  1044 lemma (in information_space) entropy_partition:
  1045   assumes sf: "simple_function X" "simple_function P"
  1046   assumes svi: "subvimage (space M) X P"
  1047   shows "\<H>(X) = \<H>(P) + \<H>(X|P)"
  1048 proof -
  1049   let "?XP x p" = "real (joint_distribution X P {(x, p)})"
  1050   let "?X x" = "real (distribution X {x})"
  1051   let "?P p" = "real (distribution P {p})"
  1052   note fX = sf(1)[THEN simple_function_imp_finite_random_variable]
  1053   note fP = sf(2)[THEN simple_function_imp_finite_random_variable]
  1054   note fX[THEN finite_distribution_finite, simp] and fP[THEN finite_distribution_finite, simp]
  1055   note finite_distribution_order[OF fX fP, simp]
  1056   have "(\<Sum>x\<in>X ` space M. real (distribution X {x}) * log b (real (distribution X {x}))) =
  1057     (\<Sum>y\<in>P `space M. \<Sum>x\<in>X ` space M.
  1058     real (joint_distribution X P {(x, y)}) * log b (real (joint_distribution X P {(x, y)})))"
  1059   proof (subst setsum_image_split[OF svi],
  1060       safe intro!: setsum_mono_zero_cong_left imageI)
  1061     show "finite (X ` space M)" "finite (X ` space M)" "finite (P ` space M)"
  1062       using sf unfolding simple_function_def by auto
  1063   next
  1064     fix p x assume in_space: "p \<in> space M" "x \<in> space M"
  1065     assume "real (joint_distribution X P {(X x, P p)}) * log b (real (joint_distribution X P {(X x, P p)})) \<noteq> 0"
  1066     hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M \<noteq> {}" by (auto simp: distribution_def)
  1067     with svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
  1068     show "x \<in> P -` {P p}" by auto
  1069   next
  1070     fix p x assume in_space: "p \<in> space M" "x \<in> space M"
  1071     assume "P x = P p"
  1072     from this[symmetric] svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
  1073     have "X -` {X x} \<inter> space M \<subseteq> P -` {P p} \<inter> space M"
  1074       by auto
  1075     hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M = X -` {X x} \<inter> space M"
  1076       by auto
  1077     thus "real (distribution X {X x}) * log b (real (distribution X {X x})) =
  1078           real (joint_distribution X P {(X x, P p)}) *
  1079           log b (real (joint_distribution X P {(X x, P p)}))"
  1080       by (auto simp: distribution_def)
  1081   qed
  1082   moreover have "\<And>x y. real (joint_distribution X P {(x, y)}) *
  1083       log b (real (joint_distribution X P {(x, y)}) / real (distribution P {y})) =
  1084       real (joint_distribution X P {(x, y)}) * log b (real (joint_distribution X P {(x, y)})) -
  1085       real (joint_distribution X P {(x, y)}) * log b (real (distribution P {y}))"
  1086     by (auto simp add: log_simps zero_less_mult_iff field_simps)
  1087   ultimately show ?thesis
  1088     unfolding sf[THEN entropy_eq] conditional_entropy_eq[OF sf]
  1089     using setsum_real_joint_distribution_singleton[OF fX fP]
  1090     by (simp add: setsum_cartesian_product' setsum_subtractf setsum_real_distribution
  1091       setsum_left_distrib[symmetric] setsum_commute[where B="P`space M"])
  1092 qed
  1093 
  1094 corollary (in information_space) entropy_data_processing:
  1095   assumes X: "simple_function X" shows "\<H>(f \<circ> X) \<le> \<H>(X)"
  1096 proof -
  1097   note X
  1098   moreover have fX: "simple_function (f \<circ> X)" using X by auto
  1099   moreover have "subvimage (space M) X (f \<circ> X)" by auto
  1100   ultimately have "\<H>(X) = \<H>(f\<circ>X) + \<H>(X|f\<circ>X)" by (rule entropy_partition)
  1101   then show "\<H>(f \<circ> X) \<le> \<H>(X)"
  1102     by (auto intro: conditional_entropy_positive[OF X fX])
  1103 qed
  1104 
  1105 corollary (in information_space) entropy_of_inj:
  1106   assumes X: "simple_function X" and inj: "inj_on f (X`space M)"
  1107   shows "\<H>(f \<circ> X) = \<H>(X)"
  1108 proof (rule antisym)
  1109   show "\<H>(f \<circ> X) \<le> \<H>(X)" using entropy_data_processing[OF X] .
  1110 next
  1111   have sf: "simple_function (f \<circ> X)"
  1112     using X by auto
  1113   have "\<H>(X) = \<H>(the_inv_into (X`space M) f \<circ> (f \<circ> X))"
  1114     by (auto intro!: mutual_information_cong simp: entropy_def the_inv_into_f_f[OF inj])
  1115   also have "... \<le> \<H>(f \<circ> X)"
  1116     using entropy_data_processing[OF sf] .
  1117   finally show "\<H>(X) \<le> \<H>(f \<circ> X)" .
  1118 qed
  1119 
  1120 end