src/HOL/Probability/Information.thy
author hoelzl
Wed Feb 02 12:34:45 2011 +0100 (2011-02-02)
changeset 41689 3e39b0e730d6
parent 41661 baf1964bc468
child 41833 563bea92b2c0
permissions -rw-r--r--
the measure valuation is again part of the measure_space type, instead of an explicit parameter to the locale;
changed syntax for simple_function, simple_integral, positive_integral, integral and RN_deriv.
introduced binder variants for simple_integral, positive_integral and integral.
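(As a hedged illustration of the binder variants mentioned above: the integral binder syntax is the form used throughout this theory, for instance in the KL_divergence definition further down, which reads \<integral>x. log b (real (RN_deriv M \<nu> x)) \<partial>M\<lparr>measure := \<nu>\<rparr>.)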
     1 theory Information
     2 imports
     3   Probability_Space
     4   "~~/src/HOL/Library/Convex"
     5   Lebesgue_Measure
     6 begin
     7 
     8 lemma log_le: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log a x \<le> log a y"
     9   by (subst log_le_cancel_iff) auto
    10 
    11 lemma log_less: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x < y \<Longrightarrow> log a x < log a y"
    12   by (subst log_less_cancel_iff) auto
    13 
    14 lemma setsum_cartesian_product':
    15   "(\<Sum>x\<in>A \<times> B. f x) = (\<Sum>x\<in>A. setsum (\<lambda>y. f (x, y)) B)"
    16   unfolding setsum_cartesian_product by simp
    17 
    18 section "Convex theory"
    19 
    20 lemma log_setsum:
    21   assumes "finite s" "s \<noteq> {}"
    22   assumes "b > 1"
    23   assumes "(\<Sum> i \<in> s. a i) = 1"
    24   assumes "\<And> i. i \<in> s \<Longrightarrow> a i \<ge> 0"
    25   assumes "\<And> i. i \<in> s \<Longrightarrow> y i \<in> {0 <..}"
    26   shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
    27 proof -
    28   have "convex_on {0 <..} (\<lambda> x. - log b x)"
    29     by (rule minus_log_convex[OF `b > 1`])
    30   hence "- log b (\<Sum> i \<in> s. a i * y i) \<le> (\<Sum> i \<in> s. a i * - log b (y i))"
    31     using convex_on_setsum[of _ _ "\<lambda> x. - log b x"] assms pos_is_convex by fastsimp
    32   thus ?thesis by (auto simp add:setsum_negf le_imp_neg_le)
    33 qed
    34 
    35 lemma log_setsum':
    36   assumes "finite s" "s \<noteq> {}"
    37   assumes "b > 1"
    38   assumes "(\<Sum> i \<in> s. a i) = 1"
    39   assumes pos: "\<And> i. i \<in> s \<Longrightarrow> 0 \<le> a i"
    40           "\<And> i. \<lbrakk> i \<in> s ; 0 < a i \<rbrakk> \<Longrightarrow> 0 < y i"
    41   shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
    42 proof -
    43   have "\<And>y. (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) = (\<Sum> i \<in> s. a i * y i)"
    44     using assms by (auto intro!: setsum_mono_zero_cong_left)
    45   moreover have "log b (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) \<ge> (\<Sum> i \<in> s - {i. a i = 0}. a i * log b (y i))"
    46   proof (rule log_setsum)
    47     have "setsum a (s - {i. a i = 0}) = setsum a s"
    48       using assms(1) by (rule setsum_mono_zero_cong_left) auto
    49     thus sum_1: "setsum a (s - {i. a i = 0}) = 1"
    50       "finite (s - {i. a i = 0})" using assms by simp_all
    51 
    52     show "s - {i. a i = 0} \<noteq> {}"
    53     proof
    54       assume *: "s - {i. a i = 0} = {}"
    55       hence "setsum a (s - {i. a i = 0}) = 0" by (simp add: * setsum_empty)
    56       with sum_1 show False by simp
    57     qed
    58 
    59     fix i assume "i \<in> s - {i. a i = 0}"
    60     hence "i \<in> s" "a i \<noteq> 0" by simp_all
    61     thus "0 \<le> a i" "y i \<in> {0<..}" using pos[of i] by auto
    62   qed fact+
    63   ultimately show ?thesis by simp
    64 qed
    65 
    66 lemma log_setsum_divide:
    67   assumes "finite S" and "S \<noteq> {}" and "1 < b"
    68   assumes "(\<Sum>x\<in>S. g x) = 1"
    69   assumes pos: "\<And>x. x \<in> S \<Longrightarrow> g x \<ge> 0" "\<And>x. x \<in> S \<Longrightarrow> f x \<ge> 0"
    70   assumes g_pos: "\<And>x. \<lbrakk> x \<in> S ; 0 < g x \<rbrakk> \<Longrightarrow> 0 < f x"
    71   shows "- (\<Sum>x\<in>S. g x * log b (g x / f x)) \<le> log b (\<Sum>x\<in>S. f x)"
    72 proof -
    73   have log_mono: "\<And>x y. 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log b x \<le> log b y"
    74     using `1 < b` by (subst log_le_cancel_iff) auto
    75 
    76   have "- (\<Sum>x\<in>S. g x * log b (g x / f x)) = (\<Sum>x\<in>S. g x * log b (f x / g x))"
    77   proof (unfold setsum_negf[symmetric], rule setsum_cong)
    78     fix x assume x: "x \<in> S"
    79     show "- (g x * log b (g x / f x)) = g x * log b (f x / g x)"
    80     proof (cases "g x = 0")
    81       case False
    82       with pos[OF x] g_pos[OF x] have "0 < f x" "0 < g x" by simp_all
    83       thus ?thesis using `1 < b` by (simp add: log_divide field_simps)
    84     qed simp
    85   qed rule
    86   also have "... \<le> log b (\<Sum>x\<in>S. g x * (f x / g x))"
    87   proof (rule log_setsum')
    88     fix x assume x: "x \<in> S" "0 < g x"
    89     with g_pos[OF x] show "0 < f x / g x" by (safe intro!: divide_pos_pos)
    90   qed fact+
    91   also have "... = log b (\<Sum>x\<in>S - {x. g x = 0}. f x)" using `finite S`
    92     by (auto intro!: setsum_mono_zero_cong_right arg_cong[where f="log b"]
    93         split: split_if_asm)
    94   also have "... \<le> log b (\<Sum>x\<in>S. f x)"
    95   proof (rule log_mono)
    96     have "0 = (\<Sum>x\<in>S - {x. g x = 0}. 0)" by simp
    97     also have "... < (\<Sum>x\<in>S - {x. g x = 0}. f x)" (is "_ < ?sum")
    98     proof (rule setsum_strict_mono)
    99       show "finite (S - {x. g x = 0})" using `finite S` by simp
   100       show "S - {x. g x = 0} \<noteq> {}"
   101       proof
   102         assume "S - {x. g x = 0} = {}"
   103         hence "(\<Sum>x\<in>S. g x) = 0" by (subst setsum_0') auto
   104         with `(\<Sum>x\<in>S. g x) = 1` show False by simp
   105       qed
   106       fix x assume "x \<in> S - {x. g x = 0}"
   107       thus "0 < f x" using g_pos[of x] pos(1)[of x] by auto
   108     qed
   109     finally show "0 < ?sum" .
   110     show "(\<Sum>x\<in>S - {x. g x = 0}. f x) \<le> (\<Sum>x\<in>S. f x)"
   111       using `finite S` pos by (auto intro!: setsum_mono2)
   112   qed
   113   finally show ?thesis .
   114 qed
   115 
   116 lemma split_pairs:
   117   "((A, B) = X) \<longleftrightarrow> (fst X = A \<and> snd X = B)" and
   118   "(X = (A, B)) \<longleftrightarrow> (fst X = A \<and> snd X = B)" by auto
   119 
   120 section "Information theory"
   121 
   122 locale information_space = prob_space +
   123   fixes b :: real assumes b_gt_1: "1 < b"
   124 
   125 context information_space
   126 begin
   127 
    128 text {* Introduce some simplification rules for the logarithm to base @{term b}. *}
   129 
   130 lemma log_neg_const:
   131   assumes "x \<le> 0"
   132   shows "log b x = log b 0"
   133 proof -
   134   { fix u :: real
   135     have "x \<le> 0" by fact
   136     also have "0 < exp u"
   137       using exp_gt_zero .
   138     finally have "exp u \<noteq> x"
   139       by auto }
   140   then show "log b x = log b 0"
   141     by (simp add: log_def ln_def)
   142 qed
   143 
   144 lemma log_mult_eq:
   145   "log b (A * B) = (if 0 < A * B then log b \<bar>A\<bar> + log b \<bar>B\<bar> else log b 0)"
   146   using log_mult[of b "\<bar>A\<bar>" "\<bar>B\<bar>"] b_gt_1 log_neg_const[of "A * B"]
   147   by (auto simp: zero_less_mult_iff mult_le_0_iff)
   148 
   149 lemma log_inverse_eq:
   150   "log b (inverse B) = (if 0 < B then - log b B else log b 0)"
   151   using log_inverse[of b B] log_neg_const[of "inverse B"] b_gt_1 by simp
   152 
   153 lemma log_divide_eq:
   154   "log b (A / B) = (if 0 < A * B then log b \<bar>A\<bar> - log b \<bar>B\<bar> else log b 0)"
   155   unfolding divide_inverse log_mult_eq log_inverse_eq abs_inverse
   156   by (auto simp: zero_less_mult_iff mult_le_0_iff)
   157 
   158 lemmas log_simps = log_mult_eq log_inverse_eq log_divide_eq
   159 
   160 end
   161 
   162 subsection "Kullback$-$Leibler divergence"
   163 
   164 text {* The Kullback$-$Leibler divergence is also known as relative entropy or
   165 Kullback$-$Leibler distance. *}
   166 
   167 definition
   168   "KL_divergence b M \<nu> = \<integral>x. log b (real (RN_deriv M \<nu> x)) \<partial>M\<lparr>measure := \<nu>\<rparr>"
   169 
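(Reading aid, not part of the formal development: on a finite measure space the Radon-Nikodym derivative at a point is just the ratio of the point masses, so the definition above specialises to the discrete form proved below as KL_divergence_eq_finite. Writing \<mu> for the measure of M and \<nu> for the second measure:

  \[ \mathrm{KL}_b(\nu \parallel \mu) \;=\; \sum_{x \,\in\, \mathrm{space}\,M} \nu\{x\}\; \log_b \frac{\nu\{x\}}{\mu\{x\}} \] )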
   170 lemma (in sigma_finite_measure) KL_divergence_cong:
   171   assumes "measure_space (M\<lparr>measure := \<nu>\<rparr>)" (is "measure_space ?\<nu>")
   172   assumes [simp]: "sets N = sets M" "space N = space M"
   173     "\<And>A. A \<in> sets M \<Longrightarrow> measure N A = \<mu> A"
   174     "\<And>A. A \<in> sets M \<Longrightarrow> \<nu> A = \<nu>' A"
   175   shows "KL_divergence b M \<nu> = KL_divergence b N \<nu>'"
   176 proof -
   177   interpret \<nu>: measure_space ?\<nu> by fact
   178   have "KL_divergence b M \<nu> = \<integral>x. log b (real (RN_deriv N \<nu>' x)) \<partial>?\<nu>"
   179     by (simp cong: RN_deriv_cong \<nu>.integral_cong add: KL_divergence_def)
   180   also have "\<dots> = KL_divergence b N \<nu>'"
   181     by (auto intro!: \<nu>.integral_cong_measure[symmetric] simp: KL_divergence_def)
   182   finally show ?thesis .
   183 qed
   184 
   185 lemma (in finite_measure_space) KL_divergence_eq_finite:
   186   assumes v: "finite_measure_space (M\<lparr>measure := \<nu>\<rparr>)"
   187   assumes ac: "absolutely_continuous \<nu>"
   188   shows "KL_divergence b M \<nu> = (\<Sum>x\<in>space M. real (\<nu> {x}) * log b (real (\<nu> {x}) / real (\<mu> {x})))" (is "_ = ?sum")
   189 proof (simp add: KL_divergence_def finite_measure_space.integral_finite_singleton[OF v])
   190   interpret v: finite_measure_space "M\<lparr>measure := \<nu>\<rparr>" by fact
   191   have ms: "measure_space (M\<lparr>measure := \<nu>\<rparr>)" by default
   192   show "(\<Sum>x \<in> space M. log b (real (RN_deriv M \<nu> x)) * real (\<nu> {x})) = ?sum"
   193     using RN_deriv_finite_measure[OF ms ac]
   194     by (auto intro!: setsum_cong simp: field_simps real_of_pextreal_mult[symmetric])
   195 qed
   196 
   197 lemma (in finite_prob_space) KL_divergence_positive_finite:
   198   assumes v: "finite_prob_space (M\<lparr>measure := \<nu>\<rparr>)"
   199   assumes ac: "absolutely_continuous \<nu>"
   200   and "1 < b"
   201   shows "0 \<le> KL_divergence b M \<nu>"
   202 proof -
   203   interpret v: finite_prob_space "M\<lparr>measure := \<nu>\<rparr>" by fact
   204   have ms: "finite_measure_space (M\<lparr>measure := \<nu>\<rparr>)" by default
   205 
   206   have "- (KL_divergence b M \<nu>) \<le> log b (\<Sum>x\<in>space M. real (\<mu> {x}))"
   207   proof (subst KL_divergence_eq_finite[OF ms ac], safe intro!: log_setsum_divide not_empty)
   208     show "finite (space M)" using finite_space by simp
   209     show "1 < b" by fact
   210     show "(\<Sum>x\<in>space M. real (\<nu> {x})) = 1" using v.finite_sum_over_space_eq_1 by simp
   211 
   212     fix x assume "x \<in> space M"
   213     then have x: "{x} \<in> sets M" unfolding sets_eq_Pow by auto
   214     { assume "0 < real (\<nu> {x})"
   215       then have "\<nu> {x} \<noteq> 0" by auto
   216       then have "\<mu> {x} \<noteq> 0"
   217         using ac[unfolded absolutely_continuous_def, THEN bspec, of "{x}"] x by auto
   218       thus "0 < prob {x}" using finite_measure[of "{x}"] x by auto }
   219   qed auto
   220   thus "0 \<le> KL_divergence b M \<nu>" using finite_sum_over_space_eq_1 by simp
   221 qed
   222 
   223 subsection {* Mutual Information *}
   224 
   225 definition (in prob_space)
   226   "mutual_information b S T X Y =
   227     KL_divergence b (S\<lparr>measure := distribution X\<rparr> \<Otimes>\<^isub>M T\<lparr>measure := distribution Y\<rparr>)
   228       (joint_distribution X Y)"
   229 
   230 definition (in prob_space)
   231   "entropy b s X = mutual_information b s s X X"
   232 
   233 abbreviation (in information_space)
   234   mutual_information_Pow ("\<I>'(_ ; _')") where
   235   "\<I>(X ; Y) \<equiv> mutual_information b
   236     \<lparr> space = X`space M, sets = Pow (X`space M), measure = distribution X \<rparr>
   237     \<lparr> space = Y`space M, sets = Pow (Y`space M), measure = distribution Y \<rparr> X Y"
   238 
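(Reading aid: for finite random variables the divergence above unfolds to the textbook sum, established below as mutual_information_generic_eq and mutual_information_eq. With p_XY, p_X, p_Y denoting the real-valued joint and marginal distributions:

  \[ \mathcal{I}(X;Y) \;=\; \sum_{x,\,y} p_{XY}(x,y)\; \log_b \frac{p_{XY}(x,y)}{p_X(x)\,p_Y(y)} \] )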
   239 lemma algebra_measure_update[simp]:
   240   "algebra (M'\<lparr>measure := m\<rparr>) \<longleftrightarrow> algebra M'"
   241   unfolding algebra_def by simp
   242 
   243 lemma sigma_algebra_measure_update[simp]:
   244   "sigma_algebra (M'\<lparr>measure := m\<rparr>) \<longleftrightarrow> sigma_algebra M'"
   245   unfolding sigma_algebra_def sigma_algebra_axioms_def by simp
   246 
   247 lemma finite_sigma_algebra_measure_update[simp]:
   248   "finite_sigma_algebra (M'\<lparr>measure := m\<rparr>) \<longleftrightarrow> finite_sigma_algebra M'"
   249   unfolding finite_sigma_algebra_def finite_sigma_algebra_axioms_def by simp
   250 
   251 lemma (in prob_space) finite_variables_absolutely_continuous:
   252   assumes X: "finite_random_variable S X" and Y: "finite_random_variable T Y"
   253   shows "measure_space.absolutely_continuous
   254     (S\<lparr>measure := distribution X\<rparr> \<Otimes>\<^isub>M T\<lparr>measure := distribution Y\<rparr>)
   255     (joint_distribution X Y)"
   256 proof -
   257   interpret X: finite_prob_space "S\<lparr>measure := distribution X\<rparr>"
   258     using X by (rule distribution_finite_prob_space)
   259   interpret Y: finite_prob_space "T\<lparr>measure := distribution Y\<rparr>"
   260     using Y by (rule distribution_finite_prob_space)
   261   interpret XY: pair_finite_prob_space
   262     "S\<lparr>measure := distribution X\<rparr>" "T\<lparr> measure := distribution Y\<rparr>" by default
   263   interpret P: finite_prob_space "XY.P\<lparr> measure := joint_distribution X Y\<rparr>"
   264     using assms by (auto intro!: joint_distribution_finite_prob_space)
   265   note rv = assms[THEN finite_random_variableD]
   266   show "XY.absolutely_continuous (joint_distribution X Y)"
   267   proof (rule XY.absolutely_continuousI)
   268     show "finite_measure_space (XY.P\<lparr> measure := joint_distribution X Y\<rparr>)" by default
   269     fix x assume "x \<in> space XY.P" and "XY.\<mu> {x} = 0"
   270     then obtain a b where "(a, b) = x" and "a \<in> space S" "b \<in> space T"
   271       and distr: "distribution X {a} * distribution Y {b} = 0"
   272       by (cases x) (auto simp: space_pair_measure)
   273     with X.sets_eq_Pow Y.sets_eq_Pow
   274       joint_distribution_Times_le_fst[OF rv, of "{a}" "{b}"]
   275       joint_distribution_Times_le_snd[OF rv, of "{a}" "{b}"]
   276     have "joint_distribution X Y {x} \<le> distribution Y {b}"
   277          "joint_distribution X Y {x} \<le> distribution X {a}"
   278       by (auto simp del: X.sets_eq_Pow Y.sets_eq_Pow)
   279     with distr show "joint_distribution X Y {x} = 0" by auto
   280   qed
   281 qed
   282 
   283 lemma (in information_space)
   284   assumes MX: "finite_random_variable MX X"
   285   assumes MY: "finite_random_variable MY Y"
   286   shows mutual_information_generic_eq:
   287     "mutual_information b MX MY X Y = (\<Sum> (x,y) \<in> space MX \<times> space MY.
   288       real (joint_distribution X Y {(x,y)}) *
   289       log b (real (joint_distribution X Y {(x,y)}) /
   290       (real (distribution X {x}) * real (distribution Y {y}))))"
   291     (is ?sum)
   292   and mutual_information_positive_generic:
   293      "0 \<le> mutual_information b MX MY X Y" (is ?positive)
   294 proof -
   295   interpret X: finite_prob_space "MX\<lparr>measure := distribution X\<rparr>"
   296     using MX by (rule distribution_finite_prob_space)
   297   interpret Y: finite_prob_space "MY\<lparr>measure := distribution Y\<rparr>"
   298     using MY by (rule distribution_finite_prob_space)
   299   interpret XY: pair_finite_prob_space "MX\<lparr>measure := distribution X\<rparr>" "MY\<lparr>measure := distribution Y\<rparr>" by default
   300   interpret P: finite_prob_space "XY.P\<lparr>measure := joint_distribution X Y\<rparr>"
   301     using assms by (auto intro!: joint_distribution_finite_prob_space)
   302 
   303   have P_ms: "finite_measure_space (XY.P\<lparr>measure :=joint_distribution X Y\<rparr>)" by default
   304   have P_ps: "finite_prob_space (XY.P\<lparr>measure := joint_distribution X Y\<rparr>)" by default
   305 
   306   show ?sum
   307     unfolding Let_def mutual_information_def
   308     by (subst XY.KL_divergence_eq_finite[OF P_ms finite_variables_absolutely_continuous[OF MX MY]])
   309        (auto simp add: space_pair_measure setsum_cartesian_product' real_of_pextreal_mult[symmetric])
   310 
   311   show ?positive
   312     using XY.KL_divergence_positive_finite[OF P_ps finite_variables_absolutely_continuous[OF MX MY] b_gt_1]
   313     unfolding mutual_information_def .
   314 qed
   315 
   316 lemma (in information_space) mutual_information_commute:
   317   assumes X: "finite_random_variable S X" and Y: "finite_random_variable T Y"
   318   shows "mutual_information b S T X Y = mutual_information b T S Y X"
   319   unfolding mutual_information_generic_eq[OF X Y] mutual_information_generic_eq[OF Y X]
   320   unfolding joint_distribution_commute_singleton[of X Y]
   321   by (auto simp add: ac_simps intro!: setsum_reindex_cong[OF swap_inj_on])
   322 
   323 lemma (in information_space) mutual_information_commute_simple:
   324   assumes X: "simple_function M X" and Y: "simple_function M Y"
   325   shows "\<I>(X;Y) = \<I>(Y;X)"
   326   by (intro X Y simple_function_imp_finite_random_variable mutual_information_commute)
   327 
   328 lemma (in information_space) mutual_information_eq:
   329   assumes "simple_function M X" "simple_function M Y"
   330   shows "\<I>(X;Y) = (\<Sum> (x,y) \<in> X ` space M \<times> Y ` space M.
   331     real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) * log b (real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) /
   332                                                    (real (distribution X {x}) * real (distribution Y {y}))))"
   333   using assms by (simp add: mutual_information_generic_eq)
   334 
   335 lemma (in information_space) mutual_information_generic_cong:
   336   assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
   337   assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
   338   shows "mutual_information b MX MY X Y = mutual_information b MX MY X' Y'"
   339   unfolding mutual_information_def using X Y
   340   by (simp cong: distribution_cong)
   341 
   342 lemma (in information_space) mutual_information_cong:
   343   assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
   344   assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
   345   shows "\<I>(X; Y) = \<I>(X'; Y')"
   346   unfolding mutual_information_def using X Y
   347   by (simp cong: distribution_cong image_cong)
   348 
   349 lemma (in information_space) mutual_information_positive:
   350   assumes "simple_function M X" "simple_function M Y"
   351   shows "0 \<le> \<I>(X;Y)"
   352   using assms by (simp add: mutual_information_positive_generic)
   353 
   354 subsection {* Entropy *}
   355 
   356 abbreviation (in information_space)
   357   entropy_Pow ("\<H>'(_')") where
   358   "\<H>(X) \<equiv> entropy b \<lparr> space = X`space M, sets = Pow (X`space M), measure = distribution X \<rparr> X"
   359 
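(Reading aid: entropy is defined as the mutual information of X with itself, and for a simple function it reduces, via entropy_generic_eq and entropy_eq below, to

  \[ \mathcal{H}(X) \;=\; -\sum_{x \,\in\, X`\mathrm{space}\,M} p_X(x)\; \log_b p_X(x) \] )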
   360 lemma (in information_space) entropy_generic_eq:
   361   assumes MX: "finite_random_variable MX X"
   362   shows "entropy b MX X = -(\<Sum> x \<in> space MX. real (distribution X {x}) * log b (real (distribution X {x})))"
   363 proof -
   364   interpret MX: finite_prob_space "MX\<lparr>measure := distribution X\<rparr>"
   365     using MX by (rule distribution_finite_prob_space)
   366   let "?X x" = "real (distribution X {x})"
   367   let "?XX x y" = "real (joint_distribution X X {(x, y)})"
   368   { fix x y
   369     have "(\<lambda>x. (X x, X x)) -` {(x, y)} = (if x = y then X -` {x} else {})" by auto
   370     then have "?XX x y * log b (?XX x y / (?X x * ?X y)) =
   371         (if x = y then - ?X y * log b (?X y) else 0)"
   372       unfolding distribution_def by (auto simp: log_simps zero_less_mult_iff) }
   373   note remove_XX = this
   374   show ?thesis
   375     unfolding entropy_def mutual_information_generic_eq[OF MX MX]
   376     unfolding setsum_cartesian_product[symmetric] setsum_negf[symmetric] remove_XX
   377     using MX.finite_space by (auto simp: setsum_cases)
   378 qed
   379 
   380 lemma (in information_space) entropy_eq:
   381   assumes "simple_function M X"
   382   shows "\<H>(X) = -(\<Sum> x \<in> X ` space M. real (distribution X {x}) * log b (real (distribution X {x})))"
   383   using assms by (simp add: entropy_generic_eq)
   384 
   385 lemma (in information_space) entropy_positive:
   386   "simple_function M X \<Longrightarrow> 0 \<le> \<H>(X)"
   387   unfolding entropy_def by (simp add: mutual_information_positive)
   388 
   389 lemma (in information_space) entropy_certainty_eq_0:
   390   assumes "simple_function M X" and "x \<in> X ` space M" and "distribution X {x} = 1"
   391   shows "\<H>(X) = 0"
   392 proof -
   393   let ?X = "\<lparr> space = X ` space M, sets = Pow (X ` space M), measure = distribution X\<rparr>"
   394   note simple_function_imp_finite_random_variable[OF `simple_function M X`]
   395   from distribution_finite_prob_space[OF this, of "\<lparr> measure = distribution X \<rparr>"]
   396   interpret X: finite_prob_space ?X by simp
   397   have "distribution X (X ` space M - {x}) = distribution X (X ` space M) - distribution X {x}"
   398     using X.measure_compl[of "{x}"] assms by auto
   399   also have "\<dots> = 0" using X.prob_space assms by auto
   400   finally have X0: "distribution X (X ` space M - {x}) = 0" by auto
   401   { fix y assume asm: "y \<noteq> x" "y \<in> X ` space M"
   402     hence "{y} \<subseteq> X ` space M - {x}" by auto
   403     from X.measure_mono[OF this] X0 asm
   404     have "distribution X {y} = 0" by auto }
   405   hence fi: "\<And> y. y \<in> X ` space M \<Longrightarrow> real (distribution X {y}) = (if x = y then 1 else 0)"
   406     using assms by auto
   407   have y: "\<And>y. (if x = y then 1 else 0) * log b (if x = y then 1 else 0) = 0" by simp
   408   show ?thesis unfolding entropy_eq[OF `simple_function M X`] by (auto simp: y fi)
   409 qed
   410 
   411 lemma (in information_space) entropy_le_card_not_0:
   412   assumes "simple_function M X"
   413   shows "\<H>(X) \<le> log b (real (card (X ` space M \<inter> {x . distribution X {x} \<noteq> 0})))"
   414 proof -
   415   let "?d x" = "distribution X {x}"
   416   let "?p x" = "real (?d x)"
   417   have "\<H>(X) = (\<Sum>x\<in>X`space M. ?p x * log b (1 / ?p x))"
   418     by (auto intro!: setsum_cong simp: entropy_eq[OF `simple_function M X`] setsum_negf[symmetric] log_simps not_less)
   419   also have "\<dots> \<le> log b (\<Sum>x\<in>X`space M. ?p x * (1 / ?p x))"
   420     apply (rule log_setsum')
   421     using not_empty b_gt_1 `simple_function M X` sum_over_space_real_distribution
   422     by (auto simp: simple_function_def)
   423   also have "\<dots> = log b (\<Sum>x\<in>X`space M. if ?d x \<noteq> 0 then 1 else 0)"
   424     using distribution_finite[OF `simple_function M X`[THEN simple_function_imp_random_variable], simplified]
   425     by (intro arg_cong[where f="\<lambda>X. log b X"] setsum_cong) (auto simp: real_of_pextreal_eq_0)
   426   finally show ?thesis
   427     using `simple_function M X` by (auto simp: setsum_cases real_eq_of_nat simple_function_def)
   428 qed
   429 
   430 lemma (in information_space) entropy_uniform_max:
   431   assumes "simple_function M X"
   432   assumes "\<And>x y. \<lbrakk> x \<in> X ` space M ; y \<in> X ` space M \<rbrakk> \<Longrightarrow> distribution X {x} = distribution X {y}"
   433   shows "\<H>(X) = log b (real (card (X ` space M)))"
   434 proof -
   435   let ?X = "\<lparr> space = X ` space M, sets = Pow (X ` space M), measure = distribution X\<rparr>"
   436   note simple_function_imp_finite_random_variable[OF `simple_function M X`]
   437   from distribution_finite_prob_space[OF this, of "\<lparr> measure = distribution X \<rparr>"]
   438   interpret X: finite_prob_space ?X by simp
   439   have card_gt0: "0 < card (X ` space M)" unfolding card_gt_0_iff
   440     using `simple_function M X` not_empty by (auto simp: simple_function_def)
   441   { fix x assume "x \<in> X ` space M"
   442     hence "real (distribution X {x}) = 1 / real (card (X ` space M))"
   443     proof (rule X.uniform_prob[simplified])
   444       fix x y assume "x \<in> X`space M" "y \<in> X`space M"
   445       from assms(2)[OF this] show "real (distribution X {x}) = real (distribution X {y})" by simp
   446     qed }
   447   thus ?thesis
   448     using not_empty X.finite_space b_gt_1 card_gt0
   449     by (simp add: entropy_eq[OF `simple_function M X`] real_eq_of_nat[symmetric] log_simps)
   450 qed
   451 
   452 lemma (in information_space) entropy_le_card:
   453   assumes "simple_function M X"
   454   shows "\<H>(X) \<le> log b (real (card (X ` space M)))"
   455 proof cases
   456   assume "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} = {}"
   457   then have "\<And>x. x\<in>X`space M \<Longrightarrow> distribution X {x} = 0" by auto
   458   moreover
   459   have "0 < card (X`space M)"
   460     using `simple_function M X` not_empty
   461     by (auto simp: card_gt_0_iff simple_function_def)
   462   then have "log b 1 \<le> log b (real (card (X`space M)))"
   463     using b_gt_1 by (intro log_le) auto
   464   ultimately show ?thesis using assms by (simp add: entropy_eq)
   465 next
   466   assume False: "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} \<noteq> {}"
   467   have "card (X ` space M \<inter> {x. distribution X {x} \<noteq> 0}) \<le> card (X ` space M)"
   468     (is "?A \<le> ?B") using assms not_empty by (auto intro!: card_mono simp: simple_function_def)
   469   note entropy_le_card_not_0[OF assms]
   470   also have "log b (real ?A) \<le> log b (real ?B)"
   471     using b_gt_1 False not_empty `?A \<le> ?B` assms
   472     by (auto intro!: log_le simp: card_gt_0_iff simp: simple_function_def)
   473   finally show ?thesis .
   474 qed
   475 
   476 lemma (in information_space) entropy_commute:
   477   assumes "simple_function M X" "simple_function M Y"
   478   shows "\<H>(\<lambda>x. (X x, Y x)) = \<H>(\<lambda>x. (Y x, X x))"
   479 proof -
   480   have sf: "simple_function M (\<lambda>x. (X x, Y x))" "simple_function M (\<lambda>x. (Y x, X x))"
   481     using assms by (auto intro: simple_function_Pair)
   482   have *: "(\<lambda>x. (Y x, X x))`space M = (\<lambda>(a,b). (b,a))`(\<lambda>x. (X x, Y x))`space M"
   483     by auto
   484   have inj: "\<And>X. inj_on (\<lambda>(a,b). (b,a)) X"
   485     by (auto intro!: inj_onI)
   486   show ?thesis
   487     unfolding sf[THEN entropy_eq] unfolding * setsum_reindex[OF inj]
   488     by (simp add: joint_distribution_commute[of Y X] split_beta)
   489 qed
   490 
   491 lemma (in information_space) entropy_eq_cartesian_product:
   492   assumes "simple_function M X" "simple_function M Y"
   493   shows "\<H>(\<lambda>x. (X x, Y x)) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M.
   494     real (joint_distribution X Y {(x,y)}) *
   495     log b (real (joint_distribution X Y {(x,y)})))"
   496 proof -
   497   have sf: "simple_function M (\<lambda>x. (X x, Y x))"
   498     using assms by (auto intro: simple_function_Pair)
   499   { fix x assume "x\<notin>(\<lambda>x. (X x, Y x))`space M"
   500     then have "(\<lambda>x. (X x, Y x)) -` {x} \<inter> space M = {}" by auto
   501     then have "joint_distribution X Y {x} = 0"
   502       unfolding distribution_def by auto }
   503   then show ?thesis using sf assms
   504     unfolding entropy_eq[OF sf] neg_equal_iff_equal setsum_cartesian_product
   505     by (auto intro!: setsum_mono_zero_cong_left simp: simple_function_def)
   506 qed
   507 
   508 subsection {* Conditional Mutual Information *}
   509 
   510 definition (in prob_space)
   511   "conditional_mutual_information b MX MY MZ X Y Z \<equiv>
   512     mutual_information b MX (MY \<Otimes>\<^isub>M MZ) X (\<lambda>x. (Y x, Z x)) -
   513     mutual_information b MX MZ X Z"
   514 
   515 abbreviation (in information_space)
   516   conditional_mutual_information_Pow ("\<I>'( _ ; _ | _ ')") where
   517   "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b
   518     \<lparr> space = X`space M, sets = Pow (X`space M), measure = distribution X \<rparr>
   519     \<lparr> space = Y`space M, sets = Pow (Y`space M), measure = distribution Y \<rparr>
   520     \<lparr> space = Z`space M, sets = Pow (Z`space M), measure = distribution Z \<rparr>
   521     X Y Z"
   522 
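(Reading aid: the definition above is the difference \<I>(X;(Y,Z)) - \<I>(X;Z); in the finite case this equals the usual conditional form, spelled out below in conditional_mutual_information_generic_eq:

  \[ \mathcal{I}(X;Y \mid Z) \;=\; \sum_{x,y,z} p_{XYZ}(x,y,z)\; \log_b \frac{p_{XYZ}(x,y,z)}{p_{XZ}(x,z)\, p_{YZ}(y,z) / p_Z(z)} \] )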
   523 lemma (in information_space) conditional_mutual_information_generic_eq:
   524   assumes MX: "finite_random_variable MX X"
   525     and MY: "finite_random_variable MY Y"
   526     and MZ: "finite_random_variable MZ Z"
   527   shows "conditional_mutual_information b MX MY MZ X Y Z = (\<Sum>(x, y, z) \<in> space MX \<times> space MY \<times> space MZ.
   528              real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
   529              log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) /
   530     (real (joint_distribution X Z {(x, z)}) * real (joint_distribution Y Z {(y,z)} / distribution Z {z}))))"
   531   (is "_ = (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?XZ x z * ?YZdZ y z)))")
   532 proof -
   533   let ?YZ = "\<lambda>y z. real (joint_distribution Y Z {(y, z)})"
   534   let ?X = "\<lambda>x. real (distribution X {x})"
   535   let ?Z = "\<lambda>z. real (distribution Z {z})"
   536 
    537   txt {* This proof is actually quite easy; however, we need to show that the
    538     distributions are finite and that the joint distributions are zero whenever
    539     the distribution of one of the variables is zero. *}
   540 
   541   note finite_var = MX MY MZ
   542   note random_var = finite_var[THEN finite_random_variableD]
   543 
   544   note space_simps = space_pair_measure space_sigma algebra.simps
   545 
   546   note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
   547   note XZ = finite_random_variable_pairI[OF finite_var(1,3)]
   548   note ZX = finite_random_variable_pairI[OF finite_var(3,1)]
   549   note YZX = finite_random_variable_pairI[OF finite_var(2) ZX]
   550   note order1 =
   551     finite_distribution_order(5,6)[OF finite_var(1) YZ, simplified space_simps]
   552     finite_distribution_order(5,6)[OF finite_var(1,3), simplified space_simps]
   553 
   554   note finite = finite_var(1) YZ finite_var(3) XZ YZX
   555   note finite[THEN finite_distribution_finite, simplified space_simps, simp]
   556 
   557   have order2: "\<And>x y z. \<lbrakk>x \<in> space MX; y \<in> space MY; z \<in> space MZ; joint_distribution X Z {(x, z)} = 0\<rbrakk>
   558           \<Longrightarrow> joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)} = 0"
   559     unfolding joint_distribution_commute_singleton[of X]
   560     unfolding joint_distribution_assoc_singleton[symmetric]
   561     using finite_distribution_order(6)[OF finite_var(2) ZX]
   562     by (auto simp: space_simps)
   563 
   564   have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?XZ x z * ?YZdZ y z))) =
   565     (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * (log b (?XYZ x y z / (?X x * ?YZ y z)) - log b (?XZ x z / (?X x * ?Z z))))"
   566     (is "(\<Sum>(x, y, z)\<in>?S. ?L x y z) = (\<Sum>(x, y, z)\<in>?S. ?R x y z)")
   567   proof (safe intro!: setsum_cong)
   568     fix x y z assume space: "x \<in> space MX" "y \<in> space MY" "z \<in> space MZ"
   569     then have *: "?XYZ x y z / (?XZ x z * ?YZdZ y z) =
   570       (?XYZ x y z / (?X x * ?YZ y z)) / (?XZ x z / (?X x * ?Z z))"
   571       using order1(3)
   572       by (auto simp: real_of_pextreal_mult[symmetric] real_of_pextreal_eq_0)
   573     show "?L x y z = ?R x y z"
   574     proof cases
   575       assume "?XYZ x y z \<noteq> 0"
   576       with space b_gt_1 order1 order2 show ?thesis unfolding *
   577         by (subst log_divide)
   578            (auto simp: zero_less_divide_iff zero_less_real_of_pextreal
   579                        real_of_pextreal_eq_0 zero_less_mult_iff)
   580     qed simp
   581   qed
   582   also have "\<dots> = (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?X x * ?YZ y z))) -
   583                   (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XZ x z / (?X x * ?Z z)))"
   584     by (auto simp add: setsum_subtractf[symmetric] field_simps intro!: setsum_cong)
   585   also have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XZ x z / (?X x * ?Z z))) =
   586              (\<Sum>(x, z)\<in>space MX \<times> space MZ. ?XZ x z * log b (?XZ x z / (?X x * ?Z z)))"
   587     unfolding setsum_cartesian_product[symmetric] setsum_commute[of _ _ "space MY"]
   588               setsum_left_distrib[symmetric]
   589     unfolding joint_distribution_commute_singleton[of X]
   590     unfolding joint_distribution_assoc_singleton[symmetric]
   591     using setsum_real_joint_distribution_singleton[OF finite_var(2) ZX, unfolded space_simps]
   592     by (intro setsum_cong refl) simp
   593   also have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?X x * ?YZ y z))) -
   594              (\<Sum>(x, z)\<in>space MX \<times> space MZ. ?XZ x z * log b (?XZ x z / (?X x * ?Z z))) =
   595              conditional_mutual_information b MX MY MZ X Y Z"
   596     unfolding conditional_mutual_information_def
   597     unfolding mutual_information_generic_eq[OF finite_var(1,3)]
   598     unfolding mutual_information_generic_eq[OF finite_var(1) YZ]
   599     by (simp add: space_sigma space_pair_measure setsum_cartesian_product')
   600   finally show ?thesis by simp
   601 qed
   602 
   603 lemma (in information_space) conditional_mutual_information_eq:
   604   assumes "simple_function M X" "simple_function M Y" "simple_function M Z"
   605   shows "\<I>(X;Y|Z) = (\<Sum>(x, y, z) \<in> X`space M \<times> Y`space M \<times> Z`space M.
   606              real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
   607              log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) /
   608     (real (joint_distribution X Z {(x, z)}) * real (joint_distribution Y Z {(y,z)} / distribution Z {z}))))"
   609   using conditional_mutual_information_generic_eq[OF assms[THEN simple_function_imp_finite_random_variable]]
   610   by simp
   611 
   612 lemma (in information_space) conditional_mutual_information_eq_mutual_information:
   613   assumes X: "simple_function M X" and Y: "simple_function M Y"
   614   shows "\<I>(X ; Y) = \<I>(X ; Y | (\<lambda>x. ()))"
   615 proof -
   616   have [simp]: "(\<lambda>x. ()) ` space M = {()}" using not_empty by auto
   617   have C: "simple_function M (\<lambda>x. ())" by auto
   618   show ?thesis
   619     unfolding conditional_mutual_information_eq[OF X Y C]
   620     unfolding mutual_information_eq[OF X Y]
   621     by (simp add: setsum_cartesian_product' distribution_remove_const)
   622 qed
   623 
   624 lemma (in prob_space) distribution_unit[simp]: "distribution (\<lambda>x. ()) {()} = 1"
   625   unfolding distribution_def using measure_space_1 by auto
   626 
   627 lemma (in prob_space) joint_distribution_unit[simp]: "distribution (\<lambda>x. (X x, ())) {(a, ())} = distribution X {a}"
   628   unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>])
   629 
   630 lemma (in prob_space) setsum_distribution:
   631   assumes X: "finite_random_variable MX X" shows "(\<Sum>a\<in>space MX. distribution X {a}) = 1"
   632   using setsum_joint_distribution[OF assms, of "\<lparr> space = UNIV, sets = Pow UNIV \<rparr>" "\<lambda>x. ()" "{()}"]
   633   using sigma_algebra_Pow[of "UNIV::unit set" "()"] by simp
   634 
   635 lemma (in prob_space) setsum_real_distribution:
   636   fixes MX :: "('c, 'd) measure_space_scheme"
   637   assumes X: "finite_random_variable MX X" shows "(\<Sum>a\<in>space MX. real (distribution X {a})) = 1"
   638   using setsum_real_joint_distribution[OF assms, of "\<lparr> space = UNIV, sets = Pow UNIV, measure = undefined \<rparr>" "\<lambda>x. ()" "{()}"]
   639   using sigma_algebra_Pow[of "UNIV::unit set" "\<lparr> measure = undefined \<rparr>"] by simp
   640 
   641 lemma (in information_space) conditional_mutual_information_generic_positive:
   642   assumes "finite_random_variable MX X" and "finite_random_variable MY Y" and "finite_random_variable MZ Z"
   643   shows "0 \<le> conditional_mutual_information b MX MY MZ X Y Z"
   644 proof (cases "space MX \<times> space MY \<times> space MZ = {}")
   645   case True show ?thesis
   646     unfolding conditional_mutual_information_generic_eq[OF assms] True
   647     by simp
   648 next
   649   case False
   650   let "?dXYZ A" = "real (distribution (\<lambda>x. (X x, Y x, Z x)) A)"
   651   let "?dXZ A" = "real (joint_distribution X Z A)"
   652   let "?dYZ A" = "real (joint_distribution Y Z A)"
   653   let "?dX A" = "real (distribution X A)"
   654   let "?dZ A" = "real (distribution Z A)"
   655   let ?M = "space MX \<times> space MY \<times> space MZ"
   656 
   657   have split_beta: "\<And>f. split f = (\<lambda>x. f (fst x) (snd x))" by (simp add: fun_eq_iff)
   658 
   659   note space_simps = space_pair_measure space_sigma algebra.simps
   660 
   661   note finite_var = assms
   662   note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
   663   note XZ = finite_random_variable_pairI[OF finite_var(1,3)]
   664   note ZX = finite_random_variable_pairI[OF finite_var(3,1)]
   665   note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
   666   note XYZ = finite_random_variable_pairI[OF finite_var(1) YZ]
   667   note finite = finite_var(3) YZ XZ XYZ
   668   note finite = finite[THEN finite_distribution_finite, simplified space_simps]
   669 
   670   have order: "\<And>x y z. \<lbrakk>x \<in> space MX; y \<in> space MY; z \<in> space MZ; joint_distribution X Z {(x, z)} = 0\<rbrakk>
   671           \<Longrightarrow> joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)} = 0"
   672     unfolding joint_distribution_commute_singleton[of X]
   673     unfolding joint_distribution_assoc_singleton[symmetric]
   674     using finite_distribution_order(6)[OF finite_var(2) ZX]
   675     by (auto simp: space_simps)
   676 
   677   note order = order
   678     finite_distribution_order(5,6)[OF finite_var(1) YZ, simplified space_simps]
   679     finite_distribution_order(5,6)[OF finite_var(2,3), simplified space_simps]
   680 
   681   have "- conditional_mutual_information b MX MY MZ X Y Z = - (\<Sum>(x, y, z) \<in> ?M. ?dXYZ {(x, y, z)} *
   682     log b (?dXYZ {(x, y, z)} / (?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})))"
   683     unfolding conditional_mutual_information_generic_eq[OF assms] neg_equal_iff_equal
   684     by (intro setsum_cong) (auto intro!: arg_cong[where f="log b"] simp: real_of_pextreal_mult[symmetric])
   685   also have "\<dots> \<le> log b (\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})"
   686     unfolding split_beta
   687   proof (rule log_setsum_divide)
   688     show "?M \<noteq> {}" using False by simp
   689     show "1 < b" using b_gt_1 .
   690 
   691     show "finite ?M" using assms
   692       unfolding finite_sigma_algebra_def finite_sigma_algebra_axioms_def by auto
   693 
   694     show "(\<Sum>x\<in>?M. ?dXYZ {(fst x, fst (snd x), snd (snd x))}) = 1"
   695       unfolding setsum_cartesian_product'
   696       unfolding setsum_commute[of _ "space MY"]
   697       unfolding setsum_commute[of _ "space MZ"]
   698       by (simp_all add: space_pair_measure
   699         setsum_real_joint_distribution_singleton[OF `finite_random_variable MX X` YZ]
   700         setsum_real_joint_distribution_singleton[OF `finite_random_variable MY Y` finite_var(3)]
   701         setsum_real_distribution[OF `finite_random_variable MZ Z`])
   702 
   703     fix x assume "x \<in> ?M"
   704     let ?x = "(fst x, fst (snd x), snd (snd x))"
   705 
   706     show "0 \<le> ?dXYZ {?x}" using real_pextreal_nonneg .
   707     show "0 \<le> ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
   708      by (simp add: real_pextreal_nonneg mult_nonneg_nonneg divide_nonneg_nonneg)
   709 
   710     assume *: "0 < ?dXYZ {?x}"
   711     with `x \<in> ?M` show "0 < ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
   712       using finite order
   713       by (cases x)
   714          (auto simp add: zero_less_real_of_pextreal zero_less_mult_iff zero_less_divide_iff)
   715   qed
   716   also have "(\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z}) = (\<Sum>z\<in>space MZ. ?dZ {z})"
   717     apply (simp add: setsum_cartesian_product')
   718     apply (subst setsum_commute)
   719     apply (subst (2) setsum_commute)
   720     by (auto simp: setsum_divide_distrib[symmetric] setsum_product[symmetric]
   721                    setsum_real_joint_distribution_singleton[OF finite_var(1,3)]
   722                    setsum_real_joint_distribution_singleton[OF finite_var(2,3)]
   723           intro!: setsum_cong)
   724   also have "log b (\<Sum>z\<in>space MZ. ?dZ {z}) = 0"
   725     unfolding setsum_real_distribution[OF finite_var(3)] by simp
   726   finally show ?thesis by simp
   727 qed
   728 
   729 lemma (in information_space) conditional_mutual_information_positive:
   730   assumes "simple_function M X" and "simple_function M Y" and "simple_function M Z"
   731   shows "0 \<le> \<I>(X;Y|Z)"
   732   by (rule conditional_mutual_information_generic_positive[OF assms[THEN simple_function_imp_finite_random_variable]])
   733 
   734 subsection {* Conditional Entropy *}
   735 
   736 definition (in prob_space)
   737   "conditional_entropy b S T X Y = conditional_mutual_information b S S T X X Y"
   738 
   739 abbreviation (in information_space)
   740   conditional_entropy_Pow ("\<H>'(_ | _')") where
   741   "\<H>(X | Y) \<equiv> conditional_entropy b
   742     \<lparr> space = X`space M, sets = Pow (X`space M), measure = distribution X \<rparr>
   743     \<lparr> space = Y`space M, sets = Pow (Y`space M), measure = distribution Y \<rparr> X Y"
   744 
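(Reading aid: conditional entropy is defined above as \<I>(X;X|Y); for simple functions it collapses, via conditional_entropy_generic_eq and conditional_entropy_eq below, to

  \[ \mathcal{H}(X \mid Y) \;=\; -\sum_{x,\,y} p_{XY}(x,y)\; \log_b \frac{p_{XY}(x,y)}{p_Y(y)} \] )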
   745 lemma (in information_space) conditional_entropy_positive:
   746   "simple_function M X \<Longrightarrow> simple_function M Y \<Longrightarrow> 0 \<le> \<H>(X | Y)"
   747   unfolding conditional_entropy_def by (auto intro!: conditional_mutual_information_positive)
   748 
   749 lemma (in measure_space) empty_measureI: "A = {} \<Longrightarrow> \<mu> A = 0" by simp
   750 
   751 lemma (in information_space) conditional_entropy_generic_eq:
   752   fixes MX :: "('c, 'd) measure_space_scheme" and MY :: "('e, 'f) measure_space_scheme"
   753   assumes MX: "finite_random_variable MX X"
   754   assumes MZ: "finite_random_variable MZ Z"
   755   shows "conditional_entropy b MX MZ X Z =
   756      - (\<Sum>(x, z)\<in>space MX \<times> space MZ.
   757          real (joint_distribution X Z {(x, z)}) *
   758          log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
   759 proof -
   760   interpret MX: finite_sigma_algebra MX using MX by simp
   761   interpret MZ: finite_sigma_algebra MZ using MZ by simp
   762   let "?XXZ x y z" = "joint_distribution X (\<lambda>x. (X x, Z x)) {(x, y, z)}"
   763   let "?XZ x z" = "joint_distribution X Z {(x, z)}"
   764   let "?Z z" = "distribution Z {z}"
   765   let "?f x y z" = "log b (real (?XXZ x y z) / (real (?XZ x z) * real (?XZ y z / ?Z z)))"
   766   { fix x z have "?XXZ x x z = ?XZ x z"
   767       unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>]) }
   768   note this[simp]
   769   { fix x x' :: 'c and z assume "x' \<noteq> x"
   770     then have "?XXZ x x' z = 0"
   771       by (auto simp: distribution_def intro!: arg_cong[where f=\<mu>] empty_measureI) }
   772   note this[simp]
   773   { fix x x' z assume *: "x \<in> space MX" "z \<in> space MZ"
   774     then have "(\<Sum>x'\<in>space MX. real (?XXZ x x' z) * ?f x x' z)
   775       = (\<Sum>x'\<in>space MX. if x = x' then real (?XZ x z) * ?f x x z else 0)"
   776       by (auto intro!: setsum_cong)
   777     also have "\<dots> = real (?XZ x z) * ?f x x z"
   778       using `x \<in> space MX` by (simp add: setsum_cases[OF MX.finite_space])
   779     also have "\<dots> = real (?XZ x z) * log b (real (?Z z) / real (?XZ x z))"
   780       by (auto simp: real_of_pextreal_mult[symmetric])
   781     also have "\<dots> = - real (?XZ x z) * log b (real (?XZ x z) / real (?Z z))"
   782       using assms[THEN finite_distribution_finite]
   783       using finite_distribution_order(6)[OF MX MZ]
   784       by (auto simp: log_simps field_simps zero_less_mult_iff zero_less_real_of_pextreal real_of_pextreal_eq_0)
   785     finally have "(\<Sum>x'\<in>space MX. real (?XXZ x x' z) * ?f x x' z) =
   786       - real (?XZ x z) * log b (real (?XZ x z) / real (?Z z))" . }
   787   note * = this
   788   show ?thesis
   789     unfolding conditional_entropy_def
   790     unfolding conditional_mutual_information_generic_eq[OF MX MX MZ]
   791     by (auto simp: setsum_cartesian_product' setsum_negf[symmetric]
   792                    setsum_commute[of _ "space MZ"] *   simp del: divide_pextreal_def
   793              intro!: setsum_cong)
   794 qed
   795 
   796 lemma (in information_space) conditional_entropy_eq:
   797   assumes "simple_function M X" "simple_function M Z"
   798   shows "\<H>(X | Z) =
   799      - (\<Sum>(x, z)\<in>X ` space M \<times> Z ` space M.
   800          real (joint_distribution X Z {(x, z)}) *
   801          log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
   802   using conditional_entropy_generic_eq[OF assms[THEN simple_function_imp_finite_random_variable]]
   803   by simp
   804 
   805 lemma (in information_space) conditional_entropy_eq_ce_with_hypothesis:
   806   assumes X: "simple_function M X" and Y: "simple_function M Y"
   807   shows "\<H>(X | Y) =
   808     -(\<Sum>y\<in>Y`space M. real (distribution Y {y}) *
   809       (\<Sum>x\<in>X`space M. real (joint_distribution X Y {(x,y)}) / real (distribution Y {(y)}) *
   810               log b (real (joint_distribution X Y {(x,y)}) / real (distribution Y {(y)}))))"
   811   unfolding conditional_entropy_eq[OF assms]
   812   using finite_distribution_finite[OF finite_random_variable_pairI[OF assms[THEN simple_function_imp_finite_random_variable]]]
   813   using finite_distribution_order(5,6)[OF assms[THEN simple_function_imp_finite_random_variable]]
   814   using finite_distribution_finite[OF Y[THEN simple_function_imp_finite_random_variable]]
   815   by (auto simp: setsum_cartesian_product'  setsum_commute[of _ "Y`space M"] setsum_right_distrib real_of_pextreal_eq_0
   816            intro!: setsum_cong)
   817 
   818 lemma (in information_space) conditional_entropy_eq_cartesian_product:
   819   assumes "simple_function M X" "simple_function M Y"
   820   shows "\<H>(X | Y) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M.
   821     real (joint_distribution X Y {(x,y)}) *
   822     log b (real (joint_distribution X Y {(x,y)}) / real (distribution Y {y})))"
   823   unfolding conditional_entropy_eq[OF assms]
   824   by (auto intro!: setsum_cong simp: setsum_cartesian_product')
   825 
   826 subsection {* Equalities *}
   827 
   828 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy:
   829   assumes X: "simple_function M X" and Z: "simple_function M Z"
   830   shows  "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)"
   831 proof -
   832   let "?XZ x z" = "real (joint_distribution X Z {(x, z)})"
   833   let "?Z z" = "real (distribution Z {z})"
   834   let "?X x" = "real (distribution X {x})"
   835   note fX = X[THEN simple_function_imp_finite_random_variable]
   836   note fZ = Z[THEN simple_function_imp_finite_random_variable]
   837   note fX[THEN finite_distribution_finite, simp] and fZ[THEN finite_distribution_finite, simp]
   838   note finite_distribution_order[OF fX fZ, simp]
   839   { fix x z assume "x \<in> X`space M" "z \<in> Z`space M"
   840     have "?XZ x z * log b (?XZ x z / (?X x * ?Z z)) =
   841           ?XZ x z * log b (?XZ x z / ?Z z) - ?XZ x z * log b (?X x)"
   842       by (auto simp: log_simps real_of_pextreal_mult[symmetric] zero_less_mult_iff
   843                      zero_less_real_of_pextreal field_simps real_of_pextreal_eq_0 abs_mult) }
   844   note * = this
   845   show ?thesis
   846     unfolding entropy_eq[OF X] conditional_entropy_eq[OF X Z] mutual_information_eq[OF X Z]
   847     using setsum_real_joint_distribution_singleton[OF fZ fX, unfolded joint_distribution_commute_singleton[of Z X]]
   848     by (simp add: * setsum_cartesian_product' setsum_subtractf setsum_left_distrib[symmetric]
   849                      setsum_real_distribution)
   850 qed
   851 
   852 lemma (in information_space) conditional_entropy_less_eq_entropy:
   853   assumes X: "simple_function M X" and Z: "simple_function M Z"
   854   shows "\<H>(X | Z) \<le> \<H>(X)"
   855 proof -
   856   have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy[OF assms] .
   857   with mutual_information_positive[OF X Z] entropy_positive[OF X]
   858   show ?thesis by auto
   859 qed
   860 
   861 lemma (in information_space) entropy_chain_rule:
   862   assumes X: "simple_function M X" and Y: "simple_function M Y"
   863   shows  "\<H>(\<lambda>x. (X x, Y x)) = \<H>(X) + \<H>(Y|X)"
   864 proof -
   865   let "?XY x y" = "real (joint_distribution X Y {(x, y)})"
   866   let "?Y y" = "real (distribution Y {y})"
   867   let "?X x" = "real (distribution X {x})"
   868   note fX = X[THEN simple_function_imp_finite_random_variable]
   869   note fY = Y[THEN simple_function_imp_finite_random_variable]
   870   note fX[THEN finite_distribution_finite, simp] and fY[THEN finite_distribution_finite, simp]
   871   note finite_distribution_order[OF fX fY, simp]
   872   { fix x y assume "x \<in> X`space M" "y \<in> Y`space M"
   873     have "?XY x y * log b (?XY x y / ?X x) =
   874           ?XY x y * log b (?XY x y) - ?XY x y * log b (?X x)"
   875       by (auto simp: log_simps real_of_pextreal_mult[symmetric] zero_less_mult_iff
   876                      zero_less_real_of_pextreal field_simps real_of_pextreal_eq_0 abs_mult) }
   877   note * = this
   878   show ?thesis
   879     using setsum_real_joint_distribution_singleton[OF fY fX]
   880     unfolding entropy_eq[OF X] conditional_entropy_eq_cartesian_product[OF Y X] entropy_eq_cartesian_product[OF X Y]
   881     unfolding joint_distribution_commute_singleton[of Y X] setsum_commute[of _ "X`space M"]
   882     by (simp add: * setsum_subtractf setsum_left_distrib[symmetric])
   883 qed
   884 
   885 section {* Partitioning *}
   886 
   887 definition "subvimage A f g \<longleftrightarrow> (\<forall>x \<in> A. f -` {f x} \<inter> A \<subseteq> g -` {g x} \<inter> A)"
   888 
   889 lemma subvimageI:
   890   assumes "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
   891   shows "subvimage A f g"
   892   using assms unfolding subvimage_def by blast
   893 
   894 lemma subvimageE[consumes 1]:
   895   assumes "subvimage A f g"
   896   obtains "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
   897   using assms unfolding subvimage_def by blast
   898 
   899 lemma subvimageD:
   900   "\<lbrakk> subvimage A f g ; x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
   901   using assms unfolding subvimage_def by blast
   902 
   903 lemma subvimage_subset:
   904   "\<lbrakk> subvimage B f g ; A \<subseteq> B \<rbrakk> \<Longrightarrow> subvimage A f g"
   905   unfolding subvimage_def by auto
   906 
   907 lemma subvimage_idem[intro]: "subvimage A g g"
   908   by (safe intro!: subvimageI)
   909 
   910 lemma subvimage_comp_finer[intro]:
   911   assumes svi: "subvimage A g h"
   912   shows "subvimage A g (f \<circ> h)"
   913 proof (rule subvimageI, simp)
   914   fix x y assume "x \<in> A" "y \<in> A" "g x = g y"
   915   from svi[THEN subvimageD, OF this]
   916   show "f (h x) = f (h y)" by simp
   917 qed
   918 
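(A small derived sanity check, given here only as a sketch: composing with an arbitrary f can only coarsen the induced partition. This is exactly the shape used later in entropy_data_processing, where subvimage (space M) X (f \<circ> X) is discharged by auto.

  lemma subvimage_comp_self: "subvimage A g (f \<circ> g)"
    by (rule subvimage_comp_finer[OF subvimage_idem])
)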
   919 lemma subvimage_comp_gran:
   920   assumes svi: "subvimage A g h"
   921   assumes inj: "inj_on f (g ` A)"
   922   shows "subvimage A (f \<circ> g) h"
   923   by (rule subvimageI) (auto intro!: subvimageD[OF svi] simp: inj_on_iff[OF inj])
   924 
   925 lemma subvimage_comp:
   926   assumes svi: "subvimage (f ` A) g h"
   927   shows "subvimage A (g \<circ> f) (h \<circ> f)"
   928   by (rule subvimageI) (auto intro!: svi[THEN subvimageD])
   929 
   930 lemma subvimage_trans:
   931   assumes fg: "subvimage A f g"
   932   assumes gh: "subvimage A g h"
   933   shows "subvimage A f h"
   934   by (rule subvimageI) (auto intro!: fg[THEN subvimageD] gh[THEN subvimageD])
   935 
   936 lemma subvimage_translator:
   937   assumes svi: "subvimage A f g"
   938   shows "\<exists>h. \<forall>x \<in> A. h (f x)  = g x"
   939 proof (safe intro!: exI[of _ "\<lambda>x. (THE z. z \<in> (g ` (f -` {x} \<inter> A)))"])
   940   fix x assume "x \<in> A"
   941   show "(THE x'. x' \<in> (g ` (f -` {f x} \<inter> A))) = g x"
   942     by (rule theI2[of _ "g x"])
   943       (insert `x \<in> A`, auto intro!: svi[THEN subvimageD])
   944 qed
   945 
   946 lemma subvimage_translator_image:
   947   assumes svi: "subvimage A f g"
   948   shows "\<exists>h. h ` f ` A = g ` A"
   949 proof -
   950   from subvimage_translator[OF svi]
   951   obtain h where "\<And>x. x \<in> A \<Longrightarrow> h (f x) = g x" by auto
   952   thus ?thesis
   953     by (auto intro!: exI[of _ h]
   954       simp: image_compose[symmetric] comp_def cong: image_cong)
   955 qed
   956 
   957 lemma subvimage_finite:
   958   assumes svi: "subvimage A f g" and fin: "finite (f`A)"
   959   shows "finite (g`A)"
   960 proof -
   961   from subvimage_translator_image[OF svi]
   962   obtain h where "g`A = h`f`A" by fastsimp
   963   with fin show "finite (g`A)" by simp
   964 qed
   965 
   966 lemma subvimage_disj:
   967   assumes svi: "subvimage A f g"
   968   shows "f -` {x} \<inter> A \<subseteq> g -` {y} \<inter> A \<or>
   969       f -` {x} \<inter> g -` {y} \<inter> A = {}" (is "?sub \<or> ?dist")
   970 proof (rule disjCI)
   971   assume "\<not> ?dist"
   972   then obtain z where "z \<in> A" and "x = f z" and "y = g z" by auto
   973   thus "?sub" using svi unfolding subvimage_def by auto
   974 qed
   975 
   976 lemma setsum_image_split:
   977   assumes svi: "subvimage A f g" and fin: "finite (f ` A)"
   978   shows "(\<Sum>x\<in>f`A. h x) = (\<Sum>y\<in>g`A. \<Sum>x\<in>f`(g -` {y} \<inter> A). h x)"
   979     (is "?lhs = ?rhs")
   980 proof -
   981   have "f ` A =
   982       snd ` (SIGMA x : g ` A. f ` (g -` {x} \<inter> A))"
   983       (is "_ = snd ` ?SIGMA")
   984     unfolding image_split_eq_Sigma[symmetric]
   985     by (simp add: image_compose[symmetric] comp_def)
   986   moreover
   987   have snd_inj: "inj_on snd ?SIGMA"
   988     unfolding image_split_eq_Sigma[symmetric]
   989     by (auto intro!: inj_onI subvimageD[OF svi])
   990   ultimately
   991   have "(\<Sum>x\<in>f`A. h x) = (\<Sum>(x,y)\<in>?SIGMA. h y)"
   992     by (auto simp: setsum_reindex intro: setsum_cong)
   993   also have "... = ?rhs"
   994     using subvimage_finite[OF svi fin] fin
   995     apply (subst setsum_Sigma[symmetric])
   996     by (auto intro!: finite_subset[of _ "f`A"])
   997   finally show ?thesis .
   998 qed
   999 
  1000 lemma (in information_space) entropy_partition:
  1001   assumes sf: "simple_function M X" "simple_function M P"
  1002   assumes svi: "subvimage (space M) X P"
  1003   shows "\<H>(X) = \<H>(P) + \<H>(X|P)"
  1004 proof -
  1005   let "?XP x p" = "real (joint_distribution X P {(x, p)})"
  1006   let "?X x" = "real (distribution X {x})"
  1007   let "?P p" = "real (distribution P {p})"
  1008   note fX = sf(1)[THEN simple_function_imp_finite_random_variable]
  1009   note fP = sf(2)[THEN simple_function_imp_finite_random_variable]
  1010   note fX[THEN finite_distribution_finite, simp] and fP[THEN finite_distribution_finite, simp]
  1011   note finite_distribution_order[OF fX fP, simp]
  1012   have "(\<Sum>x\<in>X ` space M. real (distribution X {x}) * log b (real (distribution X {x}))) =
  1013     (\<Sum>y\<in>P `space M. \<Sum>x\<in>X ` space M.
  1014     real (joint_distribution X P {(x, y)}) * log b (real (joint_distribution X P {(x, y)})))"
  1015   proof (subst setsum_image_split[OF svi],
  1016       safe intro!: setsum_mono_zero_cong_left imageI)
  1017     show "finite (X ` space M)" "finite (X ` space M)" "finite (P ` space M)"
  1018       using sf unfolding simple_function_def by auto
  1019   next
  1020     fix p x assume in_space: "p \<in> space M" "x \<in> space M"
  1021     assume "real (joint_distribution X P {(X x, P p)}) * log b (real (joint_distribution X P {(X x, P p)})) \<noteq> 0"
  1022     hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M \<noteq> {}" by (auto simp: distribution_def)
  1023     with svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
  1024     show "x \<in> P -` {P p}" by auto
  1025   next
  1026     fix p x assume in_space: "p \<in> space M" "x \<in> space M"
  1027     assume "P x = P p"
  1028     from this[symmetric] svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
  1029     have "X -` {X x} \<inter> space M \<subseteq> P -` {P p} \<inter> space M"
  1030       by auto
  1031     hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M = X -` {X x} \<inter> space M"
  1032       by auto
  1033     thus "real (distribution X {X x}) * log b (real (distribution X {X x})) =
  1034           real (joint_distribution X P {(X x, P p)}) *
  1035           log b (real (joint_distribution X P {(X x, P p)}))"
  1036       by (auto simp: distribution_def)
  1037   qed
  1038   moreover have "\<And>x y. real (joint_distribution X P {(x, y)}) *
  1039       log b (real (joint_distribution X P {(x, y)}) / real (distribution P {y})) =
  1040       real (joint_distribution X P {(x, y)}) * log b (real (joint_distribution X P {(x, y)})) -
  1041       real (joint_distribution X P {(x, y)}) * log b (real (distribution P {y}))"
  1042     by (auto simp add: log_simps zero_less_mult_iff field_simps)
  1043   ultimately show ?thesis
  1044     unfolding sf[THEN entropy_eq] conditional_entropy_eq[OF sf]
  1045     using setsum_real_joint_distribution_singleton[OF fX fP]
  1046     by (simp add: setsum_cartesian_product' setsum_subtractf setsum_real_distribution
  1047       setsum_left_distrib[symmetric] setsum_commute[where B="P`space M"])
  1048 qed
  1049 
  1050 corollary (in information_space) entropy_data_processing:
  1051   assumes X: "simple_function M X" shows "\<H>(f \<circ> X) \<le> \<H>(X)"
  1052 proof -
  1053   note X
  1054   moreover have fX: "simple_function M (f \<circ> X)" using X by auto
  1055   moreover have "subvimage (space M) X (f \<circ> X)" by auto
  1056   ultimately have "\<H>(X) = \<H>(f\<circ>X) + \<H>(X|f\<circ>X)" by (rule entropy_partition)
  1057   then show "\<H>(f \<circ> X) \<le> \<H>(X)"
  1058     by (auto intro: conditional_entropy_positive[OF X fX])
  1059 qed
  1060 
  1061 corollary (in information_space) entropy_of_inj:
  1062   assumes X: "simple_function M X" and inj: "inj_on f (X`space M)"
  1063   shows "\<H>(f \<circ> X) = \<H>(X)"
  1064 proof (rule antisym)
  1065   show "\<H>(f \<circ> X) \<le> \<H>(X)" using entropy_data_processing[OF X] .
  1066 next
  1067   have sf: "simple_function M (f \<circ> X)"
  1068     using X by auto
  1069   have "\<H>(X) = \<H>(the_inv_into (X`space M) f \<circ> (f \<circ> X))"
  1070     by (auto intro!: mutual_information_cong simp: entropy_def the_inv_into_f_f[OF inj])
  1071   also have "... \<le> \<H>(f \<circ> X)"
  1072     using entropy_data_processing[OF sf] .
  1073   finally show "\<H>(X) \<le> \<H>(f \<circ> X)" .
  1074 qed
  1075 
  1076 end