src/HOL/Probability/Information.thy
author hoelzl
Wed Dec 08 16:15:14 2010 +0100 (2010-12-08)
changeset 41095 c335d880ff82
parent 41023 9118eb4eb8dc
child 41413 64cd30d6b0b8
permissions -rw-r--r--
cleanup bijectivity btw. product spaces and pairs
hoelzl@36080
     1
theory Information
hoelzl@40859
     2
imports Probability_Space Convex Lebesgue_Measure
hoelzl@36080
     3
begin
hoelzl@36080
     4
hoelzl@39097
     5
lemma log_le: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log a x \<le> log a y"
hoelzl@39097
     6
  by (subst log_le_cancel_iff) auto
hoelzl@39097
     7
hoelzl@39097
     8
lemma log_less: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x < y \<Longrightarrow> log a x < log a y"
hoelzl@39097
     9
  by (subst log_less_cancel_iff) auto
hoelzl@39097
    10
hoelzl@39097
    11
lemma setsum_cartesian_product':
hoelzl@39097
    12
  "(\<Sum>x\<in>A \<times> B. f x) = (\<Sum>x\<in>A. setsum (\<lambda>y. f (x, y)) B)"
hoelzl@39097
    13
  unfolding setsum_cartesian_product by simp
hoelzl@39097
    14
hoelzl@36624
    15
section "Convex theory"
hoelzl@36080
    16
hoelzl@36624
    17
lemma log_setsum:
hoelzl@36624
    18
  assumes "finite s" "s \<noteq> {}"
hoelzl@36624
    19
  assumes "b > 1"
hoelzl@36624
    20
  assumes "(\<Sum> i \<in> s. a i) = 1"
hoelzl@36624
    21
  assumes "\<And> i. i \<in> s \<Longrightarrow> a i \<ge> 0"
hoelzl@36624
    22
  assumes "\<And> i. i \<in> s \<Longrightarrow> y i \<in> {0 <..}"
hoelzl@36624
    23
  shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
hoelzl@36624
    24
proof -
hoelzl@36624
    25
  have "convex_on {0 <..} (\<lambda> x. - log b x)"
hoelzl@36624
    26
    by (rule minus_log_convex[OF `b > 1`])
hoelzl@36624
    27
  hence "- log b (\<Sum> i \<in> s. a i * y i) \<le> (\<Sum> i \<in> s. a i * - log b (y i))"
hoelzl@36624
    28
    using convex_on_setsum[of _ _ "\<lambda> x. - log b x"] assms pos_is_convex by fastsimp
hoelzl@36624
    29
  thus ?thesis by (auto simp add:setsum_negf le_imp_neg_le)
hoelzl@36624
    30
qed
hoelzl@36080
    31
hoelzl@36624
    32
lemma log_setsum':
hoelzl@36624
    33
  assumes "finite s" "s \<noteq> {}"
hoelzl@36624
    34
  assumes "b > 1"
hoelzl@36624
    35
  assumes "(\<Sum> i \<in> s. a i) = 1"
hoelzl@36624
    36
  assumes pos: "\<And> i. i \<in> s \<Longrightarrow> 0 \<le> a i"
hoelzl@36624
    37
          "\<And> i. \<lbrakk> i \<in> s ; 0 < a i \<rbrakk> \<Longrightarrow> 0 < y i"
hoelzl@36624
    38
  shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
hoelzl@36080
    39
proof -
hoelzl@36624
    40
  have "\<And>y. (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) = (\<Sum> i \<in> s. a i * y i)"
hoelzl@36624
    41
    using assms by (auto intro!: setsum_mono_zero_cong_left)
hoelzl@36624
    42
  moreover have "log b (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) \<ge> (\<Sum> i \<in> s - {i. a i = 0}. a i * log b (y i))"
hoelzl@36624
    43
  proof (rule log_setsum)
hoelzl@36624
    44
    have "setsum a (s - {i. a i = 0}) = setsum a s"
hoelzl@36624
    45
      using assms(1) by (rule setsum_mono_zero_cong_left) auto
hoelzl@36624
    46
    thus sum_1: "setsum a (s - {i. a i = 0}) = 1"
hoelzl@36624
    47
      "finite (s - {i. a i = 0})" using assms by simp_all
hoelzl@36624
    48
hoelzl@36624
    49
    show "s - {i. a i = 0} \<noteq> {}"
hoelzl@36624
    50
    proof
hoelzl@36624
    51
      assume *: "s - {i. a i = 0} = {}"
hoelzl@36624
    52
      hence "setsum a (s - {i. a i = 0}) = 0" by (simp add: * setsum_empty)
hoelzl@36624
    53
      with sum_1 show False by simp
hoelzl@38656
    54
    qed
hoelzl@36624
    55
hoelzl@36624
    56
    fix i assume "i \<in> s - {i. a i = 0}"
hoelzl@36624
    57
    hence "i \<in> s" "a i \<noteq> 0" by simp_all
hoelzl@36624
    58
    thus "0 \<le> a i" "y i \<in> {0<..}" using pos[of i] by auto
hoelzl@36624
    59
  qed fact+
hoelzl@36624
    60
  ultimately show ?thesis by simp
hoelzl@36080
    61
qed
hoelzl@36080
    62
hoelzl@36624
    63
lemma log_setsum_divide:
hoelzl@36624
    64
  assumes "finite S" and "S \<noteq> {}" and "1 < b"
hoelzl@36624
    65
  assumes "(\<Sum>x\<in>S. g x) = 1"
hoelzl@36624
    66
  assumes pos: "\<And>x. x \<in> S \<Longrightarrow> g x \<ge> 0" "\<And>x. x \<in> S \<Longrightarrow> f x \<ge> 0"
hoelzl@36624
    67
  assumes g_pos: "\<And>x. \<lbrakk> x \<in> S ; 0 < g x \<rbrakk> \<Longrightarrow> 0 < f x"
hoelzl@36624
    68
  shows "- (\<Sum>x\<in>S. g x * log b (g x / f x)) \<le> log b (\<Sum>x\<in>S. f x)"
hoelzl@36624
    69
proof -
hoelzl@36624
    70
  have log_mono: "\<And>x y. 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log b x \<le> log b y"
hoelzl@36624
    71
    using `1 < b` by (subst log_le_cancel_iff) auto
hoelzl@36080
    72
hoelzl@36624
    73
  have "- (\<Sum>x\<in>S. g x * log b (g x / f x)) = (\<Sum>x\<in>S. g x * log b (f x / g x))"
hoelzl@36624
    74
  proof (unfold setsum_negf[symmetric], rule setsum_cong)
hoelzl@36624
    75
    fix x assume x: "x \<in> S"
hoelzl@36624
    76
    show "- (g x * log b (g x / f x)) = g x * log b (f x / g x)"
hoelzl@36624
    77
    proof (cases "g x = 0")
hoelzl@36624
    78
      case False
hoelzl@36624
    79
      with pos[OF x] g_pos[OF x] have "0 < f x" "0 < g x" by simp_all
hoelzl@36624
    80
      thus ?thesis using `1 < b` by (simp add: log_divide field_simps)
hoelzl@36624
    81
    qed simp
hoelzl@36624
    82
  qed rule
hoelzl@36624
    83
  also have "... \<le> log b (\<Sum>x\<in>S. g x * (f x / g x))"
hoelzl@36624
    84
  proof (rule log_setsum')
hoelzl@36624
    85
    fix x assume x: "x \<in> S" "0 < g x"
hoelzl@36624
    86
    with g_pos[OF x] show "0 < f x / g x" by (safe intro!: divide_pos_pos)
hoelzl@36624
    87
  qed fact+
hoelzl@36624
    88
  also have "... = log b (\<Sum>x\<in>S - {x. g x = 0}. f x)" using `finite S`
hoelzl@36624
    89
    by (auto intro!: setsum_mono_zero_cong_right arg_cong[where f="log b"]
hoelzl@36624
    90
        split: split_if_asm)
hoelzl@36624
    91
  also have "... \<le> log b (\<Sum>x\<in>S. f x)"
hoelzl@36624
    92
  proof (rule log_mono)
hoelzl@36624
    93
    have "0 = (\<Sum>x\<in>S - {x. g x = 0}. 0)" by simp
hoelzl@36624
    94
    also have "... < (\<Sum>x\<in>S - {x. g x = 0}. f x)" (is "_ < ?sum")
hoelzl@36624
    95
    proof (rule setsum_strict_mono)
hoelzl@36624
    96
      show "finite (S - {x. g x = 0})" using `finite S` by simp
hoelzl@36624
    97
      show "S - {x. g x = 0} \<noteq> {}"
hoelzl@36624
    98
      proof
hoelzl@36624
    99
        assume "S - {x. g x = 0} = {}"
hoelzl@36624
   100
        hence "(\<Sum>x\<in>S. g x) = 0" by (subst setsum_0') auto
hoelzl@36624
   101
        with `(\<Sum>x\<in>S. g x) = 1` show False by simp
hoelzl@36624
   102
      qed
hoelzl@36624
   103
      fix x assume "x \<in> S - {x. g x = 0}"
hoelzl@36624
   104
      thus "0 < f x" using g_pos[of x] pos(1)[of x] by auto
hoelzl@36624
   105
    qed
hoelzl@36624
   106
    finally show "0 < ?sum" .
hoelzl@36624
   107
    show "(\<Sum>x\<in>S - {x. g x = 0}. f x) \<le> (\<Sum>x\<in>S. f x)"
hoelzl@36624
   108
      using `finite S` pos by (auto intro!: setsum_mono2)
hoelzl@36080
   109
  qed
hoelzl@36624
   110
  finally show ?thesis .
hoelzl@36080
   111
qed
hoelzl@36080
   112
hoelzl@39097
   113
lemma split_pairs:
hoelzl@40859
   114
  "((A, B) = X) \<longleftrightarrow> (fst X = A \<and> snd X = B)" and
hoelzl@40859
   115
  "(X = (A, B)) \<longleftrightarrow> (fst X = A \<and> snd X = B)" by auto
hoelzl@38656
   116
hoelzl@38656
   117
section "Information theory"
hoelzl@38656
   118
hoelzl@40859
   119
locale information_space = prob_space +
hoelzl@38656
   120
  fixes b :: real assumes b_gt_1: "1 < b"
hoelzl@38656
   121
hoelzl@40859
   122
context information_space
hoelzl@38656
   123
begin
hoelzl@38656
   124
hoelzl@40859
   125
text {* Introduce some simplification rules for logarithm of base @{term b}. *}
hoelzl@40859
   126
hoelzl@40859
   127
lemma log_neg_const:
hoelzl@40859
   128
  assumes "x \<le> 0"
hoelzl@40859
   129
  shows "log b x = log b 0"
hoelzl@36624
   130
proof -
hoelzl@40859
   131
  { fix u :: real
hoelzl@40859
   132
    have "x \<le> 0" by fact
hoelzl@40859
   133
    also have "0 < exp u"
hoelzl@40859
   134
      using exp_gt_zero .
hoelzl@40859
   135
    finally have "exp u \<noteq> x"
hoelzl@40859
   136
      by auto }
hoelzl@40859
   137
  then show "log b x = log b 0"
hoelzl@40859
   138
    by (simp add: log_def ln_def)
hoelzl@38656
   139
qed
hoelzl@38656
   140
hoelzl@40859
   141
lemma log_mult_eq:
hoelzl@40859
   142
  "log b (A * B) = (if 0 < A * B then log b \<bar>A\<bar> + log b \<bar>B\<bar> else log b 0)"
hoelzl@40859
   143
  using log_mult[of b "\<bar>A\<bar>" "\<bar>B\<bar>"] b_gt_1 log_neg_const[of "A * B"]
hoelzl@40859
   144
  by (auto simp: zero_less_mult_iff mult_le_0_iff)
hoelzl@38656
   145
hoelzl@40859
   146
lemma log_inverse_eq:
hoelzl@40859
   147
  "log b (inverse B) = (if 0 < B then - log b B else log b 0)"
hoelzl@40859
   148
  using log_inverse[of b B] log_neg_const[of "inverse B"] b_gt_1 by simp
hoelzl@36080
   149
hoelzl@40859
   150
lemma log_divide_eq:
hoelzl@40859
   151
  "log b (A / B) = (if 0 < A * B then log b \<bar>A\<bar> - log b \<bar>B\<bar> else log b 0)"
hoelzl@40859
   152
  unfolding divide_inverse log_mult_eq log_inverse_eq abs_inverse
hoelzl@40859
   153
  by (auto simp: zero_less_mult_iff mult_le_0_iff)
hoelzl@38656
   154
hoelzl@40859
   155
lemmas log_simps = log_mult_eq log_inverse_eq log_divide_eq
hoelzl@38656
   156
hoelzl@38656
   157
end
hoelzl@38656
   158
hoelzl@39097
   159
subsection "Kullback$-$Leibler divergence"
hoelzl@36080
   160
hoelzl@39097
   161
text {* The Kullback$-$Leibler divergence is also known as relative entropy or
hoelzl@39097
   162
Kullback$-$Leibler distance. *}
hoelzl@39097
   163
hoelzl@39097
   164
definition
hoelzl@39097
   165
  "KL_divergence b M \<mu> \<nu> =
hoelzl@39097
   166
    measure_space.integral M \<mu> (\<lambda>x. log b (real (sigma_finite_measure.RN_deriv M \<nu> \<mu> x)))"
hoelzl@38656
   167
hoelzl@40859
   168
lemma (in sigma_finite_measure) KL_divergence_cong:
hoelzl@40859
   169
  assumes "measure_space M \<nu>"
hoelzl@40859
   170
  and cong: "\<And>A. A \<in> sets M \<Longrightarrow> \<mu>' A = \<mu> A" "\<And>A. A \<in> sets M \<Longrightarrow> \<nu>' A = \<nu> A"
hoelzl@40859
   171
  shows "KL_divergence b M \<nu>' \<mu>' = KL_divergence b M \<nu> \<mu>"
hoelzl@40859
   172
proof -
hoelzl@40859
   173
  interpret \<nu>: measure_space M \<nu> by fact
hoelzl@40859
   174
  show ?thesis
hoelzl@40859
   175
    unfolding KL_divergence_def
hoelzl@40859
   176
    using RN_deriv_cong[OF cong, of "\<lambda>A. A"]
hoelzl@40859
   177
    by (simp add: cong \<nu>.integral_cong_measure[OF cong(2)])
hoelzl@40859
   178
qed
hoelzl@40859
   179
hoelzl@40859
   180
lemma (in sigma_finite_measure) KL_divergence_vimage:
hoelzl@40859
   181
  assumes f: "bij_betw f S (space M)"
hoelzl@40859
   182
  assumes \<nu>: "measure_space M \<nu>" "absolutely_continuous \<nu>"
hoelzl@40859
   183
  shows "KL_divergence b (vimage_algebra S f) (\<lambda>A. \<nu> (f ` A)) (\<lambda>A. \<mu> (f ` A)) = KL_divergence b M \<nu> \<mu>"
hoelzl@40859
   184
    (is "KL_divergence b ?M ?\<nu> ?\<mu> = _")
hoelzl@40859
   185
proof -
hoelzl@40859
   186
  interpret \<nu>: measure_space M \<nu> by fact
hoelzl@40859
   187
  interpret v: measure_space ?M ?\<nu>
hoelzl@40859
   188
    using f by (rule \<nu>.measure_space_isomorphic)
hoelzl@40859
   189
hoelzl@40859
   190
  let ?RN = "sigma_finite_measure.RN_deriv ?M ?\<mu> ?\<nu>"
hoelzl@41095
   191
  from RN_deriv_vimage[OF f[THEN bij_inv_the_inv_into] \<nu>]
hoelzl@40859
   192
  have *: "\<nu>.almost_everywhere (\<lambda>x. ?RN (the_inv_into S f x) = RN_deriv \<nu> x)"
hoelzl@40859
   193
    by (rule absolutely_continuous_AE[OF \<nu>])
hoelzl@40859
   194
hoelzl@40859
   195
  show ?thesis
hoelzl@40859
   196
    unfolding KL_divergence_def \<nu>.integral_vimage_inv[OF f]
hoelzl@40859
   197
    apply (rule \<nu>.integral_cong_AE)
hoelzl@40859
   198
    apply (rule \<nu>.AE_mp[OF *])
hoelzl@40859
   199
    apply (rule \<nu>.AE_cong)
hoelzl@40859
   200
    apply simp
hoelzl@40859
   201
    done
hoelzl@40859
   202
qed
hoelzl@40859
   203
hoelzl@38656
   204
lemma (in finite_measure_space) KL_divergence_eq_finite:
hoelzl@38656
   205
  assumes v: "finite_measure_space M \<nu>"
hoelzl@40859
   206
  assumes ac: "absolutely_continuous \<nu>"
hoelzl@38656
   207
  shows "KL_divergence b M \<nu> \<mu> = (\<Sum>x\<in>space M. real (\<nu> {x}) * log b (real (\<nu> {x}) / real (\<mu> {x})))" (is "_ = ?sum")
hoelzl@38656
   208
proof (simp add: KL_divergence_def finite_measure_space.integral_finite_singleton[OF v])
hoelzl@38656
   209
  interpret v: finite_measure_space M \<nu> by fact
hoelzl@38656
   210
  have ms: "measure_space M \<nu>" by fact
hoelzl@38656
   211
  show "(\<Sum>x \<in> space M. log b (real (RN_deriv \<nu> x)) * real (\<nu> {x})) = ?sum"
hoelzl@38656
   212
    using RN_deriv_finite_measure[OF ms ac]
hoelzl@41023
   213
    by (auto intro!: setsum_cong simp: field_simps real_of_pextreal_mult[symmetric])
hoelzl@38656
   214
qed
hoelzl@36080
   215
hoelzl@38656
   216
lemma (in finite_prob_space) KL_divergence_positive_finite:
hoelzl@38656
   217
  assumes v: "finite_prob_space M \<nu>"
hoelzl@40859
   218
  assumes ac: "absolutely_continuous \<nu>"
hoelzl@38656
   219
  and "1 < b"
hoelzl@38656
   220
  shows "0 \<le> KL_divergence b M \<nu> \<mu>"
hoelzl@38656
   221
proof -
hoelzl@38656
   222
  interpret v: finite_prob_space M \<nu> using v .
hoelzl@40859
   223
  have ms: "finite_measure_space M \<nu>" by default
hoelzl@38656
   224
hoelzl@40859
   225
  have "- (KL_divergence b M \<nu> \<mu>) \<le> log b (\<Sum>x\<in>space M. real (\<mu> {x}))"
hoelzl@40859
   226
  proof (subst KL_divergence_eq_finite[OF ms ac], safe intro!: log_setsum_divide not_empty)
hoelzl@40859
   227
    show "finite (space M)" using finite_space by simp
hoelzl@40859
   228
    show "1 < b" by fact
hoelzl@40859
   229
    show "(\<Sum>x\<in>space M. real (\<nu> {x})) = 1" using v.finite_sum_over_space_eq_1 by simp
hoelzl@38656
   230
hoelzl@40859
   231
    fix x assume "x \<in> space M"
hoelzl@40859
   232
    then have x: "{x} \<in> sets M" unfolding sets_eq_Pow by auto
hoelzl@40859
   233
    { assume "0 < real (\<nu> {x})"
hoelzl@40859
   234
      then have "\<nu> {x} \<noteq> 0" by auto
hoelzl@40859
   235
      then have "\<mu> {x} \<noteq> 0"
hoelzl@40859
   236
        using ac[unfolded absolutely_continuous_def, THEN bspec, of "{x}"] x by auto
hoelzl@40859
   237
      thus "0 < prob {x}" using finite_measure[of "{x}"] x by auto }
hoelzl@40859
   238
  qed auto
hoelzl@38656
   239
  thus "0 \<le> KL_divergence b M \<nu> \<mu>" using finite_sum_over_space_eq_1 by simp
hoelzl@36080
   240
qed
hoelzl@36080
   241
hoelzl@39097
   242
subsection {* Mutual Information *}
hoelzl@39097
   243
hoelzl@36080
   244
definition (in prob_space)
hoelzl@38656
   245
  "mutual_information b S T X Y =
hoelzl@40859
   246
    KL_divergence b (sigma (pair_algebra S T))
hoelzl@38656
   247
      (joint_distribution X Y)
hoelzl@40859
   248
      (pair_sigma_finite.pair_measure S (distribution X) T (distribution Y))"
hoelzl@36080
   249
hoelzl@40859
   250
definition (in prob_space)
hoelzl@40859
   251
  "entropy b s X = mutual_information b s s X X"
hoelzl@40859
   252
hoelzl@40859
   253
abbreviation (in information_space)
hoelzl@40859
   254
  mutual_information_Pow ("\<I>'(_ ; _')") where
hoelzl@36624
   255
  "\<I>(X ; Y) \<equiv> mutual_information b
hoelzl@36080
   256
    \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
hoelzl@36080
   257
    \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"
hoelzl@36080
   258
hoelzl@40859
   259
lemma (in information_space) mutual_information_commute_generic:
hoelzl@40859
   260
  assumes X: "random_variable S X" and Y: "random_variable T Y"
hoelzl@40859
   261
  assumes ac: "measure_space.absolutely_continuous (sigma (pair_algebra S T))
hoelzl@40859
   262
   (pair_sigma_finite.pair_measure S (distribution X) T (distribution Y)) (joint_distribution X Y)"
hoelzl@40859
   263
  shows "mutual_information b S T X Y = mutual_information b T S Y X"
hoelzl@39092
   264
proof -
hoelzl@40859
   265
  interpret P: prob_space "sigma (pair_algebra S T)" "joint_distribution X Y"
hoelzl@40859
   266
    using random_variable_pairI[OF X Y] by (rule distribution_prob_space)
hoelzl@40859
   267
  interpret Q: prob_space "sigma (pair_algebra T S)" "joint_distribution Y X"
hoelzl@40859
   268
    using random_variable_pairI[OF Y X] by (rule distribution_prob_space)
hoelzl@40859
   269
  interpret X: prob_space S "distribution X" using X by (rule distribution_prob_space)
hoelzl@40859
   270
  interpret Y: prob_space T "distribution Y" using Y by (rule distribution_prob_space)
hoelzl@40859
   271
  interpret ST: pair_sigma_finite S "distribution X" T "distribution Y" by default
hoelzl@40859
   272
  interpret TS: pair_sigma_finite T "distribution Y" S "distribution X" by default
hoelzl@40859
   273
hoelzl@40859
   274
  have ST: "measure_space (sigma (pair_algebra S T)) (joint_distribution X Y)" by default
hoelzl@40859
   275
  have TS: "measure_space (sigma (pair_algebra T S)) (joint_distribution Y X)" by default
hoelzl@40859
   276
hoelzl@40859
   277
  have bij_ST: "bij_betw (\<lambda>(x, y). (y, x)) (space (sigma (pair_algebra S T))) (space (sigma (pair_algebra T S)))"
hoelzl@40859
   278
    by (auto intro!: inj_onI simp: space_pair_algebra bij_betw_def)
hoelzl@40859
   279
  have bij_TS: "bij_betw (\<lambda>(x, y). (y, x)) (space (sigma (pair_algebra T S))) (space (sigma (pair_algebra S T)))"
hoelzl@40859
   280
    by (auto intro!: inj_onI simp: space_pair_algebra bij_betw_def)
hoelzl@40859
   281
hoelzl@40859
   282
  { fix A
hoelzl@40859
   283
    have "joint_distribution X Y ((\<lambda>(x, y). (y, x)) ` A) = joint_distribution Y X A"
hoelzl@40859
   284
      unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>]) }
hoelzl@40859
   285
  note jd_commute = this
hoelzl@40859
   286
hoelzl@40859
   287
  { fix A assume A: "A \<in> sets (sigma (pair_algebra T S))"
hoelzl@41023
   288
    have *: "\<And>x y. indicator ((\<lambda>(x, y). (y, x)) ` A) (x, y) = (indicator A (y, x) :: pextreal)"
hoelzl@40859
   289
      unfolding indicator_def by auto
hoelzl@40859
   290
    have "ST.pair_measure ((\<lambda>(x, y). (y, x)) ` A) = TS.pair_measure A"
hoelzl@40859
   291
      unfolding ST.pair_measure_def TS.pair_measure_def
hoelzl@40859
   292
      using A by (auto simp add: TS.Fubini[symmetric] *) }
hoelzl@40859
   293
  note pair_measure_commute = this
hoelzl@40859
   294
hoelzl@39092
   295
  show ?thesis
hoelzl@40859
   296
    unfolding mutual_information_def
hoelzl@40859
   297
    unfolding ST.KL_divergence_vimage[OF bij_TS ST ac, symmetric]
hoelzl@40859
   298
    unfolding space_sigma space_pair_algebra jd_commute
hoelzl@40859
   299
    unfolding ST.pair_sigma_algebra_swap[symmetric]
hoelzl@40859
   300
    by (simp cong: TS.KL_divergence_cong[OF TS] add: pair_measure_commute)
hoelzl@39092
   301
qed
hoelzl@39092
   302
hoelzl@40859
   303
lemma (in prob_space) finite_variables_absolutely_continuous:
hoelzl@40859
   304
  assumes X: "finite_random_variable S X" and Y: "finite_random_variable T Y"
hoelzl@40859
   305
  shows "measure_space.absolutely_continuous (sigma (pair_algebra S T))
hoelzl@40859
   306
   (pair_sigma_finite.pair_measure S (distribution X) T (distribution Y)) (joint_distribution X Y)"
hoelzl@40859
   307
proof -
hoelzl@40859
   308
  interpret X: finite_prob_space S "distribution X" using X by (rule distribution_finite_prob_space)
hoelzl@40859
   309
  interpret Y: finite_prob_space T "distribution Y" using Y by (rule distribution_finite_prob_space)
hoelzl@40859
   310
  interpret XY: pair_finite_prob_space S "distribution X" T "distribution Y" by default
hoelzl@40859
   311
  interpret P: finite_prob_space XY.P "joint_distribution X Y"
hoelzl@40859
   312
    using assms by (intro joint_distribution_finite_prob_space)
hoelzl@40859
   313
  show "XY.absolutely_continuous (joint_distribution X Y)"
hoelzl@40859
   314
  proof (rule XY.absolutely_continuousI)
hoelzl@40859
   315
    show "finite_measure_space XY.P (joint_distribution X Y)" by default
hoelzl@40859
   316
    fix x assume "x \<in> space XY.P" and "XY.pair_measure {x} = 0"
hoelzl@40859
   317
    then obtain a b where "(a, b) = x" and "a \<in> space S" "b \<in> space T"
hoelzl@40859
   318
      and distr: "distribution X {a} * distribution Y {b} = 0"
hoelzl@40859
   319
      by (cases x) (auto simp: pair_algebra_def)
hoelzl@40859
   320
    with assms[THEN finite_random_variableD]
hoelzl@40859
   321
      joint_distribution_Times_le_fst[of S X T Y "{a}" "{b}"]
hoelzl@40859
   322
      joint_distribution_Times_le_snd[of S X T Y "{a}" "{b}"]
hoelzl@40859
   323
    have "joint_distribution X Y {x} \<le> distribution Y {b}"
hoelzl@40859
   324
         "joint_distribution X Y {x} \<le> distribution X {a}"
hoelzl@40859
   325
      by auto
hoelzl@40859
   326
    with distr show "joint_distribution X Y {x} = 0" by auto
hoelzl@40859
   327
  qed
hoelzl@40859
   328
qed
hoelzl@40859
   329
hoelzl@40859
   330
lemma (in information_space) mutual_information_commute:
hoelzl@40859
   331
  assumes X: "finite_random_variable S X" and Y: "finite_random_variable T Y"
hoelzl@40859
   332
  shows "mutual_information b S T X Y = mutual_information b T S Y X"
hoelzl@40859
   333
  by (intro finite_random_variableD X Y mutual_information_commute_generic finite_variables_absolutely_continuous)
hoelzl@40859
   334
hoelzl@40859
   335
lemma (in information_space) mutual_information_commute_simple:
hoelzl@40859
   336
  assumes X: "simple_function X" and Y: "simple_function Y"
hoelzl@40859
   337
  shows "\<I>(X;Y) = \<I>(Y;X)"
hoelzl@40859
   338
  by (intro X Y simple_function_imp_finite_random_variable mutual_information_commute)
hoelzl@40859
   339
hoelzl@40859
   340
lemma (in information_space)
hoelzl@40859
   341
  assumes MX: "finite_random_variable MX X"
hoelzl@40859
   342
  assumes MY: "finite_random_variable MY Y"
hoelzl@40859
   343
  shows mutual_information_generic_eq:
hoelzl@36624
   344
    "mutual_information b MX MY X Y = (\<Sum> (x,y) \<in> space MX \<times> space MY.
hoelzl@38656
   345
      real (joint_distribution X Y {(x,y)}) *
hoelzl@38656
   346
      log b (real (joint_distribution X Y {(x,y)}) /
hoelzl@38656
   347
      (real (distribution X {x}) * real (distribution Y {y}))))"
hoelzl@40859
   348
    (is ?sum)
hoelzl@36624
   349
  and mutual_information_positive_generic:
hoelzl@40859
   350
     "0 \<le> mutual_information b MX MY X Y" (is ?positive)
hoelzl@36624
   351
proof -
hoelzl@40859
   352
  interpret X: finite_prob_space MX "distribution X" using MX by (rule distribution_finite_prob_space)
hoelzl@40859
   353
  interpret Y: finite_prob_space MY "distribution Y" using MY by (rule distribution_finite_prob_space)
hoelzl@40859
   354
  interpret XY: pair_finite_prob_space MX "distribution X" MY "distribution Y" by default
hoelzl@40859
   355
  interpret P: finite_prob_space XY.P "joint_distribution X Y"
hoelzl@40859
   356
    using assms by (intro joint_distribution_finite_prob_space)
hoelzl@36080
   357
hoelzl@40859
   358
  have P_ms: "finite_measure_space XY.P (joint_distribution X Y)" by default
hoelzl@40859
   359
  have P_ps: "finite_prob_space XY.P (joint_distribution X Y)" by default
hoelzl@36624
   360
hoelzl@40859
   361
  show ?sum
hoelzl@38656
   362
    unfolding Let_def mutual_information_def
hoelzl@40859
   363
    by (subst XY.KL_divergence_eq_finite[OF P_ms finite_variables_absolutely_continuous[OF MX MY]])
hoelzl@41023
   364
       (auto simp add: pair_algebra_def setsum_cartesian_product' real_of_pextreal_mult[symmetric])
hoelzl@36080
   365
hoelzl@36624
   366
  show ?positive
hoelzl@40859
   367
    using XY.KL_divergence_positive_finite[OF P_ps finite_variables_absolutely_continuous[OF MX MY] b_gt_1]
hoelzl@40859
   368
    unfolding mutual_information_def .
hoelzl@36080
   369
qed
hoelzl@36080
   370
hoelzl@40859
   371
lemma (in information_space) mutual_information_eq:
hoelzl@40859
   372
  assumes "simple_function X" "simple_function Y"
hoelzl@40859
   373
  shows "\<I>(X;Y) = (\<Sum> (x,y) \<in> X ` space M \<times> Y ` space M.
hoelzl@38656
   374
    real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) * log b (real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) /
hoelzl@38656
   375
                                                   (real (distribution X {x}) * real (distribution Y {y}))))"
hoelzl@40859
   376
  using assms by (simp add: mutual_information_generic_eq)
hoelzl@36080
   377
hoelzl@40859
   378
lemma (in information_space) mutual_information_generic_cong:
hoelzl@39097
   379
  assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
hoelzl@39097
   380
  assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
hoelzl@40859
   381
  shows "mutual_information b MX MY X Y = mutual_information b MX MY X' Y'"
hoelzl@40859
   382
  unfolding mutual_information_def using X Y
hoelzl@40859
   383
  by (simp cong: distribution_cong)
hoelzl@39097
   384
hoelzl@40859
   385
lemma (in information_space) mutual_information_cong:
hoelzl@40859
   386
  assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
hoelzl@40859
   387
  assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
hoelzl@40859
   388
  shows "\<I>(X; Y) = \<I>(X'; Y')"
hoelzl@40859
   389
  unfolding mutual_information_def using X Y
hoelzl@40859
   390
  by (simp cong: distribution_cong image_cong)
hoelzl@40859
   391
hoelzl@40859
   392
lemma (in information_space) mutual_information_positive:
hoelzl@40859
   393
  assumes "simple_function X" "simple_function Y"
hoelzl@40859
   394
  shows "0 \<le> \<I>(X;Y)"
hoelzl@40859
   395
  using assms by (simp add: mutual_information_positive_generic)
hoelzl@36080
   396
hoelzl@39097
   397
subsection {* Entropy *}
hoelzl@39097
   398
hoelzl@40859
   399
abbreviation (in information_space)
hoelzl@40859
   400
  entropy_Pow ("\<H>'(_')") where
hoelzl@36624
   401
  "\<H>(X) \<equiv> entropy b \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr> X"
hoelzl@36080
   402
hoelzl@40859
   403
lemma (in information_space) entropy_generic_eq:
hoelzl@40859
   404
  assumes MX: "finite_random_variable MX X"
hoelzl@39097
   405
  shows "entropy b MX X = -(\<Sum> x \<in> space MX. real (distribution X {x}) * log b (real (distribution X {x})))"
hoelzl@39097
   406
proof -
hoelzl@40859
   407
  interpret MX: finite_prob_space MX "distribution X" using MX by (rule distribution_finite_prob_space)
hoelzl@39097
   408
  let "?X x" = "real (distribution X {x})"
hoelzl@39097
   409
  let "?XX x y" = "real (joint_distribution X X {(x, y)})"
hoelzl@39097
   410
  { fix x y
hoelzl@39097
   411
    have "(\<lambda>x. (X x, X x)) -` {(x, y)} = (if x = y then X -` {x} else {})" by auto
hoelzl@39097
   412
    then have "?XX x y * log b (?XX x y / (?X x * ?X y)) =
hoelzl@39097
   413
        (if x = y then - ?X y * log b (?X y) else 0)"
hoelzl@40859
   414
      unfolding distribution_def by (auto simp: log_simps zero_less_mult_iff) }
hoelzl@39097
   415
  note remove_XX = this
hoelzl@39097
   416
  show ?thesis
hoelzl@39097
   417
    unfolding entropy_def mutual_information_generic_eq[OF MX MX]
hoelzl@39097
   418
    unfolding setsum_cartesian_product[symmetric] setsum_negf[symmetric] remove_XX
hoelzl@39097
   419
    by (auto simp: setsum_cases MX.finite_space)
hoelzl@39097
   420
qed
hoelzl@36624
   421
hoelzl@40859
   422
lemma (in information_space) entropy_eq:
hoelzl@40859
   423
  assumes "simple_function X"
hoelzl@40859
   424
  shows "\<H>(X) = -(\<Sum> x \<in> X ` space M. real (distribution X {x}) * log b (real (distribution X {x})))"
hoelzl@40859
   425
  using assms by (simp add: entropy_generic_eq)
hoelzl@36080
   426
hoelzl@40859
   427
lemma (in information_space) entropy_positive:
hoelzl@40859
   428
  "simple_function X \<Longrightarrow> 0 \<le> \<H>(X)"
hoelzl@40859
   429
  unfolding entropy_def by (simp add: mutual_information_positive)
hoelzl@36080
   430
hoelzl@40859
   431
lemma (in information_space) entropy_certainty_eq_0:
hoelzl@40859
   432
  assumes "simple_function X" and "x \<in> X ` space M" and "distribution X {x} = 1"
hoelzl@39097
   433
  shows "\<H>(X) = 0"
hoelzl@39097
   434
proof -
hoelzl@39097
   435
  interpret X: finite_prob_space "\<lparr> space = X ` space M, sets = Pow (X ` space M) \<rparr>" "distribution X"
hoelzl@40859
   436
    using simple_function_imp_finite_random_variable[OF `simple_function X`]
hoelzl@40859
   437
    by (rule distribution_finite_prob_space)
hoelzl@39097
   438
  have "distribution X (X ` space M - {x}) = distribution X (X ` space M) - distribution X {x}"
hoelzl@39097
   439
    using X.measure_compl[of "{x}"] assms by auto
hoelzl@39097
   440
  also have "\<dots> = 0" using X.prob_space assms by auto
hoelzl@39097
   441
  finally have X0: "distribution X (X ` space M - {x}) = 0" by auto
hoelzl@39097
   442
  { fix y assume asm: "y \<noteq> x" "y \<in> X ` space M"
hoelzl@39097
   443
    hence "{y} \<subseteq> X ` space M - {x}" by auto
hoelzl@39097
   444
    from X.measure_mono[OF this] X0 asm
hoelzl@39097
   445
    have "distribution X {y} = 0" by auto }
hoelzl@39097
   446
  hence fi: "\<And> y. y \<in> X ` space M \<Longrightarrow> real (distribution X {y}) = (if x = y then 1 else 0)"
hoelzl@39097
   447
    using assms by auto
hoelzl@39097
   448
  have y: "\<And>y. (if x = y then 1 else 0) * log b (if x = y then 1 else 0) = 0" by simp
hoelzl@40859
   449
  show ?thesis unfolding entropy_eq[OF `simple_function X`] by (auto simp: y fi)
hoelzl@39097
   450
qed
hoelzl@39097
   451
hoelzl@40859
   452
lemma (in information_space) entropy_le_card_not_0:
hoelzl@40859
   453
  assumes "simple_function X"
hoelzl@40859
   454
  shows "\<H>(X) \<le> log b (real (card (X ` space M \<inter> {x . distribution X {x} \<noteq> 0})))"
hoelzl@39097
   455
proof -
hoelzl@39097
   456
  let "?d x" = "distribution X {x}"
hoelzl@39097
   457
  let "?p x" = "real (?d x)"
hoelzl@39097
   458
  have "\<H>(X) = (\<Sum>x\<in>X`space M. ?p x * log b (1 / ?p x))"
hoelzl@40859
   459
    by (auto intro!: setsum_cong simp: entropy_eq[OF `simple_function X`] setsum_negf[symmetric] log_simps not_less)
hoelzl@39097
   460
  also have "\<dots> \<le> log b (\<Sum>x\<in>X`space M. ?p x * (1 / ?p x))"
hoelzl@39097
   461
    apply (rule log_setsum')
hoelzl@40859
   462
    using not_empty b_gt_1 `simple_function X` sum_over_space_real_distribution
hoelzl@40859
   463
    by (auto simp: simple_function_def)
hoelzl@39097
   464
  also have "\<dots> = log b (\<Sum>x\<in>X`space M. if ?d x \<noteq> 0 then 1 else 0)"
hoelzl@40859
   465
    using distribution_finite[OF `simple_function X`[THEN simple_function_imp_random_variable], simplified]
hoelzl@41023
   466
    by (intro arg_cong[where f="\<lambda>X. log b X"] setsum_cong) (auto simp: real_of_pextreal_eq_0)
hoelzl@39097
   467
  finally show ?thesis
hoelzl@40859
   468
    using `simple_function X` by (auto simp: setsum_cases real_eq_of_nat simple_function_def)
hoelzl@39097
   469
qed
hoelzl@39097
   470
hoelzl@40859
   471
lemma (in information_space) entropy_uniform_max:
hoelzl@40859
   472
  assumes "simple_function X"
hoelzl@39097
   473
  assumes "\<And>x y. \<lbrakk> x \<in> X ` space M ; y \<in> X ` space M \<rbrakk> \<Longrightarrow> distribution X {x} = distribution X {y}"
hoelzl@39097
   474
  shows "\<H>(X) = log b (real (card (X ` space M)))"
hoelzl@39097
   475
proof -
hoelzl@40859
   476
  interpret X: finite_prob_space "\<lparr> space = X ` space M, sets = Pow (X ` space M) \<rparr>" "distribution X"
hoelzl@40859
   477
    using simple_function_imp_finite_random_variable[OF `simple_function X`]
hoelzl@40859
   478
    by (rule distribution_finite_prob_space)
hoelzl@39097
   479
  have card_gt0: "0 < card (X ` space M)" unfolding card_gt_0_iff
hoelzl@40859
   480
    using `simple_function X` not_empty by (auto simp: simple_function_def)
hoelzl@39097
   481
  { fix x assume "x \<in> X ` space M"
hoelzl@39097
   482
    hence "real (distribution X {x}) = 1 / real (card (X ` space M))"
hoelzl@40859
   483
    proof (rule X.uniform_prob[simplified])
hoelzl@39097
   484
      fix x y assume "x \<in> X`space M" "y \<in> X`space M"
hoelzl@40859
   485
      from assms(2)[OF this] show "real (distribution X {x}) = real (distribution X {y})" by simp
hoelzl@39097
   486
    qed }
hoelzl@39097
   487
  thus ?thesis
hoelzl@40859
   488
    using not_empty X.finite_space b_gt_1 card_gt0
hoelzl@40859
   489
    by (simp add: entropy_eq[OF `simple_function X`] real_eq_of_nat[symmetric] log_simps)
hoelzl@39097
   490
qed
hoelzl@39097
   491
hoelzl@40859
   492
lemma (in information_space) entropy_le_card:
hoelzl@40859
   493
  assumes "simple_function X"
hoelzl@40859
   494
  shows "\<H>(X) \<le> log b (real (card (X ` space M)))"
hoelzl@39097
   495
proof cases
hoelzl@39097
   496
  assume "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} = {}"
hoelzl@39097
   497
  then have "\<And>x. x\<in>X`space M \<Longrightarrow> distribution X {x} = 0" by auto
hoelzl@39097
   498
  moreover
hoelzl@39097
   499
  have "0 < card (X`space M)"
hoelzl@40859
   500
    using `simple_function X` not_empty
hoelzl@40859
   501
    by (auto simp: card_gt_0_iff simple_function_def)
hoelzl@39097
   502
  then have "log b 1 \<le> log b (real (card (X`space M)))"
hoelzl@39097
   503
    using b_gt_1 by (intro log_le) auto
hoelzl@40859
   504
  ultimately show ?thesis using assms by (simp add: entropy_eq)
hoelzl@39097
   505
next
hoelzl@39097
   506
  assume False: "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} \<noteq> {}"
hoelzl@39097
   507
  have "card (X ` space M \<inter> {x. distribution X {x} \<noteq> 0}) \<le> card (X ` space M)"
hoelzl@40859
   508
    (is "?A \<le> ?B") using assms not_empty by (auto intro!: card_mono simp: simple_function_def)
hoelzl@40859
   509
  note entropy_le_card_not_0[OF assms]
hoelzl@39097
   510
  also have "log b (real ?A) \<le> log b (real ?B)"
hoelzl@40859
   511
    using b_gt_1 False not_empty `?A \<le> ?B` assms
hoelzl@40859
   512
    by (auto intro!: log_le simp: card_gt_0_iff simp: simple_function_def)
hoelzl@39097
   513
  finally show ?thesis .
hoelzl@39097
   514
qed
hoelzl@39097
   515
hoelzl@40859
   516
lemma (in information_space) entropy_commute:
hoelzl@40859
   517
  assumes "simple_function X" "simple_function Y"
hoelzl@40859
   518
  shows "\<H>(\<lambda>x. (X x, Y x)) = \<H>(\<lambda>x. (Y x, X x))"
hoelzl@39097
   519
proof -
hoelzl@40859
   520
  have sf: "simple_function (\<lambda>x. (X x, Y x))" "simple_function (\<lambda>x. (Y x, X x))"
hoelzl@40859
   521
    using assms by (auto intro: simple_function_Pair)
hoelzl@39097
   522
  have *: "(\<lambda>x. (Y x, X x))`space M = (\<lambda>(a,b). (b,a))`(\<lambda>x. (X x, Y x))`space M"
hoelzl@39097
   523
    by auto
hoelzl@39097
   524
  have inj: "\<And>X. inj_on (\<lambda>(a,b). (b,a)) X"
hoelzl@39097
   525
    by (auto intro!: inj_onI)
hoelzl@39097
   526
  show ?thesis
hoelzl@40859
   527
    unfolding sf[THEN entropy_eq] unfolding * setsum_reindex[OF inj]
hoelzl@39097
   528
    by (simp add: joint_distribution_commute[of Y X] split_beta)
hoelzl@39097
   529
qed
hoelzl@39097
   530
hoelzl@40859
   531
lemma (in information_space) entropy_eq_cartesian_product:
hoelzl@40859
   532
  assumes "simple_function X" "simple_function Y"
hoelzl@40859
   533
  shows "\<H>(\<lambda>x. (X x, Y x)) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M.
hoelzl@39097
   534
    real (joint_distribution X Y {(x,y)}) *
hoelzl@39097
   535
    log b (real (joint_distribution X Y {(x,y)})))"
hoelzl@39097
   536
proof -
hoelzl@40859
   537
  have sf: "simple_function (\<lambda>x. (X x, Y x))"
hoelzl@40859
   538
    using assms by (auto intro: simple_function_Pair)
hoelzl@39097
   539
  { fix x assume "x\<notin>(\<lambda>x. (X x, Y x))`space M"
hoelzl@39097
   540
    then have "(\<lambda>x. (X x, Y x)) -` {x} \<inter> space M = {}" by auto
hoelzl@39097
   541
    then have "joint_distribution X Y {x} = 0"
hoelzl@39097
   542
      unfolding distribution_def by auto }
hoelzl@40859
   543
  then show ?thesis using sf assms
hoelzl@40859
   544
    unfolding entropy_eq[OF sf] neg_equal_iff_equal setsum_cartesian_product
hoelzl@40859
   545
    by (auto intro!: setsum_mono_zero_cong_left simp: simple_function_def)
hoelzl@39097
   546
qed
hoelzl@39097
   547
hoelzl@39097
   548
subsection {* Conditional Mutual Information *}
hoelzl@39097
   549
hoelzl@36080
   550
definition (in prob_space)
hoelzl@38656
   551
  "conditional_mutual_information b M1 M2 M3 X Y Z \<equiv>
hoelzl@40859
   552
    mutual_information b M1 (sigma (pair_algebra M2 M3)) X (\<lambda>x. (Y x, Z x)) -
hoelzl@38656
   553
    mutual_information b M1 M3 X Z"
hoelzl@36080
   554
hoelzl@40859
   555
abbreviation (in information_space)
hoelzl@40859
   556
  conditional_mutual_information_Pow ("\<I>'( _ ; _ | _ ')") where
hoelzl@36624
   557
  "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b
hoelzl@36080
   558
    \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
hoelzl@36080
   559
    \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr>
hoelzl@36080
   560
    \<lparr> space = Z`space M, sets = Pow (Z`space M) \<rparr>
hoelzl@36080
   561
    X Y Z"
hoelzl@36080
   562
hoelzl@38656
   563
hoelzl@40859
   564
lemma (in information_space) conditional_mutual_information_generic_eq:
hoelzl@40859
   565
  assumes MX: "finite_random_variable MX X"
hoelzl@40859
   566
    and MY: "finite_random_variable MY Y"
hoelzl@40859
   567
    and MZ: "finite_random_variable MZ Z"
hoelzl@40859
   568
  shows "conditional_mutual_information b MX MY MZ X Y Z = (\<Sum>(x, y, z) \<in> space MX \<times> space MY \<times> space MZ.
hoelzl@38656
   569
             real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
hoelzl@38656
   570
             log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) /
hoelzl@38656
   571
    (real (joint_distribution X Z {(x, z)}) * real (joint_distribution Y Z {(y,z)} / distribution Z {z}))))"
hoelzl@40859
   572
  (is "_ = (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?XZ x z * ?YZdZ y z)))")
hoelzl@40859
   573
proof -
hoelzl@40859
   574
  let ?YZ = "\<lambda>y z. real (joint_distribution Y Z {(y, z)})"
hoelzl@40859
   575
  let ?X = "\<lambda>x. real (distribution X {x})"
hoelzl@40859
   576
  let ?Z = "\<lambda>z. real (distribution Z {z})"
hoelzl@40859
   577
hoelzl@40859
   578
  txt {* This proof is actually quiet easy, however we need to show that the
hoelzl@40859
   579
    distributions are finite and the joint distributions are zero when one of
hoelzl@40859
   580
    the variables distribution is also zero. *}
hoelzl@40859
   581
hoelzl@40859
   582
  note finite_var = MX MY MZ
hoelzl@40859
   583
  note random_var = finite_var[THEN finite_random_variableD]
hoelzl@40859
   584
hoelzl@40859
   585
  note space_simps = space_pair_algebra space_sigma algebra.simps
hoelzl@40859
   586
hoelzl@40859
   587
  note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
hoelzl@40859
   588
  note XZ = finite_random_variable_pairI[OF finite_var(1,3)]
hoelzl@40859
   589
  note ZX = finite_random_variable_pairI[OF finite_var(3,1)]
hoelzl@40859
   590
  note YZX = finite_random_variable_pairI[OF finite_var(2) ZX]
hoelzl@40859
   591
  note order1 =
hoelzl@40859
   592
    finite_distribution_order(5,6)[OF finite_var(1) YZ, simplified space_simps]
hoelzl@40859
   593
    finite_distribution_order(5,6)[OF finite_var(1,3), simplified space_simps]
hoelzl@40859
   594
hoelzl@40859
   595
  note finite = finite_var(1) YZ finite_var(3) XZ YZX
hoelzl@40859
   596
  note finite[THEN finite_distribution_finite, simplified space_simps, simp]
hoelzl@40859
   597
hoelzl@40859
   598
  have order2: "\<And>x y z. \<lbrakk>x \<in> space MX; y \<in> space MY; z \<in> space MZ; joint_distribution X Z {(x, z)} = 0\<rbrakk>
hoelzl@40859
   599
          \<Longrightarrow> joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)} = 0"
hoelzl@40859
   600
    unfolding joint_distribution_commute_singleton[of X]
hoelzl@40859
   601
    unfolding joint_distribution_assoc_singleton[symmetric]
hoelzl@40859
   602
    using finite_distribution_order(6)[OF finite_var(2) ZX]
hoelzl@40859
   603
    by (auto simp: space_simps)
hoelzl@36624
   604
hoelzl@40859
   605
  have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?XZ x z * ?YZdZ y z))) =
hoelzl@40859
   606
    (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * (log b (?XYZ x y z / (?X x * ?YZ y z)) - log b (?XZ x z / (?X x * ?Z z))))"
hoelzl@40859
   607
    (is "(\<Sum>(x, y, z)\<in>?S. ?L x y z) = (\<Sum>(x, y, z)\<in>?S. ?R x y z)")
hoelzl@40859
   608
  proof (safe intro!: setsum_cong)
hoelzl@40859
   609
    fix x y z assume space: "x \<in> space MX" "y \<in> space MY" "z \<in> space MZ"
hoelzl@40859
   610
    then have *: "?XYZ x y z / (?XZ x z * ?YZdZ y z) =
hoelzl@40859
   611
      (?XYZ x y z / (?X x * ?YZ y z)) / (?XZ x z / (?X x * ?Z z))"
hoelzl@40859
   612
      using order1(3)
hoelzl@41023
   613
      by (auto simp: real_of_pextreal_mult[symmetric] real_of_pextreal_eq_0)
hoelzl@40859
   614
    show "?L x y z = ?R x y z"
hoelzl@40859
   615
    proof cases
hoelzl@40859
   616
      assume "?XYZ x y z \<noteq> 0"
hoelzl@40859
   617
      with space b_gt_1 order1 order2 show ?thesis unfolding *
hoelzl@40859
   618
        by (subst log_divide)
hoelzl@41023
   619
           (auto simp: zero_less_divide_iff zero_less_real_of_pextreal
hoelzl@41023
   620
                       real_of_pextreal_eq_0 zero_less_mult_iff)
hoelzl@40859
   621
    qed simp
hoelzl@40859
   622
  qed
hoelzl@40859
   623
  also have "\<dots> = (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?X x * ?YZ y z))) -
hoelzl@40859
   624
                  (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XZ x z / (?X x * ?Z z)))"
hoelzl@40859
   625
    by (auto simp add: setsum_subtractf[symmetric] field_simps intro!: setsum_cong)
hoelzl@40859
   626
  also have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XZ x z / (?X x * ?Z z))) =
hoelzl@40859
   627
             (\<Sum>(x, z)\<in>space MX \<times> space MZ. ?XZ x z * log b (?XZ x z / (?X x * ?Z z)))"
hoelzl@40859
   628
    unfolding setsum_cartesian_product[symmetric] setsum_commute[of _ _ "space MY"]
hoelzl@40859
   629
              setsum_left_distrib[symmetric]
hoelzl@40859
   630
    unfolding joint_distribution_commute_singleton[of X]
hoelzl@40859
   631
    unfolding joint_distribution_assoc_singleton[symmetric]
hoelzl@40859
   632
    using setsum_real_joint_distribution_singleton[OF finite_var(2) ZX, unfolded space_simps]
hoelzl@40859
   633
    by (intro setsum_cong refl) simp
hoelzl@40859
   634
  also have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?X x * ?YZ y z))) -
hoelzl@40859
   635
             (\<Sum>(x, z)\<in>space MX \<times> space MZ. ?XZ x z * log b (?XZ x z / (?X x * ?Z z))) =
hoelzl@40859
   636
             conditional_mutual_information b MX MY MZ X Y Z"
hoelzl@40859
   637
    unfolding conditional_mutual_information_def
hoelzl@40859
   638
    unfolding mutual_information_generic_eq[OF finite_var(1,3)]
hoelzl@40859
   639
    unfolding mutual_information_generic_eq[OF finite_var(1) YZ]
hoelzl@40859
   640
    by (simp add: space_sigma space_pair_algebra setsum_cartesian_product')
hoelzl@40859
   641
  finally show ?thesis by simp
hoelzl@40859
   642
qed
hoelzl@40859
   643
hoelzl@40859
   644
lemma (in information_space) conditional_mutual_information_eq:
hoelzl@40859
   645
  assumes "simple_function X" "simple_function Y" "simple_function Z"
hoelzl@40859
   646
  shows "\<I>(X;Y|Z) = (\<Sum>(x, y, z) \<in> X`space M \<times> Y`space M \<times> Z`space M.
hoelzl@40859
   647
             real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
hoelzl@40859
   648
             log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) /
hoelzl@40859
   649
    (real (joint_distribution X Z {(x, z)}) * real (joint_distribution Y Z {(y,z)} / distribution Z {z}))))"
hoelzl@40859
   650
  using conditional_mutual_information_generic_eq[OF assms[THEN simple_function_imp_finite_random_variable]]
hoelzl@40859
   651
  by simp
hoelzl@40859
   652
hoelzl@40859
   653
lemma (in information_space) conditional_mutual_information_eq_mutual_information:
hoelzl@40859
   654
  assumes X: "simple_function X" and Y: "simple_function Y"
hoelzl@40859
   655
  shows "\<I>(X ; Y) = \<I>(X ; Y | (\<lambda>x. ()))"
hoelzl@36624
   656
proof -
hoelzl@36624
   657
  have [simp]: "(\<lambda>x. ()) ` space M = {()}" using not_empty by auto
hoelzl@40859
   658
  have C: "simple_function (\<lambda>x. ())" by auto
hoelzl@36624
   659
  show ?thesis
hoelzl@40859
   660
    unfolding conditional_mutual_information_eq[OF X Y C]
hoelzl@40859
   661
    unfolding mutual_information_eq[OF X Y]
hoelzl@36624
   662
    by (simp add: setsum_cartesian_product' distribution_remove_const)
hoelzl@36624
   663
qed
hoelzl@36624
   664
hoelzl@40859
   665
lemma (in prob_space) distribution_unit[simp]: "distribution (\<lambda>x. ()) {()} = 1"
hoelzl@40859
   666
  unfolding distribution_def using measure_space_1 by auto
hoelzl@40859
   667
hoelzl@40859
   668
lemma (in prob_space) joint_distribution_unit[simp]: "distribution (\<lambda>x. (X x, ())) {(a, ())} = distribution X {a}"
hoelzl@40859
   669
  unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>])
hoelzl@40859
   670
hoelzl@40859
   671
lemma (in prob_space) setsum_distribution:
hoelzl@40859
   672
  assumes X: "finite_random_variable MX X" shows "(\<Sum>a\<in>space MX. distribution X {a}) = 1"
hoelzl@40859
   673
  using setsum_joint_distribution[OF assms, of "\<lparr> space = UNIV, sets = Pow UNIV \<rparr>" "\<lambda>x. ()" "{()}"]
hoelzl@40859
   674
  using sigma_algebra_Pow[of "UNIV::unit set"] by simp
hoelzl@40859
   675
hoelzl@40859
   676
lemma (in prob_space) setsum_real_distribution:
hoelzl@40859
   677
  assumes X: "finite_random_variable MX X" shows "(\<Sum>a\<in>space MX. real (distribution X {a})) = 1"
hoelzl@40859
   678
  using setsum_real_joint_distribution[OF assms, of "\<lparr> space = UNIV, sets = Pow UNIV \<rparr>" "\<lambda>x. ()" "{()}"]
hoelzl@40859
   679
  using sigma_algebra_Pow[of "UNIV::unit set"] by simp
hoelzl@40859
   680
hoelzl@40859
   681
lemma (in information_space) conditional_mutual_information_generic_positive:
hoelzl@40859
   682
  assumes "finite_random_variable MX X" and "finite_random_variable MY Y" and "finite_random_variable MZ Z"
hoelzl@40859
   683
  shows "0 \<le> conditional_mutual_information b MX MY MZ X Y Z"
hoelzl@40859
   684
proof (cases "space MX \<times> space MY \<times> space MZ = {}")
hoelzl@40859
   685
  case True show ?thesis
hoelzl@40859
   686
    unfolding conditional_mutual_information_generic_eq[OF assms] True
hoelzl@40859
   687
    by simp
hoelzl@40859
   688
next
hoelzl@40859
   689
  case False
hoelzl@38656
   690
  let "?dXYZ A" = "real (distribution (\<lambda>x. (X x, Y x, Z x)) A)"
hoelzl@38656
   691
  let "?dXZ A" = "real (joint_distribution X Z A)"
hoelzl@38656
   692
  let "?dYZ A" = "real (joint_distribution Y Z A)"
hoelzl@38656
   693
  let "?dX A" = "real (distribution X A)"
hoelzl@38656
   694
  let "?dZ A" = "real (distribution Z A)"
hoelzl@40859
   695
  let ?M = "space MX \<times> space MY \<times> space MZ"
hoelzl@36624
   696
nipkow@39302
   697
  have split_beta: "\<And>f. split f = (\<lambda>x. f (fst x) (snd x))" by (simp add: fun_eq_iff)
hoelzl@36080
   698
hoelzl@40859
   699
  note space_simps = space_pair_algebra space_sigma algebra.simps
hoelzl@40859
   700
hoelzl@40859
   701
  note finite_var = assms
hoelzl@40859
   702
  note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
hoelzl@40859
   703
  note XZ = finite_random_variable_pairI[OF finite_var(1,3)]
hoelzl@40859
   704
  note ZX = finite_random_variable_pairI[OF finite_var(3,1)]
hoelzl@40859
   705
  note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
hoelzl@40859
   706
  note XYZ = finite_random_variable_pairI[OF finite_var(1) YZ]
hoelzl@40859
   707
  note finite = finite_var(3) YZ XZ XYZ
hoelzl@40859
   708
  note finite = finite[THEN finite_distribution_finite, simplified space_simps]
hoelzl@40859
   709
hoelzl@40859
   710
  have order: "\<And>x y z. \<lbrakk>x \<in> space MX; y \<in> space MY; z \<in> space MZ; joint_distribution X Z {(x, z)} = 0\<rbrakk>
hoelzl@40859
   711
          \<Longrightarrow> joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)} = 0"
hoelzl@40859
   712
    unfolding joint_distribution_commute_singleton[of X]
hoelzl@40859
   713
    unfolding joint_distribution_assoc_singleton[symmetric]
hoelzl@40859
   714
    using finite_distribution_order(6)[OF finite_var(2) ZX]
hoelzl@40859
   715
    by (auto simp: space_simps)
hoelzl@40859
   716
hoelzl@40859
   717
  note order = order
hoelzl@40859
   718
    finite_distribution_order(5,6)[OF finite_var(1) YZ, simplified space_simps]
hoelzl@40859
   719
    finite_distribution_order(5,6)[OF finite_var(2,3), simplified space_simps]
hoelzl@40859
   720
hoelzl@40859
   721
  have "- conditional_mutual_information b MX MY MZ X Y Z = - (\<Sum>(x, y, z) \<in> ?M. ?dXYZ {(x, y, z)} *
hoelzl@40859
   722
    log b (?dXYZ {(x, y, z)} / (?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})))"
hoelzl@40859
   723
    unfolding conditional_mutual_information_generic_eq[OF assms] neg_equal_iff_equal
hoelzl@41023
   724
    by (intro setsum_cong) (auto intro!: arg_cong[where f="log b"] simp: real_of_pextreal_mult[symmetric])
hoelzl@40859
   725
  also have "\<dots> \<le> log b (\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})"
hoelzl@36624
   726
    unfolding split_beta
hoelzl@36624
   727
  proof (rule log_setsum_divide)
hoelzl@40859
   728
    show "?M \<noteq> {}" using False by simp
hoelzl@36624
   729
    show "1 < b" using b_gt_1 .
hoelzl@36080
   730
hoelzl@40859
   731
    show "finite ?M" using assms
hoelzl@40859
   732
      unfolding finite_sigma_algebra_def finite_sigma_algebra_axioms_def by auto
hoelzl@40859
   733
hoelzl@40859
   734
    show "(\<Sum>x\<in>?M. ?dXYZ {(fst x, fst (snd x), snd (snd x))}) = 1"
hoelzl@40859
   735
      unfolding setsum_cartesian_product'
hoelzl@40859
   736
      unfolding setsum_commute[of _ "space MY"]
hoelzl@40859
   737
      unfolding setsum_commute[of _ "space MZ"]
hoelzl@40859
   738
      by (simp_all add: space_pair_algebra
hoelzl@40859
   739
        setsum_real_joint_distribution_singleton[OF `finite_random_variable MX X` YZ]
hoelzl@40859
   740
        setsum_real_joint_distribution_singleton[OF `finite_random_variable MY Y` finite_var(3)]
hoelzl@40859
   741
        setsum_real_distribution[OF `finite_random_variable MZ Z`])
hoelzl@40859
   742
hoelzl@36624
   743
    fix x assume "x \<in> ?M"
hoelzl@38656
   744
    let ?x = "(fst x, fst (snd x), snd (snd x))"
hoelzl@38656
   745
hoelzl@41023
   746
    show "0 \<le> ?dXYZ {?x}" using real_pextreal_nonneg .
hoelzl@36624
   747
    show "0 \<le> ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
hoelzl@41023
   748
     by (simp add: real_pextreal_nonneg mult_nonneg_nonneg divide_nonneg_nonneg)
hoelzl@36080
   749
hoelzl@38656
   750
    assume *: "0 < ?dXYZ {?x}"
hoelzl@40859
   751
    with `x \<in> ?M` show "0 < ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
hoelzl@40859
   752
      using finite order
hoelzl@40859
   753
      by (cases x)
hoelzl@41023
   754
         (auto simp add: zero_less_real_of_pextreal zero_less_mult_iff zero_less_divide_iff)
hoelzl@40859
   755
  qed
hoelzl@40859
   756
  also have "(\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z}) = (\<Sum>z\<in>space MZ. ?dZ {z})"
hoelzl@36624
   757
    apply (simp add: setsum_cartesian_product')
hoelzl@36624
   758
    apply (subst setsum_commute)
hoelzl@36624
   759
    apply (subst (2) setsum_commute)
hoelzl@40859
   760
    by (auto simp: setsum_divide_distrib[symmetric] setsum_product[symmetric]
hoelzl@40859
   761
                   setsum_real_joint_distribution_singleton[OF finite_var(1,3)]
hoelzl@40859
   762
                   setsum_real_joint_distribution_singleton[OF finite_var(2,3)]
hoelzl@36624
   763
          intro!: setsum_cong)
hoelzl@40859
   764
  also have "log b (\<Sum>z\<in>space MZ. ?dZ {z}) = 0"
hoelzl@40859
   765
    unfolding setsum_real_distribution[OF finite_var(3)] by simp
hoelzl@40859
   766
  finally show ?thesis by simp
hoelzl@36080
   767
qed
hoelzl@36080
   768
hoelzl@40859
   769
lemma (in information_space) conditional_mutual_information_positive:
hoelzl@40859
   770
  assumes "simple_function X" and "simple_function Y" and "simple_function Z"
hoelzl@40859
   771
  shows "0 \<le> \<I>(X;Y|Z)"
hoelzl@40859
   772
  using conditional_mutual_information_generic_positive[OF assms[THEN simple_function_imp_finite_random_variable]]
hoelzl@40859
   773
  by simp
hoelzl@40859
   774
hoelzl@39097
   775
subsection {* Conditional Entropy *}
hoelzl@39097
   776
hoelzl@36080
   777
definition (in prob_space)
hoelzl@36080
   778
  "conditional_entropy b S T X Y = conditional_mutual_information b S S T X X Y"
hoelzl@36080
   779
hoelzl@40859
   780
abbreviation (in information_space)
hoelzl@40859
   781
  conditional_entropy_Pow ("\<H>'(_ | _')") where
hoelzl@36624
   782
  "\<H>(X | Y) \<equiv> conditional_entropy b
hoelzl@36080
   783
    \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
hoelzl@36080
   784
    \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"
hoelzl@36080
   785
hoelzl@40859
   786
lemma (in information_space) conditional_entropy_positive:
hoelzl@40859
   787
  "simple_function X \<Longrightarrow> simple_function Y \<Longrightarrow> 0 \<le> \<H>(X | Y)"
hoelzl@40859
   788
  unfolding conditional_entropy_def by (auto intro!: conditional_mutual_information_positive)
hoelzl@36080
   789
hoelzl@40859
   790
lemma (in measure_space) empty_measureI: "A = {} \<Longrightarrow> \<mu> A = 0" by simp
hoelzl@40859
   791
hoelzl@40859
   792
lemma (in information_space) conditional_entropy_generic_eq:
hoelzl@40859
   793
  assumes MX: "finite_random_variable MX X"
hoelzl@40859
   794
  assumes MZ: "finite_random_variable MZ Z"
hoelzl@39097
   795
  shows "conditional_entropy b MX MZ X Z =
hoelzl@39097
   796
     - (\<Sum>(x, z)\<in>space MX \<times> space MZ.
hoelzl@39097
   797
         real (joint_distribution X Z {(x, z)}) *
hoelzl@39097
   798
         log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
hoelzl@40859
   799
proof -
hoelzl@40859
   800
  interpret MX: finite_sigma_algebra MX using MX by simp
hoelzl@40859
   801
  interpret MZ: finite_sigma_algebra MZ using MZ by simp
hoelzl@40859
   802
  let "?XXZ x y z" = "joint_distribution X (\<lambda>x. (X x, Z x)) {(x, y, z)}"
hoelzl@40859
   803
  let "?XZ x z" = "joint_distribution X Z {(x, z)}"
hoelzl@40859
   804
  let "?Z z" = "distribution Z {z}"
hoelzl@40859
   805
  let "?f x y z" = "log b (real (?XXZ x y z) / (real (?XZ x z) * real (?XZ y z / ?Z z)))"
hoelzl@40859
   806
  { fix x z have "?XXZ x x z = ?XZ x z"
hoelzl@40859
   807
      unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>]) }
hoelzl@40859
   808
  note this[simp]
hoelzl@40859
   809
  { fix x x' :: 'b and z assume "x' \<noteq> x"
hoelzl@40859
   810
    then have "?XXZ x x' z = 0"
hoelzl@40859
   811
      by (auto simp: distribution_def intro!: arg_cong[where f=\<mu>] empty_measureI) }
hoelzl@40859
   812
  note this[simp]
hoelzl@40859
   813
  { fix x x' z assume *: "x \<in> space MX" "z \<in> space MZ"
hoelzl@40859
   814
    then have "(\<Sum>x'\<in>space MX. real (?XXZ x x' z) * ?f x x' z)
hoelzl@40859
   815
      = (\<Sum>x'\<in>space MX. if x = x' then real (?XZ x z) * ?f x x z else 0)"
hoelzl@40859
   816
      by (auto intro!: setsum_cong)
hoelzl@40859
   817
    also have "\<dots> = real (?XZ x z) * ?f x x z"
hoelzl@40859
   818
      using `x \<in> space MX` by (simp add: setsum_cases[OF MX.finite_space])
hoelzl@40859
   819
    also have "\<dots> = real (?XZ x z) * log b (real (?Z z) / real (?XZ x z))"
hoelzl@41023
   820
      by (auto simp: real_of_pextreal_mult[symmetric])
hoelzl@40859
   821
    also have "\<dots> = - real (?XZ x z) * log b (real (?XZ x z) / real (?Z z))"
hoelzl@40859
   822
      using assms[THEN finite_distribution_finite]
hoelzl@40859
   823
      using finite_distribution_order(6)[OF MX MZ]
hoelzl@41023
   824
      by (auto simp: log_simps field_simps zero_less_mult_iff zero_less_real_of_pextreal real_of_pextreal_eq_0)
hoelzl@40859
   825
    finally have "(\<Sum>x'\<in>space MX. real (?XXZ x x' z) * ?f x x' z) =
hoelzl@40859
   826
      - real (?XZ x z) * log b (real (?XZ x z) / real (?Z z))" . }
hoelzl@40859
   827
  note * = this
hoelzl@40859
   828
hoelzl@40859
   829
  show ?thesis
hoelzl@40859
   830
    unfolding conditional_entropy_def
hoelzl@40859
   831
    unfolding conditional_mutual_information_generic_eq[OF MX MX MZ]
hoelzl@40859
   832
    by (auto simp: setsum_cartesian_product' setsum_negf[symmetric]
hoelzl@41023
   833
                   setsum_commute[of _ "space MZ"] *   simp del: divide_pextreal_def
hoelzl@40859
   834
             intro!: setsum_cong)
hoelzl@39097
   835
qed
hoelzl@39097
   836
hoelzl@40859
   837
lemma (in information_space) conditional_entropy_eq:
hoelzl@40859
   838
  assumes "simple_function X" "simple_function Z"
hoelzl@40859
   839
  shows "\<H>(X | Z) =
hoelzl@36080
   840
     - (\<Sum>(x, z)\<in>X ` space M \<times> Z ` space M.
hoelzl@38656
   841
         real (joint_distribution X Z {(x, z)}) *
hoelzl@38656
   842
         log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
hoelzl@40859
   843
  using conditional_entropy_generic_eq[OF assms[THEN simple_function_imp_finite_random_variable]]
hoelzl@40859
   844
  by simp
hoelzl@39097
   845
hoelzl@40859
   846
lemma (in information_space) conditional_entropy_eq_ce_with_hypothesis:
hoelzl@40859
   847
  assumes X: "simple_function X" and Y: "simple_function Y"
hoelzl@40859
   848
  shows "\<H>(X | Y) =
hoelzl@39097
   849
    -(\<Sum>y\<in>Y`space M. real (distribution Y {y}) *
hoelzl@39097
   850
      (\<Sum>x\<in>X`space M. real (joint_distribution X Y {(x,y)}) / real (distribution Y {(y)}) *
hoelzl@39097
   851
              log b (real (joint_distribution X Y {(x,y)}) / real (distribution Y {(y)}))))"
hoelzl@40859
   852
  unfolding conditional_entropy_eq[OF assms]
hoelzl@40859
   853
  using finite_distribution_finite[OF finite_random_variable_pairI[OF assms[THEN simple_function_imp_finite_random_variable]]]
hoelzl@40859
   854
  using finite_distribution_order(5,6)[OF assms[THEN simple_function_imp_finite_random_variable]]
hoelzl@40859
   855
  using finite_distribution_finite[OF Y[THEN simple_function_imp_finite_random_variable]]
hoelzl@41023
   856
  by (auto simp: setsum_cartesian_product'  setsum_commute[of _ "Y`space M"] setsum_right_distrib real_of_pextreal_eq_0
hoelzl@40859
   857
           intro!: setsum_cong)
hoelzl@39097
   858
hoelzl@40859
   859
lemma (in information_space) conditional_entropy_eq_cartesian_product:
hoelzl@40859
   860
  assumes "simple_function X" "simple_function Y"
hoelzl@40859
   861
  shows "\<H>(X | Y) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M.
hoelzl@39097
   862
    real (joint_distribution X Y {(x,y)}) *
hoelzl@39097
   863
    log b (real (joint_distribution X Y {(x,y)}) / real (distribution Y {y})))"
hoelzl@40859
   864
  unfolding conditional_entropy_eq[OF assms]
hoelzl@40859
   865
  by (auto intro!: setsum_cong simp: setsum_cartesian_product')
hoelzl@36080
   866
hoelzl@39097
   867
subsection {* Equalities *}
hoelzl@39097
   868
hoelzl@40859
   869
lemma (in information_space) mutual_information_eq_entropy_conditional_entropy:
hoelzl@40859
   870
  assumes X: "simple_function X" and Z: "simple_function Z"
hoelzl@40859
   871
  shows  "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)"
hoelzl@40859
   872
proof -
hoelzl@40859
   873
  let "?XZ x z" = "real (joint_distribution X Z {(x, z)})"
hoelzl@40859
   874
  let "?Z z" = "real (distribution Z {z})"
hoelzl@40859
   875
  let "?X x" = "real (distribution X {x})"
hoelzl@40859
   876
  note fX = X[THEN simple_function_imp_finite_random_variable]
hoelzl@40859
   877
  note fZ = Z[THEN simple_function_imp_finite_random_variable]
hoelzl@40859
   878
  note fX[THEN finite_distribution_finite, simp] and fZ[THEN finite_distribution_finite, simp]
hoelzl@40859
   879
  note finite_distribution_order[OF fX fZ, simp]
hoelzl@40859
   880
  { fix x z assume "x \<in> X`space M" "z \<in> Z`space M"
hoelzl@40859
   881
    have "?XZ x z * log b (?XZ x z / (?X x * ?Z z)) =
hoelzl@40859
   882
          ?XZ x z * log b (?XZ x z / ?Z z) - ?XZ x z * log b (?X x)"
hoelzl@41023
   883
      by (auto simp: log_simps real_of_pextreal_mult[symmetric] zero_less_mult_iff
hoelzl@41023
   884
                     zero_less_real_of_pextreal field_simps real_of_pextreal_eq_0 abs_mult) }
hoelzl@40859
   885
  note * = this
hoelzl@40859
   886
  show ?thesis
hoelzl@40859
   887
    unfolding entropy_eq[OF X] conditional_entropy_eq[OF X Z] mutual_information_eq[OF X Z]
hoelzl@40859
   888
    using setsum_real_joint_distribution_singleton[OF fZ fX, unfolded joint_distribution_commute_singleton[of Z X]]
hoelzl@40859
   889
    by (simp add: * setsum_cartesian_product' setsum_subtractf setsum_left_distrib[symmetric]
hoelzl@40859
   890
                     setsum_real_distribution)
hoelzl@40859
   891
qed
hoelzl@36080
   892
hoelzl@40859
   893
lemma (in information_space) conditional_entropy_less_eq_entropy:
hoelzl@40859
   894
  assumes X: "simple_function X" and Z: "simple_function Z"
hoelzl@40859
   895
  shows "\<H>(X | Z) \<le> \<H>(X)"
hoelzl@36624
   896
proof -
hoelzl@40859
   897
  have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy[OF assms] .
hoelzl@40859
   898
  with mutual_information_positive[OF X Z] entropy_positive[OF X]
hoelzl@36624
   899
  show ?thesis by auto
hoelzl@36080
   900
qed
hoelzl@36080
   901
hoelzl@40859
   902
lemma (in information_space) entropy_chain_rule:
hoelzl@40859
   903
  assumes X: "simple_function X" and Y: "simple_function Y"
hoelzl@40859
   904
  shows  "\<H>(\<lambda>x. (X x, Y x)) = \<H>(X) + \<H>(Y|X)"
hoelzl@40859
   905
proof -
hoelzl@40859
   906
  let "?XY x y" = "real (joint_distribution X Y {(x, y)})"
hoelzl@40859
   907
  let "?Y y" = "real (distribution Y {y})"
hoelzl@40859
   908
  let "?X x" = "real (distribution X {x})"
hoelzl@40859
   909
  note fX = X[THEN simple_function_imp_finite_random_variable]
hoelzl@40859
   910
  note fY = Y[THEN simple_function_imp_finite_random_variable]
hoelzl@40859
   911
  note fX[THEN finite_distribution_finite, simp] and fY[THEN finite_distribution_finite, simp]
hoelzl@40859
   912
  note finite_distribution_order[OF fX fY, simp]
hoelzl@40859
   913
  { fix x y assume "x \<in> X`space M" "y \<in> Y`space M"
hoelzl@40859
   914
    have "?XY x y * log b (?XY x y / ?X x) =
hoelzl@40859
   915
          ?XY x y * log b (?XY x y) - ?XY x y * log b (?X x)"
hoelzl@41023
   916
      by (auto simp: log_simps real_of_pextreal_mult[symmetric] zero_less_mult_iff
hoelzl@41023
   917
                     zero_less_real_of_pextreal field_simps real_of_pextreal_eq_0 abs_mult) }
hoelzl@40859
   918
  note * = this
hoelzl@40859
   919
  show ?thesis
hoelzl@40859
   920
    using setsum_real_joint_distribution_singleton[OF fY fX]
hoelzl@40859
   921
    unfolding entropy_eq[OF X] conditional_entropy_eq_cartesian_product[OF Y X] entropy_eq_cartesian_product[OF X Y]
hoelzl@40859
   922
    unfolding joint_distribution_commute_singleton[of Y X] setsum_commute[of _ "X`space M"]
hoelzl@40859
   923
    by (simp add: * setsum_subtractf setsum_left_distrib[symmetric])
hoelzl@40859
   924
qed
hoelzl@38656
   925
hoelzl@39097
   926
section {* Partitioning *}
hoelzl@36080
   927
hoelzl@36624
   928
definition "subvimage A f g \<longleftrightarrow> (\<forall>x \<in> A. f -` {f x} \<inter> A \<subseteq> g -` {g x} \<inter> A)"
hoelzl@36624
   929
hoelzl@36624
   930
lemma subvimageI:
hoelzl@36624
   931
  assumes "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
hoelzl@36624
   932
  shows "subvimage A f g"
hoelzl@36624
   933
  using assms unfolding subvimage_def by blast
hoelzl@36624
   934
hoelzl@36624
   935
lemma subvimageE[consumes 1]:
hoelzl@36624
   936
  assumes "subvimage A f g"
hoelzl@36624
   937
  obtains "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
hoelzl@36624
   938
  using assms unfolding subvimage_def by blast
hoelzl@36624
   939
hoelzl@36624
   940
lemma subvimageD:
hoelzl@36624
   941
  "\<lbrakk> subvimage A f g ; x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
hoelzl@36624
   942
  using assms unfolding subvimage_def by blast
hoelzl@36624
   943
hoelzl@36624
   944
lemma subvimage_subset:
hoelzl@36624
   945
  "\<lbrakk> subvimage B f g ; A \<subseteq> B \<rbrakk> \<Longrightarrow> subvimage A f g"
hoelzl@36624
   946
  unfolding subvimage_def by auto
hoelzl@36624
   947
hoelzl@36624
   948
lemma subvimage_idem[intro]: "subvimage A g g"
hoelzl@36624
   949
  by (safe intro!: subvimageI)
hoelzl@36624
   950
hoelzl@36624
   951
lemma subvimage_comp_finer[intro]:
hoelzl@36624
   952
  assumes svi: "subvimage A g h"
hoelzl@36624
   953
  shows "subvimage A g (f \<circ> h)"
hoelzl@36624
   954
proof (rule subvimageI, simp)
hoelzl@36624
   955
  fix x y assume "x \<in> A" "y \<in> A" "g x = g y"
hoelzl@36624
   956
  from svi[THEN subvimageD, OF this]
hoelzl@36624
   957
  show "f (h x) = f (h y)" by simp
hoelzl@36624
   958
qed
hoelzl@36624
   959
hoelzl@36624
   960
lemma subvimage_comp_gran:
hoelzl@36624
   961
  assumes svi: "subvimage A g h"
hoelzl@36624
   962
  assumes inj: "inj_on f (g ` A)"
hoelzl@36624
   963
  shows "subvimage A (f \<circ> g) h"
hoelzl@36624
   964
  by (rule subvimageI) (auto intro!: subvimageD[OF svi] simp: inj_on_iff[OF inj])
hoelzl@36624
   965
hoelzl@36624
   966
lemma subvimage_comp:
hoelzl@36624
   967
  assumes svi: "subvimage (f ` A) g h"
hoelzl@36624
   968
  shows "subvimage A (g \<circ> f) (h \<circ> f)"
hoelzl@36624
   969
  by (rule subvimageI) (auto intro!: svi[THEN subvimageD])
hoelzl@36624
   970
hoelzl@36624
   971
lemma subvimage_trans:
hoelzl@36624
   972
  assumes fg: "subvimage A f g"
hoelzl@36624
   973
  assumes gh: "subvimage A g h"
hoelzl@36624
   974
  shows "subvimage A f h"
hoelzl@36624
   975
  by (rule subvimageI) (auto intro!: fg[THEN subvimageD] gh[THEN subvimageD])
hoelzl@36624
   976
hoelzl@36624
   977
lemma subvimage_translator:
hoelzl@36624
   978
  assumes svi: "subvimage A f g"
hoelzl@36624
   979
  shows "\<exists>h. \<forall>x \<in> A. h (f x)  = g x"
hoelzl@36624
   980
proof (safe intro!: exI[of _ "\<lambda>x. (THE z. z \<in> (g ` (f -` {x} \<inter> A)))"])
hoelzl@36624
   981
  fix x assume "x \<in> A"
hoelzl@36624
   982
  show "(THE x'. x' \<in> (g ` (f -` {f x} \<inter> A))) = g x"
hoelzl@36624
   983
    by (rule theI2[of _ "g x"])
hoelzl@36624
   984
      (insert `x \<in> A`, auto intro!: svi[THEN subvimageD])
hoelzl@36624
   985
qed
hoelzl@36624
   986
hoelzl@36624
   987
lemma subvimage_translator_image:
hoelzl@36624
   988
  assumes svi: "subvimage A f g"
hoelzl@36624
   989
  shows "\<exists>h. h ` f ` A = g ` A"
hoelzl@36624
   990
proof -
hoelzl@36624
   991
  from subvimage_translator[OF svi]
hoelzl@36624
   992
  obtain h where "\<And>x. x \<in> A \<Longrightarrow> h (f x) = g x" by auto
hoelzl@36624
   993
  thus ?thesis
hoelzl@36624
   994
    by (auto intro!: exI[of _ h]
hoelzl@36624
   995
      simp: image_compose[symmetric] comp_def cong: image_cong)
hoelzl@36624
   996
qed
hoelzl@36624
   997
hoelzl@36624
   998
lemma subvimage_finite:
hoelzl@36624
   999
  assumes svi: "subvimage A f g" and fin: "finite (f`A)"
hoelzl@36624
  1000
  shows "finite (g`A)"
hoelzl@36624
  1001
proof -
hoelzl@36624
  1002
  from subvimage_translator_image[OF svi]
hoelzl@36624
  1003
  obtain h where "g`A = h`f`A" by fastsimp
hoelzl@36624
  1004
  with fin show "finite (g`A)" by simp
hoelzl@36624
  1005
qed
hoelzl@36624
  1006
hoelzl@36624
  1007
lemma subvimage_disj:
hoelzl@36624
  1008
  assumes svi: "subvimage A f g"
hoelzl@36624
  1009
  shows "f -` {x} \<inter> A \<subseteq> g -` {y} \<inter> A \<or>
hoelzl@36624
  1010
      f -` {x} \<inter> g -` {y} \<inter> A = {}" (is "?sub \<or> ?dist")
hoelzl@36624
  1011
proof (rule disjCI)
hoelzl@36624
  1012
  assume "\<not> ?dist"
hoelzl@36624
  1013
  then obtain z where "z \<in> A" and "x = f z" and "y = g z" by auto
hoelzl@36624
  1014
  thus "?sub" using svi unfolding subvimage_def by auto
hoelzl@36624
  1015
qed
hoelzl@36624
  1016
hoelzl@36624
  1017
lemma setsum_image_split:
hoelzl@36624
  1018
  assumes svi: "subvimage A f g" and fin: "finite (f ` A)"
hoelzl@36624
  1019
  shows "(\<Sum>x\<in>f`A. h x) = (\<Sum>y\<in>g`A. \<Sum>x\<in>f`(g -` {y} \<inter> A). h x)"
hoelzl@36624
  1020
    (is "?lhs = ?rhs")
hoelzl@36624
  1021
proof -
hoelzl@36624
  1022
  have "f ` A =
hoelzl@36624
  1023
      snd ` (SIGMA x : g ` A. f ` (g -` {x} \<inter> A))"
hoelzl@36624
  1024
      (is "_ = snd ` ?SIGMA")
hoelzl@36624
  1025
    unfolding image_split_eq_Sigma[symmetric]
hoelzl@36624
  1026
    by (simp add: image_compose[symmetric] comp_def)
hoelzl@36624
  1027
  moreover
hoelzl@36624
  1028
  have snd_inj: "inj_on snd ?SIGMA"
hoelzl@36624
  1029
    unfolding image_split_eq_Sigma[symmetric]
hoelzl@36624
  1030
    by (auto intro!: inj_onI subvimageD[OF svi])
hoelzl@36624
  1031
  ultimately
hoelzl@36624
  1032
  have "(\<Sum>x\<in>f`A. h x) = (\<Sum>(x,y)\<in>?SIGMA. h y)"
hoelzl@36624
  1033
    by (auto simp: setsum_reindex intro: setsum_cong)
hoelzl@36624
  1034
  also have "... = ?rhs"
hoelzl@36624
  1035
    using subvimage_finite[OF svi fin] fin
hoelzl@36624
  1036
    apply (subst setsum_Sigma[symmetric])
hoelzl@36624
  1037
    by (auto intro!: finite_subset[of _ "f`A"])
hoelzl@36624
  1038
  finally show ?thesis .
hoelzl@36624
  1039
qed
hoelzl@36624
  1040
hoelzl@40859
  1041
lemma (in information_space) entropy_partition:
hoelzl@40859
  1042
  assumes sf: "simple_function X" "simple_function P"
hoelzl@36624
  1043
  assumes svi: "subvimage (space M) X P"
hoelzl@36624
  1044
  shows "\<H>(X) = \<H>(P) + \<H>(X|P)"
hoelzl@36624
  1045
proof -
hoelzl@40859
  1046
  let "?XP x p" = "real (joint_distribution X P {(x, p)})"
hoelzl@40859
  1047
  let "?X x" = "real (distribution X {x})"
hoelzl@40859
  1048
  let "?P p" = "real (distribution P {p})"
hoelzl@40859
  1049
  note fX = sf(1)[THEN simple_function_imp_finite_random_variable]
hoelzl@40859
  1050
  note fP = sf(2)[THEN simple_function_imp_finite_random_variable]
hoelzl@40859
  1051
  note fX[THEN finite_distribution_finite, simp] and fP[THEN finite_distribution_finite, simp]
hoelzl@40859
  1052
  note finite_distribution_order[OF fX fP, simp]
hoelzl@38656
  1053
  have "(\<Sum>x\<in>X ` space M. real (distribution X {x}) * log b (real (distribution X {x}))) =
hoelzl@36624
  1054
    (\<Sum>y\<in>P `space M. \<Sum>x\<in>X ` space M.
hoelzl@38656
  1055
    real (joint_distribution X P {(x, y)}) * log b (real (joint_distribution X P {(x, y)})))"
hoelzl@36624
  1056
  proof (subst setsum_image_split[OF svi],
hoelzl@40859
  1057
      safe intro!: setsum_mono_zero_cong_left imageI)
hoelzl@40859
  1058
    show "finite (X ` space M)" "finite (X ` space M)" "finite (P ` space M)"
hoelzl@40859
  1059
      using sf unfolding simple_function_def by auto
hoelzl@40859
  1060
  next
hoelzl@36624
  1061
    fix p x assume in_space: "p \<in> space M" "x \<in> space M"
hoelzl@38656
  1062
    assume "real (joint_distribution X P {(X x, P p)}) * log b (real (joint_distribution X P {(X x, P p)})) \<noteq> 0"
hoelzl@36624
  1063
    hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M \<noteq> {}" by (auto simp: distribution_def)
hoelzl@36624
  1064
    with svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
hoelzl@36624
  1065
    show "x \<in> P -` {P p}" by auto
hoelzl@36624
  1066
  next
hoelzl@36624
  1067
    fix p x assume in_space: "p \<in> space M" "x \<in> space M"
hoelzl@36624
  1068
    assume "P x = P p"
hoelzl@36624
  1069
    from this[symmetric] svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
hoelzl@36624
  1070
    have "X -` {X x} \<inter> space M \<subseteq> P -` {P p} \<inter> space M"
hoelzl@36624
  1071
      by auto
hoelzl@36624
  1072
    hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M = X -` {X x} \<inter> space M"
hoelzl@36624
  1073
      by auto
hoelzl@38656
  1074
    thus "real (distribution X {X x}) * log b (real (distribution X {X x})) =
hoelzl@38656
  1075
          real (joint_distribution X P {(X x, P p)}) *
hoelzl@38656
  1076
          log b (real (joint_distribution X P {(X x, P p)}))"
hoelzl@36624
  1077
      by (auto simp: distribution_def)
hoelzl@36624
  1078
  qed
hoelzl@40859
  1079
  moreover have "\<And>x y. real (joint_distribution X P {(x, y)}) *
hoelzl@40859
  1080
      log b (real (joint_distribution X P {(x, y)}) / real (distribution P {y})) =
hoelzl@40859
  1081
      real (joint_distribution X P {(x, y)}) * log b (real (joint_distribution X P {(x, y)})) -
hoelzl@40859
  1082
      real (joint_distribution X P {(x, y)}) * log b (real (distribution P {y}))"
hoelzl@40859
  1083
    by (auto simp add: log_simps zero_less_mult_iff field_simps)
hoelzl@40859
  1084
  ultimately show ?thesis
hoelzl@40859
  1085
    unfolding sf[THEN entropy_eq] conditional_entropy_eq[OF sf]
hoelzl@40859
  1086
    using setsum_real_joint_distribution_singleton[OF fX fP]
hoelzl@38656
  1087
    by (simp add: setsum_cartesian_product' setsum_subtractf setsum_real_distribution
hoelzl@36624
  1088
      setsum_left_distrib[symmetric] setsum_commute[where B="P`space M"])
hoelzl@36624
  1089
qed
hoelzl@36624
  1090
hoelzl@40859
  1091
corollary (in information_space) entropy_data_processing:
hoelzl@40859
  1092
  assumes X: "simple_function X" shows "\<H>(f \<circ> X) \<le> \<H>(X)"
hoelzl@40859
  1093
proof -
hoelzl@40859
  1094
  note X
hoelzl@40859
  1095
  moreover have fX: "simple_function (f \<circ> X)" using X by auto
hoelzl@40859
  1096
  moreover have "subvimage (space M) X (f \<circ> X)" by auto
hoelzl@40859
  1097
  ultimately have "\<H>(X) = \<H>(f\<circ>X) + \<H>(X|f\<circ>X)" by (rule entropy_partition)
hoelzl@40859
  1098
  then show "\<H>(f \<circ> X) \<le> \<H>(X)"
hoelzl@40859
  1099
    by (auto intro: conditional_entropy_positive[OF X fX])
hoelzl@40859
  1100
qed
hoelzl@36624
  1101
hoelzl@40859
  1102
corollary (in information_space) entropy_of_inj:
hoelzl@40859
  1103
  assumes X: "simple_function X" and inj: "inj_on f (X`space M)"
hoelzl@36624
  1104
  shows "\<H>(f \<circ> X) = \<H>(X)"
hoelzl@36624
  1105
proof (rule antisym)
hoelzl@40859
  1106
  show "\<H>(f \<circ> X) \<le> \<H>(X)" using entropy_data_processing[OF X] .
hoelzl@36624
  1107
next
hoelzl@40859
  1108
  have sf: "simple_function (f \<circ> X)"
hoelzl@40859
  1109
    using X by auto
hoelzl@36624
  1110
  have "\<H>(X) = \<H>(the_inv_into (X`space M) f \<circ> (f \<circ> X))"
hoelzl@40859
  1111
    by (auto intro!: mutual_information_cong simp: entropy_def the_inv_into_f_f[OF inj])
hoelzl@36624
  1112
  also have "... \<le> \<H>(f \<circ> X)"
hoelzl@40859
  1113
    using entropy_data_processing[OF sf] .
hoelzl@36624
  1114
  finally show "\<H>(X) \<le> \<H>(f \<circ> X)" .
hoelzl@36624
  1115
qed
hoelzl@36624
  1116
hoelzl@36080
  1117
end