diff --git a/flake.nix b/flake.nix index 3299196..53c17bc 100644 --- a/flake.nix +++ b/flake.nix @@ -52,10 +52,10 @@ enable = true; package = treefmtEval.config.build.wrapper; }; - typst-fmt = { + typstyle = { enable = true; - name = "Typst Formatter"; - entry = "${pkgs.typstfmt}/bin/typstfmt"; + name = "typstyle"; + entry = "${pkgs.typstyle}/bin/typstyle --check"; files = "\\.typ$"; language = "rust"; }; diff --git a/slides/00-tools.typ b/slides/00-tools.typ index 95ce234..f30f356 100644 --- a/slides/00-tools.typ +++ b/slides/00-tools.typ @@ -4,9 +4,7 @@ #new-section-slide("Tools") -#slide( - title: "Recommended References", -)[ +#slide(title: "Recommended References")[ - #cite(, form: "prose") - Turing paper - #cite(, form: "prose") - Stan paper @@ -20,11 +18,13 @@ #focus-slide(background: julia-purple)[ #quote( - block: true, attribution: [Vita Sackville-West], + block: true, + attribution: [Vita Sackville-West], )[A man and his tools make a man and his trade] #quote( - block: true, attribution: [Winston Churchill], + block: true, + attribution: [Winston Churchill], )[We shape our tools and then the tools shape us] ] @@ -32,9 +32,7 @@ #align(center)[#image("images/memes/standards.png")] ] -#slide( - title: "Tools", -)[ +#slide(title: "Tools")[ - #link("https://mc-stan.org")[Stan] (BSD-3 License) - #link("https://turinglang.org")[Turing] (MIT License) @@ -46,15 +44,9 @@ - #link("https://www.mrc-bsu.cam.ac.uk/software/bugs/")[BUGS] (GPL License) ] -#slide( - title: [Stan #footnote[#cite(, form: "prose")]], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ - #text( - size: 16pt, - )[ +#slide(title: [Stan #footnote[#cite(, form: "prose")]])[ + #side-by-side(columns: (4fr, 1fr))[ + #text(size: 16pt)[ - High-performance platform for statistical modeling and statistical computation - Financial support from #link("https://numfocus.org/")[NUMFocus]: @@ -75,36 +67,30 @@ ] #slide(title: [Stan Code Example])[ -#fit-to-height(1fr)[```cpp - data { - int N; - vector[N] x; - vector[N] y; - } - parameters { - real alpha; - real beta; - real sigma; - } - model { - alpha ~ normal(0, 20); - beta ~ normal(0, 2); - sigma ~ cauchy(0, 2.5); - y ~ normal(alpha + beta * x, sigma); - } - ``` -] + #fit-to-height(1fr)[```cpp + data { + int N; + vector[N] x; + vector[N] y; + } + parameters { + real alpha; + real beta; + real sigma; + } + model { + alpha ~ normal(0, 20); + beta ~ normal(0, 2); + sigma ~ cauchy(0, 2.5); + y ~ normal(alpha + beta * x, sigma); + } + ``` + ] ] -#slide( - title: [Turing #footnote[#cite(, form: "prose")]], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ - #text( - size: 18pt, - )[ +#slide(title: [Turing #footnote[#cite(, form: "prose")]])[ + #side-by-side(columns: (4fr, 1fr))[ + #text(size: 18pt)[ - Ecosystem of Julia packages for Bayesian Inference using probabilistic programming @@ -127,12 +113,8 @@ ] ] -#slide( - title: [Turing Ecosystem], -)[ - #text( - size: 16pt, - )[ +#slide(title: [Turing Ecosystem])[ + #text(size: 16pt)[ We have several Julia packages under Turing's GitHub organization #link("https://github.com/TuringLang")[TuringLang], but I will focus on 6 of those: @@ -170,9 +152,9 @@ ] @storopoli2021bayesianjulia ] Code Example])[ -#v(1em) + #v(1em) -```julia + ```julia @model function linreg(x, y) α ~ Normal(0, 20) β ~ Normal(0, 2) @@ -183,12 +165,8 @@ ``` ] -#slide( - title: [PyMC #footnote[#cite(form: "prose", )]], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ +#slide(title: [PyMC #footnote[#cite(form: "prose", )]])[ + #side-by-side(columns: (4fr, 1fr))[ - Python package 
for Bayesian statistics with a Markov Chain Monte Carlo sampler @@ -208,7 +186,7 @@ ] #slide(title: [PyMC Code Example])[ -```python + ```python with pm.Model() as model: alpha = pm.Normal("Intercept", mu=0, sigma=20) beta = pm.Normal("beta", mu=0, sigma=2) @@ -230,9 +208,7 @@ ] ] -#slide( - title: [Why Turing], -)[ +#slide(title: [Why Turing])[ - *Julia* all the way down... - Can *interface/compose* _any_ Julia package - Decoupling of *modeling DSL, inference algorithms and data* @@ -246,9 +222,7 @@ - Very easy to do *distributed model inference and prediction*. ] -#slide( - title: [Why _Not_ Turing], -)[ +#slide(title: [Why _Not_ Turing])[ - *Not as fast*, but pretty close behind, as Stan. - *Not enough learning materials*, example models, tutorials. Also documentation @@ -271,9 +245,7 @@ - *More tutorials, example models, and learning materials available*. ] -#slide( - title: [Why _Not_ Stan], -)[ +#slide(title: [Why _Not_ Stan])[ - If you want to try *something new*, you'll have to do in *C++*. - Constrained *only to HMC-NUTS* as MCMC algorithm. diff --git a/slides/01-bayesian_statistics.typ b/slides/01-bayesian_statistics.typ index d27d1bd..a8e0947 100644 --- a/slides/01-bayesian_statistics.typ +++ b/slides/01-bayesian_statistics.typ @@ -5,12 +5,8 @@ #new-section-slide("Bayesian Statistics") -#slide( - title: "Recommended References", -)[ - #text( - size: 18pt, - )[ +#slide(title: "Recommended References")[ + #text(size: 18pt)[ - #cite(, form: "prose") - Chapter 1: Probability and inference @@ -38,7 +34,8 @@ #focus-slide(background: julia-purple)[ #quote( - block: true, attribution: [Denis Lindley], + block: true, + attribution: [Denis Lindley], )[Inside every nonBayesian there is a Bayesian struggling to get out] ] @@ -47,9 +44,7 @@ #align(center)[#image("images/memes/frequentists-vs-bayesians.png")] ] -#slide( - title: [What is Bayesian Statistics?], -)[ +#slide(title: [What is Bayesian Statistics?])[ #v(2em) Bayesian statistics is a *data analysis approach based on Bayes' theorem* where @@ -67,12 +62,8 @@ The posterior can also be used to make predictions about future events. ] -#slide( - title: [What changes from Frequentist Statistics?], -)[ - #text( - size: 16pt, - )[ +#slide(title: [What changes from Frequentist Statistics?])[ + #text(size: 16pt)[ - *Flexibility* - probabilistic building blocks to construct a model #footnote[like LEGO]: - Probabilistic conjectures about parameters: - Prior @@ -90,9 +81,7 @@ ] ] -#slide( - title: [A little bit more formal], -)[ +#slide(title: [A little bit more formal])[ - Bayesian Statistics uses probabilistic statements: - one or more parameters $θ$ @@ -108,20 +97,14 @@ - We also, implicitly, condition on the observed data from any covariate $x$ ] -#slide( - title: [Definition of Bayesian Statistics], -)[ +#slide(title: [Definition of Bayesian Statistics])[ #v(4em) The use of Bayes theorem as the procedure to *estimate parameters of interest $θ$ or unobserved data $tilde(y)$*. @gelman2013bayesian ] -#slide( - title: [PROBABILITY DOES NOT EXIST! #footnote[#cite(, form: "prose")]], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ +#slide(title: [PROBABILITY DOES NOT EXIST! #footnote[#cite(, form: "prose")]])[ + #side-by-side(columns: (4fr, 1fr))[ #v(3em) - Yes, probability does not exist ... @@ -137,12 +120,8 @@ ] ] -#slide( - title: [PROBABILITY DOES NOT EXIST! #footnote[#cite(, form: "prose")]], -)[ - #side-by-side( - columns: (3fr, 2fr), - )[ +#slide(title: [PROBABILITY DOES NOT EXIST! 
#footnote[#cite(, form: "prose")]])[ + #side-by-side(columns: (3fr, 2fr))[ - Consider flipping a biased coin - The trials are considered independent and, as a result, have an important property: *the order does not matter* @@ -152,42 +131,72 @@ - We say that the probability is *invariant under permutations* ][ - #align( - center, - )[ + #align(center)[ #canvas( - length: 0.65cm, { + length: 0.65cm, + { import draw: * - set-style(mark: (fill: black, size: 0.3), stroke: (thickness: 2pt), radius: 1) + set-style( + mark: (fill: black, size: 0.3), + stroke: (thickness: 2pt), + radius: 1, + ) circle((0, 0)) - arc((1, 0), start: 0deg, delta: 180deg, mode: "PIE", fill: julia-green) - arc((-1, 0), start: 180deg, delta: 180deg, mode: "PIE", fill: julia-red) + arc( + (1, 0), + start: 0deg, + delta: 180deg, + mode: "PIE", + fill: julia-green, + ) + arc( + (-1, 0), + start: 180deg, + delta: 180deg, + mode: "PIE", + fill: julia-red, + ) content((0, 0.5), [#align(center)[#text(fill: white, size: 14pt)[H]]]) - content((0, -0.5), [#align(center)[#text(fill: white, size: 14pt)[T]]]) + content( + (0, -0.5), + [#align(center)[#text(fill: white, size: 14pt)[T]]], + ) line((0, -1), (-3.5, -5)) content((-2.5, -3))[#text(size: 14pt)[$0.5$]] line((0, -1), (3.5, -5)) content((2.5, -3))[#text(size: 14pt)[$0.5$]] circle((-3.5, -6), fill: julia-green) - content((-3.5, -6), [#align(center)[#text(fill: white, size: 14pt)[H]]]) + content( + (-3.5, -6), + [#align(center)[#text(fill: white, size: 14pt)[H]]], + ) line((-3.5, -7), (-5, -11)) content((-5, -9))[#text(size: 14pt)[$0.5$]] line((-3.5, -7), (-2, -11)) content((-2, -9))[#text(size: 14pt)[$0.5$]] circle((3.5, -6), fill: julia-red) - content((3.5, -6), [#align(center)[#text(fill: white, size: 14pt)[T]]]) + content( + (3.5, -6), + [#align(center)[#text(fill: white, size: 14pt)[T]]], + ) line((3.5, -7), (2, -11)) content((2, -9))[#text(size: 14pt)[$0.5$]] line((3.5, -7), (5, -11)) content((5, -9))[#text(size: 14pt)[$0.5$]] circle((-5, -12), fill: julia-green) - content((-5, -12), [#align(center)[#text(fill: white, size: 14pt)[H]]]) + content( + (-5, -12), + [#align(center)[#text(fill: white, size: 14pt)[H]]], + ) circle((-2, -12), fill: julia-red) - content((-2, -12), [#align(center)[#text(fill: white, size: 14pt)[T]]]) + content( + (-2, -12), + [#align(center)[#text(fill: white, size: 14pt)[T]]], + ) circle((2, -12), fill: julia-green) content((2, -12), [#align(center)[#text(fill: white, size: 14pt)[H]]]) @@ -216,9 +225,7 @@ - $P("me being elected president") = 10^(-10)$ (highly unlikely) ] -#slide( - title: [What is Probability?], -)[ +#slide(title: [What is Probability?])[ We define $A$ is an event and $P(A)$ the probability of event $A$. 
$P(A)$ has to be between $0$ and $1$, where higher values defines higher @@ -231,12 +238,8 @@ $ ] -#slide( - title: [Probability Axioms #footnote[#cite(, form: "prose")]], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ +#slide(title: [Probability Axioms #footnote[#cite(, form: "prose")]])[ + #side-by-side(columns: (4fr, 1fr))[ - *Non-negativity*: For every $A$: $P(A) >= 0$ @@ -278,17 +281,18 @@ - Neptune: ♆ ] -#slide( - title: [Discrete Sample Space], -)[ +#slide(title: [Discrete Sample Space])[ #side-by-side[ - #align( - center, - )[ + #align(center)[ #canvas( - length: 0.5cm, { + length: 0.5cm, + { import draw: * - set-style(mark: (fill: black, size: 0.3), stroke: (thickness: 2pt), radius: 1) + set-style( + mark: (fill: black, size: 0.3), + stroke: (thickness: 2pt), + radius: 1, + ) content((0, 0))[#text(size: 18pt, fill: white)[placeholder]] content((0, -2))[#text(size: 18pt)[ @@ -310,13 +314,16 @@ ) ] ][ - #align( - center, - )[ + #align(center)[ #canvas( - length: 0.5cm, { + length: 0.5cm, + { import draw: * - set-style(mark: (fill: black, size: 0.3), stroke: (thickness: 2pt), radius: 1) + set-style( + mark: (fill: black, size: 0.3), + stroke: (thickness: 2pt), + radius: 1, + ) content((0, 0))[#text(size: 18pt)[$θ ∈ E_1$]] circle((-7, -2)) @@ -413,44 +420,45 @@ ] ] -#slide( - title: [Continuous Sample Space], -)[ +#slide(title: [Continuous Sample Space])[ #side-by-side[ #align(center)[ - #canvas(length: 0.5cm, { - import draw: * - set-style(mark: (fill: black, size: 0.3), stroke: (thickness: 2pt)) - - content((0, 0))[#text(size: 18pt, fill: white)[placeholder]] - content((0, -2))[#text(size: 18pt)[ - The distance is less than five centimeters - ]] - content((0, -6))[#text(size: 18pt)[ - The distance is between three and seven centimeters - ]] - content((0, -10))[#text(size: 18pt)[ - The distance is less than five centimeters \ - _and_ between three and seven centimeters - ]] - content((0, -14))[#text(size: 18pt)[ - The distance is less than five centimeters \ - _or_ between three and seven centimeters - ]] - content((0, -18))[#text(size: 18pt)[ - The distance is _not_ less than five centimeters - ]] - }) + #canvas( + length: 0.5cm, + { + import draw: * + set-style(mark: (fill: black, size: 0.3), stroke: (thickness: 2pt)) + + content((0, 0))[#text(size: 18pt, fill: white)[placeholder]] + content((0, -2))[#text(size: 18pt)[ + The distance is less than five centimeters + ]] + content((0, -6))[#text(size: 18pt)[ + The distance is between three and seven centimeters + ]] + content((0, -10))[#text(size: 18pt)[ + The distance is less than five centimeters \ + _and_ between three and seven centimeters + ]] + content((0, -14))[#text(size: 18pt)[ + The distance is less than five centimeters \ + _or_ between three and seven centimeters + ]] + content((0, -18))[#text(size: 18pt)[ + The distance is _not_ less than five centimeters + ]] + }, + ) ] ][ - #align( - center, - )[ + #align(center)[ #canvas( - length: 0.5cm, { + length: 0.5cm, + { import draw: * set-style( - mark: (start: "|", end: "|", fill: black, size: 1), stroke: (thickness: 4pt), + mark: (start: "|", end: "|", fill: black, size: 1), + stroke: (thickness: 4pt), ) content((0, 0))[#text(size: 18pt)[$θ ∈ E_1$]] @@ -478,12 +486,8 @@ ] ] -#slide( - title: [Discrete versus Continuous Parameters], -)[ - #text( - size: 18pt, - )[ +#slide(title: [Discrete versus Continuous Parameters])[ + #text(size: 18pt)[ Everything that has been exposed here was under the assumption that the parameters are discrete. 
@@ -505,9 +509,7 @@ ] ] -#slide( - title: [Conditional Probability], -)[ +#slide(title: [Conditional Probability])[ Probability of an event occurring in case another has occurred or not. The notation we use is $P(A | B)$, that read as "the probability of observing $A$ given @@ -521,9 +523,7 @@ assuming that $P(B) > 0$}. ] -#slide( - title: [Example of Conditional Probability -- Poker Texas Hold'em], -)[ +#slide(title: [Example of Conditional Probability -- Poker Texas Hold'em])[ - *Sample Space*: $52$ cards in a deck, $13$ types of cards and $4$ types of suits. @@ -538,18 +538,14 @@ Ace $(4 / 51 ≈ 0.078)$ ] -#slide( - title: [Caution! Not always $P(A | B) = P(B | A)$], -)[ +#slide(title: [Caution! Not always $P(A | B) = P(B | A)$])[ In the previous example we have the symmetry $P(A | K) = P(K | A)$, *but not always this is true* #footnote[ More specific, if the basal rates $P(A)$ and $P(B)$ aren't equal, the symmetry is broken $P(A | B) ≠ P(B | A)$ ] - #text( - size: 14pt, - )[ + #text(size: 14pt)[ The Pope is catholic: - $P("pope")$: Probability of some random person being the Pope, something really @@ -581,12 +577,8 @@ $ ] -#slide( - title: [Example of Joint Probability -- Revisiting Poker Texas Hold'em], -)[ - #text( - size: 17pt, - )[ +#slide(title: [Example of Joint Probability -- Revisiting Poker Texas Hold'em])[ + #text(size: 17pt)[ - *Sample Space*: $52$ cards in a deck, $13$ types of cards and $4$ types of suits. - $P(A)$: Probability of being dealt an Ace $(4 / 52 = 1 / 13)$ @@ -597,25 +589,25 @@ Ace $(4 / 51 ≈ 0.078)$ - $P(A, K)$: Probability of being dealt an Ace _and_ being dealt a King $ - P(A, K) &= P(K, A) \ + P(A, K) &= P(K, A) \ P(A) dot P(K | A) &= P(K) dot P(A | K) \ 1 / 13 dot 4 / 51 &= 1 / 13 dot 4 / 51 \ - &≈ 0.006 + &≈ 0.006 $ ] ] -#slide( - title: [Visualization of Joint Probability versus Conditional Probability], -)[ +#slide(title: [Visualization of Joint Probability versus Conditional Probability])[ #figure( - image("images/probability/joint_vs_conditional_probability.svg", height: 80%), caption: [$P(X,Y)$ versus $P(X | Y=-0.75)$], + image( + "images/probability/joint_vs_conditional_probability.svg", + height: 80%, + ), + caption: [$P(X,Y)$ versus $P(X | Y=-0.75)$], ) ] -#slide( - title: [Product Rule of Probability #footnote[also called the Product Rule of Probability.]], -)[ +#slide(title: [Product Rule of Probability #footnote[also called the Product Rule of Probability.]])[ #v(2em) We can decompose a joint probability $P(A,B)$ into the product of two @@ -624,20 +616,14 @@ #v(2em) $ - P(A,B) &= P(B,A) \ + P(A,B) &= P(B,A) \ P(A) dot P(B | A) &= P(B) dot P(A | B) $ ] -#slide( - title: [Who was Thomas Bayes?], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ - #text( - size: 17pt, - )[ +#slide(title: [Who was Thomas Bayes?])[ + #side-by-side(columns: (4fr, 1fr))[ + #text(size: 17pt)[ - *Thomas Bayes* (1701 - 1761) was a statistician, philosopher and Presbyterian minister who is known for formulating a specific case of the theorem that bears his name: Bayes' theorem. 
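Before moving on to Bayes' theorem itself, the product rule and the poker probabilities from the slides above can be checked numerically. A minimal sketch in plain Julia (no packages assumed); the card counts are the ones quoted in the slides:

```julia
# Texas Hold'em sanity check for the conditional, joint, and product-rule slides.
P_A = 4 / 52          # P(A): probability the first card dealt is an Ace
P_K = 4 / 52          # P(K): probability the first card dealt is a King
P_K_given_A = 4 / 51  # P(K | A): a King given an Ace is already gone
P_A_given_K = 4 / 51  # P(A | K): an Ace given a King is already gone

# Product rule: P(A, K) = P(A) * P(K | A) = P(K) * P(A | K) = P(K, A)
P_AK = P_A * P_K_given_A
P_KA = P_K * P_A_given_K

println(P_AK ≈ P_KA)              # true: the joint probability is symmetric
println(round(P_AK; digits = 3))  # ≈ 0.006, as stated in the slide
```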
@@ -671,7 +657,7 @@ #text(size: 18pt)[ $ - P(A,B) &= P(B,A) \ + P(A,B) &= P(B,A) \ P(A) dot P(B | A) &= P(B) dot P(A | B) $ @@ -680,19 +666,15 @@ $ (P(A) dot P(B | A)) / P(B) &= (P(B) dot P(A | B)) / P(B) \ (P(A) dot P(B | A)) / P(B) &= P(A | B) \ - P(A | B) &= (P(A) dot P(B | A)) / P(B) + P(A | B) &= (P(A) dot P(B | A)) / P(B) $ ] ] -#slide( - title: [A Probability Textbook Classic #footnote[Adapted from: #link( +#slide(title: [A Probability Textbook Classic #footnote[Adapted from: #link( "https://www.yudkowsky.net/rational/bayes", - )[Yudkowski - _An Intuitive Explanation of Bayes' Theorem_]]], -)[ - #text( - size: 15pt, - )[ + )[Yudkowski - _An Intuitive Explanation of Bayes' Theorem_]]])[ + #text(size: 15pt)[ How accurate is a *breast cancer* test? - 1% of women have *breast cancer* (Prevalence) @@ -704,20 +686,22 @@ $ P(C | +) &= (P(+ | C) dot P(C)) / P(+) \ - P(C | +) &= (P(+ | C) dot P(C)) / (P(+ | C) dot P(C) + P(+ | not C) dot P(not C)) \ + P(C | +) &= (P(+ | C) dot P(C)) / (P(+ | C) dot P(C) + P( + + | not C + ) dot P(not C)) \ P(C | +) &= (0.8 dot 0.01) / (0.8 dot 0.01 + 0.096 dot 0.99) \ P(C | +) &≈ 0.0776 $ ] ] -#slide( - title: [Why Bayes' Theorem is Important?], -)[ +#slide(title: [Why Bayes' Theorem is Important?])[ We can invert the conditional probability: $ - P("hypothesis" | "data") = (P("data" | "hypothesis") dot P("hypothesis")) / P("data") + P("hypothesis" | "data") = (P("data" | "hypothesis") dot P( + "hypothesis" + )) / P("data") $ #v(2em) @@ -727,9 +711,7 @@ #align(center)[#text(size: 24pt, fill: julia-red)[NO!]] ] -#slide( - title: [What are $p$-values?], -)[ +#slide(title: [What are $p$-values?])[ #v(2em) $p$-value is the probability of obtaining results at least as extreme as the @@ -744,12 +726,8 @@ #align(center)[#image("images/memes/pvalue.jpg", height: 80%)] ] -#slide( - title: [What $p$-value is *not*!], -)[ - #text( - size: 18pt, - )[ +#slide(title: [What $p$-value is *not*!])[ + #text(size: 18pt)[ - *$p$-value is not the probability of the null hypothesis* - #text(fill: julia-red)[No!] - Infamous confusion between $P(D | H_0)$ and $P(H_0 | D)$. @@ -768,9 +746,7 @@ ] ] -#slide( - title: [The relationship between $p$-value and $H_0$], -)[ +#slide(title: [The relationship between $p$-value and $H_0$])[ To find out about any $p$-value, *find out what $H_0$ is behind it*. It's definition will never change, since it is always $P(D | H_0)$: @@ -783,15 +759,12 @@ - *Shapiro-Wilk*: $P(D | "population is distributed as a Normal distribution")$ ] -#slide( - title: [What are Confidence Intervals?], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ +#slide(title: [What are Confidence Intervals?])[ + #side-by-side(columns: (4fr, 1fr))[ #v(2em) #quote( - block: true, attribution: [ + block: true, + attribution: [ #cite(, form: "prose") the "father" of confidence intervals ], )[ @@ -804,12 +777,8 @@ ] ] -#slide( - title: [What are Confidence Intervals?], -)[ - #text( - size: 18pt, - )[ +#slide(title: [What are Confidence Intervals?])[ + #text(size: 18pt)[ Say you performed a statistical analysis to compare the efficacy of a public policy between two groups and you obtain a difference between the mean of these groups. You can express this difference as a confidence interval. Often we @@ -824,25 +793,29 @@ The interval that we got in this particular instance is irrelevant and might as well be thrown away. 
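This long-run claim can be made concrete with a short simulation: rebuild the interval many times from fresh samples and count how often it covers the true mean. A minimal sketch, assuming Distributions.jl plus the Random and Statistics standard libraries; the true parameters, sample size, and number of replications are illustrative choices, not anything prescribed by the slides:

```julia
# Coverage reading of a 95% confidence interval: about 95% of the intervals
# built over repeated sampling contain the true mean -- a statement about the
# procedure, never about the single interval you happened to compute.
using Distributions, Random, Statistics

Random.seed!(123)
μ_true, σ_true = 2.0, 1.5   # illustrative "population" parameters
n, reps = 30, 10_000        # sample size and number of repeated experiments

covered = count(1:reps) do _
    y = rand(Normal(μ_true, σ_true), n)
    half = 1.96 * std(y) / sqrt(n)            # normal-approximation half-width
    mean(y) - half <= μ_true <= mean(y) + half
end

println(covered / reps)   # ≈ 0.95
```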
- #text( - fill: julia-red, - )[ + #text(fill: julia-red)[ Doesn't say anything about you *target population*, but about you *sample* in an insane process of *infinite sampling* ... ] ] ] -#slide( - title: [Confidence Intervals versus Posterior Intervals], -)[ - #align( - center, - )[ +#slide(title: [Confidence Intervals versus Posterior Intervals])[ + #align(center)[ #canvas( - length: 0.9cm, { + length: 0.9cm, + { plot.plot( - size: (16, 9), x-label: $θ$, y-label: none, x-tick-step: 5, y-tick-step: 0.25, x-min: -0.1, x-max: 4, y-min: -0.01, y-max: 1.5, { + size: (16, 9), + x-label: $θ$, + y-label: none, + x-tick-step: 5, + y-tick-step: 0.25, + x-min: -0.1, + x-max: 4, + y-min: -0.01, + y-max: 1.5, + { plot.add(domain: (0.02, 4), samples: 200, x => lognormal(x, 0, 2)) plot.add( domain: (0.25950495026507125, 3.8534910373715427), samples: 200, style: plot.palette.blue, hypograph: true, @@ -869,21 +842,18 @@ #align(center)[#image("images/memes/assumptions-vs-reality.jpg")] ] -#slide( - title: [But why I never see stats without $p$-values?], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ - #text( - size: 17pt, - )[ +#slide(title: [But why I never see stats without $p$-values?])[ + #side-by-side(columns: (4fr, 1fr))[ + #text(size: 17pt)[ We cannot understand $p$-values if we do no not comprehend its origins and historical trajectory. The first mention of $p$-values was made by the statistician Ronald Fischer in 1925: - #quote(block: true, attribution: [ - #cite(, form: "prose") - ])[ + #quote( + block: true, + attribution: [ + #cite(, form: "prose") + ], + )[ $p$-value is a measure of evidence against the null hypothesis ] - To quantify the strength of the evidence against the null hypothesis, Fisher @@ -897,9 +867,7 @@ ] ] -#slide( - title: [$p = 0.06$], -)[ +#slide(title: [$p = 0.06$])[ - Since $p$-value is a probability, it is also a continuous measure. @@ -913,21 +881,18 @@ much as the $.05$" @rosnow1989statistical. ] -#slide( - title: [ - But why I never heard about Bayesian statistics? #footnote[ +#slide(title: [ + But why I never heard about Bayesian statistics? #footnote[ _inverse probability_ was how Bayes' theorem was called in the beginning of the 20th century. ] - ], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ +])[ + #side-by-side(columns: (4fr, 1fr))[ #v(3em) #quote( - block: true, attribution: [ + block: true, + attribution: [ #cite(, form: "prose") ], )[ @@ -940,17 +905,13 @@ ] ] -#slide( - title: [ - Inside every nonBayesian, there is a Bayesian struggling to get out - #footnote[ - quote from Dennis Lindley. - ] - ], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ +#slide(title: [ + Inside every nonBayesian, there is a Bayesian struggling to get out + #footnote[ + quote from Dennis Lindley. 
+ ] +])[ + #side-by-side(columns: (4fr, 1fr))[ #v(2em) - In his final year of life, Fisher published a paper @@ -966,19 +927,14 @@ ] ] -#slide( - title: [Bayes' Theorem as an Inference Engine], -)[ - #text( - size: 13pt, - )[ +#slide(title: [Bayes' Theorem as an Inference Engine])[ + #text(size: 13pt)[ Now that you know what is probability and Bayes' theorem, I will propose the following: $ underbrace(P(θ | y), "Posterior") = - (overbrace(P(y | θ), "Likelihood") dot overbrace(P(θ), "Prior")) / - underbrace(P(y), "Normalizing Constant") + (overbrace(P(y | θ), "Likelihood") dot overbrace(P(θ), "Prior")) / underbrace(P(y), "Normalizing Constant") $ - $θ$ -- parameter(s) of interest @@ -1000,9 +956,7 @@ ] ] -#slide( - title: [Bayes' Theorem as an Inference Engine], -)[ +#slide(title: [Bayes' Theorem as an Inference Engine])[ Bayesian statistics allows us to *quantify directly the uncertainty* related to the value of one or more parameters of our model given the observed data. @@ -1019,19 +973,28 @@ ] // typstfmt::off -#slide( -title: [Bayesian vs Frequentist Stats], -)[ +#slide(title: [Bayesian vs Frequentist Stats])[ #table( columns: (auto, auto, auto), inset: 10pt, align: (left + horizon, center + horizon, center + horizon), - [], [#text(fill: julia-blue)[Bayesian Statistics]], [#text(fill: julia-red)[Frequentist Statistics]], - [*Data*], [Fixed -- Non-random], [Uncertain -- Random], - [*Parameters*], [Uncertain -- Random], [Fixed -- Non-random], - [*Inference*], [Uncertainty regarding the parameter value], [Uncertainty regarding the sampling process from an infinite population], - [*Probability*], [Subjective], [Objective (but with several model assumptions)], - [*Uncertainty*], [Posterior Interval -- $P(θ | y)$], [Confidence Interval -- $P(y | θ)$], + [], + [#text(fill: julia-blue)[Bayesian Statistics]], + [#text(fill: julia-red)[Frequentist Statistics]], + + [*Data*], [Fixed -- Non-random], [Uncertain -- Random], + [*Parameters*], [Uncertain -- Random], [Fixed -- Non-random], + [*Inference*], + [Uncertainty regarding the parameter value], + [Uncertainty regarding the sampling process from an infinite population], + + [*Probability*], + [Subjective], + [Objective (but with several model assumptions)], + + [*Uncertainty*], + [Posterior Interval -- $P(θ | y)$], + [Confidence Interval -- $P(y | θ)$], ) ] // typstfmt::on @@ -1053,31 +1016,27 @@ title: [Bayesian vs Frequentist Stats], ] #slide(title: [The beginning of the end of Frequentist Statistics])[ - #text( - size: 18pt - )[ - - Know that you are in a very special moment in history of great changes in statistics - - - I believe that frequentist statistics, specially the way we qualify evidence and hypotheses with - $p$-values will transform in a "significant" #footnote[pun intended ...] way. - - - 8 years ago, the _American Statistical Association_ (ASA) published a declaration about - $p$-values @Wasserstein2016. - It states exactly what we exposed here: - The main concepts of the null hypothesis significant testing and, - in particular $p$-values, cannot provide what researchers demand of them. - Despite what says several textbooks, learning materials and published content, - $p$-values below $0.05$ doesn't "prove" anything. - Not, on the other way around, $p$-values higher than $0.05$ refute anything. - - - ASA statement has more than 4.700 citations with relevant impact. 
+ #text(size: 18pt)[ + - Know that you are in a very special moment in history of great changes in statistics + + - I believe that frequentist statistics, specially the way we qualify evidence and hypotheses with + $p$-values will transform in a "significant" #footnote[pun intended ...] way. + + - 8 years ago, the _American Statistical Association_ (ASA) published a declaration about + $p$-values @Wasserstein2016. + It states exactly what we exposed here: + The main concepts of the null hypothesis significant testing and, + in particular $p$-values, cannot provide what researchers demand of them. + Despite what says several textbooks, learning materials and published content, + $p$-values below $0.05$ doesn't "prove" anything. + Not, on the other way around, $p$-values higher than $0.05$ refute anything. + + - ASA statement has more than 4.700 citations with relevant impact. ] ] #slide(title: [The beginning of the end of Frequentist Statistics])[ - #text( - size: 17pt - )[ + #text(size: 17pt)[ - An international symposium was promoted in 2017 which originated an open-access special edition of _The American Statistician_ dedicated to practical ways to abandon $p < 0.05$ @wassersteinMovingWorld052019. diff --git a/slides/02-statistical_distributions.typ b/slides/02-statistical_distributions.typ index be39ee5..ccde8d8 100644 --- a/slides/02-statistical_distributions.typ +++ b/slides/02-statistical_distributions.typ @@ -22,9 +22,7 @@ #align(center)[#image("images/memes/statistical_distributions.jpg")] ] -#slide( - title: [Probability Distributions], -)[ +#slide(title: [Probability Distributions])[ Bayesian statistics uses probability distributions as the inference engine of the parameter and uncertainty estimates. @@ -39,9 +37,7 @@ relationships. ] -#slide( - title: [Probability Distribution Function], -)[ +#slide(title: [Probability Distribution Function])[ A probability distribution function is a mathematical function that outputs the probabilities for different results of an experiment. It is a mathematical description of a random phenomena in terms of its sample space and the event @@ -53,9 +49,7 @@ variables, we define as "density". ] -#slide( - title: [Mathematical Notation], -)[ +#slide(title: [Mathematical Notation])[ We use the notation $ X tilde "Dist"(θ_1, θ_2, dots) $ @@ -71,18 +65,24 @@ that allow to control certain distribution aspects for a specific goal. 
] -#slide( - title: [Probability Distribution Function], -)[ - #align( - center, - )[ +#slide(title: [Probability Distribution Function])[ + #align(center)[ #canvas( - length: 0.9cm, { + length: 0.9cm, + { plot.plot( - size: (16, 9), x-label: $X$, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, y-max: 0.45, { + size: (16, 9), + x-label: $X$, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + y-max: 0.45, + { plot.add( - domain: (-4, 4), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => gaussian(x, 0, 1), + domain: (-4, 4), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => gaussian(x, 0, 1), ) }, ) @@ -91,9 +91,7 @@ ] ] -#slide( - title: [Cumulative Distribution Function], -)[ +#slide(title: [Cumulative Distribution Function])[ The cumulative distribution function (CDF) of a random variable $X$ evaluated at $x$ is the probability that $X$ will take values less or qual than $x$: @@ -101,18 +99,25 @@ $ "CDF" = P(X <= x) $ ] -#slide( - title: [Cumulative Distribution Function], -)[ - #align( - center, - )[ +#slide(title: [Cumulative Distribution Function])[ + #align(center)[ #canvas( - length: 0.9cm, { + length: 0.9cm, + { plot.plot( - size: (16, 9), x-label: $X$, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.25, y-min: -0.01, y-max: 1.01, { + size: (16, 9), + x-label: $X$, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.25, + y-min: -0.01, + y-max: 1.01, + { plot.add( - domain: (-4, 4), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => normcdf(x, 0, 1), + domain: (-4, 4), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => normcdf(x, 0, 1), ) }, ) @@ -121,9 +126,7 @@ ] ] -#slide( - title: [Discrete Distributions], -)[ +#slide(title: [Discrete Distributions])[ Discrete probability distributions are distributions which the results are a discrete number: $-N, dots, -2, 1, 0, 1, 2, dots, N$ and $N ∈ ZZ$. @@ -139,9 +142,7 @@ $ "PMF"(x) = P(X = x) $ ] -#slide( - title: [Discrete Uniform], -)[ +#slide(title: [Discrete Uniform])[ The discrete uniform is a symmetric probability distribution in which a finite number of values are equally likely of being observable. Each one of the $n$ values have probability $1 / n$. @@ -159,37 +160,47 @@ Example: dice. ] -#slide( - title: [Discrete Uniform], -)[ +#slide(title: [Discrete Uniform])[ #v(4em) - $ "Uniform"(a, b) = f(x, a, b) = 1 / (b - a + 1) "for" a <= x <= b "and" x ∈ {a, a + 1, dots ,b - 1, b} $ + $ + "Uniform"(a, b) = f(x, a, b) = 1 / (b - a + 1) "for" a <= x <= b "and" x ∈ { + a, a + 1, dots ,b - 1, b + } + $ ] -#slide( - title: [Discrete Uniform], -)[ - #align( - center, - )[ +#slide(title: [Discrete Uniform])[ + #align(center)[ #figure( { let data = for i in range(1, 7) { ((discreteuniform(1, 6), i),) } - let x_axis = axis(min: 0, max: 7, step: 1, location: "bottom", title: none) - let y_axis = axis(min: 0, max: 0.21, step: 0.1, location: "left", title: $"PMF"$) + let x_axis = axis( + min: 0, + max: 7, + step: 1, + location: "bottom", + title: none, + ) + let y_axis = axis( + min: 0, + max: 0.21, + step: 0.1, + location: "left", + title: $"PMF"$, + ) let pl = pplot(data: data, axes: ((x_axis, y_axis))) bar_chart(pl, (350pt, 275pt), bar_width: 70%, caption: none) - }, caption: [$a = 1, b = 6$], numbering: none, + }, + caption: [$a = 1, b = 6$], + numbering: none, ) ] ] -#slide( - title: [Bernoulli], -)[ +#slide(title: [Bernoulli])[ Bernoulli distribution describes a binary event of the success of an experiment. 
We represent $0$ as failure and $1$ as success, hence the result of a Bernoulli distribution is a binary variable $Y ∈ {0, 1}$. @@ -213,29 +224,34 @@ $ "Bernoulli"(p) = f(x, p)=p^(x)(1 - p)^(1 - x) "for" x ∈ {0, 1} $ ] -#slide( - title: [Bernoulli], -)[ - #align( - center, - )[ +#slide(title: [Bernoulli])[ + #align(center)[ #figure( { let data = ((0.666, "0"), (0.333, "1")) - let x_axis = axis(values: ("", "0", "1"), location: "bottom", title: none) + let x_axis = axis( + values: ("", "0", "1"), + location: "bottom", + title: none, + ) let y_axis = axis( - min: 0, max: 0.7, step: 0.2, value_formatter: "{:.1}", location: "left", title: $"PMF"$, + min: 0, + max: 0.7, + step: 0.2, + value_formatter: "{:.1}", + location: "left", + title: $"PMF"$, ) let pl = pplot(data: data, axes: ((x_axis, y_axis))) bar_chart(pl, (350pt, 275pt), bar_width: 70%, caption: none) - }, caption: [$p = 1 / 3$], numbering: none, + }, + caption: [$p = 1 / 3$], + numbering: none, ) ] ] -#slide( - title: [Binomial], -)[ +#slide(title: [Binomial])[ The binomial distribution describes an event in which the number of successes in a sequence $n$ independent experiments, each one making a yes--no question with probability of success $p$. Notice that Bernoulli distribution is a special case @@ -254,21 +270,19 @@ Example: number of heads in five coin throws. ] -#slide( - title: [Binomial], -)[ +#slide(title: [Binomial])[ #v(4em) - $ "Binomial"(n,p) = f(x, n, p) = binom(n, x)p^(x)(1-p)^(n-x) "for" x ∈ {0, 1, dots, n} $ + $ + "Binomial"(n,p) = f(x, n, p) = binom(n, x)p^(x)(1-p)^(n-x) "for" x ∈ { + 0, 1, dots, n + } + $ ] -#slide( - title: [Binomial], -)[ +#slide(title: [Binomial])[ #side-by-side[ - #align( - center, - )[ + #align(center)[ #figure( { let data = for i in range(0, 11) { @@ -279,17 +293,22 @@ x_ticks.insert(0, "") let x_axis = axis(values: x_ticks, location: "bottom", title: none) let y_axis = axis( - min: 0, max: 0.5, step: 0.1, value_formatter: "{:.1}", location: "left", title: $"PMF"$, + min: 0, + max: 0.5, + step: 0.1, + value_formatter: "{:.1}", + location: "left", + title: $"PMF"$, ) let pl = pplot(data: data, axes: ((x_axis, y_axis))) bar_chart(pl, (350pt, 275pt), bar_width: 70%, caption: none) - }, caption: [$n = 10, p = 1 / 5$], numbering: none, + }, + caption: [$n = 10, p = 1 / 5$], + numbering: none, ) ] ][ - #align( - center, - )[ + #align(center)[ #figure( { let data = for i in range(0, 11) { @@ -300,19 +319,24 @@ x_ticks.insert(0, "") let x_axis = axis(values: x_ticks, location: "bottom", title: none) let y_axis = axis( - min: 0, max: 0.5, step: 0.1, value_formatter: "{:.1}", location: "left", title: $"PMF"$, + min: 0, + max: 0.5, + step: 0.1, + value_formatter: "{:.1}", + location: "left", + title: $"PMF"$, ) let pl = pplot(data: data, axes: ((x_axis, y_axis))) bar_chart(pl, (350pt, 275pt), bar_width: 70%, caption: none) - }, caption: [$n = 10, p = 1 / 2$], numbering: none, + }, + caption: [$n = 10, p = 1 / 2$], + numbering: none, ) ] ] ] -#slide( - title: [Poisson], -)[ +#slide(title: [Poisson])[ Poisson distribution describes the probability of a certain number of events occurring in a fixed time interval if these events occur with a constant mean rate which is known and independent since the time of last occurrence. Poisson @@ -336,13 +360,9 @@ $ "Poisson"(λ) = f(x, λ) = (λ^x e^(-λ)) / (x!) 
"for" λ > 0 $ ] -#slide( - title: [Poisson], -)[ +#slide(title: [Poisson])[ #side-by-side[ - #align( - center, - )[ + #align(center)[ #figure( { let data = for i in range(0, 9) { @@ -353,17 +373,22 @@ x_ticks.insert(0, "") let x_axis = axis(values: x_ticks, location: "bottom", title: none) let y_axis = axis( - min: 0, max: 0.5, step: 0.1, value_formatter: "{:.1}", location: "left", title: $"PMF"$, + min: 0, + max: 0.5, + step: 0.1, + value_formatter: "{:.1}", + location: "left", + title: $"PMF"$, ) let pl = pplot(data: data, axes: ((x_axis, y_axis))) bar_chart(pl, (350pt, 275pt), bar_width: 70%, caption: none) - }, caption: [$λ = 2$], numbering: none, + }, + caption: [$λ = 2$], + numbering: none, ) ] ][ - #align( - center, - )[ + #align(center)[ #figure( { let data = for i in range(0, 9) { @@ -374,27 +399,30 @@ x_ticks.insert(0, "") let x_axis = axis(values: x_ticks, location: "bottom", title: none) let y_axis = axis( - min: 0, max: 0.5, step: 0.1, value_formatter: "{:.1}", location: "left", title: $"PMF"$, + min: 0, + max: 0.5, + step: 0.1, + value_formatter: "{:.1}", + location: "left", + title: $"PMF"$, ) let pl = pplot(data: data, axes: ((x_axis, y_axis))) bar_chart(pl, (350pt, 275pt), bar_width: 70%, caption: none) - }, caption: [$λ = 3$], numbering: none, + }, + caption: [$λ = 3$], + numbering: none, ) ] ] ] -#slide( - title: [ - Negative Binomial #footnote[ +#slide(title: [ + Negative Binomial #footnote[ any phenomena that can be modeles as a Poisson distribution can be modeled also as negative binomial distribution @gelman2013bayesian, @gelman2020regression. ] - ], -)[ - #text( - size: 16pt, - )[ +])[ + #text(size: 16pt)[ The binomial distribution describes an event in which the number of successes in a sequence $n$ independent experiments, each one making a yes--no question with probability of success $p$ @@ -415,24 +443,20 @@ ] ] -#slide( - title: [Negative Binomial], -)[ +#slide(title: [Negative Binomial])[ #v(4em) $ - "Negative Binomial"(k, p) &= f(x, k, p) &= binom(x + k - 1, k - 1)p^(x)(1-p)^(k) \ + "Negative Binomial"(k, p) &= f(x, k, p) &= binom(x + k - 1, k - 1)p^(x)( + 1-p + )^(k) \ \ - & &"for" x ∈ {0, 1, dots, n} + & &"for" x ∈ {0, 1, dots, n} $ ] -#slide( - title: [Negative Binomial], -)[ +#slide(title: [Negative Binomial])[ #side-by-side[ - #align( - center, - )[ + #align(center)[ #figure( { let data = for i in range(0, 9) { @@ -443,17 +467,22 @@ x_ticks.insert(0, "") let x_axis = axis(values: x_ticks, location: "bottom", title: none) let y_axis = axis( - min: 0, max: 0.5, step: 0.1, value_formatter: "{:.1}", location: "left", title: $"PMF"$, + min: 0, + max: 0.5, + step: 0.1, + value_formatter: "{:.1}", + location: "left", + title: $"PMF"$, ) let pl = pplot(data: data, axes: ((x_axis, y_axis))) bar_chart(pl, (350pt, 275pt), bar_width: 70%, caption: none) - }, caption: [$k = 1, p = 1 / 2$], numbering: none, + }, + caption: [$k = 1, p = 1 / 2$], + numbering: none, ) ] ][ - #align( - center, - )[ + #align(center)[ #figure( { let data = for i in range(0, 9) { @@ -464,22 +493,25 @@ x_ticks.insert(0, "") let x_axis = axis(values: x_ticks, location: "bottom", title: none) let y_axis = axis( - min: 0, max: 0.5, step: 0.1, value_formatter: "{:.1}", location: "left", title: $"PMF"$, + min: 0, + max: 0.5, + step: 0.1, + value_formatter: "{:.1}", + location: "left", + title: $"PMF"$, ) let pl = pplot(data: data, axes: ((x_axis, y_axis))) bar_chart(pl, (350pt, 275pt), bar_width: 70%, caption: none) - }, caption: [$k = 5, p = 1 / 2$], numbering: none, + }, + caption: [$k = 5, p = 
1 / 2$], + numbering: none, ) ] ] ] -#slide( - title: [Continuous Distributions], -)[ - #text( - size: 16pt, - )[ +#slide(title: [Continuous Distributions])[ + #text(size: 16pt)[ Continuous probability distributions are distributions which the results are values in a continuous real number line: $(-oo, +oo) ∈ RR$. @@ -500,9 +532,7 @@ ] ] -#slide( - title: [Continuous Uniform], -)[ +#slide(title: [Continuous Uniform])[ The continuous uniform distribution is a symmetric probability distribution in which an infinite number of value intervals are equally likely of being observable. Each one of the infinite $n$ intervals have probability $1 / n$. @@ -515,41 +545,48 @@ - $b$ -- upper bound ] -#slide( - title: [Continuous Uniform], -)[ +#slide(title: [Continuous Uniform])[ #v(4em) $ "Uniform"(a,b) = f(x, a, b) = 1 / (b-a) "for" a <= x <= b "and" x ∈ [a, b] $ ] -#slide( - title: [Continuous Uniform], -)[ - #align( - center, - )[ +#slide(title: [Continuous Uniform])[ + #align(center)[ #figure( { canvas( - length: 0.9cm, { + length: 0.9cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: 0, x-max: 6, y-max: 0.4, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: 0, + x-max: 6, + y-max: 0.4, + y-min: 0, + { plot.add( - domain: (0, 6), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => continuousuniform(0, 6), + domain: (0, 6), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => continuousuniform(0, 6), ) }, ) }, ) - }, caption: [$a = 0, b = 6$], numbering: none, + }, + caption: [$a = 0, b = 6$], + numbering: none, ) ] ] -#slide( - title: [Normal], -)[ +#slide(title: [Normal])[ This distribution is generally used in social and natural sciences to represent continuous variables in which its underlying distribution are unknown. @@ -568,9 +605,7 @@ similar to normal distributions. ] -#slide( - title: [Normal], -)[ +#slide(title: [Normal])[ The normal distribution has two parameters and its notation is $"Normal"(μ, σ)$ or $"N"(μ, σ)$: @@ -583,13 +618,11 @@ Example: height, weight etc. ] -#slide( - title: [Normal #footnote[ +#slide(title: [Normal #footnote[ see how the normal distribution was derived from the binomial distribution in the backup slides. 
] - ], -)[ +])[ #v(4em) $ @@ -598,55 +631,77 @@ $ ] -#slide( - title: [Normal], -)[ +#slide(title: [Normal])[ #side-by-side[ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: -4, x-max: 4, y-max: 0.65, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: -4, + x-max: 4, + y-max: 0.65, + y-min: 0, + { plot.add( - domain: (-4, 4), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => gaussian(x, 0, 1), + domain: (-4, 4), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => gaussian(x, 0, 1), ) }, ) }, ) - }, caption: [$μ = 0, σ = 1$], numbering: none, + }, + caption: [$μ = 0, σ = 1$], + numbering: none, ) ] ][ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: -4, x-max: 4, y-max: 0.65, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: -4, + x-max: 4, + y-max: 0.65, + y-min: 0, + { plot.add( - domain: (-4, 4), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => gaussian(x, 1, 2 / 3), + domain: (-4, 4), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => gaussian(x, 1, 2 / 3), ) }, ) }, ) - }, caption: [$μ = 1, σ = 2 / 3$], numbering: none, + }, + caption: [$μ = 1, σ = 2 / 3$], + numbering: none, ) ] ] ] -#slide( - title: [Log-Normal], -)[ +#slide(title: [Log-Normal])[ The log-normal distribution is a continuous probability distribution of a random variable which its natural logarithm is distributed as a normal distribution. Thus, if the natural logarithm a random variable $X$, $ln(X)$, is distributed as @@ -662,9 +717,7 @@ of many independent positive random variables. 
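This relationship between the normal and the log-normal is easy to verify by simulation. A minimal sketch, assuming Distributions.jl together with the Random and Statistics standard libraries (the sample size is an arbitrary illustrative choice):

```julia
# ln(X) ~ Normal(μ, σ)  ⟺  X ~ LogNormal(μ, σ): check it by exponentiating normal draws.
using Distributions, Random, Statistics

Random.seed!(1)
μ, σ = 0.0, 1.0
x = exp.(rand(Normal(μ, σ), 100_000))   # exponentiated normal draws

d = LogNormal(μ, σ)
println((mean(x), mean(d)))       # both ≈ exp(μ + σ^2 / 2) ≈ 1.65
println((median(x), median(d)))   # both ≈ exp(μ) = 1.0
```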
] -#slide( - title: [Log-Normal], -)[ +#slide(title: [Log-Normal])[ The log-normal distribution has two parameters and its notation is $"Log-Normal"(μ, σ^2)$: @@ -674,63 +727,87 @@ - $σ$ -- square root of the variance of the distribution's natural logarithm ] -#slide( - title: [Log-Normal], -)[ +#slide(title: [Log-Normal])[ #v(4em) - $ "Log-Normal"(μ,σ) = f(x, μ, σ) = 1 / (x σ sqrt(2π))e^((-ln(x) - μ)^2 / (2 σ^2)) "for" σ > 0 $ + $ + "Log-Normal"(μ,σ) = f(x, μ, σ) = 1 / (x σ sqrt(2π))e^(( + -ln(x) - μ + )^2 / (2 σ^2)) "for" σ > 0 + $ ] -#slide( - title: [Log-Normal], -)[ +#slide(title: [Log-Normal])[ #side-by-side[ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: 0, x-max: 5, y-max: 0.7, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: 0, + x-max: 5, + y-max: 0.7, + y-min: 0, + { plot.add( - domain: (0.001, 5), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => lognormal(x, 0, 1), + domain: (0.001, 5), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => lognormal(x, 0, 1), ) }, ) }, ) - }, caption: [$μ = 0, σ = 1$], numbering: none, + }, + caption: [$μ = 0, σ = 1$], + numbering: none, ) ] ][ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: 0, x-max: 5, y-max: 0.7, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: 0, + x-max: 5, + y-max: 0.7, + y-min: 0, + { plot.add( - domain: (0.001, 5), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => lognormal(x, 1, 2 / 3), + domain: (0.001, 5), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => lognormal(x, 1, 2 / 3), ) }, ) }, ) - }, caption: [$μ = 1, σ = 2 / 3$], numbering: none, + }, + caption: [$μ = 1, σ = 2 / 3$], + numbering: none, ) ] ] ] -#slide( - title: [Exponential], -)[ +#slide(title: [Exponential])[ The exponential distribution is the probability distribution of the time between events that occurs in a continuous manner, are independent, and have constant mean rate of occurrence. 
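The "constant mean rate" property is equivalent to memorylessness: having already waited $s$ units of time tells you nothing about the remaining wait. A minimal sketch, assuming Distributions.jl; note that this package parameterizes `Exponential` by the scale $θ = 1 / λ$, which is a library convention rather than anything implied by the slides:

```julia
# Memorylessness of the exponential: P(X > s + t | X > s) = P(X > t).
using Distributions

λ = 2.0                  # rate, as in the slides' parameterization
d = Exponential(1 / λ)   # Distributions.jl takes the scale θ = 1/λ

s, t = 0.7, 1.3
lhs = ccdf(d, s + t) / ccdf(d, s)   # P(X > s + t) / P(X > s)
rhs = ccdf(d, t)                    # P(X > t)

println(lhs ≈ rhs)   # true: the remaining wait "forgets" the time already waited
```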
@@ -754,55 +831,77 @@ $ "Exponential"(λ) = f(x, λ) = λ e^(-λ x) "for" λ > 0 $ ] -#slide( - title: [Exponential], -)[ +#slide(title: [Exponential])[ #side-by-side[ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.25, x-min: 0, x-max: 5, y-max: 0.95, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.25, + x-min: 0, + x-max: 5, + y-max: 0.95, + y-min: 0, + { plot.add( - domain: (0.001, 5), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => exponential(x, 1), + domain: (0.001, 5), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => exponential(x, 1), ) }, ) }, ) - }, caption: [$λ = 1$], numbering: none, + }, + caption: [$λ = 1$], + numbering: none, ) ] ][ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.25, x-min: 0, x-max: 5, y-max: 0.95, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.25, + x-min: 0, + x-max: 5, + y-max: 0.95, + y-min: 0, + { plot.add( - domain: (0.001, 5), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => exponential(x, 1 / 2), + domain: (0.001, 5), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => exponential(x, 1 / 2), ) }, ) }, ) - }, caption: [$λ = 1 / 2$], numbering: none, + }, + caption: [$λ = 1 / 2$], + numbering: none, ) ] ] ] -#slide( - title: [Gamma], -)[ +#slide(title: [Gamma])[ The gamma distribution is a long-tailed distribution with support only for positive real numbers. @@ -819,66 +918,88 @@ Example: Any waiting time can be modelled with a gamma distribution. 
] -#slide( - title: [Gamma], -)[ +#slide(title: [Gamma])[ #v(4em) - $ "Gamma"(α, θ) = f(x, α, θ) = (x^(α-1) e^(-x / θ)) / (Γ(α) θ^α) "for" x, α, θ > 0 $ + $ + "Gamma"(α, θ) = f(x, α, θ) = (x^(α-1) e^(-x / θ)) / (Γ( + α + ) θ^α) "for" x, α, θ > 0 + $ ] -#slide( - title: [Gamma], -)[ +#slide(title: [Gamma])[ #side-by-side[ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.2, x-min: 0, x-max: 6, y-max: 0.95, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.2, + x-min: 0, + x-max: 6, + y-max: 0.95, + y-min: 0, + { plot.add( - domain: (0.001, 6), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => gammadist(x, 1, 1), + domain: (0.001, 6), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => gammadist(x, 1, 1), ) }, ) }, ) - }, caption: [$α = 1, θ = 1$], numbering: none, + }, + caption: [$α = 1, θ = 1$], + numbering: none, ) ] ][ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.2, x-min: 0, x-max: 6, y-max: 0.95, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.2, + x-min: 0, + x-max: 6, + y-max: 0.95, + y-min: 0, + { plot.add( - domain: (0.001, 6), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => gammadist(x, 2, 1 / 2), + domain: (0.001, 6), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => gammadist(x, 2, 1 / 2), ) }, ) }, ) - }, caption: [$α = 2, θ = 1 / 2$], numbering: none, + }, + caption: [$α = 2, θ = 1 / 2$], + numbering: none, ) ] ] ] -#slide( - title: [Student's $t$], -)[ - #text( - size: 18pt, - )[ +#slide(title: [Student's $t$])[ + #text(size: 18pt)[ Student's $t$ distribution arises by estimating the mean of a normally-distributed population in situations where the sample size is small and the standard deviation is known #footnote[this is where the ubiquitous Student's $t$ test.]. @@ -899,9 +1020,7 @@ ] ] -#slide( - title: [Student's $t$], -)[ +#slide(title: [Student's $t$])[ Student's $t$ distribution has one parameter and its notation is $"Student"(ν)$: @@ -912,63 +1031,87 @@ Example: a dataset full of outliers. 
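The "dataset full of outliers" point becomes concrete when you compare log-densities far from the center. A minimal sketch, assuming Distributions.jl, where `TDist(ν)` is that package's standard Student's $t$ with $ν$ degrees of freedom:

```julia
# Student's t penalizes an outlier far less harshly than the normal does.
using Distributions

outlier = 10.0
println(logpdf(Normal(0, 1), outlier))   # ≈ -50.9: the normal treats it as near-impossible
println(logpdf(TDist(3), outlier))       # ≈ -8.1: the t's fat tails keep it plausible
```

In a likelihood, a single such point therefore drags normal-based estimates much more than $t$-based ones, which is why Student's $t$ shows up in robust models.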
] -#slide( - title: [Student's $t$], -)[ +#slide(title: [Student's $t$])[ #v(4em) - $ "Student"(ν) = f(x, ν) = (Γ ((ν + 1) / 2) ) / (sqrt(ν π) Γ (ν / 2 )) (1+ x^2 / ν)^(-(ν+1) / 2) "for" ν >= 1 $ + $ + "Student"(ν) = f(x, ν) = (Γ ((ν + 1) / 2) ) / (sqrt(ν π) Γ (ν / 2)) ( + 1+ x^2 / ν + )^(-(ν+1) / 2) "for" ν >= 1 + $ ] -#slide( - title: [Student's $t$], -)[ +#slide(title: [Student's $t$])[ #side-by-side[ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: -4, x-max: 4, y-max: 0.45, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: -4, + x-max: 4, + y-max: 0.45, + y-min: 0, + { plot.add( - domain: (-4, 4), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => student(x, 1), + domain: (-4, 4), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => student(x, 1), ) }, ) }, ) - }, caption: [$ν = 1$], numbering: none, + }, + caption: [$ν = 1$], + numbering: none, ) ] ][ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: -4, x-max: 4, y-max: 0.45, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: -4, + x-max: 4, + y-max: 0.45, + y-min: 0, + { plot.add( - domain: (-4, 4), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => student(x, 3), + domain: (-4, 4), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => student(x, 3), ) }, ) }, ) - }, caption: [$ν = 3$], numbering: none, + }, + caption: [$ν = 3$], + numbering: none, ) ] ] ] -#slide( - title: [Cauchy], -)[ +#slide(title: [Cauchy])[ The Cauchy distribution is bell-shaped distribution and a special case for Student's $t$ with $ν = 1$. 
@@ -989,58 +1132,80 @@ #slide(title: [Cauchy])[ #v(4em) - $ "Cauchy"(μ, σ) = 1 / (π σ (1 + ((x - μ) / σ )^2 )) "for" σ >= 0 $ + $ "Cauchy"(μ, σ) = 1 / (π σ (1 + ((x - μ) / σ)^2)) "for" σ >= 0 $ ] -#slide( - title: [Cauchy], -)[ +#slide(title: [Cauchy])[ #side-by-side[ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: -4, x-max: 4, y-max: 0.65, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: -4, + x-max: 4, + y-max: 0.65, + y-min: 0, + { plot.add( - domain: (-4, 4), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => cauchy(x, 0, 1), + domain: (-4, 4), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => cauchy(x, 0, 1), ) }, ) }, ) - }, caption: [$μ = 0, σ = 1$], numbering: none, + }, + caption: [$μ = 0, σ = 1$], + numbering: none, ) ] ][ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: -4, x-max: 4, y-max: 0.65, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: -4, + x-max: 4, + y-max: 0.65, + y-min: 0, + { plot.add( - domain: (-4, 4), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => cauchy(x, 0, 1 / 2), + domain: (-4, 4), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => cauchy(x, 0, 1 / 2), ) }, ) }, ) - }, caption: [$μ = 0, σ = 1 / 2$], numbering: none, + }, + caption: [$μ = 0, σ = 1 / 2$], + numbering: none, ) ] ] ] -#slide( - title: [Beta], -)[ +#slide(title: [Beta])[ The beta distribution is a natural choice to model anything that is restricted to values between $0$ e $1$. Hence, it is a good candidate to model probabilities and proportions. 
@@ -1061,55 +1226,81 @@ in a total of 8 attempts -- $"Beta"(3, 5)$ ] -#slide( - title: [Beta], -)[ +#slide(title: [Beta])[ #v(4em) - $ "Beta"(α, β) = f(x, α, β) (x^(α - 1)(1 - x)^(β - 1)) / ((Γ(α)Γ(β)) / (Γ(α +β ))) "for" α, β > 0 "and" x ∈ [0, 1] $ + $ + "Beta"(α, β) = f(x, α, β) (x^(α - 1)(1 - x)^(β - 1)) / ((Γ(α)Γ(β)) / (Γ( + α +β + ))) "for" α, β > 0 "and" x ∈ [0, 1] + $ ] -#slide( - title: [Beta], -)[ +#slide(title: [Beta])[ #side-by-side[ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: 0, x-max: 1, y-max: 0.3, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: 0, + x-max: 1, + y-max: 0.3, + y-min: 0, + { plot.add( - domain: (0.0001, 0.9999), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => beta(x, 1, 1), + domain: (0.0001, 0.9999), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => beta(x, 1, 1), ) }, ) }, ) - }, caption: [$α = 1, β = 1$], numbering: none, + }, + caption: [$α = 1, β = 1$], + numbering: none, ) ] ][ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: 0, x-max: 1, y-max: 0.3, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: 0, + x-max: 1, + y-max: 0.3, + y-min: 0, + { plot.add( - domain: (0.0001, 0.9999), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => beta(x, 3, 2), + domain: (0.0001, 0.9999), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => beta(x, 3, 2), ) }, ) }, ) - }, caption: [$α = 3, β = 2$], numbering: none, + }, + caption: [$α = 3, β = 2$], + numbering: none, ) ] ] diff --git a/slides/03-priors.typ b/slides/03-priors.typ index db50373..ffa64b6 100644 --- a/slides/03-priors.typ +++ b/slides/03-priors.typ @@ -4,9 +4,7 @@ #new-section-slide("Priors") -#slide( - title: "Recommended References", -)[ +#slide(title: "Recommended References")[ - #cite(, form: "prose"): - Chapter 2: Single-parameter models @@ -26,17 +24,15 @@ #align(center)[#image("images/memes/priors.jpg")] ] -#slide( - title: "Prior Probability", -)[ +#slide(title: "Prior Probability")[ Bayesian statistics is characterized by the use of prior information as the prior probability $P(θ)$, often just prior: - $ underbrace(P(θ | y), "Posterior") = (overbrace(P(y | θ), "Likelihood") dot overbrace(P(θ), "Prior")) / underbrace(P(y), "Normalizing Constant") $ + $ + underbrace(P(θ | y), "Posterior") = (overbrace(P(y | θ), "Likelihood") dot overbrace(P(θ), "Prior")) / underbrace(P(y), "Normalizing Constant") + $ ] -#slide( - title: "The Subjectivity of the Prior", -)[ +#slide(title: "The Subjectivity of the Prior")[ - Many critics to Bayesian statistics are due the subjectivity in eliciting priors probability on certain hypothesis or model parameter's values. - Subjectivity is something unwanted in the ideal picture of the scientist and the @@ -51,9 +47,7 @@ @vandeschootBayesianStatisticsModelling2021. ] -#slide( - title: "How to Incorporate Subjectivity", -)[ +#slide(title: "How to Incorporate Subjectivity")[ - Bayesian statistics *embraces* subjectivity while frequentist statistics *bans* it. 
@@ -68,9 +62,7 @@ *declared and formalized* ] -#slide( - title: "Types of Priors", -)[ +#slide(title: "Types of Priors")[ In general, we can have 3 types of priors in a Bayesian approach @gelman2013bayesian @mcelreath2020statistical @vandeschootBayesianStatisticsModelling2021: @@ -82,9 +74,7 @@ - *informative*: introduction of medium to high domain knowledge. ] -#slide( - title: "Uniform Prior (Flat)", -)[ +#slide(title: "Uniform Prior (Flat)")[ Starts from the premise that "everything is possible". There is no limits in the degree of beliefs that the distribution of certain values must be or any sort of restrictions. @@ -100,9 +90,7 @@ - *model error or residuals*: ${σ ∈ RR^+ : 0 < θ < oo}$ ] -#slide( - title: "Weakly Uninformative Prior", -)[ +#slide(title: "Weakly Uninformative Prior")[ Here we start to have "educated" guess about our parameter values. Hence, we don't start from the premise that "anything is possible". @@ -122,12 +110,8 @@ - $θ tilde "Student"(ν=3, 0, 1)$ (Aki Vehtari's preferred choice #footnote()) ] -#slide( - title: "An Example of a Robust Prior", -)[ - #text( - size: 18pt, - )[ +#slide(title: "An Example of a Robust Prior")[ + #text(size: 18pt)[ A nice example comes from a Ben Goodrich's lecture #footnote[ https://youtu.be/p6cyRBWahRA, in case you want to see the full video, the section about priors related to the argument begins at minute 40 @@ -147,9 +131,7 @@ ] ] -#slide( - title: "Informative Prior", -)[ +#slide(title: "Informative Prior")[ In some contexts, it is interesting to use an informative prior. Good candidates are when data is scarce or expensive and prior knowledge about the phenomena is available. diff --git a/slides/04-bayesian_workflow.typ b/slides/04-bayesian_workflow.typ index fdace38..891493b 100644 --- a/slides/04-bayesian_workflow.typ +++ b/slides/04-bayesian_workflow.typ @@ -5,9 +5,7 @@ #new-section-slide("Bayesian Workflow") -#slide( - title: "Recommended References", -)[ +#slide(title: "Recommended References")[ - #cite(, form: "prose") - Chapter 6: Model checking - #cite(, form: "prose") - Chapter 4: Geocentric Models @@ -23,15 +21,12 @@ #align(center)[#image("images/memes/workflow.jpg")] ] -#slide( - title: "All Models Are Wrong", -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ - #align( - center + horizon, - )[#quote(block: true, attribution: [George Box @boxScienceStatistics1976])[ +#slide(title: "All Models Are Wrong")[ + #side-by-side(columns: (4fr, 1fr))[ + #align(center + horizon)[#quote( + block: true, + attribution: [George Box @boxScienceStatistics1976], + )[ All models are wrong but some are useful ]] ][ @@ -39,39 +34,33 @@ ] ] -#slide( - title: [Bayesian Workflow #footnote[ +#slide(title: [Bayesian Workflow #footnote[ based on #cite(, form: "prose") - ]], -)[ + ]])[ #v(2em) - #align( - center, - )[#cetz.canvas( - { - import cetz.draw: * - set-style( - mark: (start: ">", end: ">", fill: black, size: 0.3), stroke: (thickness: 2pt), radius: 3, - ) - circle((0, 0)) - content((0, 0), [#align(center)[Prior \ Elicitation]]) - line((3, 0), (7, 0)) - content((5, 2), [#align(center)[Prior \ Predictive \ Check]]) - circle((10, 0)) - content((10, 0), [#align(center)[Model \ Specification]]) - line((13, 0), (17, 0)) - content((15, 2), [#align(center)[Posterior \ Predictive \ Check]]) - circle((20, 0)) - content((20, 0), [#align(center)[Posterior \ Inference]]) - }, - )] + #align(center)[#cetz.canvas({ + import cetz.draw: * + set-style( + mark: (start: ">", end: ">", fill: black, size: 0.3), + stroke: (thickness: 2pt), + radius: 3, + ) + 
circle((0, 0)) + content((0, 0), [#align(center)[Prior \ Elicitation]]) + line((3, 0), (7, 0)) + content((5, 2), [#align(center)[Prior \ Predictive \ Check]]) + circle((10, 0)) + content((10, 0), [#align(center)[Model \ Specification]]) + line((13, 0), (17, 0)) + content((15, 2), [#align(center)[Posterior \ Predictive \ Check]]) + circle((20, 0)) + content((20, 0), [#align(center)[Posterior \ Inference]]) + })] ] -#slide( - title: [Bayesian Workflow #footnote[ +#slide(title: [Bayesian Workflow #footnote[ adapted from #link("https://github.com/elizavetasemenova")[Elizaveta Semenova]. - ]], -)[ + ]])[ - Understand the domain and problem. - Formulate the model mathematically. - Implement model, test, and debug. @@ -83,28 +72,24 @@ efficient models. ] -#slide( - title: "Actual Bayesian Workflow", -)[ +#slide(title: "Actual Bayesian Workflow")[ #figure( - image("images/workflow/workflow_overview.svg", height: 80%), caption: [Bayesian workflow by #cite(, form: "prose").], + image("images/workflow/workflow_overview.svg", height: 80%), + caption: [Bayesian workflow by #cite(, form: "prose").], ) ] -#slide( - title: [Not a "new idea"], -)[ +#slide(title: [Not a "new idea"])[ #figure( - image("images/workflow/box_loop.png", height: 80%), caption: [Box's Loop from + image("images/workflow/box_loop.png", height: 80%), + caption: [Box's Loop from #cite(, form: "prose") but taken from #cite(, form: "prose").], ) ] -#slide( - title: "Prior Predictive Check", -)[ +#slide(title: "Prior Predictive Check")[ Before we feed data into our model, we need to check all of our priors. #v(1em) @@ -120,12 +105,8 @@ understanding of the prior influence onto the posterior. ] -#slide( - title: "Posterior Predictive Check", -)[ - #text( - size: 18pt, - )[ +#slide(title: "Posterior Predictive Check")[ + #text(size: 18pt)[ We need to make sure that the posterior distribution of $bold(y)$, namely $bold(tilde(y))$, can capture all the nuances of the real distribution density/mass of $bold(y)$. 
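In Turing.jl both checks are mechanical: draw from the priors with the `Prior()` sampler, then generate replicated data with `predict`. A minimal sketch with a throwaway intercept-only model (the model, data, and draw counts are placeholders, not course code):

```julia
using Turing

# Throwaway intercept-only model, just to illustrate the mechanics.
@model function toy(y, N = length(y))
    μ ~ Normal(0, 10)
    σ ~ Exponential(1)
    y ~ filldist(Normal(μ, σ), N)
end

y_obs = randn(20) .+ 3                                            # fake data

prior_chain = sample(toy(y_obs), Prior(), 1_000)                  # draws from the priors only
prior_pred  = predict(toy(missing, length(y_obs)), prior_chain)   # prior predictive ỹ

post_chain  = sample(toy(y_obs), NUTS(), 1_000)
post_pred   = predict(toy(missing, length(y_obs)), post_chain)    # posterior predictive ỹ
```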
@@ -145,18 +126,16 @@ ] ] -#slide( - title: "Examples of Posterior Predictive Checks", -)[ - #side-by-side( - columns: (1fr, 1fr), - )[ +#slide(title: "Examples of Posterior Predictive Checks")[ + #side-by-side(columns: (1fr, 1fr))[ #figure( - image("images/predictive_checks/pp_check_brms.svg", height: 80%), caption: [#text(size: 18pt)[Real versus Simulated Densities]], + image("images/predictive_checks/pp_check_brms.svg", height: 80%), + caption: [#text(size: 18pt)[Real versus Simulated Densities]], ) ][ #figure( - image("images/predictive_checks/pp_check_brms_ecdf.svg", height: 80%), caption: [#text(size: 18pt)[Real versus Simulated Empirical CDFs]], + image("images/predictive_checks/pp_check_brms_ecdf.svg", height: 80%), + caption: [#text(size: 18pt)[Real versus Simulated Empirical CDFs]], ) ] ] diff --git a/slides/05-linear_regression.typ b/slides/05-linear_regression.typ index a56bb34..cd0c1c0 100644 --- a/slides/05-linear_regression.typ +++ b/slides/05-linear_regression.typ @@ -5,9 +5,7 @@ #new-section-slide("Linear Regression") -#slide( - title: "Recommended References", -)[ +#slide(title: "Recommended References")[ - #cite(, form: "prose"): - Chapter 14: Introduction to regression models - Chapter 16: Generalized linear models @@ -24,25 +22,37 @@ #align(center)[#image("images/memes/linear_regression.jpg")] ] -#slide( - title: "What is Linear Regression?", -)[ - #let data_scatter = ((0.5, 0.7), (1, 0.7), (2, 2.4), (3, 2.6), (4, 4.6), (5, 4.2),) - #let data_graph = ((0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5),) +#slide(title: "What is Linear Regression?")[ + #let data_scatter = ( + (0.5, 0.7), + (1, 0.7), + (2, 2.4), + (3, 2.6), + (4, 4.6), + (5, 4.2), + ) + #let data_graph = ((0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)) #let x_axis = axis(min: 0, max: 5, location: "bottom") #let y_axis = axis(min: 0, max: 5, location: "left") #let pl_scatter = pplot(data: data_scatter, axes: ((x_axis, y_axis))) - #let scatter_display = scatter_plot(pl_scatter, 70pt, stroke: 3pt, caption: none) + #let scatter_display = scatter_plot( + pl_scatter, + 70pt, + stroke: 3pt, + caption: none, + ) #let pl_graph = pplot(data: data_graph, axes: (x_axis, y_axis)) #let graph_display = graph_plot( - pl_graph, 70pt, stroke: 2pt + julia-blue, markings: none, caption: none, + pl_graph, + 70pt, + stroke: 2pt + julia-blue, + markings: none, + caption: none, ) #overlay((scatter_display, graph_display), (360pt, 280pt)) ] -#slide( - title: "What is Linear Regression?", -)[ +#slide(title: "What is Linear Regression?")[ The ideia here is to model a dependent variable as a linear combination of independent variables. @@ -61,9 +71,7 @@ #align(center)[#image("images/memes/assumptions.jpg")] ] -#slide( - title: "Linear Regression Assumptions", -)[ +#slide(title: "Linear Regression Assumptions")[ #v(3em) - model error $ε$ is independent of $bold(X)$ and $bold(y)$. @@ -77,24 +85,22 @@ - Observations are I.I.D #footnote[independent and identically distributed.]. ] -#slide( - title: "Linear Regression Specification", -)[ +#slide(title: "Linear Regression Specification")[ To estimate the intercept $α$ and coefficients $bold(β)$ we use a Gaussian/normal likelihood function. 
Mathematically speaking, Bayesian linear regression is: #v(2em) - $ bold(y) &tilde "Normal"(α + bold(X) bold(β), σ) \ - α &tilde "Normal"(μ_α, σ_α) \ - bold(β) &tilde "Normal"(μ_bold(β), σ_bold(β)) \ - σ &tilde "Exponential"(λ_σ) $ + $ + bold(y) &tilde "Normal"(α + bold(X) bold(β), σ) \ + α &tilde "Normal"(μ_α, σ_α) \ + bold(β) &tilde "Normal"(μ_bold(β), σ_bold(β)) \ + σ &tilde "Exponential"(λ_σ) + $ ] -#slide( - title: "Linear Regression Specification", -)[ +#slide(title: "Linear Regression Specification")[ What we are missing is the prior probabilities for the model's parameters: #v(2em) @@ -109,9 +115,7 @@ Knowledge that we have about the model's error. ] -#slide( - title: "Good Candidates for Prior Distributions", -)[ +#slide(title: "Good Candidates for Prior Distributions")[ First, center ($μ = 0$) and standardize ($σ = 1$) the independent variables. #v(2em) @@ -127,9 +131,7 @@ to positive values only. Exponential is a good candidate. ] -#slide( - title: "Posterior Computation", -)[ +#slide(title: "Posterior Computation")[ Our aim to is to *find the posterior distribution of the model's parameters of interest* ($α$ and $bold(β)$) by computing the full posterior distribution of: diff --git a/slides/06-logistic_regression.typ b/slides/06-logistic_regression.typ index 00bd63a..d62c0b0 100644 --- a/slides/06-logistic_regression.typ +++ b/slides/06-logistic_regression.typ @@ -5,9 +5,7 @@ #new-section-slide("Logistic Regression") -#slide( - title: "Recommended References", -)[ +#slide(title: "Recommended References")[ - #cite(, form: "prose") - Chapter 16: Generalized linear models @@ -26,9 +24,7 @@ #align(center)[#image("images/memes/logistic_regression.jpg")] ] -#slide( - title: [Welcome to the Magical World of the Linear Generalized Models], -)[ +#slide(title: [Welcome to the Magical World of the Linear Generalized Models])[ Leaving the realm of the linear models, we start to adventure to the generalized linear models -- GLM. @@ -38,21 +34,17 @@ binomial regression). ] -#slide( - title: [Binary Data #footnote[ +#slide(title: [Binary Data #footnote[ also known as dichotomous, dummy, indicator variable, etc. ] - ], -)[ +])[ #v(5em) We use logistic regression when our dependent variable is *binary*. It only takes two distinct values, usually coded as $0$ and $1$. ] -#slide( - title: [What is Logistic Regression], -)[ +#slide(title: [What is Logistic Regression])[ Logistic regression behaves exactly as a linear model: it makes a prediction by simply computing a weighted sum of the independent variables $bold(X)$ using the estimated coefficients $bold(β)$, along with a constant term $α$. 
@@ -62,19 +54,23 @@ However, instead of outputting a continuous value $bold(y)$, it returns the *logistic function* of this value: - $ "logistic"(x) = 1/(1 + e^(-x)) $ + $ "logistic"(x) = 1 / (1 + e^(-x)) $ ] -#slide( - title: [Logistic Function], -)[ - #align( - center, - )[ +#slide(title: [Logistic Function])[ + #align(center)[ #canvas( - length: 0.9cm, { + length: 0.9cm, + { plot.plot( - size: (16, 9), x-label: $x$, y-label: $"logistic"(x)$, x-tick-step: 5, y-tick-step: 0.25, y-min: -0.01, y-max: 1.01, { + size: (16, 9), + x-label: $x$, + y-label: $"logistic"(x)$, + x-tick-step: 5, + y-tick-step: 0.25, + y-min: -0.01, + y-max: 1.01, + { plot.add( domain: (-10, 10), samples: 200, // label: $"logistic"(x)$, // FIXME: depends on unreleased cetz 2.0.0 @@ -87,27 +83,29 @@ ] ] -#slide( - title: [Probit Function], -)[ +#slide(title: [Probit Function])[ We can also opt to choose to use the *probit function* (usually represented by the Greek letter $Φ$) which is the CDF of a normal distribution: #v(2em) - $ Φ(x)= 1/(sqrt(2π)) ∫_(-oo)^(x)e^((-t^2)/2) dif t $ + $ Φ(x)= 1 / (sqrt(2π)) ∫_(-oo)^(x)e^((-t^2) / 2) dif t $ ] -#slide( - title: [Probit Function], -)[ - #align( - center, - )[ +#slide(title: [Probit Function])[ + #align(center)[ #canvas( - length: 0.9cm, { + length: 0.9cm, + { plot.plot( - size: (16, 9), x-label: $x$, y-label: $Φ(x)$, x-tick-step: 5, y-tick-step: 0.25, y-min: -0.01, y-max: 1.01, { + size: (16, 9), + x-label: $x$, + y-label: $Φ(x)$, + x-tick-step: 5, + y-tick-step: 0.25, + y-min: -0.01, + y-max: 1.01, + { plot.add( domain: (-10, 10), samples: 200, // label: $"probit"(x)$, // FIXME: depends on unreleased cetz 2.0.0 @@ -120,16 +118,20 @@ ] ] -#slide( - title: [Logistic Function versus Probit Function], -)[ - #align( - center, - )[ +#slide(title: [Logistic Function versus Probit Function])[ + #align(center)[ #canvas( - length: 0.9cm, { + length: 0.9cm, + { plot.plot( - size: (16, 9), x-label: $x$, y-label: $Φ(x)$, x-tick-step: 5, y-tick-step: 0.25, y-min: -0.01, y-max: 1.01, { + size: (16, 9), + x-label: $x$, + y-label: $Φ(x)$, + x-tick-step: 5, + y-tick-step: 0.25, + y-min: -0.01, + y-max: 1.01, + { plot.add( domain: (-10, 10), samples: 200, // label: $"logistic"(x)$, // FIXME: depends on unreleased cetz 2.0.0 @@ -147,9 +149,7 @@ ] ] -#slide( - title: [Comparison with Linear Regression], -)[ +#slide(title: [Comparison with Linear Regression])[ Linear regression follows the following mathematical expression: $ "linear" = α + β_1 x_1 + β_2 x_2 + dots + β_k x_k $ @@ -167,9 +167,7 @@ $bold(y)$'s predicted binary value. 
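Both link functions are one-liners in Julia; a small sketch comparing them at a few arbitrary points (the probit uses the standard normal CDF from Distributions.jl):

```julia
using Distributions

logistic(x) = 1 / (1 + exp(-x))       # inverse logit
probit(x)   = cdf(Normal(0, 1), x)    # Φ(x), the standard normal CDF

for x in (-2.0, 0.0, 2.0)
    println("x = $x   logistic = $(round(logistic(x); digits = 3))   Φ = $(round(probit(x); digits = 3))")
end
# Both map the real line into (0, 1); the logistic curve has slightly
# heavier tails, so extreme linear predictors give less extreme probabilities.
```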
] -#slide( - title: [Logistic Regression Specification], -)[ +#slide(title: [Logistic Regression Specification])[ We can model logistic regression using two approaches: #v(3em) @@ -187,8 +185,8 @@ #text(size: 16pt)[ $ bold(y) &tilde "Bernoulli"(p) \ - p &= "logistic/probit"(α + bold(X) bold(β)) \ - α &tilde "Normal"(μ_α, σ_α) \ + p &= "logistic/probit"(α + bold(X) bold(β)) \ + α &tilde "Normal"(μ_α, σ_α) \ bold(β) &tilde "Normal"(μ_bold(β), σ_bold(β)) $ @@ -208,8 +206,8 @@ #text(size: 16pt)[ $ bold(y) &tilde "Binomial"(n, p) \ - p &= "logistic/probit"(α + bold(X) bold(β)) \ - α &tilde "Normal"(μ_α, σ_α) \ + p &= "logistic/probit"(α + bold(X) bold(β)) \ + α &tilde "Normal"(μ_α, σ_α) \ bold(β) &tilde "Normal"(μ_bold(β), σ_bold(β)) $ @@ -226,9 +224,7 @@ ] ] -#slide( - title: [Posterior Computation], -)[ +#slide(title: [Posterior Computation])[ Our aim to is to *find the posterior distribution of the model's parameters of interest* ($α$ and $bold(β)$) by computing the full posterior distribution of: @@ -237,9 +233,7 @@ $ P(bold(θ) | bold(y)) = P(α, bold(β) | bold(y)) $ ] -#slide( - title: [How to Interpret Coefficients], -)[ +#slide(title: [How to Interpret Coefficients])[ If we revisit logistic transformation mathematical expression, we see that, in order to interpret coefficients $bold(β)$, we need to perform a transformation. @@ -249,12 +243,8 @@ its inverse function. ] -#slide( - title: [Probability versus Odds], -)[ - #text( - size: 17pt, - )[ +#slide(title: [Probability versus Odds])[ + #text(size: 17pt)[ But before that, we need to discern between *probability and odds* #footnote[mathematically speaking.]. - *Probability*: a real number between $0$ and $1$ @@ -274,9 +264,7 @@ ] ] -#slide( - title: [Probability versus Odds], -)[ +#slide(title: [Probability versus Odds])[ $ "odds" = p / (1 - p) $ where $p$ is the probability. @@ -288,22 +276,18 @@ - Odds over $1$ increase the probability of seeing a certain event. ] -#slide( - title: [Logodds], -)[ +#slide(title: [Logodds])[ If you revisit the logistic function, you'll se that the intercept $α$ and coefficients $bold(β)$ are literally the *log of the odds* (logodds): $ - p &= "logistic"(α + bold(X) bold(β) ) \ - p &= "logistic"(α) + "logistic"( bold(X) bold(β)) \ - p &= 1 / (1 + e^(-bold(β))) \ + p &= "logistic"(α + bold(X) bold(β)) \ + p &= "logistic"(α) + "logistic"(bold(X) bold(β)) \ + p &= 1 / (1 + e^(-bold(β))) \ bold(β) &= log("odds") $ ] -#slide( - title: [Logodds], -)[ +#slide(title: [Logodds])[ Hence, the coefficients of a logistic regression are expressed in logodds, in which $0$ is the neutral element, and any number above or below it increases or decreases, respectively, the changes of obtaining a "success" in $bold(y)$. 
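A tiny worked example of moving between the three scales (the coefficient value is made up for illustration):

```julia
β = 0.8                        # a coefficient on the logodds scale (hypothetical value)

odds = exp(β)                  # ≈ 2.23: each unit increase in x multiplies the odds by ≈ 2.2
p    = odds / (1 + odds)       # ≈ 0.69: probability implied by a logodds of 0.8
log(p / (1 - p))               # ≈ 0.8: the logit maps back to the original coefficient
```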
To @@ -313,7 +297,7 @@ values: $ - "odds"(α) & = e^α \ + "odds"(α) & = e^α \ "odds"(bold(β)) & = e^(bold(β)) $ ] diff --git a/slides/07-ordinal_regression.typ b/slides/07-ordinal_regression.typ index b27c4a6..8972b4e 100644 --- a/slides/07-ordinal_regression.typ +++ b/slides/07-ordinal_regression.typ @@ -5,9 +5,7 @@ #new-section-slide("Ordinal Regression") -#slide( - title: "Recommended References", -)[ +#slide(title: "Recommended References")[ - #cite(, form: "prose") - Chapter 16: Generalized linear models, Section 16.2: Models for multivariate and multinomial responses @@ -26,9 +24,7 @@ #align(center)[#image("images/memes/categorical_data.jpg")] ] -#slide( - title: [What is Ordinal Regression?], -)[ +#slide(title: [What is Ordinal Regression?])[ #v(2em) *Ordinal regression* is a regression model for *discrete data* and, more @@ -41,9 +37,7 @@ agree-disagree, or a patient perception of pain score. ] -#slide( - title: [Why not just use Linear Regression?], -)[ +#slide(title: [Why not just use Linear Regression?])[ The main reason to not simply use linear regression with ordinal discrete outcomes is that the categories of the dependent variable could not be *equidistant*. @@ -69,9 +63,7 @@ Another *non-linear transformation*. ] -#slide( - title: [Cumulative Distribution Function -- CDF], -)[ +#slide(title: [Cumulative Distribution Function -- CDF])[ In the case of ordinal regression, first we need to transform the *dependent variable into a cumulative scale* @@ -86,12 +78,8 @@ $y$ ] -#slide( - title: [Log-cumulative-odds], -)[ - #text( - size: 18pt, - )[ +#slide(title: [Log-cumulative-odds])[ + #text(size: 18pt)[ Still, this is not enough. We need to apply the *logit function onto the CDF*: $ op("logit")(x) = op("logistic")^(-1)(x) = ln(x / (1 -x)) $ @@ -107,9 +95,7 @@ ] ] -#slide( - title: [$K-1$ Intercepts], -)[ +#slide(title: [$K-1$ Intercepts])[ What do we do with this *log-cumulative-odds*? It allows us to construct *different intercepts for all possible values of the @@ -126,9 +112,7 @@ $Y$ *can take*. ] -#slide( - title: [Violation of the Equidistant Assumption], -)[ +#slide(title: [Violation of the Equidistant Assumption])[ #v(3em) Since each intercept implies a different CDF value for each $k ∈ K$, we can @@ -136,9 +120,7 @@ ordinal variables. ] -#slide( - title: [Cut Points], -)[ +#slide(title: [Cut Points])[ Each intercept implies in a log-cumulative-odds for each $k ∈ K$; We need also to *undo the cumulative nature of the $K - 1$ intercepts*. 
Firstly, we *convert the log-cumulative-odds back to a valid probability with the logistic @@ -153,53 +135,98 @@ $ P(Y = k) = P(Y <= k) - P(Y <= k - 1) $ ] -#slide( - title: [Example - Probability Mass Function of an Ordinal Variable], -)[ - #align( - center, - )[ - #let data = ((0.10, 1), (0.15, 2), (0.33, 3), (0.25, 4), (0.10, 5), (0.07, 6),) +#slide(title: [Example - Probability Mass Function of an Ordinal Variable])[ + #align(center)[ + #let data = ( + (0.10, 1), + (0.15, 2), + (0.33, 3), + (0.25, 4), + (0.10, 5), + (0.07, 6), + ) #let x_axis = axis( - min: 0, max: 6.1, step: 1, location: "bottom", helper_lines: false, title: "values", + min: 0, + max: 6.1, + step: 1, + location: "bottom", + helper_lines: false, + title: "values", ) #let y_axis = axis( - min: 0, max: 0.46, step: 0.1, location: "left", show_markings: true, helper_lines: true, value_formatter: "{:.1}", title: "PDF", + min: 0, + max: 0.46, + step: 0.1, + location: "left", + show_markings: true, + helper_lines: true, + value_formatter: "{:.1}", + title: "PDF", ) #let pl = pplot(data: data, axes: (x_axis, y_axis)) #bar_chart(pl, 80%, bar_width: 70%, caption: none) ] ] -#slide( - title: [Example - CDF versus log-cumulative-odds], -)[ +#slide(title: [Example - CDF versus log-cumulative-odds])[ #side-by-side[ - #align( - center, - )[ - #let data = ((0.10, 1), (0.25, 2), (0.58, 3), (0.83, 4), (0.93, 5), (1.00, 6),) + #align(center)[ + #let data = ( + (0.10, 1), + (0.25, 2), + (0.58, 3), + (0.83, 4), + (0.93, 5), + (1.00, 6), + ) #let x_axis = axis( - min: 0, max: 6.1, step: 1, location: "bottom", helper_lines: false, title: "values", + min: 0, + max: 6.1, + step: 1, + location: "bottom", + helper_lines: false, + title: "values", ) #let y_axis = axis( - min: 0, max: 1.2, step: 0.1, location: "left", show_markings: true, helper_lines: true, value_formatter: "{:.1}", title: "CDF", + min: 0, + max: 1.2, + step: 0.1, + location: "left", + show_markings: true, + helper_lines: true, + value_formatter: "{:.1}", + title: "CDF", ) #let pl = pplot(data: data, axes: (x_axis, y_axis)) #bar_chart(pl, 80%, bar_width: 70%, caption: none) ] ][ - #align( - center, - )[ + #align(center)[ #let data = ( - (1, -2.19722), (2, -1.09861), (3, 0.322773), (4, 1.58563), (5, 2.58669), (6, 10.0), + (1, -2.19722), + (2, -1.09861), + (3, 0.322773), + (4, 1.58563), + (5, 2.58669), + (6, 10.0), ) #let x_axis = axis( - min: 0, max: 6.1, step: 1, location: "bottom", helper_lines: false, title: "values", + min: 0, + max: 6.1, + step: 1, + location: "bottom", + helper_lines: false, + title: "values", ) #let y_axis = axis( - min: -3, max: 3, step: 1, location: "left", show_markings: true, helper_lines: true, value_formatter: "{:.1}", title: "log-cumulative-odds", + min: -3, + max: 3, + step: 1, + location: "left", + show_markings: true, + helper_lines: true, + value_formatter: "{:.1}", + title: "log-cumulative-odds", ) #let pl = pplot(data: data, axes: (x_axis, y_axis)) #scatter_plot(pl, 80%, stroke: 4pt, caption: none) @@ -207,9 +234,7 @@ ] ] -#slide( - title: [Adding Coefficients $bold(β)$], -)[ +#slide(title: [Adding Coefficients $bold(β)$])[ #v(3em) With the equidistant assumption solved with $K - 1$ intercepts, we can add @@ -217,9 +242,7 @@ regression model. ] -#slide( - title: [More Log-cumulative-odds], -)[ +#slide(title: [More Log-cumulative-odds])[ We've transformed all intercepts into log-cumulative-odds so that we can add effects as weighted sums of the independent variables to our basal rates (intercepts). 
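The cut points in the example plots above can be reproduced directly from the probability mass function; a short sketch (with `logit` defined inline):

```julia
logit(p) = log(p / (1 - p))

pmf = [0.10, 0.15, 0.33, 0.25, 0.10, 0.07]   # P(Y = k) from the example above
cum = cumsum(pmf)                            # P(Y ≤ k): 0.10, 0.25, 0.58, 0.83, 0.93, 1.00

# K - 1 cut points on the log-cumulative-odds scale (the last category needs none)
logit.(cum[1:end-1])
# ≈ [-2.197, -1.099, 0.323, 1.586, 2.587] — the values in the scatter plot
```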
@@ -234,9 +257,7 @@ Lastly, $φ_k$ represents the linear predictor for the $k$th intercept. ] -#slide( - title: [Matrix Notation], -)[ +#slide(title: [Matrix Notation])[ This can become more elegant and computationally efficient if we use matrix/vector notation: @@ -253,26 +274,20 @@ observation and every column an independent variable. ] -#slide( - title: [Ordinal Regression Specification], -)[ - #text( - size: 13pt, - )[ +#slide(title: [Ordinal Regression Specification])[ + #text(size: 13pt)[ $ bold(y) &tilde "Categorical"(bold(p)) \ bold(p) &= "logistic"(bold(φ)) \ bold(φ) &= bold(α) + bold(c) + bold(X) \cdot bold(β) \ - c_1 &= "logit"("CDF"(y_1)) \ - c_k &= "logit"("CDF"(y_k) - "CDF"(y_(k-1))) "for" 2 <= k <= K-1 \ - c_K &= "logit"(1 - "CDF"(y_(K-1))) \ + c_1 &= "logit"("CDF"(y_1)) \ + c_k &= "logit"("CDF"(y_k) - "CDF"(y_(k-1))) "for" 2 <= k <= K-1 \ + c_K &= "logit"(1 - "CDF"(y_(K-1))) \ bold(α) &tilde "Normal"(μ_α, σ_α) \ bold(β) &tilde "Normal"(μ_(bold(β)), σ_{bold(β)}) $ ] - #text( - size: 12pt, - )[ + #text(size: 12pt)[ - $bold(y)$ -- ordinal discrete dependent variable. - $bold(p)$ -- probability vector of size $K$. - $K$: number of possible values that $bold(y)$ can take, i.e. number of ordered diff --git a/slides/08-poisson_regression.typ b/slides/08-poisson_regression.typ index 8af7bc5..fdce0f6 100644 --- a/slides/08-poisson_regression.typ +++ b/slides/08-poisson_regression.typ @@ -5,9 +5,7 @@ #new-section-slide("Poisson Regression") -#slide( - title: "Recommended References", -)[ +#slide(title: "Recommended References")[ - #cite(, form: "prose") - Chapter 16: Generalized linear models @@ -23,18 +21,14 @@ #align(center)[#image("images/memes/poisson-distribution.jpg")] ] -#slide( - title: [Count Data], -)[ +#slide(title: [Count Data])[ #v(3em) Poisson regression is used when our dependent variable can only take *positive values*, usually in the context of *count data*. ] -#slide( - title: [What is Poisson Regression?], -)[ +#slide(title: [What is Poisson Regression?])[ Poisson regression behaves exactly like a linear model: it makes a prediction by simply computing a weighted sum of the independent variables $bold(X)$ with the estimated coefficients $bold(β)$: @@ -53,16 +47,19 @@ $ ] -#slide( - title: [Exponential Function], -)[ - #align( - center, - )[ +#slide(title: [Exponential Function])[ + #align(center)[ #canvas( - length: 0.9cm, { + length: 0.9cm, + { plot.plot( - size: (16, 9), x-label: $x$, y-label: $e^x$, x-tick-step: 1, y-tick-step: 20, y-min: -0.01, { + size: (16, 9), + x-label: $x$, + y-label: $e^x$, + x-tick-step: 1, + y-tick-step: 20, + y-min: -0.01, + { plot.add( domain: (-1, 5), samples: 200, // label: $"exponential"(x)$, // FIXME: depends on unreleased cetz 2.0.0 @@ -75,9 +72,7 @@ ] ] -#slide( - title: [Comparison with Linear Regression], -)[ +#slide(title: [Comparison with Linear Regression])[ Linear regression has the following mathematical expression: $ @@ -95,9 +90,7 @@ - $log(y) = e^("Linear") = e^(α + β_1 x_1 + β_2 x_2 + dots + β_k x_k)$ ] -#slide( - title: [Poisson Regression Specification], -)[ +#slide(title: [Poisson Regression Specification])[ We can use Poisson regression if the dependent variable $bold(y)$ has count data, i.e., $bold(y)$ only takes positive values. 
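A minimal Turing.jl sketch of such a model, matching the specification shown next (priors, data, and the sampler call are placeholders):

```julia
using Turing

@model function poissonreg(X, y)
    α ~ Normal(0, 2)
    β ~ filldist(Normal(0, 2), size(X, 2))
    λ = exp.(α .+ X * β)          # inverse link: rates are strictly positive
    for i in eachindex(y)
        y[i] ~ Poisson(λ[i])
    end
end

# hypothetical usage:
# chain = sample(poissonreg(X, y), NUTS(), 2_000)
```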
@@ -109,14 +102,12 @@ $ bold(y) &tilde "Poisson"(e^((α + bold(X) bold(β)))) \ - α &tilde "Normal"(μ_α, σ_α) \ + α &tilde "Normal"(μ_α, σ_α) \ bold(β) &tilde "Normal"(μ_(bold(β)), σ_(bold(β))) $ ] -#slide( - title: [Interpreting the Coefficients], -)[ +#slide(title: [Interpreting the Coefficients])[ When we see the Poisson regression specification, we realize that the coefficient interpretation requires a transformation. What we need to do is undo the logarithm transformation: @@ -129,13 +120,13 @@ $ bold(y) &= e^((α + bold(X) bold(β))) \ - &= e^(α) dot e^(( X_((1)) dot β_((1)) )) dot e^(( X_((2)) dot β_((2)) )) dot dots dot e^(( X_((k)) dot β_((k)) )) + &= e^(α) dot e^((X_((1)) dot β_((1)))) dot e^(( + X_((2)) dot β_((2)) + )) dot dots dot e^((X_((k)) dot β_((k)))) $ ] -#slide( - title: [Interpreting the Coefficients], -)[ +#slide(title: [Interpreting the Coefficients])[ Finally, notice that, when transformed, our dependent variables is no more a "weighted sum of an intercept and independent variables": @@ -143,7 +134,9 @@ $ bold(y) &= e^((α + bold(X) bold(β))) \ - &= e^(α) dot e^(( X_((1)) dot β_((1)) )) dot e^(( X_((2)) dot β_((2)) )) dot dots dot e^(( X_((k)) dot β_((k)) )) + &= e^(α) dot e^((X_((1)) dot β_((1)))) dot e^(( + X_((2)) dot β_((2)) + )) dot dots dot e^((X_((k)) dot β_((k)))) $ #v(1em) diff --git a/slides/09-robust_regression.typ b/slides/09-robust_regression.typ index e2518c1..53e4ceb 100644 --- a/slides/09-robust_regression.typ +++ b/slides/09-robust_regression.typ @@ -5,9 +5,7 @@ #new-section-slide("Robust Regression") -#slide( - title: "Recommended References", -)[ +#slide(title: "Recommended References")[ - #cite(, form: "prose") - Chapter 17: Models for robust inference @@ -23,9 +21,7 @@ #align(center)[#image("images/memes/not_normal_transparent.png")] ] -#slide( - title: [Robust Models], -)[ +#slide(title: [Robust Models])[ Almost always data from real world are really strange. #v(1em) @@ -39,9 +35,7 @@ - unrealistic model assumptions? ] -#slide( - title: [Outliers], -)[ +#slide(title: [Outliers])[ #v(2em) Models based on the *normal distribution are notoriously "non-robust" against @@ -50,9 +44,7 @@ with it. ] -#slide( - title: [Overdispersion], -)[ +#slide(title: [Overdispersion])[ Superdispersion and underdispersion #footnote[ rarer to find in the real world. ] @@ -65,9 +57,7 @@ *a single parameter* is added to allow for overdispersion @gelman2013bayesian. ] -#slide( - title: [Overdispersion Example], -)[ +#slide(title: [Overdispersion Example])[ Suppose you are analyzing data from car accidents. The model we generally use in this type of phenomena is *Poisson regression*. @@ -83,9 +73,7 @@ the desired phenomena. ] -#slide( - title: [Student's $t$ instead of Normal], -)[ +#slide(title: [Student's $t$ instead of Normal])[ Student's $t$ distribution has *wider #footnote[or "fatter".] tails* than the Normal distribution. @@ -104,16 +92,20 @@ function. 
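The "wider tails" claim is easy to verify numerically; a quick sketch with an arbitrary tail point:

```julia
using Distributions

x = 4.0                      # a point far out in the tail
pdf(Normal(0, 1), x)         # ≈ 1.3e-4
pdf(TDist(3), x)             # ≈ 9.2e-3, roughly 70× more density this far out
# An observation at x = 4 is wildly surprising under a normal likelihood,
# but entirely plausible under a Student's t with ν = 3.
```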
] -#slide( - title: [Student's $t$ instead of Normal], -)[ - #align( - center, - )[ +#slide(title: [Student's $t$ instead of Normal])[ + #align(center)[ #canvas( - length: 0.9cm, { + length: 0.9cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, y-min: -0.01, y-max: 0.42, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + y-min: -0.01, + y-max: 0.42, + { plot.add( domain: (-4, 4), samples: 200, // label: "Normal", // FIXME: depends on unreleased cetz 2.0.0 @@ -131,22 +123,18 @@ ] ] -#slide( - title: [Student's $t$ instead of Normal], -)[ - #text( - size: 16pt, - )[ +#slide(title: [Student's $t$ instead of Normal])[ + #text(size: 16pt)[ By using a Student's $t$ distribution instead of the Normal distribution as likelihood functions, the model's error $σ$ does _not_ follow a Normal distribution, but a Student's $t$ distribution: $ bold(y) &tilde "Student"(ν, α + bold(X) bold(β), σ) \ - α &tilde "Normal"(μ_α, σ_α) \ + α &tilde "Normal"(μ_α, σ_α) \ bold(β) &tilde "Normal"(μ_bold(β), σ_bold(β)) \ - ν &tilde "Log-Normal"(2, 1) \ - σ &tilde "Exponential"(λ_σ) + ν &tilde "Log-Normal"(2, 1) \ + σ &tilde "Exponential"(λ_σ) $ Note that we are including an extra parameter $ν$, which represents the @@ -158,9 +146,7 @@ ] ] -#slide( - title: [Beta-Binomial instead of the Binomial], -)[ +#slide(title: [Beta-Binomial instead of the Binomial])[ The binomial distribution has a practical limitation that we only have one free parameter to estimate #footnote[since $n$ already comes from data.] ($p$). This implies in the *variance to determined by the mean*. Hence, the binomial @@ -174,9 +160,7 @@ against overdispersion*. ] -#slide( - title: [Beta-Binomial instead of Binomial], -)[ +#slide(title: [Beta-Binomial instead of Binomial])[ The *beta-binomial distribution* is a binomial distribution, where the probability of success $p$ is parameterized as a $"Beta"(α, β)$. @@ -190,18 +174,14 @@ Values of $β >= 1$ make the beta-binomial behave the same as a binomial. ] -#slide( - title: [Beta-Binomial instead of Binomial], -)[ - #text( - size: 18pt, - )[ +#slide(title: [Beta-Binomial instead of Binomial])[ + #text(size: 18pt)[ $ bold(y) & tilde "Beta-Binomial"(n, p, φ) \ - p & tilde "Logistic/Probit"(α + bold(X) bold(β)) \ - α & tilde "Normal"(μ_α, σ_α) \ + p & tilde "Logistic/Probit"(α + bold(X) bold(β)) \ + α & tilde "Normal"(μ_α, σ_α) \ bold(β) & tilde "Normal"(μ_bold(β), σ_bold(β)) \ - φ & tilde "Exponential"(1) + φ & tilde "Exponential"(1) $ #v(1em) @@ -212,12 +192,8 @@ ] ] -#slide( - title: [Student's $t$ instead Binomial], -)[ - #text( - size: 15pt, - )[ +#slide(title: [Student's $t$ instead Binomial])[ + #text(size: 15pt)[ Also known as Robit #footnote[there is a great discussion between Gelman, Vehtari and Kurz at #link("https://discourse.mc-stan.org/t/robit-regression-not-robust/21245/")[ Stan's Discourse @@ -230,7 +206,7 @@ y_i &= cases(0 "if" z_i < 0, 1 "if" z_i > 0) \ z_i &= X_i bold(β) + ε_i \ ε_i &tilde "Student"(ν, 0, sqrt((ν - 2) / ν)) \ - ν &tilde "Gamma"(2, 0.1) ∈ [2, oo) + ν &tilde "Gamma"(2, 0.1) ∈ [2, oo) $ Here we are using the gamma distribution as a truncated Student's $t$ @@ -239,9 +215,7 @@ ] ] -#slide( - title: [Negative Binomial instead of Poisson], -)[ +#slide(title: [Negative Binomial instead of Poisson])[ This is the overdispersion example. The Poisson distribution uses a *single parameter for both its mean and variance*. 
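Distributions.jl parameterizes the negative binomial by number of successes and success probability, so a small helper is commonly used for the mean/dispersion form that appears next (a sketch; the name `NegativeBinomial2` is only a convention, not a library function):

```julia
using Distributions

# Mean–dispersion form: mean μ and "reciprocal dispersion" φ,
# so that the variance is μ + μ²/φ (smaller φ ⇒ more overdispersion).
function NegativeBinomial2(μ, φ)
    p = φ / (φ + μ)                  # convert to the (successes, probability) form
    return NegativeBinomial(φ, p)
end

d = NegativeBinomial2(10.0, 2.0)
mean(d), var(d)                      # (10.0, 60.0): variance far above the mean
```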
@@ -258,13 +232,11 @@ $φ$ is also known as a "reciprocal dispersion" parameter. ] -#slide( - title: [Negative Binomial instead of Poisson], -)[ +#slide(title: [Negative Binomial instead of Poisson])[ $ bold(y) &tilde "Negative Binomial"(e^((α + bold(X) bold(β))), φ) \ - φ &tilde "Gamma"(0.01, 0.01) \ - α &tilde "Normal"(μ_α, σ_α) \ + φ &tilde "Gamma"(0.01, 0.01) \ + α &tilde "Normal"(μ_α, σ_α) \ bold(β) &tilde "Normal"(μ_bold(β), σ_bold(β)) $ @@ -284,19 +256,17 @@ @mcelreath2020statistical. ] -#slide( - title: [Negative Binomial Mixture instead of Poisson], -)[ +#slide(title: [Negative Binomial Mixture instead of Poisson])[ Here, $S_i$ is a dummy variable, taking value $1$ if the $i$th observation has a value $≠ 0$. $S_i$ can be modeled using logistic regression: $ - bold(y) &cases( + bold(y) &cases( = 0 "if" S_i = 0, tilde "Negative Binomial"(e^((α + bold(X) bold(β))), φ ) "if" S_i = 1, ) \ P(S_i = 1) &= "Logistic/Probit"(bold(X) bold(γ)) \ - γ &tilde "Beta"(1, 1) + γ &tilde "Beta"(1, 1) $ #v(1em) @@ -304,9 +274,7 @@ $γ$ is a new coefficients which we give uniform prior of $"Beta"(1, 1)$. ] -#slide( - title: [Why Use Non-Robust Models?], -)[ +#slide(title: [Why Use Non-Robust Models?])[ The *central limit theorem* tells us that the *normal distribution* is an appropriate model for data that arises as a *sum of independent components*. diff --git a/slides/10-sparse_regression.typ b/slides/10-sparse_regression.typ index 0879937..6f1bd9e 100644 --- a/slides/10-sparse_regression.typ +++ b/slides/10-sparse_regression.typ @@ -5,9 +5,7 @@ #new-section-slide("Sparse Regression") -#slide( - title: "Recommended References", -)[ +#slide(title: "Recommended References")[ - #cite(, form: "prose") - Chapter 12, Section 12.8: Models for regression coefficients @@ -22,9 +20,7 @@ #align(center)[#image("images/memes/horseshoe.jpg")] ] -#slide( - title: [What is Sparsity?], -)[ +#slide(title: [What is Sparsity?])[ #v(2em) Sparsity is a concept frequently encountered in statistics, signal processing, @@ -32,9 +28,7 @@ elements in a dataset or a vector are zero or close to zero. ] -#slide( - title: [How to Handle Sparsity?], -)[ +#slide(title: [How to Handle Sparsity?])[ Almost all techniques deal with some sort of *variable selection*, instead of altering data. @@ -44,9 +38,7 @@ don't want to throw information away. ] -#slide( - title: [Frequentist Approach], -)[ +#slide(title: [Frequentist Approach])[ The frequentist approach deals with sparse regression by staying in the "optimization" context but adding *Lagrangian constraints* #footnote[ this is called *LASSO* (least absolute shrinkage and selection operator) from #cite(, form: "prose") @@ -64,9 +56,7 @@ Here $|| dot ||_p$ is the $p$-norm. ] -#slide( - title: [Variable Selection Techniques], -)[ +#slide(title: [Variable Selection Techniques])[ #v(4em) - *discrete mixtures*: _spike-and-slab_ prior @@ -74,12 +64,8 @@ - *shrinkage priors*: _Laplace_ prior and _horseshoe_ prior @carvalho2009handling ] -#slide( - title: [Discrete Mixtures -- Spike-and-Slab Prior], -)[ - #text( - size: 16pt, - )[ +#slide(title: [Discrete Mixtures -- Spike-and-Slab Prior])[ + #text(size: 16pt)[ Mixture of two distributions—one that is concentrated at zero (the "spike") and one with a much wider spread (the "slab"). 
This prior indicates that we believe most coefficients in our model are @@ -90,7 +76,7 @@ $ β_i | λ_i, c &tilde "Normal"(0, sqrt(λ_i^2 c^2)) \ - λ_i &tilde "Bernoulli"(p) + λ_i &tilde "Bernoulli"(p) $ where: @@ -103,55 +89,77 @@ ] ] -#slide( - title: [Discrete Mixtures -- Spike-and-Slab Prior], -)[ +#slide(title: [Discrete Mixtures -- Spike-and-Slab Prior])[ #side-by-side[ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: -4, x-max: 4, y-max: 0.5, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: -4, + x-max: 4, + y-max: 0.5, + y-min: 0, + { plot.add( - domain: (-4, 4), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => hs(x, 1, 0.05), + domain: (-4, 4), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => hs(x, 1, 0.05), ) }, ) }, ) - }, caption: [$c = 1, λ = 0$], numbering: none, + }, + caption: [$c = 1, λ = 0$], + numbering: none, ) ] ][ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: -4, x-max: 4, y-max: 0.5, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: -4, + x-max: 4, + y-max: 0.5, + y-min: 0, + { plot.add( - domain: (-4, 4), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => hs(x, 1, 1), + domain: (-4, 4), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => hs(x, 1, 1), ) }, ) }, ) - }, caption: [$c = 1, λ = 1$], numbering: none, + }, + caption: [$c = 1, λ = 1$], + numbering: none, ) ] ] ] -#slide( - title: [Shinkrage Priors -- Laplace Prior], -)[ +#slide(title: [Shinkrage Priors -- Laplace Prior])[ The Laplace distribution is a continuous probability distribution named after Pierre-Simon Laplace. It is also known as the double exponential distribution. @@ -169,36 +177,43 @@ It is a symmetrical exponential decay around $μ$ with scale governed by $b$. 
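A quick numerical check of that double-exponential shape (a sketch; Distributions.jl ships a `Laplace(μ, b)` type):

```julia
using Distributions

laplace_pdf(x, μ, b) = exp(-abs(x - μ) / b) / (2b)   # the double-exponential density

laplace_pdf(0.0, 0.0, 1.0), pdf(Laplace(0, 1), 0.0)  # both 0.5: sharp peak at μ
laplace_pdf(3.0, 0.0, 1.0), pdf(Laplace(0, 1), 3.0)  # both ≈ 0.025: exponential decay in the tails
```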
] -#slide( - title: [Shinkrage Priors -- Laplace Prior], -)[ - #align( - center, - )[ +#slide(title: [Shinkrage Priors -- Laplace Prior])[ + #align(center)[ #figure( { canvas( - length: 0.9cm, { + length: 0.9cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: -4, x-max: 4, y-max: 0.55, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: -4, + x-max: 4, + y-max: 0.55, + y-min: 0, + { plot.add( - domain: (-4, 4), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => laplace(x, 1), + domain: (-4, 4), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => laplace(x, 1), ) }, ) }, ) - }, caption: [$μ = 0, b = 1$], numbering: none, + }, + caption: [$μ = 0, b = 1$], + numbering: none, ) ] ] -#slide( - title: [Shinkrage Priors -- Horseshoe Prior], -)[ - #text( - size: 17pt, - )[ +#slide(title: [Shinkrage Priors -- Horseshoe Prior])[ + #text(size: 17pt)[ The horseshoe prior @carvalho2009handling assumes that each coefficient $β_i$ is conditionally independent with density $P_("HS")(β_i | τ )$, where $P_("HS")$ @@ -206,7 +221,7 @@ $ β_i | λ_i, τ &tilde "Normal"(0, sqrt(λ_i^2 τ^2)) \ - λ_i &tilde "Cauchy"^+ (0, 1) + λ_i &tilde "Cauchy"^+ (0, 1) $ where: @@ -219,55 +234,77 @@ ] ] -#slide( - title: [Discrete Mixtures -- Spike-and-Slab Prior], -)[ +#slide(title: [Discrete Mixtures -- Spike-and-Slab Prior])[ #side-by-side[ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: -4, x-max: 4, y-max: 0.8, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: -4, + x-max: 4, + y-max: 0.8, + y-min: 0, + { plot.add( - domain: (-4, 4), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => hs(x, 1, 1), + domain: (-4, 4), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => hs(x, 1, 1), ) }, ) }, ) - }, caption: [$τ = 1, λ = 1$], numbering: none, + }, + caption: [$τ = 1, λ = 1$], + numbering: none, ) ] ][ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: -4, x-max: 4, y-max: 0.8, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: -4, + x-max: 4, + y-max: 0.8, + y-min: 0, + { plot.add( - domain: (-4, 4), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => hs(x, 1, 1 / 2), + domain: (-4, 4), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => hs(x, 1, 1 / 2), ) }, ) }, ) - }, caption: [$τ = 1, λ = 1 / 2$], numbering: none, + }, + caption: [$τ = 1, λ = 1 / 2$], + numbering: none, ) ] ] ] -#slide( - title: [Discrete Mixtures versus Shinkrage Priors], -)[ +#slide(title: [Discrete Mixtures versus Shinkrage Priors])[ *Discrete mixtures* offer the correct representation of sparse problems @carvalho2009handling by placing positive prior probability on $β_i = 0$ (regression coefficient), but pose several difficulties: mostly @@ -279,12 +316,8 @@ be very attractive computationally: again due to the *continuous property*. 
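The horseshoe's behavior is easiest to see by simulating coefficients from the prior (a sketch; the half-Cauchy comes from truncation, and the draw count is arbitrary):

```julia
using Distributions, Random, Statistics

Random.seed!(1)
τ = 1.0
half_cauchy = truncated(Cauchy(0, 1); lower = 0)

# β | λ ~ Normal(0, λτ),  λ ~ Cauchy⁺(0, 1)
λ = rand(half_cauchy, 10_000)
β = rand.(Normal.(0, λ .* τ))

quantile(β, [0.25, 0.5, 0.75]), maximum(abs, β)
# most draws hug zero (aggressive shrinkage) while a handful are huge (heavy tails)
```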
] -#slide( - title: [Horseshoe versus Laplace], -)[ - #text( - size: 17pt, - )[ +#slide(title: [Horseshoe versus Laplace])[ + #text(size: 17pt)[ The advantages of the Horseshoe prior over the Laplace prior are primarily: - *shrinkage*: The Horseshoe prior has infinitely heavy tails and an infinite @@ -304,9 +337,7 @@ ] ] -#slide( - title: [Effective Shinkrage Comparison], -)[ +#slide(title: [Effective Shinkrage Comparison])[ Makes more sense to compare the shinkrage effects of the proposed approaches so far. Assume for now that $σ^2 = τ^2 = 1$, and define $κ_i = 1 / (1 + λ_i^2)$. @@ -326,59 +357,79 @@ $ ] -#slide( - title: [Effective Shinkrage Comparison #footnote[spike-and-slab with $p = 1 / 2$ - would be very similar to Horseshoe but with discontinuities.]], -)[ +#slide(title: [Effective Shinkrage Comparison #footnote[spike-and-slab with $p = 1 / 2$ + would be very similar to Horseshoe but with discontinuities.]])[ #side-by-side[ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: 0, x-max: 1, y-max: 0.8, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: 0, + x-max: 1, + y-max: 0.8, + y-min: 0, + { plot.add( - domain: (0.01, 0.99), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => beta(x, 0.5, 0.5), + domain: (0.01, 0.99), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => beta(x, 0.5, 0.5), ) }, ) }, ) - }, caption: [Laplace], numbering: none, + }, + caption: [Laplace], + numbering: none, ) ] ][ - #align( - center, - )[ + #align(center)[ #figure( { canvas( - length: 0.75cm, { + length: 0.75cm, + { plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, x-min: 0, x-max: 1, y-max: 0.8, y-min: 0, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + x-min: 0, + x-max: 1, + y-max: 0.8, + y-min: 0, + { plot.add( - domain: (0.1, 0.9), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => shinkragelaplace(x, 1 / 2), + domain: (0.1, 0.9), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => shinkragelaplace(x, 1 / 2), ) }, ) }, ) - }, caption: [Horseshoe], numbering: none, + }, + caption: [Horseshoe], + numbering: none, ) ] ] ] -#slide( - title: [Shinkrage Priors -- Horseshoe+], -)[ - #text( - size: 17pt, - )[ +#slide(title: [Shinkrage Priors -- Horseshoe+])[ + #text(size: 17pt)[ Natural extension from the Horseshoe that has improved performance with highly sparse data @bhadra2015horseshoe. @@ -386,8 +437,8 @@ $ β_i | λ_i, η_i, τ &tilde "Normal"(0, λ_i) \ - λ_i | η_i, τ &tilde "Cauchy"^+(0, τ η_i) \ - η_i &tilde "Cauchy"^+ (0, 1) + λ_i | η_i, τ &tilde "Cauchy"^+(0, τ η_i) \ + η_i &tilde "Cauchy"^+ (0, 1) $ where: @@ -399,9 +450,7 @@ ] ] -#slide( - title: [Shinkrage Priors -- Regularized Horseshoe], -)[ +#slide(title: [Shinkrage Priors -- Regularized Horseshoe])[ The Horseshoe and Horseshoe+ guarantees that the strong signals will not be overshrunk. However, this property can also be harmful, especially when the parameters are weakly identified. @@ -413,16 +462,12 @@ coefficient. 
] -#slide( - title: [Shinkrage Priors -- Regularized Horseshoe], -)[ - #text( - size: 16pt, - )[ +#slide(title: [Shinkrage Priors -- Regularized Horseshoe])[ + #text(size: 16pt)[ $ β_i | λ_i, τ, c &tilde "Normal" (0, sqrt(τ^2 tilde(λ_i)^2)) \ - tilde(λ_i)^2 &= (c^2 λ_i^2) / (c^2 + τ^2 λ_i^2) \ - λ_i &tilde "Cauchy"^+(0, 1) + tilde(λ_i)^2 &= (c^2 λ_i^2) / (c^2 + τ^2 λ_i^2) \ + λ_i &tilde "Cauchy"^+(0, 1) $ where: @@ -438,9 +483,7 @@ ] ] -#slide( - title: [Shinkrage Priors -- R2-D2], -)[ Still, we can do better. The *R2-D2* #footnote[ +#slide(title: [Shinkrage Priors -- R2-D2])[ Still, we can do better. The *R2-D2* #footnote[ $R^2$-induced Dirichlet Decomposition ] prior @zhang2022bayesian has heavier tails and higher concentration around zero than the previous approaches. @@ -452,16 +495,12 @@ coefficient between the dependent variable and its modeled expectation.}. Then using that prior to "distribute" throughout the $bold(β)$. ] -#slide( - title: [Shinkrage Priors -- R2-D2], -)[ - #text( - size: 16pt, - )[ +#slide(title: [Shinkrage Priors -- R2-D2])[ + #text(size: 16pt)[ $ - R^2 &tilde "Beta"(μ_(R^2) σ_(R^2), (1 - μ_(R^2)) σ_(R^2)) \ + R^2 &tilde "Beta"(μ_(R^2) σ_(R^2), (1 - μ_(R^2)) σ_(R^2)) \ bold(φ) &tilde "Dirichlet"(J, 1) \ - τ^2 &= (R^2) / (1 - R^2) \ + τ^2 &= (R^2) / (1 - R^2) \ bold(β) &= Z dot sqrt(bold(φ) τ^2) $ diff --git a/slides/11-hierarchical_models.typ b/slides/11-hierarchical_models.typ index 054b1b1..83fd05d 100644 --- a/slides/11-hierarchical_models.typ +++ b/slides/11-hierarchical_models.typ @@ -5,9 +5,7 @@ #new-section-slide("Hierarchical Models") -#slide( - title: "Recommended References", -)[ +#slide(title: "Recommended References")[ - #cite(, form: "prose"): - Chapter 5: Hierarchical models - Chapter 15: Hierarchical linear models @@ -29,9 +27,7 @@ #align(center)[#image("images/memes/hierarchical_models.jpg")] ] -#slide( - title: [I have many names...], -)[ +#slide(title: [I have many names...])[ Hierarchical models are also known for several names #footnote[ for the whole full list #link( @@ -50,9 +46,7 @@ - Nested Data Models ] -#slide( - title: [What are hierarchical models?], -)[ +#slide(title: [What are hierarchical models?])[ Statistical model specified in multiple levels that estimates parameters from the posterior distribution using a Bayesian approach. @@ -67,24 +61,21 @@ values. ] -#slide( - title: [What are hierarchical models?], -)[ - #text( - size: 16pt, - )[ +#slide(title: [What are hierarchical models?])[ + #text(size: 16pt)[ Hyperparameter $φ$ that parameterizes $θ_1, θ_2, dots, θ_K$, that are used to infer the posterior density of some random variable $bold(y) = y_1, y_2, dots, y_K$ ] - #align( - center, - )[ + #align(center)[ #canvas( - length: 0.9cm, { + length: 0.9cm, + { import draw: * set-style( - mark: (end: ">", fill: black, size: 0.3), stroke: (thickness: 2pt), radius: 1, + mark: (end: ">", fill: black, size: 0.3), + stroke: (thickness: 2pt), + radius: 1, ) circle((6, 0)) content((6, 0), [#align(center)[$φ$]]) @@ -133,25 +124,22 @@ ] ] -#slide( - title: [What are hierarchical models?], -)[ - #text( - size: 14pt, - )[ +#slide(title: [What are hierarchical models?])[ + #text(size: 14pt)[ Even that the observations directly inform only a single set of parameters, a hierarchical model couples individual parameters, and provides a "backdoor" for information flow. 
] - #align( - center, - )[ + #align(center)[ #side-by-side[ #canvas( - length: 0.7cm, { + length: 0.7cm, + { import draw: * set-style( - mark: (end: ">", fill: black, size: 0.3), stroke: (thickness: 2pt), radius: 1, + mark: (end: ">", fill: black, size: 0.3), + stroke: (thickness: 2pt), + radius: 1, ) circle((6, 0)) content((6, 0), [#align(center)[$φ$]]) @@ -195,10 +183,13 @@ ) ][ #canvas( - length: 0.7cm, { + length: 0.7cm, + { import draw: * set-style( - mark: (end: ">", fill: black, size: 0.3), stroke: (thickness: 2pt), radius: 1, + mark: (end: ">", fill: black, size: 0.3), + stroke: (thickness: 2pt), + radius: 1, ) circle((6, 0)) content((6, 0), [#align(center)[$φ$]]) @@ -245,9 +236,7 @@ ) ] ] - #text( - size: 14pt, - )[ + #text(size: 14pt)[ For example, the observations from the $k$th group, $y_k$, informs directly the parameters that quantify the $k$th group's behavior, $θ_k$. These parameters, however, inform directly the population-level @@ -259,9 +248,7 @@ ] ] -#slide( - title: [When to Use Hierarchical Models?], -)[ +#slide(title: [When to Use Hierarchical Models?])[ #v(3em) *Hierarchical models* are used when information is available in *several levels @@ -270,12 +257,8 @@ also performing a crucial role in the development of *computational strategies*. ] -#slide( - title: [When to Use Hierarchical Models?], -)[ - #text( - size: 16pt, - )[ +#slide(title: [When to Use Hierarchical Models?])[ + #text(size: 16pt)[ Hierarchical models are particularly appropriate for research projects where participant data can be organized in more than one level #footnote[ also known as nested data. @@ -295,9 +278,7 @@ ] ] -#slide( - title: [When to Use Hierarchical Models?], -)[ +#slide(title: [When to Use Hierarchical Models?])[ Another good use case is *big data* @gelman2013bayesian. - simple nonhierarchical models are usually inappropriate for hierarchical data: @@ -310,9 +291,7 @@ thereby *avoiding problems of overfitting*. ] -#slide( - title: [When to Use Hierarchical Models?], -)[ +#slide(title: [When to Use Hierarchical Models?])[ #v(2em) Most important is *not to violate* the *exchangeability assumption* @@ -323,9 +302,7 @@ This assumption stems from the principle that *groups are _exchangeable_*. ] -#slide( - title: [Hyperprior], -)[ +#slide(title: [Hyperprior])[ In hierarchical models, we have a hyperprior, which is a prior's prior: #v(1em) @@ -333,7 +310,7 @@ $ bold(y) &tilde "Normal"(10, bold(θ)) \ bold(θ) &tilde "Normal"(0, φ) \ - φ &tilde "Exponential(1)" + φ &tilde "Exponential(1)" $ #v(2em) @@ -343,12 +320,8 @@ their own prior (which becomes a hyperprior) $φ$. ] -#slide( - title: [Frequentist versus Bayesian Approaches], -)[ - #text( - size: 18pt, - )[ +#slide(title: [Frequentist versus Bayesian Approaches])[ + #text(size: 18pt)[ There are also hierarchical models in frequentist statistics. They are mainly available in the lme4 package @lme4, and also in MixedModels.jl @MixedModels. @@ -370,9 +343,7 @@ ] ] -#slide( - title: [Frequentist versus Bayesian Approaches], -)[ +#slide(title: [Frequentist versus Bayesian Approaches])[ #v(3em) To sum up, *frequentist approach for hierarchical models is not robust* in both @@ -381,9 +352,7 @@ provide $p$-values due to *strong assumptions that are almost always violated*). ] -#slide( - title: [Approaches to Hierarchical Modeling], -)[ +#slide(title: [Approaches to Hierarchical Modeling])[ #v(1em) - *Varying-intercept* model: One group-level intercept besides the @@ -400,9 +369,7 @@ group-level coefficient(s). 
] -#slide( - title: [Mathematical Specification of Hierarchical Models], -)[ +#slide(title: [Mathematical Specification of Hierarchical Models])[ #v(4em) We have $N$ observations organized in $J$ groups with $K$ independent variables. @@ -413,40 +380,34 @@ $ bold(y) &tilde "Normal"(α_j + bold(X) dot bold(β), σ) \ - α_j &tilde "Normal"(α, τ) \ - α &tilde "Normal"(μ_α, σ_α) \ + α_j &tilde "Normal"(α, τ) \ + α &tilde "Normal"(μ_α, σ_α) \ bold(β) &tilde "Normal"(μ_{bold(β)}, σ_(bold(β))) \ - τ &tilde "Cauchy"^+(0, ψ_(α)) \ - σ &tilde "Exponential"(λ_σ) + τ &tilde "Cauchy"^+(0, ψ_(α)) \ + σ &tilde "Exponential"(λ_σ) $ ] -#slide( - title: [Mathematical Specification -- Varying-Intercept Model], -)[ +#slide(title: [Mathematical Specification -- Varying-Intercept Model])[ - #text( - size: 14pt, - )[ + #text(size: 14pt)[ If you need to extend to more than one group, such as $J_1, J_2, dots$: $ - bold(y) &tilde "Normal"(α_j_1 + α_j_2 + bold(X) bold(β), σ) \ - α_(j_1) &tilde "Normal"(α_1, τ_α_j_1) \ - α_(j_2) &tilde "Normal"(α_2, τ_α_j_2) \ - α_1 &tilde "Normal"(μ_(α_1), σ_α_1) \ - α_2 &tilde "Normal"(μ_(α_2), σ_α_2) \ - bold(β) &tilde "Normal"(μ_(bold(β)), σ_(bold(β))) \ + bold(y) &tilde "Normal"(α_j_1 + α_j_2 + bold(X) bold(β), σ) \ + α_(j_1) &tilde "Normal"(α_1, τ_α_j_1) \ + α_(j_2) &tilde "Normal"(α_2, τ_α_j_2) \ + α_1 &tilde "Normal"(μ_(α_1), σ_α_1) \ + α_2 &tilde "Normal"(μ_(α_2), σ_α_2) \ + bold(β) &tilde "Normal"(μ_(bold(β)), σ_(bold(β))) \ τ_(α_j_1) &tilde "Cauchy"^+(0, ψ_α_j_1) \ τ_(α_j_2) &tilde "Cauchy"^+(0, ψ_α_j_2) \ - σ &tilde "Exponential"(λ_σ) + σ &tilde "Exponential"(λ_σ) $ ] ] -#slide( - title: [Mathematical Specification -- Varying-(Intercept-)Slope Model], -)[ +#slide(title: [Mathematical Specification -- Varying-(Intercept-)Slope Model])[ If we want a varying intercept, we just insert a column filled with $1$s in the data matrix $bold(X)$. @@ -459,12 +420,10 @@ intercept. ] -#slide( -title: [Mathematical Specification -- Varying-(Intercept-)Slope Model], -)[ -Hence, we have as a data matrix: +#slide(title: [Mathematical Specification -- Varying-(Intercept-)Slope Model])[ + Hence, we have as a data matrix: -#v(2em) + #v(2em) // typstfmt::off $ @@ -480,17 +439,15 @@ Hence, we have as a data matrix: // typstfmt::on ] -#slide( - title: [Mathematical Specification -- Varying-(Intercept-)Slope Model], -)[ +#slide(title: [Mathematical Specification -- Varying-(Intercept-)Slope Model])[ This example is for linear regression: $ - bold(y) &tilde "Normal"(bold(X) bold(β)_{j}, σ) \ + bold(y) &tilde "Normal"(bold(X) bold(β)_{j}, σ) \ bold(β)_j &tilde "Multivariate Normal"(bold(μ)_j, bold(Σ)) "for" j ∈ {1, dots, J} \ - bold(Σ) &tilde "LKJ"(η) \ - σ &tilde "Exponential"(λ_σ) + bold(Σ) &tilde "LKJ"(η) \ + σ &tilde "Exponential"(λ_σ) $ #v(1em) @@ -500,26 +457,22 @@ Hence, we have as a data matrix: filled with $1$s (intercept). 
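A minimal Turing.jl sketch of the varying-intercept specification above (priors are placeholders, and `idx` is a hypothetical vector giving each observation's group):

```julia
using Turing

@model function varying_intercept(X, idx, y, J = maximum(idx))
    # population-level parameters (weakly informative placeholder priors)
    α ~ Normal(0, 10)
    β ~ filldist(Normal(0, 2), size(X, 2))
    σ ~ Exponential(1)

    # group-level intercepts and their hyperprior on the (non-negative) scale τ
    τ   ~ truncated(Cauchy(0, 2); lower = 0)
    α_j ~ filldist(Normal(0, τ), J)          # group offsets around the common α

    η = α .+ α_j[idx] .+ X * β               # linear predictor, one entry per observation
    for i in eachindex(y)
        y[i] ~ Normal(η[i], σ)
    end
end

# hypothetical usage, with idx[i] ∈ 1:J giving the group of observation i:
# chain = sample(varying_intercept(X, idx, y), NUTS(), 2_000)
```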
] -#slide( - title: [Mathematical Specification -- Varying-(Intercept-)Slope Model], -)[ +#slide(title: [Mathematical Specification -- Varying-(Intercept-)Slope Model])[ If you need to extend to more than one group, such as $J_1, J_2, dots$: $ - bold(y) &tilde "Normal"(α + bold(X) bold(β)_j_1 + bold(X) bold(β)_j_2, σ) \ + bold(y) &tilde "Normal"(α + bold(X) bold(β)_j_1 + bold(X) bold(β)_j_2, σ) \ bold(β)_j_1 &tilde "Multivariate Normal"(bold(μ)_j_1, bold(Σ)_1) "for" j_1 ∈ {1, dots, J_1} \ bold(β)_j_2 &tilde "Multivariate Normal"(bold(μ)_j_2, bold(Σ)_2) "for" j_2 ∈ {1, dots, J_2} \ - bold(Σ)_1 &tilde "LKJ"(η_1) \ - bold(Σ)_2 &tilde "LKJ"(η_2) \ - σ &tilde "Exponential"(λ_σ) + bold(Σ)_1 &tilde "LKJ"(η_1) \ + bold(Σ)_2 &tilde "LKJ"(η_2) \ + σ &tilde "Exponential"(λ_σ) $ ] -#slide( - title: [Priors for Covariance Matrices], -)[ +#slide(title: [Priors for Covariance Matrices])[ We can specify a prior for a covariance matrix $bold(Σ)$. #v(1em) @@ -538,20 +491,14 @@ Hence, we have as a data matrix: $bold(Σ)$ (is is the $bold(Σ)$'s diagonal). ] -#slide( - title: [Priors for Covariance Matrices], -)[ - #text( - size: 16pt, - )[ +#slide(title: [Priors for Covariance Matrices])[ + #text(size: 16pt)[ Additionally, the correlation matrix $bold(Ω)$ can be decomposed once more for greater computational efficiency. Since all correlations matrices are symmetric and positive definite (all of its eigenvalues are real numbers $RR$ and positive $>0$), we can use the - #link( - "https://en.wikipedia.org/wiki/Cholesky_decomposition", - )[Cholesky Decomposition] + #link("https://en.wikipedia.org/wiki/Cholesky_decomposition")[Cholesky Decomposition] to decompose it into a triangular matrix (which is much more computational efficient to handle): diff --git a/slides/12-mcmc.typ b/slides/12-mcmc.typ index 8398bd2..54734ef 100644 --- a/slides/12-mcmc.typ +++ b/slides/12-mcmc.typ @@ -6,12 +6,8 @@ #new-section-slide("Markov Chain Monte Carlo (MCMC) and Model Metrics") -#slide( - title: "Recommended References", -)[ - #text( - size: 18pt, - )[ +#slide(title: "Recommended References")[ + #text(size: 18pt)[ - #cite(, form: "prose"): - Chapter 10: Introduction to Bayesian computation - Chapter 11: Basics of Markov chain simulation @@ -37,12 +33,8 @@ #align(center)[#image("images/memes/computation.png")] ] -#slide( - title: [Monte Carlo Methods], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ +#slide(title: [Monte Carlo Methods])[ + #side-by-side(columns: (4fr, 1fr))[ - #link("http://mc-stan.org/")[Stan] is named after the mathematician Stanislaw Ulam, who was involved in the Manhattan project, and while trying to calculate @@ -57,18 +49,12 @@ ] ] -#slide( - title: [ - History Behind the Monte Carlo Methods - #footnote[those who are interested, should read #cite(, form: "prose").] - ], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ - #text( - size: 17pt, - )[ +#slide(title: [ + History Behind the Monte Carlo Methods + #footnote[those who are interested, should read #cite(, form: "prose").] +])[ + #side-by-side(columns: (4fr, 1fr))[ + #text(size: 17pt)[ - The idea came when Ulam was playing Solitaire while recovering from surgery. Ulam was trying to calculate the deterministic, i.e. analytical solution, of the probability of being dealt an already-won game. 
The calculations where almost @@ -86,12 +72,8 @@ ] ] -#slide( - title: [Why Do We Need MCMC?], -)[ - #text( - size: 18pt, - )[ +#slide(title: [Why Do We Need MCMC?])[ + #text(size: 18pt)[ The main computation barrier for Bayesian statistics is the denominator in Bayes' theorem, $P("data")$: @@ -115,9 +97,7 @@ ] ] -#slide( - title: [Why Do We Need MCMC?], -)[ +#slide(title: [Why Do We Need MCMC?])[ However, in the case of continuous values, the denominator $P("data")$ turns into a very big and nasty integral: @@ -135,9 +115,7 @@ *This is where Monte Carlo methods comes into play!* ] -#slide( - title: [Why Do We Need the Denominator $P("data")$?], -)[ +#slide(title: [Why Do We Need the Denominator $P("data")$?])[ To normalize the posterior with the intent of making it a *valid probability*. This means that the probability for all possible parameters' values must be $1$: @@ -152,9 +130,7 @@ $ ] -#slide( - title: [What If We Remove the Denominator $P("data")$?], -)[ +#slide(title: [What If We Remove the Denominator $P("data")$?])[ By removing the denominator $("data")$, we conclude that the posterior $P(θ | "data")$ is *proportional* to the product of the prior and the likelihood $P(θ) dot P("data" | θ)$: @@ -166,9 +142,7 @@ $ ] -#slide( - title: [Markov Chain Monte Carlo (MCMC)], -)[ +#slide(title: [Markov Chain Monte Carlo (MCMC)])[ Here is where *Markov Chain Monte Carlo* comes in: MCMC is an ample class of computational tools to approximate integrals and @@ -189,14 +163,10 @@ $P("data")$. ] -#slide( - title: [ - Markov Chains - ], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ +#slide(title: [ + Markov Chains +])[ + #side-by-side(columns: (4fr, 1fr))[ #v(2em) - We proceed by defining an *ergodic Markov chain* #footnote[ @@ -215,14 +185,10 @@ ] ] -#slide( - title: [ - Markov Chains - ], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ +#slide(title: [ + Markov Chains +])[ + #side-by-side(columns: (4fr, 1fr))[ - Markov chains have a property that the probability distribution of the next state *depends only on the current state and not in the sequence of events that @@ -241,18 +207,19 @@ ] ] -#slide( - title: [ - Example of a Markov Chain - ], -)[ - #align( - center, - )[ +#slide(title: [ + Example of a Markov Chain +])[ + #align(center)[ #canvas( - length: 0.9cm, { + length: 0.9cm, + { import draw: * - set-style(mark: (end: ">", fill: black), stroke: (thickness: 2pt), radius: 2) + set-style( + mark: (end: ">", fill: black), + stroke: (thickness: 2pt), + radius: 2, + ) circle((0, 0)) content((0, 0), [#align(center)[Sun]]) bezier-through((0, 2), (4, 4), (8, 2)) @@ -273,12 +240,8 @@ ] ] -#slide( - title: [Markov Chains], -)[ - #text( - size: 18pt, - )[ +#slide(title: [Markov Chains])[ + #text(size: 18pt)[ The efficacy of this approach depends on: - *how big $r$ must be* to guarantee an *adequate sample*. @@ -297,9 +260,7 @@ ] ] -#slide( - title: [MCMC Algorithms], -)[ +#slide(title: [MCMC Algorithms])[ We have *TONS* of MCMC algorithms #footnote[ see the #link( @@ -319,9 +280,7 @@ @neal2011mcmc @betancourtConceptualIntroductionHamiltonian2017. ] -#slide( - title: [MCMC Algorithms -- Metropolis-Hastings], -)[ +#slide(title: [MCMC Algorithms -- Metropolis-Hastings])[ These are the first MCMC algorithms. They use an *acceptance/rejection rule for the proposals*. They are characterized by proposals originated from a random walk in the parameter space. The *Gibbs algorithm* can be seen as a *special @@ -335,9 +294,7 @@ the parameter space @beskosOptimalTuningHybrid2013. 
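Ahead of the Metropolis pseudocode that follows, here is a minimal plain-Julia sketch of a random-walk Metropolis sampler for a one-dimensional target; the function name, proposal width, and seed are illustrative assumptions. Note that only the unnormalized log-posterior is needed, since the evidence P(data) cancels in the acceptance ratio.

```julia
using Distributions, Random

# Random-walk Metropolis for a 1-D target. Only the unnormalized log-posterior
# is needed: the evidence P(data) cancels in the acceptance ratio r.
function metropolis(logtarget, θ0; iters=10_000, width=1.0, rng=MersenneTwister(123))
    θ = θ0
    draws = Vector{Float64}(undef, iters)
    accepted = 0
    for t in 1:iters
        θstar = θ + width * randn(rng)          # symmetric random-walk proposal
        logr = logtarget(θstar) - logtarget(θ)  # log acceptance ratio
        if log(rand(rng)) < logr                # accept with probability min(1, r)
            θ = θstar
            accepted += 1
        end
        draws[t] = θ                            # on rejection, the chain repeats its state
    end
    return draws, accepted / iters
end

# Example: sampling a standard normal target.
draws, rate = metropolis(θ -> logpdf(Normal(0, 1), θ), 0.0)
```

Tuning `width` trades off acceptance rate against step size, which is exactly the trade-off behind the optimal acceptance rates quoted above.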
] -#slide( - title: [MCMC Algorithms -- Hamiltonian Monte Carlo], -)[ +#slide(title: [MCMC Algorithms -- Hamiltonian Monte Carlo])[ The current most efficient MCMC algorithms. They try to *avoid the random walk behavior by introducing an auxiliary vector of momenta using Hamiltonian dynamics*. The proposals are "guided" to higher density @@ -351,17 +308,11 @@ dimension in the parameter space @beskosOptimalTuningHybrid2013. ] -#slide( - title: [ - Metropolis Algorithm - ], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ - #text( - size: 18pt, - )[ +#slide(title: [ + Metropolis Algorithm +])[ + #side-by-side(columns: (4fr, 1fr))[ + #text(size: 18pt)[ The first broadly used MCMC algorithm to generate samples from a Markov chain was originated in the physics literature in the 1950s and is called Metropolis @metropolisEquationStateCalculations1953, in honor of the first author @@ -386,12 +337,8 @@ ] ] -#slide( - title: [Metropolis Algorithm], -)[ - #text( - size: 18pt, - )[ +#slide(title: [Metropolis Algorithm])[ + #text(size: 18pt)[ Metropolis is a random walk through the parameter sample space, where the probability of the Markov chain changing its state is defined as: @@ -415,12 +362,8 @@ ] ] -#slide( - title: [Metropolis Algorithm], -)[ - #algo( - line-numbers: false, - )[ +#slide(title: [Metropolis Algorithm])[ + #algo(line-numbers: false)[ Define an initial set $bold(θ)^0 ∈ RR^p$ that $P(bold(θ)^0 | bold(y)) > 0$ \ for $t = 1, 2, dots$ #i \ Sample a proposal of $bold(θ)^*$ from a proposal distribution in time @@ -440,20 +383,26 @@ ] ] -#slide( - title: [Visual Intuition -- Metropolis], -)[ - #align( - center, - )[ +#slide(title: [Visual Intuition -- Metropolis])[ + #align(center)[ #import draw: * #canvas( - length: 0.9cm, { + length: 0.9cm, + { set-style(stroke: (thickness: 2pt)) plot.plot( - size: (16, 9), x-label: none, y-label: "PDF", x-tick-step: 1, y-tick-step: 0.1, y-max: 0.55, { + size: (16, 9), + x-label: none, + y-label: "PDF", + x-tick-step: 1, + y-tick-step: 0.1, + y-max: 0.55, + { plot.add( - domain: (-4, 4), samples: 200, style: (stroke: (paint: julia-purple, thickness: 2pt)), x => gaussian(x, 0, 1), + domain: (-4, 4), + samples: 200, + style: (stroke: (paint: julia-purple, thickness: 2pt)), + x => gaussian(x, 0, 1), ) }, ) @@ -473,14 +422,10 @@ ] ] -#slide( - title: [ - Metropolis-Hastings Algorithm - ], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ +#slide(title: [ + Metropolis-Hastings Algorithm +])[ + #side-by-side(columns: (4fr, 1fr))[ In the 1970s emerged a generalization of the Metropolis algorithm, which *does not need that the proposal distributions be symmetric*: @@ -499,12 +444,8 @@ ] ] -#slide( - title: [Metropolis-Hastings Algorithm], -)[ - #algo( - line-numbers: false, - )[ +#slide(title: [Metropolis-Hastings Algorithm])[ + #algo(line-numbers: false)[ Define an initial set $bold(θ)^0 ∈ RR^p$ that $P(bold(θ)^0 | bold(y)) > 0$ \ for $t = 1, 2, dots$ #i \ Sample a proposal of $bold(θ)^*$ from a proposal distribution in time @@ -512,7 +453,11 @@ As an acceptance/rejection rule, compute the proportion of the probabilities: \ $ - r = ((P(bold(θ)^* | bold(y))) / (J_t (bold(θ)^* | bold(θ)^(t - 1)))) / ((P(bold(θ)^(t - 1) | bold(y))) / (J_t (bold(θ)^(t - 1) | bold(θ)^*))) + r = ((P(bold(θ)^* | bold(y))) / (J_t ( + bold(θ)^* | bold(θ)^(t - 1) + ))) / ((P(bold(θ)^(t - 1) | bold(y))) / (J_t ( + bold(θ)^(t - 1) | bold(θ)^* + ))) $ Assign: @@ -524,25 +469,19 @@ ] ] -#slide( - title: [Metropolis-Hastings Animation], -)[ -#v(4em) +#slide(title: [Metropolis-Hastings 
Animation])[ + #v(4em) -#align( - center, -)[ -See Metropolis-Hastings in action at #link( + #align(center)[ + See Metropolis-Hastings in action at #link( "https://chi-feng.github.io/mcmc-demo/app.html?algorithm=RandomWalkMH&target=banana", )[ `chi-feng/mcmc-demo` ]. -] + ] ] -#slide( - title: [Limitations of the Metropolis Algorithms], -)[ +#slide(title: [Limitations of the Metropolis Algorithms])[ The limitations of the Metropolis-Hastings algorithms are mainly *computational*: @@ -558,12 +497,8 @@ See Metropolis-Hastings in action at #link( overcome the low efficiency. ] -#slide( - title: [Gibbs Algorithm], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ +#slide(title: [Gibbs Algorithm])[ + #side-by-side(columns: (4fr, 1fr))[ To circumvent Metropolis' low acceptance rate, the Gibbs algorithm was conceived. Gibbs *do not have an acceptance/rejection rule* for the Markov chain state change: *all proposals are accepted!* @@ -579,9 +514,7 @@ See Metropolis-Hastings in action at #link( ] ] -#slide( - title: [Gibbs Algorithm], -)[ +#slide(title: [Gibbs Algorithm])[ The Gibbs algorithm is very useful in multidimensional sample spaces. It is also known as *alternating conditional sampling*, because we always sample a parameter *conditioned* on the probability of the other model's parameters. @@ -598,12 +531,8 @@ See Metropolis-Hastings in action at #link( $ ] -#slide( - title: [Gibbs Algorithm], -)[ - #algo( - line-numbers: false, - )[ +#slide(title: [Gibbs Algorithm])[ + #algo(line-numbers: false)[ Define an initial set $bold(θ)^0 ∈ RR^p$ that $P(bold(θ)^0 | bold(y)) > 0$ \ for $t = 1, 2, dots$ #i \ Assign: @@ -618,25 +547,19 @@ See Metropolis-Hastings in action at #link( ] ] -#slide( - title: [Gibbs Animation], -)[ -#v(4em) +#slide(title: [Gibbs Animation])[ + #v(4em) -#align( - center, -)[ -See Gibbs in action at #link( + #align(center)[ + See Gibbs in action at #link( "https://chi-feng.github.io/mcmc-demo/app.html?algorithm=GibbsSampling&target=banana", )[ `chi-feng/mcmc-demo` ]. -] + ] ] -#slide( - title: [Limitations of the Gibbs Algorithm], -)[ +#slide(title: [Limitations of the Gibbs Algorithm])[ The main limitation of Gibbs algorithm is with relation to *alternating conditional sampling*: @@ -650,12 +573,8 @@ See Gibbs in action at #link( movements*, and never multidimensional diagonal movements. ] -#slide( - title: [Hamiltonian Monte Carlo (HMC)], -)[ - #side-by-side( - columns: (4fr, 1fr), - )[ +#slide(title: [Hamiltonian Monte Carlo (HMC)])[ + #side-by-side(columns: (4fr, 1fr))[ Metropolis' low acceptance rate and Gibbs' low performance in multidimensional problems (where the posterior geometry is highly complex) made a new class of MCMC algorithms to emerge. @@ -670,12 +589,8 @@ See Gibbs in action at #link( ] ] -#slide( - title: [HMC Algorithm], -)[ - #text( - size: 18pt, - )[ +#slide(title: [HMC Algorithm])[ + #text(size: 18pt)[ HMC algorithm is an adaptation of the MH algorithm, and employs a guidance scheme to the generation of new proposals. It boosts the acceptance rate, and, consequently, has a better efficiency. @@ -694,9 +609,7 @@ See Gibbs in action at #link( ] ] -#slide( - title: [History of HMC Algorithm], -)[ +#slide(title: [History of HMC Algorithm])[ HMC was originally described in the physics literature #footnote[where is called "Hybrid" Monte Carlo (HMC)] @duaneHybridMonteCarlo1987. @@ -714,9 +627,7 @@ See Gibbs in action at #link( and #cite(, form: "prose"). 
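To make the "alternating conditional sampling" idea described above concrete, here is a minimal plain-Julia sketch of a Gibbs sampler for a bivariate standard normal with correlation ρ, whose full conditionals are known in closed form; the target distribution and seed are illustrative assumptions.

```julia
using Distributions, Random

# Gibbs sampling for a bivariate standard normal with correlation ρ: every draw
# comes from a full conditional, so there is no accept/reject step, but the
# chain can only move parallel to one coordinate axis at a time.
function gibbs_binormal(ρ; iters=10_000, rng=MersenneTwister(123))
    x, y = 0.0, 0.0
    draws = Matrix{Float64}(undef, iters, 2)
    for t in 1:iters
        x = rand(rng, Normal(ρ * y, sqrt(1 - ρ^2)))  # x | y
        y = rand(rng, Normal(ρ * x, sqrt(1 - ρ^2)))  # y | x
        draws[t, :] .= (x, y)
    end
    return draws
end

draws = gibbs_binormal(0.8)
```

The axis-aligned moves are what make Gibbs struggle when parameters are highly correlated, which is the limitation discussed above.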
] -#slide( - title: [What Changes With HMC?], -)[ +#slide(title: [What Changes With HMC?])[ HMC uses Hamiltonian dynamics applied to particles efficiently exploring a posterior probability geometry, while also being robust to complex posterior's geometries. @@ -727,12 +638,8 @@ See Gibbs in action at #link( Gibbs' parameters correlation issues ] -#slide( - title: [Intuition Behind the HMC Algorithm], -)[ - #text( - size: 18pt, - )[ +#slide(title: [Intuition Behind the HMC Algorithm])[ + #text(size: 18pt)[ For every parameter $θ_j$, HMC adds a momentum variable $φ_j$. The posterior density $P(bold(θ) | y)$ is incremented by an independent momenta distribution $P(bold(φ))$, hence defining the following joint probability: @@ -757,9 +664,7 @@ See Gibbs in action at #link( ] ] -#slide( - title: [Momenta Distribution -- $P(bold(φ))$], -)[ +#slide(title: [Momenta Distribution -- $P(bold(φ))$])[ Generally we give $bold(φ)$ a multivariate normal distribution with mean $0$ and covariance $bold(M)$, a "mass matrix". @@ -772,15 +677,9 @@ See Gibbs in action at #link( $ φ_j tilde "Normal"(0, M_(j j)) $ ] -#slide( - title: [HMC Algorithm], -)[ - #text( - size: 13pt, - )[ - #algo( - line-numbers: false, - )[ +#slide(title: [HMC Algorithm])[ + #text(size: 13pt)[ + #algo(line-numbers: false)[ Define an initial set $bold(θ)^0 ∈ RR^p$ that $P(bold(θ)^0 | bold(y)) > 0$ \ Sample $bold(φ)$ from a $"Multivariate Normal"(bold(0),bold(M))$ \ Simultaneously sample $bold(θ)^*$ and $bold(φ)$ with $L$ steps and step-size $ε$ \ @@ -788,14 +687,20 @@ See Gibbs in action at #link( $bold(θ)^* <- bold(θ)$ \ for $1, 2, dots, L$ #i \ Use the $log$ of the posterior's gradient $bold(θ)^*$ to produce a half-step of $bold(φ)$: - $bold(φ) <- bold(φ) + 1 / 2 ε (dif log P(bold(θ)^* | bold(y))) / (dif θ)$ \ + $bold(φ) <- bold(φ) + 1 / 2 ε (dif log P( + bold(θ)^* | bold(y) + )) / (dif θ)$ \ Use $bold(φ)$ to update $bold(θ)^*$: $bold(θ)^* <- bold(θ)^* + ε bold(M)^(-1) bold(φ)$ \ Use again $bold(θ)^*$ $log$ gradient to produce a half-step of $bold(φ)$: - $bold(φ) <- bold(φ) + 1 / 2 ε (dif log P(bold(θ)^* | bold(y))) / (dif θ)$ #d \ + $bold(φ) <- bold(φ) + 1 / 2 ε (dif log P( + bold(θ)^* | bold(y) + )) / (dif θ)$ #d \ As an acceptance/rejection rule, compute: \ $ - r = (P (bold(θ)^* | bold(y) ) P (bold(φ)^*))/ (P (bold(θ)^(t - 1) | bold(y)) P(bold(φ)^(t - 1))) + r = (P (bold(θ)^* | bold(y)) P (bold(φ)^*)) / (P ( + bold(θ)^(t - 1) | bold(y) + ) P(bold(φ)^(t - 1))) $ Assign: @@ -807,25 +712,19 @@ See Gibbs in action at #link( ] ] -#slide( - title: [HMC Animation], -)[ -#v(4em) +#slide(title: [HMC Animation])[ + #v(4em) -#align( - center, -)[ -See HMC in action at #link( + #align(center)[ + See HMC in action at #link( "https://chi-feng.github.io/mcmc-demo/app.html?algorithm=HamiltonianHMC&target=banana", )[ `chi-feng/mcmc-demo` ]. -] + ] ] -#slide( - title: [An Interlude into Numerical Integration], -)[ +#slide(title: [An Interlude into Numerical Integration])[ In the field of ordinary differential equations (ODE), we have the idea of "discretizing" a system of ODEs by applying a small step-size $ε$ #footnote[sometimes also called $h$]. Such approaches are called "numerical integrators" and are composed by an ample @@ -838,12 +737,8 @@ See HMC in action at #link( future time $t$ from specific initial conditions. 
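The HMC pseudocode above is short enough to sketch directly. Below is a minimal plain-Julia version of a single HMC transition with an identity mass matrix; `logtarget` and `gradlogtarget` stand in for the log-posterior and its gradient, and the values of ε and L are illustrative assumptions. The integrator used here is the leapfrog scheme discussed next.

```julia
using LinearAlgebra, Random

# One HMC transition: leapfrog half-steps of the momenta φ around full steps of
# θ, followed by a Metropolis-style accept/reject on the joint density of (θ, φ).
function hmc_step(logtarget, gradlogtarget, θ; ε=0.1, L=20, rng=MersenneTwister(123))
    φ = randn(rng, length(θ))                      # resample momenta ~ Normal(0, I)
    θstar, φstar = copy(θ), copy(φ)
    for _ in 1:L
        φstar .+= 0.5 * ε .* gradlogtarget(θstar)  # half-step of the momenta
        θstar .+= ε .* φstar                       # full step of the parameters
        φstar .+= 0.5 * ε .* gradlogtarget(θstar)  # half-step of the momenta
    end
    # log r = log P(θ* | y) + log P(φ*) − log P(θ | y) − log P(φ), with P(φ) standard normal
    logr = (logtarget(θstar) - 0.5 * dot(φstar, φstar)) -
           (logtarget(θ) - 0.5 * dot(φ, φ))
    return log(rand(rng)) < logr ? θstar : θ
end

# Example: one transition targeting a 2-D standard normal.
θnew = hmc_step(θ -> -0.5 * dot(θ, θ), θ -> -θ, [1.0, -1.0])
```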
] -#slide( - title: [An Interlude into Numerical Integration], -)[ - #side-by-side( - columns: (3fr, 2fr), - )[ +#slide(title: [An Interlude into Numerical Integration])[ + #side-by-side(columns: (3fr, 2fr))[ The problem is that Euler method, when applied to Hamiltonian dynamics, *does not preserve volume*. @@ -858,23 +753,20 @@ See HMC in action at #link( ][ #figure( - image("images/mcmc/euler_0_3.jpg", height: 55%), caption: [HMC numerically integrated using Euler with $ε = 0.3$ and $L = 20$], + image("images/mcmc/euler_0_3.jpg", height: 55%), + caption: [HMC numerically integrated using Euler with $ε = 0.3$ and $L = 20$], ) ] ] -#slide( - title: [ - An Interlude into Numerical Integration - #footnote[ - An excellent textbook for numerical and symplectic integrator is - #cite(, form: "prose"). - ] - ], -)[ - #side-by-side( - columns: (3fr, 2fr), - )[ +#slide(title: [ + An Interlude into Numerical Integration + #footnote[ + An excellent textbook for numerical and symplectic integrator is + #cite(, form: "prose"). + ] +])[ + #side-by-side(columns: (3fr, 2fr))[ To preserve volume, we need a numerical *symplectic integrator*. #v(1em) @@ -888,17 +780,14 @@ See HMC in action at #link( ][ #figure( - image("images/mcmc/leapfrog_0_3.jpg", height: 55%), caption: [HMC numerically integrated using leapfrog with $ε = 0.3$ and $L = 20$], + image("images/mcmc/leapfrog_0_3.jpg", height: 55%), + caption: [HMC numerically integrated using leapfrog with $ε = 0.3$ and $L = 20$], ) ] ] -#slide( - title: [Limitations of the HMC Algorithm], -)[ - #side-by-side( - columns: (3fr, 2fr), - )[ +#slide(title: [Limitations of the HMC Algorithm])[ + #side-by-side(columns: (3fr, 2fr))[ As you can see, HMC algorithm is highly sensible to the choice of leapfrog steps $L$ and step-size $ε$, @@ -913,14 +802,13 @@ See HMC in action at #link( ][ #figure( - image("images/mcmc/leapfrog_1_2.jpg", height: 55%), caption: [HMC numerically integrated using leapfrog with $ε = 1.2$ and $L = 20$], + image("images/mcmc/leapfrog_1_2.jpg", height: 55%), + caption: [HMC numerically integrated using leapfrog with $ε = 1.2$ and $L = 20$], ) ] ] -#slide( - title: [No-U-Turn-Sampler (NUTS)], -)[ +#slide(title: [No-U-Turn-Sampler (NUTS)])[ In HMC, we can adjust $ε$ during the algorithm runtime. But, for $L$, we need to to "dry run" the HMC sampler to find a good candidate value for $L$. @@ -934,9 +822,7 @@ See HMC in action at #link( It will automatically find $ε$ and $L$. ] -#slide( - title: [No-U-Turn-Sampler (NUTS)], -)[ +#slide(title: [No-U-Turn-Sampler (NUTS)])[ More specifically, we need a criterion that informs that we performed enough Hamiltonian dynamics simulation. @@ -957,9 +843,7 @@ See HMC in action at #link( $ ] -#slide( - title: [No-U-Turn-Sampler (NUTS)], -)[ +#slide(title: [No-U-Turn-Sampler (NUTS)])[ #v(2em) This suggests an algorithms that does not allow proposals be guided infinitely @@ -971,12 +855,8 @@ See HMC in action at #link( This means that such algorithm will *not allow u-turns*. ] -#slide( - title: [No-U-Turn-Sampler (NUTS)], -)[ - #text( - size: 18pt, - )[ +#slide(title: [No-U-Turn-Sampler (NUTS)])[ + #text(size: 18pt)[ NUTS uses the leapfrog integrator to create a binary tree where each leaf node is a proposal of the momenta vector $bold(φ)$ tracing both a forward ($t + 1$) as well as a backward ($t - 1$) path in a determined fictitious time $t$. @@ -985,14 +865,13 @@ See HMC in action at #link( forward or backward. 
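The stopping rule described above can be written in a few lines. Assuming θ⁺ and θ⁻ are the positions at the forward and backward ends of the trajectory and φ⁺, φ⁻ their momenta, a minimal Julia sketch of the U-turn check is:

```julia
using LinearAlgebra

# NUTS-style U-turn check: stop doubling the trajectory once either end starts
# moving back toward the other, i.e. when the momentum at an endpoint has a
# negative projection onto the vector joining the two endpoints.
uturn(θplus, θminus, φplus, φminus) =
    dot(θplus .- θminus, φminus) < 0 || dot(θplus .- θminus, φplus) < 0
```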
#figure( - image("images/mcmc/nuts.jpg", height: 40%), caption: [NUTS growing leaf nodes forward], + image("images/mcmc/nuts.jpg", height: 40%), + caption: [NUTS growing leaf nodes forward], ) ] ] -#slide( - title: [No-U-Turn-Sampler (NUTS)], -)[ +#slide(title: [No-U-Turn-Sampler (NUTS)])[ #v(2em) NUTS also uses a procedure called Dual Averaging @nesterov2009primal to @@ -1005,19 +884,11 @@ See HMC in action at #link( $ε$ and $L$ are kept fixed during the sampling phase. ] -#slide( - title: [NUTS Algorithm], -)[ - #text( - size: 7pt, - )[ - #algo( - line-numbers: false, - )[ +#slide(title: [NUTS Algorithm])[ + #text(size: 7pt)[ + #algo(line-numbers: false)[ Define an initial set $bold(θ)^0 ∈ RR^p$ that $P(bold(θ)^0 | bold(y)) > 0$ \ - #text( - fill: julia-blue, - )[Instantiate an empty binary tree with $2^L$ leaf nodes] \ + #text(fill: julia-blue)[Instantiate an empty binary tree with $2^L$ leaf nodes] \ Sample $bold(φ)$ from a $"Multivariate Normal"(bold(0),bold(M))$ \ Simultaneously sample $bold(θ)^*$ and $bold(φ)$ with $L$ steps and step-size $ε$ \ Define the current value of $bold(θ)$ as the proposed value $bold(θ)^*$: @@ -1025,39 +896,37 @@ See HMC in action at #link( for $1, 2, dots, L$ #i \ #text(fill: julia-blue)[Choose a direction $v tilde "Uniform"({-1, 1})$] \ Use the $log$ of the posterior's gradient $bold(θ)^*$ to produce a half-step of $bold(φ)$: - $bold(φ) <- bold(φ) + 1 / 2 ε (dif log P(bold(θ)^* | bold(y))) / (dif θ)$ \ + $bold(φ) <- bold(φ) + 1 / 2 ε (dif log P( + bold(θ)^* | bold(y) + )) / (dif θ)$ \ Use $bold(φ)$ to update $bold(θ)^*$: $bold(θ)^* <- bold(θ)^* + ε bold(M)^(-1) bold(φ)$ \ Use again $bold(θ)^*$ $log$ gradient to produce a half-step of $bold(φ)$: - $bold(φ) <- bold(φ) + 1 / 2 ε (dif log P(bold(θ)^* | bold(y))) / (dif θ)$ #d \ + $bold(φ) <- bold(φ) + 1 / 2 ε (dif log P( + bold(θ)^* | bold(y) + )) / (dif θ)$ #d \ #text(fill: julia-blue)[Define the node $L_t^v$ as the proposal $bold(θ)$] \ - #text( - fill: julia-blue, - )[ + #text(fill: julia-blue)[ if the difference between proposal vector $bold(θ)^*$ and current vector $bold(θ)$ in the direction $v$ is lower than zero: $v (dif) / (dif t) ((bold(θ)^* - bold(θ)^*) dot (bold(θ)^* - bold(θ)^*)) / 2 < 0$ or $L$ steps have been reached ] #i \ - #text( - fill: julia-red, - )[ + #text(fill: julia-red)[ Stop sampling $bold(θ)^*$ in the direction $v$ and continue sampling only in the direction $-v$ ] #i \ - #text( - fill: julia-blue, - )[ + #text(fill: julia-blue)[ The difference between proposal vector $bold(θ)^*$ and current vector $bold(θ)$ in the direction $-v$ is lower than zero: $-v (dif) / (dif t) ((bold(θ)^* - bold(θ)^*) dot (bold(θ)^* - bold(θ)^*)) / 2 < 0$ or $L$ steps have been reached ] #i \ #text(fill: julia-red)[Stop sampling $bold(θ)^*$] #d #d #d\ - #text( - fill: julia-blue, - )[Choose a random node from the binary tree as the proposal] \ + #text(fill: julia-blue)[Choose a random node from the binary tree as the proposal] \ As an acceptance/rejection rule, compute: \ $ - r = (P (bold(θ)^* | bold(y) ) P (bold(φ)^*))/ (P (bold(θ)^(t - 1) | bold(y)) P(bold(φ)^(t - 1))) + r = (P (bold(θ)^* | bold(y)) P (bold(φ)^*)) / (P ( + bold(θ)^(t - 1) | bold(y) + ) P(bold(φ)^(t - 1))) $ Assign: @@ -1069,26 +938,20 @@ See HMC in action at #link( ] ] -#slide( - title: [NUTS Animation], -)[ -#v(4em) +#slide(title: [NUTS Animation])[ + #v(4em) -#align( - center, -)[ -See NUTS in action at #link( + #align(center)[ + See NUTS in action at #link( 
"https://chi-feng.github.io/mcmc-demo/app.html?algorithm=EfficientNUTS&target=banana", )[ `chi-feng/mcmc-demo` ]. -] + ] ] -#slide( - title: [Limitations of HMC and NUTS Algorithms -- #cite(, form: "prose")'s - Funnel], -)[ +#slide(title: [Limitations of HMC and NUTS Algorithms -- #cite(, form: "prose")'s + Funnel])[ The famous "Devil's Funnel" #footnote[very common em hierarchical models.]. @@ -1102,13 +965,9 @@ See NUTS in action at #link( #align(center)[#image("images/funnel/funnel.png", height: 40%)] ] -#slide( - title: [#cite(, form: "prose")'s Funnel and Non-Centered - Parameterization (NCP)], -)[ - #text( - size: 18pt, - )[ +#slide(title: [#cite(, form: "prose")'s Funnel and Non-Centered + Parameterization (NCP)])[ + #text(size: 18pt)[ The funnel occurs when we have a variable that its variance depends on another variable variance in an exponential scale. A canonical example of a centered parameterization (CP) is: @@ -1126,8 +985,8 @@ See NUTS in action at #link( $ P(tilde(y),tilde(x)) &= "Normal"(tilde(y) | 0, 1) dot "Normal"(tilde(x) | 0, 1) \ - y &= tilde(y) dot 3 + 0 \ - x &= tilde(x) dot e^(y / 2) + 0 + y &= tilde(y) dot 3 + 0 \ + x &= tilde(x) dot e^(y / 2) + 0 $ ] ] @@ -1136,28 +995,26 @@ See NUTS in action at #link( This example is for linear regression: $ - bold(y) &tilde "Normal"(α_j + bold(X) dot bold(β), σ ) \ - α_j &= z_j dot τ + α \ - z_j &tilde "Normal"(0, 1) \ - α &tilde "Normal"(μ_α, σ_α) \ + bold(y) &tilde "Normal"(α_j + bold(X) dot bold(β), σ) \ + α_j &= z_j dot τ + α \ + z_j &tilde "Normal"(0, 1) \ + α &tilde "Normal"(μ_α, σ_α) \ bold(β) &tilde "Normal"(μ_bold(β), σ_bold(β)) \ - τ &tilde "Cauchy"^+(0, ψ_α) \ - σ &tilde "Exponential"(λ_σ) + τ &tilde "Cauchy"^+(0, ψ_α) \ + σ &tilde "Exponential"(λ_σ) $ ] -#slide( - title: [Non-Centered Parameterization -- Varying-(Intercept-)Slope Model], -)[ +#slide(title: [Non-Centered Parameterization -- Varying-(Intercept-)Slope Model])[ This example is for linear regression: $ - bold(y) &tilde "Normal"(bold(X) bold(β)_j, σ) \ + bold(y) &tilde "Normal"(bold(X) bold(β)_j, σ) \ bold(β)_j &= bold(γ)_j dot bold(Σ) dot bold(γ)_j \ bold(γ)_j &tilde "Multivariate Normal"(bold(0), bold(I)) "for" j ∈ {1, dots, J} \ - bold(Σ) &tilde "LKJ"(η) \ - σ &tilde "Exponential"(λ_σ) + bold(Σ) &tilde "LKJ"(η) \ + σ &tilde "Exponential"(λ_σ) $ Each coefficient vector $bold(β)_j$ represents the model columns $bold(X)$ coefficients @@ -1165,9 +1022,7 @@ See NUTS in action at #link( filled with $1$s (intercept). ] -#slide( - title: [Stan and NUTS], -)[ +#slide(title: [Stan and NUTS])[ Stan was the first MCMC sampler to implement NUTS. Besides that, it has an automatic optimized adjustment routine for values of $L$ and $ε$ during @@ -1189,9 +1044,7 @@ See NUTS in action at #link( - *max tree depth* (in powers of $2$): 10 (which means $2^(10) = 1024$) ] -#slide( - title: [Turing and NUTS], -)[ +#slide(title: [Turing and NUTS])[ Turing also implements NUTS which lives, along with other MCMC samplers, inside the package AdvancedHMC.jl. @@ -1211,9 +1064,7 @@ See NUTS in action at #link( - *max tree depth* (in powers of $2$): 10 (which means $2^(10) = 1024$) ] -#slide( - title: [Markov Chain Convergence], -)[ +#slide(title: [Markov Chain Convergence])[ MCMC has an interesting property that it will *asymptotically converge to the target distribution* #footnote[ @@ -1229,9 +1080,7 @@ See NUTS in action at #link( convergence to the target distribution. 
] -#slide( - title: [Convergence Metrics], -)[ +#slide(title: [Convergence Metrics])[ #v(2em) We have some options on how to measure if the Markov chains converged to the @@ -1248,9 +1097,7 @@ See NUTS in action at #link( Markov chain have mixed, and, potentially, converged. ] -#slide( - title: [Convergence Metrics -- Effective Sample Size @gelman2013bayesian], -)[ +#slide(title: [Convergence Metrics -- Effective Sample Size @gelman2013bayesian])[ $ hat(n)_"eff" = (m n) / (1 + sum_(t = 1)^T hat(ρ)_t) $ where: @@ -1262,12 +1109,8 @@ See NUTS in action at #link( - $hat(ρ)_t$: an autocorrelation estimate. ] -#slide( - title: [Convergence Metrics -- Rhat @gelman2013bayesian], -)[ - #text( - size: 18pt, - )[ +#slide(title: [Convergence Metrics -- Rhat @gelman2013bayesian])[ + #text(size: 18pt)[ $ hat(R) = sqrt((hat("var")^+ (ψ | y)) / W) $ where $hat("var")^+ (ψ | y)$ is the Markov chains' sample variance for a certain @@ -1284,26 +1127,20 @@ See NUTS in action at #link( ] ] -#slide( - title: [Traceplot -- Convergent Markov Chains], -)[ - #align( - center, - )[ +#slide(title: [Traceplot -- Convergent Markov Chains])[ + #align(center)[ #figure( - image("images/funnel/good_chains_traceplot.svg", height: 80%), caption: [A convergent Markov chains traceplot], + image("images/funnel/good_chains_traceplot.svg", height: 80%), + caption: [A convergent Markov chains traceplot], ) ] ] -#slide( - title: [Traceplot -- Divergent Markov Chains], -)[ - #align( - center, - )[ +#slide(title: [Traceplot -- Divergent Markov Chains])[ + #align(center)[ #figure( - image("images/funnel/bad_chains_traceplot.svg", height: 80%), caption: [A divergent Markov chains traceplot], + image("images/funnel/bad_chains_traceplot.svg", height: 80%), + caption: [A divergent Markov chains traceplot], ) ] ] @@ -1316,58 +1153,52 @@ See NUTS in action at #link( )[Stan's #text(fill: julia-red)[warnings] guide]. ] ])[ -#fit-to-height(1fr)[ -```shell -Warning messages: -1: There were 275 divergent transitions after warmup. See -http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup -to find out why this is a problem and how to eliminate them. -2: Examine the pairs() plot to diagnose sampling problems - -3: The largest R-hat is 1.12, indicating chains have not mixed. -Running the chains for more iterations may help. See -http://mc-stan.org/misc/warnings.html#r-hat -4: Bulk Effective Samples Size (ESS) is too low, indicating posterior -means and medians may be unreliable. -Running the chains for more iterations may help. See -http://mc-stan.org/misc/warnings.html#bulk-ess -5: Tail Effective Samples Size (ESS) is too low, indicating posterior -variances and tail quantiles may be unreliable. -Running the chains for more iterations may help. See -http://mc-stan.org/misc/warnings.html#tail-ess -``` -] -] - -#slide( - title: [ - Turing's Warning Messages - ], -)[ -#fit-to-height( - 1fr, -)[ -*Turing does not give warning messages!* -But you can check divergent transitions with `summarize(chn; -sections=[:internals])`: + #fit-to-height(1fr)[ + ```shell + Warning messages: + 1: There were 275 divergent transitions after warmup. See + http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup + to find out why this is a problem and how to eliminate them. + 2: Examine the pairs() plot to diagnose sampling problems + + 3: The largest R-hat is 1.12, indicating chains have not mixed. + Running the chains for more iterations may help. 
See + http://mc-stan.org/misc/warnings.html#r-hat + 4: Bulk Effective Samples Size (ESS) is too low, indicating posterior + means and medians may be unreliable. + Running the chains for more iterations may help. See + http://mc-stan.org/misc/warnings.html#bulk-ess + 5: Tail Effective Samples Size (ESS) is too low, indicating posterior + variances and tail quantiles may be unreliable. + Running the chains for more iterations may help. See + http://mc-stan.org/misc/warnings.html#tail-ess + ``` + ] +] -```shell -Summary Statistics - parameters mean std naive_se mcse ess rhat ess_per_sec - Symbol Float64 Float64 Float64 Float64 Float64 Float64 Float64 +#slide(title: [ + Turing's Warning Messages +])[ + #fit-to-height(1fr)[ + *Turing does not give warning messages!* + But you can check divergent transitions with `summarize(chn; +sections=[:internals])`: - lp -3.9649 1.7887 0.0200 0.1062 179.1235 1.0224 6.4133 - n_steps 9.1275 11.1065 0.1242 0.7899 38.3507 1.3012 1.3731 - acceptance_rate 0.5944 0.4219 0.0047 0.0322 40.5016 1.2173 1.4501 - tree_depth 2.2444 1.3428 0.0150 0.1049 32.8514 1.3544 1.1762 - numerical_error 0.1975 0.3981 0.0045 0.0273 59.8853 1.1117 2.1441 -``` -] + ```shell + Summary Statistics + parameters mean std naive_se mcse ess rhat ess_per_sec + Symbol Float64 Float64 Float64 Float64 Float64 Float64 Float64 + + lp -3.9649 1.7887 0.0200 0.1062 179.1235 1.0224 6.4133 + n_steps 9.1275 11.1065 0.1242 0.7899 38.3507 1.3012 1.3731 + acceptance_rate 0.5944 0.4219 0.0047 0.0322 40.5016 1.2173 1.4501 + tree_depth 2.2444 1.3428 0.0150 0.1049 32.8514 1.3544 1.1762 + numerical_error 0.1975 0.3981 0.0045 0.0273 59.8853 1.1117 2.1441 + ``` + ] ] -#slide( - title: [What To Do If the Markov Chains Do Not Converge?], -)[ +#slide(title: [What To Do If the Markov Chains Do Not Converge?])[ *First*: before making any fine adjustments in the number of Markov chains or the number of iterations per chain, etc. @@ -1383,21 +1214,18 @@ Summary Statistics 99% of the time. ] -#slide( - title: [What To Do If the Markov Chains Do Not Converge?], -)[ +#slide(title: [What To Do If the Markov Chains Do Not Converge?])[ #v(4em) #quote( - block: true, attribution: [#cite(, form: "prose")], + block: true, + attribution: [#cite(, form: "prose")], )[ When you have computational problems, often there’s a problem with your model. ] ] -#slide( - title: [What To Do If the Markov Chains Do Not Converge?], -)[ +#slide(title: [What To Do If the Markov Chains Do Not Converge?])[ If you experiencing convergence issues, *and you've discarded that something is wrong with you model*, here is a few steps to try #footnote[ @@ -1415,9 +1243,7 @@ Summary Statistics is 2,000 iterations and 4 chains). ] -#slide( - title: [What To Do If the Markov Chains Do Not Converge?], -)[ +#slide(title: [What To Do If the Markov Chains Do Not Converge?])[ 2. *Change the HMC's warmup adaptation routine*: make the HMC sampler to be more conservative in the proposals. This can be changed by increasing the hyperparameter *target acceptance rate of Metropolis proposals* @@ -1430,9 +1256,7 @@ Summary Statistics (CP) and non-centered parameterization (NCP). ] -#slide( - title: [What To Do If the Markov Chains Do Not Converge?], -)[ +#slide(title: [What To Do If the Markov Chains Do Not Converge?])[ 4. *Collect more data*: sometimes the model is too complex and we need a higher sample size for stable estimates. 
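In code, steps 1 and 2 usually amount to changing a single sampling call, and step 3 is the funnel reparameterization shown earlier in this section. Below are minimal Turing.jl sketches; the model name, iteration counts, and the 0.9 target acceptance are illustrative assumptions, not prescribed values.

```julia
using Turing

# Steps 1–2: assuming `model` is an instantiated Turing model that showed
# divergences or a high R-hat with the defaults, run more iterations, several
# chains, and a more conservative NUTS target acceptance rate (e.g. 0.9
# instead of the 0.65 default).
chn = sample(model, NUTS(1_000, 0.9), MCMCThreads(), 2_000, 4)
```

And step 3, written as the centered versus non-centered funnel from the earlier slides:

```julia
# Centered parameterization: the scale of x depends on y, producing the
# funnel geometry that trips up HMC/NUTS.
@model function funnel_cp()
    y ~ Normal(0, 3)
    x ~ Normal(0, exp(y / 2))
end

# Non-centered parameterization: sample standard normals z_y, z_x (the ỹ and x̃
# of the earlier slides) and transform deterministically, which gives the
# sampler a much friendlier geometry.
@model function funnel_ncp()
    z_y ~ Normal(0, 1)
    z_x ~ Normal(0, 1)
    y = 3 * z_y
    x = z_x * exp(y / 2)
    return (x=x, y=y)
end
```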
diff --git a/slides/13-model_comparison.typ b/slides/13-model_comparison.typ index 3fc278e..f222310 100644 --- a/slides/13-model_comparison.typ +++ b/slides/13-model_comparison.typ @@ -4,9 +4,7 @@ #new-section-slide("Model Comparison") -#slide( - title: "Recommended References", -)[ +#slide(title: "Recommended References")[ - #cite(, form: "prose") - Chapter 7: Evaluating, comparing, and expanding models - #cite(, form: "prose") - Chapter 11, Section 11.8: Cross @@ -26,18 +24,14 @@ #align(center)[#image("images/memes/model_comparison.jpg")] ] -#slide( - title: [Why Compare Models?], -)[ +#slide(title: [Why Compare Models?])[ #v(3em) After model parameters estimation, many times we want to measure its *predictive accuracy* by itself, or for *model comparison*, *model selection*, or computing a *model performance metric* @geisser1979predictive. ] -#slide( - title: [But What About Visual Posterior Predictive Checks?], -)[ +#slide(title: [But What About Visual Posterior Predictive Checks?])[ To analyze and compare models using visual posterior predictive checks is a *subjective and arbitrary approach*. @@ -54,9 +48,7 @@ @gelmanBayesianWorkflow2020. ] -#slide( - title: [Model Comparison Techniques], -)[ +#slide(title: [Model Comparison Techniques])[ We have several model comparison techniques that use *predictive accuracy*, but the main ones are: - Leave-one-out cross-validation (LOO) @vehtariPracticalBayesianModel2015. @@ -70,12 +62,8 @@ and it is asymptotically equal to LOO @vehtariPracticalBayesianModel2015. ] -#slide( - title: [Historical Interlude], -)[ - #text( - size: 16pt, - )[ +#slide(title: [Historical Interlude])[ + #text(size: 16pt)[ In the past, we did not have computational power and data abundance. Model comparison was done based on a theoretical divergence metric originated from information theory's entropy: @@ -88,7 +76,9 @@ so lower values are preferable: $ - D(y, bold(θ)) = -2 dot underbrace( + D( + y, bold(θ) + ) = -2 dot underbrace( sum^N_(i = 1) log 1 / S sum^S_(s = 1) P(y_i | bold(θ)^s), "log pointwise predictive density - lppd", ) @@ -98,9 +88,7 @@ ] ] -#slide( - title: [Historical Interlude -- AIC @akaike1998information], -)[ +#slide(title: [Historical Interlude -- AIC @akaike1998information])[ $ "AIC" = D(y, bold(θ)) + 2k = -2 "lppd"_("mle") + 2k $ @@ -122,9 +110,7 @@ parameters $k$: $N >> k$ ] -#slide( - title: [Historical Interlude -- DIC @spiegelhalter2002bayesian], -)[ +#slide(title: [Historical Interlude -- DIC @spiegelhalter2002bayesian])[ A generalization of the AIC, where we replace the maximum likelihood estimate for the posterior mean and $k$ by a data-based bias correction: @@ -141,9 +127,7 @@ and that $N >> k$. ] -#slide( - title: [Predictive Accuracy], -)[ +#slide(title: [Predictive Accuracy])[ With current computational power, we do not need approximations #footnote[AIC, DIC etc.]. #v(2em) @@ -155,9 +139,7 @@ But, first, let's define what is predictive accuracy. ] -#slide( - title: [Predictive Accuracy], -)[ +#slide(title: [Predictive Accuracy])[ Bayesian approaches measure predictive accuracy using posterior draws $tilde(y)$ from the model. For that we have the predictive posterior distribution: @@ -176,14 +158,14 @@ $p(tilde(y) | y)$, the *better* will be the model's predictive accuracy. 
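Given posterior draws, the log pointwise predictive density behind these definitions is straightforward to estimate. Below is a minimal plain-Julia sketch; the S × N matrix `log_lik` of pointwise log-likelihood evaluations is an assumed input, and the variance-based penalty included here anticipates the WAIC discussed next.

```julia
using Statistics

# log_lik: S × N matrix where entry (s, i) is log p(y_i | θ^s) for posterior
# draw s and observation i.
function lppd_waic(log_lik)
    # log of the Monte Carlo average of the pointwise likelihoods
    # (a log-sum-exp would be used in practice for numerical stability)
    lppd = sum(log.(vec(mean(exp.(log_lik); dims=1))))
    # penalty: posterior variance of the pointwise log-likelihoods
    p_waic = sum(vec(var(log_lik; dims=1)))
    return (lppd=lppd, p_waic=p_waic, waic=-2 * (lppd - p_waic))
end
```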
] -#slide( - title: [Predictive Accuracy], -)[ +#slide(title: [Predictive Accuracy])[ To make samples comparable, we calculate the expectation of this measure for each one of the $N$ sample observations: $ - op("elpd") = sum_(i=1)^N ∫ p_t(tilde(y)_i) log p(tilde(y)_i | y) dif tilde(y) + op("elpd") = sum_(i=1)^N ∫ p_t(tilde(y)_i) log p( + tilde(y)_i | y + ) dif tilde(y) $ where $op("elpd")$ is the expected log pointwise predictive density, and $p_t(tilde(y)_i)$ is @@ -194,12 +176,8 @@ approximations to estimate $op("elpd")$. ] -#slide( - title: [Leave-One-Out Cross-Validation (LOO)], -)[ - #text( - size: 18pt, - )[ +#slide(title: [Leave-One-Out Cross-Validation (LOO)])[ + #text(size: 18pt)[ We can compute the $op("elpd")$ using LOO @vehtariPracticalBayesianModel2015: $ @@ -218,12 +196,8 @@ ] ] -#slide( - title: [Widely Applicable Information Criteria (WAIC)], -)[ - #text( - size: 12pt, - )[ +#slide(title: [Widely Applicable Information Criteria (WAIC)])[ + #text(size: 12pt)[ WAIC @watanabe2010asymptotic, like LOO, is also an alternative approach to compute the $op("elpd")$, @@ -254,9 +228,7 @@ ] ] -#slide( - title: [$K$-fold Cross-Validation ($K$-fold CV)], -)[ +#slide(title: [$K$-fold Cross-Validation ($K$-fold CV)])[ In the same manner that we can compute the $op("elpd")$ using LOO with $N-1$ sample partitions, we can also compute it with any desired partition number. @@ -272,9 +244,7 @@ which almost involves a *high computational cost*. ] -#slide( - title: [Pareto Smoothed Importance Sampling LOO (PSIS-LOO)], -)[ +#slide(title: [Pareto Smoothed Importance Sampling LOO (PSIS-LOO)])[ PSIS uses *importance sampling* #footnote[another class of MCMC algorithm that we did not cover yet.], which means a importance weighting scheme approach. @@ -284,9 +254,7 @@ reliability. ] -#slide( - title: [Importance Sampling], -)[ +#slide(title: [Importance Sampling])[ If the $N$ samples are conditionally independent #footnote[ that is, they are independent if conditioned on the model's parameters, which is a basic assumption in any Bayesian (and frequentist) model @@ -307,13 +275,13 @@ $ ] -#slide( - title: [Importance Sampling], -)[ +#slide(title: [Importance Sampling])[ However, the posterior $P(θ | y$ often has low variance and shorter tails than the LOO distributions $P(θ | y_(-1))$. Hence, if we use: $ - P(tilde(y)_i | y_(-i)) ≈ (sum_(s=1)^S r_i^s P(tilde(y)_i|θ^s)) / (sum_(s=1)^S r_i^s) + P(tilde(y)_i | y_(-i)) ≈ (sum_(s=1)^S r_i^s P( + tilde(y)_i|θ^s + )) / (sum_(s=1)^S r_i^s) $ #v(1em) @@ -322,9 +290,7 @@ variance*. ] -#slide( - title: [Pareto Smoothed Importance Sampling], -)[ +#slide(title: [Pareto Smoothed Importance Sampling])[ We can enhance the IS-LOO estimate using a *Pareto Smoothed Importance Sampling* @vehtariPracticalBayesianModel2015. @@ -350,12 +316,8 @@ where $w$ is the truncated weights. ] -#slide( - title: [Pareto Smoothed Importance Sampling LOO (PSIS-LOO)], -)[ - #text( - size: 17pt, - )[ +#slide(title: [Pareto Smoothed Importance Sampling LOO (PSIS-LOO)])[ + #text(size: 17pt)[ We use the importance weights Pareto distribution's estimated shape parameter $hat(k)$ to assess its reliability: diff --git a/slides/backup_slides.typ b/slides/backup_slides.typ index d33caae..5229188 100644 --- a/slides/backup_slides.typ +++ b/slides/backup_slides.typ @@ -4,21 +4,17 @@ #new-section-slide("Backup Slides") -#slide( - title: [How the Normal distribution arose - #footnote[Origins can be traced back to Abraham de Moivre in 1738. 
A better explanation - can be found by - #link( +#slide(title: [How the Normal distribution arose + #footnote[Origins can be traced back to Abraham de Moivre in 1738. A better explanation + can be found by + #link( "http://www.stat.yale.edu/~pollard/Courses/241.fall2014/notes2014/Bin.Normal.pdf", - )[clicking here].]], -)[ - #text( - size: 16pt, - )[ + )[clicking here].]])[ + #text(size: 16pt)[ $ - "Binomial"(n, k) &= binom(n, k) p^k (1-p)^(n-k) \ - n! &≈ sqrt(2 π n) (n / e)^n \ + "Binomial"(n, k) &= binom(n, k) p^k (1-p)^(n-k) \ + n! &≈ sqrt(2 π n) (n / e)^n \ lim_(n → oo) binom(n, k) p^k (1-p)^(n-k) &= 1 / (sqrt(2 π n p q)) e^(-((k - n p)^2) / (2 n p q)) $ @@ -27,17 +23,15 @@ replacing $op("E")$ by $μ$ and $op("Var")$ by $σ^2$: $ - lim_(n → oo) binom(n, k) p^k (1-p)^(n-k) = 1 / (σ sqrt(2 π)) e^(-((k - μ)^2) / (σ^2)) + lim_(n → oo) binom(n, k) p^k (1-p)^(n-k) = 1 / (σ sqrt(2 π)) e^(-(( + k - μ + )^2) / (σ^2)) $ ] ] -#slide( - title: [QR Decomposition], -)[ - #text( - size: 13pt, - )[ +#slide(title: [QR Decomposition])[ + #text(size: 13pt)[ In Linear Algebra 101, we learn that any matrix (even non-square ones) can be decomposed into a product of two matrices: @@ -52,13 +46,13 @@ numerical stable. Mathematically speaking, the thing QR decomposition is: $ - bold(X) &= bold(Q)^* bold(R)^* \ + bold(X) &= bold(Q)^* bold(R)^* \ bold(Q)^* &= bold(Q) dot sqrt(n - 1) \ bold(R)^* &= 1 / (sqrt(n - 1)) dot bold(R) \ - bold(μ) &= α + bold(X) dot bold(β) + σ \ - &= α + bold(Q)^* dot bold(R)^* dot bold(β) + σ \ - &= α + bold(Q)^* dot (bold(R)^* dot bold(β)) + σ \ - &= α + bold(Q)^* dot tilde(bold(β)) + σ + bold(μ) &= α + bold(X) dot bold(β) + σ \ + &= α + bold(Q)^* dot bold(R)^* dot bold(β) + σ \ + &= α + bold(Q)^* dot (bold(R)^* dot bold(β)) + σ \ + &= α + bold(Q)^* dot tilde(bold(β)) + σ $ ] ] diff --git a/slides/slides.typ b/slides/slides.typ index e033a53..495d251 100644 --- a/slides/slides.typ +++ b/slides/slides.typ @@ -3,7 +3,10 @@ #import "utils.typ": * #show: clean-theme.with( - footer: [Bayesian Statistics, Jose Storopoli], short-title: [Bayesian Statistics], logo: image("images/logos/juliadatascience_white.png"), color: julia-purple, + footer: [Bayesian Statistics, Jose Storopoli], + short-title: [Bayesian Statistics], + logo: image("images/logos/juliadatascience_white.png"), + color: julia-purple, ) // customizations @@ -26,19 +29,13 @@ #align(center)[#image("images/memes/main.jpg")] ] -#slide( - title: "License", -)[ +#slide(title: "License")[ #v(2em) - #align( - center, - )[ + #align(center)[ The text and images from these slides have a - #link( - "https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en", - )[ + #link("https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en")[ Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0) ] diff --git a/slides/utils.typ b/slides/utils.typ index 9cb0f7e..3dafa8c 100644 --- a/slides/utils.typ +++ b/slides/utils.typ @@ -5,33 +5,81 @@ #let julia-red = rgb("#cb3c33") // functions -#let binormal(x, y, μ_a, σ_a, μ_b, σ_b, ρ) = calc.exp( - -( - calc.pow((x - μ_a) / σ_a, 2) + calc.pow((y - μ_b) / σ_b, 2) - (2 * ρ) * ((x - μ_a) / σ_a) * ((y - μ_b) / σ_b) - ) / (2 * (1 - calc.pow(ρ, 2))), -) / (2 * calc.pi * σ_a * σ_b * calc.pow(1 - calc.pow(ρ, 2)), 0.5) -#let conditionalbinormal(x, y_c, μ_a, σ_a, μ_b, σ_b, ρ) = calc.exp( - -( - calc.pow((x - μ_a) / σ_a, 2) + calc.pow((y_c - μ_b) / σ_b, 2) - (2 * ρ) * ((x - μ_a) / σ_a) * ((y_c - μ_b) / σ_b) - ) / (2 * (1 - calc.pow(ρ, 2))), -) / (2 * calc.pi * σ_a * σ_b * calc.pow(1 - calc.pow(ρ, 2)), 
0.5) -#let sumtwonormals(x_a, x_b, μ_a, σ_a, w_a, μ_b, σ_b, w_b) = (w_a * gaussian(x_a, μ_a, σ_a)) + (w_b * gaussian(x_b, μ_b, σ_b)) +#let binormal(x, y, μ_a, σ_a, μ_b, σ_b, ρ) = ( + calc.exp(-( + calc.pow((x - μ_a) / σ_a, 2) + calc.pow((y - μ_b) / σ_b, 2) - (2 * ρ) * ( + (x - μ_a) / σ_a + ) * ((y - μ_b) / σ_b) + ) / (2 * (1 - calc.pow(ρ, 2)))) / ( + 2 * calc.pi * σ_a * σ_b * calc.pow(1 - calc.pow(ρ, 2)), + 0.5, + ) +) +#let conditionalbinormal(x, y_c, μ_a, σ_a, μ_b, σ_b, ρ) = ( + calc.exp(-( + calc.pow((x - μ_a) / σ_a, 2) + calc.pow((y_c - μ_b) / σ_b, 2) - (2 * ρ) * ( + (x - μ_a) / σ_a + ) * ((y_c - μ_b) / σ_b) + ) / (2 * (1 - calc.pow(ρ, 2)))) / ( + 2 * calc.pi * σ_a * σ_b * calc.pow(1 - calc.pow(ρ, 2)), + 0.5, + ) +) +#let sumtwonormals(x_a, x_b, μ_a, σ_a, w_a, μ_b, σ_b, w_b) = ( + (w_a * gaussian(x_a, μ_a, σ_a)) + (w_b * gaussian(x_b, μ_b, σ_b)) +) #let discreteuniform(a, b) = 1 / (b - a + 1) -#let binomial(x, n, p) = calc.fact(n) / (calc.fact(x) * calc.fact(n - x)) * calc.pow(p, x) * calc.pow(1 - p, n - x) +#let binomial(x, n, p) = ( + calc.fact(n) / (calc.fact(x) * calc.fact(n - x)) * calc.pow(p, x) * calc.pow( + 1 - p, + n - x, + ) +) #let poisson(x, λ) = calc.pow(λ, x) * calc.exp(-λ) / calc.fact(x) -#let negativebinomial(x, r, p) = calc.fact(x + r - 1) / (calc.fact(r - 1) * calc.fact(x)) * (calc.pow(1 - p, x) * calc.pow(p, r)) +#let negativebinomial(x, r, p) = ( + calc.fact(x + r - 1) / (calc.fact(r - 1) * calc.fact(x)) * ( + calc.pow(1 - p, x) * calc.pow(p, r) + ) +) #let continuousuniform(a, b) = 1 / (b - a) -#let gaussian(x, μ, σ) = 1 / (σ * calc.sqrt(2 * calc.pi)) * calc.exp(-(calc.pow(x - μ, 2)) / (2 * calc.pow(σ, 2))) -#let lognormal(x, μ, σ) = 1 / (x * σ * calc.sqrt(2 * calc.pi)) * calc.exp(-(calc.pow(calc.ln(x) - μ, 2)) / (2 * calc.pow(σ, 2))) +#let gaussian(x, μ, σ) = ( + 1 / (σ * calc.sqrt(2 * calc.pi)) * calc.exp(-(calc.pow(x - μ, 2)) / ( + 2 * calc.pow(σ, 2) + )) +) +#let lognormal(x, μ, σ) = ( + 1 / (x * σ * calc.sqrt(2 * calc.pi)) * calc.exp(-( + calc.pow(calc.ln(x) - μ, 2) + ) / (2 * calc.pow(σ, 2))) +) #let exponential(x, λ) = λ * calc.exp(-λ * x) -#let gamma(z) = (2.506628274631 * calc.sqrt(1 / z)) + (0.20888568 * calc.pow(1 / z, 1.5)) + (0.00870357 * calc.pow(1 / z, 2.5)) - ((174.2106599 * calc.pow(1 / z, 3.5)) / 25920) - ((715.6423511 * calc.pow(1 / z, 4.5)) / 1244160) * (calc.exp(-calc.ln(1 / z) - 1) * z) -#let gammadist(x, α, θ) = calc.pow(x, α - 1) * calc.exp(-x / θ) * gamma(α) * calc.pow(θ, α) -#let student(x, ν) = gamma((ν + 1) / 2) / (calc.sqrt(ν * calc.pi) * gamma(ν / 2)) * calc.pow(1 + (calc.pow(x, 2) / ν), (-(ν + 1) / 2)) +#let gamma(z) = ( + (2.506628274631 * calc.sqrt(1 / z)) + (0.20888568 * calc.pow(1 / z, 1.5)) + ( + 0.00870357 * calc.pow(1 / z, 2.5) + ) - ((174.2106599 * calc.pow(1 / z, 3.5)) / 25920) - ( + (715.6423511 * calc.pow(1 / z, 4.5)) / 1244160 + ) * (calc.exp(-calc.ln(1 / z) - 1) * z) +) +#let gammadist(x, α, θ) = ( + calc.pow(x, α - 1) * calc.exp(-x / θ) * gamma(α) * calc.pow(θ, α) +) +#let student(x, ν) = ( + gamma((ν + 1) / 2) / (calc.sqrt(ν * calc.pi) * gamma(ν / 2)) * calc.pow( + 1 + (calc.pow(x, 2) / ν), + (-(ν + 1) / 2), + ) +) #let cauchy(x, μ, σ) = 1 / (calc.pi * σ * (1 + (calc.pow((x - μ) / σ, 2)))) -#let beta(x, α, β) = calc.pow(x, α - 1) * calc.pow(1 - x, β - 1) / ((gamma(α) * gamma(β)) / gamma(α + β)) +#let beta(x, α, β) = ( + calc.pow(x, α - 1) * calc.pow(1 - x, β - 1) / ( + (gamma(α) * gamma(β)) / gamma(α + β) + ) +) #let logistic(x) = 1 / (1 + calc.exp(-x)) -#let normcdf(x, μ, σ) = 1 / ( - 1 + calc.exp(-0.07056 * 
calc.pow(((x - μ) / σ), 3) - 1.5976 * (x - μ) / σ) +#let normcdf(x, μ, σ) = ( + 1 / ( + 1 + calc.exp(-0.07056 * calc.pow(((x - μ) / σ), 3) - 1.5976 * (x - μ) / σ) + ) ) #let logodds(p) = calc.ln(p / (1 - p)) #let laplace(x, b) = (1 / (2 * b)) * calc.exp(-calc.abs(x) / b) diff --git a/treefmt.nix b/treefmt.nix index 49a8893..8d8e221 100644 --- a/treefmt.nix +++ b/treefmt.nix @@ -7,4 +7,13 @@ # Enable formatters programs.nixpkgs-fmt.enable = true; programs.prettier.enable = true; + + # Custom typstyle + settings.formatter.typstyle = { + command = "${pkgs.typstyle}/bin/typstyle"; + options = [ + "--check" + ]; + includes = [ "*.typ" "*.typst" ]; + }; } diff --git a/typstfmt-config.toml b/typstfmt-config.toml deleted file mode 100644 index e001094..0000000 --- a/typstfmt-config.toml +++ /dev/null @@ -1,4 +0,0 @@ -indent_space = 2 -max_line_length = 100 -experimental_args_breaking_consecutive = false -line_wrap = true