This vignette introduces the Normal-Normal model. It is designed to serve as a benchmark.
Consider Normal samples \(X_{ij}\) from \(m\) different sources, where each source \(i\) is sampled \(n\) times.
The Normal parameters of each source are themselves assumed to be sampled from Normal distributions.
We assume that \(\mu_0^{(\mu)}, \sigma_0^{(\mu)}, \mu_0^{(\sigma)}, \sigma_0^{(\sigma)}\) are known.
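Written out, one plausible reading of this hierarchy is the following (the exact parametrisation, e.g. how positivity of \(\sigma_i\) is enforced, follows the package's Stan code):
\[
X_{ij} \mid \mu_i, \sigma_i \sim \mathrm{N}(\mu_i, \sigma_i), \qquad
\mu_i \sim \mathrm{N}\left(\mu_0^{(\mu)}, \sigma_0^{(\mu)}\right), \qquad
\sigma_i \sim \mathrm{N}\left(\mu_0^{(\sigma)}, \sigma_0^{(\sigma)}\right),
\]
for \(i = 1, \dots, m\) and \(j = 1, \dots, n\), with \(\sigma_i\) denoting the standard deviation.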
Sampling from the Normal-Normal model is implemented in the rsamplestudy package.
Let’s generate some data:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(ggplot2)
library(rsamplestudy)
n <- 200
m <- 5
list_pop <- rsamplestudy::fun_rnorm_population(n, m)
list_pop$df_pop %>%
ggplot() +
geom_point(aes(x = x, y = source, col = factor(source)), position = 'jitter')
list_pop$df_sources
#> # A tibble: 5 x 3
#> source mu sigma
#> <int> <dbl> <dbl>
#> 1 1 1.13 0.209
#> 2 2 1.15 0.547
#> 3 3 3.18 0.426
#> 4 4 2.15 0.246
#> 5 5 2.47 0.561
list_pop$list_hyper
#> $m_mu
#> [1] 2.287247
#>
#> $m_sigma
#> [1] 1.196772
#>
#> $s_mu
#> [1] -0.6942925
#>
#> $s_sigma
#> [1] 0.412293
Generate two sets of samples, under two different hypotheses:
k_ref <- 10
k_quest <- 10
source_ref <- 1
source_same <- source_ref
source_diff <- 3 # the questioned items come from a single, different source
list_samples_same <- rsamplestudy::make_dataset_splits(list_pop$df_pop, k_ref, k_quest, source_ref = source_ref, source_quest = source_same)
list_samples_diff <- rsamplestudy::make_dataset_splits(list_pop$df_pop, k_ref, k_quest, source_ref = source_ref, source_quest = source_diff)
Organize the data, and elicit the hyperpriors from the background data (assuming vague hyperpriors):
list_data_same <- rstanBF::stanBF_prepare_rsamplestudy_data(list_pop, list_samples_same)
list_data_diff <- rstanBF::stanBF_prepare_rsamplestudy_data(list_pop, list_samples_diff)
list_hyper <- rstanBF::stanBF_elicit_hyperpriors(list_samples_same$df_background, 'NormNorm', 'vague')
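The elicited hyperparameters can be inspected before sampling, e.g. with base R:
# Inspect the elicited hyperprior values (base R, no extra packages needed)
str(list_hyper)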
Perform the NUTS sampling:
n.iter <- 5000
n.burnin <- 1000
n.cores <- 3
n.chains <- n.cores
obj_StanBF_same <- rstanBF::compute_BF_Stan(list_data_same, 'NormNorm', list_hyper, n.iter = n.iter, n.burnin = n.burnin, n.cores = n.cores, n.chains = n.chains, silent = TRUE)
obj_StanBF_diff <- rstanBF::compute_BF_Stan(list_data_diff, 'NormNorm', list_hyper, n.iter = n.iter, n.burnin = n.burnin, n.cores = n.cores, n.chains = n.chains, silent = TRUE)
obj_StanBF_same
#> stanBF object containing posterior samples from H1, H2.
#> Model: Normal-Normal
#> Obtained BF: 1697753
#> Ran with 3 chains, 5000 HMC iterations.
obj_StanBF_diff
#> stanBF object containing posterior samples from H1, H2.
#> Model: Normal-Normal
#> Obtained BF: 1.301013e-05
#> Ran with 3 chains, 5000 HMC iterations.
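A BF above 1 supports \(H_1\) (the same-source hypothesis), while a BF below 1 supports \(H_2\) (different sources), consistently with the results above. For easier reading, the printed values can be put on the \(\log_{10}\) scale:
# log10 Bayes factors from the printed values above
log10(1697753)      # about 6.2: strong support for H1 (same source)
log10(1.301013e-05) # about -4.9: strong support for H2 (different sources)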
More details on the Stan outputs:
obj_StanBF_same$stanfit
#> $H1
#> Inference for Stan model: stan_NormNorm_H1.
#> 3 chains, each with iter=5000; warmup=1000; thin=1;
#> post-warmup draws per chain=4000, total post-warmup draws=12000.
#>
#> mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff
#> mu_ref 1.09 0.00 0.04 1.00 1.06 1.09 1.12 1.18 7517
#> sigma_ref 0.19 0.00 0.03 0.14 0.17 0.19 0.21 0.27 6888
#> sim_mu_ref 2.47 0.93 100.43 -194.84 -64.44 2.77 69.74 198.64 11659
#> sim_sigma_ref 79.93 0.55 60.26 3.48 31.20 68.20 115.00 223.64 12182
#> sim_d_ref 1.04 1.30 142.27 -284.03 -85.48 1.73 88.28 280.44 11962
#> pred_d_ref 1.09 0.00 0.20 0.69 0.96 1.09 1.22 1.49 11329
#> lp__ -7.95 0.02 1.05 -10.80 -8.36 -7.62 -7.19 -6.91 4637
#> Rhat
#> mu_ref 1
#> sigma_ref 1
#> sim_mu_ref 1
#> sim_sigma_ref 1
#> sim_d_ref 1
#> pred_d_ref 1
#> lp__ 1
#>
#> Samples were drawn using NUTS(diag_e) at Tue Mar 9 09:14:30 2021.
#> For each parameter, n_eff is a crude measure of effective sample size,
#> and Rhat is the potential scale reduction factor on split chains (at
#> convergence, Rhat=1).
#>
#> $H2
#> Inference for Stan model: stan_NormNorm_H2.
#> 3 chains, each with iter=5000; warmup=1000; thin=1;
#> post-warmup draws per chain=4000, total post-warmup draws=12000.
#>
#> mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff
#> mu_ref 1.11 0.00 0.06 1.00 1.07 1.11 1.14 1.22 9360
#> sigma_ref 0.17 0.00 0.05 0.10 0.14 0.16 0.19 0.29 7665
#> mu_quest 1.07 0.00 0.08 0.91 1.02 1.07 1.13 1.24 7677
#> sigma_quest 0.25 0.00 0.07 0.15 0.20 0.24 0.28 0.43 7074
#> sim_mu_ref -1.24 0.96 100.48 -197.19 -68.85 -1.96 66.14 196.53 10924
#> sim_sigma_ref 79.45 0.55 59.72 3.43 31.57 67.96 114.67 222.73 11592
#> sim_mu_quest 0.20 0.93 99.99 -192.64 -67.50 -0.70 67.63 197.41 11549
#> sim_sigma_quest 80.28 0.55 60.93 2.73 32.21 67.75 115.46 226.50 12178
#> sim_d_ref -0.41 1.36 142.25 -280.41 -87.49 -1.17 87.24 285.12 10973
#> sim_d_quest 1.46 1.32 143.05 -286.81 -86.95 0.42 88.71 286.26 11805
#> pred_d_ref 1.11 0.00 0.19 0.73 0.99 1.11 1.22 1.48 10935
#> pred_d_quest 1.08 0.00 0.27 0.53 0.91 1.08 1.25 1.62 11669
#> lp__ -21.84 0.02 1.54 -25.76 -22.61 -21.50 -20.70 -19.87 4920
#> Rhat
#> mu_ref 1
#> sigma_ref 1
#> mu_quest 1
#> sigma_quest 1
#> sim_mu_ref 1
#> sim_sigma_ref 1
#> sim_mu_quest 1
#> sim_sigma_quest 1
#> sim_d_ref 1
#> sim_d_quest 1
#> pred_d_ref 1
#> pred_d_quest 1
#> lp__ 1
#>
#> Samples were drawn using NUTS(diag_e) at Tue Mar 9 09:14:32 2021.
#> For each parameter, n_eff is a crude measure of effective sample size,
#> and Rhat is the potential scale reduction factor on split chains (at
#> convergence, Rhat=1).
obj_StanBF_diff$stanfit
#> $H1
#> Inference for Stan model: stan_NormNorm_H1.
#> 3 chains, each with iter=5000; warmup=1000; thin=1;
#> post-warmup draws per chain=4000, total post-warmup draws=12000.
#>
#> mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff
#> mu_ref 2.19 0.00 0.25 1.70 2.03 2.19 2.36 2.70 7447
#> sigma_ref 1.12 0.00 0.20 0.81 0.98 1.09 1.23 1.58 7281
#> sim_mu_ref -0.41 0.93 100.57 -196.93 -68.39 -0.69 67.03 197.12 11648
#> sim_sigma_ref 80.93 0.56 60.62 3.18 32.83 69.40 116.19 225.55 11684
#> sim_d_ref -0.69 1.32 141.43 -282.72 -87.36 0.08 86.50 272.43 11558
#> pred_d_ref 2.20 0.01 1.17 -0.13 1.44 2.21 2.95 4.53 10868
#> lp__ -41.51 0.01 1.02 -44.20 -41.92 -41.20 -40.78 -40.50 5088
#> Rhat
#> mu_ref 1
#> sigma_ref 1
#> sim_mu_ref 1
#> sim_sigma_ref 1
#> sim_d_ref 1
#> pred_d_ref 1
#> lp__ 1
#>
#> Samples were drawn using NUTS(diag_e) at Tue Mar 9 09:14:39 2021.
#> For each parameter, n_eff is a crude measure of effective sample size,
#> and Rhat is the potential scale reduction factor on split chains (at
#> convergence, Rhat=1).
#>
#> $H2
#> Inference for Stan model: stan_NormNorm_H2.
#> 3 chains, each with iter=5000; warmup=1000; thin=1;
#> post-warmup draws per chain=4000, total post-warmup draws=12000.
#>
#> mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff
#> mu_ref 1.21 0.00 0.08 1.06 1.16 1.21 1.25 1.36 7315
#> sigma_ref 0.23 0.00 0.07 0.14 0.18 0.22 0.26 0.39 6579
#> mu_quest 3.18 0.00 0.13 2.91 3.09 3.17 3.26 3.45 8651
#> sigma_quest 0.41 0.00 0.12 0.25 0.33 0.39 0.46 0.69 7623
#> sim_mu_ref 0.32 0.91 100.59 -196.72 -67.28 0.02 67.83 198.17 12133
#> sim_sigma_ref 80.40 0.55 60.40 3.35 31.81 68.68 116.72 223.96 11976
#> sim_mu_quest 0.63 0.93 100.64 -195.99 -67.32 0.58 68.27 201.36 11780
#> sim_sigma_quest 80.44 0.56 60.93 3.11 31.79 68.24 115.19 225.33 11887
#> sim_d_ref -0.23 1.29 141.78 -281.99 -85.88 -0.99 85.11 285.31 12137
#> sim_d_quest 1.54 1.29 143.20 -277.84 -88.42 0.58 87.72 289.70 12331
#> pred_d_ref 1.21 0.00 0.25 0.72 1.05 1.21 1.36 1.71 12119
#> pred_d_quest 3.18 0.00 0.45 2.29 2.90 3.18 3.45 4.07 12004
#> lp__ -28.82 0.03 1.58 -32.87 -29.57 -28.47 -27.66 -26.88 3785
#> Rhat
#> mu_ref 1
#> sigma_ref 1
#> mu_quest 1
#> sigma_quest 1
#> sim_mu_ref 1
#> sim_sigma_ref 1
#> sim_mu_quest 1
#> sim_sigma_quest 1
#> sim_d_ref 1
#> sim_d_quest 1
#> pred_d_ref 1
#> pred_d_quest 1
#> lp__ 1
#>
#> Samples were drawn using NUTS(diag_e) at Tue Mar 9 09:14:41 2021.
#> For each parameter, n_eff is a crude measure of effective sample size,
#> and Rhat is the potential scale reduction factor on split chains (at
#> convergence, Rhat=1).
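The printed summaries already report Rhat and effective sample sizes; as a quick programmatic check, one can extract the standard stanfit summary matrix (a sketch; the Rhat and n_eff columns are part of the default rstan summary):
# Extract the summary matrix of one fit and check the worst-case diagnostics
summ_H1 <- rstan::summary(obj_StanBF_same$stanfit$H1)$summary
max(summ_H1[, 'Rhat'])  # should be close to 1
min(summ_H1[, 'n_eff']) # smallest effective sample size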
Using general post-processing utilities from the bayesplot package:
suppressPackageStartupMessages(library(rstan))
library(bayesplot)
#> This is bayesplot version 1.8.0
#> - Online documentation and vignettes at mc-stan.org/bayesplot
#> - bayesplot theme set to bayesplot::theme_default()
#> * Does _not_ affect other ggplot2 plots
#> * See ?bayesplot_theme_set for details on theme setting
results_same_H1 <- obj_StanBF_same$stanfit$H1 %>% As.mcmc.list()
results_same_H2 <- obj_StanBF_same$stanfit$H2 %>% As.mcmc.list()
First, we recover the NUTS draws and the true parameter values; then we use bayesplot tools to plot them.
Notice that the order of the variables in the mcmc_recover_intervals arguments must match.
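One way to double-check the order is to list the variable names of the extracted draws (a small sketch using coda::varnames(); the temporary object check_draws is only for illustration):
# List the variable names of the extracted draws, in the order used for true_sources
check_draws <- obj_StanBF_same$stanfit$H2 %>% As.mcmc.list(pars = c('mu_ref', 'sigma_ref', 'mu_quest', 'sigma_quest'))
coda::varnames(check_draws)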
Questioned data sampled under \(H_1\):
# H1
draws <- obj_StanBF_same$stanfit$H1 %>% As.mcmc.list(pars = c('mu_ref', 'sigma_ref'))
df_sources_ref <- list_pop$df_sources %>% filter(source == source_same) %>% select(mu, sigma)
true_sources <- as.numeric(df_sources_ref)
bayesplot::mcmc_recover_intervals(draws, true_sources)
# H2, reference and questioned
draws <- obj_StanBF_same$stanfit$H2 %>% As.mcmc.list(pars = c('mu_ref', 'sigma_ref', 'mu_quest', 'sigma_quest'))
df_sources <- list_pop$df_sources %>% filter(source == source_same) %>% select(mu, sigma)
true_sources <- rep(as.numeric(df_sources), 2)
bayesplot::mcmc_recover_intervals(draws, true_sources)
Questioned data sampled under \(H_2\):
# H1: assuming that sources are the same
# This is not true, so the posteriors do not fit well
draws <- obj_StanBF_diff$stanfit$H1 %>% As.mcmc.list(pars = c('mu_ref', 'sigma_ref'))
df_sources_ref <- list_pop$df_sources %>% filter(source == source_same) %>% select(mu, sigma)
true_sources <- as.numeric(df_sources_ref)
bayesplot::mcmc_recover_intervals(draws, true_sources)
# H2, reference and questioned from different sources
# This is true, so the posteriors fit well
draws <- obj_StanBF_diff$stanfit$H2 %>% As.mcmc.list(pars = c('mu_ref', 'sigma_ref', 'mu_quest', 'sigma_quest'))
df_sources_ref <- list_pop$df_sources %>% filter(source == source_same) %>% select(mu, sigma)
df_sources_quest <- list_pop$df_sources %>% filter(source == source_diff) %>% select(mu, sigma)
true_sources <- c(as.numeric(df_sources_ref), as.numeric(df_sources_quest))
bayesplot::mcmc_recover_intervals(draws, true_sources)
To simplify, consider only the case where the data are sampled from the same source.
Here, we plot the prior predictive distribution.
It should generate realistic data, though not necessarily data close to the observations: it represents our background knowledge about the data.
Since we specified a very vague prior, the generated values are quite extreme compared to the measured ones.
# H1: prior predictive
draws_prior <- obj_StanBF_same$stanfit$H1 %>% as.matrix(pars = c('sim_d_ref'))
data_ref <- list_pop$df_pop %>% filter(source == source_ref) %>% pull('x')
# extract a subset of draws
draws_prior_sub <- t(as.matrix(draws_prior[1:length(data_ref)]))
bayesplot::ppc_dens_overlay(data_ref, draws_prior_sub) +
scale_x_continuous(limits = c(-10, 10))
#> Warning: Removed 196 rows containing non-finite values (stat_density).
The posterior predictive distribution, instead, should be a good approximation of the observed data.
If it is not, the model is not capable of predicting from the correct distribution: in other words, the fitted model is not adapted to the observed data.
draws_posterior <- obj_StanBF_same$stanfit$H1 %>% as.matrix(pars = c('pred_d_ref'))
draws_posterior_sub <- t(as.matrix(draws_posterior[length(draws_posterior) - (1:length(data_ref)) ]))
bayesplot::ppc_dens_overlay(data_ref, draws_posterior_sub)
To check that this is working, let's consider the situation where the sources are different, and examine the reference and questioned items separately.
data_quest <- list_pop$df_pop %>% filter(source == source_diff) %>% pull('x')
# Reference data
draws_posterior <- obj_StanBF_diff$stanfit$H2 %>% as.matrix(pars = c('pred_d_ref'))
draws_posterior_sub <- t(as.matrix(draws_posterior[length(draws_posterior) - (1:length(data_ref)) ]))
bayesplot::ppc_dens_overlay(data_ref, draws_posterior_sub)
# Questioned data
draws_posterior <- obj_StanBF_diff$stanfit$H2 %>% as.matrix(pars = c('pred_d_quest'))
draws_posterior_sub <- t(as.matrix(draws_posterior[length(draws_posterior) - (1:length(data_quest)) ]))
bayesplot::ppc_dens_overlay(data_ref, draws_posterior_sub) + ggtitle('Predicting reference samples with the questioned data')
bayesplot::ppc_dens_overlay(data_quest, draws_posterior_sub) + ggtitle('Predicting questioned samples with the questioned data')