Fake data with interaction

Elef · October 14, 2020, 7:52am

Hi,

I would like to generate a fake response using rstan from a model with interaction of Continuous:Categorical variable.

I have made the following model with two variables,

continuous with two levels and

categorical of four levels.

sim_data <- 
 '
data {
int N;
real alpha;
real Category[N];       // Coefficient for each level
real Category_sigma[N]; // Uncertainty for the coefficient 
real x1_b;       //Coefficient for the continues 
real x1_b_sigma; // Uncertainty for the coefficient
real x1_Category[N];       // Interaction coefficients for each level
real x1_Category_sigma[N]; // Uncertainty for the coefficients
real Category_grid[N];
real<lower=0> sigma;

real x1_grid[N];
}

parameters {}
model {}

generated quantities {
real y[N];
real x1_beta;
real Category_betas[N];
real x1_Cat_beta[N];

x1_beta = normal_rng(x1_b, x1_b_sigma);
Category_betas = normal_rng(Category, Category_sigma);
x1_Cat_beta =  normal_rng(x1_Category, x1_Category_sigma);

for (n in 1:N) {
y[n] = normal_rng(
  alpha + Category_betas[n] +  // Categorical 
  x1_beta * x1_grid[n] +       // Continues
  x1_Cat_beta[n] * x1_grid[n] * Category_grid[n]  // Interaction
  ,sigma);
  }
}
'

compiled_example <- stan_model(model_code = sim_data)

and the fake input is

Category <- c('A', 'B','C', 'D')
x1_grid <- seq(0, 1, 1)


data_grid <- expand.grid(Category = Category, 
                        x1_grid  = x1_grid)

Then I sample by setting my own coefficients using the following,

N = nrow(data_grid)
sim_ex <- sampling(
  compiled_example, algorithm = 'Fixed_param',
  data = list(  N = N, 
                alpha = 0,
                Category = rep(c(49.4, 50, 50, 52.9), length.out = N),  # Coefficient for each level
                Category_sigma = rep(c(2, 1.8, 1.7, 2), length.out = N),#Uncertainty for the coefficients
                x1_b = -17,       # Coefficient for the continues 
                x1_b_sigma = 2.3, # Uncertainty for the coefficient
                x1_Category = rep(c(2, -1.5, .3, -.9), length.out = N),         # Interaction Coefficients 
                x1_Category_sigma = rep(c(.4, .4, .4, .4), length.out = N), #Uncertainty for the coefficients
                
                x1_grid = data_grid$x1_grid,
                Category_grid = rep(c(1, 2, 3, 4), length.out = N),

                sigma = 1
                
  ), refresh = 0)

Now, for the simpler case without interaction seems to work. But there is a mistake in the interaction and I don’t know how to solve it.

Also if you have links of examples in simulating fake data with Stan would be great to share them because I have found only a couple (1)(2)

Thanks!

bbbales2 · October 15, 2020, 11:58am

Are you getting an error at runtime or is it that the results do not look correct?

Elef · October 16, 2020, 6:38am

Are you getting an error at runtime or is it that the results do not look correct?

Hi,
It runs perfectly fine. My problem is that I don’t know how to model the interaction.

bbbales2 · October 16, 2020, 1:33pm

I see in your code you have an interaction term:

y[n] = normal_rng(
  alpha + Category_betas[n] +  // Categorical 
  x1_beta * x1_grid[n] +       // Continues
  x1_Cat_beta[n] * x1_grid[n] * Category_grid[n]  // Interaction
  ,sigma);

Do you mean that you’re having a problem fitting the data once you generate it?

Elef · October 29, 2020, 10:03am

Hi,

I have found the mistake, so I post the code that it works as I would expect.

The packages I used

library(dplyr)
library(rstan)
library(tidybayes)
library(reshape2)
library(nlme)
library(AlgDesign)
library(stringr)

The model with one discrete variable, 3 continuous and a 2-way interaction of the discrete with a continuous

sim_data_TA4 ←
’
data {
int N;
real CD_Base[N];
real CD_Base_sigma[N];
real Silicone_b;
real Silicone_b_sigma;
real Silicone_b_sq;
real Silicone_b_sq_sigma;
real Ts_b;
real Ts_b_sigma;
real Gl_b;
real Gl_b_sigma;
real CDb_Ts[N];
real CDb_Ts_sigma[N];
real<lower=0> sigma;

real Silicone_x[N];
real Ts_x[N];
real Gl_x[N];
}

parameters {}
model {}
generated quantities {
real y[N];
real S_beta;
real S_beta_sq;
real CD_betas[N];
real Ts_beta;
real Gl_beta;
real CDb_Ts_beta[N];

S_beta = normal_rng(Silicone_b, Silicone_b_sigma);
S_beta_sq = normal_rng(Silicone_b_sq, Silicone_b_sq_sigma);
for(n in 1:N)
CD_betas[n] = normal_rng(CD_Base[n], CD_Base_sigma[n]);
Ts_beta = normal_rng(Ts_b, Ts_b_sigma);
Gl_beta = normal_rng(Gl_b, Gl_b_sigma);
for(n in 1:N)
CDb_Ts_beta[n] = normal_rng(CDb_Ts[n], CDb_Ts_sigma[n]); // coef for interaction

for (n in 1:N) {
y[n] = normal_rng(
CD_betas[n] +
S_beta * Silicone_x[n] +
S_beta_sq * square(Silicone_x[n]) +
Ts_beta * Ts_x[n] +
Gl_beta * Gl_x[n] +
CDb_Ts_beta[n] * Ts_x[n] // interaction
,sigma);
}
}
’

compiled_TA4 ← stan_model(model_code = sim_data_TA4)

Make a fake DoE

cand_list = expand.grid(CD_levels = c(‘B’, ‘F’),
S_levels = c(0, .5, 1.5, 3),
Ts_levels = c(2.4, 5.6, 7.5),
Gl_levels = c(0, 1.5, 3))

doe_grid ← optFederov( ~ Ts_levels * CD_levels + S_levels + I(S_levels^2) + Gl_levels,
data = cand_list, nTrials = 22, crit=“D”)$design

doe_grid

Set the coefficients as I like

beta_S ← data.frame(beta = -19.7, sd_beta = .07) # beta = NA, sd_beta = 1.7
beta_S_sq ← data.frame(beta = 3.4, sd_beta = .02) # beta = NA, sd_beta = .5
beta_Ts ← data.frame(beta = -.59, sd_beta = .1) # beta = -.38, sd_beta = .48
beta_Gl ← data.frame(beta = -.4, sd_beta = .02) # beta = .4, sd_beta = .2

beta_CD ← data.frame(CD_levels = c(‘B’, ‘F’),
betas = c(51.6, 52.8),
sd_betas = c(.28, .11))
#Here are the coefficients for the the interaction
beta_CD_TS ← data.frame(CD_levels = c(‘B’, ‘F’),
betas_interaction = c(0, 1.7),
sd_betas_interaction = c(.07, .07))

beta_CD1 ← left_join(doe_grid, beta_CD, by = ‘CD_levels’)
beta_CD2 ← left_join(beta_CD1, beta_CD_TS, by = ‘CD_levels’)

Sample

N = beta_CD2 %>% nrow
sim_TA4 ← sampling(
compiled_TA4, algorithm = ‘Fixed_param’,
data = list( N = N,
CD = beta_CD2$betas,
CD_sigma = beta_CD2$sd_betas,

            S_b = beta_S$beta,
            S_b_sigma = beta_S$sd_beta,
            
            S_b_sq = beta_S_sq$beta,
            S_b_sq_sigma = beta_S_sq$sd_beta,
            
            CDb_Ts = beta_CD2$betas_interaction,
            CDb_Ts_sigma = beta_CD2$sd_betas_interaction,
            
            Ts_b = beta_Ts$beta,
            Ts_b_sigma = beta_Ts$sd_beta,
            
            Gl_b = beta_Gl$beta, 
            Gl_b_sigma = beta_Gl$sd_beta,
            
            S_x = doe_grid$S_levels,
            Ts_x = doe_grid$Ts_levels,
            Gl_x = doe_grid$Gl_levels,
            sigma = 6

), refresh = 0)

Test (apologies for using lm function in here :D )

fake_data0 ← rstan::extract(sim_TA4, pars = ‘y’) %>% as.data.frame %>% .[1:4, ]
sample_nr ←
rownames(fake_data0) %>% as.numeric %>% summary %>% .[c(‘Min.’, ‘Max.’)] %>% str_c(collapse = “_”)

fake_data0 ← data.frame(doe_grid, fake_data0 %>% t) %>%
melt(id = c(‘CD_levels’, ‘S_levels’, ‘Ts_levels’, ‘Gl_levels’)) %>%
rename(y_fake = value) %>% select(-variable)

mod_fake0 ← lm(y_fake ~ poly(S_levels, 2, raw = T) + CD_levels * Ts_levels + Gl_levels,
data = fake_data0)

fake_data0 %>% head(10)
mod_fake0 %>% summary

fake_coef ← broom::tidy(mod_fake0) %>%
mutate(
low = estimate - 2 * std.error,
high = estimate + 2 * std.error,
Sample_size = sample_nr
) %>% .[, ]

ggplot(fake_coef, aes(estimate, term, xmin = low, xmax = high, height = 0)) +
geom_point() +
geom_vline(xintercept = 0) +
geom_errorbar(colour = ‘red’, width = 0.25)

Elef · October 29, 2020, 10:05am

The problem was that the fake response didn’t have the desired properties because I made a mess with the part of interaction. Thanks for your time :)

bbbales2 · October 29, 2020, 5:27pm

Glad it worked out in the end!

Topic		Replies	Views
Problem Simulating Data with Generated Quantities (Dimension mismatch in assignment; type = real; right-hand side type = real[ ]) RStan rstan , techniques , specification	3	859	April 29, 2021
Simulating fake data for regression in Stan Modeling techniques , specification , performance	2	1325	October 9, 2021
Simulate data - quadratic relationship Modeling rstan	1	526	October 12, 2020
Sampling from a matrix in STAN Modeling rstan , techniques , performance	28	1319	September 1, 2021
Coding an interaction term between a categorical variable and a continuous variable in Stan, where the continuous variable is a vector of continuous data Modeling rstan , specification	3	1145	February 10, 2021

Fake data with interaction

Related topics