Fake data with interaction

Hi,

I would like to generate a fake response using rstan from a model with interaction of Continuous:Categorical variable.

I have made the following model with two variables,

  • continuous with two levels and

  • categorical of four levels.

    sim_data <- 
     '
    data {
    int N;
    real alpha;
    real Category[N];       // Coefficient for each level
    real Category_sigma[N]; // Uncertainty for the coefficient 
    real x1_b;       //Coefficient for the continues 
    real x1_b_sigma; // Uncertainty for the coefficient
    real x1_Category[N];       // Interaction coefficients for each level
    real x1_Category_sigma[N]; // Uncertainty for the coefficients
    real Category_grid[N];
    real<lower=0> sigma;
    
    real x1_grid[N];
    }
    
    parameters {}
    model {}
    
    generated quantities {
    real y[N];
    real x1_beta;
    real Category_betas[N];
    real x1_Cat_beta[N];
    
    x1_beta = normal_rng(x1_b, x1_b_sigma);
    Category_betas = normal_rng(Category, Category_sigma);
    x1_Cat_beta =  normal_rng(x1_Category, x1_Category_sigma);
    
    for (n in 1:N) {
    y[n] = normal_rng(
      alpha + Category_betas[n] +  // Categorical 
      x1_beta * x1_grid[n] +       // Continues
      x1_Cat_beta[n] * x1_grid[n] * Category_grid[n]  // Interaction
      ,sigma);
      }
    }
    '
    
    compiled_example <- stan_model(model_code = sim_data)
    

and the fake input is

Category <- c('A', 'B','C', 'D')
x1_grid <- seq(0, 1, 1)


data_grid <- expand.grid(Category = Category, 
                        x1_grid  = x1_grid)

Then I sample by setting my own coefficients using the following,

N = nrow(data_grid)
sim_ex <- sampling(
  compiled_example, algorithm = 'Fixed_param',
  data = list(  N = N, 
                alpha = 0,
                Category = rep(c(49.4, 50, 50, 52.9), length.out = N),  # Coefficient for each level
                Category_sigma = rep(c(2, 1.8, 1.7, 2), length.out = N),#Uncertainty for the coefficients
                x1_b = -17,       # Coefficient for the continues 
                x1_b_sigma = 2.3, # Uncertainty for the coefficient
                x1_Category = rep(c(2, -1.5, .3, -.9), length.out = N),         # Interaction Coefficients 
                x1_Category_sigma = rep(c(.4, .4, .4, .4), length.out = N), #Uncertainty for the coefficients
                
                x1_grid = data_grid$x1_grid,
                Category_grid = rep(c(1, 2, 3, 4), length.out = N),

                sigma = 1
                
  ), refresh = 0)

Now, for the simpler case without interaction seems to work. But there is a mistake in the interaction and I don’t know how to solve it.

Also if you have links of examples in simulating fake data with Stan would be great to share them because I have found only a couple (1)(2)

Thanks!

Are you getting an error at runtime or is it that the results do not look correct?

Are you getting an error at runtime or is it that the results do not look correct?

Hi,
It runs perfectly fine. My problem is that I don’t know how to model the interaction.

I see in your code you have an interaction term:

y[n] = normal_rng(
  alpha + Category_betas[n] +  // Categorical 
  x1_beta * x1_grid[n] +       // Continues
  x1_Cat_beta[n] * x1_grid[n] * Category_grid[n]  // Interaction
  ,sigma);

Do you mean that you’re having a problem fitting the data once you generate it?

Hi,

I have found the mistake, so I post the code that it works as I would expect.

The packages I used

library(dplyr)
library(rstan)
library(tidybayes)
library(reshape2)
library(nlme)
library(AlgDesign)
library(stringr)

The model with one discrete variable, 3 continuous and a 2-way interaction of the discrete with a continuous

sim_data_TA4 ←

data {
int N;
real CD_Base[N];
real CD_Base_sigma[N];
real Silicone_b;
real Silicone_b_sigma;
real Silicone_b_sq;
real Silicone_b_sq_sigma;
real Ts_b;
real Ts_b_sigma;
real Gl_b;
real Gl_b_sigma;
real CDb_Ts[N];
real CDb_Ts_sigma[N];
real<lower=0> sigma;

real Silicone_x[N];
real Ts_x[N];
real Gl_x[N];
}

parameters {}
model {}
generated quantities {
real y[N];
real S_beta;
real S_beta_sq;
real CD_betas[N];
real Ts_beta;
real Gl_beta;
real CDb_Ts_beta[N];

S_beta = normal_rng(Silicone_b, Silicone_b_sigma);
S_beta_sq = normal_rng(Silicone_b_sq, Silicone_b_sq_sigma);
for(n in 1:N)
CD_betas[n] = normal_rng(CD_Base[n], CD_Base_sigma[n]);
Ts_beta = normal_rng(Ts_b, Ts_b_sigma);
Gl_beta = normal_rng(Gl_b, Gl_b_sigma);
for(n in 1:N)
CDb_Ts_beta[n] = normal_rng(CDb_Ts[n], CDb_Ts_sigma[n]); // coef for interaction

for (n in 1:N) {
y[n] = normal_rng(
CD_betas[n] +
S_beta * Silicone_x[n] +
S_beta_sq * square(Silicone_x[n]) +
Ts_beta * Ts_x[n] +
Gl_beta * Gl_x[n] +
CDb_Ts_beta[n] * Ts_x[n] // interaction
,sigma);
}
}

compiled_TA4 ← stan_model(model_code = sim_data_TA4)

Make a fake DoE

cand_list = expand.grid(CD_levels = c(‘B’, ‘F’),
S_levels = c(0, .5, 1.5, 3),
Ts_levels = c(2.4, 5.6, 7.5),
Gl_levels = c(0, 1.5, 3))

doe_grid ← optFederov( ~ Ts_levels * CD_levels + S_levels + I(S_levels^2) + Gl_levels,
data = cand_list, nTrials = 22, crit=“D”)$design

doe_grid

Set the coefficients as I like

beta_S ← data.frame(beta = -19.7, sd_beta = .07) # beta = NA, sd_beta = 1.7
beta_S_sq ← data.frame(beta = 3.4, sd_beta = .02) # beta = NA, sd_beta = .5
beta_Ts ← data.frame(beta = -.59, sd_beta = .1) # beta = -.38, sd_beta = .48
beta_Gl ← data.frame(beta = -.4, sd_beta = .02) # beta = .4, sd_beta = .2

beta_CD ← data.frame(CD_levels = c(‘B’, ‘F’),
betas = c(51.6, 52.8),
sd_betas = c(.28, .11))
#Here are the coefficients for the the interaction
beta_CD_TS ← data.frame(CD_levels = c(‘B’, ‘F’),
betas_interaction = c(0, 1.7),
sd_betas_interaction = c(.07, .07))

beta_CD1 ← left_join(doe_grid, beta_CD, by = ‘CD_levels’)
beta_CD2 ← left_join(beta_CD1, beta_CD_TS, by = ‘CD_levels’)

Sample

N = beta_CD2 %>% nrow
sim_TA4 ← sampling(
compiled_TA4, algorithm = ‘Fixed_param’,
data = list( N = N,
CD = beta_CD2$betas,
CD_sigma = beta_CD2$sd_betas,

            S_b = beta_S$beta,
            S_b_sigma = beta_S$sd_beta,
            
            S_b_sq = beta_S_sq$beta,
            S_b_sq_sigma = beta_S_sq$sd_beta,
            
            CDb_Ts = beta_CD2$betas_interaction,
            CDb_Ts_sigma = beta_CD2$sd_betas_interaction,
            
            Ts_b = beta_Ts$beta,
            Ts_b_sigma = beta_Ts$sd_beta,
            
            Gl_b = beta_Gl$beta, 
            Gl_b_sigma = beta_Gl$sd_beta,
            
            S_x = doe_grid$S_levels,
            Ts_x = doe_grid$Ts_levels,
            Gl_x = doe_grid$Gl_levels,
            sigma = 6

), refresh = 0)

Test (apologies for using lm function in here :D )

fake_data0 ← rstan::extract(sim_TA4, pars = ‘y’) %>% as.data.frame %>% .[1:4, ]
sample_nr ←
rownames(fake_data0) %>% as.numeric %>% summary %>% .[c(‘Min.’, ‘Max.’)] %>% str_c(collapse = “_”)

fake_data0 ← data.frame(doe_grid, fake_data0 %>% t) %>%
melt(id = c(‘CD_levels’, ‘S_levels’, ‘Ts_levels’, ‘Gl_levels’)) %>%
rename(y_fake = value) %>% select(-variable)

mod_fake0 ← lm(y_fake ~ poly(S_levels, 2, raw = T) + CD_levels * Ts_levels + Gl_levels,
data = fake_data0)

fake_data0 %>% head(10)
mod_fake0 %>% summary

fake_coef ← broom::tidy(mod_fake0) %>%
mutate(
low = estimate - 2 * std.error,
high = estimate + 2 * std.error,
Sample_size = sample_nr
) %>% .[, ]

ggplot(fake_coef, aes(estimate, term, xmin = low, xmax = high, height = 0)) +
geom_point() +
geom_vline(xintercept = 0) +
geom_errorbar(colour = ‘red’, width = 0.25)

1 Like

The problem was that the fake response didn’t have the desired properties because I made a mess with the part of interaction. Thanks for your time :)

1 Like

Glad it worked out in the end!