Translation of an SEM with Ordered Categorical Variables from WinBUGS

Below is the specification of an SEM originally specified in WinBUGS. The most notable difference being

for(j in 1:P){
    z[i,j] ~ ordered_logistic(mu[i,j], thd[j]);
}

replacing

for(j in 1:P){
    y[i,j]~dnorm(mu[i,j],psi[j])I(thd[j,z[i,j]],thd[j,z[i,j]+1])
    ephat[i,j]<-y[i,j]-mu[i,j]
}

and the thresholds being estimated as unknown parameters, with the threshold data as priors.
The I() function is according to Bard similar to step(). However, I was not able to make this code run in WinBUGS as is, nor was I able to utilize step() correctly in the Stan model (see commented lines in the measurement equation model). Anyway, I considered I might not really need the intermediate normal distribution y, if I can estimate mu directly from zusing the ordered_logistic() function. This, however, leaves no way of estimating ephat.

  1. I was therefore wondering if this is really necessary?

  2. Also, the model fit will yield the following warning message: "Warning: 2 of 2 chains had an E-BFMI less than 0.2." Is this caused by the way my model is specified? Iā€™ve been unable to get this model to run on WinBUGS, and there are indications this may have to do with with the code snippet above with the indicator function, but it makes it difficult to know whether the warning is caused by my adaptations of the original model, or problems with the data or original model.

  3. Finally, I am trying to run the model using OpenCL on the GPU. This has turned out to be horrendously slow. I am therefore trying to understand what is causing the slowdown: Is there something inherent in this model that makes CPU computations better suited for it, or is it due to model specification not being optimized for GPU?

cmdstanr code

Stan model
data{
	int<lower=0> N; // number of observations
    int<lower=1> P; // number of variables
    array[N, P] int<lower=1, upper=5> z; // observed data
}

transformed data {
   	cov_matrix[4] R = diag_matrix(rep_vector(8.0, 4)); // prior covariance matrix for xi
	vector[4] u = rep_vector(0,4); // prior mean for xi
}

parameters {
	array[21] real lam; // pattern coefficients
	vector[N] eta; // latent variables
	array[N] vector[4] xi; // latent variables
	row_vector[4] gam; // coefficients
	vector<lower=0>[P] psi; // precisions of y
	real<lower=0> psd; // precision of eta
	cov_matrix[4] phi;
    array[P] ordered[4] thd; // Thresholds
	
}

transformed parameters {
    vector[P] sgm = 1/psi;
    real sgd = 1/psd;
    matrix[4, 4] phx = inverse(phi);
}

model {
	// Local variables
    array[N] vector[P] mu;
    array[N] vector[P] ephat;
    vector[N] nu;
    vector[N] dthat;
	array[21] real var_lam;
	real var_gam;

	//priors on precisions
    psi ~ gamma(10,8);
    psd ~ gamma(10,8);
    phi ~ wishart(30, R);

	//priors on loadings and coefficients
    var_lam[1] = 4.0 * psi[2];
    var_lam[2] = 4.0 * psi[4];
    var_lam[3] = 4.0 * psi[5];
    var_lam[4] = 4.0 * psi[6];
    var_lam[5] = 4.0 * psi[7];
    var_lam[6] = 4.0 * psi[8];
    var_lam[7] = 4.0 * psi[9];
    var_lam[8] = 4.0 * psi[11];
    var_lam[9] = 4.0 * psi[12];
    var_lam[10] = 4.0 * psi[13];
    var_lam[11] = 4.0 * psi[14];
    var_lam[12] = 4.0 * psi[15];
    var_lam[13] = 4.0 * psi[17];
    var_lam[14] = 4.0 * psi[18];
    var_lam[15] = 4.0 * psi[20];
    var_lam[16] = 4.0 * psi[21];
    var_lam[17] = 4.0 * psi[22];
    var_lam[18] = 4.0 * psi[23];
    var_lam[19] = 4.0 * psi[24];
    var_lam[20] = 4.0 * psi[25];
    var_lam[21] = 4.0 * psi[26];
    
    lam ~ normal(0.8,var_lam);

    var_gam = 4.0 * psd;

    gam ~ normal(0.6,var_gam);

    for(i in 1:N){
        //measurement equation model
        
        mu[i,1] = eta[i];
        mu[i,2] = lam[1] * eta[i];
        mu[i,3] = xi[i,1];
        mu[i,4] = lam[2] * xi[i,1];
        mu[i,5] = lam[3] * xi[i,1];
        mu[i,6] = lam[4] * xi[i,1];
        mu[i,7] = lam[5] * xi[i,1];
        mu[i,8] = lam[6] * xi[i,1];
        mu[i,9] = lam[7] * xi[i,1];
        mu[i,10] = xi[i,2];
        mu[i,11] = lam[8] * xi[i,2];
        mu[i,12] = lam[9] * xi[i,2];
        mu[i,13] = lam[10] * xi[i,2];
        mu[i,14] = lam[11] * xi[i,2];
        mu[i,15] = lam[12] * xi[i,2];
        mu[i,16] = xi[i,3];
        mu[i,17] = lam[13] * xi[i,3];
        mu[i,18] = lam[14] * xi[i,3];
        mu[i,19] = xi[i,4];
        mu[i,20] = lam[15] * xi[i,4];
        mu[i,21] = lam[16] * xi[i,4];
        mu[i,22] = lam[17] * xi[i,4];
        mu[i,23] = lam[18] * xi[i,4];
        mu[i,24] = lam[19] * xi[i,4];
        mu[i,25] = lam[20] * xi[i,4];
        mu[i,26] = lam[21] * xi[i,4];

		for(j in 1:P){
            // y[i,j] ~ normal(mu[i,j], psi[j]) * step(thd[j, z[i,j]] - y[i,j]);
			z[i,j] ~ ordered_logistic(mu[i,j], thd[j]);
        }
		// ephat[i] = y[i] - mu[i];

        // structural equation model

		nu[i] = gam * xi[i];
	} // end of i
        
	dthat = eta - nu;

	xi ~ multi_normal(u, phi);
	eta ~ normal(nu, psd);

} //end of model

R code
library(cmdstanr)
library(dplyr)
library(glue)

output_dir <-  glue::glue("{getwd()}/Chapter6/stan-output")
if( !dir.exists(output_dir) ) dir.create(output_dir)

model2  <-  paste0(getwd(), "/Chapter6/ch6-Stan-model-with-unknown-thresholds.stan")
model2 <- cmdstan_model(model2, 
    force_recompile = TRUE
)

data <- list(
    N = 338, 
    P = 26,
    z = read.csv(
            "./Chapter6/ch6-WinBUGS-data.dat",
            header = FALSE,
            nrows = 338,
            skip = 2,
            strip.white = TRUE
            ) %>%
        .[,1:(ncol(.) - 1)] %>%
        as.matrix() %>%
        unname()
)

Ļ‡_priors <- read.csv(
    "./Chapter6/ch6-WinBUGS-data.dat",
    header = FALSE,
    # nrows = 338,
    skip = 342,
    strip.white = TRUE
    ) %>%
.[,1:(ncol(.) - 1)] %>%
as.matrix() %>% 
unname()

thd <- matrix(
    c(
        -2.517,-1.245,-0.444, 0.848,
        -1.447,-0.420, 0.119, 1.245,
        -1.671,-0.869,-0.194, 0.679,
        -1.642,-0.869,-0.293, 0.332,
        -1.671,-0.827, 0.052, 0.756,
        -1.769,-1.098,-0.469, 0.255,
        -1.490,-0.670,-0.082, 0.880,
        -1.933,-0.880,-0.317, 1.008,
        -1.587,-0.624, 0.000, 1.008,
        -1.983,-1.348,-0.348, 1.045,
        -1.983,-1.229,-0.247, 0.869,
        -2.262,-1.426, 0.037, 1.330,
        -2.371,-1.295,-0.224, 0.651,
        -2.039,-1.112,-0.149, 1.169,
        -2.262,-1.198,-0.309, 1.198,
        -2.176,-1.537,-0.717, 0.597,
        -1.447,-0.786, 0.119, 1.008,
        -2.039,-1.769,-0.661, 0.642,
        -2.262,-1.468, 0.015, 1.214,
        -2.039,-1.406, 0.000, 1.140,
        -1.702,-1.058, 0.149, 0.902,
        -2.262,-1.426,-0.309, 0.971,
        -1.702,-0.615, 0.179, 1.229,
        -2.262,-1.671,-1.033, 0.420,
        -2.262,-1.468,-0.689, 1.045,
        -2.176,-1.537,-0.880, 0.661
    ),
    nrow = 26,
    ncol = 4,
    byrow = TRUE
)

inits <- list(
    list(
        lam = c(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
        psi = c(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0),
        psd = 1.0,
        gam = c(1.0, 1.0, 1.0, 1.0),
        phi = structure(
            .Data = c(1.0, 0.0, 0.0, 0.0,
                        0.0, 1.0, 0.0, 0.0,
                        0.0, 0.0, 1.0, 0.0,
                        0.0, 0.0, 0.0, 1.0),
            .Dim = c(4,4)),
        xi = Ļ‡_priors,
        thd = thd
        
    ),

    list(
        lam = c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5),
        psi = c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5),
        psd = 0.6,
        gam = c(0.0, 0.0, 0.0, 0.0),
        phi = structure(
            .Data = c(0.5, 0.0, 0.0, 0.0,
                    0.0, 0.5, 0.0, 0.0,
                    0.0, 0.0, 0.5, 0.0,
                    0.0, 0.0, 0.0, 0.5),
            .Dim = c(4,4)),
        xi = Ļ‡_priors,
        thd = thd
    )
)

fit2 <- model2$sample(
  data = data,
  seed = 69,
  refresh = 500,
  init = inits,
  output_dir = output_dir,
  sig_figs = 5,
  chains = n.chains,
  parallel_chains = getOption("mc.cores", 1),
  opencl_ids = NULL,
  iter_warmup = n.burnin,
  iter_sampling = n.iter,
  save_warmup = TRUE
)

data.rdata (4.5 KB)

WinBUGS model

This is a WinBUGS program for the real example in Chapter 6, Section 6.6.2.

Model: Structural Equation Model with Ordered Categorical Variables
Data Set Names:  YO.dat, and XI.dat, where XI.dat are input initial values for xi.
Sample Size: N=338

model{
	for(i in 1:N){
		#measurement equation model
		for(j in 1:P){
			y[i,j]~dnorm(mu[i,j],psi[j])I(thd[j,z[i,j]],thd[j,z[i,j]+1])
			ephat[i,j]<-y[i,j]-mu[i,j]
		}
		mu[i,1]<-eta[i]
		mu[i,2]<-lam[1]*eta[i]
		mu[i,3]<-xi[i,1]
		mu[i,4]<-lam[2]*xi[i,1]
		mu[i,5]<-lam[3]*xi[i,1]
		mu[i,6]<-lam[4]*xi[i,1]
		mu[i,7]<-lam[5]*xi[i,1]
		mu[i,8]<-lam[6]*xi[i,1]
		mu[i,9]<-lam[7]*xi[i,1]
		mu[i,10]<-xi[i,2]
		mu[i,11]<-lam[8]*xi[i,2]
		mu[i,12]<-lam[9]*xi[i,2]
		mu[i,13]<-lam[10]*xi[i,2]
		mu[i,14]<-lam[11]*xi[i,2]
    	mu[i,15]<-lam[12]*xi[i,2]
		mu[i,16]<-xi[i,3]
		mu[i,17]<-lam[13]*xi[i,3]
		mu[i,18]<-lam[14]*xi[i,3]		
		mu[i,19]<-xi[i,4]		
		mu[i,20]<-lam[15]*xi[i,4]		
		mu[i,21]<-lam[16]*xi[i,4]		
		mu[i,22]<-lam[17]*xi[i,4]		
		mu[i,23]<-lam[18]*xi[i,4]		
		mu[i,24]<-lam[19]*xi[i,4]		
		mu[i,25]<-lam[20]*xi[i,4]		
		mu[i,26]<-lam[21]*xi[i,4]		
																
		#structural equation model
		xi[i,1:4]~dmnorm(u[1:4],phi[1:4,1:4])
		eta[i]~dnorm(nu[i],psd)
		nu[i]<-gam[1]*xi[i,1]+gam[2]*xi[i,2]+gam[3]*xi[i,3]+gam[4]*xi[i,4]
		dthat[i]<-eta[i]-nu[i]
	}# end of i
	
	for(i in 1:4){u[i]<-0.0}
	
	#priors on loadings and coefficients
	var.lam[1]<-4.0*psi[2]	   var.lam[2]<-4.0*psi[4]	    var.lam[3]<-4.0*psi[5]
	var.lam[4]<-4.0*psi[6]	   var.lam[5]<-4.0*psi[7]	    var.lam[6]<-4.0*psi[8]
	var.lam[7]<-4.0*psi[9]	   var.lam[8]<-4.0*psi[11]	  var.lam[9]<-4.0*psi[12]
	var.lam[10]<-4.0*psi[13]	var.lam[11]<-4.0*psi[14]	var.lam[12]<-4.0*psi[15]
	var.lam[13]<-4.0*psi[17]	var.lam[14]<-4.0*psi[18]	var.lam[15]<-4.0*psi[20]
	var.lam[16]<-4.0*psi[21]	var.lam[17]<-4.0*psi[22]	var.lam[18]<-4.0*psi[23]
	var.lam[19]<-4.0*psi[24]	var.lam[20]<-4.0*psi[25]	var.lam[21]<-4.0*psi[26]
	for(i in 1:21){lam[i]~dnorm(0.8,var.lam[i])}	

	var.gam<-4.0*psd
	gam[1]~dnorm(0.6,var.gam)    gam[2]~dnorm(0.6,var.gam)    
	gam[3]~dnorm(0.4,var.gam)    gam[4]~dnorm(0.4,var.gam)
	
	#priors on precisions
	for(j in 1:P){
		psi[j]~dgamma(10,8)
		sgm[j]<-1/psi[j]
	}
	psd~dgamma(10,8)
	sgd<-1/psd
	phi[1:4,1:4]~dwish(R[1:4,1:4], 30)
	phx[1:4,1:4]<-inverse(phi[1:4,1:4])
	
} #end of model

Data Set
list(N=338, P=26, 
	R=structure(
		.Data=c(8.0, 0.0, 0.0, 0.0,
                    0.0, 8.0, 0.0, 0.0,
                    0.0, 0.0, 8.0, 0.0,
                    0.0, 0.0, 0.0, 8.0),
		.Dim=c(4,4)),
	thd=structure(
	      .Data=c(-200.000,-2.517,-1.245,-0.444, 0.848,200.000,
-200.000,-1.447,-0.420, 0.119, 1.245,200.000,
-200.000,-1.671,-0.869,-0.194, 0.679,200.000,
-200.000,-1.642,-0.869,-0.293, 0.332,200.000,
-200.000,-1.671,-0.827, 0.052, 0.756,200.000,
-200.000,-1.769,-1.098,-0.469, 0.255,200.000,
-200.000,-1.490,-0.670,-0.082, 0.880,200.000,
-200.000,-1.933,-0.880,-0.317, 1.008,200.000,
-200.000,-1.587,-0.624, 0.000, 1.008,200.000,
-200.000,-1.983,-1.348,-0.348, 1.045,200.000,
-200.000,-1.983,-1.229,-0.247, 0.869,200.000,
-200.000,-2.262,-1.426, 0.037, 1.330,200.000,
-200.000,-2.371,-1.295,-0.224, 0.651,200.000,
-200.000,-2.039,-1.112,-0.149, 1.169,200.000,
-200.000,-2.262,-1.198,-0.309, 1.198,200.000,
-200.000,-2.176,-1.537,-0.717, 0.597,200.000,
-200.000,-1.447,-0.786, 0.119, 1.008,200.000,
-200.000,-2.039,-1.769,-0.661, 0.642,200.000,
-200.000,-2.262,-1.468, 0.015, 1.214,200.000,
-200.000,-2.039,-1.406, 0.000, 1.140,200.000,
-200.000,-1.702,-1.058, 0.149, 0.902,200.000,
-200.000,-2.262,-1.426,-0.309, 0.971,200.000,
-200.000,-1.702,-0.615, 0.179, 1.229,200.000,
-200.000,-2.262,-1.671,-1.033, 0.420,200.000,
-200.000,-2.262,-1.468,-0.689, 1.045,200.000,
-200.000,-2.176,-1.537,-0.880, 0.661,200.000),
	      .Dim=c(26,6)),
	z=structure(
		.Data=c(paste YO.dat here),
		.Dim=c(338,26)))

Two different Initial Values
list(
	lam=c(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
	psi=c(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0),
	psd=1.0,
	gam=c(1.0, 1.0, 1.0, 1.0),
	phi=structure(
		.Data=c(1.0, 0.0, 0.0, 0.0, 
		             0.0, 1.0, 0.0, 0.0,
		             0.0, 0.0, 1.0, 0.0,
		             0.0, 0.0, 0.0, 1.0),
		.Dim=c(4,4)),
	xi=structure(
		.Data=c(paste XI.dat here),
		.Dim=c(338,4)))

list(
	lam=c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5),
	psi=c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5),
	psd=0.6,
	gam=c(0.0, 0.0, 0.0, 0.0),
	phi=structure(
		.Data=c(0.5, 0.0, 0.0, 0.0, 
		             0.0, 0.5, 0.0, 0.0,
		             0.0, 0.0, 0.5, 0.0,
		             0.0, 0.0, 0.0, 0.5),
		.Dim=c(4,4)),
	xi=structure(
		.Data=c(paste XI.dat here),
		.Dim=c(338,4)))

In my experience, this is an example of a model that one would be happy to get running in WinBUGS, but the sampling would be very slow and inefficient, so that the model would be difficult to use in practice. I am not sure that I can address all your questions, but here are some thoughts.

The I() operator in WinBUGS is doing censoring at the thresholds, so the specification ends up being related to the Chib & Greenberg (1998) data augmentation approach. That is a probit model and you are using logit, which is a small difference. I think the slow speed and sampling issues comes from the large amount of eta and xi parameters in the model, along with the fact that it is an ordered logistic model.

You could potentially obtain ephat in the generated quantities block, by generating random y[i,j] variates from a truncated logistic distribution (with mean mu[i,j] and scale of 1, I believe).

The blavaan approach to this model also follows Chib & Greenberg (and uses probit link) and treats what you call y as model parameters. Some further details about the approach can be found here. The underlying Stan model is complicated because it needs to handle ordinal variables with different numbers of categories and combinations of ordinal and continuous variables. But the current version of the model can be found here.

At the risk of committing blasphemy on this forum :P, have you tried to implement the model using JAGS?

I have a fair amount of experience with JAGS and WinBUGS and I know that:

JAGS has much better error handling; error messages are often useless in WinBUGS.
JAGS is much more powerful than WinBUGS due to slice sampling that is written in C++
JAGS directly integrates with R unlike WinBUGS.
JAGS is written in the BUGS language so translating the code is typically trivial.

Once you have that running you should try to implement in Stan.

BTW what book is that from?

Thank you for input. Since I am not thoroughly familiar with Stan, BUGS or Bayesian estimation in general, some concrete code examples would have been appreciated. Unfortunately, I found the model you linked a tad too complex for learning purposes.

Regarding the WinBUGS model, I was able to get it to run. This model finished in about 5-6 minutes in WinBUGS. Conversely, it took ~54 minutes for it finish in Stan. I tried fixing the thresholds (loading them as data instead of as parameters with initial values), however this took even longer, roughly 100 minutes. Seemingly, using an ordered logistic model will cause a significant increase in computational load. I will try a probit approach.

As for trying to reproduce the original BUGS model, which would be useful for benchmarking and evaluation purposed, I was wondering how to implement censoring to values of y[i,j]? According to the user manual, it seems I can estimate censored data by adding constraints to the variable. The example model, however, presumes a constant threshold for the entire array, while the BUGS model has variable thresholds of y[i,j] for each value of z[i,j]. Could this be implemented? Can constraints be provided as an array of vectors or matrix or similar?

I have tried JAGS on the model without any luck so far. I was able to get the model to run in WinBUGS, and subsequently in R2WinBUGS and R2OpenBUGS, however when I try the same model in JAGS (using rjags), I get the following error:

Compilation error on line 5.
BUGS I(,) notation is only allowed if all parameters are fixed

I am not sure what this implies. According to the OpenBUGS home page, OpenBUGSā€™ and JAGSā€™ censoring function is C(lower, upper). Changing I(,) to C(,), leaving everything else unchanged, caused the following error:

Error parsing model file:
syntax error on line 5 near ","

Any suggestions?


Regarding your last question: The model can be found in the books

Lee, S. Y. (2007). Structural equation modeling: A Bayesian approach . John Wiley & Sons.

Lee, S. Y., & Song, X. Y. (2012). Basic and advanced Bayesian structural equation modeling: With applications in the medical and behavioral sciences . John Wiley & Sons.

This is explained in some detail in the JAGS user manual. See section A.3 Censoring and truncation and 7.1. You need to either specify dinterval or T(,) for censoring or truncation.

I donā€™t know enough about these models to know for certain but I presume the random variable is defined to lie with the interval and so it is truncation not censoring.

You could always try both and compare to with the output from the book.

All the best.

Thanks for the input. Iā€™ve been trying to wrap my head around the difference between censoring and truncation, but yes I believe they use truncation. The model essentially uses real, ordinal data, with an integer range of 1-5, from a questionnaire of Likert-scale items, z[i, j], where i represents respodents and j the questions. Due to limitations in WinBUGS, instead of predicting z directly from mu, it uses an latent normal distribution, y, which is defined so that y[i,j] are assigned a value based on the value of z[i,j] and a range of predefined thresholds, thd[j, 1:6] for z[, j].

I can confirm replacing I() with T() worked, and the functions seem identical in practice. As you suggested, it runs considerably quicker than WinBUGS. As far as I can tell dinterval is probably not appropriate for this purpose.

So, Iā€™ve tried creating a direct translation of the BUGS model. Truncation in Stan is apparently achieved by defining upper and lower constraints to y_{ij}, and adding a truncation function T[L, U] to the distribution. From this I created two two-dimensional arrays of upper and lower boundaries, L[i, j] and U[i, j] in the transformed data section from thd[j, 1:6] and z[i, j]:

data{
	int<lower=0> N; // number of observations
    int<lower=1> P; // number of variables
    array[N, P] int<lower=1, upper=5> z; // observed data
    array[P, 6] real thd; // Thresholds
}

transformed data {
   	cov_matrix[4] R = diag_matrix(rep_vector(8.0, 4)); // prior covariance matrix for xi
	vector[4] u = rep_vector(0,4); // prior mean for xi
    array[N, P] real L; // Lower truncation point of y
    array[N, P] real U; // Upper truncation point of y
    for(i in 1:N){
		for(j in 1:P){
			L[i,j] = thd[j, z[i, j]]; // Lower truncation point assigned values from thd[j, 1:5] based on value of z[i,j]
            U[i,j] = thd[j, z[i, j] + 1]; // Lower truncation point assigned values from thd[j, 2:6] based on value of z[i,j]
		}
    }
}

parameters {
	array[21] real lam; // pattern coefficients
	vector[N] eta; // latent variables
	array[N] vector[4] xi; // latent variables
	row_vector[4] gam; // coefficients
	vector<lower=0>[P] psi; // precisions of y
	real<lower=0> psd; // precision of eta
	cov_matrix[4] phi;
    array[N, P] real<lower=L, upper=U> y; // Not sure if this is a correct assumption, but I chose a 2-D array of reals, as I figured this might let me assign constraints for each value of y.
	
}

transformed parameters {
    vector[P] sgm = 1/psi;
    real sgd = 1/psd;
    matrix[4, 4] phx = inverse(phi);
}

model {
	// Local variables
    array[N, P] real mu;
    array[N, P] real ephat;
    vector[N] nu;
    vector[N] dthat;
	array[21] real var_lam;
	real var_gam;

	//priors on precisions
    psi ~ gamma(10,8);
    psd ~ gamma(10,8);
    phi ~ wishart(30, R);

	//priors on loadings and coefficients
    var_lam[1] = 4.0 * psi[2];
    var_lam[2] = 4.0 * psi[4];
    var_lam[3] = 4.0 * psi[5];
    var_lam[4] = 4.0 * psi[6];
    var_lam[5] = 4.0 * psi[7];
    var_lam[6] = 4.0 * psi[8];
    var_lam[7] = 4.0 * psi[9];
    var_lam[8] = 4.0 * psi[11];
    var_lam[9] = 4.0 * psi[12];
    var_lam[10] = 4.0 * psi[13];
    var_lam[11] = 4.0 * psi[14];
    var_lam[12] = 4.0 * psi[15];
    var_lam[13] = 4.0 * psi[17];
    var_lam[14] = 4.0 * psi[18];
    var_lam[15] = 4.0 * psi[20];
    var_lam[16] = 4.0 * psi[21];
    var_lam[17] = 4.0 * psi[22];
    var_lam[18] = 4.0 * psi[23];
    var_lam[19] = 4.0 * psi[24];
    var_lam[20] = 4.0 * psi[25];
    var_lam[21] = 4.0 * psi[26];
    
    lam ~ normal(0.8,var_lam);

    var_gam = 4.0 * psd;

    gam ~ normal(0.6,var_gam);

    for(i in 1:N){
        //measurement equation model
        
        mu[i,1] = eta[i];
        mu[i,2] = lam[1] * eta[i];
        mu[i,3] = xi[i,1];
        mu[i,4] = lam[2] * xi[i,1];
        mu[i,5] = lam[3] * xi[i,1];
        mu[i,6] = lam[4] * xi[i,1];
        mu[i,7] = lam[5] * xi[i,1];
        mu[i,8] = lam[6] * xi[i,1];
        mu[i,9] = lam[7] * xi[i,1];
        mu[i,10] = xi[i,2];
        mu[i,11] = lam[8] * xi[i,2];
        mu[i,12] = lam[9] * xi[i,2];
        mu[i,13] = lam[10] * xi[i,2];
        mu[i,14] = lam[11] * xi[i,2];
        mu[i,15] = lam[12] * xi[i,2];
        mu[i,16] = xi[i,3];
        mu[i,17] = lam[13] * xi[i,3];
        mu[i,18] = lam[14] * xi[i,3];
        mu[i,19] = xi[i,4];
        mu[i,20] = lam[15] * xi[i,4];
        mu[i,21] = lam[16] * xi[i,4];
        mu[i,22] = lam[17] * xi[i,4];
        mu[i,23] = lam[18] * xi[i,4];
        mu[i,24] = lam[19] * xi[i,4];
        mu[i,25] = lam[20] * xi[i,4];
        mu[i,26] = lam[21] * xi[i,4];

		for(j in 1:P){
            ephat[i, j] = y[i, j] - mu[i, j]; 
            y[i,j] ~ normal(mu[i,j], psi[j])T[L[i, j], U[i, j]];
        }
		

        // structural equation model

		nu[i] = gam * xi[i];
	} // end of i
        
	dthat = eta - nu;

	xi ~ multi_normal(u, phi);
	eta ~ normal(nu, psd);

} //end of model

This model will run, but with a total execution time of 4246.6 seconds (~71 min) it is horrendously slow! It will also produce the following warnings:

Warning: 15896 of 16000 (99.0%) transitions ended with a divergence.

Warning: 104 of 16000 (1.0%) transitions hit the maximum treedepth limit of 10.

Given that WinBUGS fitted the original model in 5-6 min, and JAGS fitted an almost identical model (the only difference being I(,) was changed to T(,)) in less than 20 seconds, and the above warnings, it seem the Stan model must be misspecified somehow. Any input on what may be causing the fitting issues?

I think one issue is that, if you specify y as a parameter, you donā€™t need eta and xi (can marginalize them out of the likelihood). Conversely, if you specify eta and xi as parameters, you donā€™t need y. All three of these add many parameters to the model, which adds lots of computational overhead.

As a point of comparison, you can fit a similar model in blavaan using the code below (though the priors differ a little bit). Using your notation, blavaan treats y as a parameter and avoids eta and xi. The code is not fully optimized for this modelā€¦ for one thing, is might be better to use eta and xi as parameters instead of y. But the estimation time is faster (esp by sending each chain to a separate core), and it includes some marginal likelihood computations for information criteria (these are ā€œextraā€, in that they are not necessary for model estimation)ā€¦

I would not want to report the estimates from the short chains used below. The posterior means are fairly stable, but I doubt that information criteria or other tail metrics are stable. The short chains illustrate an issue with timing comparisons, which is that it depends on sampling efficiency. Stan is often more efficient, which means you can use fewer warmup/sampling iterations than other software. Sometimes people would use effective sample size per second for comparison, but you also want to ensure that the same effective sample size metric is being used across all software.

library(blavaan)
library(future); plan("multicore")

load('data.rdata')

z <- data.frame(data$z)
## some easy names for typing:
names(z) <- paste0("y", 1:26)

m1syn <- '
  eta =~ y1 + y2
  xi1 =~ y3 + y4 + y5 + y6 + y7
  xi2 =~ y10 + y11 + y12 + y13 + y14 + y15
  xi3 =~ y16 + y17 + y18
  xi4 =~ y19 + y20 + y21 + y22 + y23 + y24 + y25 + y26

  eta ~ xi1 + xi2 + xi3 + xi4 '

fit <- bsem(m1syn, data = z, burnin = 100, sample = 100, ordered = TRUE,
            dp = dpriors(lambda = "normal(1, .5)",
                         beta = "normal(.5, .5)",
                         psi = "gamma(10, 8)[sd]"),
            bcontrol = list(cores = 3))

Thanks for your feedback. I have some questions though. The model in my previous post is as close of a translation of the BUGS code in my original post. It does not have less variables than the Stan model, but still it will more than 10 times faster in BUGS 250-300 times faster in JAGS. It is therefore important for me to understand why my adaptation of it is so much slower in Stan. You commented I should use less parameters, and ditching either y, or eta and xi. I am not sure whether I understand what you mean by that. Do you mean I should declare them as local variable within the model instead? Or did you mean these variables can be dropped from the model altogether? I completely agree that at least y should be dropped as parameter, since itā€™s only there for fitting purposes. As for the latent variables eta and xi, theyā€™re strictly not necessary as monitored parameters either. However, I do not see how I can remove them from the model entirely.

Now, defining these three latent variables locally in the model, similar to what is described in chapter 35.5 in the Userā€™s Guide, seems possible. However, the problem I run into if I do is that if I define any of these variables locally and not as parameters, I get the following error returned when attempting to fit the model:

Exception: normal_lpdf: Random variable is nan, but must be not nan!

According to this answer to a similar problem:

So, the question becomes: If I am not to specify the aforementioned latent variable as parameters, how do I assign values to them?

Anyway, I have rewritten the model as described below. It runs in ~45 min now, so it is more efficient than the original model. Suggested alternative local variable declarations of y, eta and xi are listed as comments.

data{
	int<lower=0> N; // number of observations
    int<lower=1> P; // number of variables
    array[N, P] int<lower=1, upper=5> z; // observed data
	array[P] ordered[6] thd; // Thresholds
}

transformed data {
   	cov_matrix[4] R = diag_matrix(rep_vector(8.0, 4)); // prior covariance matrix for xi
	vector[4] u = rep_vector(0, 4); // prior mean for xi
    array[N] vector[P] L;
    array[N] vector[P] U;
    for(i in 1:N){
		for(j in 1:P){
			L[i,j] = thd[j, z[i, j]];
            U[i,j] = thd[j, z[i, j] + 1];
		}
    }
}

parameters {
	array[21] real lam; // pattern coefficients
	row_vector[4] gam; // coefficients
	vector<lower=0>[P] psi; // precisions of y
	real<lower=0> psd; // precision of eta
	cov_matrix[4] phi;
    array[N] vector<lower=L, upper=U>[P] y;
    array[N] vector[4] xi; // latent variables
    vector[N] eta; // latent variables
	
}

transformed parameters {
    
}

model {
	// Local variables
    vector[P] mu;
    // vector[4] xi; // latent variables
    vector[P] sgm = 1/psi;
    real sgd = 1/psd;
    matrix[4, 4] phx = inverse(phi);

    psi ~ gamma(10,8); //priors on precisions
    
	//priors on loadings and coefficients
    lam[1] ~ normal(0.8, 4.0 * psi[2]);
    lam[2] ~ normal(0.8, 4.0 * psi[4]);
    lam[3] ~ normal(0.8, 4.0 * psi[5]);
    lam[4] ~ normal(0.8, 4.0 * psi[6]);
    lam[5] ~ normal(0.8, 4.0 * psi[7]);
    lam[6] ~ normal(0.8, 4.0 * psi[8]);
    lam[7] ~ normal(0.8, 4.0 * psi[9]);
    lam[8] ~ normal(0.8, 4.0 * psi[11]);
    lam[9] ~ normal(0.8, 4.0 * psi[12]);
    lam[10] ~ normal(0.8, 4.0 * psi[13]);
    lam[11] ~ normal(0.8, 4.0 * psi[14]);
    lam[12] ~ normal(0.8, 4.0 * psi[15]);
    lam[13] ~ normal(0.8, 4.0 * psi[17]);
    lam[14] ~ normal(0.8, 4.0 * psi[18]);
    lam[15] ~ normal(0.8, 4.0 * psi[20]);
    lam[16] ~ normal(0.8, 4.0 * psi[21]);
    lam[17] ~ normal(0.8, 4.0 * psi[22]);
    lam[18] ~ normal(0.8, 4.0 * psi[23]);
    lam[19] ~ normal(0.8, 4.0 * psi[24]);
    lam[20] ~ normal(0.8, 4.0 * psi[25]);
    lam[21] ~ normal(0.8, 4.0 * psi[26]);
    
    psd ~ gamma(10,8); //priors on precisions
    gam ~ normal(0.6, 4 * psd);
    
    phi ~ wishart(30, R); //priors on precisions
    
    for(i in 1:N){

        xi[i] ~ multi_normal(u, phi);

        // structural equation model
        real nu;
        nu = gam * xi[i];
        // real eta;
        eta[i] ~ normal(nu, psd);
        real dthat = eta[i] - nu;
        

        //measurement equation model
        
        mu[1] = eta[i];
        mu[2] = lam[1] * eta[i];
        mu[3] = xi[i, 1];
        mu[4] = lam[2] * xi[i, 1];
        mu[5] = lam[3] * xi[i, 1];
        mu[6] = lam[4] * xi[i, 1];
        mu[7] = lam[5] * xi[i, 1];
        mu[8] = lam[6] * xi[i, 1];
        mu[9] = lam[7] * xi[i, 1];
        mu[10] = xi[i, 2];
        mu[11] = lam[8] * xi[i, 2];
        mu[12] = lam[9] * xi[i, 2];
        mu[13] = lam[10] * xi[i, 2];
        mu[14] = lam[11] * xi[i, 2];
        mu[15] = lam[12] * xi[i, 2];
        mu[16] = xi[i, 3];
        mu[17] = lam[13] * xi[i, 3];
        mu[18] = lam[14] * xi[i, 3];
        mu[19] = xi[i, 4];
        mu[20] = lam[15] * xi[i, 4];
        mu[21] = lam[16] * xi[i, 4];
        mu[22] = lam[17] * xi[i, 4];
        mu[23] = lam[18] * xi[i, 4];
        mu[24] = lam[19] * xi[i, 4];
        mu[25] = lam[20] * xi[i, 4];
        mu[26] = lam[21] * xi[i, 4];

		for(j in 1:P){
            // real y;
            y[i, j] ~ normal(mu[j], psi[j])T[L[i, j], U[i, j]];
            real ephat;
            ephat = y[i, j] - mu[j];
            // ephat[i, j] = y[i, j] - mu[j];
        }	
        
	} // end of i
} //end of model

Regarding the blavaan model.

While this program certainly would certainly be helpful for fitting this model, I am currently trying to learn how to write efficient SEMs in Stan for training purposes. Thereā€™s not a considerable amount of literature on the topic of Bayesian SEM, but I found the works of Lee (2007) and Song & Lee (2012), which provides code for how to fit SEMs in WinBUGS. Ultimately, I want to estimate a larger and more complex model, which, unfortunately, is beyond the the current capabilities of blavaan.

I donā€™t think you want to declare latent variables locally. Instead, you can marginalize them out of the likelihood using properties of the multivariate normal distribution. This is different from what you would do in WinBUGS, but it really helps when using Stan. I will try to distill the approach into a small Stan file for this model, once I can find a bit of free time.

1 Like

Thanks, a code example would have been most helpful. I understand you must have more pressing matters occupying your time, so I am very grateful for your efforts to help me.

Is phi a precision matrix or a covariance matrix? It looks like youā€™re using multi_normal so it should be a covariance but thatā€™s not how itā€™s specified.

Pull the multivariate normal out of the loop.

xi ~ multi_normal(u, phi);

Is valid as the multi normal can take an array of reals.

Itā€™s faster and more stable to use multi_normal_cholesky. This avoids the cholesky factorization and the inversion you were doing.

L_phi ~ inv_wishart_cholesky(30, R); //priors on cholesky factor of covariance
xi ~ multi_normal_cholesky(u, L_phi)

Then in this loop

		for(j in 1:P){
            // real y;
            y[i, j] ~ normal(mu[j], psi[j])T[L[i, j], U[i, j]];
            real ephat;
            ephat = y[i, j] - mu[j];
            // ephat[i, j] = y[i, j] - mu[j];
        }	

can be re written as

  y[i] ~ normal(mu, psi);	

because you have already declared the constraints on y the truncations are happening implicitly on the normal so itā€™s redundant to add them.

In the original BUGS model it is listed under #priors on precisions, but, I agree, it should be a covariance matrix, since a multinormal distribution require one, while the precision matrix being its inverse, phx. Can you elaborate what you mean I have not specified phi as a covariance matrix?

(Also, since the precision variables serve no obvious function in the structural equation model, Iā€™d be interested to know if Stan requires these precision variables, sgm, sgd and phx, at all, or can I simply remove them?)

Thanks for showing me how to vectorize xi and y, Iā€™ve been trying to figure out how to do that. I was unaware I could vectorize an array of vectors. I have to ask though: You said that

It isnā€™t immediately clear to me how I am doing Cholesky factorization and inversion. Isnā€™t this what inv_wishart_cholesky() and multi_normal_cholesky() is supposed to do? (ā€¦and wouldnā€™t Cholesky factorization make it more computationally efficient?)

As for the vectorization of the ā€˜y-loopā€™, the ephat variable is now gone. Like the precision variables, it serves no obvious purpose in the model. Is ephat no longer necessary? Could I likewise get rid of dthat?

Anyway, I tried implementing your suggestions, however the model is now much slower than any of the previous versions. Itā€™s been >90 min now, and it is still running. Obviously some changes to the model arenā€™t optimal. Any idea which?

The _cholesky distributions are parameterized in terms of a cholesky factorization. You were using the standard forms that take in a covariance matrix but to calculate the lpdf there is still a matrix inversion which uses a cholesky factorization. Providing the cholesky factorization directly allows this step to be skipped. Further, the cholesky version of mvn has derivatives directly instead of auto diff which is faster.

Your matrix is only 4 x 4 so itā€™s not much of a speedup but your code originally was doing it for each N.

As for the slowdown, Iā€™m not sure why. Itā€™s possible I misunderstood something in your code or weā€™ve introduced an unintended error.

Are you able to post the data or simulated data and the expected results? It will be easier to see whatā€™s going on with that available.

There is a link to an rdata file in the original post.

I worked on this model a bit yesterday. I donā€™t yet have everything working, but one big issue is that the psi parameters are not identified and should be fixed to 1 (it is a multivariate probit model).

1 Like

As @edm pointed out thereā€™s an Rdata file in the OP below the R code. It shouldnā€™t be many differences in the R code from the OP Stan model to the one I posted later, which should correspond better to the original BUGS model. The main difference in the R code between the former and latter is that the thresholds is modeled as data in the latter with the original six-column threshold matrix from the BUGS model:

thd <- matrix(
        c(
            -200.000,-2.517,-1.245,-0.444, 0.848,200.000,
            -200.000,-1.447,-0.420, 0.119, 1.245,200.000,
            -200.000,-1.671,-0.869,-0.194, 0.679,200.000,
            -200.000,-1.642,-0.869,-0.293, 0.332,200.000,
            -200.000,-1.671,-0.827, 0.052, 0.756,200.000,
            -200.000,-1.769,-1.098,-0.469, 0.255,200.000,
            -200.000,-1.490,-0.670,-0.082, 0.880,200.000,
            -200.000,-1.933,-0.880,-0.317, 1.008,200.000,
            -200.000,-1.587,-0.624, 0.000, 1.008,200.000,
            -200.000,-1.983,-1.348,-0.348, 1.045,200.000,
            -200.000,-1.983,-1.229,-0.247, 0.869,200.000,
            -200.000,-2.262,-1.426, 0.037, 1.330,200.000,
            -200.000,-2.371,-1.295,-0.224, 0.651,200.000,
            -200.000,-2.039,-1.112,-0.149, 1.169,200.000,
            -200.000,-2.262,-1.198,-0.309, 1.198,200.000,
            -200.000,-2.176,-1.537,-0.717, 0.597,200.000,
            -200.000,-1.447,-0.786, 0.119, 1.008,200.000,
            -200.000,-2.039,-1.769,-0.661, 0.642,200.000,
            -200.000,-2.262,-1.468, 0.015, 1.214,200.000,
            -200.000,-2.039,-1.406, 0.000, 1.140,200.000,
            -200.000,-1.702,-1.058, 0.149, 0.902,200.000,
            -200.000,-2.262,-1.426,-0.309, 0.971,200.000,
            -200.000,-1.702,-0.615, 0.179, 1.229,200.000,
            -200.000,-2.262,-1.671,-1.033, 0.420,200.000,
            -200.000,-2.262,-1.468,-0.689, 1.045,200.000,
            -200.000,-2.176,-1.537,-0.880, 0.661,200.000
        ),
        nrow = 26,
        ncol = 6,
        byrow = TRUE
    )

As for results, Lee (2007) presented the following estimate of the model:

Initial values of the psi parameters are loaded as 1 and 0.5 for chain one and two, respectively. Are you saying psi should remain constant?

I donā€™t have time to dive into the paper or really understand this model. I donā€™t know if you intended for those parameters with comments about precisions are supposed to be or not. I turned them into inv_gamma priors but you can easily change those back. I removed the wishart prior and just used lkj on the correlation.

This runs on my 2017 mac in 4 min (200 wu, 200 sampling) without any indication of fit issues. Itā€™s possible Iā€™ve introduced errors because I donā€™t fully understand the model.

data{
	int<lower=0> N; // number of observations
    int<lower=1> P; // number of variables
    array[N, P] int<lower=1, upper=5> z; // observed data
	array[P] ordered[6] thd; // Thresholds
}

transformed data {
   	cov_matrix[4] R = diag_matrix(rep_vector(8.0, 4)); // prior covariance matrix for xi
	vector[4] u = rep_vector(0, 4); // prior mean for xi
    array[N] vector[P] L;
    array[N] vector[P] U;
    for(i in 1:N){
		for(j in 1:P){
			L[i,j] = thd[j, z[i, j]];
            U[i,j] = thd[j, z[i, j] + 1];
		}
    }
}

parameters {
	array[21] real lam; // pattern coefficients
	row_vector[4] gam; // coefficients
	vector<lower=0>[P] psi; // precisions of y
	real<lower=0> psd; // precision of eta
	cholesky_factor_corr[4] L_phi;
  array[N] vector<lower=L, upper=U>[P] y;
  array[N] vector[4] xi; // latent variables
  vector[N] eta; // latent variables
	
}
model {
	// Local variables
    vector[P] mu;
    psi ~ inv_gamma(10,8); //priors on precisions
    
	//priors on loadings and coefficients
    lam[1] ~ normal(0.8, 4.0 * psi[2]);
    lam[2] ~ normal(0.8, 4.0 * psi[4]);
    lam[3] ~ normal(0.8, 4.0 * psi[5]);
    lam[4] ~ normal(0.8, 4.0 * psi[6]);
    lam[5] ~ normal(0.8, 4.0 * psi[7]);
    lam[6] ~ normal(0.8, 4.0 * psi[8]);
    lam[7] ~ normal(0.8, 4.0 * psi[9]);
    lam[8] ~ normal(0.8, 4.0 * psi[11]);
    lam[9] ~ normal(0.8, 4.0 * psi[12]);
    lam[10] ~ normal(0.8, 4.0 * psi[13]);
    lam[11] ~ normal(0.8, 4.0 * psi[14]);
    lam[12] ~ normal(0.8, 4.0 * psi[15]);
    lam[13] ~ normal(0.8, 4.0 * psi[17]);
    lam[14] ~ normal(0.8, 4.0 * psi[18]);
    lam[15] ~ normal(0.8, 4.0 * psi[20]);
    lam[16] ~ normal(0.8, 4.0 * psi[21]);
    lam[17] ~ normal(0.8, 4.0 * psi[22]);
    lam[18] ~ normal(0.8, 4.0 * psi[23]);
    lam[19] ~ normal(0.8, 4.0 * psi[24]);
    lam[20] ~ normal(0.8, 4.0 * psi[25]);
    lam[21] ~ normal(0.8, 4.0 * psi[26]);
    
    psd ~ inv_gamma(10,8); //priors on precisions
    gam ~ normal(0.6, 4 * psd);
    
    L_phi ~ lkj_corr_cholesky(1);
   // L_phi ~ inv_wishart_cholesky(30, R); //priors on precisions
    xi ~ multi_normal_cholesky(u, L_phi);
    
    for(i in 1:N){
        // structural equation model
        real nu = gam * xi[i];
        // real eta;
        eta[i] ~ normal(nu, psd);
        real dthat = eta[i] - nu;
        

        //measurement equation model
        
        mu[1] = eta[i];
        mu[2] = lam[1] * eta[i];
        mu[3] = xi[i, 1];
        mu[4] = lam[2] * xi[i, 1];
        mu[5] = lam[3] * xi[i, 1];
        mu[6] = lam[4] * xi[i, 1];
        mu[7] = lam[5] * xi[i, 1];
        mu[8] = lam[6] * xi[i, 1];
        mu[9] = lam[7] * xi[i, 1];
        mu[10] = xi[i, 2];
        mu[11] = lam[8] * xi[i, 2];
        mu[12] = lam[9] * xi[i, 2];
        mu[13] = lam[10] * xi[i, 2];
        mu[14] = lam[11] * xi[i, 2];
        mu[15] = lam[12] * xi[i, 2];
        mu[16] = xi[i, 3];
        mu[17] = lam[13] * xi[i, 3];
        mu[18] = lam[14] * xi[i, 3];
        mu[19] = xi[i, 4];
        mu[20] = lam[15] * xi[i, 4];
        mu[21] = lam[16] * xi[i, 4];
        mu[22] = lam[17] * xi[i, 4];
        mu[23] = lam[18] * xi[i, 4];
        mu[24] = lam[19] * xi[i, 4];
        mu[25] = lam[20] * xi[i, 4];
        mu[26] = lam[21] * xi[i, 4];

        y[i] ~ normal(mu, psi);	

	} // end of i
} //end of model