Estimating a half-decided matrix (code attached)

I am interested in estimating the first k = 4 columns of H (which I call H_star); columns 5, 6, and 7 of H (which I call Q) are already decided, so H <- append_col(H_star, Q);. I learned through a series of earlier questions that Stan cannot handle an H filled directly with 0s and 1s, so I compromised and used beta(0.0001, 0.0001) as a prior for H. That is, I do not require the estimates in H to be exactly 0 or 1; for example, 0.98 and 0.05 are acceptable, and I will deal with that later. For now I just want an easy way to check my method.
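To see why that prior is a reasonable stand-in for a binary entry, here is a quick check (my own illustration, not part of the model): with such tiny shape parameters, almost all of the beta mass sits next to 0 and 1.

# beta(0.0001, 0.0001) is essentially a coin flip between "near 0" and "near 1",
# so it acts as a soft relaxation of a 0/1 entry.
set.seed(1)
draws <- rbeta(10000, 0.0001, 0.0001)
mean(draws < 0.01 | draws > 0.99)  # very close to 1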

I asked this question before, and bgoodri helped me, but I am still not sure how to do it, so I figured it would be better to post my code here.

# This is simulated data
library(MASS)
######## Set up parameters #####
p <- 10; q <- 9; D <- p + q
k <- 4; k1 <- 2; k2 <- 1; K <- k + k1 + k2
n <- 100
######### Generate Z ########
mu_z <- rep(0,K)
Sigma_z <- diag(K)  # K x K identity covariance
Z <- mvrnorm(n, mu_z, Sigma_z)
cor(Z)
######### Generate H ########
set.seed(11)
H <- matrix(nrow = D, ncol = K)
for (j in 1:k)          H[, j] <- rbinom(D, 1, 2/(j + 2))  # free columns, to be estimated
for (j in (k+1):(k+k1)) H[, j] <- c(rep(1, p), rep(0, q))  # fixed: x-block active
for (j in (k+k1+1):K)   H[, j] <- c(rep(0, p), rep(1, q))  # fixed: y-block active

## Generate E ##
set.seed(22)
E_mux <- sort(runif(K, min = 1, max = 5), decreasing = TRUE)
E_mux
e_x <- runif(K, min = 1, max = 7)
E_sigmax <- diag(e_x)  # diagonal covariance
E_sigmax
E_x <- mvrnorm(p, E_mux, E_sigmax)
E_x
set.seed(33)
E_muy <- sort(runif(K, min = 1, max = 5), decreasing = TRUE)
E_muy
e_y <- runif(K, min = .01, max = .02)
E_sigmay <- diag(e_y)  # diagonal covariance
E_sigmay
E_y <- mvrnorm(q, E_muy, E_sigmay)
E_y
E <- rbind(E_x, E_y)

## Generate Phi ##
Phi <- E * H  # element-wise: H masks which entries of E are active
Phi

## Generate Theta ##
Theta <- Z%*%t(Phi)
Theta_x <- Theta[,1:p]
Theta_y <- Theta[,(p+1):D]
dim(Theta_x)
dim(Theta_y)
head(Theta_x)


### Generate X-related parameters ###
X_p <- exp(Theta_x) / (1 + exp(Theta_x))  # inverse logit, element-wise

set.seed(44)
X <- matrix(rbinom(n * p, size = 2, prob = X_p), nrow = n, ncol = p)


### Generate Y-related parameters ###
Y_mu <- Theta_y  # the normal means are just Theta_y

set.seed(55)
Y <- matrix(rnorm(n * q, mean = Y_mu, sd = 1/2), nrow = n, ncol = q)

#### The fixed columns Q (the known part of H) ####
Q <- matrix(nrow = D, ncol = k1 + k2)
for (j in 1:k1)           Q[, j] <- c(rep(1, p), rep(0, q))
for (j in (k1+1):(k1+k2)) Q[, j] <- c(rep(0, p), rep(1, q))
################################################################

# This is Stan code 
library(rstan)
library(parallel)
library(coda)
SBECCA_mod9 <- '
data {
  int<lower=1> n;
  int<lower=1> p;
  int<lower=1> q;
  int<lower=1> k;
  int<lower=1> k1;
  int<lower=1> k2;
  int<lower=1> D;
  int<lower=1> K;
  int<lower=0, upper=2> X[n,p];
  real Y[n,q];
  vector[K] mu_z;
  cov_matrix[K] Sigma_z;
  matrix<lower=0,upper=1>[D,k1+k2] Q;
}
parameters {
  matrix<lower=0,upper=1>[D,k] H_star;
  matrix[D,K] E;
  matrix[n,K] Z;
  real<lower=0> alpha[K]; // top part of E
  real<lower=0> beta[K];  // bottom part of E
  real<lower=0> sigma[q]; // for modeling data Y; Y is normally distributed
}

transformed parameters {
  matrix<lower=0,upper=1>[D,K] H;
  matrix[D,K] Phii;
  matrix[n,D] Theta;
  matrix[n,p] Theta_x;
  matrix[n,q] Theta_y;
  matrix[n,p] p_x;
  matrix[n,q] mu_y;

  H <- append_col(H_star, Q);
  Phii <- E .* H;
  Theta <- Z * transpose(Phii);

  for (i in 1:n) {
    for (j in 1:p)
      Theta_x[i,j] <- Theta[i,j];
    for (l in 1:q)
      Theta_y[i,l] <- Theta[i, p+l];
  }

  for (i in 1:n)
    for (j in 1:p)
      p_x[i,j] <- exp(Theta_x[i,j]) / (1 + exp(Theta_x[i,j]));

  for (i in 1:n)
    for (l in 1:q)
      mu_y[i,l] <- Theta_y[i,l];
}
model {
  for (i in 1:n)
    Z[i] ~ multi_normal(mu_z, Sigma_z);

  for (j in 1:K) {
    alpha[j] ~ inv_gamma(0.001, 0.001);  // vague hyperpriors (placeholder choice)
    beta[j] ~ inv_gamma(0.001, 0.001);
  }

  // ---- prior for H ----
  for (i in 1:D)
    for (j in 1:k)
      H_star[i,j] ~ beta(.0001, .0001);

  // ---- priors for E ----
  for (j in 1:K)
    for (i in 1:p)
      E[i,j] ~ normal(0, alpha[j]);

  for (j in 1:K)
    for (l in 1:q)
      E[l,j] ~ normal(0, beta[j]);

  for (j in 1:q)
    sigma[j] ~ inv_gamma(0.01, 0.01);

  // ---- generate data ----
  for (i in 1:n) {
    for (j in 1:p)
      X[i,j] ~ binomial(2, p_x[i,j]);
    for (l in 1:q)
      Y[i,l] ~ normal(mu_y[i,l], sigma[l]);
  }
}
'
SBECCA_dat9<-list("X"=X,"Y"=Y, "mu_z" = rep(0,K), "Sigma_z" = diag(rep(1,K)),"Q"= Q, "n"=n,"p"=p,"q"=q,"k"=k, "k1"=k1, "k2"=k2, "D"= D, "K"=K)
happyfit9 <- stan(model_code = SBECCA_mod9, data = SBECCA_dat9, thin=3, iter = 300, chains =2)
The code runs for a while, but H_star comes out with all of its columns identical:
                     [,1]                [,2]                [,3]                [,4]
 [1,] 0.76159433090304696 0.76159433090304696 0.76159433090304696 0.76159433090304696
 [2,] 0.00000252762182439 0.00000252762182439 0.00000252762182439 0.00000252762182439
 [3,] 0.00000562992396341 0.00000562992396341 0.00000562992396341 0.00000562992396341
 [4,] 0.00075608847317560 0.00075608847317560 0.00075608847317560 0.00075608847317560
 [5,] 0.49986117444682288 0.49986117444682288 0.49986117444682288 0.49986117444682288
 [6,] 0.99989450015743730 0.99989450015743730 0.99989450015743730 0.99989450015743730
 [7,] 0.00119168150027528 0.00119168150027528 0.00119168150027528 0.00119168150027528
 [8,] 0.02542762521751423 0.02542762521751423 0.02542762521751423 0.02542762521751423
 [9,] 0.00351209957327188 0.00351209957327188 0.00351209957327188 0.00351209957327188
[10,] 0.48595862199768552 0.48595862199768552 0.48595862199768552 0.48595862199768552
[11,] 0.99954809916464737 0.99954809916464737 0.99954809916464737 0.99954809916464737
[12,] 0.95988290042075930 0.95988290042075930 0.95988290042075930 0.95988290042075930
[13,] 0.50648686982941016 0.50648686982941016 0.50648686982941016 0.50648686982941016
[14,] 0.00000000008389969 0.00000000008389969 0.00000000008389969 0.00000000008389969
[15,] 0.46078465190553419 0.46078465190553419 0.46078465190553419 0.46078465190553419
[16,] 0.48997977697870004 0.48997977697870004 0.48997977697870004 0.48997977697870004
[17,] 0.01295566966101201 0.01295566966101201 0.01295566966101201 0.01295566966101201
[18,] 0.18830439979959190 0.18830439979959190 0.18830439979959190 0.18830439979959190
[19,] 0.00000029466150263 0.00000029466150263 0.00000029466150263 0.00000029466150263

Why is that, and how can I fix it?


Hi ENFP99 -

You could invert the problem and model the observed values in H (for each row) as emerging from the label. Here, you’d have 2^ncol(H_star) labels. (If ncol(H_star) is 4, you’d have 2^4 = 16 possible labels: label 1 = (0, 0, 0, 1), label 2 = (0, 0, 1, 0), label 3 = (0, 0, 1, 1), etc.)
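For concreteness, here is one way to build that label set in R (a sketch of mine, not code from the posts linked below):

# Enumerate all 2^4 = 16 binary patterns a row of H_star could take,
# so each row can be treated as a single categorical label.
k <- 4
patterns <- as.matrix(expand.grid(rep(list(0:1), k)))  # 16 x 4, one pattern per row
# Map an observed row of H_star to its label index:
row_to_label <- function(h) which(apply(patterns, 1, function(pat) all(pat == h)))
row_to_label(c(0, 0, 1, 1))  # index of the pattern (0, 0, 1, 1)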

I show here and here that this approach can work extremely well. In the first one, I provide Stan code to generate classifications for 1000 labels with a training set of 4500 observations and only 30 binary features. In the hold-out set it has predictive accuracy of ~43%.

The second example has 30k labels being predicted using 500 features and 90k training examples. It doesn’t use Stan, only very basic linear algebra and dplyr. It gets predictive accuracy in the hold-out set of around 95%.

I refactored your model into a more readable (to me) form. I still don’t understand what it’s doing, though.

functions {
  // Convert each row of a matrix to a vector so multi_normal can be vectorized.
  // (A straightforward row-by-row copy; a built-in would be more efficient.
  // We should add one to Stan.)
  vector[] mat_to_array(matrix x) {
    vector[cols(x)] y[rows(x)];
    for (i in 1:rows(x))
      y[i] = x[i]';
    return y;
  }
}
data {
  int<lower=1> n;
  int<lower=1> p;
  int<lower=1> q;
  int<lower=1> k;
  int<lower=1> k1;
  int<lower=1> k2;
  int<lower=1> D;
  int<lower=1> K;
  int<lower=0, upper=2> X[n,p];
  real Y[n,q];
  vector[K] mu_z;
  cov_matrix[K] Sigma_z;
  matrix<lower=0,upper=1>[D,k1+k2] Q;
}
parameters {
  matrix<lower=0,upper=1>[D,k] H_star;
  matrix[D,K] E;
  matrix[n, K] Z;
  real<lower=0> alpha[K]; // top part of E
  real<lower=0> beta[K];  // bottom part of E
  real<lower=0> sigma[q]; // for modeling data Y; Y is normally distributed
}
transformed parameters {
  matrix<lower=0,upper=1>[D,K] H = append_col(H_star, Q);
  matrix[D,K] Phii  = E .* H;
  matrix[n,D] Theta = Z * Phii';
  matrix[n,p] Theta_x = Theta[ , 1:p];
  matrix[n,q] Theta_y = Theta[ , (p + 1):(p + q)];
  matrix[n,p] p_x = inv_logit(Theta_x);
  matrix[n,q] mu_y = Theta_y;
}
model {
  mat_to_array(Z) ~ multi_normal(mu_z, Sigma_z);
  alpha ~ inv_gamma(1e-3, 1e-3);
  beta ~ inv_gamma(1e-3, 1e-3);
  to_vector(H_star)  ~ beta(1e-4, 1e-4);
  for (j in 1:K)
    E[, j] ~ normal(0, alpha[j]);
  for(j in 1:K)
    E[ , j] ~ normal(0, beta[j]);
  sigma ~ inv_gamma(1e-2, 1e-2);
  for (i in 1:n) {
    X[i] ~ binomial(1, p_x[i]);
    X[i] ~ binomial(2, p_x[i]);
  }
  for(i in 1:n)
    Y[i] ~ normal(mu_y[i], sigma);
}

I didn’t check to see that it compiled. It’s odd to me to see two priors on E and two priors on X. I don’t think these can be right in the original program. (In the original, the second E loop writes E[l,j] for l in 1:q, i.e., rows 1 through q, instead of rows p+1 through p+q, so rows 1 through q get two priors while rows p+1 through D get none.)

It’s really important to vectorize that multi-normal for efficiency. To do that, you need to go back and forth between the usage as a matrix and the usage as an array of vectors, which is what mat_to_array above does.

Also, it would be even better to Cholesky factor Sigma_z as transformed data and use multi-normal Cholesky, so the factorization happens once rather than at every log-density evaluation:

transformed data {
  cholesky_factor_cov[K] L_Sigma_z = cholesky_decompose(Sigma_z);
}
...
  mat_to_array(Z) ~ multi_normal_cholesky(mu_z, L_Sigma_z);

The way to validate whether it works would be to simulate data from the model and see if you can recover the parameters; roughly 50% of the true values should land in their posterior 50% intervals.
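For example, something along these lines (a rough sketch; it treats the simulated E from the R code above as the truth and assumes the fit object is happyfit9):

# Fraction of true E values covered by their 50% posterior intervals;
# for a well-calibrated model this should be close to 0.5.
post  <- summary(happyfit9, pars = "E", probs = c(0.25, 0.75))$summary
truth <- as.vector(E)  # rstan flattens matrices column-major, matching as.vector()
mean(truth >= post[, "25%"] & truth <= post[, "75%"])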