Hi!

I am still working with the data from my previous question:

In summary my data is vector `Y`

of Bernoulli variables that are linked to a parameter matrix `theta`

such that `P(Y[v] = 1) = Phi(theta[jj[v],ll[v]] - theta[jj[v],rr[v]])`

where `ll`

and `rr`

index the item on the left or the right, respectively, and `jj`

indexes the respondent. In my previous question I was trying to infer a univariate normal distribution for each column of `theta`

.

Now I am making things much harder for myself by attempting to cluster items and respondents using low-rank matrix factorization. I am aware of the non-identifiability/multimodality problem, and I am a novice when it comes to solving it, but I am trying anyway. I have been studying Rick Farouni's great posts on identification and latent factor analysis, and Michael Betancourt's post on identifiability in mixture models.

I first tried ordering the priors on the latent factors, but it was obvious that this was not sufficient. Next, by analogy to Farouni's approach, I am constraining the square block of each factor matrix to be lower triangular with positive diagonals and initializing the chains from the MLE estimate, but different chains are still heading to different modes and NUTS is having trouble.

Is there anything else worth trying or is the model too pathological?

If it's too pathological then I might give Farouni's LF model a shake.

Thanks for your help!

Edit: This morning I'm thinking about the attempt below. My intuition is that 1. constraining both U and W doesn't provide much over just constraining W, and 2. I likely need some constraints on the priors.

My code is below, and an R script to generate simulated data and fit the model on it is attached: simulate_LF.R (2.7 KB)

```
data{
// number of latent factors
int<lower=1> R;
// number of respondents; must be at least R so the last R rows of U can
// hold the triangular identification block (the original bound R-1 would
// allow J-R = -1, a negative array size at the U_raw declaration)
int<lower=R> J;
// number of rows / responses
int<lower=1> V;
// number of items; must be at least R (same reasoning as for J)
int<lower=R> K;
// item shown on the left / on the right for each recorded response
int<lower=1, upper=K> ll[V];
int<lower=1, upper=K> rr[V];
// respondent who produced each recorded response
int<lower=1, upper=J> jj[V];
// recorded votes 1 / 0
int<lower=0,upper=1> y_vec[V];
// hyper prior for the distribution of lambda_U and lambda_V
// Salakhutdinov & Mnih use df=V0=R, sigma=I
cov_matrix[R] W0;
// hyperprior for the mean of lambda_U and lambda_V
// Salakhutdinov & Mnih use 0
vector<lower=0>[R] mu0;
}
transformed data {
// Wishart degrees of freedom, fixed at R as in Salakhutdinov & Mnih
real<lower=0> V0;
// R x R identity (kept for reference to the sigma=I hyperprior above)
cov_matrix[R] I;
vector[R] zero_vec;
// number of strictly-lower-triangular entries in an R x R matrix
int L_entries;
// cumulative offsets into the packed lower-triangle vectors:
// column r of the triangle occupies entries L_lengths[r]+1 .. L_lengths[r+1]
// (R - r entries), with L_lengths[1] = 0 and L_lengths[R+1] = L_entries
int L_lengths[R+1];
zero_vec = rep_vector(0, R);
V0 = R;
L_entries = R * (R - 1) / 2;
// idiomatic identity construction instead of a hand-rolled column loop
I = diag_matrix(rep_vector(1, R));
for (r in 1:R) {
L_lengths[r] = (r - 1) * R - (r - 1) * r / 2;
}
L_lengths[R+1] = L_entries;
}
parameters{
// squared diagonals of the Cholesky factors L_U / L_W (Bartlett-style)
vector<lower=0>[R] c_U;
vector<lower=0>[R] c_W;
// packed strictly-lower-triangular entries of L_U / L_W
vector[L_entries] z_U;
vector[L_entries] z_W;
// priors for the factor means are ordered to force identifiability
vector[R] mu_U_raw;
vector[R] mu_W_raw;
// non-centered respondent factors: the first J-R rows are free, the last R
// are built from the constrained triangle (U_raw_lower_tri + U_diag) below
row_vector[R] U_raw[J-R];
row_vector[L_entries] U_raw_lower_tri;
vector<lower=0>[R] U_diag;
// try constraining W to have a triangle
// non-centered item factors: first K-R columns free, last R from the
// constrained triangle (W_raw_lower_tri + W_diag)
vector[R] W_raw[K-R];
vector[L_entries] W_raw_lower_tri;
vector<lower=0>[R] W_diag;
// per-cell residual on top of the low-rank product U*W
matrix[J,K] theta_raw;
}
transformed parameters{
// Cholesky factors of the factor covariances (lower triangular)
matrix[R,R] L_U;
matrix[R,R] L_W;
// identification blocks: W_sqr is lower triangular with positive diagonal;
// U_sqr is its row-space analogue (upper triangular with positive diagonal,
// since U factors live in rows and multiply L_U from the left)
matrix[R,R] W_sqr;
matrix[R,R] U_sqr;
row_vector[R] U[J];
vector[R] W[K];
vector[R] mu_U;
vector[R] mu_W;
matrix[J,K] theta;
// fill in the L matrices
// Each pass fills "column r" of the packed triangles: the slice
// L_lengths[r]+1 .. L_lengths[r+1] holds its R - r sub-diagonal entries.
// Empty ranges (1:0 at r=1, r+1:R at r=R) are valid no-ops in Stan.
for (r in 1:R){
U_sqr[r,1:r-1] = rep_row_vector(0,r-1);
U_sqr[r,r] = U_diag[r];
U_sqr[r,r+1:R] = U_raw_lower_tri[L_lengths[r] + 1: L_lengths[r+1]];
W_sqr[1:r-1,r] = rep_vector(0,r-1);
W_sqr[r,r] = W_diag[r];
W_sqr[r+1:R,r] = W_raw_lower_tri[L_lengths[r] + 1: L_lengths[r+1]];
L_U[1:r-1,r] = rep_vector(0,r-1);
// sqrt of a chi-square draw on the diagonal: Bartlett decomposition
L_U[r,r] = sqrt(c_U[r]);
L_U[r+1:R,r] = z_U[L_lengths[r] + 1: L_lengths[r+1]];
L_W[1:r-1,r] = rep_vector(0,r-1);
L_W[r,r] = sqrt(c_W[r]);
L_W[r+1:R,r] = z_W[L_lengths[r] + 1: L_lengths[r+1]];
}
// non-centered means: mu_X = mu0 + L_X * mu_X_raw
mu_U = mu0 + L_U * mu_U_raw;
mu_W = mu0 + L_W * mu_W_raw;
// first K-R items / J-R respondents get unconstrained factors...
for (k in 1:K-R) W[k] = mu_W + L_W*W_raw[k];
for (j in 1:J-R)U[j] = mu_U' + U_raw[j]*L_U;
// ...the final R get the triangular identification blocks.
// NOTE(review): multiplying by L_W / L_U here means the REALIZED last-R
// factors are no longer triangular — only the raw blocks are; confirm
// this is the intended identification scheme.
for (r in 1:R){
W[K-R+r] = mu_W + L_W*W_sqr[1:R,r];
U[J-R+r] = mu_U' + U_sqr[r,1:R]*L_U;
}
// latent utility: low-rank product plus per-cell residual
for (j in 1:J) {
for (k in 1:K) theta[j,k] = U[j] * W[k] + theta_raw[j,k];
}
}
model{
// TODO: consider using LKJ priors (STAN Recommended) instead of normal-wishart
vector [V] p;
// z and c are the prior on the lower entries of the cholesky matrix
z_U ~ normal(0,1);
z_W ~ normal(0,1);
// NOTE(review): the standard Bartlett decomposition uses df - r + 1 on the
// diagonal chi-squares; the extra "+ 2" here may be deliberate — confirm
// against the intended Wishart parameterization.
for (r in 1:R){
c_U[r] ~ chi_square(V0 + 2 - r + 1);
c_W[r] ~ chi_square(V0 + 2 - r + 1);
}
//mu_X_raw is the prior on mu_X
mu_U_raw ~ normal(0,1);
mu_W_raw ~ normal(0,1);
for(j in 1:J-R) U_raw[j] ~ normal(0,1);
for(k in 1:K-R) W_raw[k] ~ normal(0,1);
W_raw_lower_tri ~ normal(0,1);
// half-normal: W_diag / U_diag are declared <lower=0>
W_diag ~ normal(0,1);
U_raw_lower_tri ~ normal(0,1);
U_diag ~ normal(0,1);
// BUG FIX: the original looped `for(k in 1:K) theta_raw[k] ~ normal(0,1);`,
// indexing the ROWS of the J x K matrix theta_raw with the column count K —
// an out-of-range error when K > J, and rows K+1..J left without a prior
// when K < J. Place the intended iid std-normal prior on every entry:
to_vector(theta_raw) ~ normal(0, 1);
// response probability: Phi of the left-minus-right utility difference
for (v in 1:V) p[v] = Phi_approx(theta[jj[v],ll[v]] - theta[jj[v],rr[v]]);
y_vec ~ bernoulli(p);
}
```