@jsocolar sorry to keep bothering. I managed to implement a model with errors on y that recovers the true parameters well on simulated data, but I’m stuck trying to extend it to y ~ x, with misclassification in both y and x. Here’s one of my attempts (also drawing on Latent Categorical Predictor Variable Model). I’m assuming the error rate is known to be 0.1.
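The log_sum_exp terms below encode the assumption that each observation matches the truth with probability 0.9; for the predictor, for instance,

Pr(x1[n] = 1) = 0.9 * Pr(true x = 1) + 0.1 * Pr(true x = 0)

and likewise for y.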
data {
  int N;                        // number of obs
  int<lower=0, upper=1> y[N];   // observed response
  int<lower=0, upper=1> x1[N];  // observed predictor
}
parameters {
  real intercept;
  real b1;
  vector[N] mu_x;               // latent variable for x
}
transformed parameters {
  vector[N] mu;
  for (n in 1:N) {
    // hard threshold on the latent variable
    if (mu_x[n] > 0)
      mu[n] = intercept + b1;
    else
      mu[n] = intercept;
  }
}
model {
  intercept ~ normal(0, 5);
  b1 ~ std_normal();
  mu_x ~ std_normal();
  for (n in 1:N) {
    // marginalize over misclassification of x (known error rate 0.1)
    if (x1[n] == 1)
      target += log_sum_exp(log1m(0.1) + bernoulli_logit_lpmf(1 | mu_x[n]),
                            log(0.1) + bernoulli_logit_lpmf(0 | mu_x[n]));
    else
      target += log_sum_exp(log1m(0.1) + bernoulli_logit_lpmf(0 | mu_x[n]),
                            log(0.1) + bernoulli_logit_lpmf(1 | mu_x[n]));
    // marginalize over misclassification of y (known error rate 0.1)
    if (y[n] == 1)
      target += log_sum_exp(log1m(0.1) + bernoulli_logit_lpmf(1 | mu[n]),
                            log(0.1) + bernoulli_logit_lpmf(0 | mu[n]));
    else
      target += log_sum_exp(log1m(0.1) + bernoulli_logit_lpmf(0 | mu[n]),
                            log(0.1) + bernoulli_logit_lpmf(1 | mu[n]));
  }
}
I tested this with some simulated data, but it does not sample well (I suspect the hard if (mu_x[n] > 0) threshold makes the posterior discontinuous in mu_x, which HMC can’t handle):
library(purrr)  # for rbernoulli()
set.seed(1232)  # moved up so the whole simulation is reproducible
intercept <- 1
b1 <- 1.3
p_error <- 0.1
## true x1
x1r <- as.numeric(rbernoulli(100, p = 0.6))
errors_y <- rbernoulli(100, p_error)
errors_x <- rbernoulli(100, p_error)
## observed x1 with misclassification
x1 <- x1r
x1[errors_x] <- as.numeric(!x1[errors_x])
## true y is generated from the true predictor, not the misclassified one
mu <- intercept + b1 * x1r
yr <- as.numeric(sapply(brms::inv_logit_scaled(mu), function(x) rbernoulli(1, x)))
## observed y with misclassification
y <- yr
y[errors_y] <- as.numeric(!y[errors_y])
data_m <- list(N = 100, y = y, x1 = x1)
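For completeness, this is how I fit it (a sketch with cmdstanr; “model_attempt1.stan” is just the file I saved the model above to):

library(cmdstanr)
mod <- cmdstan_model("model_attempt1.stan")  # the Stan model above
fit <- mod$sample(data = data_m, chains = 4, parallel_chains = 4, seed = 1232)
fit$summary(variables = c("intercept", "b1"))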
I tried some alternatives, but none of them came close to recovering the true parameters. I’m not sure what the solution should be.
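What I think the likelihood ultimately has to do is sum the y-part over the true latent x as well, something like

Pr(x1[n], y[n]) = sum over x_true in {0,1} of
  Pr(x1[n] | x_true) * Pr(x_true) * sum over y_true in {0,1} of
    Pr(y[n] | y_true) * Bernoulli(y_true | inv_logit(intercept + b1 * x_true))

with Pr(observed | true) = 0.9 when they match and 0.1 otherwise, but I haven’t managed to get an implementation of that to sample well.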
Edit:
The following seems to be working, but I’d be very thankful for some feedback because I’m not sure whether I’m making a mistake here:
data {
  int N;                        // number of obs
  int<lower=0, upper=1> y[N];   // observed response
  int<lower=0, upper=1> x1[N];  // observed predictor
}
parameters {
  real intercept;
  real b1;
  vector[N] mu_x;               // latent score for x, on the logit scale
}
transformed parameters {
  vector[N] mu;
  vector[N] mu_x_il = inv_logit(mu_x);  // probability that the true x is 1
  for (n in 1:N) {
    mu[n] = intercept + b1 * mu_x_il[n];
  }
}
model {
  intercept ~ std_normal();
  b1 ~ std_normal();
  mu_x ~ normal(0, 10);
  for (n in 1:N) {
    // marginalize over misclassification of x (known error rate 0.1)
    if (x1[n] == 1)
      target += log_sum_exp(log1m(0.1) + bernoulli_logit_lpmf(1 | mu_x[n]),
                            log(0.1) + bernoulli_logit_lpmf(0 | mu_x[n]));
    else
      target += log_sum_exp(log1m(0.1) + bernoulli_logit_lpmf(0 | mu_x[n]),
                            log(0.1) + bernoulli_logit_lpmf(1 | mu_x[n]));
    // marginalize over misclassification of y (known error rate 0.1)
    if (y[n] == 1)
      target += log_sum_exp(log1m(0.1) + bernoulli_logit_lpmf(1 | mu[n]),
                            log(0.1) + bernoulli_logit_lpmf(0 | mu[n]));
    else
      target += log_sum_exp(log1m(0.1) + bernoulli_logit_lpmf(0 | mu[n]),
                            log(0.1) + bernoulli_logit_lpmf(1 | mu[n]));
  }
}
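Note that this version regresses on inv_logit(mu_x[n]), the probability that the true x is 1, rather than summing the y-likelihood over the two discrete states of x, which is exactly the part I’m unsure about. This is how I checked parameter recovery for it (again with cmdstanr; “model_edit.stan” is just a placeholder file name):

library(cmdstanr)
mod2 <- cmdstan_model("model_edit.stan")  # the model from the Edit above
fit2 <- mod2$sample(data = data_m, chains = 4, parallel_chains = 4, seed = 1232)
## compare to the simulated truth: intercept = 1, b1 = 1.3
fit2$summary(variables = c("intercept", "b1"))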