Ordinal model cumulative logit parameter estimates

I noticed a difference in the parameter estimates between a centred-covariate and a non-centred-covariate version of an ordered logistic model. This became apparent when I implemented a model per the Stan User's Guide (1.8 Ordered logistic and probit regression | Stan User's Guide and 12.6 Ordered Logistic Distribution | Stan Functions Reference) and then compared it to an equivalent brms version.

Start with polr and a simple dataset as a reference point.

library(data.table)
library(MASS)
library(foreign)

dat <- data.table(
  read.dta("https://stats.idre.ucla.edu/stat/data/ologit.dta")
)
head(dat)

# mean centre the covariates, per the brms approach to proportional odds
dat[, pared2 := pared - mean(pared)]
dat[, gpa2 := gpa - mean(gpa)]
dat[, apply2 := as.numeric(apply)]

m1 <- polr(apply ~ pared + gpa, data = dat, Hess=TRUE)
m2 <- polr(apply ~ pared2 + gpa2, data = dat, Hess=TRUE)

The two models have the same coefficient estimates but different intercepts:

# same ORs, different intercepts
summary(m1)
Coefficients:
       Value Std. Error t value
pared 1.0457     0.2656   3.937
gpa   0.6042     0.2539   2.379

Intercepts:
                            Value  Std. Error t value
unlikely|somewhat likely    2.1763 0.7671     2.8370 
somewhat likely|very likely 4.2716 0.7922     5.3924 

summary(m2)
Coefficients:
        Value Std. Error t value
pared2 1.0457     0.2656   3.936
gpa2   0.6043     0.2539   2.379

Intercepts:
                            Value   Std. Error t value
unlikely|somewhat likely     0.1995  0.1030     1.9372
somewhat likely|very likely  2.2948  0.1719    13.3494
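
The shift in the intercepts is just the mean of the linear predictor, which can be checked directly from the two fits (using the zeta component of the polr objects):

# the difference in thresholds equals the mean shift x_bar' * beta
m1$zeta - m2$zeta
sum(coef(m2) * c(mean(dat$pared), mean(dat$gpa)))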

However, both models produce the same probabilities when you run predict():

cbind(
  predict(
    m1,
    newdata = data.table(pared = 0, gpa = mean(dat$gpa)),
    type = "p"),
  predict(
    m2,
    newdata = data.table(pared2 = min(dat$pared2), gpa2 = 0),
    type = "p")
)
                      [,1]       [,2]
unlikely        0.59005138 0.59005227
somewhat likely 0.33120156 0.33120059
very likely     0.07874706 0.07874714
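
These probabilities can also be reproduced by hand from the m1 fit, since polr parameterises the model as logit P(Y <= k) = zeta_k - eta:

# cumulative probabilities at pared = 0, gpa = mean(gpa)
eta <- sum(coef(m1) * c(0, mean(dat$gpa)))
q   <- plogis(m1$zeta - eta)
c(q[1], q[2] - q[1], 1 - q[2])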

Initially, I implemented the model directly in Stan. I have aligned the variable names and formatting to be as similar as possible to the brms model below.

data {
  int<lower=1> N;  // total number of observations
  int Y[N];  // response variable
  int<lower=2> nthres;  // number of thresholds
  int<lower=1> K;  // number of population-level effects
  matrix[N, K] X;  // population-level design matrix
  int prior_only;  // should the likelihood be ignored?
}
parameters {
  vector[K] b;  // population-level effects
  ordered[nthres] Intercept;  // thresholds on the original covariate scale
}
model {
  if (!prior_only) {
    vector[N] mu = X * b;
    for (n in 1:N) {
      target += ordered_logistic_lpmf(Y[n] | mu[n], Intercept);
    }
  }
  target += normal_lpdf(b | 0, 10);
  target += student_t_lpdf(Intercept | 3, 0, 2.5);
}
generated quantities {}
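
For reference, the data list for this model could be assembled by hand; the names below match the data block above (ld_manual is just my label for this sketch). Further down I instead use brms::make_standata so that both models receive identical inputs.

# a hand-built version of the same data list (sketch)
ld_manual <- list(
  N          = nrow(dat),
  Y          = as.integer(dat$apply2),
  nthres     = length(unique(dat$apply2)) - 1L,
  K          = 2L,
  X          = as.matrix(dat[, .(pared, gpa)]),
  prior_only = 0L
)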

Next, using brms::make_stancode(apply2 ~ pared + gpa, data = dat, family = brms::cumulative("logit")) I produced the following (I removed the custom functions as they are unnecessary here).

data {
  int<lower=1> N;  // total number of observations
  int Y[N];  // response variable
  int<lower=2> nthres;  // number of thresholds
  int<lower=1> K;  // number of population-level effects
  matrix[N, K] X;  // population-level design matrix
  int prior_only;  // should the likelihood be ignored?
}
transformed data {
  int Kc = K;
  matrix[N, Kc] Xc;  // centered version of X
  vector[Kc] means_X;  // column means of X before centering
  for (i in 1:K) {
    means_X[i] = mean(X[, i]);
    Xc[, i] = X[, i] - means_X[i];
  }
}
parameters {
  vector[Kc] b;  // population-level effects
  ordered[nthres] Intercept;  // temporary thresholds for centered predictors
}
transformed parameters {
  real<lower=0> disc = 1;  // discrimination parameters
}
model {
  // likelihood including constants
  if (!prior_only) {
    // initialize linear predictor term
    vector[N] mu = Xc * b;
    for (n in 1:N) {
      target += ordered_logistic_lpmf(Y[n] | mu[n], Intercept);
    }
  }
  // priors including constants
  target += student_t_lpdf(Intercept | 3, 0, 2.5);
}
generated quantities {
  // compute actual thresholds
  vector[nthres] b_Intercept = Intercept + dot_product(means_X, b);
}

The main difference between the two is that the brms version centres the covariates before sampling and then recovers the original-scale thresholds in generated quantities.
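
In other words, with centred predictors the linear predictor is (x - x_bar)' b, so

  logit P(Y <= k | x) = Intercept[k] - (x - x_bar)' b = (Intercept[k] + x_bar' b) - x' b

and the thresholds on the original covariate scale are Intercept + dot_product(means_X, b), which is exactly what the generated quantities block computes.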

ld <- brms::make_standata(apply2 ~ pared + gpa,
                          data = dat,
                          family = brms::cumulative("logit"))

# compile the two models above (file names are assumed here)
mod1 <- rstan::stan_model("ordinal_manual.stan")
mod2 <- rstan::stan_model("ordinal_brms.stan")

f1 <- rstan::sampling(object  = mod1,
                      data    = ld,
                      chains  = 1,
                      iter    = 5000,
                      warmup  = 1000,
                      refresh = 1000)
f2 <- rstan::sampling(object  = mod2,
                      data    = ld,
                      chains  = 1,
                      iter    = 5000,
                      warmup  = 1000,
                      refresh = 1000)

After sampling from the two models I get:

Hand-coded Stan version (non-centred covariates):

                mean se_mean   sd    2.5%     25%     50%     75%   97.5% n_eff Rhat
b[1]            1.06    0.01 0.28    0.53    0.87    1.06    1.24    1.62  2724    1
b[2]            0.47    0.01 0.24   -0.01    0.30    0.46    0.63    0.95  1315    1
Intercept[1]    1.76    0.02 0.73    0.36    1.26    1.75    2.24    3.20  1353    1
Intercept[2]    3.85    0.02 0.75    2.40    3.34    3.85    4.35    5.32  1339    1

brms (centred covariate):

                  mean se_mean   sd    2.5%     25%     50%     75%   97.5% n_eff Rhat
b[1]              1.05    0.00 0.27    0.53    0.87    1.05    1.23    1.58  3474    1
b[2]              0.60    0.00 0.26    0.11    0.43    0.60    0.77    1.10  3914    1
b_Intercept[1]    2.17    0.01 0.77    0.66    1.66    2.17    2.69    3.65  4010    1
b_Intercept[2]    4.27    0.01 0.80    2.73    3.74    4.27    4.81    5.84  4031    1

The brms (centred covariate) version appears to align with the m1 output from polr, whereas the hand-coded version does not. However, as with polr, when I predict the probability of being in each group for a given set of covariates I get the same values from both models, and the results are similar to the frequentist versions.

post1b <- as.matrix(f1, pars = "b")
post2b <- as.matrix(f2, pars = "b")

# thresholds on the original covariate scale in both cases
post1c <- as.matrix(f1, pars = "Intercept")
post2c <- as.matrix(f2, pars = "b_Intercept")

# cumulative probabilities P(Y <= k) at pared = 0, gpa = mean(gpa)
q1 <- exp(
  post1c - mean(dat$gpa)*post1b[, 2])/
  (1 + exp(post1c - mean(dat$gpa)*post1b[, 2]))

q2 <- exp(
  post2c - mean(dat$gpa)*post2b[, 2])/
  (1 + exp(post2c - mean(dat$gpa)*post2b[, 2]))

# convert cumulative probabilities to per-category probabilities
p1 <- cbind(q1[,1], q1[,2] - q1[,1])
p1 <- cbind(p1, 1 - rowSums(p1))

p2 <- cbind(q2[,1], q2[,2] - q2[,1])
p2 <- cbind(p2, 1 - rowSums(p2))

cbind(
  colMeans(p1),
  colMeans(p2)
)
           [,1]       [,2]
[1,] 0.58882504 0.59000145
[2,] 0.33111789 0.33097647
[3,] 0.08005707 0.07902209

It seems like I would be unable to compare inferences on the basis of the parameter estimates across these two parameterisations, even though the predictions from each model are the same. For example, in the first model the parameter associated with gpa is smaller than what is reported in the second model. Is this true, or have I simply made a mistake in the implementation/interpretation?