I realize I used a 0.2 rather than 2 (the 02 notation in the post confused me). I thought the easiest thing to do would be to simulate to show you that it works as expected. Bottom line:
=== N = 256 ===
-- x mode: cdf --
alpha 90% interval: [-0.158, 1.110]
beta 90% interval: [3.294, 7.119]
beta interval width: 3.825
-- x mode: pdf --
alpha 90% interval: [0.923, 2.412]
beta 90% interval: [-96.184, -2.224]
beta interval width: 93.959
=== N = 1024 ===
-- x mode: cdf --
alpha 90% interval: [0.837, 1.426]
beta 90% interval: [2.146, 3.637]
beta interval width: 1.491
-- x mode: pdf --
alpha 90% interval: [0.566, 1.171]
beta 90% interval: [-2.650, 37.312]
beta interval width: 39.962
You can see that with 1024 data points, the posterior intervals do indeed shrink in both cases as one would expect with a regression. By the time you get N = 1024, the true value (3) shows up in both posterior 90% intervals.
Here’s the prompt I used to generate the code:
I would like you to simulate data and fit that data using cmdstanpy for two different models with 1000 simulated data points.
Data generation 1: v ~ gamma(16, 0.2); x = gamma_cdf(v | 16, 0.2)
Data generation 2: v ~ gamma(16, 0.2); x = gamma_pdf(v | 16, 0.2)
So in one case, I simulate and take the cdf and in another, I simulate and take the pdf. In case 1, it's just generating uniform(0, 1) variates.
Then after generating N=256 x values in both of these situations, I want you to generate y values for both according to: y ~ bernoulli_logit(alpha + beta * x) where alpha = 1 and beta = 3.
We will now have (x1, y1) and (x2, y2). I want you to fit both of these models with a Stan model with improper uniform priors and likelihood y ~ bernoulli_logit(alpha + beta * x).
I then want you to fit both data sets to the Stan model (we only need one Stan model here) and print a summary of the results. Then I want you to generate new data sets with N=1024 rather than N = 256 and fit those. I'm looking to see if the posterior interval around beta shrinks as the data increases.
Please give me the Python code to do this.
And here’s the Python that GPT 5.2 plus generated today.
"""
Simulate two x-generating mechanisms from a Gamma(shape=16, rate=0.2) draw v:
(1) x = GammaCDF(v) -> x ~ Uniform(0, 1)
(2) x = GammaPDF(v) -> x is a density value (positive, typically small)
Then simulate y ~ BernoulliLogit(alpha + beta * x) with alpha=1, beta=3,
fit a logistic regression in Stan with improper uniform priors (i.e., no priors),
and compare posterior intervals for beta at N=256 and N=1024.
"""
from __future__ import annotations
import os
import tempfile
from dataclasses import dataclass
from typing import Dict, Tuple
import numpy as np
from cmdstanpy import CmdStanModel
from scipy.stats import gamma as gamma_dist
def inv_logit(z: np.ndarray) -> np.ndarray:
"""Numerically stable logistic function."""
z = np.asarray(z, dtype=np.float64)
out = np.empty_like(z)
pos = z >= 0
out[pos] = 1.0 / (1.0 + np.exp(-z[pos]))
ez = np.exp(z[~pos])
out[~pos] = ez / (1.0 + ez)
return out
def simulate_xy(
n: int,
x_mode: str,
alpha: float,
beta: float,
rng: np.random.Generator,
shape: float = 16.0,
rate: float = 0.2,
) -> Tuple[np.ndarray, np.ndarray]:
"""
Simulate (x, y) where v ~ Gamma(shape, rate), x is either CDF(v) or PDF(v),
and y ~ BernoulliLogit(alpha + beta * x).
"""
if x_mode not in {"cdf", "pdf"}:
raise ValueError("x_mode must be 'cdf' or 'pdf'.")
scale = 1.0 / rate
v = gamma_dist(a=shape, scale=scale).rvs(size=n, random_state=rng)
if x_mode == "cdf":
x = gamma_dist(a=shape, scale=scale).cdf(v)
else:
x = gamma_dist(a=shape, scale=scale).pdf(v)
p = inv_logit(alpha + beta * x)
y = rng.binomial(n=1, p=p, size=n).astype(np.int32)
return x.astype(np.float64), y
def write_stan_model(stan_path: str) -> None:
"""Write a logistic regression Stan program with improper uniform priors."""
stan_code = r"""
data {
int<lower=1> N;
vector[N] x;
array[N] int<lower=0, upper=1> y;
}
parameters {
real alpha;
real beta;
}
model {
// Improper uniform priors: no prior terms added.
y ~ bernoulli_logit(alpha + beta * x);
}
"""
with open(stan_path, "w", encoding="utf-8") as f:
f.write(stan_code)
@dataclass(frozen=True)
class FitSummary:
alpha_q025: float
alpha_q975: float
beta_q025: float
beta_q975: float
beta_width: float
def fit_and_summarize(
model: CmdStanModel,
x: np.ndarray,
y: np.ndarray,
seed: int,
chains: int = 4,
iter_warmup: int = 1000,
iter_sampling: int = 1000,
) -> Tuple[object, FitSummary]:
"""Fit the Stan model and return CmdStanMCMC plus beta interval width."""
data = {"N": int(len(x)), "x": x, "y": y.tolist()}
fit = model.sample(
data=data,
seed=seed,
chains=chains,
parallel_chains=min(chains, os.cpu_count() or 1),
iter_warmup=iter_warmup,
iter_sampling=iter_sampling,
show_progress=False,
)
summ = fit.summary()
alpha_q025 = float(summ.loc["alpha", "5%"])
alpha_q975 = float(summ.loc["alpha", "95%"])
beta_q025 = float(summ.loc["beta", "5%"])
beta_q975 = float(summ.loc["beta", "95%"])
beta_width = beta_q975 - beta_q025
return fit, FitSummary(alpha_q025, alpha_q975, beta_q025, beta_q975, beta_width)
def run_experiment(
model: CmdStanModel,
n: int,
rng: np.random.Generator,
alpha_true: float = 1.0,
beta_true: float = 3.0,
) -> Dict[str, FitSummary]:
"""Simulate and fit both x-modes at a given n."""
results: Dict[str, FitSummary] = {}
for x_mode in ["cdf", "pdf"]:
x, y = simulate_xy(n=n, x_mode=x_mode, alpha=alpha_true, beta=beta_true, rng=rng)
_, fs = fit_and_summarize(
model=model,
x=x,
y=y,
seed=int(rng.integers(1, 2**31 - 1)),
)
results[x_mode] = fs
return results
def main() -> None:
rng = np.random.default_rng(20260116)
with tempfile.TemporaryDirectory() as td:
stan_path = os.path.join(td, "logit_improper.stan")
write_stan_model(stan_path)
model = CmdStanModel(stan_file=stan_path)
for n in [256, 1024]:
res = run_experiment(model=model, n=n, rng=rng)
print(f"\n=== N = {n} ===")
for mode in ["cdf", "pdf"]:
fs = res[mode]
print(f"\n-- x mode: {mode} --")
print(f"alpha 90% interval: [{fs.alpha_q025:.3f}, {fs.alpha_q975:.3f}]")
print(f"beta 90% interval: [{fs.beta_q025:.3f}, {fs.beta_q975:.3f}]")
print(f"beta interval width: {fs.beta_width:.3f}")
print("\nNote: cmdstanpy also provides full summaries via fit.summary().")
if __name__ == "__main__":
main()