Random walk prior

There are lots of situations where we want parameters to evolve smoothly, and I’ve seen the random walk prior proposed for this purpose (eg here):

gamma[0] ~ normal(0,1)
gamma[k] ~ normal(gamma[k-1], tau)

This results in the parameters having prior distributions that depend on k:

gamma[k] ~ normal(0, sqrt(1 + k * tau^2))

This dependence on k seems undesirable–if the MLE of gamma is 1 for all k, our estimates will approach 1 as k increases, displaying a trend that isn’t there.

Shouldn’t we be using a stationary smoothness prior (one that doesn’t depend on k)?

Aware! The code about splines adds the intercept twice too. One time in calling bs
and another time in a0. B has full rank checked by rankMatrix.

To model the random walk just do:

gamma_raw ~ normal(0, 1);
gamma = cumulative_sum(gamma_raw) * tau;

If you want smoothing, I’d go instead with the mgcv package and specify bs=“ts” or bs=“ad”.
See: https://stats.stackexchange.com/questions/305338/adaptive-gam-smooths-in-mgcv

If I’m not mistaken the variance for gamma[1] -> Infinitive. Does an intercept makes sense?
I’d say no and define a long tailed prior to the variance of the first element of the random walk.

And yes, I wouldn’t specify the random walk like you cited.

1 Like

If you want to be Bayesian, use INLA. Similar interface to mgcv.

How about Stan with mgcv. ;-)

Isn’t that brms?

Yes. brms uses mgcv as far as I know. brms uses the design matrices from mgcv and glue it together with Stan code. I prefer to extract the design matrix myself and use it in Stan on my own.

1 Like

Are there any case studies using this approach anywhere?

I don’t know. I’d like to see using the tensor spline options of mgcv.

to generate a design matrix, I use for example:

sp.design <- smoothCon(s(x, k=Nmeasures, bs="tp", m = 5)
                       ,data=data.frame(x = da),knots=NULL)[[1]]$X

and predict on smoothCon output to generate a Design Matrix at different locations.


Ah just wrote a example so here you go, with some basic plots. Warning that it takes a while to run… and actually I ran it outside of rstan b/c for some reason rstan wasn’t working… so a few key lines at the end are commented out. It makes a design matrix from a 2-D b-splines but that’s a detail and since the matrix is huge and pretty sparse it uses the sparse matrix multiplication.


linear model using the (sparse) design matrix:

data {
  int N;
  vector[N] y;

  int n_w;
  int n_v;
  int n_u;
  vector[n_w] w;
  int v[n_v];
  int u[n_u];

  int n_knots_m;
  int n_knots_p;

  vector[n_knots_m] knot_points_m;
  vector[n_knots_p] knot_points_p;

parameters {
  vector[n_knots_m * n_knots_p] knot_weights;
  real<lower=0> sigma;

model {
  sigma ~ gamma(4, 10);
  knot_weights ~ normal(0, 1);

  y ~ normal(
    csr_matrix_times_vector(N, n_knots_m * n_knots_p, w, v, u, knot_weights),

Here’s the script for running it:


n_knots_m <- 10
n_knots_p <- 18
n_knots_in_grid <- n_knots_m * n_knots_p

knot_points_m <- seq(from=0, to=100, length.out=n_knots_m)
knot_points_p <- seq(from=0, to=180, length.out=n_knots_p)

knot_mu <- 0
knot_sd <- 2
knot_value_grid <- matrix(data=plogis(rnorm(n_knots_in_grid, knot_mu, knot_sd)),
  nrow = n_knots_m, ncol = n_knots_p)
knot_value_long <- knot_value_grid %>% data.frame %>%
  `colnames<-`(paste0(1:n_knots_p)) %>%
  dplyr::mutate(M=as.numeric(1:n_knots_m)) %>%
  tidyr::gather(P, value, -M) %>%
  dplyr::mutate(P=as.numeric(P)) %>%
  dplyr::mutate(m=knot_points_m[M], p=knot_points_p[P]) %>%
  dplyr::arrange(P, M)

measure_points_m <- seq(from=0, to=100, length.out=200)
measure_points_p <- seq(from=0, to=180, length.out=200)
measurements_long <- expand.grid(m=measure_points_m, p=measure_points_p)

bs_m <- splines::bs(x=measurements_long$m, knots=knot_points_m, degree=5,
  Boundary.knots=c(knot_points_m[1] - diff(knot_points_m)[1],
                   knot_points_m[n_knots_m] + diff(knot_points_m)[n_knots_m-1]))
bs_p <- splines::bs(x=measurements_long$p, knots=knot_points_p, degree=5,
  Boundary.knots=c(knot_points_p[1] - diff(knot_points_p)[1],
                   knot_points_p[n_knots_p] + diff(knot_points_p)[n_knots_p-1]))

is <- vector(mode='numeric')
js <- vector(mode='numeric')
xs <- vector(mode='numeric')

for (m in 1:n_knots_m) {
  for (p in 1:n_knots_p) {
    j <- (m - 1) * n_knots_p + p
    y <- bs_m[,m] * bs_p[,p]
    is_ <- which(y > 1e-8)
    js_ <- rep(j, length(is_))
    xs_ <- y[is_]
    is <- c(is, is_)
    js <- c(js, js_)
    xs <- c(xs, xs_)

bs_p_m <- Matrix::sparseMatrix(i=is, j=js, x=xs,
  dims=c(nrow(measurements_long), n_knots_in_grid))

measurements_long <- measurements_long %>%
  dplyr::mutate(surface = as.vector(bs_p_m %*% knot_value_long[,'value']),
                observation = surface + rnorm(n=n(), mean=0, sd=0.2),
                prediction = as.vector(bs_p_m %*% MatrixModels:::lm.fit.sparse(bs_p_m, observation)))

picks <- sample(1:nrow(measurements_long), 200)
pl <- ggplot(data=measurements_long) +
  aes(x=m, y=p, fill=observation)) +
  aes(x=m, y=p, colour=prediction), size=2.5, shape=15)  +
  aes(x=m, y=p), colour='black', size=2.5, shape=0)

stan_data <- rstan::extract_sparse_parts(bs_p_m)
stan_data[['N']] <- nrow(measurements_long)
stan_data[['y']] <- measurements_long[['observation']]
#stan_data[['m']] <- measurements_long[['m']]
#stan_data[['p']] <- measurements_long[['p']]
stan_data[['n_w']] <- length(stan_data[['w']])
stan_data[['n_v']] <- length(stan_data[['v']])
stan_data[['n_u']] <- length(stan_data[['u']])

stan_data[['n_knots_m']] <- n_knots_m
stan_data[['n_knots_p']] <- n_knots_p

stan_data[['knot_points_m']] <- knot_points_m
stan_data[['knot_points_p']] <- knot_points_p

rstan::stan_rdump(list=names(stan_data), file='data.rdump',

#m <- rstan::stan_model(file='sparse-lm.stan')
#s <- rstan::sampling(m, data=stan_data, chains=1, iter=100, warmup=200,
#  thin=1, seed=12, verbose=TRUE)

s <- rstan::read_stan_csv(csvfiles='output.csv') %>%
a <- apply(s$knot_weights, 2, mean)
plot(a, MatrixModels:::lm.fit.sparse(bs_p_m,
  measurements_long$observation), ylim=c(0,2), xlim=c(0,2))

1 Like

Hi all, I think we’ve got a bit off topic. I don’t actually care about splines, that was just one example where I’d seen the random walk prior. Andrew also uses a similar random walk prior in this paper (page 7), although he puts no prior on gamma[1].

Maybe this avoids the problem of non-stationarity? @andre.pfeuffer, is this what you meant by “the variance for gamma[1] -> Infinitive”?

Random walk priors are essentially the same as spines. See Rue and Held Gaussian Markov Random Fields: Theory and Applications. Chapter 3 if I remember correctly.

Thanks for the pointer, I’ll check it out.

Just to clarify, I should’ve said “splines on the mgcv sense are random walk priors”. There exist some weird splines out there for which that statement might not be true.



Phi is 1 for a random walk, thus the variance of the whole process -> infinity. In an AR(1) process you could initialize X_1, the first element, by using this variance. It is correct to put no prior gamma[1], but
for sampling efficiency reason, I’d like to put a prior with flat tails, eg. cauchy(0, 5). Practically we sample from a process with finite number of elements.

A RW isn’t an AR process. It’s initialised differently.

It has finite variance after a finite number of steps if the distribution for x[1] is specified.

If instead you build without an initial condition you get an intrinsic GMRF (ie an improper Gaussian) that can be made proper by either imposing a sum to zero constraint or fixing one value (x[1]=0 for instance)

1 Like

On a technical note, the variance depending on the size and shape of the conditional dependence structure is almost always the case for multivaririate Gaussian that are specified through conditionals. You need to compensate through a well scaled prior.

A discussion of it in the context of random walks is here https://www.sciencedirect.com/science/article/pii/S2211675313000407

1 Like

I am a bit confused about the meaning of intercept =TRUE in bs, does this really add a constant basis function (which in my understanding is an intercept). When I inspect the B matrix created here once with intercept=TRUE and once with intercept=FALSE I see that the former one has one additional row filled with 0's…

X <- seq(from=-5, to=5, by=.1) # generating inputs
BT <- t(bs(X, knots=seq(-5,5,1), degree=3, intercept = TRUE)) # creating the B-splines
BF <- t(bs(X, knots=seq(-5,5,1), degree=3, intercept = FALSE)) # creating the B-splines
mean(BT[-1,]==BF) #gives `
BT[1,] # gives a vector of 0's

So in this case one really needs a0 and it appears there is no intercept redundancy…

The spline code lives in the transposed world

BT <- t(bs(X, knots=seq(-5,5,1), degree=3, intercept = TRUE)) # creating the B-splines
Y_true <- as.vector(a0X + a %% B) # generating the output

and later in Stan

Y_hat = a0*X + to_vector(a * B);

So in our Design Matrix we set the factor to one of the coefficients to 0 for all i in 1,…, N, eg.
something like:

y_i = x1_i * 0 + x2_i * b2_i + …

recall x1_i * b1_i == x1_i * 0, following b1_i is arbitrary.

What is it’s use or where is my mistake?

If you run the slightly modified example from the Spline case study

X <- seq(from=-5, to=5, by=.1) # generating inputs
B <- t(bs(X, knots=seq(-5,5,1), degree=3, intercept = TRUE)) # creating the B-splines
num_data <- length(X); num_basis <- nrow(B)
a0 <- 0.2 # intercept
a <- rnorm(num_basis, 0, 1) # coefficients of B-splines
Y_true <- as.vector(a0*X + a%*%B) # generating the output
Y <- Y_true + rnorm(length(X),0,.2) # adding noise
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
fit<-sampling(sm,iter=5000,control = list(adapt_delta=0.95))
post <- as.array(fit)
mcmc_intervals(post, pars=c("a[1]")) + vline_at(a[1])
mcmc_intervals(post, pars=c("a0")) + vline_at(a0)

you see that a[1] is consistent with being 0 in Stan, albeit in R it’s non-zero. So it’s not used in Stan. So this brings me back to my original question, what is the point of the intercept=TRUE option in bs

data { 
  int num_data; 
  int num_basis; 
  vector[num_data] Y; 
  vector[num_data] X; 
  matrix[num_basis, num_data] B; 
parameters { 
  row_vector[num_basis] a_raw; 
  real a0; 
  real<lower=0> sigma; 
  real<lower=0> tau; 
transformed parameters { 
  row_vector[num_basis] a; 
  vector[num_data] Y_hat; 
  a = a_raw*tau;  
  Y_hat = a0*X + to_vector(a*B); 
model { 
  a_raw ~ normal(0, 1); 
  tau ~ cauchy(0, 1); 
  sigma ~ cauchy(0, 1); 
  Y ~ normal(Y_hat, sigma); 

I’m also checking the random walk prior mentioned in the same blog. Did you find some way to generate a suitable prior?