Reproducibility of a non-linear model

Hi all.

I have been having some trouble with the reproducibility of my model. The problem is fairly ill behaved (several different modes some of which may be close to one another in terms of the log posterior value). I am thus using a combination of several equivalent models to obtain consistently “good” initialization (these are inefficient but I have the computing power to waste on the server, as neither pathfinder nor optimization yields consistently good initializations). The problem now comes in when I run my model with the good initialization on a cluster, it still lands in a mode with significantly lower probability then the initial values, which is something I would not except to happen. When I run this on my laptop, even with the same seed as that of the cluster I stay in the mode where I specified my initial values.

The cluster has the following specifications:
OS: Scientific Linux 7.9 (Nitrogen)
cmdstan: 2.34.0
cmdstanpy: 1.2.1

And my laptop specs are:
OS: Ubuntu 22.04.4 LTS
cmdstan: 2.34.0
cmdstanpy: 1.2.1

Both the cluster and my laptop has the same architecture of cmdstan installed through the conda-forge channel.

My stan code for the model (compiled with cpp_options{'STAN_THREADS': True}):

functions {
    vector NRTL(vector x, vector T, vector p12, vector p21, real a) {
        int N = rows(x);
        vector[N] t12 = p12[1] + p12[2] * T + p12[3] ./ T + p12[4] * log(T);
        vector[N] t21 = p21[1] + p21[2] * T + p21[3] ./ T + p21[4] * log(T);
        vector[N] dt12_dT = p12[2] - p12[3] ./ square(T) + p12[4] ./ T;
        vector[N] dt21_dT = p21[2] - p21[3] ./ square(T) + p21[4] ./ T;   
        vector[N] G12 = exp(-a * t12);
        vector[N] G21 = exp(-a * t21);
        vector[N] term1 = ( ( (1-x) .* G12 .* (1 - a*t12) + x .* square(G12) ) ./ square((1-x) + x .* G12) ) .* dt12_dT;
        vector[N] term2 = ( ( x .* G21 .* (1 - a*t21) + (1-x) .* square(G21) ) ./ square(x + (1-x) .* G21) ) .* dt21_dT;
        return -8.314 * square(T) .* x .* (1-x) .* ( term1 + term2);
    }
}

data {
    int N_points;
    vector<lower=0, upper=1>[N_points] x;
    vector[N_points] T;
    vector[N_points] y;
    vector[4] scaling;
    real<lower=0> a;
}

transformed data {
    real<lower=0> error=0.01;
}

parameters {
    vector[4] p12_raw;
    vector[4] p21_raw;
    real<lower=0> v;
}

model {
    vector[4] p12 = p12_raw .* scaling;
    vector[4] p21 = p21_raw .* scaling;
    vector[N_points] y_means = NRTL(x, T, p12, p21, a);
    
    p12_raw ~ std_normal();
    p21_raw ~ std_normal();

    v ~ exponential(2);

    y ~ normal(y_means, sqrt(square(error*abs(y))+v));
}
Full code for finding good intializations

For more details, I run 3 models in total in the same python script.

  1. Equivalent GP model (significantly better behaved than original problem), to find initializations for the variance
functions {
    vector NRTL(vector x, real T, real t12, real t21, real dt12_dT, real dt21_dT, real a) {
        int N = rows(x);
        real G12 = exp(-a * t12);
        real G21 = exp(-a * t21);
        vector[N] term1 = ( ( (1-x) * G12 * (1 - a*t12) + x * square(G12) ) ./ square((1-x) + x * G12) ) * dt12_dT;
        vector[N] term2 = ( ( x * G21 * (1 - a*t21) + (1-x) * square(G21) ) ./ square(x + (1-x) * G21) ) * dt21_dT;
        return -8.314 * square(T) * x .* (1-x) .* ( term1 + term2);
    }
}

data {
    int N_points;
    vector<lower=0, upper=1>[N_points] x;
    vector[N_points] y;
    vector[4] scaling;
    real<lower=0> a;
    int N_temps;
    vector[N_temps] T_unique;
    array[N_temps, 2] int T_idx;
}

transformed data {
    real<lower=0> error=0.01;
    matrix[2*N_temps, 4] tau_base;
    matrix[2*N_temps, 2*N_temps] KNN;

    for (i in 1:N_temps) {
        tau_base[i, :] = [1, T_unique[i], 1.0 / T_unique[i], log(T_unique[i])];
        tau_base[i+N_temps, :] = [0, 1, -1.0 / square(T_unique[i]), 1.0 / T_unique[i]];
    }

    KNN = add_diag(tau_base * tau_base', 1e-8);
}

parameters {
    vector[N_temps] t12;
    vector[N_temps] t21;
    vector[N_temps] dt12_dT;
    vector[N_temps] dt21_dT;
    real<lower=0> v;
}

model {
    vector[2*N_temps] all12 = append_row(t12, dt12_dT);
    vector[2*N_temps] all21 = append_row(t21, dt21_dT);
    
    all12 ~ multi_normal(rep_vector(0, 2*N_temps), KNN);
    all21 ~ multi_normal(rep_vector(0, 2*N_temps), KNN);
    
    v ~ exponential(2);

    {
        vector[N_points] y_means;
        for (i in 1:N_temps) {
            y_means[T_idx[i,1]:T_idx[i,2]] = NRTL(x[T_idx[i,1]:T_idx[i,2]], T_unique[i], t12[i], t21[i], dt12_dT[i], dt21_dT[i], a);
        }
        y ~ normal(y_means, sqrt(square(error*abs(y))+v));
    }
}
  1. Model with constant variance to find good initializations for the parameters
functions {
    vector NRTL(vector x, vector T, vector p12, vector p21, real a) {
        int N = rows(x);
        vector[N] t12 = p12[1] + p12[2] * T + p12[3] ./ T + p12[4] * log(T);
        vector[N] t21 = p21[1] + p21[2] * T + p21[3] ./ T + p21[4] * log(T);
        vector[N] dt12_dT = p12[2] - p12[3] ./ square(T) + p12[4] ./ T;
        vector[N] dt21_dT = p21[2] - p21[3] ./ square(T) + p21[4] ./ T;   
        vector[N] G12 = exp(-a * t12);
        vector[N] G21 = exp(-a * t21);
        vector[N] term1 = ( ( (1-x) .* G12 .* (1 - a*t12) + x .* square(G12) ) ./ square((1-x) + x .* G12) ) .* dt12_dT;
        vector[N] term2 = ( ( x .* G21 .* (1 - a*t21) + (1-x) .* square(G21) ) ./ square(x + (1-x) .* G21) ) .* dt21_dT;
        return -8.314 * square(T) .* x .* (1-x) .* ( term1 + term2);
    }
}

data {
    int N_points;
    vector<lower=0, upper=1>[N_points] x;
    vector[N_points] T;
    vector[N_points] y;
    vector[4] scaling;
    real<lower=0> a;
    real<lower=0> v;
}

transformed data {
    real<lower=0> error=0.01;
}

parameters {
    vector[4] p12_raw;
    vector[4] p21_raw;
}

model {
    vector[4] p12 = p12_raw .* scaling;
    vector[4] p21 = p21_raw .* scaling;
    vector[N_points] y_means = NRTL(x, T, p12, p21, a);
    
    p12_raw ~ std_normal();
    p21_raw ~ std_normal();

    y ~ normal(y_means, sqrt(square(error*abs(y))+v));
}
  1. Running the model in the main body with the good initial values.
Python code
import numpy as np #type: ignore
import cmdstanpy #type: ignore
import os
import sys
import random
import json

# set seed
random.seed(1)

#set number of cpu's available
ncpu = 8

# get path from command line inputs
path = sys.argv[1]

# set stan number of threads environmental variables
os.environ['STAN_NUM_THREADS'] = f'{ncpu}'

# Set paths for data_file and stan model
data_file = f'{path}.json'
model1 = cmdstanpy.CmdStanModel(exe_file='Stan Models/model1')
model2 = cmdstanpy.CmdStanModel(exe_file='Stan Models/model2')
model3 = cmdstanpy.CmdStanModel(exe_file='Stan Models/model3')

# Make path to store results
os.makedirs(path)

# Step 1: Sampling tau, v with random initializations
output_dir1 = f'{path}/Step1'
print('Step 1: Started. Sampling tau, v with random initializations')
fit1 = model1.sample(data=data_file, chains=5*ncpu, parallel_chains=ncpu, iter_warmup=1000,
                    iter_sampling=1000, max_treedepth=10, adapt_delta=0.8, refresh=100, output_dir=output_dir1)
print('Step 1: Completed')
print('Step 1: Diagnostics:')
print(fit1.diagnose())
print('Step 1: Summary:')
print(fit1.summary().iloc[:,3:])

# Step 2: Obtain p12, p21 using constant v
output_dir2 = f'{path}/Step2'

# create directories
os.makedirs(output_dir2)

# find index of max lp and chain with max lp
max_lp_all_idx = np.argmax(fit1.method_variables()['lp__'].T.flatten())

# extract v and save to data.json file
data = json.load(open(data_file, 'r'))
data['v'] = fit1.v[max_lp_all_idx]
with open(data_file, 'w') as f:
    json.dump(data, f)

# print inits to screen
print("Variance for Step 2:")
print(data['v'])

print('Step 2: Started. Sampling p12, p21 using constant v (from MAP) from step 1')
fit2 = model2.sample(data=data_file, chains=5*ncpu, parallel_chains=ncpu, iter_warmup=1000,
                    iter_sampling=1000, max_treedepth=10, adapt_delta=0.8, refresh=100, output_dir=output_dir2)
print('Step 2: Completed')

print('Step 2: Diagnostics:')
print(fit2.diagnose())
print('Step 2: Summary:')
print(fit2.summary().iloc[:,3:])

# Step 3. Sampling p12,p21,v using MAP initializations from previous steps
output_dir3 = f'{path}/Step3'
inits3 = f'{output_dir3}/inits.json'

# create directories to store inits
os.makedirs(output_dir3)

# extracting and saving inits
max_lp_all_idx = np.argmax(fit2.method_variables()['lp__'].T.flatten())
inits = {'p12_raw': fit2.p12_raw[max_lp_all_idx,:].tolist(),
         'p21_raw': fit2.p21_raw[max_lp_all_idx,:].tolist(),
         'v': data['v']}
with open(inits3, 'w') as f:
    json.dump(inits, f)

print('Step 3: Started. Sampling p12, p21, v using initializations from step 1&2')
fit3 = model3.sample(data=data_file, chains=ncpu, parallel_chains=ncpu, iter_warmup=10000,
                    iter_sampling=10000, max_treedepth=10, adapt_delta=0.99, refresh=100, 
                    thin=5, inits=inits3, output_dir=output_dir3)
print('Step 3: Completed')

print('Step 3: Diagnostics')
print(fit3.diagnose())
print('Step 3: Summary')
print(fit3.summary().iloc[:, 3:])

print('Program done executing!!')
print('Exiting')

The results from the server gives the trace plot for the log posterior density as (x-axis is sample number where the chains were flattened. 0-2000 is chain 1, 2001-3000 is chain 2 as so on):
image

While running this on my laptop gave the output (for 4 chains instead of 8):
image

Any input on how to correct for this or what may have possibly gone wrong would be much appreciated.

For convenience, attached is my data and inits file, for a single dataset (I could not upload .json files so I uploaded these as .txt files instead).
data.txt (652 Bytes)
inits.txt (122 Bytes)