Ragged data in stan

Start with a short summary of your inquiry.
Hi everyone,

I am trying to fit a bivariate Hawkes process using pystan. My data consists of on_data and off_data, each one consists of sublists of different lenghts. The code seems to be working when I pad the sublists with -1’s to have them have same lenght.

However, I am not sure how to not consider the -1’s in my code. When the sampling is finished, fit[‘lam_a’] and fit[lam_b] (the Hawkes estimated intensities ) give me a list of lists with some missing values (I guess those correspond to the padded values). Also, the estimated parameters by stan (alpha, gamma and mu) give different values than the ones I estimated in the synthetic data, which I believe it’s because of the padded values

How can I disregard the padded values in stan so that the intensities (lam_a and lam_b) are evaluated only up to the non padded values.

Thanks

#code to generate synthetic data 

gamma = np.zeros([2,2])

gamma[0,1] = 0.1
gamma[1,0] = 1.0
gamma[0,0] = 0.5
gamma[1,1] = 5.

alpha = np.zeros([2,2])
alpha[0,1] = 0.1
alpha[1,0] = 0.2
alpha[0,0] = 0.5
alpha[1,1] = 0.6

M=5

mu = np.zeros([M,2])


for m in range(M):
    mu[m,0] = 0.1
    mu[m,1] = 0.3

T = 50
marks_outer = []
N_list = []
times_outer = []

# generate baseline events

for m in range(M):
    N_list.append(np.random.poisson(np.sum(mu[m])*T))
    times = []
    
    for i in range(N_list[m]):
        times.append(T*np.random.rand())
    times_outer.append(times)

for m in range(M):
    marks = []
    for i in range(N_list[m]):
        if (mu[m,0]/(mu[m,0]+mu[m,1])) > np.random.rand():
            marks.append(0)
        else:
            marks.append(1)
    marks_outer.append(marks)

cnt = 0

for m in range(M):
    while cnt < N_list[m]:
        for i in range(cnt, N_list[m]):
            k = marks_outer[m][i]
            cnt+=1
            
            g = np.random.poisson(alpha[0,k])
            for j in range(g):
                tnew = times_outer[m][i] + np.random.exponential(1./gamma[0,k])
                if tnew < T:
                    times_outer[m].append(tnew)
                    marks_outer[m].append(0)
                    N_list[m]+=1

            g = np.random.poisson(alpha[1,k])
            for j in range(g):
                tnew = times_outer[m][i] + np.random.exponential(1./gamma[1,k])
                if tnew < T:
                    times_outer[m].append(tnew)
                    marks_outer[m].append(1)
                    N_list[m]+=1

times_outer, marks_outer = (list(t) for t in zip(*sorted(zip(times_outer, marks_outer))))

times0_outer = []
for m in range(len(times_outer)):
    times0 = []
    for i in range(len(times_outer[m])):
        if marks_outer[m][i] == 0:
            times0.append(times_outer[m][i])
    times0_outer.append(times0)


times1_outer = []
for m in range(len(times_outer)):
    times1 = []
    for i in range(len(times_outer[m])):
        if marks_outer[m][i] == 1:
            times1.append(times_outer[m][i])
    times1_outer.append(times1)

times_off = []
for m in range(len(times0_outer)):
    times_off.append(sorted(times0_outer[m]))

times_on = []
for m in range(len(times1_outer)):
    times_on.append(sorted(times1_outer[m]))

data_on = times_on
data_off = times_off

# list of lengths
na = [len(i) for i in data_on ]
nb = [len(i) for i in data_off ]

#padding -1 values to have the same lenghts for each sublist
on_data = []
for i in data_on:
    on_data.append(np.pad(i, (0, max(na)- len(i)), constant_values = -1))

off_data = []
for i in data_off:
    off_data.append(np.pad(i, (0, max(nb)- len(i)), constant_values = -1))

#Stan code

model_code = """
data {
    int  M; //number of gangs
    int<lower=0>  Na[M]; // list of lenght for each on_data
    int<lower=0>  Nb[M]; // list of lenght for each off_data
    int maxNa;
    int maxNb;
    vector[maxNa] ta[M];
    vector[maxNb] tb[M];
    int T;
}
parameters {
    vector<lower=0>[2] mu[M]; //baseline
    matrix<lower=0, upper = 24>[2,2] gamma ; //decay 
    matrix<lower=0,upper=1>[2,2] alpha; //adjacency
  
}
transformed parameters {


vector[maxNa] lam_a[M];
vector[maxNb] lam_b[M];

 
for (m in 1:M){
lam_a[m,1] = mu[m,1];
lam_b[m,1] = mu[m,2];
}


for(m in 1:M){

for (j in 1:Na[m]){
    lam_a[m,j]=mu[m,1];
    for(k in 1:(j-1)){
        if (ta[m,j]>ta[m,k]){
         lam_a[m,j] = lam_a[m,j]+alpha[1,1]*gamma[1,1]*exp(-gamma[1,1]*(ta[m,j]-ta[m,k]));
            }
        }
    for(k in 1:Nb[m]){
        if (ta[m,j]>tb[m,k]) {
            lam_a[m,j] = lam_a[m,j]+alpha[1,2]*gamma[1,2]*exp(-gamma[1,2]*(ta[m,j]-tb[m,k]));
               }
            }
}


for (j in 1:Nb[m]){
    lam_b[m,j]=mu[m,2];
    for(k in 1:(j-1)){
        if (tb[m,j]>tb[m,k])
        {
         lam_b[m,j] = lam_b[m,j]+alpha[2,2]*gamma[2,2]*exp(-gamma[2,2]*(tb[m,j]-tb[m,k]));
        }
        }
    for(k in 1:Na[m]){
        if (tb[m,j]>ta[m,k]) {
            lam_b[m,j] = lam_b[m,j]+alpha[2,1]*gamma[2,1]*exp(-gamma[2,1]*(tb[m,j]-ta[m,k]));
           }
        }
  }
    }
  }



model {
    alpha[1,1] ~  beta(1,1);
    alpha[1,2] ~ beta(1,1);
    alpha[2,1] ~ beta(1,1);
    alpha[2,2] ~ beta(1,1);
    for(m in 1:M){
        mu[m,1] ~ cauchy(0,10);
        mu[m,2] ~ cauchy(0,10);
        }
    gamma[1,1] ~ cauchy(0, 10);
    gamma[2,1] ~ cauchy(0, 10);
    gamma[1,2] ~ cauchy(0, 10);
    gamma[2,2] ~ cauchy(0, 10);

    for(m in 1:M){
        for(j in 1:Na[m]){
            target+=log(lam_a[m,j]);
            }
        for(j in 1:Nb[m]){
            target+=log(lam_b[m,j]);
            }
        target+=-mu[m,1]*T-mu[m,2]*T-(alpha[1,1]+alpha[2,1])*Na[m]-(alpha[1,2]+alpha[2,2])*Nb[m];
        }

}

"""

#fit the data

hawkes_data = {"Na": na,"Nb": nb, "ta":on_data, "tb": off_data, "T":T, "M": len(on_data), "maxNa": max(na), "maxNb": max(nb) }
posterior = stan.build(model_code, data=hawkes_data, random_seed=2)
fit = posterior.sample(num_chains=1, num_samples=1000)