Treating missing data "NA" in functioning model

Hello, I’m running this prediction model with success for complete data. I’m new to R and Stan and would very much appreciate some ideas on how to approach this.


Data

https://raw.githubusercontent.com/nelsonamayad/Elecciones-presidenciales-2018/master/Elecciones%202018/encuestas2018.csv

Code

library(RCurl)

# Import survey data GitHub:
encuestas <- read.csv(text=getURL("https://raw.githubusercontent.com/nelsonamayad/Elecciones-presidenciales-2018/master/Elecciones%202018/encuestas2018.csv"))

library(tidyverse)
library(kableExtra)

# Prepare data for prior calculation
priors <- encuestas %>% select(n,fecha,ivan_duque,gustavo_petro,sergio_fajardo,german_vargas_lleras,humberto_delacalle) %>% filter(as.Date(fecha)>as.Date("2018-03-11"))
priors <- priors %>% select(-n,-fecha) %>% gather(candidato, int_voto)

# Calculate priors
priors <- priors %>% group_by(candidato) %>% mutate(prior_mu=mean(int_voto),prior_sigma=sd(int_voto))
priors <- priors %>% distinct(prior_mu,prior_sigma) 

# Priors table
priors %>% kable("html", digits=0,caption = "Priors por candidato") %>% kable_styling(full_width = F)

library(lubridate)

#1. Debug surveys:
df <- encuestas %>% select(n,fecha, encuestadora,ivan_duque,gustavo_petro,sergio_fajardo,german_vargas_lleras, humberto_delacalle, margen_error,tipo, muestra_int_voto)

#2. Select polls from 2018-03-11:
df <- df %>% filter(as.Date(fecha, tz="GMT") >= as.Date('2018-03-11', tz="GMT"))

#3. Create duration variable
df <- df %>% mutate(dd = as.Date(as.character(today()), format="%Y-%m-%d") - as.Date(as.character(fecha), format="%Y-%m-%d"))
df <- df %>% mutate(dd = as.numeric(dd))
df <- df %>% mutate(dd = 100*(dd/sum(dd)))

#4. Code pollster
df <- df %>% mutate(enc=encuestadora)
df$encuestadora <- match(df$encuestadora, unique(df$encuestadora))

#5. Dummy poll type (=1 si presencial):
df <- df %>% mutate(tipo=ifelse(tipo=="Presencial",1,0))

#6. Other tweaks:
df <- df %>% rename(m_error = margen_error)

#7. Change date format:
df <- df %>% select(-fecha,-n) %>% gather(candidato, int_voto, ivan_duque,gustavo_petro,sergio_fajardo,german_vargas_lleras,humberto_delacalle)

#8. Create dataframes per candidate:
id <- df %>% filter(candidato=="ivan_duque") 
gp <- df %>% filter(candidato=="gustavo_petro") 
sf <- df %>% filter(candidato=="sergio_fajardo") 
gvl <- df %>% filter(candidato=="german_vargas_lleras") 
hdlc <- df %>% filter(candidato=="humberto_delacalle") 


Stan

data{
    int<lower=1> N;
    int<lower=1> N_encuestadora;
    real<lower=0> int_voto[N];
    int encuestadora[N];
    real<lower=0> muestra_int_voto[N];
    real<lower=0> m_error[N];
    real dd[N];
    real tipo[N];
}
parameters{
    real a1;
    vector[N_encuestadora] a_;
    real a_enc;
    real<lower=0> s_enc;
    real a2;
    real a3;
    real a4;
    real a5;
    real<lower=0> s;
}
model{
    vector[N] m;
    s ~ cauchy( 0 , 5 );
    a5 ~ normal( 0 , 10 );
    a4 ~ normal( 0 , 10 );
    a3 ~ normal( 0 , 10 );
    a2 ~ normal( 0 , 10 );
    s_enc ~ cauchy( 0 , 5 );
    a_enc ~ normal( 0 , 10 );
    a_ ~ normal( a_enc , s_enc );
    a1 ~ normal( 29.6 , 5 );  //Priors Fajardo: mu=12, sd=3
    for ( i in 1:N ) {
        m[i] = a1+a_[encuestadora[i]]+a2*muestra_int_voto[i]+a3*m_error[i]+a4*dd[i]+a5*tipo[i];
    }
    int_voto ~ normal( m , s );
}
generated quantities{
    vector[N] m;
    real dev;
    dev = 0;
    for ( i in 1:N ) {
        m[i] = a1+a_[encuestadora[i]]+a2 * muestra_int_voto[i]+a3*m_error[i]+a4*dd[i]+a5*tipo[i];
    }
    dev = dev + (-2)*normal_lpdf( int_voto | m , s );
}

This won’t work; missing data “NAs”

library(rstan)
options(mc.cores = parallel::detectCores())
fajardo_fit <- stan(file='fajardo.stan',data=list(N=17,N_encuestadora=6,int_voto=sf$int_voto,encuestadora=sf$encuestadora, muestra_int_voto=sf$muestra_int_voto,m_error=sf$m_error,dd=sf$dd,tipo=sf$tipo),control=list(adapt_delta=0.95),iter=4000,chains=4)

but this will

library(rstan)
options(mc.cores = parallel::detectCores())
petro_fit <- stan(file='petro.stan',data=list(N=25,N_encuestadora=100,int_voto=gp$int_voto,encuestadora=gp$encuestadora, muestra_int_voto=gp$muestra_int_voto,m_error=gp$m_error,dd=gp$dd,tipo=gp$tipo),control=list(adapt_delta=0.95),iter=4000,chains=4)

Yeah when you try to pass NAs from R to Stan it doesn’t work. In your example that doesn’t work the int_voto variable of the list has NAs.

Can you just replace those with a placeholder value like -42 or 0? Another idea is to pass in an array that is the same size and is 0 if the data is not NA and 1 if the data is NA.

Are you trying to model your missing data as coming from some distribution? You could also use the transformed parameters block to have a vector that has both data and unknown parameters.

Check out chapter 11 of the Stan manual. It’s all about handling missing data and NAs.

1 Like