Hi All,
HOPING TO GET
Some guidance / advicefor a bernoulli model with non-integer response (need to estimate an integer)
Thanks,
Steve
OUTLINE
Currently we have a bernoulli process where we want to estimate the the likelihood of a biological event occuring in a given month. Our observations, however, have error and they are not integer values (but range from 0 to 1).
THIS IS WHAT WE ARE AIMING FOR…
- somehow estimate the binary record from real numbers (between 0 and 1) for use in a bernoulli process (modelling block).
ISSUES
In the past (in gaussian models) we have estimated the true state from a measurement as a transformed parameter. Here however we run into issues with integers in transformed parameter and model blocks.
Common error: Identifier expected after sized type in local (or model block) variable declaration. (No transformations/constraints allowed.)
TRIED, BUT DOESNT QUITE WORK
(1) Calculate the integer value outside of stan (in R), but doesn’t estimate the parameters appropriately
df$runif <- runif(n=dim(df)[1], min=0, max=1)
df$event_io <- ifelse(df$runif <= df$event, yes=1, no=0)
(2) Numerically assigning 1|0 based on thresholds within stan
Creating an io parameter in stan based on whether p_event_w_error is above or below a threshold (fixed and estimated). Again, the outputs are not as per our generated data.
(3) Looked at using a Beta distribution and bernoulli
Here we used a threshold 0.001 outside of stan to state whether something was a 1 or 0.
We then had int data, and real data and used a flag to use either a model form.
STAN SCAFFOLD (in non working form)
data {
int<lower=1> N; // rows of data
//VBGF core terms
vector<lower=0>[N] m01;
vector<lower=0>[N] m02;
...
vector<lower=0>[N] m11;
vector<lower=0>[N] m12;
real<lower=0, upper=1> event[N];
}
parameters {
//Prob of event happening in a given month
real<lower=0, upper=1> p01;
real<lower=0, upper=1> p02;
...
real<lower=0, upper=1> p11;
real<lower=0, upper=1> p12;
}
transformed parameters {
vector[N] theta;
theta = 1 - pow(1-p01,m01).*pow(1-p02,m02).*pow(1-p03,m03).*pow(1-p04,m04).*pow(1-p05,m05).*pow(1-p06,m06).*pow(1-p07,m07).*pow(1-p08,m08).*pow(1-p09,m09).*pow(1-p10,m10).*pow(1-p11,m11).*pow(1-p12,m12);
}
model {
int<lower=0, upper=1> event_io[N];
for (n in 1:N){
//attempt to estimate an integer record of whether event occurred probabilistically...fails
event_io[n] ~ bernoulli(event[n]);
// Likelihood
event_io[n] ~ bernoulli(theta[n]);
}
} //end model
GENERATED DATA
#### Package / Library Requirements ####
rm(list=ls())
options(stringsAsFactors = FALSE)
options(scipen = 999)
#' Packages (not in environment) -------------------------------------------
list.of.packages <- c("magrittr", "tidyr", "dplyr", "stringr", "purrr"
, "rebus", "lubridate"
, "rstan", "shinystan", "stringr", "StanHeaders"
, "overlapping", "stringi")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
#' Libraries ---------------------------------------------------------------
req_packages <- list.of.packages
sapply(req_packages, require, character.only = TRUE, quietly=TRUE)
#### Create Generated Data #####
#--> Sorting dataframe ####
##set the number of animals
num_rows <- 1000
##set start and end dates
start_date1 <- as.Date("2000-01-01")
start_date2 <- as.Date("2002-01-01")
end_date1 <- as.Date("2002-12-31")
end_date2 <- as.Date("2004-12-31")
random_date1 <- sample(seq(start_date1, end_date1, by = "1 day"), num_rows, replace=TRUE)
random_date2 <- sample(seq(start_date2, end_date2, by = "1 day"), num_rows, replace=TRUE)
## Create a dataframe
df <- data.frame(
randval = sample(seq(1, 100), num_rows, replace=TRUE), #50:50 random value
date1 = random_date1,
date2 = random_date2,
m01 = rep(NA,num_rows),
m02 = rep(NA,num_rows),
m03 = rep(NA,num_rows),
m04 = rep(NA,num_rows),
m05 = rep(NA,num_rows),
m06 = rep(NA,num_rows),
m07 = rep(NA,num_rows),
m08 = rep(NA,num_rows),
m09 = rep(NA,num_rows),
m10 = rep(NA,num_rows),
m11 = rep(NA,num_rows),
m12 = rep(NA,num_rows),
event = rep(NA,num_rows)
)
##Fix dates when in df
df[(df$date1 > df$date2), "date1"] <- df[(df$date1 > df$date2), "date2"] - df[(df$date1 > df$date2), "randval"]
#--> Sorted month counts ####
for (n in 1:dim(df)[1]){
use_date1 <- df$date1[n]
use_date2 <- df$date2[n]
# Generate a sequence of dates at daily intervals between date1 and date2
date_sequence <- seq(use_date1 , use_date2 , by = "1 day")
# Assuming months_vector is the result from the previous code
months_vector <- format(date_sequence, "%B")
# Define the custom order of months
custom_order <- data.frame(
Month = c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")
, QtyDays = c(31 , 28.25 , 31 , 30 , 31 , 30 ,31 , 31 , 30 , 31 , 30 ,31)
)
tmp <- merge(custom_order
, table(months_vector) %>% as.data.frame()
,by.x="Month", "months_vector"
, all=TRUE)
tmp$count <- tmp$Freq/tmp$QtyDays
# Order the dataframe by the custom order of the "Month" column
tmp <- tmp[order(match(tmp$Month, c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"))), ]
tmp <- tmp %>% mutate(count = ifelse(is.na(count), 0, count))
tmp_vec <- tmp$count
df[n,names(df)[names(df) %>% str_detect(START %R% "m" %R% DGT)]] <- tmp_vec
}
#--> Checking if it event occurred ####
df_m <- df[,names(df)[names(df) %>% str_detect(START %R% "m" %R% DGT)]]
event_vec <- c(0.05, 0.05 , 0.1 , 0.4 , 0.7 , 0.4 ,0.2 , 0.15 , 0.05 , 0.02 , 0.02 ,0.01)
event_chance <- data.frame(Month = c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")
, prob_event = event_vec
)
for (n in 1:dim(df_m)[1]){
df$event[n] <- 1 - prod( (1-event_chance$prob_event)^df_m[n,] )
}
#--> Add "basic" io case outside of stan ####
df$runif <- runif(n=dim(df)[1], min=0, max=1)
df$event_io <- ifelse(df$runif <= df$event, yes=1, no=0)
sdata <- list(
name = "generated"
, N = df %>% dim() %>% pluck(1)
, m01 = df %>% select(m01) %>% pluck(1)
, m02 = df %>% select(m02) %>% pluck(1)
, m03 = df %>% select(m03) %>% pluck(1)
, m04 = df %>% select(m04) %>% pluck(1)
, m05 = df %>% select(m05) %>% pluck(1)
, m06 = df %>% select(m06) %>% pluck(1)
, m07 = df %>% select(m07) %>% pluck(1)
, m08 = df %>% select(m08) %>% pluck(1)
, m09 = df %>% select(m09) %>% pluck(1)
, m10 = df %>% select(m10) %>% pluck(1)
, m11 = df %>% select(m11) %>% pluck(1)
, m12 = df %>% select(m12) %>% pluck(1)
, event = df %>% select(prob_moulted) %>% pluck(1)
, event_io = df %>% select(prob_moulted_io) %>% pluck(1)
)
sdata %>% str()
#####