library(brms) library(ggplot2) library(dplyr) library(tidyr) ###simulate some data from an uncorrelated varying intercepts and varying slopes model set.seed(987456) drug_f <- rep(letters[1:13], each=85) drug <- rep(1:13, each=85) N <- length(drug) pr <- c(runif(10, min=0.2, max=0.6), runif(5, min=0.025, max=0.15)) gene1 <- rbinom(N, 1, pr[1]) gene2 <- rbinom(N, 1, pr[2]) gene3 <- rbinom(N, 1, pr[3]) gene4 <- rbinom(N, 1, pr[4]) gene5 <- rbinom(N, 1, pr[5]) gene6 <- rbinom(N, 1, pr[6]) gene7 <- rbinom(N, 1, pr[7]) gene8 <- rbinom(N, 1, pr[8]) gene9 <- rbinom(N, 1, pr[9]) gene10 <- rbinom(N, 1, pr[10]) gene11 <- rbinom(N, 1, pr[11]) gene12 <- rbinom(N, 1, pr[12]) gene13 <- rbinom(N, 1, pr[13]) gene14 <- rbinom(N, 1, pr[14]) gene15 <- rbinom(N, 1, pr[15]) r_1 <- rt(13,3) a <- -4 r_b1 <- rt(13,3) r_b2 <- rt(13,3) r_b3 <- rt(13,3) r_b4 <- rt(13,3) r_b5 <- rt(13,3) r_b6 <- rt(13,3) r_b7 <- rt(13,3) r_b8 <- rt(13,3) r_b9 <- rt(13,3) r_b10 <- rt(13,3) r_b11 <- rt(13,3) r_b12 <- rt(13,3) r_b13 <- rt(13,3) r_b14 <- rt(13,3) r_b15 <- rt(13,3) b1 <- 2.5 b2 <- 0 b3 <- 3 b4 <- 0 b5 <- 0 b6 <- 0 b7 <- 5 b8 <- 3 b9 <- 0 b10 <- 0 b11 <- 0 b12 <- 0 b13 <- 0 b14 <- 0 b15 <- 0 p <- plogis((a + r_1[drug]) + (b1 + r_b1[drug])*gene1 + (b2 + r_b2[drug])*gene2 + (b3 + r_b3[drug])*gene3 + (b4 + r_b4[drug])*gene4 + (b5 + r_b5[drug])*gene5 + (b6 + r_b6[drug])*gene6 + (b7 + r_b7[drug])*gene7 + (b8 + r_b8[drug])*gene8 + (b9 + r_b9[drug])*gene9 + (b10 + r_b10[drug])*gene10 + (b11 + r_b11[drug])*gene11 + (b12 + r_b12[drug])*gene12 + (b13 + r_b13[drug])*gene13 + (b14 + r_b14[drug])*gene14 + (b15 + r_b15[drug])*gene15) y <- rbinom(N, 1, p) d1 <- cbind.data.frame(y, drug_f, gene1, gene2, gene3, gene4, gene5, gene6, gene7, gene8, gene9, gene10, gene11, gene12, gene13, gene14, gene15) str(d1) ###plot data d2 <- d1 %>% pivot_longer(gene1:gene15, names_to = "gene", values_to = "presence") d2$gene <- factor(d2$gene) str(d2) p1 <- ggplot(d2, aes(x=presence, y=y, colour=drug_f)) + geom_jitter(size=3, width=0.15, height=0.1) + facet_wrap(~ drug_f + gene) p1 ###run model #set up model, varying intercepts and slopes model where varying effects modeled via student distribution bf_logistic <- bf(y ~ 1 + gene1 + gene2 + gene3 + gene4 + gene5 + gene6 + gene7 + gene8 + gene9 + gene10 + gene11 + gene12 + gene13 + gene14 + gene15 + (1 + gene1 + gene2 + gene3 + gene4 + gene5 + gene6 + gene7 + gene8 + gene9 + gene10 + gene11 + gene12 + gene13 + gene14 + gene15 | gr(drug_f, dist="student"))) + bernoulli() #set up regularized hs prior for logistic model p <- 5 #guess at number of relevant variables D <- 16 #number of variables in the model (is this correct even for hierarchical model??? I assume it is number of population-level effects) sigma <- 2 #as recommended for logistic regression in slide 14 https://github.com/avehtari/modelselection/blob/master/regularizedhorseshoe_slides.pdf n <- nrow(d1) #number of observations in dataset scale_global <- (p/(D-p))*(sigma/sqrt(n)) prior1 <- c( prior(horseshoe(df = 1, scale_global = scale_global), class=b), prior(cauchy(0, 1), class=sd), prior(lkj(2), class=cor) ) m1 <- brm(bf_logistic, prior = prior1, data=d1, chains=4, iter=2000, warmup=1000, control = list(adapt_delta = 0.999, max_treedepth=13), cores=4) m1