rm(list = ls())
set.seed(2024)
library(tidyverse)
library(tidymodels)
library(phyloseq)
library(here)
library(skimr)
library(patchwork)
theme_set(theme_bw())
# load data
ps.rare <- readRDS(here('data','following_study','ps_rarefied.rds')) %>%
filter_taxa(function(x) sum(x > 0)/length(x) > 0.1, prune = TRUE)
In this section, we want to try to use machine learning models to predict the epilepsy status of the dogs based on their microbiome data. Here we are using the rarefied data with each taxa has at least 10% prevalence in our data.
Prediction accuracy is not only affected by the model but also by the structure of the data. We used three common models in microbial study: logistic regression, random forest, and support vector machine.
For each model, we tried to use:
microbial data at phylum level
microbial data at ASV level
dimension reduced microbial data by PCoA
method
We used 70% data to train models and 30% data for testing. All models are trained and tested using 10-fold cross-validation repeated 10 times. The best model is selected based on the accuracy metric. As model performance varies, each model is trained and tested 30 times, and the mean and standard deviation of the accuracy are reported.
ASV level
ps.rare.asv <- ps.rare
sam <- data.frame(sample_data(ps.rare.asv)) %>%
dplyr::select(Epileptic.or.Control, Household)
otu <- data.frame(otu_table(ps.rare.asv))
# combine data to a data frame
ps.data <- bind_cols(sam, otu)
data_split <- group_initial_split(ps.data, group = 'Household', prop = 0.7)
Recipe <- recipe(Epileptic.or.Control ~ ., data = training(data_split)) %>%
update_role(Household, new_role = 'group variable')
Recipe
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 322
## group variable: 1
Phylum level
ps.rare.phylum <- ps.rare %>%
tax_glom(taxrank = 'Phylum')
sam <- data.frame(sample_data(ps.rare.phylum)) %>%
dplyr::select(Epileptic.or.Control, Household)
otu <- data.frame(otu_table(ps.rare.phylum))
# combine data to a data frame
ps.data <- bind_cols(sam, otu)
data_split <- group_initial_split(ps.data, group = 'Household', prop = 0.7)
Recipe <- recipe(Epileptic.or.Control ~ ., data = training(data_split)) %>%
update_role(Household, new_role = 'group variable')
Recipe
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 7
## group variable: 1
PCoA
pcoa <- ps.rare %>%
ordinate(method = 'PCoA', distance = 'bray')
plot_scree(pcoa)
n.vecter <- 30
pcoa$values$Cum_corr_eig[n.vecter]
## [1] 0.6925963
All model training script are stored in
code/machine-learning-models folder.
All model performance results are stored in
data/following_study/ml-performance folder.
model.performance <- list.files(here('data', 'following_study', 'ml-performance'),
pattern = '.csv',
full.names = TRUE) %>% map_dfr(read.csv)
model.performance %>%
ggplot(aes(x = data, y = accuracy, fill = data)) +
geom_boxplot() +
geom_jitter(width = 0.2, alpha = 0.5) +
labs(x = 'Data', y = 'Accuracy') +
theme(legend.position = 'none')
ggsave(here('figures', 'Figure6.png'))
## Saving 7 x 5 in image
ggsave(here('figures', 'Figure6.pdf'))
## Saving 7 x 5 in image
model.performance.summary <- model.performance %>%
group_by(method, data) %>%
summarise(mean = mean(accuracy), sd = sd(accuracy))
## `summarise()` has grouped output by 'method'. You can override using the
## `.groups` argument.
model.performance.summary
## # A tibble: 3 × 4
## # Groups: method [1]
## method data mean sd
## <chr> <chr> <dbl> <dbl>
## 1 random forest ASV 0.547 0.0629
## 2 random forest PCoA 0.588 0.0899
## 3 random forest phylum 0.562 0.0572
anova(lm(accuracy ~ data, data = model.performance))
## Analysis of Variance Table
##
## Response: accuracy
## Df Sum Sq Mean Sq F value Pr(>F)
## data 2 0.02585 0.0129259 2.5326 0.0853 .
## Residuals 87 0.44404 0.0051039
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1