Set Up

rm(list = ls())
set.seed(2024)
library(tidyverse)
library(tidymodels)
library(phyloseq)
library(here)
library(skimr)
library(patchwork)
theme_set(theme_bw())
# load data
ps.rare <- readRDS(here('data','following_study','ps_rarefied.rds')) %>% 
  filter_taxa(function(x) sum(x > 0)/length(x) > 0.1, prune = TRUE)

Start

In this section, we want to try to use machine learning models to predict the epilepsy status of the dogs based on their microbiome data. Here we are using the rarefied data with each taxa has at least 10% prevalence in our data.

Prediction accuracy is not only affected by the model but also by the structure of the data. We used three common models in microbial study: logistic regression, random forest, and support vector machine.

For each model, we tried to use:

  1. microbial data at phylum level

  2. microbial data at ASV level

  3. dimension reduced microbial data by PCoA method

We used 70% data to train models and 30% data for testing. All models are trained and tested using 10-fold cross-validation repeated 10 times. The best model is selected based on the accuracy metric. As model performance varies, each model is trained and tested 30 times, and the mean and standard deviation of the accuracy are reported.

Data Preprocess

ASV level

ps.rare.asv <- ps.rare
sam <- data.frame(sample_data(ps.rare.asv)) %>% 
    dplyr::select(Epileptic.or.Control, Household)
otu <- data.frame(otu_table(ps.rare.asv))
# combine data to a data frame
ps.data <- bind_cols(sam, otu)
data_split <- group_initial_split(ps.data, group = 'Household', prop = 0.7)
Recipe <- recipe(Epileptic.or.Control ~ ., data = training(data_split)) %>%
        update_role(Household, new_role = 'group variable')
Recipe
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:          1
## predictor:      322
## group variable:   1

Phylum level

ps.rare.phylum <- ps.rare  %>% 
    tax_glom(taxrank = 'Phylum')
sam <- data.frame(sample_data(ps.rare.phylum)) %>% 
    dplyr::select(Epileptic.or.Control, Household)
otu <- data.frame(otu_table(ps.rare.phylum))
# combine data to a data frame
ps.data <- bind_cols(sam, otu)
data_split <- group_initial_split(ps.data, group = 'Household', prop = 0.7)
Recipe <- recipe(Epileptic.or.Control ~ ., data = training(data_split)) %>%
        update_role(Household, new_role = 'group variable') 
Recipe
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:        1
## predictor:      7
## group variable: 1

PCoA

pcoa <- ps.rare %>% 
  ordinate(method = 'PCoA', distance = 'bray')
plot_scree(pcoa)

n.vecter <- 30
pcoa$values$Cum_corr_eig[n.vecter]
## [1] 0.6925963

Model Training and Testing

All model training script are stored in code/machine-learning-models folder.

All model performance results are stored in data/following_study/ml-performance folder.

Results

model.performance <- list.files(here('data', 'following_study', 'ml-performance'),
                                pattern = '.csv', 
                                full.names = TRUE) %>% map_dfr(read.csv)
model.performance %>% 
  ggplot(aes(x = data, y = accuracy, fill = data)) +
  geom_boxplot() +
  geom_jitter(width = 0.2, alpha = 0.5) +
  labs(x = 'Data', y = 'Accuracy') +
  theme(legend.position = 'none')

ggsave(here('figures', 'Figure6.png'))
## Saving 7 x 5 in image
ggsave(here('figures', 'Figure6.pdf'))
## Saving 7 x 5 in image
model.performance.summary <- model.performance %>% 
  group_by(method, data) %>% 
  summarise(mean = mean(accuracy), sd = sd(accuracy)) 
## `summarise()` has grouped output by 'method'. You can override using the
## `.groups` argument.
model.performance.summary
## # A tibble: 3 × 4
## # Groups:   method [1]
##   method        data    mean     sd
##   <chr>         <chr>  <dbl>  <dbl>
## 1 random forest ASV    0.547 0.0629
## 2 random forest PCoA   0.588 0.0899
## 3 random forest phylum 0.562 0.0572
anova(lm(accuracy ~ data, data = model.performance))
## Analysis of Variance Table
## 
## Response: accuracy
##           Df  Sum Sq   Mean Sq F value Pr(>F)  
## data       2 0.02585 0.0129259  2.5326 0.0853 .
## Residuals 87 0.44404 0.0051039                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1