rm(list = ls())
set.seed(2024)
library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.5.2

## Warning: package 'tibble' was built under R version 4.5.2

## Warning: package 'tidyr' was built under R version 4.5.2

## Warning: package 'readr' was built under R version 4.5.2

## Warning: package 'purrr' was built under R version 4.5.2

## Warning: package 'dplyr' was built under R version 4.5.2

## Warning: package 'lubridate' was built under R version 4.5.2

library(readxl)
library(here)
library(phyloseq)
library(DECIPHER)
theme_set(theme_bw())

Read in metadata

file <- 'Epilepsy study mapping file with patient info updated 1.22.24 for microbiome analysis(fixed).xlsx'
meta_data <- read_excel(here('data', 'following_study', file), 
                        n_max = 118,
                        na = '.') %>%
    arrange(str_rank(Household, numeric = T))
# the sample id2 column was in float, we we change it to integer
str(meta_data)

## tibble [118 × 26] (S3: tbl_df/tbl/data.frame)
##  $ SampleID                                     : chr [1:118] "Netti_J1" "Netti_J47" "Netti_J2" "Netti_J62" ...
##  $ SampleID2                                    : num [1:118] 83 84 87 88 20 19 101 102 61 62 ...
##  $ i7 Index                                     : num [1:118] 701 706 701 710 701 702 701 704 701 702 ...
##  $ i7 sequence                                  : chr [1:118] "TAAGGCGA" "TAGGCATG" "TAAGGCGA" "CGAGGCTG" ...
##  $ i5 Index                                     : num [1:118] 502 510 503 508 505 506 506 503 507 505 ...
##  $ i5 Sequence                                  : chr [1:118] "CTCTCTAT" "CGTCTAAT" "TATCCTCT" "CTAAGCCT" ...
##  $ Bag                                          : num [1:118] 42 42 44 44 10 10 51 51 31 31 ...
##  $ Tube                                         : num [1:118] 1 2 1 2 2 1 1 2 1 2 ...
##  $ Dog Name                                     : chr [1:118] "Annie" "Cody" "Rachel" "Sue" ...
##  $ Breed                                        : chr [1:118] "Cattle Dog Mix" "Soft Coated Wheaton" "Miniature Poodle" "Miniature Poodle" ...
##  $ Predominant Breed                            : chr [1:118] "Mixed Breed" "Soft Coated Wheaton" "Poodle" "Poodle" ...
##  $ Secondary Breed                              : chr [1:118] "Cattle Dog" NA "Miniature" "Miniature" ...
##  $ Breed Group (1)                              : chr [1:118] "Herder" "Terrier" "Pointer Spaniel" "Pointer Spaniel" ...
##  $ Breed Group (2)                              : chr [1:118] NA NA NA NA ...
##  $ Breed Group (3)                              : chr [1:118] NA NA NA NA ...
##  $ Age (months)                                 : num [1:118] 24 84 66 24 12 72 72 48 96 108 ...
##  $ Sex                                          : chr [1:118] "FS" "MN" "FS" "FS" ...
##  $ Household                                    : num [1:118] 1 1 2 2 3 3 4 4 5 5 ...
##  $ Household: Pheno/drug naïve                  : chr [1:118] "pheno" "pheno" "pheno" "pheno" ...
##  $ Epileptic or Control                         : chr [1:118] "Control" "Epileptic" "Epileptic" "Control" ...
##  $ Pheno Y/N                                    : chr [1:118] NA "Yes" "Yes" NA ...
##  $ Seizure Freedom (Y/N)                        : chr [1:118] NA "Yes" "No" NA ...
##  $ Seizure Control (Satisfactory/Unsatisfactory): chr [1:118] NA "S" "US" NA ...
##  $ State                                        : chr [1:118] "Colorado" "Colorado" "Illinois" "Illinois" ...
##  $ Zip Code                                     : chr [1:118] "80015" "80015" "62704" "62704" ...
##  $ COMMENTS                                     : chr [1:118] NA NA NA NA ...

meta_data$Household <- meta_data$Household %>% 
    as.character()
meta_data$`Age (months)` <- meta_data$`Age (months)` %>%
    str_remove('\\.0') %>% 
    as.numeric()
meta_data$`Zip Code` <- meta_data$`Zip Code` %>%
    str_remove('\\.0') %>%
    str_replace('O','0')
# remove comments row in excel
meta_data <- meta_data %>%
  dplyr::select(-COMMENTS) %>%
  data.frame()

meta_data <- meta_data %>% add_column(study = 'present')
meta_data <- meta_data %>% 
    arrange(str_rank(Household, numeric = T), Epileptic.or.Control)

Sex column contains reproductive status. Here, separate them into two columns

meta_data <- meta_data %>%
    separate(Sex, into = c("Sex", "Reproductive.Status"), sep = 1) %>% 
    mutate(Spayed.or.Neutered = (Reproductive.Status != 'I'))

Pheno.Y.N. indicates whether the dogs are exposed to phenobarbital or not. Here assign NA to No.

meta_data$Pheno.Y.N[is.na(meta_data$Pheno.Y.N)] <- 'No'

Set Epileptic.or.Control as factor

meta_data$Epileptic.or.Control <- factor(meta_data$Epileptic.or.Control, levels = c('Control','Epileptic'))

Store Data to Phyloseq

Store the metadata, sequence table, and taxonomy table into a phyloseq object

# match sample name same as that in the asv table
rownames(meta_data) <- 'Netti' %>%
    str_c(sprintf("%03d", as.integer(meta_data$SampleID2)))
st <- readRDS(here('data','following_study','st.rds')) # sequence table
taxa <- readRDS(here('data','following_study','taxsp.rds')) # taxonomy table

# match st order with metadata (since metadata is sorted by household)
st <- st[meta_data$SampleID2,]
if(!identical(nrow(meta_data), nrow(st))) stop('meta data does not match with sequence table')
if(!identical(rownames(meta_data), rownames(st))) stop('rownames in meta data does not match with sequence table')
# create a phyloseq object
ps <- phyloseq(sample_data(meta_data),
               otu_table(st, taxa_are_rows = FALSE),
               tax_table(taxa))

# Add ASV sequences
seqs <- Biostrings::DNAStringSet(taxa_names(ps))
names(seqs) <- taxa_names(ps)
ps <- merge_phyloseq(ps, seqs)
taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))

Quality Assurance

Do some QA on the sequencing libraries:

plot(rowSums(st), log="y")

which(rowSums(st) < 2e+3)

## Netti108 Netti080 Netti040 
##       58       89      107

which(rowSums(st) < 100)

## named integer(0)

All of the data has more than 100 reads, but sample 40,80 and 108 have relative low sequencing depth.

Phylogenetic Tree construction

With the DECIPHER package, use AlignSeqs to align sequence and then use Treeline to construct a phylogenetic tree.

set.seed(2024)
alignment <- AlignSeqs(refseq(ps), processors=NULL) 
tree <- Treeline(alignment, method = "ME", processors=NULL)
WriteDendrogram(tree, file=here('data','following_study','tree.nwk'))

add phylogenetic information to pyholseq objects

tree <- read_tree(here('data','following_study','tree.nwk'))
ps <- merge_phyloseq(ps, phy_tree(tree))
plot_tree(ps, method = 'treeonly')

Data Cleaning

Remove duplicated samples

Some of the dog fecal samples were measured more than once. Here, we keep the one with the most reads.

In household 52, sample 106,107,108 are duplicates of sample 103.

sort(rowSums(st)[str_c('Netti',c('103','106','107','108'))])

## Netti108 Netti107 Netti106 Netti103 
##      255   114520   133741   149299

sample 105 is the duplicate of sample 104.

sort(rowSums(st)[str_c('Netti',c('104','105'))])

## Netti104 Netti105 
##   145926   166678

So, for household 52, sample 103,104 are kept.

In household 54, sample 63,113,115 are duplicates of sample 114.

rowSums(st)[str_c('Netti',c('063','113','114','115'))] %>% sort()

## Netti113 Netti115 Netti063 Netti114 
##    97725   113687   132155   139269

sample 64,117,118 are duplicates of sample 116.

rowSums(st)[str_c('Netti',c('064','116','117','118'))] %>% sort()

## Netti116 Netti117 Netti118 Netti064 
##   117612   131987   133092   146129

So, for household 52, sample 64,114 are kept.

now we can remove sample 106,107,108,113,115,63,116,117,118 since they are duplicates with lower sequencing depth.

duplicated.samples <- c(104,106,107,108,113,115,63,116,117,118)
ps <- subset_samples(ps, !(SampleID2 %in% duplicated.samples))

Discard samples

Some of the households have more than 2 samples, we force to keep two dogs to maintain the structure of the study design.

Keep epileptic and control dogs with higher read depth.

n.samples <- table(sample_data(ps)$Household)
n.samples[n.samples != 2]

## 
## 17 18 26 
##  4  4  4

There are 4 dogs in household 17, 18, and 26.

For household 17, we keep Netti 109 and 112.

sample_data(ps) %>%
    data.frame() %>% 
    mutate(read.depth = rowSums(otu_table(ps))) %>% 
    filter(Household == 17) %>%
    dplyr::select(SampleID2, Household, Epileptic.or.Control, read.depth)

##          SampleID2 Household Epileptic.or.Control read.depth
## Netti110       110        17              Control     112540
## Netti112       112        17              Control     199017
## Netti109       109        17            Epileptic     136678
## Netti111       111        17            Epileptic     120236

For household 18, we keep Netti 049 and 050.

sample_data(ps) %>%
    data.frame() %>% 
    mutate(read.depth = rowSums(otu_table(ps))) %>% 
    filter(Household == 18) %>%
    dplyr::select(SampleID2, Household, Epileptic.or.Control, read.depth)

##          SampleID2 Household Epileptic.or.Control read.depth
## Netti009         9        18              Control     112187
## Netti049        49        18              Control     116721
## Netti010        10        18            Epileptic     129445
## Netti050        50        18            Epileptic     132397

For household 26, we keep Netti 025 and 026

sample_data(ps) %>%
    data.frame() %>% 
    mutate(read.depth = rowSums(otu_table(ps))) %>% 
    filter(Household == 26) %>%
    dplyr::select(SampleID2, Household, Epileptic.or.Control, read.depth)

##          SampleID2 Household Epileptic.or.Control read.depth
## Netti037        37        26              Control     127295
## Netti025        25        26              Control     132664
## Netti038        38        26            Epileptic     143881
## Netti026        26        26            Epileptic     152676

Remove exceeded samples to make a paired data.

remove.list <- c(110,111,9,10,37,38)
ps <- subset_samples(ps, !(SampleID2 %in% remove.list))
ps

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3339 taxa and 102 samples ]
## sample_data() Sample Data:       [ 102 samples by 28 sample variables ]
## tax_table()   Taxonomy Table:    [ 3339 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 3339 tips and 3338 internal nodes ]
## refseq()      DNAStringSet:      [ 3339 reference sequences ]

Now we have 102 samples (data are paired within 51 households)

Rarefied Data

The sequencing depth for each sample is different, we need to check and bring them to the same level.

# check distribution of sequencing depth
sample_sums(ps) %>% hist()

sample_sums(ps) %>% sort() %>% head()

## Netti040 Netti080 Netti048 Netti028 Netti032 Netti098 
##      325      718    44590    49454    59889    61569

Here, we see most of sample has sequencing depth greater than 40000, we can bring all the samples to this sequencing depth level, and drop the two samples (Netti040 and Netti080) that have low sequencing depth.

# make read depth even
ps.rarefied <- rarefy_even_depth(ps, sample.size = 44590)
ps.rarefied

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2548 taxa and 100 samples ]
## sample_data() Sample Data:       [ 100 samples by 28 sample variables ]
## tax_table()   Taxonomy Table:    [ 2548 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2548 tips and 2547 internal nodes ]
## refseq()      DNAStringSet:      [ 2548 reference sequences ]

Two samples are lost after rarefying. To keep the consistency between the rarefied and non-rarefied data and the structure of the pairwise study design, we remove pairs from household 37 and 46.

house.rm <- meta_data[c('Netti040','Netti080'),'Household']
house.rm

## [1] "46" "37"

ps <- subset_samples(ps, !(Household %in% house.rm))
# since some households removed, remove taxa that are not present in the remained sample
ps <- filter_taxa(ps, function (x) sum(x) > 0, prune = TRUE)
ps

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2879 taxa and 98 samples ]
## sample_data() Sample Data:       [ 98 samples by 28 sample variables ]
## tax_table()   Taxonomy Table:    [ 2879 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2879 tips and 2878 internal nodes ]
## refseq()      DNAStringSet:      [ 2879 reference sequences ]

ps.rarefied <- subset_samples(ps.rarefied, !(Household %in% house.rm))
ps.rarefied

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2548 taxa and 98 samples ]
## sample_data() Sample Data:       [ 98 samples by 28 sample variables ]
## tax_table()   Taxonomy Table:    [ 2548 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2548 tips and 2547 internal nodes ]
## refseq()      DNAStringSet:      [ 2548 reference sequences ]

check if data are paired by household

if (!identical(sample_data(ps)$Household[seq(1,98,2)], sample_data(ps)$Household[seq(1,98,2)+1])) stop('data is not paired')
if (!identical(sample_data(ps.rarefied)$Household[seq(1,98,2)], sample_data(ps.rarefied)$Household[seq(1,98,2)+1])) stop('data is not paired')

Save Data

check names are matched for both datasets

if (!identical(rownames(otu_table(ps)), rownames(sample_data(ps)))) stop('sample names are not matched!')
if (!identical(rownames(otu_table(ps.rarefied)), rownames(sample_data(ps.rarefied)))) stop('sample names are not matched!')

Save both dataset to the data folder.

saveRDS(ps, file = here('data','following_study','ps.rds'))
saveRDS(ps.rarefied, file = here('data','following_study','ps_rarefied.rds'))

sessioninfo::session_info()

## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.5.1 (2025-06-13)
##  os       macOS Tahoe 26.3
##  system   aarch64, darwin20
##  ui       X11
##  language (EN)
##  collate  en_US.UTF-8
##  ctype    en_US.UTF-8
##  tz       America/New_York
##  date     2026-02-21
##  pandoc   3.8.3 @ /opt/homebrew/bin/ (via rmarkdown)
##  quarto   1.8.25 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/quarto
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package      * version  date (UTC) lib source
##  ade4           1.7-23   2025-02-14 [1] CRAN (R 4.5.0)
##  ape            5.8-1    2024-12-16 [1] CRAN (R 4.5.0)
##  Biobase        2.70.0   2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
##  BiocGenerics * 0.56.0   2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
##  biomformat     1.38.0   2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
##  Biostrings   * 2.78.0   2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
##  bslib          0.10.0   2026-01-26 [1] CRAN (R 4.5.2)
##  cachem         1.1.0    2024-05-16 [1] CRAN (R 4.5.0)
##  cellranger     1.1.0    2016-07-27 [1] CRAN (R 4.5.0)
##  cli            3.6.5    2025-04-23 [1] CRAN (R 4.5.0)
##  cluster        2.1.8.2  2026-02-05 [1] CRAN (R 4.5.2)
##  codetools      0.2-20   2024-03-31 [1] CRAN (R 4.5.1)
##  crayon         1.5.3    2024-06-20 [1] CRAN (R 4.5.0)
##  data.table     1.18.2.1 2026-01-27 [1] CRAN (R 4.5.2)
##  DBI            1.2.3    2024-06-02 [1] CRAN (R 4.5.0)
##  DECIPHER     * 3.6.0    2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
##  digest         0.6.39   2025-11-19 [1] CRAN (R 4.5.2)
##  dplyr        * 1.2.0    2026-02-03 [1] CRAN (R 4.5.2)
##  evaluate       1.0.5    2025-08-27 [1] CRAN (R 4.5.0)
##  farver         2.1.2    2024-05-13 [1] CRAN (R 4.5.0)
##  fastmap        1.2.0    2024-05-15 [1] CRAN (R 4.5.0)
##  forcats      * 1.0.1    2025-09-25 [1] CRAN (R 4.5.0)
##  foreach        1.5.2    2022-02-02 [1] CRAN (R 4.5.0)
##  generics     * 0.1.4    2025-05-09 [1] CRAN (R 4.5.0)
##  ggplot2      * 4.0.2    2026-02-03 [1] CRAN (R 4.5.2)
##  glue           1.8.0    2024-09-30 [1] CRAN (R 4.5.0)
##  gtable         0.3.6    2024-10-25 [1] CRAN (R 4.5.0)
##  here         * 1.0.2    2025-09-15 [1] CRAN (R 4.5.0)
##  hms            1.1.4    2025-10-17 [1] CRAN (R 4.5.0)
##  htmltools      0.5.9    2025-12-04 [1] CRAN (R 4.5.2)
##  igraph         2.2.2    2026-02-12 [1] CRAN (R 4.5.2)
##  IRanges      * 2.44.0   2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
##  iterators      1.0.14   2022-02-05 [1] CRAN (R 4.5.0)
##  jquerylib      0.1.4    2021-04-26 [1] CRAN (R 4.5.0)
##  jsonlite       2.0.0    2025-03-27 [1] CRAN (R 4.5.0)
##  knitr          1.51     2025-12-20 [1] CRAN (R 4.5.2)
##  labeling       0.4.3    2023-08-29 [1] CRAN (R 4.5.0)
##  lattice        0.22-9   2026-02-09 [1] CRAN (R 4.5.2)
##  lifecycle      1.0.5    2026-01-08 [1] CRAN (R 4.5.2)
##  lubridate    * 1.9.5    2026-02-04 [1] CRAN (R 4.5.2)
##  magrittr       2.0.4    2025-09-12 [1] CRAN (R 4.5.0)
##  MASS           7.3-65   2025-02-28 [1] CRAN (R 4.5.1)
##  Matrix         1.7-4    2025-08-28 [1] CRAN (R 4.5.0)
##  mgcv           1.9-4    2025-11-07 [1] CRAN (R 4.5.0)
##  multtest       2.66.0   2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
##  nlme           3.1-168  2025-03-31 [1] CRAN (R 4.5.1)
##  otel           0.2.0    2025-08-29 [1] CRAN (R 4.5.0)
##  permute        0.9-10   2026-02-06 [1] CRAN (R 4.5.2)
##  phyloseq     * 1.54.0   2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
##  pillar         1.11.1   2025-09-17 [1] CRAN (R 4.5.0)
##  pkgconfig      2.0.3    2019-09-22 [1] CRAN (R 4.5.0)
##  plyr           1.8.9    2023-10-02 [1] CRAN (R 4.5.0)
##  purrr        * 1.2.1    2026-01-09 [1] CRAN (R 4.5.2)
##  R6             2.6.1    2025-02-15 [1] CRAN (R 4.5.0)
##  RColorBrewer   1.1-3    2022-04-03 [1] CRAN (R 4.5.0)
##  Rcpp           1.1.1    2026-01-10 [1] CRAN (R 4.5.2)
##  readr        * 2.2.0    2026-02-19 [1] CRAN (R 4.5.2)
##  readxl       * 1.4.5    2025-03-07 [1] CRAN (R 4.5.0)
##  reshape2       1.4.5    2025-11-12 [1] CRAN (R 4.5.0)
##  rhdf5          2.54.1   2025-12-02 [1] https://bioc-release.r-universe.dev (R 4.5.2)
##  rhdf5filters   1.22.0   2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
##  Rhdf5lib       1.32.0   2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
##  rlang          1.1.7    2026-01-09 [1] CRAN (R 4.5.2)
##  rmarkdown      2.30     2025-09-28 [1] CRAN (R 4.5.0)
##  rprojroot      2.1.1    2025-08-26 [1] CRAN (R 4.5.0)
##  rstudioapi     0.18.0   2026-01-16 [1] CRAN (R 4.5.2)
##  S4Vectors    * 0.48.0   2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
##  S7             0.2.1    2025-11-14 [1] CRAN (R 4.5.2)
##  sass           0.4.10   2025-04-11 [1] CRAN (R 4.5.0)
##  scales         1.4.0    2025-04-24 [1] CRAN (R 4.5.0)
##  Seqinfo      * 1.0.0    2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
##  sessioninfo    1.2.3    2025-02-05 [1] CRAN (R 4.5.0)
##  stringi        1.8.7    2025-03-27 [1] CRAN (R 4.5.0)
##  stringr      * 1.6.0    2025-11-04 [1] CRAN (R 4.5.0)
##  survival       3.8-6    2026-01-16 [1] CRAN (R 4.5.2)
##  tibble       * 3.3.1    2026-01-11 [1] CRAN (R 4.5.2)
##  tidyr        * 1.3.2    2025-12-19 [1] CRAN (R 4.5.2)
##  tidyselect     1.2.1    2024-03-11 [1] CRAN (R 4.5.0)
##  tidyverse    * 2.0.0    2023-02-22 [1] CRAN (R 4.5.0)
##  timechange     0.4.0    2026-01-29 [1] CRAN (R 4.5.2)
##  tzdb           0.5.0    2025-03-15 [1] CRAN (R 4.5.0)
##  vctrs          0.7.1    2026-01-23 [1] CRAN (R 4.5.2)
##  vegan          2.7-2    2025-10-08 [1] CRAN (R 4.5.0)
##  withr          3.0.2    2024-10-28 [1] CRAN (R 4.5.0)
##  xfun           0.56     2026-01-18 [1] CRAN (R 4.5.2)
##  XVector      * 0.50.0   2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
##  yaml           2.3.12   2025-12-10 [1] CRAN (R 4.5.2)
## 
##  [1] /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library
##  * ── Packages attached to the search path.
## 
## ──────────────────────────────────────────────────────────────────────────────

Metadata Cleaning

Yixuan Yang