rm(list = ls())
set.seed(2024)
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
library(readxl)
library(here)
library(phyloseq)
library(DECIPHER)
theme_set(theme_bw())
file <- 'Epilepsy study mapping file with patient info updated 1.22.24 for microbiome analysis(fixed).xlsx'
meta_data <- read_excel(here('data', 'following_study', file),
n_max = 118,
na = '.') %>%
arrange(str_rank(Household, numeric = T))
# the sample id2 column was in float, we we change it to integer
str(meta_data)
## tibble [118 × 26] (S3: tbl_df/tbl/data.frame)
## $ SampleID : chr [1:118] "Netti_J1" "Netti_J47" "Netti_J2" "Netti_J62" ...
## $ SampleID2 : num [1:118] 83 84 87 88 20 19 101 102 61 62 ...
## $ i7 Index : num [1:118] 701 706 701 710 701 702 701 704 701 702 ...
## $ i7 sequence : chr [1:118] "TAAGGCGA" "TAGGCATG" "TAAGGCGA" "CGAGGCTG" ...
## $ i5 Index : num [1:118] 502 510 503 508 505 506 506 503 507 505 ...
## $ i5 Sequence : chr [1:118] "CTCTCTAT" "CGTCTAAT" "TATCCTCT" "CTAAGCCT" ...
## $ Bag : num [1:118] 42 42 44 44 10 10 51 51 31 31 ...
## $ Tube : num [1:118] 1 2 1 2 2 1 1 2 1 2 ...
## $ Dog Name : chr [1:118] "Annie" "Cody" "Rachel" "Sue" ...
## $ Breed : chr [1:118] "Cattle Dog Mix" "Soft Coated Wheaton" "Miniature Poodle" "Miniature Poodle" ...
## $ Predominant Breed : chr [1:118] "Mixed Breed" "Soft Coated Wheaton" "Poodle" "Poodle" ...
## $ Secondary Breed : chr [1:118] "Cattle Dog" NA "Miniature" "Miniature" ...
## $ Breed Group (1) : chr [1:118] "Herder" "Terrier" "Pointer Spaniel" "Pointer Spaniel" ...
## $ Breed Group (2) : chr [1:118] NA NA NA NA ...
## $ Breed Group (3) : chr [1:118] NA NA NA NA ...
## $ Age (months) : num [1:118] 24 84 66 24 12 72 72 48 96 108 ...
## $ Sex : chr [1:118] "FS" "MN" "FS" "FS" ...
## $ Household : num [1:118] 1 1 2 2 3 3 4 4 5 5 ...
## $ Household: Pheno/drug naïve : chr [1:118] "pheno" "pheno" "pheno" "pheno" ...
## $ Epileptic or Control : chr [1:118] "Control" "Epileptic" "Epileptic" "Control" ...
## $ Pheno Y/N : chr [1:118] NA "Yes" "Yes" NA ...
## $ Seizure Freedom (Y/N) : chr [1:118] NA "Yes" "No" NA ...
## $ Seizure Control (Satisfactory/Unsatisfactory): chr [1:118] NA "S" "US" NA ...
## $ State : chr [1:118] "Colorado" "Colorado" "Illinois" "Illinois" ...
## $ Zip Code : chr [1:118] "80015" "80015" "62704" "62704" ...
## $ COMMENTS : chr [1:118] NA NA NA NA ...
meta_data$Household <- meta_data$Household %>%
as.character()
meta_data$`Age (months)` <- meta_data$`Age (months)` %>%
str_remove('\\.0') %>%
as.numeric()
meta_data$`Zip Code` <- meta_data$`Zip Code` %>%
str_remove('\\.0') %>%
str_replace('O','0')
# remove comments row in excel
meta_data <- meta_data %>%
dplyr::select(-COMMENTS) %>%
data.frame()
meta_data <- meta_data %>% add_column(study = 'present')
meta_data <- meta_data %>%
arrange(str_rank(Household, numeric = T), Epileptic.or.Control)
Sex column contains reproductive status. Here, separate them into two columns
meta_data <- meta_data %>%
separate(Sex, into = c("Sex", "Reproductive.Status"), sep = 1) %>%
mutate(Spayed.or.Neutered = (Reproductive.Status != 'I'))
Pheno.Y.N. indicates whether the dogs are exposed to phenobarbital or not. Here assign NA to No.
meta_data$Pheno.Y.N[is.na(meta_data$Pheno.Y.N)] <- 'No'
Set Epileptic.or.Control as factor
meta_data$Epileptic.or.Control <- factor(meta_data$Epileptic.or.Control, levels = c('Control','Epileptic'))
Store the metadata, sequence table, and taxonomy table into a phyloseq object
# match sample name same as that in the asv table
rownames(meta_data) <- 'Netti' %>%
str_c(sprintf("%03d", as.integer(meta_data$SampleID2)))
st <- readRDS(here('data','following_study','st.rds')) # sequence table
taxa <- readRDS(here('data','following_study','taxsp.rds')) # taxonomy table
# match st order with metadata (since metadata is sorted by household)
st <- st[meta_data$SampleID2,]
if(!identical(nrow(meta_data), nrow(st))) stop('meta data does not match with sequence table')
if(!identical(rownames(meta_data), rownames(st))) stop('rownames in meta data does not match with sequence table')
# create a phyloseq object
ps <- phyloseq(sample_data(meta_data),
otu_table(st, taxa_are_rows = FALSE),
tax_table(taxa))
# Add ASV sequences
seqs <- Biostrings::DNAStringSet(taxa_names(ps))
names(seqs) <- taxa_names(ps)
ps <- merge_phyloseq(ps, seqs)
taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))
Do some QA on the sequencing libraries:
plot(rowSums(st), log="y")
which(rowSums(st) < 2e+3)
## Netti108 Netti080 Netti040
## 58 89 107
which(rowSums(st) < 100)
## named integer(0)
All of the data has more than 100 reads, but sample 40,80 and 108 have relative low sequencing depth.
With the DECIPHER package, use AlignSeqs to
align sequence and then use Treeline to construct a
phylogenetic tree.
set.seed(2024)
alignment <- AlignSeqs(refseq(ps), processors=NULL)
tree <- Treeline(alignment, method = "ME", processors=NULL)
WriteDendrogram(tree, file=here('data','following_study','tree.nwk'))
add phylogenetic information to pyholseq objects
tree <- read_tree(here('data','following_study','tree.nwk'))
ps <- merge_phyloseq(ps, phy_tree(tree))
plot_tree(ps, method = 'treeonly')
Some of the dog fecal samples were measured more than once. Here, we keep the one with the most reads.
In household 52, sample 106,107,108 are duplicates of sample 103.
sort(rowSums(st)[str_c('Netti',c('103','106','107','108'))])
## Netti108 Netti107 Netti106 Netti103
## 255 114520 133741 149299
sample 105 is the duplicate of sample 104.
sort(rowSums(st)[str_c('Netti',c('104','105'))])
## Netti104 Netti105
## 145926 166678
So, for household 52, sample 103,104 are kept.
In household 54, sample 63,113,115 are duplicates of sample 114.
rowSums(st)[str_c('Netti',c('063','113','114','115'))] %>% sort()
## Netti113 Netti115 Netti063 Netti114
## 97725 113687 132155 139269
sample 64,117,118 are duplicates of sample 116.
rowSums(st)[str_c('Netti',c('064','116','117','118'))] %>% sort()
## Netti116 Netti117 Netti118 Netti064
## 117612 131987 133092 146129
So, for household 52, sample 64,114 are kept.
now we can remove sample 106,107,108,113,115,63,116,117,118 since they are duplicates with lower sequencing depth.
duplicated.samples <- c(104,106,107,108,113,115,63,116,117,118)
ps <- subset_samples(ps, !(SampleID2 %in% duplicated.samples))
Some of the households have more than 2 samples, we force to keep two dogs to maintain the structure of the study design.
Keep epileptic and control dogs with higher read depth.
n.samples <- table(sample_data(ps)$Household)
n.samples[n.samples != 2]
##
## 17 18 26
## 4 4 4
There are 4 dogs in household 17, 18, and 26.
For household 17, we keep Netti 109 and 112.
sample_data(ps) %>%
data.frame() %>%
mutate(read.depth = rowSums(otu_table(ps))) %>%
filter(Household == 17) %>%
dplyr::select(SampleID2, Household, Epileptic.or.Control, read.depth)
## SampleID2 Household Epileptic.or.Control read.depth
## Netti110 110 17 Control 112540
## Netti112 112 17 Control 199017
## Netti109 109 17 Epileptic 136678
## Netti111 111 17 Epileptic 120236
For household 18, we keep Netti 049 and 050.
sample_data(ps) %>%
data.frame() %>%
mutate(read.depth = rowSums(otu_table(ps))) %>%
filter(Household == 18) %>%
dplyr::select(SampleID2, Household, Epileptic.or.Control, read.depth)
## SampleID2 Household Epileptic.or.Control read.depth
## Netti009 9 18 Control 112187
## Netti049 49 18 Control 116721
## Netti010 10 18 Epileptic 129445
## Netti050 50 18 Epileptic 132397
For household 26, we keep Netti 025 and 026
sample_data(ps) %>%
data.frame() %>%
mutate(read.depth = rowSums(otu_table(ps))) %>%
filter(Household == 26) %>%
dplyr::select(SampleID2, Household, Epileptic.or.Control, read.depth)
## SampleID2 Household Epileptic.or.Control read.depth
## Netti037 37 26 Control 127295
## Netti025 25 26 Control 132664
## Netti038 38 26 Epileptic 143881
## Netti026 26 26 Epileptic 152676
Remove exceeded samples to make a paired data.
remove.list <- c(110,111,9,10,37,38)
ps <- subset_samples(ps, !(SampleID2 %in% remove.list))
ps
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 3339 taxa and 102 samples ]
## sample_data() Sample Data: [ 102 samples by 28 sample variables ]
## tax_table() Taxonomy Table: [ 3339 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 3339 tips and 3338 internal nodes ]
## refseq() DNAStringSet: [ 3339 reference sequences ]
Now we have 102 samples (data are paired within 51 households)
The sequencing depth for each sample is different, we need to check and bring them to the same level.
# check distribution of sequencing depth
sample_sums(ps) %>% hist()
sample_sums(ps) %>% sort() %>% head()
## Netti040 Netti080 Netti048 Netti028 Netti032 Netti098
## 325 718 44590 49454 59889 61569
Here, we see most of sample has sequencing depth greater than 40000, we can bring all the samples to this sequencing depth level, and drop the two samples (Netti040 and Netti080) that have low sequencing depth.
# make read depth even
ps.rarefied <- rarefy_even_depth(ps, sample.size = 44590)
ps.rarefied
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2548 taxa and 100 samples ]
## sample_data() Sample Data: [ 100 samples by 28 sample variables ]
## tax_table() Taxonomy Table: [ 2548 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 2548 tips and 2547 internal nodes ]
## refseq() DNAStringSet: [ 2548 reference sequences ]
Two samples are lost after rarefying. To keep the consistency between the rarefied and non-rarefied data and the structure of the pairwise study design, we remove pairs from household 37 and 46.
house.rm <- meta_data[c('Netti040','Netti080'),'Household']
house.rm
## [1] "46" "37"
ps <- subset_samples(ps, !(Household %in% house.rm))
# since some households removed, remove taxa that are not present in the remained sample
ps <- filter_taxa(ps, function (x) sum(x) > 0, prune = TRUE)
ps
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2879 taxa and 98 samples ]
## sample_data() Sample Data: [ 98 samples by 28 sample variables ]
## tax_table() Taxonomy Table: [ 2879 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 2879 tips and 2878 internal nodes ]
## refseq() DNAStringSet: [ 2879 reference sequences ]
ps.rarefied <- subset_samples(ps.rarefied, !(Household %in% house.rm))
ps.rarefied
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2548 taxa and 98 samples ]
## sample_data() Sample Data: [ 98 samples by 28 sample variables ]
## tax_table() Taxonomy Table: [ 2548 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 2548 tips and 2547 internal nodes ]
## refseq() DNAStringSet: [ 2548 reference sequences ]
check if data are paired by household
if (!identical(sample_data(ps)$Household[seq(1,98,2)], sample_data(ps)$Household[seq(1,98,2)+1])) stop('data is not paired')
if (!identical(sample_data(ps.rarefied)$Household[seq(1,98,2)], sample_data(ps.rarefied)$Household[seq(1,98,2)+1])) stop('data is not paired')
check names are matched for both datasets
if (!identical(rownames(otu_table(ps)), rownames(sample_data(ps)))) stop('sample names are not matched!')
if (!identical(rownames(otu_table(ps.rarefied)), rownames(sample_data(ps.rarefied)))) stop('sample names are not matched!')
Save both dataset to the data folder.
saveRDS(ps, file = here('data','following_study','ps.rds'))
saveRDS(ps.rarefied, file = here('data','following_study','ps_rarefied.rds'))
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.5.1 (2025-06-13)
## os macOS Tahoe 26.3
## system aarch64, darwin20
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/New_York
## date 2026-02-21
## pandoc 3.8.3 @ /opt/homebrew/bin/ (via rmarkdown)
## quarto 1.8.25 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/quarto
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## ade4 1.7-23 2025-02-14 [1] CRAN (R 4.5.0)
## ape 5.8-1 2024-12-16 [1] CRAN (R 4.5.0)
## Biobase 2.70.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## BiocGenerics * 0.56.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## biomformat 1.38.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## Biostrings * 2.78.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## bslib 0.10.0 2026-01-26 [1] CRAN (R 4.5.2)
## cachem 1.1.0 2024-05-16 [1] CRAN (R 4.5.0)
## cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.5.0)
## cli 3.6.5 2025-04-23 [1] CRAN (R 4.5.0)
## cluster 2.1.8.2 2026-02-05 [1] CRAN (R 4.5.2)
## codetools 0.2-20 2024-03-31 [1] CRAN (R 4.5.1)
## crayon 1.5.3 2024-06-20 [1] CRAN (R 4.5.0)
## data.table 1.18.2.1 2026-01-27 [1] CRAN (R 4.5.2)
## DBI 1.2.3 2024-06-02 [1] CRAN (R 4.5.0)
## DECIPHER * 3.6.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## digest 0.6.39 2025-11-19 [1] CRAN (R 4.5.2)
## dplyr * 1.2.0 2026-02-03 [1] CRAN (R 4.5.2)
## evaluate 1.0.5 2025-08-27 [1] CRAN (R 4.5.0)
## farver 2.1.2 2024-05-13 [1] CRAN (R 4.5.0)
## fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.5.0)
## forcats * 1.0.1 2025-09-25 [1] CRAN (R 4.5.0)
## foreach 1.5.2 2022-02-02 [1] CRAN (R 4.5.0)
## generics * 0.1.4 2025-05-09 [1] CRAN (R 4.5.0)
## ggplot2 * 4.0.2 2026-02-03 [1] CRAN (R 4.5.2)
## glue 1.8.0 2024-09-30 [1] CRAN (R 4.5.0)
## gtable 0.3.6 2024-10-25 [1] CRAN (R 4.5.0)
## here * 1.0.2 2025-09-15 [1] CRAN (R 4.5.0)
## hms 1.1.4 2025-10-17 [1] CRAN (R 4.5.0)
## htmltools 0.5.9 2025-12-04 [1] CRAN (R 4.5.2)
## igraph 2.2.2 2026-02-12 [1] CRAN (R 4.5.2)
## IRanges * 2.44.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## iterators 1.0.14 2022-02-05 [1] CRAN (R 4.5.0)
## jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.5.0)
## jsonlite 2.0.0 2025-03-27 [1] CRAN (R 4.5.0)
## knitr 1.51 2025-12-20 [1] CRAN (R 4.5.2)
## labeling 0.4.3 2023-08-29 [1] CRAN (R 4.5.0)
## lattice 0.22-9 2026-02-09 [1] CRAN (R 4.5.2)
## lifecycle 1.0.5 2026-01-08 [1] CRAN (R 4.5.2)
## lubridate * 1.9.5 2026-02-04 [1] CRAN (R 4.5.2)
## magrittr 2.0.4 2025-09-12 [1] CRAN (R 4.5.0)
## MASS 7.3-65 2025-02-28 [1] CRAN (R 4.5.1)
## Matrix 1.7-4 2025-08-28 [1] CRAN (R 4.5.0)
## mgcv 1.9-4 2025-11-07 [1] CRAN (R 4.5.0)
## multtest 2.66.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## nlme 3.1-168 2025-03-31 [1] CRAN (R 4.5.1)
## otel 0.2.0 2025-08-29 [1] CRAN (R 4.5.0)
## permute 0.9-10 2026-02-06 [1] CRAN (R 4.5.2)
## phyloseq * 1.54.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## pillar 1.11.1 2025-09-17 [1] CRAN (R 4.5.0)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.5.0)
## plyr 1.8.9 2023-10-02 [1] CRAN (R 4.5.0)
## purrr * 1.2.1 2026-01-09 [1] CRAN (R 4.5.2)
## R6 2.6.1 2025-02-15 [1] CRAN (R 4.5.0)
## RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.5.0)
## Rcpp 1.1.1 2026-01-10 [1] CRAN (R 4.5.2)
## readr * 2.2.0 2026-02-19 [1] CRAN (R 4.5.2)
## readxl * 1.4.5 2025-03-07 [1] CRAN (R 4.5.0)
## reshape2 1.4.5 2025-11-12 [1] CRAN (R 4.5.0)
## rhdf5 2.54.1 2025-12-02 [1] https://bioc-release.r-universe.dev (R 4.5.2)
## rhdf5filters 1.22.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## Rhdf5lib 1.32.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## rlang 1.1.7 2026-01-09 [1] CRAN (R 4.5.2)
## rmarkdown 2.30 2025-09-28 [1] CRAN (R 4.5.0)
## rprojroot 2.1.1 2025-08-26 [1] CRAN (R 4.5.0)
## rstudioapi 0.18.0 2026-01-16 [1] CRAN (R 4.5.2)
## S4Vectors * 0.48.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## S7 0.2.1 2025-11-14 [1] CRAN (R 4.5.2)
## sass 0.4.10 2025-04-11 [1] CRAN (R 4.5.0)
## scales 1.4.0 2025-04-24 [1] CRAN (R 4.5.0)
## Seqinfo * 1.0.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## sessioninfo 1.2.3 2025-02-05 [1] CRAN (R 4.5.0)
## stringi 1.8.7 2025-03-27 [1] CRAN (R 4.5.0)
## stringr * 1.6.0 2025-11-04 [1] CRAN (R 4.5.0)
## survival 3.8-6 2026-01-16 [1] CRAN (R 4.5.2)
## tibble * 3.3.1 2026-01-11 [1] CRAN (R 4.5.2)
## tidyr * 1.3.2 2025-12-19 [1] CRAN (R 4.5.2)
## tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.5.0)
## tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.5.0)
## timechange 0.4.0 2026-01-29 [1] CRAN (R 4.5.2)
## tzdb 0.5.0 2025-03-15 [1] CRAN (R 4.5.0)
## vctrs 0.7.1 2026-01-23 [1] CRAN (R 4.5.2)
## vegan 2.7-2 2025-10-08 [1] CRAN (R 4.5.0)
## withr 3.0.2 2024-10-28 [1] CRAN (R 4.5.0)
## xfun 0.56 2026-01-18 [1] CRAN (R 4.5.2)
## XVector * 0.50.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## yaml 2.3.12 2025-12-10 [1] CRAN (R 4.5.2)
##
## [1] /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library
## * ── Packages attached to the search path.
##
## ──────────────────────────────────────────────────────────────────────────────