rm(list = ls())
set.seed(2024)
library(here)
library(phyloseq)
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
library(gt)
## Warning: package 'gt' was built under R version 4.5.2
theme_set(theme_bw())
ps.rarefied <- readRDS(here('data','following_study','ps_rarefied.rds'))
ps.rarefied
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2548 taxa and 98 samples ]
## sample_data() Sample Data: [ 98 samples by 28 sample variables ]
## tax_table() Taxonomy Table: [ 2548 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 2548 tips and 2547 internal nodes ]
## refseq() DNAStringSet: [ 2548 reference sequences ]
sam <- data.frame(sample_data(ps.rarefied)) %>% rownames_to_column('Sample')
epi.dog <- sam %>% filter(Epileptic.or.Control == 'Epileptic')
ctl.dog <- sam %>% filter(Epileptic.or.Control == 'Control')
# double-check if data are paired
if (!identical(epi.dog$Household, ctl.dog$Household)) stop('data is not paired')
# dataset 1: orginal dataset
data1 <- list(epi.dog, ctl.dog)
In this section, we are trying to compare the ASV similarity between dogs from the same and different households.
To do this, we need two datasets:
the original dataset where each IE dog paired with a control dog from the same household.
a new dataset where each IE dog paired with a control dog that randomly selected from a different household.
sum(data1[[1]]$Predominant.Breed == data1[[2]]$Predominant.Breed)
## [1] 33
Most households (33/49) have dogs in same predominant breed. So, we need to control the breed effect when comparing ASV similarities.
Now, we want to pair each IE dog with a control dog that randomly selected from another household, and we will try to randomly select one (with replacement) that has the same predominate breed with the IE dog as possible.
select.dog <- function(pair1, pair2) {
diff.pair <- data.frame()
map_dfr(1:nrow(pair1), function(i) {
# for each dog in pair1, randomly select a dog from the different household
sub.data <- filter(pair2, Household != pair1[i,'Household'])
# if there's no dog in pair2 has same predominant breed as that is pair1,
# randomly select a dog from a different household
# else select one from the different household with the same predominate breed
if (nrow(filter(sub.data, Predominant.Breed == pair1[i,'Predominant.Breed'])) == 0){
selected <- sub.data %>%
sample_n(1)
} else {
selected <- sub.data %>%
filter(Predominant.Breed == pair1[i,'Predominant.Breed']) %>%
sample_n(1)
}
return(selected)
})
}
# dataset 2: paired dogs from different household
data2 <- list(epi.dog, select.dog(epi.dog, ctl.dog))
sum(data2[[1]]$Predominant.Breed == data2[[2]]$Predominant.Breed)
## [1] 31
Here, we can see with replacement, if we randomly select a dog from the control group and forcing the breed to be the same if possible, the pair with the same breed is at most 31 out of 49. Some of the households have dogs in same breed, but that breed has never appear in other households.
31 paired dogs from different households with same predominate breed is the best we can get. This is because some of the households have dogs with unique breed.
unique.breed.household <- sapply(1:nrow(epi.dog), function(i) {
# if breed in household is the same
if (data1[[1]]$Predominant.Breed[i] == data1[[2]]$Predominant.Breed[i]) {
# if the breed is not in the other households
if(!(data1[[1]]$Predominant.Breed[i] %in% data1[[2]]$Predominant.Breed[-i])) {
return(epi.dog[i,'Household'])
}
}
}) %>% unlist
unique.breed.household
## [1] "3" "9" "17" "19" "20" "21" "23" "25" "30" "48" "51"
By dropping two households (9 and 30), we can make sure that the breed effect is balanced.
droped.household <- sample(unique.breed.household, 2); droped.household
## [1] "9" "30"
# regenerate the two dataset
sub.sam <- sam %>% filter(!(Household %in% droped.household))
epi.dog <- sub.sam %>% filter(Epileptic.or.Control == 'Epileptic')
ctl.dog <- sub.sam %>% filter(Epileptic.or.Control == 'Control')
# double-check if data are paired
if (!identical(epi.dog$Household, ctl.dog$Household)) stop('data is not paired')
data1 <- list(epi.dog, ctl.dog)
data2 <- list(epi.dog, select.dog(epi.dog, ctl.dog))
sum(data1[[1]]$Predominant.Breed == data1[[2]]$Predominant.Breed)
## [1] 31
sum(data2[[1]]$Predominant.Breed == data2[[2]]$Predominant.Breed)
## [1] 31
Now, we can calculate the Jaccard similarity between IE and control dogs for the two datasets.
get.similarity <- function(ps, paired.data) {
dissimilarities <- phyloseq::distance(ps, method = 'jaccard', binary = TRUE)
dist <- as.matrix(1 - dissimilarities)
similarity <- dist[cbind(paired.data[[1]][,'Sample'], paired.data[[2]][,'Sample'])]
return(similarity)
}
similarity <- data.frame(epi.dog,
same.house.sim = get.similarity(ps.rarefied, data1),
diff.house.sim = get.similarity(ps.rarefied, data2))
Visualize the similarity between dogs from same/different households. We can see dogs from the same households have higher ASV similarities.
similarity %>%
ggplot() +
geom_point(aes(x = diff.house.sim, y = same.house.sim)) +
geom_abline(intercept = 0, slope = 1, color = "blue", linewidth = 1) +
tune::coord_obs_pred() +
labs(x = 'Jaccard Similarity (Different Household)', y = 'Jaccard Similarity (Same Household)')
## Registered S3 method overwritten by 'parsnip':
## method from
## print.nullmodel vegan
ggsave(here('figures','Figure2.png'), width = 170, units = "mm")
## Saving 170 x 127 mm image
ggsave(here('figures','Figure2.pdf'), width = 170, units = "mm")
## Saving 170 x 127 mm image
Pairwise t test
t.test(similarity$same.house.sim,
similarity$diff.house.sim,
paired = TRUE, alternative = 'greater')
##
## Paired t-test
##
## data: similarity$same.house.sim and similarity$diff.house.sim
## t = 9.2606, df = 46, p-value = 2.209e-12
## alternative hypothesis: true mean difference is greater than 0
## 95 percent confidence interval:
## 0.1078057 Inf
## sample estimates:
## mean difference
## 0.1316741
similarity %>%
select(same.house.sim, diff.house.sim) %>%
summary()
## same.house.sim diff.house.sim
## Min. :0.09479 Min. :0.08333
## 1st Qu.:0.29749 1st Qu.:0.17402
## Median :0.33761 Median :0.22283
## Mean :0.34728 Mean :0.21561
## 3rd Qu.:0.40343 3rd Qu.:0.25070
## Max. :0.59809 Max. :0.36402
Check the frequency of sharing within a household for each ASV. The expected proportion for each ASV is \(prevlance^2\). We can see most of the ASVs were shared within a household more often than expected by their study-wide prevalence.
ASV.present <- ps.rarefied %>%
transform_sample_counts(sign) %>%
psmelt()
shared.rate <- ASV.present %>%
group_by(Household, OTU) %>%
summarise(shared = sum(Abundance) == 2) %>%
group_by(OTU) %>%
summarise(share.rate = sum(shared)/49) %>%
arrange(str_rank(OTU, numeric = TRUE))
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by Household and OTU.
## ℹ Output is grouped by Household.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(Household, OTU))` for per-operation grouping
## (`?dplyr::dplyr_by`) instead.
prevalence <- ASV.present %>%
group_by(OTU) %>%
summarise(prevalence = sum(Abundance)/98) %>%
arrange(str_rank(OTU, numeric = TRUE))
df <- prevalence %>%
full_join(shared.rate, by = 'OTU')
ggplot(data = df) +
geom_point(aes(x = prevalence, y = share.rate), alpha = 0.3) +
stat_function(fun = function(x) x * (98*x-1)/97, color = "blue", linewidth = 1) +
coord_fixed() +
labs(x = 'Prevalence of ASV', y = 'Proportion of Household: ASV Present in Both Dogs')
ggsave(here('figures','Figure3.png'), width = 170, units = "mm")
## Saving 170 x 127 mm image
ggsave(here('figures','Figure3.pdf'), width = 170, units = "mm")
## Saving 170 x 127 mm image
For ASVs with prevalence greater than 0.1, 286/322 of them were shared within a household than expected.
shared.ASV <- df %>% rowwise() %>%
mutate(fold.diff=(share.rate/prevalence^2)) %>%
mutate(residual=(share.rate-prevalence^2)) %>%
filter(prevalence > 0.1)
nrow(shared.ASV)
## [1] 322
sum(shared.ASV$residual > 0)
## [1] 286
For more details…
shared.ASV %>% head(20) %>% gt()
| OTU | prevalence | share.rate | fold.diff | residual |
|---|---|---|---|---|
| ASV1 | 1.0000000 | 1.0000000 | 1.0000000 | 0.0000000000 |
| ASV2 | 0.9897959 | 0.9795918 | 0.9998937 | -0.0001041233 |
| ASV3 | 0.9693878 | 0.9591837 | 1.0207202 | 0.0194710537 |
| ASV4 | 1.0000000 | 1.0000000 | 1.0000000 | 0.0000000000 |
| ASV5 | 0.9285714 | 0.9183673 | 1.0650888 | 0.0561224490 |
| ASV6 | 1.0000000 | 1.0000000 | 1.0000000 | 0.0000000000 |
| ASV7 | 0.9897959 | 0.9795918 | 0.9998937 | -0.0001041233 |
| ASV8 | 0.6122449 | 0.4285714 | 1.1433333 | 0.0537276135 |
| ASV9 | 0.9795918 | 0.9591837 | 0.9995660 | -0.0004164931 |
| ASV10 | 0.8775510 | 0.7755102 | 1.0070308 | 0.0054144107 |
| ASV11 | 0.8673469 | 0.7755102 | 1.0308651 | 0.0232194919 |
| ASV12 | 0.3367347 | 0.1632653 | 1.4398531 | 0.0498750521 |
| ASV13 | 0.7755102 | 0.6326531 | 1.0519391 | 0.0312369846 |
| ASV14 | 0.6122449 | 0.4489796 | 1.1977778 | 0.0741357768 |
| ASV15 | 0.7142857 | 0.6122449 | 1.2000000 | 0.1020408163 |
| ASV16 | 0.6122449 | 0.4285714 | 1.1433333 | 0.0537276135 |
| ASV17 | 0.6326531 | 0.3673469 | 0.9177940 | -0.0329029571 |
| ASV18 | 0.8163265 | 0.7346939 | 1.1025000 | 0.0683048730 |
| ASV19 | 0.7448980 | 0.5918367 | 1.0666166 | 0.0369637651 |
| ASV20 | 0.8265306 | 0.7346939 | 1.0754458 | 0.0515410246 |
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.5.1 (2025-06-13)
## os macOS Tahoe 26.3
## system aarch64, darwin20
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/New_York
## date 2026-02-23
## pandoc 3.8.3 @ /opt/homebrew/bin/ (via rmarkdown)
## quarto 1.8.25 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/quarto
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## ade4 1.7-23 2025-02-14 [1] CRAN (R 4.5.0)
## ape 5.8-1 2024-12-16 [1] CRAN (R 4.5.0)
## Biobase 2.70.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## BiocGenerics 0.56.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## biomformat 1.38.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## Biostrings 2.78.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## bslib 0.10.0 2026-01-26 [1] CRAN (R 4.5.2)
## cachem 1.1.0 2024-05-16 [1] CRAN (R 4.5.0)
## class 7.3-23 2025-01-01 [1] CRAN (R 4.5.1)
## cli 3.6.5 2025-04-23 [1] CRAN (R 4.5.0)
## cluster 2.1.8.2 2026-02-05 [1] CRAN (R 4.5.2)
## codetools 0.2-20 2024-03-31 [1] CRAN (R 4.5.1)
## crayon 1.5.3 2024-06-20 [1] CRAN (R 4.5.0)
## data.table 1.18.2.1 2026-01-27 [1] CRAN (R 4.5.2)
## dials 1.4.2 2025-09-04 [1] CRAN (R 4.5.0)
## DiceDesign 1.10 2023-12-07 [1] CRAN (R 4.5.0)
## digest 0.6.39 2025-11-19 [1] CRAN (R 4.5.2)
## dplyr * 1.2.0 2026-02-03 [1] CRAN (R 4.5.2)
## evaluate 1.0.5 2025-08-27 [1] CRAN (R 4.5.0)
## farver 2.1.2 2024-05-13 [1] CRAN (R 4.5.0)
## fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.5.0)
## forcats * 1.0.1 2025-09-25 [1] CRAN (R 4.5.0)
## foreach 1.5.2 2022-02-02 [1] CRAN (R 4.5.0)
## fs 1.6.6 2025-04-12 [1] CRAN (R 4.5.0)
## furrr 0.3.1 2022-08-15 [1] CRAN (R 4.5.0)
## future 1.69.0 2026-01-16 [1] CRAN (R 4.5.2)
## future.apply 1.20.2 2026-02-20 [1] CRAN (R 4.5.2)
## generics 0.1.4 2025-05-09 [1] CRAN (R 4.5.0)
## ggplot2 * 4.0.2 2026-02-03 [1] CRAN (R 4.5.2)
## globals 0.19.0 2026-02-02 [1] CRAN (R 4.5.2)
## glue 1.8.0 2024-09-30 [1] CRAN (R 4.5.0)
## gower 1.0.2 2024-12-17 [1] CRAN (R 4.5.0)
## GPfit 1.0-9 2025-04-12 [1] CRAN (R 4.5.0)
## gt * 1.3.0 2026-01-22 [1] CRAN (R 4.5.2)
## gtable 0.3.6 2024-10-25 [1] CRAN (R 4.5.0)
## hardhat 1.4.2 2025-08-20 [1] CRAN (R 4.5.0)
## here * 1.0.2 2025-09-15 [1] CRAN (R 4.5.0)
## hms 1.1.4 2025-10-17 [1] CRAN (R 4.5.0)
## htmltools 0.5.9 2025-12-04 [1] CRAN (R 4.5.2)
## igraph 2.2.2 2026-02-12 [1] CRAN (R 4.5.2)
## ipred 0.9-15 2024-07-18 [1] CRAN (R 4.5.0)
## IRanges 2.44.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## iterators 1.0.14 2022-02-05 [1] CRAN (R 4.5.0)
## jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.5.0)
## jsonlite 2.0.0 2025-03-27 [1] CRAN (R 4.5.0)
## knitr 1.51 2025-12-20 [1] CRAN (R 4.5.2)
## labeling 0.4.3 2023-08-29 [1] CRAN (R 4.5.0)
## lattice 0.22-9 2026-02-09 [1] CRAN (R 4.5.2)
## lava 1.8.2 2025-10-30 [1] CRAN (R 4.5.0)
## lhs 1.2.0 2024-06-30 [1] CRAN (R 4.5.0)
## lifecycle 1.0.5 2026-01-08 [1] CRAN (R 4.5.2)
## listenv 0.10.0 2025-11-02 [1] CRAN (R 4.5.0)
## lubridate * 1.9.5 2026-02-04 [1] CRAN (R 4.5.2)
## magrittr 2.0.4 2025-09-12 [1] CRAN (R 4.5.0)
## MASS 7.3-65 2025-02-28 [1] CRAN (R 4.5.1)
## Matrix 1.7-4 2025-08-28 [1] CRAN (R 4.5.0)
## mgcv 1.9-4 2025-11-07 [1] CRAN (R 4.5.0)
## multtest 2.66.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## nlme 3.1-168 2025-03-31 [1] CRAN (R 4.5.1)
## nnet 7.3-20 2025-01-01 [1] CRAN (R 4.5.1)
## otel 0.2.0 2025-08-29 [1] CRAN (R 4.5.0)
## parallelly 1.46.1 2026-01-08 [1] CRAN (R 4.5.2)
## parsnip 1.4.1 2026-01-11 [1] CRAN (R 4.5.2)
## permute 0.9-10 2026-02-06 [1] CRAN (R 4.5.2)
## phyloseq * 1.54.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## pillar 1.11.1 2025-09-17 [1] CRAN (R 4.5.0)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.5.0)
## plyr 1.8.9 2023-10-02 [1] CRAN (R 4.5.0)
## prodlim 2025.04.28 2025-04-28 [1] CRAN (R 4.5.0)
## purrr * 1.2.1 2026-01-09 [1] CRAN (R 4.5.2)
## R6 2.6.1 2025-02-15 [1] CRAN (R 4.5.0)
## ragg 1.5.0 2025-09-02 [1] CRAN (R 4.5.0)
## RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.5.0)
## Rcpp 1.1.1 2026-01-10 [1] CRAN (R 4.5.2)
## readr * 2.2.0 2026-02-19 [1] CRAN (R 4.5.2)
## recipes 1.3.1 2025-05-21 [1] CRAN (R 4.5.0)
## reshape2 1.4.5 2025-11-12 [1] CRAN (R 4.5.0)
## rhdf5 2.54.1 2025-12-02 [1] https://bioc-release.r-universe.dev (R 4.5.2)
## rhdf5filters 1.22.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## Rhdf5lib 1.32.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## rlang 1.1.7 2026-01-09 [1] CRAN (R 4.5.2)
## rmarkdown 2.30 2025-09-28 [1] CRAN (R 4.5.0)
## rpart 4.1.24 2025-01-07 [1] CRAN (R 4.5.1)
## rprojroot 2.1.1 2025-08-26 [1] CRAN (R 4.5.0)
## rsample 1.3.2 2026-01-30 [1] CRAN (R 4.5.2)
## rstudioapi 0.18.0 2026-01-16 [1] CRAN (R 4.5.2)
## S4Vectors 0.48.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## S7 0.2.1 2025-11-14 [1] CRAN (R 4.5.2)
## sass 0.4.10 2025-04-11 [1] CRAN (R 4.5.0)
## scales 1.4.0 2025-04-24 [1] CRAN (R 4.5.0)
## Seqinfo 1.0.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## sessioninfo 1.2.3 2025-02-05 [1] CRAN (R 4.5.0)
## stringi 1.8.7 2025-03-27 [1] CRAN (R 4.5.0)
## stringr * 1.6.0 2025-11-04 [1] CRAN (R 4.5.0)
## survival 3.8-6 2026-01-16 [1] CRAN (R 4.5.2)
## systemfonts 1.3.1 2025-10-01 [1] CRAN (R 4.5.0)
## textshaping 1.0.4 2025-10-10 [1] CRAN (R 4.5.0)
## tibble * 3.3.1 2026-01-11 [1] CRAN (R 4.5.2)
## tidyr * 1.3.2 2025-12-19 [1] CRAN (R 4.5.2)
## tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.5.0)
## tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.5.0)
## timechange 0.4.0 2026-01-29 [1] CRAN (R 4.5.2)
## timeDate 4052.112 2026-01-28 [1] CRAN (R 4.5.2)
## tune 2.0.1 2025-10-17 [1] CRAN (R 4.5.0)
## tzdb 0.5.0 2025-03-15 [1] CRAN (R 4.5.0)
## vctrs 0.7.1 2026-01-23 [1] CRAN (R 4.5.2)
## vegan 2.7-2 2025-10-08 [1] CRAN (R 4.5.0)
## withr 3.0.2 2024-10-28 [1] CRAN (R 4.5.0)
## workflows 1.3.0 2025-08-27 [1] CRAN (R 4.5.0)
## xfun 0.56 2026-01-18 [1] CRAN (R 4.5.2)
## xml2 1.5.2 2026-01-17 [1] CRAN (R 4.5.2)
## XVector 0.50.0 2025-10-29 [1] Bioconductor 3.22 (R 4.5.1)
## yaml 2.3.12 2025-12-10 [1] CRAN (R 4.5.2)
## yardstick 1.3.2 2025-01-22 [1] CRAN (R 4.5.0)
##
## [1] /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library
## * ── Packages attached to the search path.
##
## ──────────────────────────────────────────────────────────────────────────────