rm(list = ls())
set.seed(2024)
library(here)
library(Biostrings)
library(ShortRead)
library(patchwork)
library(dada2)
path <- "~/project_data/Epilepsy_data/221024_UNC2X_KKYJ5-KKYGJ-NETTI-in/221024_UNC2X_KKYJ5-KKYGJ-NETTI"
fnF <- list.files(path, pattern="_R1_", full.names = TRUE)
fnR <- list.files(path, pattern="_R2_", full.names = TRUE)
head(list.files(path))
## [1] "Netti001_ATCTCAGG-CTCTCTAT_L001_R1_001.fastq.gz"
## [2] "Netti001_ATCTCAGG-CTCTCTAT_L001_R2_001.fastq.gz"
## [3] "Netti002_TAAGGCGA-CTAAGCCT_L001_R1_001.fastq.gz"
## [4] "Netti002_TAAGGCGA-CTAAGCCT_L001_R2_001.fastq.gz"
## [5] "Netti003_GTAGAGGA-GTAAGGAG_L001_R1_001.fastq.gz"
## [6] "Netti003_GTAGAGGA-GTAAGGAG_L001_R2_001.fastq.gz"
Check if sample names are matched for forward and reverse
sam.F <- sapply(strsplit(basename(fnF), "_"), `[`, 1)
sam.R <- sapply(strsplit(basename(fnR), "_"), `[`, 1)
if(!identical(sam.F, sam.R)) stop("F/R fastqs mismatch")
Here, we pick three samples for forward and reverse read and check their quality profiles.
plotQualityProfile(fnF[c(1,10,100)]) / plotQualityProfile(fnR[c(1,10,100)])
randomly pick sample 13 and 53 to check their complexity
plotComplexity(fnF[[13]]) + plotComplexity(fnR[[53]])
check primers
filtered.dir <- file.path(path, "filtered")
filtF <- file.path(filtered.dir, basename(fnF))
filtR <- file.path(filtered.dir, basename(fnR))
FWD <- "GTGYCAGCMGCCGCGGTAA"
REV <- "GGACTACNVGGGTWTCTAAT"
nchar(c(FWD, REV))
## [1] 19 20
primerHits <- function(primer, fn) {
# Counts number of reads in which the primer is found
nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
return(sum(nhits > 0))
}
primerHits(FWD, fnF[[1]])
## [1] 177911
primerHits(REV, fnR[[1]])
## [1] 176631
primer count matched. looks ok.
trim primers and filter out low quality reads
out <- filterAndTrim(fnF, filtF, fnR, filtR,
trimLeft=c(19, 20), truncLen=c(200,150),
maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
compress=TRUE, multithread=TRUE) # On Windows set multithread=FALSE
## Creating output directory: /Users/yixuanyang/project_data/Epilepsy_data/221024_UNC2X_KKYJ5-KKYGJ-NETTI-in/221024_UNC2X_KKYJ5-KKYGJ-NETTI/filtered
head(out)
## reads.in reads.out
## Netti001_ATCTCAGG-CTCTCTAT_L001_R1_001.fastq.gz 188595 160517
## Netti002_TAAGGCGA-CTAAGCCT_L001_R1_001.fastq.gz 182265 152272
## Netti003_GTAGAGGA-GTAAGGAG_L001_R1_001.fastq.gz 192837 171477
## Netti004_CGTACTAG-CTAAGCCT_L001_R1_001.fastq.gz 180622 146564
## Netti005_TAGGCATG-CTCTCTAT_L001_R1_001.fastq.gz 176941 151368
## Netti006_AAGAGGCA-TATCCTCT_L001_R1_001.fastq.gz 162155 137982
85% of reads retained. Perhaps 3 failed libraries with very few reads. All others look fine.
summary(out[,2]/out[,1])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.4168 0.8460 0.8587 0.8466 0.8718 0.9108
summary(out[,2])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 466 128634 145536 141288 158433 261030
plot(sort(out[,2]))
check complexity again
plotComplexity(filtF[[13]]) + plotComplexity(filtR[[53]])
There’s still a lower complexity bump. Could be real.
Show error model plot
errF <- learnErrors(filtF, multi=TRUE)
## 114180230 total bases in 630830 reads from 4 samples will be used for learning the error rates.
errR <- learnErrors(filtR, multi=TRUE)
## 101685740 total bases in 782198 reads from 5 samples will be used for learning the error rates.
plotErrors(errF)
plotErrors(errR)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
apply DADA2 algorithm
ddF <- dada(filtF, err=errF, pool="pseudo", multithread=TRUE)
## Sample 1 - 160517 reads in 31260 unique sequences.
## Sample 2 - 152272 reads in 46114 unique sequences.
## Sample 3 - 171477 reads in 39668 unique sequences.
## Sample 4 - 146564 reads in 46732 unique sequences.
## Sample 5 - 151368 reads in 35560 unique sequences.
## Sample 6 - 137982 reads in 31297 unique sequences.
## Sample 7 - 146555 reads in 35770 unique sequences.
## Sample 8 - 134114 reads in 31330 unique sequences.
## Sample 9 - 138946 reads in 34546 unique sequences.
## Sample 10 - 141456 reads in 30753 unique sequences.
## Sample 11 - 122053 reads in 33344 unique sequences.
## Sample 12 - 143824 reads in 34749 unique sequences.
## Sample 13 - 159719 reads in 37708 unique sequences.
## Sample 14 - 101606 reads in 30497 unique sequences.
## Sample 15 - 202948 reads in 44498 unique sequences.
## Sample 16 - 121731 reads in 30914 unique sequences.
## Sample 17 - 162037 reads in 38808 unique sequences.
## Sample 18 - 142591 reads in 34307 unique sequences.
## Sample 19 - 152242 reads in 32932 unique sequences.
## Sample 20 - 196130 reads in 37070 unique sequences.
## Sample 21 - 157688 reads in 29963 unique sequences.
## Sample 22 - 103649 reads in 23467 unique sequences.
## Sample 23 - 142385 reads in 34923 unique sequences.
## Sample 24 - 136065 reads in 29059 unique sequences.
## Sample 25 - 146219 reads in 31806 unique sequences.
## Sample 26 - 172943 reads in 41287 unique sequences.
## Sample 27 - 115870 reads in 26741 unique sequences.
## Sample 28 - 51059 reads in 12127 unique sequences.
## Sample 29 - 145620 reads in 34774 unique sequences.
## Sample 30 - 138901 reads in 36322 unique sequences.
## Sample 31 - 146159 reads in 35942 unique sequences.
## Sample 32 - 71855 reads in 16828 unique sequences.
## Sample 33 - 143088 reads in 32308 unique sequences.
## Sample 34 - 156085 reads in 35940 unique sequences.
## Sample 35 - 158442 reads in 37991 unique sequences.
## Sample 36 - 92024 reads in 23703 unique sequences.
## Sample 37 - 151706 reads in 29681 unique sequences.
## Sample 38 - 173890 reads in 33420 unique sequences.
## Sample 39 - 157611 reads in 41233 unique sequences.
## Sample 40 - 571 reads in 470 unique sequences.
## Sample 41 - 129160 reads in 26116 unique sequences.
## Sample 42 - 171107 reads in 39203 unique sequences.
## Sample 43 - 152853 reads in 39096 unique sequences.
## Sample 44 - 159040 reads in 40320 unique sequences.
## Sample 45 - 140541 reads in 46005 unique sequences.
## Sample 46 - 128589 reads in 29893 unique sequences.
## Sample 47 - 146049 reads in 35864 unique sequences.
## Sample 48 - 57347 reads in 19197 unique sequences.
## Sample 49 - 136815 reads in 29615 unique sequences.
## Sample 50 - 150986 reads in 33622 unique sequences.
## Sample 51 - 166403 reads in 39630 unique sequences.
## Sample 52 - 184236 reads in 42046 unique sequences.
## Sample 53 - 119639 reads in 33480 unique sequences.
## Sample 54 - 150089 reads in 45210 unique sequences.
## Sample 55 - 156408 reads in 40076 unique sequences.
## Sample 56 - 142458 reads in 34014 unique sequences.
## Sample 57 - 116714 reads in 27317 unique sequences.
## Sample 58 - 115304 reads in 27033 unique sequences.
## Sample 59 - 139943 reads in 32913 unique sequences.
## Sample 60 - 150293 reads in 30908 unique sequences.
## Sample 61 - 147009 reads in 31351 unique sequences.
## Sample 62 - 152145 reads in 30082 unique sequences.
## Sample 63 - 160789 reads in 37335 unique sequences.
## Sample 64 - 175807 reads in 40986 unique sequences.
## Sample 65 - 163476 reads in 32909 unique sequences.
## Sample 66 - 119938 reads in 27980 unique sequences.
## Sample 67 - 122536 reads in 34635 unique sequences.
## Sample 68 - 209613 reads in 49356 unique sequences.
## Sample 69 - 128432 reads in 28966 unique sequences.
## Sample 70 - 137275 reads in 33410 unique sequences.
## Sample 71 - 99654 reads in 22395 unique sequences.
## Sample 72 - 147962 reads in 40269 unique sequences.
## Sample 73 - 132528 reads in 37519 unique sequences.
## Sample 74 - 126124 reads in 37476 unique sequences.
## Sample 75 - 150654 reads in 31913 unique sequences.
## Sample 76 - 138679 reads in 29849 unique sequences.
## Sample 77 - 127377 reads in 28736 unique sequences.
## Sample 78 - 94212 reads in 23181 unique sequences.
## Sample 79 - 115965 reads in 29360 unique sequences.
## Sample 80 - 1005 reads in 639 unique sequences.
## Sample 81 - 217468 reads in 48340 unique sequences.
## Sample 82 - 125894 reads in 31073 unique sequences.
## Sample 83 - 156317 reads in 36320 unique sequences.
## Sample 84 - 132871 reads in 28841 unique sequences.
## Sample 85 - 150966 reads in 32652 unique sequences.
## Sample 86 - 101651 reads in 25473 unique sequences.
## Sample 87 - 180802 reads in 37187 unique sequences.
## Sample 88 - 141555 reads in 40020 unique sequences.
## Sample 89 - 81715 reads in 19794 unique sequences.
## Sample 90 - 141486 reads in 31816 unique sequences.
## Sample 91 - 175431 reads in 38626 unique sequences.
## Sample 92 - 158406 reads in 39773 unique sequences.
## Sample 93 - 147421 reads in 41114 unique sequences.
## Sample 94 - 207730 reads in 49657 unique sequences.
## Sample 95 - 128770 reads in 27301 unique sequences.
## Sample 96 - 210026 reads in 55148 unique sequences.
## Sample 97 - 146581 reads in 37962 unique sequences.
## Sample 98 - 75865 reads in 21534 unique sequences.
## Sample 99 - 115386 reads in 28053 unique sequences.
## Sample 100 - 136694 reads in 28991 unique sequences.
## Sample 101 - 157799 reads in 36976 unique sequences.
## Sample 102 - 141006 reads in 35581 unique sequences.
## Sample 103 - 174200 reads in 45919 unique sequences.
## Sample 104 - 173405 reads in 40589 unique sequences.
## Sample 105 - 200648 reads in 51145 unique sequences.
## Sample 106 - 153790 reads in 42830 unique sequences.
## Sample 107 - 136848 reads in 38088 unique sequences.
## Sample 108 - 466 reads in 376 unique sequences.
## Sample 109 - 160466 reads in 35099 unique sequences.
## Sample 110 - 130629 reads in 37296 unique sequences.
## Sample 111 - 137106 reads in 37274 unique sequences.
## Sample 112 - 261030 reads in 67638 unique sequences.
## Sample 113 - 125079 reads in 33447 unique sequences.
## Sample 114 - 165022 reads in 38718 unique sequences.
## Sample 115 - 140845 reads in 34323 unique sequences.
## Sample 116 - 145451 reads in 40405 unique sequences.
## Sample 117 - 159561 reads in 45857 unique sequences.
## Sample 118 - 162431 reads in 38320 unique sequences.
##
## selfConsist step 2
ddR <- dada(filtR, err=errR, pool="pseudo", multithread=TRUE)
## Sample 1 - 160517 reads in 19263 unique sequences.
## Sample 2 - 152272 reads in 26165 unique sequences.
## Sample 3 - 171477 reads in 28924 unique sequences.
## Sample 4 - 146564 reads in 29456 unique sequences.
## Sample 5 - 151368 reads in 21415 unique sequences.
## Sample 6 - 137982 reads in 21010 unique sequences.
## Sample 7 - 146555 reads in 22598 unique sequences.
## Sample 8 - 134114 reads in 23761 unique sequences.
## Sample 9 - 138946 reads in 31048 unique sequences.
## Sample 10 - 141456 reads in 20423 unique sequences.
## Sample 11 - 122053 reads in 14535 unique sequences.
## Sample 12 - 143824 reads in 20324 unique sequences.
## Sample 13 - 159719 reads in 23343 unique sequences.
## Sample 14 - 101606 reads in 19178 unique sequences.
## Sample 15 - 202948 reads in 29291 unique sequences.
## Sample 16 - 121731 reads in 20359 unique sequences.
## Sample 17 - 162037 reads in 23544 unique sequences.
## Sample 18 - 142591 reads in 20797 unique sequences.
## Sample 19 - 152242 reads in 26682 unique sequences.
## Sample 20 - 196130 reads in 28600 unique sequences.
## Sample 21 - 157688 reads in 22221 unique sequences.
## Sample 22 - 103649 reads in 13949 unique sequences.
## Sample 23 - 142385 reads in 41145 unique sequences.
## Sample 24 - 136065 reads in 21557 unique sequences.
## Sample 25 - 146219 reads in 40739 unique sequences.
## Sample 26 - 172943 reads in 28588 unique sequences.
## Sample 27 - 115870 reads in 18408 unique sequences.
## Sample 28 - 51059 reads in 6727 unique sequences.
## Sample 29 - 145620 reads in 22852 unique sequences.
## Sample 30 - 138901 reads in 25835 unique sequences.
## Sample 31 - 146159 reads in 20173 unique sequences.
## Sample 32 - 71855 reads in 14181 unique sequences.
## Sample 33 - 143088 reads in 35665 unique sequences.
## Sample 34 - 156085 reads in 22158 unique sequences.
## Sample 35 - 158442 reads in 22055 unique sequences.
## Sample 36 - 92024 reads in 16750 unique sequences.
## Sample 37 - 151706 reads in 19654 unique sequences.
## Sample 38 - 173890 reads in 23995 unique sequences.
## Sample 39 - 157611 reads in 31431 unique sequences.
## Sample 40 - 571 reads in 473 unique sequences.
## Sample 41 - 129160 reads in 20013 unique sequences.
## Sample 42 - 171107 reads in 22659 unique sequences.
## Sample 43 - 152853 reads in 22194 unique sequences.
## Sample 44 - 159040 reads in 34885 unique sequences.
## Sample 45 - 140541 reads in 28283 unique sequences.
## Sample 46 - 128589 reads in 26891 unique sequences.
## Sample 47 - 146049 reads in 21973 unique sequences.
## Sample 48 - 57347 reads in 29435 unique sequences.
## Sample 49 - 136815 reads in 18621 unique sequences.
## Sample 50 - 150986 reads in 21979 unique sequences.
## Sample 51 - 166403 reads in 29829 unique sequences.
## Sample 52 - 184236 reads in 26156 unique sequences.
## Sample 53 - 119639 reads in 17532 unique sequences.
## Sample 54 - 150089 reads in 32873 unique sequences.
## Sample 55 - 156408 reads in 27214 unique sequences.
## Sample 56 - 142458 reads in 29650 unique sequences.
## Sample 57 - 116714 reads in 29165 unique sequences.
## Sample 58 - 115304 reads in 20912 unique sequences.
## Sample 59 - 139943 reads in 29707 unique sequences.
## Sample 60 - 150293 reads in 29782 unique sequences.
## Sample 61 - 147009 reads in 21554 unique sequences.
## Sample 62 - 152145 reads in 22916 unique sequences.
## Sample 63 - 160789 reads in 23864 unique sequences.
## Sample 64 - 175807 reads in 23416 unique sequences.
## Sample 65 - 163476 reads in 21736 unique sequences.
## Sample 66 - 119938 reads in 32027 unique sequences.
## Sample 67 - 122536 reads in 35009 unique sequences.
## Sample 68 - 209613 reads in 35043 unique sequences.
## Sample 69 - 128432 reads in 30901 unique sequences.
## Sample 70 - 137275 reads in 24467 unique sequences.
## Sample 71 - 99654 reads in 14051 unique sequences.
## Sample 72 - 147962 reads in 28909 unique sequences.
## Sample 73 - 132528 reads in 21865 unique sequences.
## Sample 74 - 126124 reads in 19244 unique sequences.
## Sample 75 - 150654 reads in 23106 unique sequences.
## Sample 76 - 138679 reads in 18327 unique sequences.
## Sample 77 - 127377 reads in 16605 unique sequences.
## Sample 78 - 94212 reads in 20129 unique sequences.
## Sample 79 - 115965 reads in 16636 unique sequences.
## Sample 80 - 1005 reads in 607 unique sequences.
## Sample 81 - 217468 reads in 32660 unique sequences.
## Sample 82 - 125894 reads in 25805 unique sequences.
## Sample 83 - 156317 reads in 25063 unique sequences.
## Sample 84 - 132871 reads in 17857 unique sequences.
## Sample 85 - 150966 reads in 24263 unique sequences.
## Sample 86 - 101651 reads in 21078 unique sequences.
## Sample 87 - 180802 reads in 23386 unique sequences.
## Sample 88 - 141555 reads in 25189 unique sequences.
## Sample 89 - 81715 reads in 23326 unique sequences.
## Sample 90 - 141486 reads in 22026 unique sequences.
## Sample 91 - 175431 reads in 28253 unique sequences.
## Sample 92 - 158406 reads in 40271 unique sequences.
## Sample 93 - 147421 reads in 33728 unique sequences.
## Sample 94 - 207730 reads in 33472 unique sequences.
## Sample 95 - 128770 reads in 27047 unique sequences.
## Sample 96 - 210026 reads in 33807 unique sequences.
## Sample 97 - 146581 reads in 26009 unique sequences.
## Sample 98 - 75865 reads in 18343 unique sequences.
## Sample 99 - 115386 reads in 23362 unique sequences.
## Sample 100 - 136694 reads in 17571 unique sequences.
## Sample 101 - 157799 reads in 25343 unique sequences.
## Sample 102 - 141006 reads in 22531 unique sequences.
## Sample 103 - 174200 reads in 27035 unique sequences.
## Sample 104 - 173405 reads in 25241 unique sequences.
## Sample 105 - 200648 reads in 36020 unique sequences.
## Sample 106 - 153790 reads in 29157 unique sequences.
## Sample 107 - 136848 reads in 23550 unique sequences.
## Sample 108 - 466 reads in 363 unique sequences.
## Sample 109 - 160466 reads in 35457 unique sequences.
## Sample 110 - 130629 reads in 23804 unique sequences.
## Sample 111 - 137106 reads in 18978 unique sequences.
## Sample 112 - 261030 reads in 68976 unique sequences.
## Sample 113 - 125079 reads in 29484 unique sequences.
## Sample 114 - 165022 reads in 24574 unique sequences.
## Sample 115 - 140845 reads in 22116 unique sequences.
## Sample 116 - 145451 reads in 43777 unique sequences.
## Sample 117 - 159561 reads in 24432 unique sequences.
## Sample 118 - 162431 reads in 27545 unique sequences.
##
## selfConsist step 2
mm <- mergePairs(ddF, filtF, ddR, filtR, verbose=TRUE)
## 155218 paired-reads (in 765 unique pairings) successfully merged out of 160093 (in 1953 pairings) input.
## 142513 paired-reads (in 1515 unique pairings) successfully merged out of 151349 (in 4670 pairings) input.
## 162087 paired-reads (in 1591 unique pairings) successfully merged out of 170592 (in 4320 pairings) input.
## 138493 paired-reads (in 1202 unique pairings) successfully merged out of 144852 (in 3724 pairings) input.
## 142949 paired-reads (in 1902 unique pairings) successfully merged out of 150599 (in 4591 pairings) input.
## 130850 paired-reads (in 1499 unique pairings) successfully merged out of 137446 (in 3788 pairings) input.
## 139607 paired-reads (in 1501 unique pairings) successfully merged out of 145887 (in 3820 pairings) input.
## 128668 paired-reads (in 1260 unique pairings) successfully merged out of 133493 (in 3117 pairings) input.
## 126532 paired-reads (in 962 unique pairings) successfully merged out of 138128 (in 3492 pairings) input.
## 136333 paired-reads (in 962 unique pairings) successfully merged out of 140884 (in 2245 pairings) input.
## 117934 paired-reads (in 846 unique pairings) successfully merged out of 121612 (in 1968 pairings) input.
## 138076 paired-reads (in 847 unique pairings) successfully merged out of 143217 (in 2087 pairings) input.
## 149403 paired-reads (in 1280 unique pairings) successfully merged out of 158792 (in 3385 pairings) input.
## 96757 paired-reads (in 1024 unique pairings) successfully merged out of 101189 (in 2534 pairings) input.
## 188923 paired-reads (in 1541 unique pairings) successfully merged out of 202070 (in 4115 pairings) input.
## 115432 paired-reads (in 1584 unique pairings) successfully merged out of 121204 (in 3762 pairings) input.
## 155097 paired-reads (in 1074 unique pairings) successfully merged out of 161436 (in 2756 pairings) input.
## 135747 paired-reads (in 1158 unique pairings) successfully merged out of 142041 (in 2902 pairings) input.
## 145323 paired-reads (in 1164 unique pairings) successfully merged out of 151702 (in 2981 pairings) input.
## 190133 paired-reads (in 1111 unique pairings) successfully merged out of 195575 (in 2594 pairings) input.
## 153000 paired-reads (in 1155 unique pairings) successfully merged out of 157290 (in 2699 pairings) input.
## 98881 paired-reads (in 1143 unique pairings) successfully merged out of 103184 (in 2590 pairings) input.
## 132875 paired-reads (in 1474 unique pairings) successfully merged out of 140627 (in 4411 pairings) input.
## 129832 paired-reads (in 1183 unique pairings) successfully merged out of 135699 (in 3034 pairings) input.
## 141174 paired-reads (in 880 unique pairings) successfully merged out of 145098 (in 2426 pairings) input.
## 166966 paired-reads (in 1004 unique pairings) successfully merged out of 172302 (in 2754 pairings) input.
## 110359 paired-reads (in 1092 unique pairings) successfully merged out of 115311 (in 2851 pairings) input.
## 49925 paired-reads (in 299 unique pairings) successfully merged out of 50879 (in 717 pairings) input.
## 139038 paired-reads (in 1403 unique pairings) successfully merged out of 145012 (in 3746 pairings) input.
## 132350 paired-reads (in 1736 unique pairings) successfully merged out of 138192 (in 4512 pairings) input.
## 138583 paired-reads (in 1360 unique pairings) successfully merged out of 145367 (in 3503 pairings) input.
## 68878 paired-reads (in 979 unique pairings) successfully merged out of 70979 (in 1985 pairings) input.
## 136764 paired-reads (in 1376 unique pairings) successfully merged out of 142102 (in 3501 pairings) input.
## 149323 paired-reads (in 1436 unique pairings) successfully merged out of 155392 (in 3710 pairings) input.
## 154391 paired-reads (in 602 unique pairings) successfully merged out of 157807 (in 1629 pairings) input.
## 88346 paired-reads (in 913 unique pairings) successfully merged out of 91494 (in 2157 pairings) input.
## 146987 paired-reads (in 1194 unique pairings) successfully merged out of 151366 (in 2676 pairings) input.
## 168600 paired-reads (in 1212 unique pairings) successfully merged out of 173531 (in 2684 pairings) input.
## 148325 paired-reads (in 1338 unique pairings) successfully merged out of 156345 (in 4108 pairings) input.
## 340 paired-reads (in 34 unique pairings) successfully merged out of 506 (in 142 pairings) input.
## 124963 paired-reads (in 1021 unique pairings) successfully merged out of 128760 (in 2448 pairings) input.
## 163090 paired-reads (in 1415 unique pairings) successfully merged out of 170168 (in 3648 pairings) input.
## 145060 paired-reads (in 1554 unique pairings) successfully merged out of 152215 (in 4144 pairings) input.
## 148674 paired-reads (in 1658 unique pairings) successfully merged out of 158280 (in 4668 pairings) input.
## 131786 paired-reads (in 1502 unique pairings) successfully merged out of 139496 (in 4554 pairings) input.
## 122940 paired-reads (in 1296 unique pairings) successfully merged out of 127944 (in 3207 pairings) input.
## 138181 paired-reads (in 1531 unique pairings) successfully merged out of 144950 (in 3996 pairings) input.
## 50892 paired-reads (in 727 unique pairings) successfully merged out of 57086 (in 2343 pairings) input.
## 131199 paired-reads (in 918 unique pairings) successfully merged out of 136322 (in 2391 pairings) input.
## 145723 paired-reads (in 1244 unique pairings) successfully merged out of 150230 (in 3109 pairings) input.
## 154851 paired-reads (in 1860 unique pairings) successfully merged out of 165479 (in 5047 pairings) input.
## 174570 paired-reads (in 1603 unique pairings) successfully merged out of 183319 (in 4333 pairings) input.
## 115046 paired-reads (in 969 unique pairings) successfully merged out of 119053 (in 2561 pairings) input.
## 140645 paired-reads (in 1782 unique pairings) successfully merged out of 148967 (in 5302 pairings) input.
## 149077 paired-reads (in 1287 unique pairings) successfully merged out of 155379 (in 3671 pairings) input.
## 135158 paired-reads (in 1848 unique pairings) successfully merged out of 141751 (in 4514 pairings) input.
## 111130 paired-reads (in 906 unique pairings) successfully merged out of 116052 (in 2536 pairings) input.
## 112013 paired-reads (in 739 unique pairings) successfully merged out of 114831 (in 1794 pairings) input.
## 132738 paired-reads (in 1461 unique pairings) successfully merged out of 139214 (in 3623 pairings) input.
## 143461 paired-reads (in 1352 unique pairings) successfully merged out of 149590 (in 3347 pairings) input.
## 140540 paired-reads (in 1147 unique pairings) successfully merged out of 146556 (in 2662 pairings) input.
## 144694 paired-reads (in 1056 unique pairings) successfully merged out of 151777 (in 2478 pairings) input.
## 151159 paired-reads (in 1703 unique pairings) successfully merged out of 159894 (in 4370 pairings) input.
## 166894 paired-reads (in 1702 unique pairings) successfully merged out of 174971 (in 4141 pairings) input.
## 156946 paired-reads (in 1560 unique pairings) successfully merged out of 162732 (in 3634 pairings) input.
## 112248 paired-reads (in 1179 unique pairings) successfully merged out of 119205 (in 3508 pairings) input.
## 112592 paired-reads (in 1086 unique pairings) successfully merged out of 121249 (in 3868 pairings) input.
## 195989 paired-reads (in 2110 unique pairings) successfully merged out of 208141 (in 6107 pairings) input.
## 122055 paired-reads (in 1209 unique pairings) successfully merged out of 127990 (in 3202 pairings) input.
## 131986 paired-reads (in 507 unique pairings) successfully merged out of 136730 (in 1711 pairings) input.
## 97350 paired-reads (in 630 unique pairings) successfully merged out of 99268 (in 1398 pairings) input.
## 139030 paired-reads (in 1366 unique pairings) successfully merged out of 146694 (in 4117 pairings) input.
## 126463 paired-reads (in 1295 unique pairings) successfully merged out of 131795 (in 3264 pairings) input.
## 118372 paired-reads (in 1429 unique pairings) successfully merged out of 125293 (in 3816 pairings) input.
## 142510 paired-reads (in 1357 unique pairings) successfully merged out of 149908 (in 3388 pairings) input.
## 132929 paired-reads (in 1172 unique pairings) successfully merged out of 138209 (in 2762 pairings) input.
## 121923 paired-reads (in 873 unique pairings) successfully merged out of 126671 (in 2241 pairings) input.
## 88961 paired-reads (in 897 unique pairings) successfully merged out of 93331 (in 2466 pairings) input.
## 110527 paired-reads (in 1434 unique pairings) successfully merged out of 115502 (in 3347 pairings) input.
## 733 paired-reads (in 53 unique pairings) successfully merged out of 952 (in 194 pairings) input.
## 208270 paired-reads (in 1778 unique pairings) successfully merged out of 216714 (in 4384 pairings) input.
## 119865 paired-reads (in 1498 unique pairings) successfully merged out of 125120 (in 3567 pairings) input.
## 146778 paired-reads (in 1663 unique pairings) successfully merged out of 155617 (in 4204 pairings) input.
## 128218 paired-reads (in 1256 unique pairings) successfully merged out of 132297 (in 2724 pairings) input.
## 143586 paired-reads (in 1320 unique pairings) successfully merged out of 150230 (in 3469 pairings) input.
## 96246 paired-reads (in 1217 unique pairings) successfully merged out of 101046 (in 3047 pairings) input.
## 173818 paired-reads (in 1104 unique pairings) successfully merged out of 180133 (in 3014 pairings) input.
## 132699 paired-reads (in 1489 unique pairings) successfully merged out of 140201 (in 4507 pairings) input.
## 77050 paired-reads (in 1293 unique pairings) successfully merged out of 81291 (in 3003 pairings) input.
## 135122 paired-reads (in 1626 unique pairings) successfully merged out of 140877 (in 3874 pairings) input.
## 165670 paired-reads (in 1879 unique pairings) successfully merged out of 174384 (in 4875 pairings) input.
## 149367 paired-reads (in 1204 unique pairings) successfully merged out of 157298 (in 3872 pairings) input.
## 138068 paired-reads (in 1115 unique pairings) successfully merged out of 146432 (in 3826 pairings) input.
## 197585 paired-reads (in 1683 unique pairings) successfully merged out of 206697 (in 4927 pairings) input.
## 123071 paired-reads (in 1442 unique pairings) successfully merged out of 128250 (in 3536 pairings) input.
## 201519 paired-reads (in 1690 unique pairings) successfully merged out of 208720 (in 4691 pairings) input.
## 137521 paired-reads (in 1624 unique pairings) successfully merged out of 145568 (in 4401 pairings) input.
## 70600 paired-reads (in 1270 unique pairings) successfully merged out of 74814 (in 3097 pairings) input.
## 109366 paired-reads (in 1220 unique pairings) successfully merged out of 114726 (in 3343 pairings) input.
## 131259 paired-reads (in 1365 unique pairings) successfully merged out of 136119 (in 3098 pairings) input.
## 148897 paired-reads (in 1953 unique pairings) successfully merged out of 156890 (in 5008 pairings) input.
## 133531 paired-reads (in 1444 unique pairings) successfully merged out of 139984 (in 3774 pairings) input.
## 165769 paired-reads (in 1880 unique pairings) successfully merged out of 173154 (in 4918 pairings) input.
## 165057 paired-reads (in 1850 unique pairings) successfully merged out of 172443 (in 4563 pairings) input.
## 189323 paired-reads (in 2158 unique pairings) successfully merged out of 199439 (in 5841 pairings) input.
## 145899 paired-reads (in 1563 unique pairings) successfully merged out of 152576 (in 4299 pairings) input.
## 129144 paired-reads (in 1704 unique pairings) successfully merged out of 135885 (in 4377 pairings) input.
## 257 paired-reads (in 34 unique pairings) successfully merged out of 412 (in 122 pairings) input.
## 151374 paired-reads (in 1483 unique pairings) successfully merged out of 159502 (in 4433 pairings) input.
## 123368 paired-reads (in 1549 unique pairings) successfully merged out of 129676 (in 4455 pairings) input.
## 132664 paired-reads (in 976 unique pairings) successfully merged out of 136442 (in 2649 pairings) input.
## 238812 paired-reads (in 2897 unique pairings) successfully merged out of 258279 (in 10716 pairings) input.
## 114336 paired-reads (in 1520 unique pairings) successfully merged out of 124113 (in 4633 pairings) input.
## 155773 paired-reads (in 1679 unique pairings) successfully merged out of 164005 (in 4288 pairings) input.
## 130685 paired-reads (in 1606 unique pairings) successfully merged out of 140033 (in 4206 pairings) input.
## 134059 paired-reads (in 1606 unique pairings) successfully merged out of 143880 (in 5518 pairings) input.
## 148776 paired-reads (in 1722 unique pairings) successfully merged out of 158071 (in 4980 pairings) input.
## 152186 paired-reads (in 1650 unique pairings) successfully merged out of 161368 (in 4421 pairings) input.
sta <- makeSequenceTable(mm)
st <- removeBimeraDenovo(sta, method="consensus", multithread=TRUE)
percentage of chimeras can be found by
1 - sum(st)/sum(sta)
## [1] 0.11825
around 90% ASVs were reduced
1 - ncol(st)/ncol(sta)
## [1] 0.9004324
sq <- getSequences(st)
table(nchar(sq))
##
## 181 184 185 186 188 190 191 192 193 194 195 196 198 199 200 201
## 95 3 33 4 4 5 1 2 3 4 4 2 4 2 5 3
## 202 203 204 205 206 207 209 211 213 214 215 216 217 218 219 220
## 6 4 4 2 1 3 4 3 3 2 8 2 3 2 1 10
## 221 222 223 226 227 228 229 230 232 233 234 236 238 239 240 241
## 2 1 2 3 2 3 1 1 1 2 5 1 1 2 1 1
## 243 244 248 249 250 251 252 253 254 255 256 258 260 261 262 264
## 1 3 1 2 4 9 812 1960 145 38 6 1 4 1 5 2
## 265 266 268 269 270 271 272 275 276 277 278 279 280 281 282 283
## 7 1 1 7 3 3 2 2 3 3 2 3 1 1 4 5
## 284 285 286 289 290 291 292 293 294 295 296 298 299
## 2 1 4 2 5 3 5 2 3 1 3 2 3
# 252-254... like before. Did they change primers since then?
lens <- sort(unique(nchar(sq)))
tot <- sapply(lens, function(l) sum(colSums(st)[nchar(sq)==l]))
names(tot) <- as.character(lens)
tot
## 181 184 185 186 188 190 191 192 193 194
## 612 107 350 40 13 162 6 7 12 26
## 195 196 198 199 200 201 202 203 204 205
## 10 19 13 18 10 62 15 169 17 5
## 206 207 209 211 213 214 215 216 217 218
## 2 16 101 7 9 5 374 5 28 4
## 219 220 221 222 223 226 227 228 229 230
## 2 44 5 2 6 18 471 595 3 2
## 232 233 234 236 238 239 240 241 243 244
## 2 4 28 5 2 6 5 2 9 12
## 248 249 250 251 252 253 254 255 256 258
## 5 6 17 116 7368328 6515035 60035 1365 55 2
## 260 261 262 264 265 266 268 269 270 271
## 11 2 13 50 146 2 2 1479 11 7
## 272 275 276 277 278 279 280 281 282 283
## 13 4 9 8 6 46 3 2 10 13
## 284 285 286 289 290 291 292 293 294 295
## 6 2 11 4 116 7 18 4 8 3
## 296 298 299
## 6 344 6
# All in 252-254 basically
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(ddF, getN), sapply(ddR, getN), sapply(mm, getN), rowSums(st))
# If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sam.F
head(track)
## input filtered denoisedF denoisedR merged nonchim
## Netti001 188595 160517 160248 160338 155218 139734
## Netti002 182265 152272 151743 151860 142513 124105
## Netti003 192837 171477 170891 171101 162087 143286
## Netti004 180622 146564 145659 145693 138493 121424
## Netti005 176941 151368 150888 151045 142949 116380
## Netti006 162155 137982 137660 137719 130850 112388
tax <- assignTaxonomy(st, "~/tax/silva_nr99_v138.1_train_set.fa.gz", multithread=TRUE)
taxsp <- addSpecies(tax, "~/tax/silva_species_assignment_v138.1.fa.gz")
rownames(st) <- sam.F
save all data
saveRDS(st, here("data", "following_study", "st.rds"))
saveRDS(tax, here("data", "following_study", "tax.rds"))
saveRDS(taxsp, here("data", "following_study", "taxsp.rds")) # tax at species level
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.4.1 (2024-06-14)
## os macOS 15.5
## system aarch64, darwin20
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/New_York
## date 2025-06-13
## pandoc 3.5 @ /Users/yixuanyang/miniforge3/bin/ (via rmarkdown)
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## abind 1.4-8 2024-09-12 [2] CRAN (R 4.4.1)
## Biobase * 2.66.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## BiocGenerics * 0.52.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## BiocParallel * 1.39.0 2024-05-04 [2] Bioconductor 3.20 (R 4.4.0)
## Biostrings * 2.74.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## bitops 1.0-9 2024-10-03 [2] CRAN (R 4.4.1)
## bslib 0.9.0 2025-01-30 [1] CRAN (R 4.4.1)
## cachem 1.1.0 2024-05-16 [2] CRAN (R 4.4.0)
## cli 3.6.4 2025-02-13 [2] CRAN (R 4.4.1)
## codetools 0.2-20 2024-03-31 [2] CRAN (R 4.4.1)
## colorspace 2.1-1 2024-07-26 [2] CRAN (R 4.4.0)
## crayon 1.5.3 2024-06-20 [2] CRAN (R 4.4.0)
## dada2 * 1.33.0 2024-05-04 [2] Bioconductor 3.20 (R 4.4.0)
## DelayedArray 0.32.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## deldir 2.0-4 2024-02-28 [2] CRAN (R 4.4.0)
## digest 0.6.37 2024-08-19 [2] CRAN (R 4.4.1)
## dplyr 1.1.4 2023-11-17 [2] CRAN (R 4.4.0)
## evaluate 1.0.3 2025-01-10 [1] CRAN (R 4.4.1)
## farver 2.1.2 2024-05-13 [2] CRAN (R 4.4.0)
## fastmap 1.2.0 2024-05-15 [2] CRAN (R 4.4.0)
## generics 0.1.3 2022-07-05 [2] CRAN (R 4.4.0)
## GenomeInfoDb * 1.42.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## GenomeInfoDbData 1.2.13 2024-10-05 [2] Bioconductor
## GenomicAlignments * 1.42.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## GenomicRanges * 1.58.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## ggplot2 3.5.1 2024-04-23 [2] CRAN (R 4.4.0)
## glue 1.8.0 2024-09-30 [2] CRAN (R 4.4.1)
## gtable 0.3.6 2024-10-25 [2] CRAN (R 4.4.1)
## here * 1.0.1 2020-12-13 [2] CRAN (R 4.4.0)
## htmltools 0.5.8.1 2024-04-04 [2] CRAN (R 4.4.0)
## httr 1.4.7 2023-08-15 [2] CRAN (R 4.4.0)
## hwriter 1.3.2.1 2022-04-08 [2] CRAN (R 4.4.0)
## interp 1.1-6 2024-01-26 [2] CRAN (R 4.4.0)
## IRanges * 2.40.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## jpeg 0.1-10 2022-11-29 [2] CRAN (R 4.4.0)
## jquerylib 0.1.4 2021-04-26 [2] CRAN (R 4.4.0)
## jsonlite 2.0.0 2025-03-27 [1] CRAN (R 4.4.1)
## knitr 1.50 2025-03-16 [1] CRAN (R 4.4.1)
## labeling 0.4.3 2023-08-29 [2] CRAN (R 4.4.0)
## lattice 0.22-6 2024-03-20 [2] CRAN (R 4.4.1)
## latticeExtra 0.6-30 2022-07-04 [2] CRAN (R 4.4.0)
## lifecycle 1.0.4 2023-11-07 [2] CRAN (R 4.4.0)
## magrittr 2.0.3 2022-03-30 [2] CRAN (R 4.4.0)
## Matrix 1.7-1 2024-10-18 [2] CRAN (R 4.4.1)
## MatrixGenerics * 1.18.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## matrixStats * 1.4.1 2024-09-08 [2] CRAN (R 4.4.1)
## munsell 0.5.1 2024-04-01 [2] CRAN (R 4.4.0)
## patchwork * 1.3.0 2024-09-16 [2] CRAN (R 4.4.1)
## pillar 1.10.2 2025-04-05 [1] CRAN (R 4.4.1)
## pkgconfig 2.0.3 2019-09-22 [2] CRAN (R 4.4.0)
## plyr 1.8.9 2023-10-02 [2] CRAN (R 4.4.0)
## png 0.1-8 2022-11-29 [2] CRAN (R 4.4.0)
## pwalign 1.2.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## R6 2.6.1 2025-02-15 [1] CRAN (R 4.4.1)
## RColorBrewer 1.1-3 2022-04-03 [2] CRAN (R 4.4.0)
## Rcpp * 1.0.14 2025-01-12 [1] CRAN (R 4.4.1)
## RcppParallel 5.1.9 2024-08-19 [2] CRAN (R 4.4.1)
## reshape2 1.4.4 2020-04-09 [2] CRAN (R 4.4.0)
## rlang 1.1.5 2025-01-17 [2] CRAN (R 4.4.1)
## rmarkdown 2.29 2024-11-04 [2] CRAN (R 4.4.1)
## rprojroot 2.0.4 2023-11-05 [2] CRAN (R 4.4.0)
## Rsamtools * 2.21.2 2024-09-26 [2] Bioconductor 3.20 (R 4.4.1)
## rstudioapi 0.17.1 2024-10-22 [2] CRAN (R 4.4.1)
## S4Arrays 1.6.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## S4Vectors * 0.44.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## sass 0.4.9 2024-03-15 [2] CRAN (R 4.4.0)
## scales 1.3.0 2023-11-28 [2] CRAN (R 4.4.0)
## sessioninfo 1.2.2 2021-12-06 [2] CRAN (R 4.4.0)
## ShortRead * 1.63.2 2024-09-26 [2] Bioconductor 3.20 (R 4.4.1)
## SparseArray 1.6.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## stringi 1.8.7 2025-03-27 [1] CRAN (R 4.4.1)
## stringr 1.5.1 2023-11-14 [2] CRAN (R 4.4.0)
## SummarizedExperiment * 1.36.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## tibble 3.2.1 2023-03-20 [2] CRAN (R 4.4.0)
## tidyselect 1.2.1 2024-03-11 [2] CRAN (R 4.4.0)
## UCSC.utils 1.2.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## vctrs 0.6.5 2023-12-01 [2] CRAN (R 4.4.0)
## withr 3.0.2 2024-10-28 [2] CRAN (R 4.4.1)
## xfun 0.52 2025-04-02 [1] CRAN (R 4.4.1)
## XVector * 0.46.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## yaml 2.3.10 2024-07-26 [2] CRAN (R 4.4.0)
## zlibbioc 1.52.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
##
## [1] /Users/yixuanyang/Library/R/arm64/4.4/library
## [2] /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
##
## ──────────────────────────────────────────────────────────────────────────────