######################### # R code for study: # # Can bacterial indicators of a grassy woodland restoration inform # ecosystem assessment and microbiota-mediated human health? # # Authors: Craig Liddicoat, Philip Weinstein, Andrew Bissett, # Nicholas J.C. Gellie, Jacob G. Mills, Michelle Waycott, Martin F. Breed # # Input data available at: # https://figshare.com/s/1acbc273dfc93da272be # (DOI: 10.25909/5cbef4802c4d8) # ######################### .libPaths() # "C:/R/R-3.5.1/library" R.Version() # "R version 3.5.1 (2018-07-02)" citation() # R Core Team (2018). R: A language and environment for statistical computing. R Foundation for Statistical # Computing, Vienna, Austria. URL https://www.R-project.org/ library(readxl); packageVersion("readxl") # '1.1.0' library(readr); packageVersion("readr") # '1.1.1' library(plyr); packageVersion("plyr") # '1.8.4' library(dplyr); packageVersion("dplyr") # '0.7.6' library(vegan);packageVersion("vegan") # '2.5.2' library(phyloseq); packageVersion("phyloseq") # '1.24.0' library(ggplot2); packageVersion("ggplot2") # '3.0.0' library(reshape2); packageVersion("reshape2") # '1.4.3' library(moments); packageVersion("moments") # '0.14' library(doParallel); packageVersion("doParallel") # '1.0.11' library(raster); packageVersion("raster") # '2.6.7' library(sp); packageVersion("sp") # '1.3.1' library(rgdal); packageVersion("rgdal") # '1.3.3' library(ordPens); packageVersion("ordPens") # '0.3.1' library(DESeq2); packageVersion("DESeq2") # '1.20.0' library(ggrepel); packageVersion("ggrepel") # '0.8.0' library(nabor); packageVersion("nabor") # '0.4.7' library(seqinr); packageVersion("seqinr") # '3.4.5' library(themetagenomics); packageVersion("themetagenomics") # '0.1.0' library(gplots); packageVersion("gplots") # ‘3.0.1’ library(gtools); packageVersion("gtools") # ‘3.8.1’ library(ggsignif); packageVersion("ggsignif") # ‘0.4.0’ ######################### workdir <- "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" setwd(workdir) getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" datadir <- "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/datasets" datadir par.default <- par() # define projection WGS84 WGS84<-"+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0" RO.default<-rasterOptions() # datatype: FLT8S, chunksize: 1e+07, maxmemory: 1e+08 rasterOptions(datatype="FLT4S",progress="text",chunksize=1e+08,maxmemory=1e+09,tmpdir="C:/Workspace/TEMP/temp-rasters") # maxmemory = max no of cells to read into memory #### Mt Bold restoration gradient - import sample contextual data #------------------------- context <- read_excel(path= paste0(datadir,"/","Mt-Bold-context-data.xlsx"), sheet=1, range="A1:BM151", col_names = TRUE) context <- as.data.frame(context) str(context) # fix variables that should be numeric unique(context$`NH3-N`) # [1] "11" "6" "12" "2" "8" "9" "7" "< 1" "5" "3" "10" "4" "15" "14" "16" "17" context$`NH3-N` <- as.numeric(context$`NH3-N`) unique(context$`NO3-`) # [1] "9" "2" "6" "5" "8" "14" "11" "10" "20" "17" "4" "1" "3" "12" "< 1" "22" # [17] "209" "33" context$`NO3-` <- as.numeric(context$`NO3-`) unique(context$Sulphur) context$Sulphur <- as.numeric(context$Sulphur) table(context$`Data Type`) # MiSeq 16S MiSeq 18S MiSeq ITS # 50 50 50 unique( paste0(context$Latitude,"&",context$Longitude) ) # [1] "-35.091696&138.654574" "-35.090961&138.65477" "-35.0906934&138.6549204" # [4] "-35.085397&138.658856" "-35.085429&138.659129" "-35.085733&138.659439" # [7] "-35.09727&138.671064" "-35.097408&138.67071" "-35.09698&138.670996" # [10] "-35.098208&138.677674" "-35.097955&138.677728" "-35.098403&138.677728" # [13] "-35.097955&138.677626" "-35.102971&138.684416" "-35.103062&138.684929" # [16] "-35.103011&138.685117" "-35.091744&138.676234" "-35.091563&138.676184" # [19] "-35.091272&138.67605" "-35.102142&138.650709" "-35.101944&138.650715" # [22] "-35.101999&138.650319" "-35.081399&138.658608" "-35.081391&138.658225" # [25] "-35.081551&138.657617" "-35.081479&138.657191" # i.e. 26 unique spatial coordinates str(context) # 'data.frame': 150 obs. of 65 variables: # $ Sample ID : chr "bpa-base-genomics-amplicon-16s-19281_1-agejk" "bpa-base-genomics-amplicon-16s-19282_1-agejk" "bpa-base-genomics-amplicon-16s-19283_1-agejk" "bpa-base-genomics-amplicon-16s-19284_1-agejk" ... # $ BPA ID : chr "102.100.100.19281" "102.100.100.19282" "102.100.100.19283" "102.100.100.19284" ... # $ BioSample Accession : chr "SAMN07488642" "SAMN07488643" "SAMN07488644" "SAMN07488645" ... # $ Data Type : chr "MiSeq 16S" "MiSeq 16S" "MiSeq 16S" "MiSeq 16S" ... # $ Collection Site : chr "Mount bold" "Mount bold" "Mount bold" "Mount bold" ... # $ Sample ID2 : chr "MB-2009-1-10" "MB-2009-1-20" "MB-2009-2-10" "MB-2009-2-20" ... # $ Sample ID3 : chr "2009.1.10" "2009.1.20" "2009.2.10" "2009.2.20" ... # $ Latitude : num -35.1 -35.1 -35.1 -35.1 -35.1 ... # $ Longitude : num 139 139 139 139 139 ... # $ Date Sampled : POSIXct, format: "2014-12-17" "2014-12-17" "2014-12-17" ... # $ Soil Depth (cm) : num 0 20 0 20 0 20 0 20 0 20 ... # $ Horizon : chr "A" "A" "A" "A" ... # $ Storage : chr "Frozen" "Frozen" "Frozen" "Frozen" ... # $ Broad Land Use : chr "Conservation and natural environments" "Conservation and natural environments" "Conservation and natural environments" "Conservation and natural environments" ... # $ Detailed Land Use : chr "Surface water supply" "Surface water supply" "Surface water supply" "Surface water supply" ... # $ Ecological Zone : chr "Temperate" "Temperate" "Temperate" "Temperate" ... # $ Vegetation Type : chr "Woodland" "Woodland" "Woodland" "Woodland" ... # $ Vegetation Cover : num 80 80 80 80 80 80 90 90 90 90 ... # $ Elevation : num 334 334 336 336 337 337 349 349 349 349 ... # $ Slope : num 3 3 3 3 0 0 3 3 3 3 ... # $ Slope Aspect : chr "SE" "SE" "SE" "SE" ... # $ Profile Position : chr "Summit/ridge" "Summit/ridge" "Summit/ridge" "Summit/ridge" ... # $ Australian Soil Classification : chr "chromosols" "chromosols" "chromosols" "chromosols" ... # $ FAO Soil Classification : chr "NA" "NA" "NA" "NA" ... # $ Immediate Previous Land Use : chr "Native/exotic pasture mosaic" "Native/exotic pasture mosaic" "Native/exotic pasture mosaic" "Native/exotic pasture mosaic" ... # $ Date since change in Land Use : chr "2009" "2009" "2009" "2009" ... # $ Crop Rotation 1 year since present : chr "NA" "NA" "NA" "NA" ... # $ Crop Rotation 2 years since present: chr "NA" "NA" "NA" "NA" ... # $ Crop Rotation 3 years since present: chr "NA" "NA" "NA" "NA" ... # $ Crop Rotation 4 years since present: chr "NA" "NA" "NA" "NA" ... # $ Crop Rotation 5 years since present: chr "NA" "NA" "NA" "NA" ... # $ Agrochemical Additions : chr "NA" "NA" "NA" "NA" ... # $ Tillage : chr "NA" "NA" "NA" "NA" ... # $ Fire History : chr "NA" "NA" "NA" "NA" ... # $ Fire Intensity : chr "NA" "NA" "NA" "NA" ... # $ Flooding : chr "NA" "NA" "NA" "NA" ... # $ Extreme Events : chr "NA" "NA" "NA" "NA" ... # $ Moisture : num 1.87 2.85 2.62 4.23 2.51 3.58 2.42 3.81 2.34 3.71 ... # $ Colour : chr "LTGR" "GRYW" "LTGR" "YWGR" ... # $ Gravel : num 5 5 5 5 5 5 5 5 5 5 ... # $ Texture : num 2 2 2 2 2 2 2 2 2 2 ... # $ Course Sand : num 44.6 50.9 45.1 44.3 41.1 ... # $ Fine Sand : num 32.7 28.5 30.2 30.1 34.3 ... # $ Sand : num 77.3 79.3 75.3 74.5 75.4 ... # $ Silt : num 6.21 4.15 10.31 9.85 8.23 ... # $ Clay : num 16.5 16.5 14.3 15.7 16.4 ... # $ NH3-N : num 11 6 12 2 8 9 7 NA 5 NA ... # $ NO3- : num 9 2 6 5 8 5 14 11 14 10 ... # $ Colwell P : num 10 7 12 7 12 8 19 16 29 24 ... # $ Colwell K : num 244 124 183 73 179 105 266 111 201 120 ... # $ Sulphur : num NA NA NA NA NA NA NA NA NA NA ... # $ Organic Carbon : num 2.93 1.5 3.27 1.23 2.73 1.42 3.12 1.25 2.77 1.35 ... # $ Conductivity : num 0.06 0.031 0.105 0.023 0.062 ... # $ CaCl2pH : num 4.6 4.6 4.5 4.5 4.5 4.6 4.6 4.7 4.7 4.9 ... # $ H2O pH : num 5.7 5.8 5.6 5.6 5.6 5.8 5.6 5.7 5.7 5.9 ... # $ DTPA Cu : num 0.72 0.66 0.79 0.75 0.88 0.82 0.88 0.93 1.05 0.97 ... # $ DTPA Fe : num 231 244 300 298 348 ... # $ DTPA Mn : num 6.55 3.84 7.29 4.05 6.18 4.23 6.84 2.78 6.1 3.64 ... # $ DTPA Zn : num 1.02 0.5 0.99 0.41 1.09 0.63 5.57 1.57 2.6 1.15 ... # $ Exc Al : num 0.176 0.23 0.276 0.377 0.287 0.259 0.256 0.326 0.182 0.191 ... # $ Exc Ca : num 3.85 2.68 3.68 2.14 3.65 2.57 4.16 2.54 4 2.34 ... # $ Exc Mg : num 1.03 0.81 1.04 0.79 1.01 0.77 1.27 0.77 1.17 0.76 ... # $ Exc K : num 0.42 0.22 0.34 0.11 0.35 0.17 0.57 0.2 0.39 0.21 ... # $ Exc Na : num 0.14 0.1 0.28 0.09 0.18 0.1 0.25 0.1 0.17 0.08 ... # $ B Hot CaCl2 : num 0.67 0.58 0.62 0.57 0.69 ... ## check spatial location of samples shp.context <- context coordinates(shp.context) <- ~ Longitude + Latitude class(shp.context) # "SpatialPointsDataFrame" shp.context@proj4string # NA shp.context@proj4string <- CRS(WGS84) plot(shp.context) # save as ArcGIS shapefile writeOGR(obj=shp.context, dsn="C:/Workspace/PROJ/PAPER-MICRO-Hazard-Surv/GIS-working", layer="Mt-Bold-context-data-shapefile", driver="ESRI Shapefile") ## Labels for context data? # 'data.frame': 150 obs. of 64 variables: # $ Sample ID : chr "bpa-base-genomics-amplicon-16s-19281_1-agejk" "bpa-base-genomics-amplicon-16s-19282_1-agejk" "bpa-base-genomics-amplicon-16s-19283_1-agejk" "bpa-base-genomics-amplicon-16s-19284_1-agejk" ... # $ BPA ID : chr "102.100.100.19281" "102.100.100.19282" "102.100.100.19283" "102.100.100.19284" ... # $ BioSample Accession : chr "SAMN07488642" "SAMN07488643" "SAMN07488644" "SAMN07488645" ... # $ Data Type : chr "MiSeq 16S" "MiSeq 16S" "MiSeq 16S" "MiSeq 16S" ... # $ Collection Site : chr "Mount bold" "Mount bold" "Mount bold" "Mount bold" ... # $ Sample ID2 : chr "MB-2009-1-10" "MB-2009-1-20" "MB-2009-2-10" "MB-2009-2-20" ... # $ Latitude : num -35.1 -35.1 -35.1 -35.1 -35.1 ... # $ Longitude : num 139 139 139 139 139 ... # $ Date Sampled : POSIXct, format: "2014-12-17" "2014-12-17" "2014-12-17" "2014-12-17" ... # $ Soil Depth (cm) ... unique(context$`Sample ID`) # qty 150, includes 16s, 18s, its data unique(context$`BPA ID`) # qty 50 # [1] "102.100.100.19281" "102.100.100.19282" "102.100.100.19283" "102.100.100.19284" # [5] "102.100.100.19285" "102.100.100.19286" "102.100.100.19287" "102.100.100.19288" # ... # [45] "102.100.100.19325" "102.100.100.19326" "102.100.100.19327" "102.100.100.19328" # [49] "102.100.100.19329" "102.100.100.19330" ## create new 'BPA_ID_short' context$BPA_ID_short <- sub(pattern="102.100.100.", replacement="X", # prefix X is used below when import the OTU data x=context$`BPA ID`) unique(context$BPA_ID_short) # [1] "X19281" "X19282" "X19283" "X19284" "X19285" "X19286" "X19287" "X19288" "X19289" "X19290" # [11] "X19291" "X19292" "X19293" "X19294" "X19295" "X19296" "X19297" "X19298" "X19299" "X19300" # [21] "X19301" "X19302" "X19303" "X19304" "X19305" "X19306" "X19307" "X19308" "X19309" "X19310" # [31] "X19311" "X19312" "X19313" "X19314" "X19315" "X19316" "X19317" "X19318" "X19319" "X19320" # [41] "X19321" "X19322" "X19323" "X19324" "X19325" "X19326" "X19327" "X19328" "X19329" "X19330" unique(context$`BioSample Accession`) # [1] "SAMN07488642" "SAMN07488643" "SAMN07488644" "SAMN07488645" "SAMN07488646" "SAMN07488647" # [7] "SAMN07488648" "SAMN07488649" "SAMN07488650" "SAMN07488651" "SAMN07488652" "SAMN07488653" # [13] "SAMN07488654" "SAMN07488655" "SAMN07488656" "SAMN07488657" "SAMN07488658" "SAMN07488659" # [19] "SAMN07488660" "SAMN07488661" "SAMN07488662" "SAMN07488663" "SAMN07488664" "SAMN07488665" # [25] "SAMN07488666" "SAMN07488667" "SAMN07488668" "SAMN07488669" "SAMN07488670" "SAMN07488671" # [31] "SAMN07488672" "SAMN07488673" "SAMN07488674" "SAMN07488675" "SAMN07488676" "SAMN07488677" # [37] "SAMN07488678" "SAMN07488679" "SAMN07488680" "SAMN07488681" "SAMN07488682" "SAMN07488683" # [43] "SAMN07488684" "SAMN07488685" "SAMN07488686" "SAMN07488687" "SAMN07488688" "SAMN07488689" # [49] "SAMN07488690" "SAMN07488691" unique(context$`Sample ID2`) # [1] "MB-2009-1-10" "MB-2009-1-20" "MB-2009-2-10" "MB-2009-2-20" "MB-2009-3-10" "MB-2009-3-20" # [7] "MB-neg-1-10" "MB-neg-1-20" "MB-neg-2-10" "MB-neg-2-20" "MB-neg-3-10" "MB-neg-3-20" # [13] "MB-2005-1-10" "MB-2005-1-20" "MB-2005-2-10" "MB-2005-2-20" "MB-2005-3-10" "MB-2005-3-20" # [19] "MB-2008-1-10" "MB-2008-1-20" "MB-2008-2-10" "MB-2008-2-20" "MB-2008-3-10" "MB-2008-3-20" # [25] "MB-2007-1-10" "MB-2007-1-20" "MB-2007-2-10" "MB-2007-2-20" "MB-2007-3-10" "MB-2007-3-20" # [31] "MB-posA-1-10" "MB-posA-1-20" "MB-posA-2-10" "MB-posA-2-20" "MB-posA-3-10" "MB-posA-3-20" # [37] "MB-posB-1-10" "MB-posB-1-20" "MB-posB-2-10" "MB-posB-2-20" "MB-posB-3-10" "MB-posB-3-20" # [43] "MB-posC-1-10" "MB-posC-1-20" "MB-posC-2-10" "MB-posC-2-20" "MB-posC-3-10" "MB-posC-3-20" # [49] "MB-posC-4-10" "MB-posC-4-20" unique(context$`Sample ID3`) # [1] "2009.1.10" "2009.1.20" "2009.2.10" "2009.2.20" "2009.3.10" "2009.3.20" "neg.1.10" "neg.1.20" # [9] "neg.2.10" "neg.2.20" "neg.3.10" "neg.3.20" "2005.1.10" "2005.1.20" "2005.2.10" "2005.2.20" # [17] "2005.3.10" "2005.3.20" "2008.1.10" "2008.1.20" "2008.2.10" "2008.2.20" "2008.3.10" "2008.3.20" # [25] "2007.1.10" "2007.1.20" "2007.2.10" "2007.2.20" "2007.3.10" "2007.3.20" "a.1.10" "a.1.20" # [33] "a.2.10" "a.2.20" "a.3.10" "a.3.20" "b.1.10" "b.1.20" "b.2.10" "b.2.20" # [41] "b.3.10" "b.3.20" "c.1.10" "c.1.20" "c.2.10" "c.2.20" "c.3.10" "c.3.20" # [49] "c.4.10" "c.4.20" ## create variable for reveg age & depth context$Reveg_age_and_depth <- NA # cleared sel <- which(context$`Sample ID2` %in% c("MB-neg-1-10","MB-neg-2-10","MB-neg-3-10") ) context$Reveg_age_and_depth[sel] <- "Cleared (0-10 cm)" sel <- which(context$`Sample ID2` %in% c("MB-neg-1-20","MB-neg-2-20","MB-neg-3-20") ) context$Reveg_age_and_depth[sel] <- "Cleared (20-30 cm)" # 2005: Reveg 10 years ago sel <- which(context$`Sample ID2` %in% c("MB-2005-1-10","MB-2005-2-10","MB-2005-3-10") ) context$Reveg_age_and_depth[sel] <- "10 years (0-10 cm)" sel <- which(context$`Sample ID2` %in% c("MB-2005-1-20","MB-2005-2-20","MB-2005-3-20") ) context$Reveg_age_and_depth[sel] <- "10 years (20-30 cm)" # 2007: Reveg 8 years sel <- which(context$`Sample ID2` %in% c("MB-2007-1-10","MB-2007-2-10","MB-2007-3-10") ) context$Reveg_age_and_depth[sel] <- "8 years (0-10 cm)" sel <- which(context$`Sample ID2` %in% c("MB-2007-1-20","MB-2007-2-20","MB-2007-3-20") ) context$Reveg_age_and_depth[sel] <- "8 years (20-30 cm)" # 2008: Reveg 7 years sel <- which(context$`Sample ID2` %in% c("MB-2008-1-10","MB-2008-2-10","MB-2008-3-10") ) context$Reveg_age_and_depth[sel] <- "7 years (0-10 cm)" sel <- which(context$`Sample ID2` %in% c("MB-2008-1-20","MB-2008-2-20","MB-2008-3-20") ) context$Reveg_age_and_depth[sel] <- "7 years (20-30 cm)" # 2009: Reveg 6 years sel <- which(context$`Sample ID2` %in% c("MB-2009-1-10","MB-2009-2-10","MB-2009-3-10") ) context$Reveg_age_and_depth[sel] <- "6 years (0-10 cm)" sel <- which(context$`Sample ID2` %in% c("MB-2009-1-20","MB-2009-2-20","MB-2009-3-20") ) context$Reveg_age_and_depth[sel] <- "6 years (20-30 cm)" # Remnant A sel <- which(context$`Sample ID2` %in% c("MB-posA-1-10","MB-posA-2-10","MB-posA-3-10") ) context$Reveg_age_and_depth[sel] <- "Remnant A (0-10 cm)" sel <- which(context$`Sample ID2` %in% c("MB-posA-1-20","MB-posA-2-20","MB-posA-3-20") ) context$Reveg_age_and_depth[sel] <- "Remnant A (20-30 cm)" # Remnant B sel <- which(context$`Sample ID2` %in% c("MB-posB-1-10","MB-posB-2-10","MB-posB-3-10") ) context$Reveg_age_and_depth[sel] <- "Remnant B (0-10 cm)" sel <- which(context$`Sample ID2` %in% c("MB-posB-1-20","MB-posB-2-20","MB-posB-3-20") ) context$Reveg_age_and_depth[sel] <- "Remnant B (20-30 cm)" # Remnant C sel <- which(context$`Sample ID2` %in% c("MB-posC-1-10","MB-posC-2-10","MB-posC-3-10","MB-posC-4-10") ) context$Reveg_age_and_depth[sel] <- "Remnant C (0-10 cm)" sel <- which(context$`Sample ID2` %in% c("MB-posC-1-20","MB-posC-2-20","MB-posC-3-20","MB-posC-4-20") ) context$Reveg_age_and_depth[sel] <- "Remnant C (20-30 cm)" unique(context$Reveg_age_and_depth) # [1] "6 years (0-10 cm)" "6 years (20-30 cm)" "Cleared (0-10 cm)" # [4] "Cleared (20-30 cm)" "10 years (0-10 cm)" "10 years (20-30 cm)" # [7] "7 years (0-10 cm)" "7 years (20-30 cm)" "8 years (0-10 cm)" # [10] "8 years (20-30 cm)" "Remnant A (0-10 cm)" "Remnant A (20-30 cm)" # [13] "Remnant B (0-10 cm)" "Remnant B (20-30 cm)" "Remnant C (0-10 cm)" # [16] "Remnant C (20-30 cm)" unique(context$`Soil Depth (cm)`) # 0 20 unique(paste0(context$`Sample ID2`,"__",context$`Data Type`)) # [1] "MB-2009-1-10__MiSeq 16S" "MB-2009-1-20__MiSeq 16S" "MB-2009-2-10__MiSeq 16S" # [4] "MB-2009-2-20__MiSeq 16S" "MB-2009-3-10__MiSeq 16S" "MB-2009-3-20__MiSeq 16S" # ... # [145] "MB-posC-2-10__MiSeq ITS" "MB-posC-2-20__MiSeq ITS" "MB-posC-3-10__MiSeq ITS" # [148] "MB-posC-3-20__MiSeq ITS" "MB-posC-4-10__MiSeq ITS" "MB-posC-4-20__MiSeq ITS" # #------------------------- #### Mt Bold restoration gradient - Bacteria 16s data # - import OTUs, taxonomy table # - create phyloseq object (phy.16s) #------------------------- ### isolate sample context data sel <- which(context$`Data Type` == "MiSeq 16S") # qty 50 samp.16s <- context[sel, ] # use 'Sample ID3' as the row name to match otu_table later in phyloseq object samp.16s$`Sample ID3` # [1] "2009.1.10" "2009.1.20" "2009.2.10" "2009.2.20" "2009.3.10" "2009.3.20" "neg.1.10" # [8] "neg.1.20" "neg.2.10" "neg.2.20" "neg.3.10" "neg.3.20" "2005.1.10" "2005.1.20" # [15] "2005.2.10" "2005.2.20" "2005.3.10" "2005.3.20" "2008.1.10" "2008.1.20" "2008.2.10" # [22] "2008.2.20" "2008.3.10" "2008.3.20" "2007.1.10" "2007.1.20" "2007.2.10" "2007.2.20" # [29] "2007.3.10" "2007.3.20" "a.1.10" "a.1.20" "a.2.10" "a.2.20" "a.3.10" # [36] "a.3.20" "b.1.10" "b.1.20" "b.2.10" "b.2.20" "b.3.10" "b.3.20" # [43] "c.1.10" "c.1.20" "c.2.10" "c.2.20" "c.3.10" "c.3.20" "c.4.10" # [50] "c.4.20" row.names(samp.16s) <- samp.16s$`Sample ID3` row.names(samp.16s) # same as above! # OTU raw data matrix from Gellie et al. (2017a) doi: 10.4227/05/5878480a91885 ### OTU table raw.otu.16s <- read_excel(path= paste0(datadir,"/","Mt_Bold_Bacteria_16S_OTU_raw_data_matrix.xlsx"), sheet=1, range="A1:BD3317", col_names = TRUE) raw.otu.16s <- as.data.frame(raw.otu.16s) str(raw.otu.16s) # this is a combined OTU abundance table & taxonomy table dim(raw.otu.16s) # 3316 56 row.names(raw.otu.16s) <- raw.otu.16s$OTUId head(row.names(raw.otu.16s)) # [1] "AMD_16S_OTUa_14" "AMD_16S_OTUa_143" "AMD_16S_OTUa_409" "AMD_16S_OTUa_82" "AMD_16S_OTUa_288" # [6] "AMD_16S_OTUa_52" # now remove 1st column (it is now stored as the row.name) raw.otu.16s <- raw.otu.16s[ , -1] names(raw.otu.16s) # [1] "kingdom" "phylum" "class" "order" "family" "genus" "species" "neg.1.10" "neg.1.20" # [10] "neg.2.10" "neg.2.20" "neg.3.10" "neg.3.20" "2009.1.10" "2009.1.20" "2009.2.10" "2009.2.20" "2009.3.10" # [19] "2009.3.20" "2008.1.10" "2008.1.20" "2008.2.10" "2008.2.20" "2008.3.10" "2008.3.20" "2007.1.10" "2007.1.20" # [28] "2007.2.10" "2007.2.20" "2007.3.10" "2007.3.20" "2005.1.10" "2005.1.20" "2005.2.10" "2005.2.20" "2005.3.10" # [37] "2005.3.20" "a.1.10" "a.1.20" "a.2.10" "a.2.20" "a.3.10" "a.3.20" "b.1.10" "b.1.20" # [46] "b.2.10" "b.2.20" "b.3.10" "b.3.20" "c.1.10" "c.1.20" "c.2.10" "c.2.20" "c.4.10" # [55] "c.4.20" unique(names(raw.otu.16s)) ## align available OTU abundance data with samples? sel <- which(row.names(samp.16s) %in% names(raw.otu.16s)) samp.withOTUs.16s <- row.names(samp.16s)[sel] samp.withOTUs.16s # [1] "2009.1.10" "2009.1.20" "2009.2.10" "2009.2.20" "2009.3.10" "2009.3.20" "neg.1.10" # [8] "neg.1.20" "neg.2.10" "neg.2.20" "neg.3.10" "neg.3.20" "2005.1.10" "2005.1.20" # [15] "2005.2.10" "2005.2.20" "2005.3.10" "2005.3.20" "2008.1.10" "2008.1.20" "2008.2.10" # [22] "2008.2.20" "2008.3.10" "2008.3.20" "2007.1.10" "2007.1.20" "2007.2.10" "2007.2.20" # [29] "2007.3.10" "2007.3.20" "a.1.10" "a.1.20" "a.2.10" "a.2.20" "a.3.10" # [36] "a.3.20" "b.1.10" "b.1.20" "b.2.10" "b.2.20" "b.3.10" "b.3.20" # [43] "c.1.10" "c.1.20" "c.2.10" "c.2.20" "c.4.10" "c.4.20" # only keep sample data where OTU abundance data also available samp.16s <- samp.16s[sel, ] dim(samp.16s) # 48 67 # Subset for OTU table and convert to matrix otu.16s <- as.matrix( raw.otu.16s[, samp.withOTUs.16s ] ) dim(otu.16s) # 3316 48 ## Taxonomy table also comes from raw.otu.16s tax.16s <- raw.otu.16s[, c("kingdom","phylum","class","order","family","genus","species") ] # rownames are already matching to OTUs # convert to matrix tax.16s <- as.matrix(tax.16s) dim(tax.16s) # 3316 7 ## Create 'otuTable' # otu_table - Works on any numeric matrix. # You must also specify if the species are rows or columns OTU.16s <- otu_table(otu.16s, taxa_are_rows = TRUE) ## Create 'taxonomyTable' # tax_table - Works on any character matrix. # The rownames must match the OTU names (taxa_names) of the otu_table if you plan to combine it with a phyloseq-object. TAX.16s <- tax_table(tax.16s) ## Create a phyloseq object, merging OTU & TAX tables phy.16s = phyloseq(OTU.16s, TAX.16s) phy.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 3316 taxa and 48 samples ] # tax_table() Taxonomy Table: [ 3316 taxa by 7 taxonomic ranks ] sample_names(phy.16s) # [1] "2009.1.10" "2009.1.20" "2009.2.10" "2009.2.20" "2009.3.10" "2009.3.20" "neg.1.10" # [8] "neg.1.20" "neg.2.10" "neg.2.20" "neg.3.10" "neg.3.20" "2005.1.10" "2005.1.20" # [15] "2005.2.10" "2005.2.20" "2005.3.10" "2005.3.20" "2008.1.10" "2008.1.20" "2008.2.10" # [22] "2008.2.20" "2008.3.10" "2008.3.20" "2007.1.10" "2007.1.20" "2007.2.10" "2007.2.20" # [29] "2007.3.10" "2007.3.20" "a.1.10" "a.1.20" "a.2.10" "a.2.20" "a.3.10" # [36] "a.3.20" "b.1.10" "b.1.20" "b.2.10" "b.2.20" "b.3.10" "b.3.20" # [43] "c.1.10" "c.1.20" "c.2.10" "c.2.20" "c.4.10" "c.4.20" ### Now Add sample data to phyloseq object # sample_data - Works on any data.frame. The rownames must match the sample names in # the otu_table if you plan to combine them as a phyloseq-object dim(samp.16s) # 48 67 length(which(samp.16s$`Sample ID3` %in% sample_names(phy.16s))) # 48 str(samp.16s) # add variable 'Reveg_age', simplify from ... samp.16s$Reveg_age_and_depth # [1] "6 years (0-10 cm)" "6 years (20-30 cm)" "6 years (0-10 cm)" "6 years (20-30 cm)" # [5] "6 years (0-10 cm)" "6 years (20-30 cm)" "Cleared (0-10 cm)" "Cleared (20-30 cm)" # [9] "Cleared (0-10 cm)" "Cleared (20-30 cm)" "Cleared (0-10 cm)" "Cleared (20-30 cm)" # [13] "10 years (0-10 cm)" "10 years (20-30 cm)" "10 years (0-10 cm)" "10 years (20-30 cm)" # [17] "10 years (0-10 cm)" "10 years (20-30 cm)" "7 years (0-10 cm)" "7 years (20-30 cm)" # [21] "7 years (0-10 cm)" "7 years (20-30 cm)" "7 years (0-10 cm)" "7 years (20-30 cm)" # [25] "8 years (0-10 cm)" "8 years (20-30 cm)" "8 years (0-10 cm)" "8 years (20-30 cm)" # [29] "8 years (0-10 cm)" "8 years (20-30 cm)" "Remnant A (0-10 cm)" "Remnant A (20-30 cm)" # [33] "Remnant A (0-10 cm)" "Remnant A (20-30 cm)" "Remnant A (0-10 cm)" "Remnant A (20-30 cm)" # [37] "Remnant B (0-10 cm)" "Remnant B (20-30 cm)" "Remnant B (0-10 cm)" "Remnant B (20-30 cm)" # [41] "Remnant B (0-10 cm)" "Remnant B (20-30 cm)" "Remnant C (0-10 cm)" "Remnant C (20-30 cm)" # [45] "Remnant C (0-10 cm)" "Remnant C (20-30 cm)" "Remnant C (0-10 cm)" "Remnant C (20-30 cm)" samp.16s$Reveg_age <- c( "6 years", "6 years", "6 years", "6 years", "6 years", "6 years", "Cleared", "Cleared", "Cleared", "Cleared", "Cleared", "Cleared", "10 years", "10 years", "10 years", "10 years", "10 years", "10 years", "7 years", "7 years", "7 years", "7 years", "7 years", "7 years", "8 years", "8 years", "8 years", "8 years", "8 years", "8 years", "Remnant A", "Remnant A", "Remnant A", "Remnant A", "Remnant A", "Remnant A", "Remnant B", "Remnant B", "Remnant B", "Remnant B", "Remnant B", "Remnant B", "Remnant C", "Remnant C", "Remnant C", "Remnant C", "Remnant C", "Remnant C" ) # set as ordered factor samp.16s$Reveg_age <- factor(samp.16s$Reveg_age, levels=c( "Cleared", "6 years", "7 years", "8 years", "10 years", "Remnant A", "Remnant B", "Remnant C" ) ,ordered=TRUE) # add variable $depth, simplify from ... samp.16s$`Soil Depth (cm)` samp.16s$depth <- samp.16s$`Soil Depth (cm)` samp.16s$depth <- as.character(samp.16s$depth) samp.16s$depth[ samp.16s$depth=="0" ] <- "0-10 cm" samp.16s$depth[ samp.16s$depth=="20" ] <- "20-30 cm" samp.16s$depth <- as.factor(samp.16s$depth) samp.16s$depth SAMP.16s <- sample_data(samp.16s) ### Combine SAMPDATA into phyloseq object phy.16s <- merge_phyloseq(phy.16s, SAMP.16s) phy.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 3316 taxa and 48 samples ] # sample_data() Sample Data: [ 48 samples by 69 sample variables ] # tax_table() Taxonomy Table: [ 3316 taxa by 7 taxonomic ranks ] ## remove taxa not assigned as Bacteria levels(as.factor( get_taxa_unique(phy.16s, taxonomic.rank = "kingdom" ))) # keep_taxa <- which(tax_table(phy.16s)[, "kingdom"] == "k__Bacteria") phy.16s <- prune_taxa(phy.16s, taxa = row.names(tax_table(phy.16s)[keep_taxa, ]) ) phy.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 3316 taxa and 48 samples ] # sample_data() Sample Data: [ 48 samples by 69 sample variables ] # tax_table() Taxonomy Table: [ 3316 taxa by 7 taxonomic ranks ] ## remove taxa not assigned at the phylum level rem_taxa <- which(tax_table(phy.16s)[, "phylum"] == "unclassified") # empty!! # phy.16s <- prune_taxa(phy.16s, taxa = row.names(tax_table(phy.16s)[-rem_taxa, ]) ) # phy.16s rank_names(phy.16s) # "kingdom" "phylum" "class" "order" "family" "genus" "species" sort( as.character( unique( tax_table(phy.16s)[, "phylum"] ) )) sort( as.character( unique( tax_table(phy.16s)[, "class"] ) )) sort( as.character( unique( tax_table(phy.16s)[, "order"] ) )) sort( as.character( unique( tax_table(phy.16s)[, "family"] ) )) # ## remove taxa associated with chloroplast, streptophyta, and mitochondria rem_taxa1 <- which(tax_table(phy.16s)[, "class"] == "c__Chloroplast") # qty 4 OTUs rem_taxa2 <- which(tax_table(phy.16s)[, "order"] == "o__Streptophyta") # qty 3 OTUs rem_taxa3 <- which(tax_table(phy.16s)[, "family"] == "f__mitochondria") # qty 26 OTUs c(rem_taxa1,rem_taxa2,rem_taxa3) # [1] 277 599 3065 3297 277 599 3065 154 276 343 394 406 648 877 1160 1249 1261 1870 1900 1910 1938 2173 2234 2358 2611 2773 # [27] 2792 2952 3042 3064 3098 3176 3218 unique(c(rem_taxa1,rem_taxa2,rem_taxa3)) # [1] 277 599 3065 3297 154 276 343 394 406 648 877 1160 1249 1261 1870 1900 1910 1938 2173 2234 2358 2611 2773 2792 2952 3042 # [27] 3064 3098 3176 3218 ... i.e. "o__Streptophyta" overlap with "c__Chloroplast" phy.16s <- prune_taxa(phy.16s, taxa = row.names(tax_table(phy.16s)[-unique(c(rem_taxa1,rem_taxa2,rem_taxa3)), ]) ) phy.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 3286 taxa and 48 samples ] # sample_data() Sample Data: [ 48 samples by 69 sample variables ] # tax_table() Taxonomy Table: [ 3286 taxa by 7 taxonomic ranks ] ## remove taxa if <100 reads across samples min( taxa_sums(phy.16s) ) # 100 # OK, taxa with less than 100 reads have already been filtered out head(phy.16s@otu_table) #temp <- phy.16s ## remove taxa that do not occur in at least two samples phy.16s <- prune_taxa( taxa = apply( otu_table(phy.16s), 1, function(x) {sum(x > 0) >= 2 }), x = phy.16s) phy.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 3282 taxa and 48 samples ] # sample_data() Sample Data: [ 48 samples by 69 sample variables ] # tax_table() Taxonomy Table: [ 3282 taxa by 7 taxonomic ranks ] sample_sums(phy.16s) # 2009.1.10 2009.1.20 2009.2.10 2009.2.20 2009.3.10 2009.3.20 neg.1.10 neg.1.20 neg.2.10 neg.2.20 neg.3.10 neg.3.20 2005.1.10 # 47060 74284 62799 68846 56507 70281 69396 62607 68983 70581 67248 82546 79082 # 2005.1.20 2005.2.10 2005.2.20 2005.3.10 2005.3.20 2008.1.10 2008.1.20 2008.2.10 2008.2.20 2008.3.10 2008.3.20 2007.1.10 2007.1.20 # 62040 83126 74915 67023 80489 51184 67130 51653 79834 58444 51222 54031 72549 # 2007.2.10 2007.2.20 2007.3.10 2007.3.20 a.1.10 a.1.20 a.2.10 a.2.20 a.3.10 a.3.20 b.1.10 b.1.20 b.2.10 # 46444 62689 64552 44969 55335 54357 60801 76768 70132 71100 59018 41990 63401 # b.2.20 b.3.10 b.3.20 c.1.10 c.1.20 c.2.10 c.2.20 c.4.10 c.4.20 # 57449 57735 80107 68995 49446 40015 51419 33625 39191 min(sample_sums(phy.16s)) # 33625 summary( sample_sums(phy.16s) ) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 33625 53437 62744 62154 70356 83126 levels(as.factor( get_taxa_unique(phy.16s, taxonomic.rank = "genus" ))) # [1] "g__[Clostridium]" "g__A17" "g__Achromobacter" # [4] "g__Acinetobacter" "g__Actinoallomurus" "g__Actinomadura" # [7] "g__Actinomycetales" "g__Actinomycetospora" "g__Adhaeribacter" # [10] "g__Afifella" "g__Afipia" "g__Agrobacterium" # [13] "g__Algoriphagus" "g__Alicyclobacillus" "g__Alkaliphilus" # [16] "g__Ammoniphilus" "g__Amycolatopsis" "g__Aneurinibacillus" # [19] "g__Aquicella" "g__Arthrobacter" "g__Asteroleplasma" # [22] "g__Asticcacaulis" "g__Azospirillum" "g__Bacillus" # [25] "g__Bacteroides" "g__Balneimonas" "g__Bdellovibrio" # [28] "g__Beijerinckia" "g__Bosea" "g__Bradyrhizobium" # [31] "g__Brevibacillus" "g__Brownia" "g__BSV43" # [34] "g__Burkholderia" "g__Byssovorax" "g__Caloramator" # [37] "g__Candidatus_Azobacteroides" "g__Candidatus_Koribacter" "g__Candidatus_Solibacter" # [40] "g__Candidatus_Xiphinematobacter" "g__Caulobacter" "g__Cellulomonas" # [43] "g__Chitinophaga" "g__Chryseobacterium" "g__Chthoniobacter" # [46] "g__Clostridium" "g__Cohnella" "g__Collimonas" # [49] "g__Conexibacter" "g__Coprococcus" "g__Cryocola" # [52] "g__Cystobacter" "g__Cytophagales" "g__DA101" # [55] "g__Dactylosporangium" "g__Desulfosporosinus" "g__Devosia" # [58] "g__Dongia" "g__Dyadobacter" "g__Dyella" # [61] "g__Dysgonomonas" "g__Edaphobacter" "g__Ellin506" # [64] "g__Enhydrobacter" "g__Ensifer" "g__Enterobacter" # [67] "g__Erwinia" "g__Ferruginibacter" "g__FFCH10602" # [70] "g__Fimbriimonas" "g__Flavisolibacter" "g__Flavobacterium" # [73] "g__Frankia" "g__Friedmanniella" "g__Gaiella" # [76] "g__Gemmata" "g__Geobacter" "g__Geodermatophilus" # [79] "g__Georgenia" "g__Georgfuchsia" "g__heteroC45_4W" # [82] "g__Hyphomicrobium" "g__Inquilinus" "g__Janthinobacterium" # [85] "g__JG37-AG-70" "g__Kaistia" "g__Kaistobacter" # [88] "g__Kibdelosporangium" "g__Kineosporia" "g__Kitasatospora" # [91] "g__Kribbella" "g__Ktedonobacter" "g__Kutzneria" # [94] "g__Labrys" "g__Legionella" "g__Leifsonia" # [97] "g__Luteibacter" "g__Lysinibacillus" "g__Marmoricola" # [100] "g__Massilia" "g__Mesorhizobium" "g__Methylibium" # [103] "g__Methylobacterium" "g__Methylotenera" "g__Microbacterium" # [106] "g__Microlunatus" "g__Modestobacter" "g__Mucilaginibacter" # [109] "g__Mycobacterium" "g__Niastella" "g__Nitrosovibrio" # [112] "g__Nitrospira" "g__Nocardioides" "g__Nocardiopsis" # [115] "g__Nonomuraea" "g__Oceanobacillus" "g__Opitutus" # [118] "g__OR-59" "g__Oryzihumus" "g__Paenibacillus" # [121] "g__Pedobacter" "g__Pedomicrobium" "g__Pedosphaera" # [124] "g__Pelomonas" "g__Pelosinus" "g__Phenylobacterium" # [127] "g__Phyllobacterium" "g__Pimelobacter" "g__Pirellula" # [130] "g__Planctomyces" "g__Polyangium" "g__Prosthecobacter" # [133] "g__Pseudomonas" "g__Pseudonocardia" "g__Pullulanibacillus" # [136] "g__Rahnella" "g__Ramlibacter" "g__Reyranella" # [139] "g__Rhodanobacter" "g__Rhodoblastus" "g__Rhodococcus" # [142] "g__Rhodomicrobium" "g__Rhodopila" "g__Rhodoplanes" # [145] "g__Rickettsiella" "g__Roseomonas" "g__Rubrivivax" # [148] "g__Rubrobacter" "g__Rudaea" "g__Rummeliibacillus" # [151] "g__Sedimentibacter" "g__Sediminibacterium" "g__Segetibacter" # [154] "g__Shimazuella" "g__Singulisphaera" "g__Skermanella" # [157] "g__Smaragdicoccus" "g__SMB53" "g__Solibacillus" # [160] "g__Solirubrobacter" "g__Solitalea" "g__Sorangium" # [163] "g__Sphingobacterium" "g__Sphingomonas" "g__Sporocytophaga" # [166] "g__Sporosarcina" "g__Stenotrophomonas" "g__Steroidobacter" # [169] "g__Streptacidiphilus" "g__Streptomyces" "g__Tatlockia" # [172] "g__Telmatospirillum" "g__Tepidibacter" "g__Terracoccus" # [175] "g__Terriglobus" "g__Tissierella_Soehngenia" "g__Turicibacter" # [178] "g__Uliginosibacterium" "g__Verrucomicrobium" "g__Viridibacillus" # [181] "g__Williamsia" "g__Yonghaparkia" "unclassified" ntaxa(phy.16s) # 3282 nsamples(phy.16s) # 48 sample_names(phy.16s) # [1] "2009.1.10" "2009.1.20" "2009.2.10" "2009.2.20" "2009.3.10" "2009.3.20" "neg.1.10" # [8] "neg.1.20" "neg.2.10" "neg.2.20" "neg.3.10" "neg.3.20" "2005.1.10" "2005.1.20" # [15] "2005.2.10" "2005.2.20" "2005.3.10" "2005.3.20" "2008.1.10" "2008.1.20" "2008.2.10" # [22] "2008.2.20" "2008.3.10" "2008.3.20" "2007.1.10" "2007.1.20" "2007.2.10" "2007.2.20" # [29] "2007.3.10" "2007.3.20" "a.1.10" "a.1.20" "a.2.10" "a.2.20" "a.3.10" # [36] "a.3.20" "b.1.10" "b.1.20" "b.2.10" "b.2.20" "b.3.10" "b.3.20" # [43] "c.1.10" "c.1.20" "c.2.10" "c.2.20" "c.4.10" "c.4.20" length(phy.16s@sam_data$Reveg_age_and_depth) # 48 length(unique(phy.16s@sam_data$Reveg_age_and_depth)) # 16 unique(phy.16s@sam_data$Reveg_age_and_depth) # [1] "6 years (0-10 cm)" "6 years (20-30 cm)" "Cleared (0-10 cm)" # [4] "Cleared (20-30 cm)" "10 years (0-10 cm)" "10 years (20-30 cm)" # [7] "7 years (0-10 cm)" "7 years (20-30 cm)" "8 years (0-10 cm)" # [10] "8 years (20-30 cm)" "Remnant A (0-10 cm)" "Remnant A (20-30 cm)" # [13] "Remnant B (0-10 cm)" "Remnant B (20-30 cm)" "Remnant C (0-10 cm)" # [16] "Remnant C (20-30 cm)" # "neg.1.10" "neg.2.10" "neg.3.10" # "neg.1.20" "neg.2.20" "neg.3.20" # # "2009.1.10" "2009.2.10" "2009.3.10" # "2009.1.20" "2009.2.20" "2009.3.20" # # "2008.1.10" "2008.2.10" "2008.3.10" # "2008.1.20" "2008.2.20" "2008.3.20" # # "2007.1.10" "2007.2.10" "2007.3.10" # "2007.1.20" "2007.2.20" "2007.3.20" # # "2005.1.10" "2005.2.10" "2005.3.10" # "2005.1.20" "2005.2.20" "2005.3.20" # # "a.1.10" "a.2.10" "a.3.10" # "a.1.20" "a.2.20" "a.3.20" # # "b.1.10" "b.2.10" "b.3.10" # "b.1.20" "b.2.20" "b.3.20" # # "c.1.10" "c.2.10" "c.4.10" # "c.1.20" "c.2.20" "c.4.20" # #------------------------- #### Mt Bold restoration gradient - Bacteria 16s data - Preliminary data analysis # Rarefy once and explore: # - exp(H), effective no of species # - ordination plots (B diversity) # - Rel abundance of Phyla (bar plot) # - PERMANOVA # - Rarefaction curve # - db-RDA / CCA #------------------------- min( sample_sums(phy.16s) ) # 33625 min(taxa_sums(phy.16s)) # 100 # rarefy #1 seed <- 123 r1.16s <- rarefy_even_depth(phy.16s, sample.size = min(sample_sums(phy.16s)), rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) min(taxa_sums(r1.16s)) # 36 sample_sums(r1.16s) # all 33625 ntaxa(r1.16s) # 3282 class( r1.16s@sam_data$Reveg_age ) # "ordered" "factor" unique( r1.16s@sam_data$Reveg_age ) # [1] 6 years Cleared 10 years 7 years 8 years Remnant A Remnant B Remnant C # Levels: Cleared < 6 years < 7 years < 8 years < 10 years < Remnant A < Remnant B < Remnant C shan.r1.16s <- plot_richness(r1.16s, measures=c("Shannon")) shan.r1.16s str(shan.r1.16s) # out <- data.frame(sample=shan.r1.16s$data$samples,shannon=shan.r1.16s$data$value, # type=shan.r1.16s$data$Reveg.age , depth=shan.r1.16s$data$Soil.Depth..cm.) # # str(out) # out$depth[ out$depth==0 ] <- "0-10 cm" # auto change to character vector now # out$depth[ out$depth=="20" ] <- "20-30 cm" # out$depth <- as.factor(out$depth) out <- data.frame(sample=shan.r1.16s$data$samples,shannon=shan.r1.16s$data$value, type=shan.r1.16s$data$Reveg_age, depth=shan.r1.16s$data$depth) # # calculate effective no of species out$eff_no_spp <- exp(out$shannon) str(out) # 'data.frame': 48 obs. of 5 variables: # $ sample : Factor w/ 48 levels "2005.1.10","2005.1.20",..: 19 20 21 22 23 24 43 44 45 46 ... # $ shannon : num 6.3 6.26 6.11 6 6.02 ... # $ type : Ord.factor w/ 8 levels "Cleared"<"6 years"<..: 2 2 2 2 2 2 1 1 1 1 ... # $ depth : Factor w/ 2 levels "0-10 cm","20-30 cm": 1 2 1 2 1 2 1 2 1 2 ... # $ eff_no_spp: num 545 524 450 405 412 ... #melt.out <- melt(data=out) p <- ggplot(data=out, aes(type, eff_no_spp)) + geom_point() + theme(axis.text.x = element_text(angle=90, vjust=0.5) ) + labs(x = "Reveg age", y = "Effective no of species") + facet_wrap( ~ depth) p ### ORDINATION PLOT [finished-plot] # # # # # # # # # # # # # # # ### NMDS + Bray-Curtis - 0-10 & 20-30 all on one plot !!!!!!!!!!! set.seed(123) ord <- ordinate(r1.16s, "NMDS", "bray") ord # Call: # metaMDS(comm = veganifyOTU(physeq), distance = distance) # # global Multidimensional Scaling using monoMDS # # Data: wisconsin(sqrt(veganifyOTU(physeq))) # Distance: bray # # Dimensions: 2 # Stress: 0.1681273 # Stress type 1, weak ties # Two convergent solutions found after 20 tries # Scaling: centring, PC rotation, halfchange scaling # Species: expanded scores based on ‘wisconsin(sqrt(veganifyOTU(physeq)))’ p <- plot_ordination(r1.16s, ord, type="samples", color="Reveg_age", shape="depth") p str(p) p$labels$shape <- "Depth" p + theme_bw() + ggtitle("(a) 16S NMDS Bray") p + theme_bw() + geom_polygon(aes(fill=Reveg.age),alpha=0.6) + geom_point(size=2) + ggtitle("(a) 16S NMDS Bray") p + theme_bw() + geom_polygon(aes(fill=Reveg_age),alpha=0.4) + ggtitle("(a)") + scale_colour_discrete(name ="Sample type\n(Reveg age)") + scale_fill_discrete(name ="Sample type\n(Reveg age)") cols <- c("Cleared" = "#e31a1c", "6 years" = "#addd8e", "7 years" = "#78c679", "8 years" = "#41ab5d", "10 years"= "#238443", "Remnant A" = "#4292c6", "Remnant B" = "#2171b5", "Remnant C" = "#084594") pp <- p + theme_bw() + geom_polygon(aes(fill=Reveg_age),alpha=0.4) + # ggtitle("(a)") + scale_colour_manual(values = cols, name ="Sample type\n(Reveg age)") + scale_fill_manual(values = cols, name ="Sample type\n(Reveg age)") + annotate(geom="text", x= -0.5, y= -0.38, label = "Stress = 0.168", hjust=0, vjust=1) # hjust = 0, vjust=1 pp ggsave(plot=pp, filename = paste0("finished-plots/","Ord-NMDS-Bray-16S-vFINAL.tiff"), width = 12, height = 10, units = "cm", dpi = 600, compression = "lzw") # # # # # # # # # # # # # # # ### Relative abundance of Phyla - bar plot rank_names(phy.16s) # "kingdom" "phylum" "class" "order" "family" "genus" "species" #rel_abun.Phyla.phy.16s <- transform_sample_counts( tax_glom( phy.16s, taxrank = "phylum" ), function(x) x / sum(x)) rel_abun.Phyla.phy.16s <- transform_sample_counts( tax_glom( phy.16s, taxrank = "phylum" ), function(x) (100/3)*x / sum(x)) # multiplied by (100/3) so totals sum to 100% # identify significantly trending phyla and those with substantial proportion? # Abundance is calculated in plot_bar p<-plot_bar( rel_abun.Phyla.phy.16s , x = "Reveg_age", fill = "phylum", facet_grid = ~depth ) p str(p) # List of 9 # $ data :'data.frame': 1296 obs. of 74 variables: # ... # ..$ Reveg_age_and_depth : chr [1:1296] "7 years (20-30 cm)" "Cleared (0-10 cm)" "Remnant B (20-30 cm)" "8 years (0-10 cm)" ... # ..$ Reveg_age : Ord.factor w/ 8 levels "Cleared"<"6 years"<..: 3 1 7 4 8 7 8 7 7 7 ... # ..$ depth : Factor w/ 2 levels "0-10 cm","20-30 cm": 2 1 2 1 1 1 1 2 1 2 ... # ..$ kingdom : Factor w/ 1 level "k__Bacteria": 1 1 1 1 1 1 1 1 1 1 ... # ..$ phylum : Factor w/ 27 levels "p__Acidobacteria",..: 20 20 1 20 20 20 20 1 20 20 ... ## Identify which Phyla are rare ??? hist(p$data$Abundance) out <- data.frame(Abun = p$data$Abundance, Phylum = p$data$phylum, Samp_type = p$data$Reveg_age_and_depth, Reveg_age = p$data$Reveg_age, Depth = p$data$depth) str(out) # 'data.frame': 1296 obs. of 5 variables: # $ Abun : num 13.7 13.4 13.2 12.9 12.8 ... # $ Phylum : Factor w/ 27 levels "p__Acidobacteria",..: 20 20 1 20 20 20 20 1 20 20 ... # $ Samp_type: Factor w/ 16 levels "10 years (0-10 cm)",..: 6 9 14 7 15 13 15 14 13 14 ... # $ Reveg_age: Ord.factor w/ 8 levels "Cleared"<"6 years"<..: 3 1 7 4 8 7 8 7 7 7 ... # $ Depth : Factor w/ 2 levels "0-10 cm","20-30 cm": 2 1 2 1 1 1 1 2 1 2 ... ## Set threshold for major phyla summary(out$Abun) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 0.000000 0.005208 0.060848 1.234568 1.399714 13.613942 ## how many Phyla at thresholds > 1.5% ? levels(as.factor(as.character( out$Phylum[which(out$Abun >= 1.5)] ))) # qty 9 # [1] "p__Acidobacteria" "p__Actinobacteria" "p__Bacteroidetes" "p__Chloroflexi" # [5] "p__Firmicutes" "p__Gemmatimonadetes" "p__Planctomycetes" "p__Proteobacteria" # [9] "p__Verrucomicrobia" major_phyla <- levels(as.factor(as.character( out$Phylum[which(out$Abun >= 1.5)] ))) major_phyla # [1] "p__Acidobacteria" "p__Actinobacteria" "p__Bacteroidetes" "p__Chloroflexi" # [5] "p__Firmicutes" "p__Gemmatimonadetes" "p__Planctomycetes" "p__Proteobacteria" # [9] "p__Verrucomicrobia" sel.row <- which(out$Phylum %in% major_phyla) 100*sum( out$Abun[sel.row] )/sum(out$Abun) # 97.22755 % of relative abundance is covered 100 - 97.22755 # 2.77245 % left representing rare phyla str(out) # note that $Phylum is a factor # out$Phylum <- as.character(out$Phylum) # out$Phylum[-sel.row] <- "Other minor phyla" # out$Phylum <- as.factor(out$Phylum) sel.row.plot <- which(p$data$phylum %in% major_phyla) p$data$phylum <- as.character(p$data$phylum) p$data$phylum[ -sel.row.plot ] <- "Other minor phyla" p$data$phylum <- as.factor(p$data$phylum) levels(p$data$phylum) # [1] "Other minor phyla" "p__Acidobacteria" "p__Actinobacteria" "p__Bacteroidetes" # [5] "p__Chloroflexi" "p__Firmicutes" "p__Gemmatimonadetes" "p__Planctomycetes" # [9] "p__Proteobacteria" "p__Verrucomicrobia" p$labels$fill <- "Phyla" p$labels$x <- "Reveg age" #old_phyla_16S <- levels(p$data$phylum) old_phyla_16S <- c( "p__Acidobacteria" , "p__Actinobacteria" , "p__Bacteroidetes" , "p__Chloroflexi" , "p__Firmicutes" , "p__Gemmatimonadetes", "p__Planctomycetes", "p__Proteobacteria" , "p__Verrucomicrobia", "Other minor phyla" ) ## re-label phyla for plot new_phyla_16S <- c( "Acidobacteria", "Actinobacteria", "Bacteroidetes", "Chloroflexi", "Firmicutes" , "Gemmatimonadetes", "Planctomycetes", "Proteobacteria", "Verrucomicrobia", "Other minor phyla" ) #change to character, then revert to factor afterwards p$data$phylum <- as.character(p$data$phylum) for (i in 1:length(old_phyla_16S)) { #i<-1 p$data$phylum[which(p$data$phylum == old_phyla_16S[i])] <- new_phyla_16S[i] } p$data$phylum <- factor(p$data$phylum, levels = new_phyla_16S, ordered = TRUE) ## Change colour - length(old_phyla_16S) # 10 new_phyla_16S # [1] "Acidobacteria" "Actinobacteria" "Bacteroidetes" "Chloroflexi" # [5] "Firmicutes" "Gemmatimonadetes" "Planctomycetes" "Proteobacteria" # [9] "Verrucomicrobia" "Other minor phyla" cols.phlya.16S <-c( "Acidobacteria"= "#DB7F67", "Actinobacteria" = "#F9EA9A", "Bacteroidetes" = "#6494AA", "Chloroflexi" = "#99E1D9", "Firmicutes" = "#EAC8CA", "Gemmatimonadetes" = "#E9B872", "Planctomycetes" = "#AEF6C7", "Proteobacteria" = "#F3A712", "Verrucomicrobia" = "#DB324D", "Other minor phyla" = "#5B5750" ) # change x-axis labels levels(p$data$Reveg_age) # "Cleared" "6 years" "7 years" "8 years" "10 years" "Remnant A" "Remnant B" "Remnant C" p$data$Reveg_age <- factor(p$data$Reveg_age, levels = levels(p$data$Reveg_age), labels = c("Clear", "6 yr", "7 yr", "8 yr", "10 yr", "Rem A", "Rem B", "Rem C")) pp <- p + theme_bw() + #ggtitle("(a)") + scale_fill_manual(values = cols.phlya.16S, name ="Phyla") pp$theme$axis.text.x$vjust <- 0.5 pp$theme$axis.text.x$hjust <- 0.5 pp$theme$axis.text.x$angle <- 90 pp$theme$axis.text.x$angle <- 90 pp$labels$y <- "OTU relative abundance (%)" pp getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" ggsave(plot=pp, filename = paste0("finished-plots/","Plot-Bar-Relative-abun-Phyla-16S-vFINAL.tiff"), width = 14, height = 12, units = "cm", dpi = 600, compression = "lzw") ### PERMANOVA # PERMANOVA with phyloseq? # http://deneflab.github.io/MicrobeMiseq/demos/mothur_2_phyloseq.html ##In this example we are testing the hypothesis that the three stations we collected samples from have different centroids # set.seed(1) # # Calculate bray curtis distance matrix # erie_bray <- phyloseq::distance(erie_scale, method = "bray") # # make a data frame from the sample_data # sampledf <- data.frame(sample_data(erie)) # # Adonis test # adonis(erie_bray ~ Station, data = sampledf) # # Homogeneity of dispersion test # beta <- betadisper(erie_bray, sampledf$Station) # permutest(beta) # https://github.com/joey711/phyloseq/issues/689 # metadata <- as(sample_data(physeq), "data.frame") # adonis(distance(physeq, method="bray") ~ Treatment, data = metadata) ##PERMANOVA on bray curtis distance matrix # Test hypothesis that microbiota vary (with different centroids) by Reveg age # Calculate bray curtis distance matrix set.seed(123) r1.16s.bray <- phyloseq::distance(r1.16s, method = "bray") sampledf <- data.frame(sample_data(r1.16s)) str(sampledf) # Adonis test adonis(r1.16s.bray ~ Reveg_age, data = sampledf) # Call: # adonis(formula = r1.16s.bray ~ Reveg_age, data = sampledf) # # Permutation: free # Number of permutations: 999 # # Terms added sequentially (first to last) # # Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) # Reveg_age 7 3.5671 0.50959 5.4858 0.4898 0.001 *** # Residuals 40 3.7157 0.09289 0.5102 # Total 47 7.2828 1.0000 # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Homogeneity of dispersion test beta <- betadisper(r1.16s.bray, sampledf$Reveg_age) permutest(beta) # Permutation test for homogeneity of multivariate dispersions # Permutation: free # Number of permutations: 999 # # Response: Distances # Df Sum Sq Mean Sq F N.Perm Pr(>F) # Groups 7 0.006306 0.0009008 0.2202 999 0.97 # Residuals 40 0.163607 0.0040902 # These outputs tell us that our adonis test is significant so we can reject the null hypothesis # that microbiota have the same centroid across different Reveg_age. # Additionally, our betadisper results are not significant, meaning we cannot reject the null # hypothesis that our groups have the same dispersions. This means we can be more confident that # our adonis result is a real result, and not due to differences in group dispersions. # Adonis test adonis(r1.16s.bray ~ Reveg_age + depth, data = sampledf) # Call: # adonis(formula = r1.16s.bray ~ Reveg_age + depth, data = sampledf) # # Permutation: free # Number of permutations: 999 # # Terms added sequentially (first to last) # # Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) # Reveg_age 7 3.5671 0.50959 7.0625 0.48980 0.001 *** # depth 1 0.9017 0.90169 12.4968 0.12381 0.001 *** # Residuals 39 2.8140 0.07215 0.38639 # Total 47 7.2828 1.00000 # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 beta <- betadisper(r1.16s.bray, sampledf$depth) permutest(beta) # Permutation test for homogeneity of multivariate dispersions # Permutation: free # Number of permutations: 999 # # Response: Distances # Df Sum Sq Mean Sq F N.Perm Pr(>F) # Groups 1 0.001254 0.0012542 0.43 999 0.543 # Residuals 46 0.134165 0.0029166 ### Rarefaction curve ## Code for calculating rarefaction curves using phyloseq are here (provided by Bela Hausmann) # https://github.com/joey711/phyloseq/issues/143 # # example # data('GlobalPatterns') # # psdata <- GlobalPatterns # psdata # sample_sums(psdata) # # CL3 CC1 SV1 M31Fcsw M11Fcsw M31Plmr M11Plmr F21Plmr M31Tong M11Tong LMEpi24M SLEpi20M AQC1cm # # 864077 1135457 697509 1543451 2076476 718943 433894 186297 2000402 100187 2117592 1217312 1167748 # # AQC4cm AQC7cm NP2 NP3 NP5 TRRsed1 TRRsed2 TRRsed3 TS28 TS29 Even1 Even2 Even3 # # 2357181 1699293 523634 1478965 1652754 58688 493126 279704 937466 1211071 1216137 971073 1078241 # summary(sample_sums(psdata)) # # Min. 1st Qu. Median Mean 3rd Qu. Max. # # 58688 567103 1106849 1085257 1527330 2357181 set.seed(42) calculate_rarefaction_curves <- function(psdata, measures, depths) { require('plyr') # ldply require('reshape2') # melt estimate_rarified_richness <- function(psdata, measures, depth) { if(max(sample_sums(psdata)) < depth) return() psdata <- prune_samples(sample_sums(psdata) >= depth, psdata) rarified_psdata <- rarefy_even_depth(psdata, depth, replace = FALSE, verbose = FALSE) # default: replace = TRUE alpha_diversity <- estimate_richness(rarified_psdata, measures = measures) # as.matrix forces the use of melt.array, which includes the Sample names (rownames) molten_alpha_diversity <- melt(as.matrix(alpha_diversity), varnames = c('Sample', 'Measure'), value.name = 'Alpha_diversity') molten_alpha_diversity } names(depths) <- depths # this enables automatic addition of the Depth to the output by ldply rarefaction_curve_data <- ldply(depths, estimate_rarified_richness, psdata = psdata, measures = measures, .id = 'Depth', .progress = ifelse(interactive(), 'text', 'none')) # convert Depth from factor to numeric rarefaction_curve_data$Depth <- as.numeric(levels(rarefaction_curve_data$Depth))[rarefaction_curve_data$Depth] rarefaction_curve_data } ### Repeat for my data sample_sums(phy.16s) # 2009.1.10 2009.1.20 2009.2.10 2009.2.20 2009.3.10 2009.3.20 neg.1.10 neg.1.20 neg.2.10 neg.2.20 neg.3.10 neg.3.20 # 47060 74284 62799 68846 56507 70281 69396 62607 68983 70581 67248 82546 # 2005.1.10 2005.1.20 2005.2.10 2005.2.20 2005.3.10 2005.3.20 2008.1.10 2008.1.20 2008.2.10 2008.2.20 2008.3.10 2008.3.20 # 79082 62040 83126 74915 67023 80489 51184 67130 51653 79834 58444 51222 # 2007.1.10 2007.1.20 2007.2.10 2007.2.20 2007.3.10 2007.3.20 a.1.10 a.1.20 a.2.10 a.2.20 a.3.10 a.3.20 # 54031 72549 46444 62689 64552 44969 55335 54357 60801 76768 70132 71100 # b.1.10 b.1.20 b.2.10 b.2.20 b.3.10 b.3.20 c.1.10 c.1.20 c.2.10 c.2.20 c.4.10 c.4.20 # 59018 41990 63401 57449 57735 80107 68995 49446 40015 51419 33625 39191 summary(sample_sums(phy.16s)) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 33625 53437 62744 62154 70356 83126 ## Examine if species accumulation (rarefaction curves) are converging ## (i.e. reaching horizontal asymptote) or not? sample_variables(phy.16s) # [1] "Sample ID" "BPA ID" "BioSample Accession" # [4] "Data Type" "Collection Site" "Sample ID2" # [7] "Sample ID3" "Latitude" "Longitude" # [10] "Date Sampled" "Soil Depth (cm)" "Horizon" # [13] "Storage" "Broad Land Use" "Detailed Land Use" # [16] "Ecological Zone" "Vegetation Type" "Vegetation Cover" # [19] "Elevation" "Slope" "Slope Aspect" # [22] "Profile Position" "Australian Soil Classification" "FAO Soil Classification" # [25] "Immediate Previous Land Use" "Date since change in Land Use" "Crop Rotation 1 year since present" # [28] "Crop Rotation 2 years since present" "Crop Rotation 3 years since present" "Crop Rotation 4 years since present" # [31] "Crop Rotation 5 years since present" "Agrochemical Additions" "Tillage" # [34] "Fire History" "Fire Intensity" "Flooding" # [37] "Extreme Events" "Moisture" "Colour" # [40] "Gravel" "Texture" "Course Sand" # [43] "Fine Sand" "Sand" "Silt" # [46] "Clay" "NH3-N" "NO3-" # [49] "Colwell P" "Colwell K" "Sulphur" # [52] "Organic Carbon" "Conductivity" "CaCl2pH" # [55] "H2O pH" "DTPA Cu" "DTPA Fe" # [58] "DTPA Mn" "DTPA Zn" "Exc Al" # [61] "Exc Ca" "Exc Mg" "Exc K" # [64] "Exc Na" "B Hot CaCl2" "BPA_ID_short" # [67] "Reveg_age_and_depth" "Reveg_age" "depth" phy.16s@sam_data$depth ## run separately for 0-10 cm psdata1 <- prune_samples( phy.16s@sam_data$depth == "0-10 cm", phy.16s ) set.seed(123) rarefaction_curve_data <- calculate_rarefaction_curves(psdata1, c('Observed'), sort(c(min(sample_sums(psdata1)), 1, 10, 100, 1000, 1:90 * 1000)) ) # cover sequence read depths up to max value: 83126 rarefaction_curve_data # need to remove 'X' from start of Sample values, as returned from rarefaction function rarefaction_curve_data$Sample <- as.character(rarefaction_curve_data$Sample) rarefaction_curve_data$Sample rarefaction_curve_data$Sample <- gsub(pattern = "X", x=rarefaction_curve_data$Sample, replacement="") rarefaction_curve_data$Sample ## Add sample data rarefaction_curve_data_summary_verbose <- merge(rarefaction_curve_data, data.frame(sample_data(psdata1)), by.x = 'Sample', by.y = 'row.names') sample_data(psdata1) cols <- c("Cleared" = "#e31a1c", "6 years" = "#addd8e", "7 years" = "#78c679", "8 years" = "#41ab5d", "10 years"= "#238443", "Remnant A" = "#4292c6", "Remnant B" = "#2171b5", "Remnant C" = "#084594") p <- ggplot( data = rarefaction_curve_data_summary_verbose, mapping = aes(x = Depth, y = Alpha_diversity, colour = Reveg_age, group = Sample)) + theme_bw() + scale_colour_manual(values = cols, name ="Sample type\n(Reveg age)") + ggtitle("(a) Mt Bold (0-10 cm)") + labs(x = "OTU sequence depth", y = "Observed OTUs (count)") + geom_line() + geom_vline(xintercept = min(sample_sums(psdata1)), linetype="dotted") + ylim(0,2450) p ggsave(plot=p, filename = paste0("finished-plots/","Rarefaction-curve-0-10-MtBold-vFINAL.tiff"), width = 10, height = 8, units = "cm", dpi = 600, compression = "lzw") ## run separately for 20-30 cm psdata2 <- prune_samples( phy.16s@sam_data$depth == "20-30 cm", phy.16s ) set.seed(123) rarefaction_curve_data <- calculate_rarefaction_curves(psdata2, c('Observed'), sort(c(min(sample_sums(psdata1)), 1, 10, 100, 1000, 1:90 * 1000)) ) # cover min value: 33625; and max value: 83126 summary(sample_sums(psdata2)) rarefaction_curve_data # need to remove 'X' from start of Sample values, as returned from rarefaction function rarefaction_curve_data$Sample <- as.character(rarefaction_curve_data$Sample) #rarefaction_curve_data$Sample rarefaction_curve_data$Sample <- gsub(pattern = "X", x=rarefaction_curve_data$Sample, replacement="") rarefaction_curve_data$Sample ### summarize alpha diversity #rarefaction_curve_data_summary <- ddply(rarefaction_curve_data, c('Depth', 'Sample', 'Measure'), #summarise, Alpha_diversity_mean = mean(Alpha_diversity), Alpha_diversity_sd = sd(Alpha_diversity)) ## Add sample data rarefaction_curve_data_summary_verbose <- merge(rarefaction_curve_data, data.frame(sample_data(psdata2)), by.x = 'Sample', by.y = 'row.names') sample_data(psdata2) cols <- c("Cleared" = "#e31a1c", "6 years" = "#addd8e", "7 years" = "#78c679", "8 years" = "#41ab5d", "10 years"= "#238443", "Remnant A" = "#4292c6", "Remnant B" = "#2171b5", "Remnant C" = "#084594") p <- ggplot( data = rarefaction_curve_data_summary_verbose, mapping = aes(x = Depth, y = Alpha_diversity, colour = Reveg_age, group = Sample)) + theme_bw() + scale_colour_manual(values = cols, name ="Sample type\n(Reveg age)") + ggtitle("(b) 20-30 cm") + labs(x = "OTU sequence depth", y = "Observed no of OTUs") + geom_line() + geom_vline(xintercept = min(sample_sums(psdata2)), linetype="dotted") + ylim(0,2450) p ggsave(plot=p, filename = paste0("finished-plots/","Rarefaction-curve-20-30.tiff"), width = 10, height = 8, units = "cm", dpi = 600, compression = "lzw") ### db-RDA / CCA ? ## e.g. see Regueiro et al 2015 - Water Research paper # db-RDA = constrained redundancy analysis # (or use 'capscale' function in vegan) # In Ramette 2007: # (CCA:) The approach is very similar to that of RDA, except that # CCA is based on unimodal species-environment relationships # whereas RDA is based on linear models (ter Braak, 1986). # Also see Constrained Ordination here: #http://deneflab.github.io/MicrobeMiseq/demos/mothur_2_phyloseq.html ## start with evenly sampled data # rarefy #1 seed <- 123 r1.16s <- rarefy_even_depth(phy.16s, sample.size = min(sample_sums(phy.16s)), rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) min(taxa_sums(r1.16s)) # 36 sample_sums(r1.16s) # all 33625 ntaxa(r1.16s) # 3282 ## check environental variables sample_variables(r1.16s) # [1] "Sample ID" "BPA ID" "BioSample Accession" # [4] "Data Type" "Collection Site" "Sample ID2" # [7] "Sample ID3" "Latitude" "Longitude" # [10] "Date Sampled" "Soil Depth (cm)" "Horizon" # [13] "Storage" "Broad Land Use" "Detailed Land Use" # [16] "Ecological Zone" "Vegetation Type" "Vegetation Cover" # [19] "Elevation" "Slope" "Slope Aspect" # [22] "Profile Position" "Australian Soil Classification" "FAO Soil Classification" # [25] "Immediate Previous Land Use" "Date since change in Land Use" "Crop Rotation 1 year since present" # [28] "Crop Rotation 2 years since present" "Crop Rotation 3 years since present" "Crop Rotation 4 years since present" # [31] "Crop Rotation 5 years since present" "Agrochemical Additions" "Tillage" # [34] "Fire History" "Fire Intensity" "Flooding" # [37] "Extreme Events" "Moisture" "Colour" # [40] "Gravel" "Texture" "Course Sand" # [43] "Fine Sand" "Sand" "Silt" # [46] "Clay" "NH3-N" "NO3-" # [49] "Colwell P" "Colwell K" "Sulphur" # [52] "Organic Carbon" "Conductivity" "CaCl2pH" # [55] "H2O pH" "DTPA Cu" "DTPA Fe" # [58] "DTPA Mn" "DTPA Zn" "Exc Al" # [61] "Exc Ca" "Exc Mg" "Exc K" # [64] "Exc Na" "B Hot CaCl2" "BPA_ID_short" # [67] "Reveg_age_and_depth" "Reveg_age" "depth" ## Add variable CEC r1.16s r1.16s@sam_data$CEC <- NA r1.16s@sam_data$CEC <- rowSums(r1.16s@sam_data[ , c("Exc.Ca","Exc.Mg", "Exc.K","Exc.Na")], na.rm = TRUE, dims = 1) # units of CEC are meq/100g sample_data(r1.16s)[ , "CEC"] sample_variables(r1.16s) # [1] "Sample ID" "BPA ID" "BioSample Accession" # [4] "Data Type" "Collection Site" "Sample ID2" # [7] "Sample ID3" "Latitude" "Longitude" # [10] "Date Sampled" "Soil Depth (cm)" "Horizon" # [13] "Storage" "Broad Land Use" "Detailed Land Use" # [16] "Ecological Zone" "Vegetation Type" "Vegetation Cover" # [19] "Elevation" "Slope" "Slope Aspect" # [22] "Profile Position" "Australian Soil Classification" "FAO Soil Classification" # [25] "Immediate Previous Land Use" "Date since change in Land Use" "Crop Rotation 1 year since present" # [28] "Crop Rotation 2 years since present" "Crop Rotation 3 years since present" "Crop Rotation 4 years since present" # [31] "Crop Rotation 5 years since present" "Agrochemical Additions" "Tillage" # [34] "Fire History" "Fire Intensity" "Flooding" # [37] "Extreme Events" "Moisture" "Colour" # [40] "Gravel" "Texture" "Course Sand" # [43] "Fine Sand" "Sand" "Silt" # [46] "Clay" "NH3-N" "NO3-" # [49] "Colwell P" "Colwell K" "Sulphur" # [52] "Organic Carbon" "Conductivity" "CaCl2pH" # [55] "H2O pH" "DTPA Cu" "DTPA Fe" # [58] "DTPA Mn" "DTPA Zn" "Exc Al" # [61] "Exc Ca" "Exc Mg" "Exc K" # [64] "Exc Na" "B Hot CaCl2" "BPA_ID_short" # [67] "Reveg_age_and_depth" "Reveg_age" "depth" # [70] "CEC" ## try "RDA" # https://rdrr.io/bioc/phyloseq/man/ordinate.html # CAP = [Partial] Constrained Analysis of Principal Coordinates or distance-based RDA, # via capscale. See capscale.phyloseq for more details. In particular, a formula argument # must be provided. ### 0-10 cm r1.16s.0_10 <- prune_samples( r1.16s@sam_data$depth == "0-10 cm", r1.16s ) set.seed(123) dbRDA.0_10 <- ordinate(r1.16s.0_10, method = "CAP", distance = "bray", # left hand side is ignored & is defined by the physeq and distance arguments # right hand side gives the constraining variables formula = ~ Sand + Silt + Clay + Colwell.P + Colwell.K + Organic.Carbon + Conductivity + CaCl2pH + Exc.Al + Exc.Ca + Exc.Mg + Exc.K + Exc.Na + CEC + Condition(Reveg_age) ) str(dbRDA.0_10) dbRDA.0_10$CCA$wa row.names(dbRDA.0_10$CCA$wa) # [1] "2009.1.10" "2009.2.10" "2009.3.10" "neg.1.10" "neg.2.10" "neg.3.10" "2005.1.10" "2005.2.10" "2005.3.10" # [10] "2008.1.10" "2008.2.10" "2008.3.10" "2007.1.10" "2007.2.10" "2007.3.10" "a.1.10" "a.2.10" "a.3.10" # [19] "b.1.10" "b.2.10" "b.3.10" "c.1.10" "c.2.10" "c.4.10" # formula = varespec ~ N + P + K library(vegan) ##https://www.rdocumentation.org/packages/vegan/versions/2.3-5/topics/capscale dbRDA.0_10 plot(dbRDA.0_10) # colour according to sample type cols <- c("Cleared" = "#e31a1c", "6 years" = "#addd8e", "7 years" = "#78c679", "8 years" = "#41ab5d", "10 years"= "#238443", "Remnant A" = "#4292c6", "Remnant B" = "#2171b5", "Remnant C" = "#084594") cols.cap.0_10 <- c( "2009.1.10"= "#addd8e", "2009.2.10"= "#addd8e", "2009.3.10"= "#addd8e", "neg.1.10"= "#e31a1c", "neg.2.10"= "#e31a1c", "neg.3.10"= "#e31a1c", "2005.1.10"= "#238443", "2005.2.10"= "#238443", "2005.3.10"= "#238443", "2008.1.10" = "#78c679", "2008.2.10" = "#78c679", "2008.3.10" = "#78c679", "2007.1.10"= "#41ab5d", "2007.2.10"= "#41ab5d", "2007.3.10"= "#41ab5d", "a.1.10" = "#4292c6", "a.2.10" = "#4292c6", "a.3.10" = "#4292c6", "b.1.10" = "#2171b5", "b.2.10" = "#2171b5", "b.3.10" = "#2171b5", "c.1.10"= "#084594", "c.2.10"= "#084594", "c.4.10"= "#084594" ) #https://www.rdocumentation.org/packages/vegan/versions/2.4-2/topics/plot.cca plot(dbRDA.0_10, cex=1.2) points(dbRDA.0_10, display = "sites", pch=21, bg=cols.cap.0_10) # col = border colour, bg = fill colour #https://www.r-bloggers.com/mastering-r-plot-part-1-colors-legends-and-lines/ legend("bottomright",legend=c("Cleared", "6 years", "7 years", "8 years", "10 years", "Remnant A", "Remnant B", "Remnant C") ,pt.bg=cols,pch=21,bty="o",ncol=2 ) # ,cex=1.2,pt.cex=1.2 getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" dev.print(tiff, filename = paste0("finished-plots/","CAP-dbRDA-0-10cm-vFINAL.tiff"), width = 26, height = 16, units = "cm", res=600, compression = "lzw") dev.print(tiff, filename = paste0("finished-plots/","CAP-dbRDA-0-10cm-vFINAL.tiff"), width = 24, height = 16, units = "cm", res=600, compression = "lzw") #dev.off() set.seed(123) anova(dbRDA.0_10) # Permutation test for capscale under reduced model # Permutation: free # Number of permutations: 999 # # Model: capscale(formula = OTU ~ Sand + Silt + Clay + Colwell.P + Colwell.K + Organic.Carbon + Conductivity + CaCl2pH + Exc.Al + Exc.Ca + Exc.Mg + Exc.K + Exc.Na + CEC + Condition(Reveg_age), data = data, distance = distance) # Df SumOfSqs F Pr(>F) # Model 13 0.98343 1.2391 0.16 # Residual 3 0.18315 ##http://cc.oulu.fi/~jarioksa/softhelp/vegan/html/plot.cca.html # Functions to plot or extract results of constrained correspondence analysis (cca), # redundancy analysis (rda) or constrained analysis of principal coordinates (capscale). #http://cc.oulu.fi/~jarioksa/softhelp/vegan/html/ordiplot.html fig <- ordiplot(dbRDA.0_10) identify(fig, "sites") # check site IDs by clicking into map, # which will add an identifying label !! # #------------------------- #### Mt Bold restoration gradient - Merged-sample bootstrap resampling # - alpha diversity (including bootstrap confidence intervals for group differences) # - relative abundance - identify trending taxa #------------------------- ### Bootstrap resampling ... ## create function to rarefy for initial sample > merge samples by type > rarefy again > calculate shannon's index # # # # # # # # # # # # calc_AlphaDiv_in_parallel <- function(phy_obj, merge_by, min_merge_no ) { # rarefy seed <- 123+j r16s <- rarefy_even_depth(phy_obj, sample.size = min(sample_sums(phy_obj)), rngseed = seed, replace = TRUE, trimOTUs = TRUE, verbose = TRUE) # RUN THIS FOR SAVED OUTPUT #rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) # Merge samples of the same type # to avoid bias in sample contributions, only include groups with three samples levs <- levels(as.factor( eval(parse(text= paste0("r16s@sam_data$",merge_by))) )) count <- numeric(length = length(levs)) for (i in 1:length(levs)) { sel <- which( eval(parse(text= paste0("r16s@sam_data$",merge_by))) == levs[i]) count[i] <- length(sel) } sel.rem <- which(count != min_merge_no) # 3 # remove samples not corresponding to a triplicate if (length(sel.rem)>0) {levs <- levs[-sel.rem]} # determine which samples to leave in for merging - i.e. triplicates only sel <- which( eval(parse(text= paste0("r16s@sam_data$",merge_by))) %in% levs) # subset samples sub.r16s <- subset_samples( samples = sample_names(r16s)[sel] , r16s ) # merge samples merged.r16s <- merge_samples(sub.r16s, group= eval(parse(text= paste0("sub.r16s@sam_data$",merge_by))) ) # note: merging converts $depth to 1 2 1 2 ..., and degrades variable $Reveg.age # now repair these merged.r16s@sam_data$depth[merged.r16s@sam_data$depth==1] <- "0-10 cm" merged.r16s@sam_data$depth[merged.r16s@sam_data$depth==2] <- "20-30 cm" merged.r16s@sam_data$Reveg_age <- NA for (i in 1:nsamples(merged.r16s)) { if (row.names(merged.r16s@sam_data)[i] == "10 years (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "10 years"} if (row.names(merged.r16s@sam_data)[i] == "10 years (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "10 years"} if (row.names(merged.r16s@sam_data)[i] == "6 years (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "6 years"} if (row.names(merged.r16s@sam_data)[i] == "6 years (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "6 years"} if (row.names(merged.r16s@sam_data)[i] == "7 years (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "7 years"} if (row.names(merged.r16s@sam_data)[i] == "7 years (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "7 years"} if (row.names(merged.r16s@sam_data)[i] == "8 years (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "8 years"} if (row.names(merged.r16s@sam_data)[i] == "8 years (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "8 years"} if (row.names(merged.r16s@sam_data)[i] == "Cleared (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Cleared"} if (row.names(merged.r16s@sam_data)[i] == "Cleared (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Cleared"} if (row.names(merged.r16s@sam_data)[i] == "Remnant A (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant A"} if (row.names(merged.r16s@sam_data)[i] == "Remnant A (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant A"} if (row.names(merged.r16s@sam_data)[i] == "Remnant B (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant B"} if (row.names(merged.r16s@sam_data)[i] == "Remnant B (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant B"} if (row.names(merged.r16s@sam_data)[i] == "Remnant C (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant C"} if (row.names(merged.r16s@sam_data)[i] == "Remnant C (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant C"} } # rarefy back to min sample size for this bootstrap sample - i.e. rarefied-merged-rarefied seed <- 234+j rmr16s <- rarefy_even_depth(merged.r16s, sample.size = min(sample_sums(phy_obj)), rngseed = seed, replace = TRUE, trimOTUs = TRUE, verbose = TRUE) # RUN THIS FOR SAVED OUTPUT #rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) # - - - - - - - - - - ### ALPHA DIVERSITY ## calculate Shannon's index shan.rmr16s <- plot_richness(rmr16s, measures=c("Shannon")) # export data out <- data.frame(sample=shan.rmr16s$data$samples,shannon=shan.rmr16s$data$value, Reveg_age=shan.rmr16s$data$Reveg_age, depth=shan.rmr16s$data$depth) out$eff_no <- exp(out$shannon) out$calc_type <- "bootstrap" return(out) } # # # # # # # # # # # # # bootstrap outputs will automatically be stored in a list: b_out # number of bootstrap resamples B <- 100 # iterate in parallel cl<-makeCluster( detectCores()-1 ) # detectCores()-1 registerDoParallel(cl) # remember to stopcluster() b_out <- foreach(j=1:B, .packages=c('phyloseq')) %dopar% calc_AlphaDiv_in_parallel(phy_obj=phy.16s, merge_by="Reveg_age_and_depth", min_merge_no=3 ) stopCluster(cl) names(b_out[[1]]) # "sample" "shannon" "Reveg_age" "depth" "eff_no" "calc_type" dim(b_out[[1]]) # 16 6 #b_out__alpha_div_otus_mt_bold <- b_out getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" saveRDS(b_out__alpha_div_otus_mt_bold, file = "b_out__alpha_div_otus_mt_bold.RDS") ### 1st calculate diversity from one rarefying step ### then append bootstrap-derived uncertainty...(?) # rarefy #1 seed <- 123 r1.16s <- rarefy_even_depth(phy.16s, sample.size = min(sample_sums(phy.16s)), rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) shan.r1.16s <- plot_richness(r1.16s, measures=c("Shannon")) out <- data.frame(sample=shan.r1.16s$data$samples,shannon=shan.r1.16s$data$value, Reveg_age=shan.r1.16s$data$Reveg_age , depth=shan.r1.16s$data$depth) # out$eff_no <- exp(out$shannon) # calculate effective no of species out$calc_type <- "rarefyx1" str(out) # 'data.frame': 48 obs. of 6 variables: # $ sample : Factor w/ 48 levels "2005.1.10","2005.1.20",..: 19 20 21 22 23 24 43 44 45 46 ... # $ shannon : num 6.3 6.26 6.11 6 6.02 ... # $ Reveg_age: Ord.factor w/ 8 levels "Cleared"<"6 years"<..: 2 2 2 2 2 2 1 1 1 1 ... # $ depth : Factor w/ 2 levels "0-10 cm","20-30 cm": 1 2 1 2 1 2 1 2 1 2 ... # $ eff_no : num 545 524 450 405 412 ... # $ calc_type: chr "rarefyx1" "rarefyx1" "rarefyx1" "rarefyx1" ... names(out) # "sample" "shannon" "Reveg_age" "depth" "eff_no" "calc_type" names(b_out[[1]]) # "sample" "shannon" "Reveg_age" "depth" "eff_no" "calc_type" dim(b_out[[1]]) # 16 6 temp <- out for (j in 1:B) { temp <- rbind(temp,b_out[[j]]) } head(temp) temp[1:50, ] tail(temp) names(temp) # "sample" "shannon" "Reveg_age" "depth" "eff_no" "calc_type" melt.out <- melt(temp,id.vars = c("Reveg_age","depth","calc_type"), measure.vars = "eff_no") ## plot ## [finished-plot] # # # # # # # # # # # # # ## apply same standard colours ("cols") cols <- c("Cleared" = "#e31a1c", "6 years" = "#addd8e", "7 years" = "#78c679", "8 years" = "#41ab5d", "10 years"= "#238443", "Remnant A" = "#4292c6", "Remnant B" = "#2171b5", "Remnant C" = "#084594") p <- ggplot(data=melt.out, aes(x=Reveg_age, value)) + ggtitle("(d)") + # run for (a), (b), (c), (d) geom_violin(data = melt.out[ which(melt.out$calc_type == "bootstrap"), ], aes(color = Reveg_age) ) + scale_colour_manual(values = cols) + geom_point(data = melt.out[ which(melt.out$calc_type == "rarefyx1"), ], color="gray20", shape=1 ) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5) ) + labs(x = "Reveg age", y = "Effective OTUs (count)") + theme(legend.position="none") + facet_wrap( ~ depth) + scale_x_discrete(labels=c("Cleared" = "Clear", "6 years" = "6 yr", "7 years" = "7 yr", "8 years" = "8 yr", "10 years"= "10 yr", "Remnant A" = "Rem A", "Remnant B" = "Rem B", "Remnant C" = "Rem C")) p #ggsave(plot=p, filename = paste0("finished-plots/","Alpha-Div-Eff-No-Species-16S-a-WITH-WITH.tiff"), width = 10, height = 8, units = "cm", dpi = 600, compression = "lzw") #ggsave(plot=p, filename = paste0("finished-plots/","Alpha-Div-Eff-No-Species-16S-b-WITH-WITHOUT.tiff"), width = 10, height = 8, units = "cm", dpi = 600, compression = "lzw") #ggsave(plot=p, filename = paste0("finished-plots/","Alpha-Div-Eff-No-Species-16S-c-WITHOUT-WITH.tiff"), width = 10, height = 8, units = "cm", dpi = 600, compression = "lzw") ggsave(plot=p, filename = paste0("finished-plots/","Alpha-Div-Eff-No-Species-16S-d-WITHOUT-WITHOUT.tiff"), width = 10, height = 8, units = "cm", dpi = 600, compression = "lzw") ## Now just plot surface 0-10 cm ## USING WITH REPLACEMMENT, WITH REPLACEMENT temp.surf <- temp[which(temp$depth=="0-10 cm"), ] melt.out.surf <- melt(temp.surf,id.vars = c("Reveg_age","calc_type"), measure.vars = "eff_no") cols <- c("Cleared" = "#e31a1c", "6 years" = "#addd8e", "7 years" = "#78c679", "8 years" = "#41ab5d", "10 years"= "#238443", "Remnant A" = "#4292c6", "Remnant B" = "#2171b5", "Remnant C" = "#084594") p <- ggplot(data=melt.out.surf, aes(x=Reveg_age, value)) + ggtitle("(a)") + geom_violin(data = melt.out.surf[ which(melt.out.surf$calc_type == "bootstrap"), ], aes(color = Reveg_age) ) + scale_colour_manual(values = cols) + geom_point(data = melt.out.surf[ which(melt.out.surf$calc_type == "rarefyx1"), ], color="gray20", shape=1 ) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5) ) + theme(axis.title.x = element_blank()) + labs(y = "Effective OTUs (count)") + # x = "Reveg age", theme(legend.position="none") + scale_x_discrete(labels=c("Cleared" = "Clear", "6 years" = "6 yr", "7 years" = "7 yr", "8 years" = "8 yr", "10 years"= "10 yr", "Remnant A" = "Rem A", "Remnant B" = "Rem B", "Remnant C" = "Rem C")) p ggsave(plot=p, filename = paste0("finished-plots/","Alpha-Div-Eff-No-Species-16S-Surface-Only-vFINAL.tiff"), width = 6, height = 8, units = "cm", dpi = 600, compression = "lzw") ## Now consider three groupings of the surface 0-10 cm, bootstrap only results: ## (i) Clear + 6 yr; (ii) 7,8, 10 yr; (iii) Rem A, B, C str(melt.out.surf) # 'data.frame': 824 obs. of 4 variables: # $ Reveg_age: Factor w/ 8 levels "Cleared","6 years",..: 2 2 2 1 1 1 5 5 5 3 ... # $ calc_type: chr "rarefyx1" "rarefyx1" "rarefyx1" "rarefyx1" ... # $ variable : Factor w/ 1 level "eff_no": 1 1 1 1 1 1 1 1 1 1 ... # $ value : num 545 450 412 512 269 ... mb_surf_boot <- melt.out.surf[ which(melt.out.surf$calc_type == "bootstrap"), ] str(mb_surf_boot) # 'data.frame': 800 obs. of 4 variables: # $ Reveg_age: Factor w/ 8 levels "Cleared","6 years",..: 5 2 3 4 1 6 7 8 5 2 ... # $ calc_type: chr "bootstrap" "bootstrap" "bootstrap" "bootstrap" ... # $ variable : Factor w/ 1 level "eff_no": 1 1 1 1 1 1 1 1 1 1 ... # $ value : num 485 507 632 562 474 ... levels(mb_surf_boot$Reveg_age) # "Cleared" "6 years" "7 years" "8 years" "10 years" "Remnant A" "Remnant B" "Remnant C" mb_surf_boot$group <- NA ## (i) Clear + 6 yr sel <- which(mb_surf_boot$Reveg_age %in% c("Cleared", "6 years")) # qty 200 mb_surf_boot$group[sel] <- "group1" # "Clear to 6 yr" summary(mb_surf_boot$value[sel]) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 457.1 468.8 486.3 487.6 506.5 524.8 ## (ii) 7,8, 10 yr sel <- which(mb_surf_boot$Reveg_age %in% c("7 years", "8 years" , "10 years")) # qty 300 mb_surf_boot$group[sel] <- "group2" # "7 to 10 yr" summary(mb_surf_boot$value[sel]) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 470.3 487.9 562.4 559.9 629.5 651.4 ## (iii) Rem A, B, C sel <- which(mb_surf_boot$Reveg_age %in% c("Remnant A", "Remnant B", "Remnant C")) # qty 300 mb_surf_boot$group[sel] <- "group3" # "Rem A,B,C" summary(mb_surf_boot$value[sel]) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 532.5 553.7 578.1 598.4 662.3 684.8 mb_surf_boot$group_ord <- factor(mb_surf_boot$group, levels = c( "group1", "group2", "group3"), labels = c("Cleared to\n 6 years", "7 to 10\nyears", "Remnants\nA, B, C"), ordered = TRUE ) p <- ggplot(data = mb_surf_boot, aes(x=group_ord, y=value)) + geom_boxplot() + labs( x = NULL, y = "Effective OTUs (count)" ) + theme_bw() p ggsave(plot=p, filename = paste0("finished-plots/","Alpha-Div-Eff-No-OTUs-16S-Surface-Bootstrap-Only-3groups-vFINAL.tiff"), width = 8.3, height = 8.3, units = "cm", dpi = 600, compression = "lzw") ## Add significance bars ## test for significant differences between groups, based on 95% CI of difference. mb_surf_boot[1:20, ] # each 8 rows represents result from a single merged-sample bootstrap iteration diffgp1_gp2 <- list() diffgp1_gp3 <- list() diffgp2_gp3 <- list() for (b in 1:100) { #b<-3 sel <- c(1:8) + (b-1)*8 print(paste(b)) print(paste(sel)) #(i) Clear + 6 yr idx1 <- which(mb_surf_boot$group[sel] == "group1") #(ii) 7,8, 10 yr idx2 <- which(mb_surf_boot$group[sel] == "group2") #(iii) Rem A, B, C idx3 <- which(mb_surf_boot$group[sel] == "group3") diffgp1_gp2[[b]] <- mean(mb_surf_boot$value[sel[idx1]]) - mean(mb_surf_boot$value[sel[idx2]]) diffgp1_gp3[[b]] <- mean(mb_surf_boot$value[sel[idx1]]) - mean(mb_surf_boot$value[sel[idx3]]) diffgp2_gp3[[b]] <- mean(mb_surf_boot$value[sel[idx2]]) - mean(mb_surf_boot$value[sel[idx3]]) } hist(unlist(diffgp1_gp2)) mean(unlist(diffgp1_gp2)) # -72.30426 quantile(unlist(diffgp1_gp2), probs = c(0.025,0.975), na.rm = TRUE) # 2.5% 97.5% # -81.90456 -62.19218 hist(unlist(diffgp1_gp3)) mean(unlist(diffgp1_gp3)) # -110.8683 quantile(unlist(diffgp1_gp3), probs = c(0.025,0.975), na.rm = TRUE) # 2.5% 97.5% # -121.41333 -99.56556 hist(unlist(diffgp2_gp3)) mean(unlist(diffgp2_gp3)) # -38.56401 quantile(unlist(diffgp2_gp3), probs = c(0.025,0.975), na.rm = TRUE) # 2.5% 97.5% # -47.90219 -30.17284 p <- ggplot(data = mb_surf_boot, aes(x=group_ord, y=value)) + geom_boxplot() + labs( x = NULL, y = "Effective OTUs (count)" ) + theme_bw() + theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank() ) + geom_signif(y_position = c(665,695,710), xmin=c(1,2,1), xmax=c(2,3,3), annotation = c("95% CI = -81.9, -62.2", "95% CI = -47.9, -30.2", "95% CI = -121.4, -99.6"), tip_length = 0.01, textsize = 2.2) p ggsave(plot=p, filename = paste0("finished-plots/","Alpha-Div-Eff-No-OTUs-16S-Surface-Bootstrap-Only-3groups-withSigBars-vFINAL.tiff"), width = 8.3, height = 8.3, units = "cm", dpi = 600, compression = "lzw") ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## #### Evaluate genera that correlate/trend with restoration (reveg age) ## Display next available taxonomic rank for "unclassified" genera calc_RelAbun_in_parallel <- function(phy_obj, merge_by, min_merge_no, all_genera) { # ##TESTING ##for (j in 1:10) { ##phy_obj=phy.16s; merge_by="Reveg_age_and_depth"; min_merge_no=3; all_genera=genera ##j=1 ##all_genera <- c( "g__Bacillus", "g__Clostridium", "unclassified") ##this_genus <- "g__Bacillus" # rarefy seed <- 123+j r16s <- rarefy_even_depth(phy_obj, sample.size = min(sample_sums(phy_obj)), rngseed = seed, replace = TRUE, trimOTUs = TRUE, verbose = TRUE) # Merge samples of the same type # to avoid bias in sample contributions, only include groups with three samples levs <- levels(as.factor( eval(parse(text= paste0("r16s@sam_data$",merge_by))) )) count <- numeric(length = length(levs)) for (i in 1:length(levs)) { sel <- which( eval(parse(text= paste0("r16s@sam_data$",merge_by))) == levs[i]) count[i] <- length(sel) } sel.rem <- which(count != min_merge_no) # 3 # remove samples not corresponding to a triplicate if (length(sel.rem)>0) {levs <- levs[-sel.rem]} # determine which samples to leave in for merging - i.e. triplicates only sel <- which( eval(parse(text= paste0("r16s@sam_data$",merge_by))) %in% levs) # subset samples sub.r16s <- subset_samples( samples = sample_names(r16s)[sel] , r16s ) # merge samples merged.r16s <- merge_samples(sub.r16s, group= eval(parse(text= paste0("sub.r16s@sam_data$",merge_by))) ) # note: merging converts $depth to 1 2 1 2 ..., and degrades variable $Reveg.age # now repair these merged.r16s@sam_data$depth[merged.r16s@sam_data$depth==1] <- "0-10 cm" merged.r16s@sam_data$depth[merged.r16s@sam_data$depth==2] <- "20-30 cm" merged.r16s@sam_data$Reveg_age <- NA for (i in 1:nsamples(merged.r16s)) { if (row.names(merged.r16s@sam_data)[i] == "10 years (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "10 years"} if (row.names(merged.r16s@sam_data)[i] == "10 years (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "10 years"} if (row.names(merged.r16s@sam_data)[i] == "6 years (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "6 years"} if (row.names(merged.r16s@sam_data)[i] == "6 years (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "6 years"} if (row.names(merged.r16s@sam_data)[i] == "7 years (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "7 years"} if (row.names(merged.r16s@sam_data)[i] == "7 years (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "7 years"} if (row.names(merged.r16s@sam_data)[i] == "8 years (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "8 years"} if (row.names(merged.r16s@sam_data)[i] == "8 years (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "8 years"} if (row.names(merged.r16s@sam_data)[i] == "Cleared (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Cleared"} if (row.names(merged.r16s@sam_data)[i] == "Cleared (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Cleared"} if (row.names(merged.r16s@sam_data)[i] == "Remnant A (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant A"} if (row.names(merged.r16s@sam_data)[i] == "Remnant A (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant A"} if (row.names(merged.r16s@sam_data)[i] == "Remnant B (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant B"} if (row.names(merged.r16s@sam_data)[i] == "Remnant B (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant B"} if (row.names(merged.r16s@sam_data)[i] == "Remnant C (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant C"} if (row.names(merged.r16s@sam_data)[i] == "Remnant C (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant C"} } # rarefy back to min sample size for this bootstrap sample - i.e. rarefied-merged-rarefied seed <- 234+j rmr16s <- rarefy_even_depth(merged.r16s, sample.size = min(sample_sums(phy_obj)), rngseed = seed, replace = TRUE, trimOTUs = TRUE, verbose = TRUE) # - - - - - - - - - - ### RELATIVE ABUNDANCE relabun.merged.16s <- transform_sample_counts(rmr16s, function(x) x / sum(x) ) # store results out <- list() for (g in 1:length(all_genera)) { #g=185 # g=24 this_genus <- all_genera[g] # as.character(all_genera[g]) out[[ this_genus ]] <- list() # this_genus # what if genus is NOT in this resample? if (!this_genus %in% levels(as.factor( get_taxa_unique(relabun.merged.16s, taxonomic.rank = "genus" ))) ) { out[[ this_genus ]][["df"]] <- NA r_0_10 <- NA r_20_30 <- NA beta_0_10 <- NA beta_20_30 <- NA p_ordAOV_0_10 <- NA p_ordAOV_20_30 <- NA } else { ## for genus = "unclassified" - get next available taxonomic info if (this_genus == "unclassified") { # work through taxonomic table to find all genus = "unclassified" sel_unclass <- which(tax_table(relabun.merged.16s)[ ,"genus"] == "unclassified" ) # qty 2356 ranks <- c("family","order","class","phylum","kingdom") #count_taxa <- 0 # use this to check count of all unclassified genera ## continue loop until all "unclassified" genera accounted for while (length(sel_unclass) > 0) { # find next available taxonomic classification # start with first taxonomic rank, and work up idx_start <- min(sel_unclass) idx_rank <- 1 this_rank <- ranks[idx_rank] this_taxaname <- as.character( tax_table(relabun.merged.16s)[ idx_start , this_rank ] ) while (this_taxaname == "unclassified") { idx_rank <- idx_rank +1 this_rank <- ranks[idx_rank] this_taxaname <- as.character( tax_table(relabun.merged.16s)[ idx_start , this_rank ] ) } this_fullname <- paste0("unclassified (",this_rank,": ",gsub(pattern=".__",x=this_taxaname,replacement=""),")") # gsub() to tidy-up label if (this_rank == "family") {sel_done <- which(tax_table(relabun.merged.16s)[ sel_unclass, this_rank] == this_taxaname )} if (this_rank == "order") {sel_done <- which(tax_table(relabun.merged.16s)[ sel_unclass, this_rank] == this_taxaname & tax_table(relabun.merged.16s)[ sel_unclass, "family"] == "unclassified" )} if (this_rank == "class") {sel_done <- which(tax_table(relabun.merged.16s)[ sel_unclass, this_rank] == this_taxaname & tax_table(relabun.merged.16s)[ sel_unclass, "order" ] == "unclassified" & tax_table(relabun.merged.16s)[ sel_unclass, "family"] == "unclassified" )} if (this_rank == "phylum") {sel_done <- which(tax_table(relabun.merged.16s)[ sel_unclass, this_rank] == this_taxaname & tax_table(relabun.merged.16s)[ sel_unclass, "class"] == "unclassified" & tax_table(relabun.merged.16s)[ sel_unclass, "order" ] == "unclassified" & tax_table(relabun.merged.16s)[ sel_unclass, "family"] == "unclassified" )} keep_taxa <- row.names( tax_table(relabun.merged.16s) )[ sel_unclass[sel_done] ] subsel <- prune_taxa(relabun.merged.16s, taxa = keep_taxa ) #print(paste0(this_fullname,", qty: ",length(keep_taxa))) #count_taxa <- count_taxa + length(keep_taxa) #print(paste0("count total unclassified taxa: ",count_taxa)) # # # # routine for "unclassified" genera only # # # # df <- data.frame( sample = sample_names( subsel ), Reveg_age = subsel@sam_data$Reveg_age, depth = subsel@sam_data$depth, rel_abun = sample_sums( subsel ) ) # assign 'reveg_age_score' df$reveg_age_score <- NA df$reveg_age_score[which(df$Reveg_age == "Cleared")] <- 0 df$reveg_age_score[which(df$Reveg_age == "6 years")] <- 6 df$reveg_age_score[which(df$Reveg_age == "7 years")] <- 7 df$reveg_age_score[which(df$Reveg_age == "8 years")] <- 8 df$reveg_age_score[which(df$Reveg_age == "10 years")] <- 10 df$reveg_age_score[which(df$Reveg_age == "Remnant A")] <- 20 # set equal for all remnant df$reveg_age_score[which(df$Reveg_age == "Remnant B")] <- 20 # set equal for all remnant df$reveg_age_score[which(df$Reveg_age == "Remnant C")] <- 20 # set equal for all remnant # assign 'reveg_order' df$reveg_order <- NA df$reveg_order[which(df$Reveg_age == "Cleared")] <- 1 df$reveg_order[which(df$Reveg_age == "6 years")] <- 2 df$reveg_order[which(df$Reveg_age == "7 years")] <- 3 df$reveg_order[which(df$Reveg_age == "8 years")] <- 4 df$reveg_order[which(df$Reveg_age == "10 years")] <- 5 df$reveg_order[which(df$Reveg_age == "Remnant A")] <- 6 # set equal for all remnant df$reveg_order[which(df$Reveg_age == "Remnant B")] <- 6 # set equal for all remnant df$reveg_order[which(df$Reveg_age == "Remnant C")] <- 6 # set equal for all remnant df$genus <- this_fullname df$calc_type <- "bootstrap" out[[ this_fullname ]] <- list() out[[ this_fullname ]][["df"]] <- df # 0-10 cm sel <- which(df$depth=="0-10 cm") # ignore if not present in 3 or more sample types if (!length(which(df$rel_abun[sel]>0)) >=3 ) { r_0_10 <- NA beta_0_10 <- NA p_ordAOV_0_10 <- NA } else { r_0_10 <- cor(df$reveg_age_score[sel],df$rel_abun[sel]) # here using non-standardized coefficient beta (because standardized coefficients can make it # difficult to make comparisons across groups - as the standardization is different for each group) x<-df$reveg_age_score[sel]; y<-df$rel_abun[sel] beta_0_10 <- coef(lm(y~x))[2] mod <- ordAOV(x=df$reveg_order[sel], y=df$rel_abun[sel], type = "RLRT", nsim=10000) p_ordAOV_0_10 <- mod$p.value # when = 0, that means p-value < 2.2e-16 } # 20-30 cm sel <- which(df$depth=="20-30 cm") # ignore if not present in 3 or more sample types if (!length(which(df$rel_abun[sel]>0)) >=3 ) { r_20_30 <- NA beta_20_30 <- NA p_ordAOV_20_30 <- NA } else { r_20_30 <- cor(df$reveg_age_score[sel],df$rel_abun[sel]) x<-df$reveg_age_score[sel]; y<-df$rel_abun[sel] beta_20_30 <- coef(lm(y~x))[2] mod <- ordAOV(x=df$reveg_order[sel], y=df$rel_abun[sel], type = "RLRT", nsim=10000) p_ordAOV_20_30 <- mod$p.value # when = 0, that means p-value < 2.2e-16 } summary <- data.frame(r_0_10=r_0_10, beta_0_10=beta_0_10, p_ordAOV_0_10=p_ordAOV_0_10, r_20_30=r_20_30, beta_20_30=beta_20_30, p_ordAOV_20_30=p_ordAOV_20_30, B=j, genus= this_fullname ) out[[ this_fullname ]][["summary"]] <- summary # store OTUs out[[ this_fullname ]][["otus"]] <- taxa_names(subsel) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sel_unclass <- sel_unclass[-c(sel_done)] } # END while loop ## now for all classified genera } else { ##g<-2 # TESTING sel_keep_taxa <- which(tax_table(relabun.merged.16s)[ ,"genus"] == this_genus ) keep_taxa <- row.names( tax_table(relabun.merged.16s) )[sel_keep_taxa] subsel <- prune_taxa(relabun.merged.16s, taxa = keep_taxa ) df <- data.frame( sample = sample_names( subsel ), Reveg_age = subsel@sam_data$Reveg_age, depth = subsel@sam_data$depth, rel_abun = sample_sums( subsel ) ) # assign 'reveg_age_score' df$reveg_age_score <- NA df$reveg_age_score[which(df$Reveg_age == "Cleared")] <- 0 df$reveg_age_score[which(df$Reveg_age == "6 years")] <- 6 df$reveg_age_score[which(df$Reveg_age == "7 years")] <- 7 df$reveg_age_score[which(df$Reveg_age == "8 years")] <- 8 df$reveg_age_score[which(df$Reveg_age == "10 years")] <- 10 df$reveg_age_score[which(df$Reveg_age == "Remnant A")] <- 20 # set equal for all remnant df$reveg_age_score[which(df$Reveg_age == "Remnant B")] <- 20 # set equal for all remnant df$reveg_age_score[which(df$Reveg_age == "Remnant C")] <- 20 # set equal for all remnant # assign 'reveg_order' df$reveg_order <- NA df$reveg_order[which(df$Reveg_age == "Cleared")] <- 1 df$reveg_order[which(df$Reveg_age == "6 years")] <- 2 df$reveg_order[which(df$Reveg_age == "7 years")] <- 3 df$reveg_order[which(df$Reveg_age == "8 years")] <- 4 df$reveg_order[which(df$Reveg_age == "10 years")] <- 5 df$reveg_order[which(df$Reveg_age == "Remnant A")] <- 6 # set equal for all remnant df$reveg_order[which(df$Reveg_age == "Remnant B")] <- 6 # set equal for all remnant df$reveg_order[which(df$Reveg_age == "Remnant C")] <- 6 # set equal for all remnant df$genus <- this_genus df$calc_type <- "bootstrap" out[[ this_genus ]][["df"]] <- df # 0-10 cm sel <- which(df$depth=="0-10 cm") # ignore if not present in 3 or more sample types if (!length(which(df$rel_abun[sel]>0)) >=3 ) { r_0_10 <- NA beta_0_10 <- NA p_ordAOV_0_10 <- NA } else { r_0_10 <- cor(df$reveg_age_score[sel],df$rel_abun[sel]) # here using non-standardized coefficient beta (because standardized coefficients can make it # difficult to make comparisons across groups - as the standardization is different for each group) x<-df$reveg_age_score[sel]; y<-df$rel_abun[sel] beta_0_10 <- coef(lm(y~x))[2] mod <- ordAOV(x=df$reveg_order[sel], y=df$rel_abun[sel], type = "RLRT", nsim=10000) p_ordAOV_0_10 <- mod$p.value # when = 0, that means p-value < 2.2e-16 } # 20-30 cm sel <- which(df$depth=="20-30 cm") # ignore if not present in 3 or more sample types if (!length(which(df$rel_abun[sel]>0)) >=3 ) { r_20_30 <- NA beta_20_30 <- NA p_ordAOV_20_30 <- NA } else { r_20_30 <- cor(df$reveg_age_score[sel],df$rel_abun[sel]) x<-df$reveg_age_score[sel]; y<-df$rel_abun[sel] beta_20_30 <- coef(lm(y~x))[2] mod <- ordAOV(x=df$reveg_order[sel], y=df$rel_abun[sel], type = "RLRT", nsim=10000) p_ordAOV_20_30 <- mod$p.value # when = 0, that means p-value < 2.2e-16 } summary <- data.frame(r_0_10=r_0_10, beta_0_10=beta_0_10, p_ordAOV_0_10=p_ordAOV_0_10, r_20_30=r_20_30, beta_20_30=beta_20_30, p_ordAOV_20_30=p_ordAOV_20_30, B=j, genus=this_genus) out[[ this_genus ]][["summary"]] <- summary # store OTUs out[[ this_genus ]][["otus"]] <- taxa_names(subsel) #print(paste0("finished genus # ",g, " = ",this_genus)) } # END else for classified genera } # END else for available genera } # END for all_genera[g] #print(paste0("BOOTSTRAP # ",j," COMPLETE !!")) return(out) } # END function # define genera rank_names(phy.16s) # "kingdom" "phylum" "class" "order" "family" "genus" "species" genera <- levels(as.factor( get_taxa_unique(phy.16s, taxonomic.rank = "genus" ))) genera # bootstrap outputs will automatically be stored in a list: b_out # number of bootstrap resamples B <- 100 # iterate in parallel cl<-makeCluster( detectCores()-1 ) # detectCores()-1 registerDoParallel(cl) # remember to stopcluster() #system.time( b_out <- foreach(j=1:B, .packages=c('phyloseq','ordPens') ) %dopar% calc_RelAbun_in_parallel(phy_obj = phy.16s, merge_by = "Reveg_age_and_depth", min_merge_no = 3 ,all_genera = genera) # #) #user system elapsed #2.25 0.74 895.89 stopCluster(cl) b_out__rel_abun_parallel_mt_bold <- b_out getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" saveRDS(b_out__rel_abun_parallel_mt_bold, file = "b_out__rel_abun_parallel_mt_bold.RDS") b_out <- readRDS(file = "b_out__rel_abun_parallel_mt_bold.RDS") ## prelim inspection of outputs B # 100 length(b_out) # 100 length(b_out[[1]]) # 401 length(b_out[[2]]) # 401 length(b_out[[3]]) # 401 length(b_out[[4]]) # 401 length(b_out[[5]]) # 401 length(b_out[[6]]) # 401 length(b_out[[7]]) # 401 length(b_out[[8]]) # 401 length(b_out[[9]]) # 401 length(b_out[[10]]) # 401 length(b_out[[100]]) # 401 names(b_out[[1]]) names(b_out[[1]]) == names(b_out[[2]]) # all true all( names(b_out[[1]]) == names(b_out[[3]]) ) # all true all( names(b_out[[1]]) == names(b_out[[4]]) ) # all true all( names(b_out[[1]]) == names(b_out[[5]]) ) # all true all( names(b_out[[1]]) == names(b_out[[6]]) ) # all true all( names(b_out[[1]]) == names(b_out[[7]]) ) # all true all( names(b_out[[1]]) == names(b_out[[8]]) ) # all true all( names(b_out[[1]]) == names(b_out[[9]]) ) # all true all( names(b_out[[1]]) == names(b_out[[10]]) ) # all true all( names(b_out[[1]]) == names(b_out[[100]]) ) # all true new_genera <- names(b_out[[1]]) # check outputs b_out[[1]][[ "g__Bacillus" ]][["df"]] b_out[[1]][[ "g__Bacillus" ]][["summary"]] b_out[[1]][[ "g__Bacillus" ]][["otus"]] b_out[[1]][[ "unclassified" ]][["df"]] # NULL b_out[[1]][[ "unclassified" ]][["summary"]] # NULL b_out[[1]][[ "g__Clostridium" ]][["df"]] b_out[[1]][[ "g__Clostridium" ]][["summary"]] ## extract all bootstrap summary data all_bs_summaries <- data.frame(r_0_10=NA,beta_0_10=NA,p_ordAOV_0_10=NA, r_20_30=NA,beta_20_30=NA,p_ordAOV_20_30=NA, B=NA,genus=NA) for (g in 1:length( new_genera )) { #g<-1 # get pseudo correlation and p-value (for ordinal AOV) temp.summary <- as.data.frame(matrix(ncol=8)) names(temp.summary) <- c("r_0_10","beta_0_10","p_ordAOV_0_10","r_20_30","beta_20_30","p_ordAOV_20_30","B","genus") for (j in 1:B) { if (!is.null(b_out[[j]][[ new_genera[g] ]][["summary"]])) { if (!is.na(b_out[[j]][[ new_genera[g] ]][["summary"]]$r_0_10)) { temp.summary <- rbind(temp.summary, b_out[[j]][[ new_genera[g] ]][["summary"]]) } } } temp.summary <- temp.summary[-1, ] # remove 1st row with NAs all_bs_summaries <- rbind(all_bs_summaries,temp.summary) print(paste0("completed genus: ",g)) } head(all_bs_summaries) all_bs_summaries <- all_bs_summaries[-1, ] # remove 1st row with NAs hist(all_bs_summaries$r_0_10) hist(all_bs_summaries$beta_0_10) hist(all_bs_summaries$p_ordAOV_0_10) hist(all_bs_summaries$r_20_30) hist(all_bs_summaries$beta_20_30) hist(all_bs_summaries$p_ordAOV_20_30) ## get summary of summary Cor. and p-values for 0-10 cm summary_relabun.16s <- data.frame(genus=new_genera, rel_abun_0_10_mean_perc=NA, rel_abun_0_10_min_perc=NA, rel_abun_0_10_max_perc=NA, cor_0_10_mean=NA, cor_0_10_95ci_lower=NA, cor_0_10_95ci_upper=NA, beta_0_10_mean=NA, beta_0_10_95ci_lower=NA, beta_0_10_95ci_upper=NA, p_ordAOV_0_10_mean=NA, p_ordAOV_0_10_95ci_lower=NA, p_ordAOV_0_10_95ci_upper=NA, missing_in_cleared=FALSE, missing_in_remnants=FALSE, perc_B_with_data=NA, B=NA ) for (g in 1:length( new_genera )) { # get pseudo correlation and p-value (for ordinal AOV) sel <- which(all_bs_summaries$genus == new_genera[g]) temp.summary <- all_bs_summaries[sel, ] summary_relabun.16s$cor_0_10_mean[g] <- mean(temp.summary$r_0_10, na.rm = TRUE) summary_relabun.16s$cor_0_10_95ci_lower[g] <- quantile(temp.summary$r_0_10, probs = 0.025, na.rm = TRUE) summary_relabun.16s$cor_0_10_95ci_upper[g] <- quantile(temp.summary$r_0_10, probs = 0.975, na.rm = TRUE) summary_relabun.16s$beta_0_10_mean[g] <- mean(temp.summary$beta_0_10, na.rm = TRUE) summary_relabun.16s$beta_0_10_95ci_lower[g] <- quantile(temp.summary$beta_0_10, probs = 0.025, na.rm = TRUE) summary_relabun.16s$beta_0_10_95ci_upper[g] <- quantile(temp.summary$beta_0_10, probs = 0.975, na.rm = TRUE) summary_relabun.16s$p_ordAOV_0_10_mean[g] <- mean(temp.summary$p_ordAOV_0_10, na.rm = TRUE) summary_relabun.16s$p_ordAOV_0_10_95ci_lower[g] <- quantile(temp.summary$p_ordAOV_0_10, probs = 0.025, na.rm = TRUE) summary_relabun.16s$p_ordAOV_0_10_95ci_upper[g] <- quantile(temp.summary$p_ordAOV_0_10, probs = 0.975, na.rm = TRUE) summary_relabun.16s$perc_B_with_data[g] <- 100*(length(temp.summary$B)/B) summary_relabun.16s$B[g] <- B # get mean, min, max relative abundance temp.df <- as.data.frame(matrix(ncol=8)) names(temp.df) <- c("sample","Reveg_age","depth","rel_abun","reveg_age_score","reveg_order","genus","calc_type") for (j in 1:B) { if (!is.null(b_out[[j]][[ new_genera[g] ]][["df"]])) { if (is.data.frame(b_out[[j]][[ new_genera[g] ]][["df"]])) { temp.df <- rbind(temp.df, b_out[[j]][[ new_genera[g] ]][["df"]]) } } } temp.df <- temp.df[-1, ] # remove 1st row with NAs # rel abundance as % summary_relabun.16s$rel_abun_0_10_mean_perc[g] <- 100*mean(temp.df$rel_abun, na.rm = TRUE) summary_relabun.16s$rel_abun_0_10_min_perc[g] <- 100*min(temp.df$rel_abun, na.rm = TRUE) summary_relabun.16s$rel_abun_0_10_max_perc[g] <- 100*max(temp.df$rel_abun, na.rm = TRUE) # test if missing (all ...$rel_abun = 0) from cleared, or missing from remnants - based on 95% threshold sel <- which(temp.df$sample == "Cleared (0-10 cm)") if (length(sel)>0) { #if ( all(temp.df$rel_abun[sel] ==0, na.rm = TRUE)) { summary_relabun.16s$missing_in_cleared[g] <- TRUE } if ( length(which(temp.df$rel_abun[sel] ==0))/length(na.omit(temp.df$rel_abun[sel])) >= 0.95 ) { summary_relabun.16s$missing_in_cleared[g] <- TRUE } } sel <- which(temp.df$sample %in% c("Remnant A (0-10 cm)","Remnant B (0-10 cm)","Remnant C (0-10 cm)")) if (length(sel)>0) { #if (all(temp.df$rel_abun[sel] ==0, na.rm = TRUE)) { summary_relabun.16s$missing_in_remnants[g] <- TRUE } if ( length(which(temp.df$rel_abun[sel] ==0))/length(na.omit(temp.df$rel_abun[sel])) >= 0.95 ) { summary_relabun.16s$missing_in_remnants[g] <- TRUE } } print(paste0("completed new_genera[",g,"]: ", new_genera[g])) } dim(summary_relabun.16s) # 401 17 temp2 <- summary_relabun.16s ## add OTU Ids dim(summary_relabun.16s) # 401 17 length(new_genera) # 401 names(summary_relabun.16s) # [1] "genus" "rel_abun_0_10_mean_perc" "rel_abun_0_10_min_perc" "rel_abun_0_10_max_perc" # [5] "cor_0_10_mean" "cor_0_10_95ci_lower" "cor_0_10_95ci_upper" "beta_0_10_mean" # [9] "beta_0_10_95ci_lower" "beta_0_10_95ci_upper" "p_ordAOV_0_10_mean" "p_ordAOV_0_10_95ci_lower" # [13] "p_ordAOV_0_10_95ci_upper" "missing_in_cleared" "missing_in_remnants" "perc_B_with_data" # [17] "B" temp2$otus <- NA for (g in 1:length( new_genera )) { #g<-10 temp_otu_list <- NA for (j in 1:B) { #j<-3 if (!is.null(b_out[[j]][[ new_genera[g] ]][["otus"]])) { if (is.character(b_out[[j]][[ new_genera[g] ]][["otus"]])) { temp_otu_list <- c(temp_otu_list, b_out[[j]][[ new_genera[g] ]][["otus"]]) } } } temp_otu_list <- temp_otu_list[-1] # remove first NA value temp_otu_list <- unique(temp_otu_list) temp_otu_list <- paste0(temp_otu_list,collapse=";") temp2$otus[g] <- temp_otu_list print(paste0("completed new_genera[",g,"]: ", new_genera[g])) } temp2$otus # keep copy temp.summary_relabun.16s <- summary_relabun.16s # update to version with OTUs summary_relabun.16s <- temp2 ## Now exclude bacteria with low presence (where present in <3 sample types will result in NA rows) ## and where data is not available for at least 30% of the bootstrap samples ok <- complete.cases(summary_relabun.16s) sel.rm <- which(ok==FALSE) summary_relabun.16s$genus[sel.rm] # [1] g__Acinetobacter g__Azospirillum # [3] g__Brownia g__Collimonas # [5] g__JG37-AG-70 g__Solitalea # [7] unclassified unclassified (order: Burkholderiales) # [9] unclassified (family: Rhodocyclaceae) unclassified (order: S1198) # [11] unclassified (order: Desulfuromonadales) unclassified (family: [Entotheonellaceae]) # [13] unclassified (class: PAUC37f) unclassified (class: ABY1) # [15] unclassified (class: Gitt-GS-136) unclassified (order: MVS-40) # [17] unclassified (order: [Saprospirales]) unclassified (family: Rikenellaceae) # [19] unclassified (class: OP11-4) summary_relabun.16s[sel.rm, ] summary_relabun.16s <- summary_relabun.16s[-sel.rm, ] sel.rm <- which(summary_relabun.16s$perc_B_with_data < 30) # 12 cases summary_relabun.16s$genus[sel.rm] # [1] g__Algoriphagus g__Aneurinibacillus # [3] g__Candidatus_Azobacteroides g__Nocardiopsis # [5] g__Rhodanobacter g__Sedimentibacter # [7] g__Tepidibacter unclassified (family: 0319-6A21) # [9] unclassified (phylum: TM7) unclassified (order: C114) # [11] unclassified (family: Thermoactinomycetaceae) unclassified (family: Paenibacillaceae) summary_relabun.16s <- summary_relabun.16s[-sel.rm, ] dim(summary_relabun.16s) # 370 18 ## Note g__[Clostridium] refer s__glycolicum # also see ref: # Yutin N, Galperin MY. A genomic update on clostridial phylogeny: # Gram-negative spore-formers and other misplaced clostridia. Environmental microbiology. # 2013;15(10):2631-2641. doi:10.1111/1462-2920.12173. # Also see: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5043541/ # Hence, some have proposed to rename C. difficile as Peptoclostridium difficile, and to transfer C. glycolicum and C. mayombei to this genus as well [1]. plot(x = summary_relabun.16s$cor_0_10_mean , y =summary_relabun.16s$p_ordAOV_0_10_mean ) plot(x = summary_relabun.16s$cor_0_10_mean , y =-log10(summary_relabun.16s$p_ordAOV_0_10_mean) ) # plot(x = summary_relabun.16s$beta_0_10_mean , y =-log10(summary_relabun.16s$p_ordAOV_0_10_mean) ) ## Beta values are poorly distibuted for interpretation purposes ## To display results comparing pseudo-correlation and ordinal AOV p-values ... # use false dicovery rate correction - Sequential Bonferroni (Holm 1979) # plot as -log10(p_value) # show means in black # show uncertainty in grey (from collation of bootstrap results) # set threshold for labelling, consider: upper quartile of -log10(p), upper quartile of abs(correlation) # labels: left-adjust if correlation < 0 (neg) # labels: right-adjust if correlation > 0 (pos) # labels: "Increasing", "Decreasing taxa names(summary_relabun.16s) # [1] "genus" "rel_abun_0_10_mean_perc" "rel_abun_0_10_min_perc" "rel_abun_0_10_max_perc" # [5] "cor_0_10_mean" "cor_0_10_95ci_lower" "cor_0_10_95ci_upper" "beta_0_10_mean" # [9] "beta_0_10_95ci_lower" "beta_0_10_95ci_upper" "p_ordAOV_0_10_mean" "p_ordAOV_0_10_95ci_lower" # [13] "p_ordAOV_0_10_95ci_upper" "missing_in_cleared" "missing_in_remnants" "perc_B_with_data" # [17] "B" "otus" # ## False Discovery Rate correction (Benjamini & Hochberg 1995) # http://www.statisticshowto.com/benjamini-hochberg-procedure/ # what is m? # after excluding taxa with missing data or # less than 30% representation from the bootstrap samples m <- dim(summary_relabun.16s)[1] # 370 alpha <- 0.05 p_values <- summary_relabun.16s[ order(summary_relabun.16s$p_ordAOV_0_10_mean,decreasing = FALSE) , "p_ordAOV_0_10_mean" ] plot(x=1:m, y=p_values, xlab="k", ylab="P(k)") # criteria values for Benjamini & Hochberg test test_values <- rep(NA, times=length(p_values)) for (i in 1:m) { test_values[i] <- (i/m)*alpha } test_values test_results <- rep(NA, times=length(p_values)) ## calculate test results for (i in 1:m) { if ( p_values[i] < test_values[i] ) { test_results[i] <- "yes" } } test_results # get index of largest ranked p-value with "yes" result (i.e. p-value is smaller than test criteria) idx <- rev(which(!is.na(test_results)))[1] summary_relabun.16s$sigBH <- NA if (length(1:idx) >0) { summary_relabun.16s[ order(summary_relabun.16s$p_ordAOV_0_10_mean,decreasing = FALSE)[1:idx] , "sigBH" ] <- "sig" } length(1:idx) # 29 plot(x=1:m, y=p_values, xlab="k (index of ranked P-values)", ylab="P-value(k)", xlim=c(0,100), ylim=c(0,0.05)) # title("(a) 16S", adj=0) abline(a=0, b=(alpha/m), col="red" ) text(x = 80, y = 0.007, labels = "slope = alpha/m", col = "red") points(x=c(1:m)[1:idx], y=p_values[1:idx], col="purple" ) text(x = 10, y = 0.008, adj=0.5, labels = "P(k) <\n(k/m)*alpha", col = "purple") dev.print(tiff, filename = paste0("finished-plots/","Benjamini-Hochberg-significant-p-values-16s-vFINAL.tiff") , width = 14, height = 14, units = "cm", res=600, compression = "lzw" ) # show -log10(p_value) summary_relabun.16s$minuslog10_p_ordAOV_0_10_mean <- -log10(summary_relabun.16s$p_ordAOV_0_10_mean) ## store summary results in new object summary_data <- summary_relabun.16s names(summary_data) # [1] "genus" "rel_abun_0_10_mean_perc" "rel_abun_0_10_min_perc" # [4] "rel_abun_0_10_max_perc" "cor_0_10_mean" "cor_0_10_95ci_lower" # [7] "cor_0_10_95ci_upper" "beta_0_10_mean" "beta_0_10_95ci_lower" # [10] "beta_0_10_95ci_upper" "p_ordAOV_0_10_mean" "p_ordAOV_0_10_95ci_lower" # [13] "p_ordAOV_0_10_95ci_upper" "missing_in_cleared" "missing_in_remnants" # [16] "perc_B_with_data" "B" "otus" # [19] "sigBH" "minuslog10_p_ordAOV_0_10_mean" summary_data$genus_label <- gsub(pattern = "g__", x = summary_data$genus, replacement = "") ## Add * to denote significance if satisfies Benjamini Hochberg test summary_data$genus_label_with_sig <- NA # join significance indicator to label for (i in 1:dim(summary_data)[1]) { if (!is.na(summary_data$sigBH[i])) { summary_data$genus_label_with_sig[i] <- paste0(summary_data$genus_label[i],"(*)") } else { summary_data$genus_label_with_sig[i] <- summary_data$genus_label[i] } } # test plot plot(x = summary_data$cor_0_10_mean , y =-log10(summary_data$p_ordAOV_0_10_mean) ) ## highlight top 10 correlated decreasing and increasing summary_data$top_10 <- NA ## Examine top 10 increasing + top 10 decreasing taxa top10_inc <- summary_data[ order(summary_data$cor_0_10_mean,decreasing = TRUE)[1:10], ] top10_dec <- summary_data[ order(summary_data$cor_0_10_mean,decreasing = FALSE)[1:10], ] sel <- which(summary_data$genus %in% top10_inc$genus) summary_data$genus[sel] # [1] g__Bradyrhizobium g__Candidatus_Koribacter # [3] g__Candidatus_Solibacter g__Candidatus_Xiphinematobacter # [5] g__DA101 g__Edaphobacter # [7] g__Rhodopila unclassified (order: Solibacterales) # [9] unclassified (family: Rhodospirillaceae) unclassified (family: [Leptospirillaceae]) summary_data$top_10[sel] <- "increasing" sel <- which(summary_data$genus %in% top10_dec$genus) summary_data$genus[sel] # [1] g__Ammoniphilus g__Bacillus # [3] g__Cytophagales g__Flavisolibacter # [5] g__Rummeliibacillus g__Sporosarcina # [7] unclassified (order: Ellin5290) unclassified (class: C0119) # [9] unclassified (family: Actinospicaceae) unclassified (family: Ellin5301) summary_data$top_10[sel] <- "decreasing" top10_inc_dec <- rbind(top10_inc, top10_dec[order(top10_dec$cor_0_10_mean, decreasing = TRUE) , ] ) top10_inc_dec[ ,c("genus_label_with_sig","rel_abun_0_10_mean_perc","cor_0_10_mean","p_ordAOV_0_10_mean")] # genus_label_with_sig rel_abun_0_10_mean_perc cor_0_10_mean p_ordAOV_0_10_mean # 54 DA101(*) 2.22657063 0.9481062 0.0016510 # 40 Candidatus_Xiphinematobacter 0.95947955 0.9396292 0.0050620 # 30 Bradyrhizobium 4.59756134 0.9158027 0.0221970 # 39 Candidatus_Solibacter 3.06456320 0.9021967 0.0229350 # 38 Candidatus_Koribacter 3.75802045 0.8820458 0.0089780 # 204 unclassified (family: Rhodospirillaceae) 2.49254275 0.8729929 0.0426730 # 143 Rhodopila 0.01451115 0.7912259 0.0770870 # 62 Edaphobacter 0.01843123 0.7718945 0.1168360 # 193 unclassified (order: Solibacterales) 0.74135688 0.7654445 0.0720650 # 290 unclassified (family: [Leptospirillaceae]) 0.01884944 0.7559469 0.1068296 # 240 unclassified (class: C0119) 0.32134758 -0.8280791 0.0048950 # 71 Flavisolibacter 0.11913569 -0.8456658 0.0381860 # 16 Ammoniphilus(*) 0.18729368 -0.8495190 0.0009950 # 301 unclassified (family: Ellin5301) 0.06712639 -0.8522358 0.0057340 # 53 Cytophagales 0.20004089 -0.8526565 0.0338420 # 166 Sporosarcina(*) 0.89015056 -0.8535728 0.0027120 # 196 unclassified (order: Ellin5290) 0.95400186 -0.8617709 0.0063880 # 267 unclassified (family: Actinospicaceae) 0.09128996 -0.8635059 0.0088320 # 150 Rummeliibacillus(*) 0.08866729 -0.8745504 0.0006750 # 24 Bacillus(*) 6.99326394 -0.9475292 0.0031680 summary_data$type <- "means" getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" write.csv(summary_data,file = "summary_rel_abun_16s_vFINAL.csv", quote = FALSE, row.names = FALSE) sel <- which(summary_data$minuslog10_p_ordAOV_0_10_mean == Inf) summary_data$genus_label_with_sig[sel] # [1] "Segetibacter(*)" "Terracoccus(*)" # [3] "Tissierella_Soehngenia(*)" "unclassified (class: Thermomicrobia)(*)" # [5] "unclassified (order: Clostridiales)(*)" "unclassified (phylum: MVP-21)(*)" summary_data$genus <- as.character(summary_data$genus) str(summary_data) # 'data.frame': 370 obs. of 24 variables: # $ genus : chr "g__[Clostridium]" "g__A17" "g__Achromobacter" "g__Actinoallomurus" ... # $ rel_abun_0_10_mean_perc : num 0.00439 0.3019 0.01084 0.24821 0.03671 ... # $ rel_abun_0_10_min_perc : num 0 0.0684 0 0.0684 0 ... # $ rel_abun_0_10_max_perc : num 0.0476 0.7257 0.1071 0.7792 0.1309 ... # $ cor_0_10_mean : num -0.74 0.588 -0.452 0.605 0.321 ... # $ cor_0_10_95ci_lower : num -0.8641 0.4391 -0.6064 0.5118 -0.0652 ... # $ cor_0_10_95ci_upper : num -0.476 0.71 -0.276 0.694 0.579 ... # $ beta_0_10_mean : num -5.06e-06 9.15e-05 -1.70e-05 1.40e-04 1.08e-05 ... # $ beta_0_10_95ci_lower : num -8.51e-06 6.48e-05 -2.49e-05 1.12e-04 -1.63e-06 ... # $ beta_0_10_95ci_upper : num -1.97e-06 1.18e-04 -1.00e-05 1.75e-04 1.99e-05 ... # $ p_ordAOV_0_10_mean : num 0.00387 0.18826 0.74731 0.24373 0.84674 ... # $ p_ordAOV_0_10_95ci_lower : num 0 0.0938 0.1151 0.1613 0.1907 ... # $ p_ordAOV_0_10_95ci_upper : num 0 0.304 1 0.327 1 ... # $ missing_in_cleared : logi FALSE FALSE FALSE FALSE FALSE FALSE ... # $ missing_in_remnants : logi TRUE FALSE FALSE FALSE FALSE FALSE ... # $ perc_B_with_data : num 73 100 100 100 100 100 100 100 100 100 ... # $ B : num 100 100 100 100 100 100 100 100 100 100 ... # $ otus : chr "AMD_16S_OTUa_27646" "AMD_16S_OTUa_3514;AMD_16S_OTUa_2388;AMD_16S_OTUa_7021;AMD_16S_OTUa_17054;AMD_16S_OTUa_102790;AMD_16S_OTUa_8918;"| __truncated__ "AMD_16S_OTUa_3233" "AMD_16S_OTUa_3492;AMD_16S_OTUa_14366;AMD_16S_OTUa_1533;AMD_16S_OTUa_6799;AMD_16S_OTUa_7142;AMD_16S_OTUb_10;AMD_"| __truncated__ ... # $ sigBH : chr "sig" NA NA NA ... # $ minuslog10_p_ordAOV_0_10_mean: num 2.4122 0.7252 0.1265 0.6131 0.0723 ... # $ genus_label : chr "[Clostridium]" "A17" "Achromobacter" "Actinoallomurus" ... # $ genus_label_with_sig : chr "[Clostridium](*)" "A17" "Achromobacter" "Actinoallomurus" ... # $ top_10 : chr NA NA NA NA ... # $ type : chr "means" "means" "means" "means" ... ## also highlight top 30 increasing + top 30 decreasing taxa top30_inc.mb <- summary_data[ order(summary_data$cor_0_10_mean,decreasing = TRUE)[1:30], ] top30_dec.mb <- summary_data[ order(summary_data$cor_0_10_mean,decreasing = FALSE)[1:30], ] temp <- rbind(top30_inc.mb, top30_dec.mb[order(top30_dec.mb$cor_0_10_mean, decreasing = TRUE) , ] ) temp$cor_0_10_mean # [1] 0.9481062 0.9396292 0.9158027 0.9021967 0.8820458 0.8729929 0.7912259 0.7718945 0.7654445 0.7559469 # [11] 0.7531342 0.7469560 0.7448172 0.7107212 0.7090636 0.6961291 0.6955107 0.6936320 0.6897496 0.6886832 # [21] 0.6875564 0.6770214 0.6710047 0.6670354 0.6658357 0.6616901 0.6604869 0.6597611 0.6492170 0.6437030 # [31] -0.7236866 -0.7395185 -0.7405571 -0.7416423 -0.7453118 -0.7453218 -0.7678235 -0.7723370 -0.7750369 -0.7757374 # [41] -0.7778713 -0.7807761 -0.7864219 -0.7907936 -0.7939386 -0.7991952 -0.8003639 -0.8056883 -0.8066860 -0.8216378 # [51] -0.8280791 -0.8456658 -0.8495190 -0.8522358 -0.8526565 -0.8535728 -0.8617709 -0.8635059 -0.8745504 -0.9475292 names(temp) # [1] "genus" "rel_abun_0_10_mean_perc" "rel_abun_0_10_min_perc" # [4] "rel_abun_0_10_max_perc" "cor_0_10_mean" "cor_0_10_95ci_lower" # [7] "cor_0_10_95ci_upper" "beta_0_10_mean" "beta_0_10_95ci_lower" # [10] "beta_0_10_95ci_upper" "p_ordAOV_0_10_mean" "p_ordAOV_0_10_95ci_lower" # [13] "p_ordAOV_0_10_95ci_upper" "missing_in_cleared" "missing_in_remnants" # [16] "perc_B_with_data" "B" "otus" # [19] "sigBH" "minuslog10_p_ordAOV_0_10_mean" "genus_label" # [22] "genus_label_with_sig" "top_10" "type" temp[ ,c("genus_label_with_sig", "rel_abun_0_10_mean_perc", "rel_abun_0_10_min_perc" , "rel_abun_0_10_max_perc", "cor_0_10_mean", "cor_0_10_95ci_lower", "cor_0_10_95ci_upper" , "p_ordAOV_0_10_mean" , "p_ordAOV_0_10_95ci_lower" , "p_ordAOV_0_10_95ci_upper" , "missing_in_cleared" , "missing_in_remnants", "perc_B_with_data" )] # genus_label_with_sig rel_abun_0_10_mean_perc rel_abun_0_10_min_perc rel_abun_0_10_max_perc cor_0_10_mean cor_0_10_95ci_lower cor_0_10_95ci_upper p_ordAOV_0_10_mean # 54 DA101(*) 2.226570632 0.220074349 6.06394052 0.9481062 0.9365952 0.9581656 0.001651000 # 40 Candidatus_Xiphinematobacter 0.959479554 0.000000000 4.14275093 0.9396292 0.9149739 0.9549467 0.005062000 # 30 Bradyrhizobium 4.597561338 2.408921933 6.54275093 0.9158027 0.8831637 0.9477957 0.022197000 # 39 Candidatus_Solibacter 3.064563197 1.635687732 5.11821561 0.9021967 0.8743656 0.9382634 0.022935000 # 38 Candidatus_Koribacter 3.758020446 1.870631970 5.36505576 0.8820458 0.8410081 0.9207924 0.008978000 # 204 unclassified (family: Rhodospirillaceae) 2.492542751 0.657249071 6.11747212 0.8729929 0.8496185 0.8951025 0.042673000 # 143 Rhodopila 0.014511152 0.000000000 0.08624535 0.7912259 0.4930783 0.9357029 0.077087000 # 62 Edaphobacter 0.018431227 0.000000000 0.09814126 0.7718945 0.5804471 0.9128148 0.116836000 # 193 unclassified (order: Solibacterales) 0.741356877 0.169516729 2.28698885 0.7654445 0.7175340 0.8169502 0.072065000 # 290 unclassified (family: [Leptospirillaceae]) 0.018849442 0.000000000 0.10408922 0.7559469 0.4149157 0.9129101 0.106829592 # 189 unclassified (order: Ellin6513) 1.117801115 0.035687732 4.96654275 0.7531342 0.7143745 0.8025518 0.108745000 # 232 unclassified (family: Gemmataceae) 1.676081784 0.797026022 2.86096654 0.7469560 0.6605880 0.8152498 0.018713000 # 246 unclassified (order: Acidobacteriales) 0.071302974 0.000000000 0.39851301 0.7448172 0.6492487 0.8484135 0.127990000 # 142 Rhodomicrobium 0.142719331 0.000000000 0.67211896 0.7107212 0.6646081 0.7560028 0.123866000 # 155 Singulisphaera 0.016120818 0.000000000 0.10111524 0.7090636 0.4507045 0.9035795 0.101376000 # 202 unclassified (order: Acidimicrobiales) 0.397037175 0.118959108 0.76728625 0.6961291 0.4834035 0.8453244 0.093900000 # 194 unclassified (family: Acetobacteraceae) 1.351650558 0.517472119 2.36431227 0.6955107 0.5767568 0.8004094 0.147551000 # 363 unclassified (family: Pseudonocardiaceae) 0.006137546 0.000000000 0.10706320 0.6936320 0.5058636 0.9193353 0.156731034 # 211 unclassified (family: Isosphaeraceae) 0.668414498 0.270631970 1.47211896 0.6897496 0.5929725 0.7747526 0.090992000 # 203 unclassified (order: WD2101) 2.667721190 1.510780669 4.16951673 0.6886832 0.5814194 0.7816023 0.051506000 # 184 unclassified (family: Koribacteraceae) 8.050388476 2.724163569 19.57769517 0.6875564 0.6537038 0.7246696 0.110539000 # 272 unclassified (order: Phycisphaerales) 0.093081784 0.014869888 0.22899628 0.6770214 0.4149141 0.8528925 0.232495000 # 188 unclassified (family: Methylocystaceae) 0.734903346 0.083271375 2.11449814 0.6710047 0.5972248 0.7257107 0.078692000 # 273 unclassified (class: EC1113) 0.087252788 0.014869888 0.23494424 0.6670354 0.2787983 0.9197270 0.162109000 # 144 Rhodoplanes 3.351778810 0.913011152 5.69219331 0.6658357 0.6137173 0.7091670 0.085469000 # 88 Kibdelosporangium 0.049485130 0.000000000 0.65724907 0.6616901 0.4748183 0.8149326 0.153998000 # 374 unclassified (order: CV90) 0.009773234 0.000000000 0.05947955 0.6604869 0.3442408 0.8535122 0.260247000 # 277 unclassified (class: P2-11E) 0.030349442 0.000000000 0.25278810 0.6597611 0.2987594 0.8842194 0.308982857 # 317 unclassified (family: Nitrosomonadaceae) 0.011031599 0.000000000 0.11003717 0.6492170 0.5458511 0.7562727 0.200271000 # 294 unclassified (family: Myxococcaceae) 0.018271375 0.000000000 0.11598513 0.6437030 0.4798666 0.7674777 0.201634000 # 113 Nocardioides 0.249676580 0.020817844 0.70483271 -0.7236866 -0.8367514 -0.6270717 0.026314000 # 1 [Clostridium](*) 0.004390335 0.000000000 0.04758364 -0.7395185 -0.8641371 -0.4763210 0.003871233 # 177 Turicibacter 0.075823420 0.000000000 0.30929368 -0.7405571 -0.8766080 -0.5590324 0.029428000 # 266 unclassified (family: Dolo_23) 0.081221190 0.000000000 0.29442379 -0.7416423 -0.8593911 -0.6082742 0.010619000 # 128 Pimelobacter 0.015055762 0.000000000 0.10706320 -0.7453118 -0.8519020 -0.6329192 0.045737000 # 355 unclassified (family: Peptostreptococcaceae)(*) 0.010139405 0.000000000 0.08624535 -0.7453218 -0.8555024 -0.6569599 0.001309091 # 227 unclassified (order: Sphaerobacterales) 0.107825279 0.000000000 0.41040892 -0.7678235 -0.8485071 -0.6632361 0.028674000 # 288 unclassified (order: JG30-KF-CM45) 0.048386617 0.000000000 0.21710037 -0.7723370 -0.8805079 -0.6117995 0.018015000 # 77 Geobacter 0.120806691 0.000000000 0.49070632 -0.7750369 -0.8464994 -0.6931913 0.007191000 # 134 Pseudonocardia(*) 0.078284387 0.000000000 0.33605948 -0.7757374 -0.8832496 -0.6570166 0.001533000 # 159 Solibacillus(*) 0.310998141 0.000000000 1.53457249 -0.7778713 -0.8025899 -0.7541720 0.000502000 # 303 unclassified (order: Bacillales) 0.017650558 0.000000000 0.11598513 -0.7807761 -0.8959266 -0.6601069 0.015574000 # 125 Pelosinus 0.007661710 0.000000000 0.05650558 -0.7864219 -0.9209193 -0.6061397 0.026276000 # 20 Arthrobacter(*) 0.061237918 0.000000000 0.32118959 -0.7907936 -0.8502761 -0.7122096 0.000727000 # 36 Caloramator 0.033382900 0.000000000 0.13680297 -0.7939386 -0.9271788 -0.6438571 0.051218000 # 230 unclassified (family: Nitrospiraceae)(*) 0.079325279 0.000000000 0.35687732 -0.7991952 -0.8618027 -0.7379926 0.003091000 # 258 unclassified (family: Nocardioidaceae) 0.039382900 0.000000000 0.16654275 -0.8003639 -0.9248644 -0.6445480 0.013344000 # 158 SMB53(*) 0.066302974 0.000000000 0.37472119 -0.8056883 -0.8528527 -0.7615005 0.000081000 # 46 Clostridium 0.268163569 0.005947955 0.79405204 -0.8066860 -0.8696682 -0.7239457 0.012783000 # 50 Coprococcus(*) 0.023702602 0.000000000 0.14869888 -0.8216378 -0.9138263 -0.7397086 0.001122000 # 240 unclassified (class: C0119) 0.321347584 0.017843866 1.36802974 -0.8280791 -0.8568805 -0.7946317 0.004895000 # 71 Flavisolibacter 0.119135688 0.000000000 0.41338290 -0.8456658 -0.9230224 -0.7468617 0.038186000 # 16 Ammoniphilus(*) 0.187293680 0.000000000 0.71375465 -0.8495190 -0.8759602 -0.8263384 0.000995000 # 301 unclassified (family: Ellin5301) 0.067126394 0.000000000 0.29442379 -0.8522358 -0.9398327 -0.7179867 0.005734000 # 53 Cytophagales 0.200040892 0.000000000 0.95762082 -0.8526565 -0.8965299 -0.8057259 0.033842000 # 166 Sporosarcina(*) 0.890150558 0.000000000 3.23568773 -0.8535728 -0.8702681 -0.8385979 0.002712000 # 196 unclassified (order: Ellin5290) 0.954001859 0.205204461 2.33754647 -0.8617709 -0.8944359 -0.8211393 0.006388000 # 267 unclassified (family: Actinospicaceae) 0.091289963 0.000000000 0.29144981 -0.8635059 -0.9151828 -0.7977863 0.008832000 # 150 Rummeliibacillus(*) 0.088667286 0.000000000 0.41933086 -0.8745504 -0.9171821 -0.8242394 0.000675000 # 24 Bacillus(*) 6.993263941 1.073605948 15.42304833 -0.9475292 -0.9546374 -0.9389884 0.003168000 # p_ordAOV_0_10_95ci_lower p_ordAOV_0_10_95ci_upper missing_in_cleared missing_in_remnants perc_B_with_data # 54 0.0004000 0.0038625 FALSE FALSE 100 # 40 0.0006375 0.0123350 FALSE FALSE 100 # 30 0.0101000 0.0362775 FALSE FALSE 100 # 39 0.0104425 0.0372275 FALSE FALSE 100 # 38 0.0022950 0.0203250 FALSE FALSE 100 # 204 0.0290650 0.0578325 FALSE FALSE 100 # 143 0.0012325 0.2473325 FALSE FALSE 100 # 62 0.0042500 0.2997375 FALSE FALSE 100 # 193 0.0392950 0.1047100 FALSE FALSE 100 # 290 0.0086100 0.2601225 TRUE FALSE 98 # 189 0.0735075 0.1342150 FALSE FALSE 100 # 232 0.0041475 0.0452725 FALSE FALSE 100 # 246 0.0484375 0.2114725 TRUE FALSE 100 # 142 0.0902025 0.1564775 FALSE FALSE 100 # 155 0.0052175 0.2413750 FALSE FALSE 100 # 202 0.0203700 0.1890200 FALSE FALSE 100 # 194 0.0650375 0.2544525 FALSE FALSE 100 # 363 0.0051425 0.3149975 TRUE FALSE 58 # 211 0.0401600 0.1575850 FALSE FALSE 100 # 203 0.0018800 0.1536450 FALSE FALSE 100 # 184 0.0835975 0.1371400 FALSE FALSE 100 # 272 0.0386825 1.0000000 FALSE FALSE 100 # 188 0.0513225 0.1164450 FALSE FALSE 100 # 273 0.0071975 1.0000000 FALSE FALSE 100 # 144 0.0567925 0.1109050 FALSE FALSE 100 # 88 0.0596475 0.2847275 FALSE FALSE 100 # 374 0.0195075 1.0000000 FALSE FALSE 100 # 277 0.0263525 1.0000000 FALSE FALSE 70 # 317 0.1107550 0.2808675 FALSE FALSE 100 # 294 0.1004475 0.3346675 FALSE FALSE 100 # 113 0.0059275 0.0669775 FALSE FALSE 100 # 1 0.0000000 0.0000000 FALSE TRUE 73 # 177 0.0030325 0.1184350 FALSE FALSE 100 # 266 0.0006475 0.0328750 FALSE FALSE 100 # 128 0.0000000 0.3521775 FALSE TRUE 100 # 355 0.0000000 0.0118600 FALSE FALSE 99 # 227 0.0064650 0.0747050 FALSE FALSE 100 # 288 0.0023850 0.0645200 FALSE FALSE 100 # 77 0.0000475 0.0280975 FALSE FALSE 100 # 134 0.0000000 0.0080675 FALSE FALSE 100 # 159 0.0000000 0.0016000 FALSE FALSE 100 # 303 0.0000000 0.1088275 FALSE FALSE 100 # 125 0.0000000 0.2129625 FALSE FALSE 100 # 20 0.0000000 0.0037000 FALSE FALSE 100 # 36 0.0030700 0.2161750 FALSE FALSE 100 # 230 0.0002000 0.0103525 FALSE FALSE 100 # 258 0.0002000 0.0631775 FALSE FALSE 100 # 158 0.0000000 0.0007525 FALSE FALSE 100 # 46 0.0009950 0.0345525 FALSE FALSE 100 # 50 0.0000000 0.0094100 FALSE FALSE 100 # 240 0.0014000 0.0105000 FALSE FALSE 100 # 71 0.0090900 0.0806775 FALSE FALSE 100 # 16 0.0000000 0.0030525 FALSE FALSE 100 # 301 0.0000000 0.0213725 FALSE FALSE 100 # 53 0.0168175 0.0575725 FALSE FALSE 100 # 166 0.0010425 0.0047525 FALSE FALSE 100 # 196 0.0014475 0.0152025 FALSE FALSE 100 # 267 0.0002475 0.0267250 FALSE FALSE 100 # 150 0.0000000 0.0033050 FALSE FALSE 100 # 24 0.0018475 0.0048000 FALSE FALSE 100 write.csv(x = temp[ ,c("genus_label_with_sig", "rel_abun_0_10_mean_perc", "rel_abun_0_10_min_perc" , "rel_abun_0_10_max_perc", "cor_0_10_mean", "cor_0_10_95ci_lower", "cor_0_10_95ci_upper" , "p_ordAOV_0_10_mean" , "p_ordAOV_0_10_95ci_lower" , "p_ordAOV_0_10_95ci_upper" , "missing_in_cleared" , "missing_in_remnants", "perc_B_with_data" )], file = "top30-trending-MtBold-table-output-vFINAL.csv" ) ## record this for plotting later summary_data$top_30 <- NA sel <- which(summary_data$genus %in% top30_inc.mb$genus) summary_data$genus[sel] # [1] "g__Bradyrhizobium" "g__Candidatus_Koribacter" "g__Candidatus_Solibacter" # [4] "g__Candidatus_Xiphinematobacter" "g__DA101" "g__Edaphobacter" # [7] "g__Kibdelosporangium" "g__Rhodomicrobium" "g__Rhodopila" # [10] "g__Rhodoplanes" "g__Singulisphaera" "unclassified (family: Koribacteraceae)" # [13] "unclassified (family: Methylocystaceae)" "unclassified (order: Ellin6513)" "unclassified (order: Solibacterales)" # [16] "unclassified (family: Acetobacteraceae)" "unclassified (order: Acidimicrobiales)" "unclassified (order: WD2101)" # [19] "unclassified (family: Rhodospirillaceae)" "unclassified (family: Isosphaeraceae)" "unclassified (family: Gemmataceae)" # [22] "unclassified (order: Acidobacteriales)" "unclassified (order: Phycisphaerales)" "unclassified (class: EC1113)" # [25] "unclassified (class: P2-11E)" "unclassified (family: [Leptospirillaceae])" "unclassified (family: Myxococcaceae)" # [28] "unclassified (family: Nitrosomonadaceae)" "unclassified (family: Pseudonocardiaceae)" "unclassified (order: CV90) summary_data$top_30[sel] <- "increasing" sel <- which(summary_data$genus %in% top30_dec.mb$genus) summary_data$genus[sel] # [1] "g__[Clostridium]" "g__Ammoniphilus" "g__Arthrobacter" # [4] "g__Bacillus" "g__Caloramator" "g__Clostridium" # [7] "g__Coprococcus" "g__Cytophagales" "g__Flavisolibacter" # [10] "g__Geobacter" "g__Nocardioides" "g__Pelosinus" # [13] "g__Pimelobacter" "g__Pseudonocardia" "g__Rummeliibacillus" # [16] "g__SMB53" "g__Solibacillus" "g__Sporosarcina" # [19] "g__Turicibacter" "unclassified (order: Ellin5290)" "unclassified (order: Sphaerobacterales)" # [22] "unclassified (family: Nitrospiraceae)" "unclassified (class: C0119)" "unclassified (family: Nocardioidaceae)" # [25] "unclassified (family: Dolo_23)" "unclassified (family: Actinospicaceae)" "unclassified (order: JG30-KF-CM45)" # [28] "unclassified (family: Ellin5301)" "unclassified (order: Bacillales)" "unclassified (family: Peptostreptococcaceae)" summary_data$top_30[sel] <- "decreasing" ## which are missing in cleared, increasing with reveg_age, and found in remnants? sel <- which(summary_data$missing_in_cleared == TRUE & summary_data$cor_0_10_mean >0 & summary_data$missing_in_remnants == FALSE) # qty 8 summary_data[sel[order(summary_data$cor_0_10_mean[sel],decreasing=TRUE)], c("genus_label_with_sig","rel_abun_0_10_mean_perc", "rel_abun_0_10_min_perc" , "rel_abun_0_10_max_perc", "cor_0_10_mean", "cor_0_10_95ci_lower", "cor_0_10_95ci_upper" , "p_ordAOV_0_10_mean" , "p_ordAOV_0_10_95ci_lower" , "p_ordAOV_0_10_95ci_upper" , "missing_in_cleared" , "missing_in_remnants", "perc_B_with_data" )] # genus_label_with_sig rel_abun_0_10_mean_perc rel_abun_0_10_min_perc rel_abun_0_10_max_perc cor_0_10_mean cor_0_10_95ci_lower # 290 unclassified (family: [Leptospirillaceae]) 0.018849442 0 0.10408922 0.7559469 0.41491570 # 246 unclassified (order: Acidobacteriales) 0.071302974 0 0.39851301 0.7448172 0.64924870 # 363 unclassified (family: Pseudonocardiaceae) 0.006137546 0 0.10706320 0.6936320 0.50586359 # 351 unclassified (family: Hyphomicrobiaceae) 0.007505576 0 0.08624535 0.5930759 0.50179097 # 69 FFCH10602 0.061914498 0 0.49665428 0.4746768 0.42881845 # 28 Beijerinckia 0.004687732 0 0.10111524 0.4657753 0.41261588 # 357 unclassified (phylum: Firmicutes) 0.009762082 0 0.06542751 0.2120771 -0.08305872 # 360 unclassified (family: [Chthoniobacteraceae]) 0.012776952 0 0.13680297 0.1313504 -0.01633558 # cor_0_10_95ci_upper p_ordAOV_0_10_mean p_ordAOV_0_10_95ci_lower p_ordAOV_0_10_95ci_upper missing_in_cleared missing_in_remnants perc_B_with_data # 290 0.9129101 0.1068296 0.0086100 0.2601225 TRUE FALSE 98 # 246 0.8484135 0.1279900 0.0484375 0.2114725 TRUE FALSE 100 # 363 0.9193353 0.1567310 0.0051425 0.3149975 TRUE FALSE 58 # 351 0.7069897 0.2647250 0.1573750 0.3564075 TRUE FALSE 32 # 69 0.5369696 0.8433850 0.3203625 1.0000000 TRUE FALSE 100 # 28 0.5588157 0.7238500 0.2628550 1.0000000 TRUE FALSE 70 # 357 0.5146706 0.4643650 0.0325325 1.0000000 TRUE FALSE 100 # 360 0.3413023 0.7253266 0.2199150 1.0000000 TRUE FALSE 64 write.csv(x = summary_data[sel[order(summary_data$cor_0_10_mean[sel],decreasing=TRUE)], c("genus_label_with_sig","rel_abun_0_10_mean_perc", "rel_abun_0_10_min_perc" , "rel_abun_0_10_max_perc", "cor_0_10_mean", "cor_0_10_95ci_lower", "cor_0_10_95ci_upper" , "p_ordAOV_0_10_mean" , "p_ordAOV_0_10_95ci_lower" , "p_ordAOV_0_10_95ci_upper" , "missing_in_cleared" , "missing_in_remnants", "perc_B_with_data" )], file = "missing-in-cleared-increase-with-reveg_age-found-in-remnants-MtBold-table-output-vFINAL.csv" ) ## which are found in cleared, decreasing with reveg_age, and missing in remnants? sel <- which(summary_data$missing_in_cleared == FALSE & summary_data$cor_0_10_mean <0 & summary_data$missing_in_remnants == TRUE) # qty 14 summary_data[sel[order(summary_data$cor_0_10_mean[sel],decreasing=FALSE)], c("genus_label_with_sig","rel_abun_0_10_mean_perc", "rel_abun_0_10_min_perc" , "rel_abun_0_10_max_perc", "cor_0_10_mean", "cor_0_10_95ci_lower", "cor_0_10_95ci_upper" , "p_ordAOV_0_10_mean" , "p_ordAOV_0_10_95ci_lower" , "p_ordAOV_0_10_95ci_upper" , "missing_in_cleared" , "missing_in_remnants", "perc_B_with_data" )] # genus_label_with_sig rel_abun_0_10_mean_perc rel_abun_0_10_min_perc rel_abun_0_10_max_perc cor_0_10_mean cor_0_10_95ci_lower # 128 Pimelobacter 0.015055762 0 0.10706320 -0.7453118 -0.8519020 # 1 [Clostridium](*) 0.004390335 0 0.04758364 -0.7395185 -0.8641371 # 15 Alkaliphilus(*) 0.017553903 0 0.13382900 -0.7235315 -0.8148180 # 176 Tissierella_Soehngenia(*) 0.011682156 0 0.10111524 -0.7197876 -0.7639383 # 320 unclassified (class: Thermomicrobia)(*) 0.020046468 0 0.18141264 -0.7136777 -0.7940928 # 395 unclassified (family: Gracilibacteraceae)(*) 0.003308550 0 0.05353160 -0.7073713 -0.8301308 # 153 Segetibacter(*) 0.004654275 0 0.05650558 -0.6992809 -0.7986380 # 377 unclassified (order: Clostridiales)(*) 0.004182156 0 0.05055762 -0.6911464 -0.7438330 # 174 Terracoccus(*) 0.004037175 0 0.04758364 -0.6844459 -0.8229151 # 383 unclassified (class: Clostridia) 0.003719331 0 0.04758364 -0.6794644 -0.7741872 # 104 Methylotenera(*) 0.009122677 0 0.10408922 -0.6064238 -0.7890030 # 378 unclassified (family: Ruminococcaceae) 0.003550186 0 0.04460967 -0.5565981 -0.8002607 # 397 unclassified (phylum: MVP-21)(*) 0.003265799 0 0.03866171 -0.5406571 -0.7662520 # 334 unclassified (family: Flavobacteriaceae) 0.009585502 0 0.13977695 -0.5265187 -0.6668697 # cor_0_10_95ci_upper p_ordAOV_0_10_mean p_ordAOV_0_10_95ci_lower p_ordAOV_0_10_95ci_upper missing_in_cleared missing_in_remnants perc_B_with_data # 128 -0.6329192 0.045737000 0 0.3521775 FALSE TRUE 100 # 1 -0.4763210 0.003871233 0 0.0000000 FALSE TRUE 73 # 15 -0.5925817 0.003624000 0 0.0000000 FALSE TRUE 100 # 176 -0.6801475 0.000000000 0 0.0000000 FALSE TRUE 31 # 320 -0.6071268 0.000000000 0 0.0000000 FALSE TRUE 100 # 395 -0.5063324 0.002585366 0 0.0000000 FALSE TRUE 41 # 153 -0.4430527 0.000000000 0 0.0000000 FALSE TRUE 83 # 377 -0.6397787 0.000000000 0 0.0000000 FALSE TRUE 53 # 174 -0.5085239 0.000000000 0 0.0000000 FALSE TRUE 100 # 383 -0.4801026 0.028857500 0 0.1340050 FALSE TRUE 40 # 104 -0.4012331 0.002124742 0 0.0000000 FALSE TRUE 97 # 378 -0.2522756 0.253389333 0 1.0000000 FALSE TRUE 75 # 397 -0.3838143 0.000000000 0 0.0000000 FALSE TRUE 53 # 334 -0.4153547 0.004588136 0 0.0322900 FALSE TRUE 59 write.csv(x = summary_data[sel[order(summary_data$cor_0_10_mean[sel],decreasing=FALSE)], c("genus_label_with_sig","rel_abun_0_10_mean_perc", "rel_abun_0_10_min_perc" , "rel_abun_0_10_max_perc", "cor_0_10_mean", "cor_0_10_95ci_lower", "cor_0_10_95ci_upper" , "p_ordAOV_0_10_mean" , "p_ordAOV_0_10_95ci_lower" , "p_ordAOV_0_10_95ci_upper" , "missing_in_cleared" , "missing_in_remnants", "perc_B_with_data" )], file = "found-in-cleared-decrease-with-reveg_age-missing-in-remnants-MtBold-table-output-vFINAL.csv" ) # https://en.wikipedia.org/wiki/Peptostreptococcaceae # The Peptostreptococcaceae are a family of Gram-positive bacteria in the class Clostridia. # It appears to be over-represented in the guts of colorectal cancer patients.[1] # Ahn et al 2013. "Human Gut Microbiome and Risk for Colorectal Cancer". J Natl Cancer Inst. 105 (24): 1907-1911. doi:10.1093/jnci/djt300 #### Store top X trending bacteria - to later test correlation with log fold change # across natural and human-altered sites across southern Australia # store: "genus" - can look up summary_data$cor_0_10_mean[] later length(summary_data$genus) # 370 dim(summary_data) # 370 25 ## Consider combinations of top increasing and top decreasing taxa inc <- 1:150 inc # 1 ... 150 top <- list() for (i in inc) { #i <-1 top_inc <- summary_data[ order(summary_data$cor_0_10_mean,decreasing = TRUE)[1:i], ] top_dec <- summary_data[ order(summary_data$cor_0_10_mean,decreasing = FALSE)[1:i], ] top_inc_dec <- rbind(top_inc, top_dec[order(top_dec$cor_0_10_mean, decreasing = TRUE) , ] ) top[[i]] <- top_inc_dec print(paste0("completed ",i)) } ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ## use the list top[[]] later to test associations with 'natural' and 'human-altered' across Australia ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! p <- ggplot(summary_data, aes(x=cor_0_10_mean, y=minuslog10_p_ordAOV_0_10_mean)) + geom_point() + theme_bw() p y_axis_title <- expression(paste("-log10(",italic("P"),"-value)")) p <- ggplot(summary_data, aes(x=cor_0_10_mean, y=minuslog10_p_ordAOV_0_10_mean)) + #geom_point() + theme_bw() + xlab("Correlation" ) + ylab(y_axis_title) + #ylab("-log10(P-value)") + geom_vline(xintercept = 0, col="dark grey", linetype="dashed") + annotate(geom="text", x=1 , y= -0.05, label = "Increasing with reveg age", colour="#1a9850", hjust = 1, vjust=1, size=3) + # green #https://stackoverflow.com/questions/26684023/how-to-left-align-text-in-annotate-from-ggplot2 annotate(geom="text", x= -1, y= -0.05, label = "Decreasing with reveg age",colour="#d73027", hjust = 0, vjust=1, size=3) + # red # https://www.r-bloggers.com/repel-overlapping-text-labels-in-ggplot2/ # http://colorbrewer2.org/#type=diverging&scheme=RdYlGn&n=7 geom_point() + geom_point(data=filter(summary_data[which(summary_data$cor_0_10_mean<0), ], top_30 == "decreasing" ),colour="#fc8d59") + # orange geom_point(data=filter(summary_data[which(summary_data$cor_0_10_mean>=0), ], top_30 == "increasing" ), colour = "#91cf60" ) + # pale green geom_point(data=filter(summary_data[which(summary_data$cor_0_10_mean<0), ], top_10 == "decreasing" ),colour="#d73027") + # red geom_point(data=filter(summary_data[which(summary_data$cor_0_10_mean>=0), ], top_10 == "increasing" ), colour = "#1a9850" ) + # green geom_text_repel(data=filter(summary_data[which(summary_data$cor_0_10_mean<0), ], top_10 == "decreasing"), aes(label=genus_label_with_sig), size=2, colour="#d73027", segment.colour="grey") + # red geom_text_repel(data=filter(summary_data[which(summary_data$cor_0_10_mean>=0), ], top_10 == "increasing"), aes(label=genus_label_with_sig), size=2, colour="#1a9850", segment.colour="grey") # green p ggsave(plot = p, filename = paste0("finished-plots/","Key-trending-taxa-with-reveg-age-16S-Cor-and-p-values-vFINAL.tiff"), width = 11, height = 11, units = "cm", dpi = 600, compression = "lzw" ) ### for genera with high correlation coefficient (top 10 inc/dec) - collate data for plotting genera.highcor <- as.character(top10_inc_dec$genus) length(genera.highcor) # 20 genera.highcor # [1] "g__DA101" "g__Candidatus_Xiphinematobacter" # [3] "g__Bradyrhizobium" "g__Candidatus_Solibacter" # [5] "g__Candidatus_Koribacter" "unclassified (family: Rhodospirillaceae)" # [7] "g__Rhodopila" "g__Edaphobacter" # [9] "unclassified (order: Solibacterales)" "unclassified (family: [Leptospirillaceae])" # [11] "unclassified (class: C0119)" "g__Flavisolibacter" # [13] "g__Ammoniphilus" "unclassified (family: Ellin5301)" # [15] "g__Cytophagales" "g__Sporosarcina" # [17] "unclassified (order: Ellin5290)" "unclassified (family: Actinospicaceae)" # [19] "g__Rummeliibacillus" "g__Bacillus" str(top10_inc_dec) # 'data.frame': 20 obs. of 23 variables: # $ genus : Factor w/ 401 levels "g__[Clostridium]",..: 54 40 30 39 38 300 143 62 383 227 ... # $ rel_abun_0_10_mean_perc : num 2.227 0.959 4.598 3.065 3.758 ... # $ rel_abun_0_10_min_perc : num 0.22 0 2.41 1.64 1.87 ... # $ rel_abun_0_10_max_perc : num 6.06 4.14 6.54 5.12 5.37 ... # $ cor_0_10_mean : num 0.948 0.94 0.916 0.902 0.882 ... # $ cor_0_10_95ci_lower : num 0.937 0.915 0.883 0.874 0.841 ... # $ cor_0_10_95ci_upper : num 0.958 0.955 0.948 0.938 0.921 ... # $ beta_0_10_mean : num 0.001511 0.000451 0.001491 0.001154 0.000963 ... # $ beta_0_10_95ci_lower : num 0.001429 0.000411 0.001371 0.001034 0.000868 ... # $ beta_0_10_95ci_upper : num 0.001575 0.000486 0.001595 0.001247 0.001047 ... # $ p_ordAOV_0_10_mean : num 0.00165 0.00506 0.0222 0.02294 0.00898 ... # $ p_ordAOV_0_10_95ci_lower : num 0.0004 0.000638 0.0101 0.010443 0.002295 ... # $ p_ordAOV_0_10_95ci_upper : num 0.00386 0.01233 0.03628 0.03723 0.02032 ... # $ missing_in_cleared : logi FALSE FALSE FALSE FALSE FALSE FALSE ... # $ missing_in_remnants : logi FALSE FALSE FALSE FALSE FALSE FALSE ... # $ perc_B_with_data : num 100 100 100 100 100 100 100 100 100 98 ... # $ B : num 100 100 100 100 100 100 100 100 100 100 ... # $ otus : chr "AMD_16S_OTUa_976;AMD_16S_OTUa_3058;AMD_16S_OTUa_1027;AMD_16S_OTUa_1414;AMD_16S_OTUa_2420;AMD_16S_OTUa_3004;AMD_"| __truncated__ "AMD_16S_OTUa_483;AMD_16S_OTUa_765;AMD_16S_OTUa_1135;AMD_16S_OTUa_1088;AMD_16S_OTUa_3681;AMD_16S_OTUa_34952;AMD_"| __truncated__ "AMD_16S_OTUa_14" "AMD_16S_OTUa_662;AMD_16S_OTUa_252;AMD_16S_OTUa_1265;AMD_16S_OTUa_754;AMD_16S_OTUa_810;AMD_16S_OTUa_1132;AMD_16S"| __truncated__ ... # $ sigBH : chr "sig" NA NA NA ... # $ minuslog10_p_ordAOV_0_10_mean: num 2.78 2.3 1.65 1.64 2.05 ... # $ genus_label : chr "DA101" "Candidatus_Xiphinematobacter" "Bradyrhizobium" "Candidatus_Solibacter" ... # $ genus_label_with_sig : chr "DA101(*)" "Candidatus_Xiphinematobacter" "Bradyrhizobium" "Candidatus_Solibacter" ... # $ top_10 : logi NA NA NA NA NA NA ... # use B=1 and g=1 as starting template names(b_out[[1]][[ genera.highcor[1] ]][["summary"]]) # [1] "r_0_10" "beta_0_10" "p_ordAOV_0_10" "r_20_30" "beta_20_30" "p_ordAOV_20_30" "B" # [8] "genus" ### Now via routine - Plot top 10 key trending taxa - BASED ON PSEUDO-CORRELATION, 1st ABOVE - (increasing & decreasing) ### Compare to one-off rarefied data # rarefy #1 seed <- 123 r1.16s <- rarefy_even_depth(phy.16s, sample.size = min(sample_sums(phy.16s)), rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) ## RELATIVE ABUNDANCE relabun.r1.16s <- transform_sample_counts(r1.16s, function(x) x / sum(x) ) # # # # # # # # # # Make function to plot key trending taxa for Mt Bold restoration gradient # # # # # # # # # plot_key_trending_taxa <- function(this_genus, plot_file_label, idx) { # need to define 'this_genus' if ( grepl(pattern="unclassified",x=this_genus) ) { # need to carefully select unclassified genera to the right level... this_taxaname <- substr(x=this_genus, start=2+regexpr(pattern=": ",text=this_genus)[1], stop = nchar(this_genus)-1 ) # "kingdom" "phylum" "class" "order" "family" "genus" "species" if ( grepl(pattern="family",x=this_genus) ) {sel_keep_taxa <- which(tax_table(relabun.r1.16s)[ , "family"] == this_taxaname & tax_table(relabun.r1.16s)[ , "genus"] == "unclassified" )} if ( grepl(pattern="order",x=this_genus) ) {sel_keep_taxa <- which(tax_table(relabun.r1.16s)[ , "order"] == this_taxaname & tax_table(relabun.r1.16s)[ , "family"] == "unclassified" & tax_table(relabun.r1.16s)[ , "genus"] == "unclassified" )} if ( grepl(pattern="class",x=this_genus) ) {sel_keep_taxa <- which(tax_table(relabun.r1.16s)[ , "class"] == this_taxaname & tax_table(relabun.r1.16s)[ , "order" ] == "unclassified" & tax_table(relabun.r1.16s)[ , "family"] == "unclassified" & tax_table(relabun.r1.16s)[ , "genus"] == "unclassified" )} if ( grepl(pattern="phylum",x=this_genus) ) {sel_keep_taxa <- which(tax_table(relabun.r1.16s)[ , "phylum"] == this_taxaname & tax_table(relabun.r1.16s)[ , "class"] == "unclassified" & tax_table(relabun.r1.16s)[ , "order" ] == "unclassified" & tax_table(relabun.r1.16s)[ , "family"] == "unclassified" & tax_table(relabun.r1.16s)[ , "genus"] == "unclassified" )} } else { sel_keep_taxa <- which(tax_table(relabun.r1.16s)[ ,"genus"] == this_genus ) } # may not have rarer taxa in the one-off rarefied dataset to join bootstrap-merged data to... # so setup dummy dataframe in this case if (length(sel_keep_taxa) == 0) { out <- data.frame( sample = NA, Reveg_age = NA, depth = "0-10 cm", # give this known factor to not cause probs with plot facet later rel_abun = NA, calc_type = "rarefyx1" ) } else { keep_taxa <- row.names( tax_table(relabun.r1.16s) )[sel_keep_taxa] subsel <- prune_taxa(relabun.r1.16s, taxa = keep_taxa ) out <- data.frame( sample = sample_names( subsel ), Reveg_age = subsel@sam_data$Reveg_age, depth = subsel@sam_data$depth, rel_abun = sample_sums( subsel ) ) out$calc_type <- "rarefyx1" } # collate rarefyx1 & bootstrap temp <- out for (j in 1:B) { if (!is.null(b_out[[j]][[ this_genus ]][["df"]])) { if (is.data.frame(b_out[[j]][[ this_genus ]][["df"]])) { # keep only the same fields as once-rarefied samples temp <- rbind(temp,b_out[[j]][[ this_genus ]][["df"]][ , c("sample", "Reveg_age", "depth", "rel_abun", "calc_type")]) } } } # convert to % temp$rel_abun <- 100*temp$rel_abun # need to ensure Reveg_age set as a factor for consistent plotting # only an issue when data for "rarefyx1" not available - but enforce here anyway temp$Reveg_age <- factor(temp$Reveg_age, levels = c("Cleared","6 years","7 years","8 years", "10 years","Remnant A","Remnant B","Remnant C"), labels = c("Clear","6 yr","7 yr","8 yr", "10 yr","Rem A","Rem B","Rem C"), ordered = TRUE) melt.out <- melt(temp,id.vars = c("Reveg_age","depth","calc_type"), measure.vars = "rel_abun") ## apply same standard colours ("cols") cols <- c("Clear" = "#e31a1c", "6 yr" = "#addd8e", "7 yr" = "#78c679", "8 yr" = "#41ab5d", "10 yr"= "#238443", "Rem A" = "#4292c6", "Rem B" = "#2171b5", "Rem C" = "#084594") ## plot # 1st option is to keep display of Reveg age as factor on x-axis when no one-off rarefy data if ( is.na(melt.out$value[ which(melt.out$calc_type == "rarefyx1")[1] ]) ) { p <- ggplot(data=melt.out, aes(x=Reveg_age, value)) + #ggtitle("a") + #ggtitle( gsub(pattern="g__", this_genus, replacement = "" ) ) + ggtitle( paste0("#",idx," ",gsub(pattern="g__", this_genus, replacement = "" ) ) ) + geom_violin(data = melt.out[ which(melt.out$calc_type == "bootstrap"), ], aes(color = Reveg_age) ) + scale_colour_manual(values = cols) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5) ) + labs(x = "Reveg age", y = "Relative abundance (%)") + theme(legend.position="none") + facet_wrap( ~ depth) } else { p <- ggplot(data=melt.out, aes(x=Reveg_age, value)) + #ggtitle("a") + #ggtitle( gsub(pattern="g__", this_genus, replacement = "" ) ) + ggtitle( paste0("#",idx," ",gsub(pattern="g__", this_genus, replacement = "" ) ) ) + geom_violin(data = melt.out[ which(melt.out$calc_type == "bootstrap"), ], aes(color = Reveg_age) ) + scale_colour_manual(values = cols) + geom_point(data = melt.out[ which(melt.out$calc_type == "rarefyx1"), ], color="gray20", shape=1 ) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5) ) + labs(x = "Reveg age", y = "Relative abundance (%)") + theme(legend.position="none") + facet_wrap( ~ depth) } print(p) #ggsave(plot=p, filename = paste0("finished-plots/","Rel_abun_16s_",plot_file_label,"_",this_genus,".tiff"), width = 10, height = 8, units = "cm", dpi = 600, compression = "lzw") ggsave(plot=p, filename = paste0("finished-plots/","Rel_abun_16s_",plot_file_label,"_",sub(pattern=":",x=this_genus,replacement=""),"-vFINAL.tiff"), width = 10, height = 8, units = "cm", dpi = 600, compression = "lzw") # need to replace ":" from file.path name } # # # # # # # # # key_trending_taxa.16s <- c(genera.highcor[1:10],genera.highcor[20:11]) # increasing = 1:10 ; decreasing = 20:11 key_trending_labels <- c(rep("Increasing",times=10 ), rep("Decreasing",times=10 ) ) key_trending_taxa.16s # [1] "g__DA101" "g__Candidatus_Xiphinematobacter" # [3] "g__Bradyrhizobium" "g__Candidatus_Solibacter" # [5] "g__Candidatus_Koribacter" "unclassified (family: Rhodospirillaceae)" # [7] "g__Rhodopila" "g__Edaphobacter" # [9] "unclassified (order: Solibacterales)" "unclassified (family: [Leptospirillaceae])" # [11] "g__Bacillus" "g__Rummeliibacillus" # [13] "unclassified (family: Actinospicaceae)" "unclassified (order: Ellin5290)" # [15] "g__Sporosarcina" "g__Cytophagales" # [17] "unclassified (family: Ellin5301)" "g__Ammoniphilus" # [19] "g__Flavisolibacter" "unclassified (class: C0119)" key_trending_labels # [1] "Increasing" "Increasing" "Increasing" "Increasing" "Increasing" "Increasing" "Increasing" "Increasing" "Increasing" # [10] "Increasing" "Decreasing" "Decreasing" "Decreasing" "Decreasing" "Decreasing" "Decreasing" "Decreasing" "Decreasing" # [19] "Decreasing" "Decreasing" for (i in 1:length(key_trending_taxa.16s)) { idx <- ifelse(test = i>10, yes = i-10, no = i) plot_key_trending_taxa(this_genus=key_trending_taxa.16s[i] , plot_file_label= paste0(key_trending_labels[i],"-",idx), idx) print(paste0("completed taxa # ",i)) } # #------------------------- #### Examine Biomes of Australian Soil Environments (BASE) data # - initial filter of BASE samples #------------------------- # BASE contextual data were sourced from: https://data.bioplatforms.com/organization/bpa-base base_sites <- read_xlsx("C:/Workspace/PROJ/PAPER-Phidu-Aust-Infectious/datasets/BASE_Sample_Contextual_Data_Update_2017-12-19.xlsx", sheet=1, range="A1:BK7729", col_names = TRUE) base_sites <- as.data.frame(base_sites) names(base_sites) # [1] "Sample ID" "BPA ID" "BioSample Accession" # [4] "Data Type" "Collection Site" "Latitude" # [7] "Longitude" "Date Sampled" "Soil Depth (cm)" # [10] "Horizon" "Storage" "Broad Land Use" # [13] "Detailed Land Use" "Ecological Zone" "Vegetation Type" # [16] "Vegetation Cover" "Elevation" "Slope" # [19] "Slope Aspect" "Profile Position" "Australian Soil Classification" # [22] "FAO Soil Classification" "Immediate Previous Land Use" "Date since change in Land Use" # [25] "Crop Rotation 1 year since present" "Crop Rotation 2 years since present" "Crop Rotation 3 years since present" # [28] "Crop Rotation 4 years since present" "Crop Rotation 5 years since present" "Agrochemical Additions" # [31] "Tillage" "Fire History" "Fire Intensity" # [34] "Flooding" "Extreme Events" "Moisture" # [37] "Colour" "Gravel" "Texture" # [40] "Course Sand" "Fine Sand" "Sand" # [43] "Silt" "Clay" "NH3-N" # [46] "NO3???" "Colwell P" "Colwell K" # [49] "Sulphur" "Organic Carbon" "Conductivity" # [52] "CaCl2pH" "H2O pH" "DTPA Cu" # [55] "DTPA Fe" "DTPA Mn" "DTPA Zn" # [58] "Exc Al" "Exc Ca" "Exc Mg" # [61] "Exc K" "Exc Na" "B Hot CaCl2" str(base_sites) # 'data.frame': 7728 obs. of 63 variables: # $ Sample ID : chr "bpa-base-genomics-amplicon-16s-10603_1-a815d" "bpa-base-genomics-amplicon-16s-10605_1-a815d" "bpa-base-genomics-amplicon-16s-10606_1-a815d" "bpa-base-genomics-amplicon-16s-10609_1-a815d" ... # $ BPA ID : chr "102.100.100.10603" "102.100.100.10605" "102.100.100.10606" "102.100.100.10609" ... # $ BioSample Accession : chr "SAMN04874086" "SAMN04874088" "SAMN04874089" "SAMN04874092" ... # $ Data Type : chr "MiSeq 16S" "MiSeq 16S" "MiSeq 16S" "MiSeq 16S" ... # $ Collection Site : chr "xmas island" "xmas island" "xmas island" "xmas island" ... # $ Latitude : num -10.5 -10.5 -10.5 -10.5 -10.5 ... # $ Longitude : num 106 106 106 106 106 ... # $ Date Sampled : POSIXct, format: "2013-04-09" "2013-04-09" "2013-04-09" "2013-03-14" ... # $ Soil Depth (cm) : chr "0" "0" "20" "0" ... # $ Horizon : chr "O,A" "E" "C" "A" ... # $ Storage : chr "Frozen" "Frozen" "Frozen" "Frozen" ... # $ Broad Land Use : chr "Conservation and natural environments" "Conservation and natural environments" "Conservation and natural environments" "Conservation and natural environments" ... # $ Detailed Land Use : chr "National Park" "National Park" "National Park" "National Park" ... # $ Ecological Zone : chr NA NA NA NA ... # $ Vegetation Type : chr NA NA NA NA ... # $ Vegetation Cover : chr "90" "85" "85" "80" ... # $ Elevation : num 198 196 196 235 235 267 267 264 176 176 ... # $ Slope : num 0 10 10 0 0 4 4 0 5 5 ... # $ Slope Aspect : chr NA "North" "North" NA ... # $ Profile Position : chr NA NA NA NA ... # $ Australian Soil Classification : chr NA NA NA NA ... # $ FAO Soil Classification : chr NA NA NA NA ... # $ Immediate Previous Land Use : chr NA NA NA "Mines" ... # $ Date since change in Land Use : chr NA NA NA NA ... # $ Crop Rotation 1 year since present : logi NA NA NA NA NA NA ... # $ Crop Rotation 2 years since present: chr NA NA NA NA ... # $ Crop Rotation 3 years since present: chr NA NA NA NA ... # $ Crop Rotation 4 years since present: chr NA NA NA NA ... # $ Crop Rotation 5 years since present: chr NA NA NA NA ... # $ Agrochemical Additions : chr NA NA NA NA ... # $ Tillage : logi NA NA NA NA NA NA ... # $ Fire History : chr NA NA NA NA ... # $ Fire Intensity : chr NA NA NA NA ... # $ Flooding : chr NA NA NA NA ... # $ Extreme Events : chr NA NA NA NA ... # $ Moisture : num 31.8 26.8 25 25.6 21.3 ... # $ Colour : chr "GRYW" "BRGR" "GRBR" "BR" ... # $ Gravel : chr "5" "25-30" "35-40" "5" ... # $ Texture : num 1.5 1.5 1.5 2 2.5 1.5 2 1.5 1.5 1.5 ... # $ Course Sand : num 18.9 65.8 51.5 45.9 25.1 ... # $ Fine Sand : num 44.5 19.6 22.5 28.6 25.3 ... # $ Sand : num 63.4 85.3 73.9 74.5 50.4 ... # $ Silt : num 15.56 7.85 16.44 10.81 8.63 ... # $ Clay : num 21.07 6.82 9.62 14.69 40.93 ... # $ NH3-N : chr "29" "51" "16" "43" ... # $ NO3- : chr "10" "53" "8" "17" ... # $ Colwell P : chr "235" "590" "494" "103" ... # $ Colwell K : chr "90" "126" "47" "63" ... # $ Sulphur : logi NA NA NA NA NA NA ... # $ Organic Carbon : chr "2.6" "5.08" "2.74" "2.59" ... # $ Conductivity : chr "0.14000000000000001" "0.36199999999999999" "0.26500000000000001" "0.127" ... # $ CaCl2pH : num 6.4 7.2 7.4 6.4 6.1 6.6 7.1 6.5 7.4 7.6 ... # $ H2O pH : chr "6.8" "7.8" "7.9" "6.7" ... # $ DTPA Cu : num 2.91 1.54 1.63 3.46 3.21 1.09 0.69 2.17 2.41 2.32 ... # $ DTPA Fe : chr "24.97" "23.42" "22.37" "23.35" ... # $ DTPA Mn : num 186.3 82 76.6 143.2 78.5 ... # $ DTPA Zn : num 21.97 10.88 4.75 9.35 5.63 ... # $ Exc Al : num 0.031 0.022 0.019 0.023 0.024 0.019 0.027 0.018 0.025 0.026 ... # $ Exc Ca : num 19.41 59.26 41.52 14.46 5.06 ... # $ Exc Mg : num 2.15 3.13 1.15 1.92 0.55 2.58 0.66 1.72 1.64 0.74 ... # $ Exc K : num 0.23 0.32 0.12 0.16 0.05 0.25 0.09 0.21 0.19 0.05 ... # $ Exc Na : chr "0.15" "0.17" "0.14000000000000001" "0.18" ... # $ B Hot CaCl2 : chr "1" "2.02" "1.34" "0.74" ... names(base_sites) shp.base.all <- base_sites[, c("Sample ID", "BPA ID", "BioSample Accession", "Data Type", "Collection Site", "Latitude", "Longitude", "Date Sampled", "Soil Depth (cm)","Broad Land Use","Detailed Land Use", "Sand", "Silt", "Clay", "NH3-N", "Colwell P", "Colwell K", "Sulphur", "Organic Carbon", "Conductivity", "CaCl2pH", "H2O pH" )] dim(shp.base.all) # 7728 22 ok <- complete.cases(shp.base.all[c("Sample ID", "BPA ID", "BioSample Accession","Data Type","Latitude","Longitude")]) sel <- which(ok==TRUE) shp.base.all <- shp.base.all[sel, ] dim(shp.base.all) # 5014 22 shp.base.all$Latitude_copy <- shp.base.all$Latitude shp.base.all$Longitude_copy <- shp.base.all$Longitude coordinates(shp.base.all) <- ~ Longitude + Latitude # of form: ~x+y shp.base.all@proj4string <- CRS(WGS84) plot(shp.base.all) # contains sites in polar region and offshore (Christmas Island?) # crop data to exclude offshore islands and external territories shp.base.all <- crop(shp.base.all, extent(112.9211, 153.6387, -43.74051, -10.01742)) plot(shp.base.all) writeOGR(obj=shp.base.all, dsn = datadir,layer="BASE-sites-all-WGS84-from-2017-12-19-download-vFINAL", driver="ESRI Shapefile") # use criteria to select comparison dataset shp.base.select <- shp.base.all dim(shp.base.select) # 3692 22 temp <- shp.base.select plot(shp.base.select) ## Filter BASE sites - only surface, and exclude extremes of pH, salinity, very high clay content str(shp.base.select@data) # 'data.frame': 3692 obs. of 22 variables: # $ Sample ID : chr "bpa-base-genomics-amplicon-16s-12449_1-ac9e5" "bpa-base-genomics-amplicon-16s-12450_1-ac9e5" "bpa-base-genomics-amplicon-16s-12451_1-ac9e5" "bpa-base-genomics-amplicon-16s-12481_1-ac9e5" ... # $ BPA ID : chr "102.100.100.12449" "102.100.100.12450" "102.100.100.12451" "102.100.100.12481" ... # $ BioSample Accession: chr "SAMN04874141" "SAMN04874142" "SAMN04874143" "SAMN04874166" ... # $ Data Type : chr "MiSeq 16S" "MiSeq 16S" "MiSeq 16S" "MiSeq 16S" ... # $ Collection Site : chr "Narrabri" "Narrabri" "Narrabri" "Kinchega NP" ... # $ Date Sampled : POSIXct, format: "2014-05-01" "2014-05-01" "2014-05-01" "2014-03-10" ... # $ Soil Depth (cm) : chr "20" "0" "20" "0" ... # $ Broad Land Use : chr "Production from Dryland Agriculture and Plantations" "Production from Dryland Agriculture and Plantations" "Production from Dryland Agriculture and Plantations" "Conservation and natural environments" ... # $ Detailed Land Use : chr "cotton" "cotton" "cotton" "National Park" ... # $ Sand : num 20.4 24 17.4 77.5 71.1 ... # $ Silt : num 28.86 27.31 26.88 3.92 3.99 ... # $ Clay : num 50.7 48.7 55.7 18.6 24.9 ... # $ NH3-N : chr "2" "2" "4" "1" ... # $ Colwell P : chr "51" "92" "57" "19" ... # $ Colwell K : chr "455" "687" "418" "198" ... # $ Sulphur : logi NA NA NA NA NA NA ... # $ Organic Carbon : chr "1.02" "1.32" "0.95" "0.17" ... # $ Conductivity : chr "0.109" "0.123" "8.7999999999999995E-2" "0.159" ... # $ CaCl2pH : num 6.9 6.8 6.5 8.1 8.2 6.5 7.4 6.4 7.2 5.4 ... # $ H2O pH : chr "7.9" "7.6" "7.2" "9" ... # $ Latitude_copy : num -30.2 -30.2 -30.2 -32.5 -32.5 ... # $ Longitude_copy : num 150 150 150 142 142 ... shp.base.select@data$`H2O pH` <- as.numeric(shp.base.select@data$`H2O pH`) shp.base.select@data$Conductivity <- as.numeric(shp.base.select@data$Conductivity) shp.base.select@data$`NH3-N` <- as.numeric(shp.base.select@data$`NH3-N`) shp.base.select@data$`Colwell P` <- as.numeric(shp.base.select@data$`Colwell P`) shp.base.select@data$`Colwell K` <- as.numeric(shp.base.select@data$`Colwell K`) ## Filter BASE sites # Surface only, and filter out samples with extreme, outlying soil conditions: # - strongly acid (pHH2O < 4.5), strongly alkaline (pHH2O > 9) # - saline (electrical conductivity > 2 dS/m) # - very high (> 50%) clay content table(shp.base.select@data$`Soil Depth (cm)`) # 0 0-5 10_20 20 900 # 1872 4 8 1802 6 sel <- which(shp.base.select@data$`Soil Depth (cm)` == "0" | shp.base.select@data$`Soil Depth (cm)` == "0-5") shp.base.select <- shp.base.select[sel, ] ## Exclude Mt Bold sites table(shp.base.select@data$`Collection Site`) unique(shp.base.select@data$`Collection Site`) sel.rm <- which(shp.base.select@data$`Collection Site` == "Mount bold") # 100 shp.base.select <- shp.base.select[-sel.rm, ] dim(shp.base.select) # 1776 22 hist(shp.base.select@data$Clay) # low frequency tail of very high > 50 hist(shp.base.select@data$`H2O pH`) hist(shp.base.select@data$Conductivity) hist(shp.base.select@data$`Organic Carbon`) # compare to Mt Bold hist(phy.16s@sam_data$Clay) hist(phy.16s@sam_data$`H2O pH`) hist(phy.16s@sam_data$Conductivity) hist(phy.16s@sam_data$`Organic Carbon`) # exclude extremes of pH Strongly Acidic < 4.5 and Strongly alkaline > 9 (Fierer & Jackson 2006 PNAS) sel.rm <- which(shp.base.select@data$`H2O pH` < 4.5 | shp.base.select@data$`H2O pH` > 9) shp.base.select@data$`H2O pH`[sel.rm] # [1] 4.0 4.0 4.3 4.2 9.4 4.3 4.3 4.2 4.3 4.1 4.3 4.3 4.4 4.4 4.3 9.1 9.2 9.3 9.2 9.1 9.3 9.3 9.1 9.4 9.4 4.0 4.0 4.3 4.3 # [30] 4.2 4.3 4.1 4.3 4.3 4.4 4.4 4.3 9.1 9.2 9.3 9.2 9.1 9.3 9.3 9.1 9.4 9.4 9.1 4.2 4.0 4.0 4.3 4.2 9.4 4.3 4.3 4.2 4.3 # [59] 4.1 4.3 4.3 4.4 4.4 4.3 9.1 9.2 9.3 9.2 9.1 9.3 9.3 9.1 9.4 9.4 9.4 9.6 9.3 4.2 4.0 4.0 4.0 4.3 4.2 9.4 4.3 4.3 4.2 # [88] 4.3 4.1 4.3 4.3 4.4 4.4 4.3 9.1 9.2 9.3 9.2 9.1 9.3 9.3 9.1 9.4 9.4 9.4 shp.base.select <- shp.base.select[-sel.rm, ] sel.rm <- which(shp.base.select@data$Clay > 50) shp.base.select@data$Clay[sel.rm] # 65.29 65.00 64.00 69.00 63.00 67.00 65.00 67.00 58.76 shp.base.select <- shp.base.select[-sel.rm, ] # now check salinity hist(shp.base.select@data$Conductivity) sel.rm <- which(shp.base.select@data$Conductivity > 2) shp.base.select@data$Conductivity[sel.rm] # 2.349 28.580 28.580 6.100 28.420 32.140 32.140 14.020 8.893 shp.base.select <- shp.base.select[-sel.rm, ] dim(shp.base.select) # 1653 22 table(shp.base.select@data$`Data Type`) # MiSeq MiSeq 16S MiSeq 18S MiSeq A16S Miseq ITS MiSeq ITS # 435 239 175 613 24 167 ## how many unique 'BPA ID' ? length(unique(shp.base.select@data$`BPA ID`)) # 593 writeOGR(obj=shp.base.select, dsn = datadir,layer="BASE-sites-ALL-vFINAL-WGS84-from-2017-12-19-download", driver="ESRI Shapefile") plot(shp.base.select) ## show example data head( shp.base.select@data ) # Sample ID BPA ID BioSample Accession Data Type Collection Site # 51 bpa-base-genomics-amplicon-16s-12450_1-ac9e5 102.100.100.12450 SAMN04874142 MiSeq 16S Narrabri # 75 bpa-base-genomics-amplicon-16s-12481_1-ac9e5 102.100.100.12481 SAMN04874166 MiSeq 16S Kinchega NP # 77 bpa-base-genomics-amplicon-16s-12483_1-ac9e5 102.100.100.12483 SAMN04874168 MiSeq 16S Sturt NP # 79 bpa-base-genomics-amplicon-16s-12485_1-ac9e5 102.100.100.12485 SAMN04874170 MiSeq 16S Sturt NP # 81 bpa-base-genomics-amplicon-16s-12487_1-ac9e5 102.100.100.12487 SAMN04874172 MiSeq 16S Nokoleche reserve # 83 bpa-base-genomics-amplicon-16s-12489_1-ac9e5 102.100.100.12489 SAMN04874174 MiSeq 16S Gundabooka NP # Date Sampled Soil Depth (cm) Broad Land Use Detailed Land Use Sand Silt Clay # 51 2014-05-01 0 Production from Dryland Agriculture and Plantations cotton 23.97 27.31 48.72 # 75 2014-03-10 0 Conservation and natural environments National Park 77.50 3.92 18.58 # 77 2014-03-11 0 Conservation and natural environments National Park 83.01 6.02 10.98 # 79 2014-03-11 0 Conservation and natural environments National Park 91.06 1.00 7.94 # 81 2014-03-12 0 Conservation and natural environments National Park 65.84 9.33 24.83 # 83 2014-03-13 0 Conservation and natural environments National Park 75.27 4.96 19.77 # NH3-N Colwell P Colwell K Sulphur Organic Carbon Conductivity CaCl2pH H2O pH Latitude_copy Longitude_copy # 51 2 92 687 NA 1.32 0.123 6.8 7.6 -30.20116 149.5974 # 75 1 19 198 NA 0.17 0.159 8.1 9.0 -32.52605 142.2193 # 77 2 6 230 NA 7.0000000000000007E-2 0.019 6.5 7.2 -29.05566 141.8980 # 79 1 6 179 NA 0.1 0.018 6.4 7.1 -29.03592 141.3528 # 81 NA 10 398 NA 0.47 0.026 5.4 6.3 -29.94563 144.1117 # 83 1 9 283 NA 0.34 0.013 4.7 5.7 -30.39500 142.7178 # #------------------------- #### Secondary filtering of BASE data # - match available 16S data # - isolate "altered" versus "natural" sites #------------------------- ## remove duplicate sample IDs ... corresponding to 16S, ITS, 18S, A16S microbial datasets shp.base.select@data$sampID <- as.character( lapply(X = shp.base.select@data$`BPA ID`, FUN = function(x) { sub(x=x, pattern = "102.100.100.", replacement = "")} ) ) sel.dup <- duplicated(shp.base.select@data$sampID) length(which(sel.dup==FALSE)) # 593 shp.base.select <- shp.base.select[-which(sel.dup==TRUE), ] dim(shp.base.select) # 593 23 base_select <- shp.base.select@data dim(base_select) # 593 23 ## Now see which samples have OTU data??? # Set rownames as sample names - so will match in otu_table when combine later in phyloseq-object row.names(base_select) row.names(base_select) <- base_select$sampID row.names(base_select) dim(base_select) # 593 23 length(unique(base_select$sampID)) # 593 str(base_select) # 'data.frame': 593 obs. of 23 variables: # $ Sample ID : chr "bpa-base-genomics-amplicon-16s-12450_1-ac9e5" "bpa-base-genomics-amplicon-16s-12481_1-ac9e5" "bpa-base-genomics-amplicon-16s-12483_1-ac9e5" "bpa-base-genomics-amplicon-16s-12485_1-ac9e5" ... # $ BPA ID : chr "102.100.100.12450" "102.100.100.12481" "102.100.100.12483" "102.100.100.12485" ... # $ BioSample Accession: chr "SAMN04874142" "SAMN04874166" "SAMN04874168" "SAMN04874170" ... # $ Data Type : chr "MiSeq 16S" "MiSeq 16S" "MiSeq 16S" "MiSeq 16S" ... # $ Collection Site : chr "Narrabri" "Kinchega NP" "Sturt NP" "Sturt NP" ... # $ Date Sampled : POSIXct, format: "2014-05-01" "2014-03-10" "2014-03-11" "2014-03-11" ... # $ Soil Depth (cm) : chr "0" "0" "0" "0" ... # $ Broad Land Use : chr "Production from Dryland Agriculture and Plantations" "Conservation and natural environments" "Conservation and natural environments" "Conservation and natural environments" ... # $ Detailed Land Use : chr "cotton" "National Park" "National Park" "National Park" ... # $ Sand : num 24 77.5 83 91.1 65.8 ... # $ Silt : num 27.31 3.92 6.02 1 9.33 ... # $ Clay : num 48.72 18.58 10.98 7.94 24.83 ... # $ NH3-N : num 2 1 2 1 NA 1 6 5 8 6 ... # $ Colwell P : num 92 19 6 6 10 9 24 26 26 27 ... # $ Colwell K : num 687 198 230 179 398 283 230 220 258 207 ... # $ Sulphur : logi NA NA NA NA NA NA ... # $ Organic Carbon : chr "1.32" "0.17" "7.0000000000000007E-2" "0.1" ... # $ Conductivity : num 0.123 0.159 0.019 0.018 0.026 0.013 0.054 0.036 0.101 0.038 ... # $ CaCl2pH : num 6.8 8.1 6.5 6.4 5.4 4.7 5.2 5.3 5.1 5 ... # $ H2O pH : num 7.6 9 7.2 7.1 6.3 5.7 5.8 5.8 5.7 5.6 ... # $ Latitude_copy : num -30.2 -32.5 -29.1 -29 -29.9 ... # $ Longitude_copy : num 150 142 142 141 144 ... # $ sampID : chr "12450" "12481" "12483" "12485" ... ## Add 'X' to 'Sample ID number' to match columns imported below head(paste0("X",base_select$sampID)) # "X12450" "X12481" "X12483" "X12485" "X12487" "X12489" base_select$sampID <- paste0("X",base_select$sampID) head(base_select$sampID) # "X12450" "X12481" "X12483" "X12485" "X12487" "X12489" # # # # # # # # # # # # # # # # # # # # # # # # # # # # ### Now match to bacterial 16S data # # # # # # # # # # # # # # # # # # # # # # # # # # # # list.files(datadir) ## OTU table base.otu.16s <- as.data.frame(read.csv(paste0(datadir,"/BASE-Download-2017-12-19/","BASE_16S_OTU.csv"), header = TRUE)) dim(base.otu.16s) # 91929 1024 # i.e. 91929 obs(OTUs) of 1024 vars(samples) row.names(base.otu.16s) <- base.otu.16s$OTUId head(row.names(base.otu.16s)) # "16S_OTUa_4" "16S_OTUa_1" "16S_OTUa_3" "16S_OTUa_2" "16S_OTUa_5" "16S_OTUa_362" # now remove 1st column (it is stored as row.name) base.otu.16s <- base.otu.16s[ , -1] head(sort(names(base.otu.16s))) # "X10603" "X10605" "X10606" "X10609" "X10610" "X10611" tail(sort(names(base.otu.16s))) # "X9583" "X9584" "X9585" "X9586" "X9587" "X9588" temp <- base.otu.16s # keep only those in select group above sel.var <- which( names(base.otu.16s) %in% base_select$sampID ) # qty 360 base.sites.16s <- sort( names(base.otu.16s)[sel.var] , decreasing = FALSE) base.sites.16s # [1] "X10714" "X10716" "X12424" "X12426" "X12428" "X12430" "X12432" "X12434" "X12436" "X12438" "X12440" "X12442" "X12444" # [14] "X12447" "X12450" "X12459" "X12461" "X12463" "X12465" "X12467" "X12469" "X12471" "X12473" "X12475" "X12477" "X12479" # [27] "X12481" "X12483" "X12485" "X12487" "X12489" "X12491" "X12493" "X12495" "X12497" "X12499" "X12501" "X12503" "X12505" # [40] "X12507" "X12509" "X12511" "X12513" "X12515" "X12517" "X12519" "X12521" "X12523" "X12525" "X12527" "X12529" "X12560" # [53] "X12562" "X12564" "X12566" "X12568" "X12570" "X12572" "X12574" "X12576" "X12578" "X12580" "X12582" "X12614" "X12616" # [66] "X12618" "X12620" "X12622" "X12624" "X12816" "X12818" "X12819" "X12824" "X12828" "X12830" "X12832" "X12834" "X12836" # [79] "X12838" "X12858" "X12860" "X12862" "X12876" "X12878" "X12880" "X12882" "X12884" "X12886" "X12891" "X12897" "X12899" # [92] "X12901" "X12939" "X13260" "X13262" "X13264" "X13266" "X13272" "X13274" "X13276" "X13278" "X13280" "X13282" "X13284" # [105] "X13286" "X13731" "X13733" "X13735" "X13737" "X13739" "X13892" "X13893" "X13894" "X13895" "X13896" "X13897" "X13898" # [118] "X13899" "X13900" "X13901" "X13902" "X13903" "X13904" "X13905" "X13906" "X7035" "X7037" "X7039" "X7041" "X7043" # [131] "X7045" "X7047" "X7049" "X7053" "X7055" "X7057" "X7059" "X7061" "X7063" "X7065" "X7067" "X7069" "X7075" # [144] "X7077" "X7079" "X7081" "X7083" "X7085" "X7091" "X7093" "X7095" "X7823" "X7826" "X7827" "X7829" "X7831" # [157] "X7833" "X7835" "X7837" "X7839" "X7841" "X7843" "X7845" "X7847" "X7849" "X7851" "X7853" "X7855" "X7857" # [170] "X7861" "X7863" "X7882" "X7884" "X7886" "X7888" "X7890" "X7892" "X7894" "X7896" "X7898" "X7900" "X7902" # [183] "X7904" "X7906" "X7908" "X7910" "X7912" "X7914" "X7916" "X7918" "X7920" "X7922" "X8076" "X8078" "X8080" # [196] "X8082" "X8084" "X8086" "X8088" "X8090" "X8091" "X8093" "X8097" "X8099" "X8101" "X8102" "X8104" "X8106" # [209] "X8108" "X8110" "X8112" "X8114" "X8116" "X8118" "X8120" "X8122" "X8124" "X8126" "X8128" "X8130" "X8132" # [222] "X8135" "X8136" "X8138" "X8140" "X8142" "X8144" "X8146" "X8148" "X8150" "X8152" "X8154" "X8156" "X8158" # [235] "X8160" "X8162" "X8164" "X8166" "X8168" "X8170" "X8172" "X8174" "X8180" "X8182" "X8184" "X8190" "X8192" # [248] "X8194" "X8198" "X8200" "X8201" "X8202" "X8203" "X8204" "X8205" "X8206" "X8210" "X8212" "X8214" "X8216" # [261] "X8217" "X8218" "X8220" "X8262" "X8264" "X8266" "X8268" "X8270" "X8272" "X8274" "X8276" "X8278" "X8280" # [274] "X8282" "X8284" "X8286" "X8288" "X8292" "X8294" "X8296" "X8453" "X8455" "X8457" "X8459" "X8461" "X8463" # [287] "X8465" "X8467" "X8469" "X8471" "X8487" "X8489" "X8491" "X8493" "X8495" "X8497" "X8499" "X8501" "X8503" # [300] "X8505" "X8507" "X8509" "X8511" "X8513" "X8515" "X8517" "X8519" "X8521" "X8523" "X8525" "X8527" "X8529" # [313] "X8531" "X9430" "X9432" "X9434" "X9436" "X9438" "X9440" "X9442" "X9444" "X9446" "X9448" "X9450" "X9452" # [326] "X9454" "X9456" "X9458" "X9460" "X9462" "X9464" "X9466" "X9468" "X9484" "X9486" "X9488" "X9490" "X9492" # [339] "X9494" "X9496" "X9498" "X9500" "X9502" "X9504" "X9506" "X9508" "X9510" "X9512" "X9514" "X9524" "X9528" # [352] "X9530" "X9567" "X9576" "X9577" "X9579" "X9581" "X9583" "X9586" "X9588" length(base.sites.16s) # 360 # reduce back to available 16S data base_select <- base_select[which(base_select$sampID %in% base.sites.16s), ] dim(base_select) # 360 23 table(base_select$`Broad Land Use`) # Conservation and natural environments Production from Dryland Agriculture and Plantations # 267 77 # Production from Irrigated Agriculture and Plantations Production from Relatively Natural Environments # 1 15 table(base_select$`Detailed Land Use`) # Cereals -wheat cotton Grazing native vegetation # 23 18 16 # irrigated seasonal horticulture National Park Native/exotic pasture mosaic # 1 180 6 # Natural feature protection Nature conservation Other conserved area # 2 3 1 # Pasture legume/grass mixtures Protected landscape Rehabilitation # 16 37 7 # Reserve Residual native cover Sown grasses # 1 30 2 # Strict nature reserves sugar Tree fruits -apple # 1 5 6 # Wilderness area # 5 sort(unique(base_select$`Detailed Land Use`)) # [1] "Cereals -wheat" "cotton" "Grazing native vegetation" # [4] "irrigated seasonal horticulture" "National Park" "Native/exotic pasture mosaic" # [7] "Natural feature protection" "Nature conservation" "Other conserved area" # [10] "Pasture legume/grass mixtures" "Protected landscape" "Rehabilitation" # [13] "Reserve" "Residual native cover" "Sown grasses" # [16] "Strict nature reserves" "sugar" "Tree fruits -apple" # [19] "Wilderness area" ## Choose "human-altered" and "natural" samples # "human-altered" environments altered <- c( "Cereals -wheat", "cotton", "irrigated seasonal horticulture", "Pasture legume/grass mixtures", "Rehabilitation", "Sown grasses", "sugar", "Tree fruits -apple" ) # natural environments natural <- c( "National Park", "Nature conservation", "Strict nature reserves", "Wilderness area" ) sel.alt <- which(base_select$`Detailed Land Use` %in% altered) # 78 sel.nat <- which(base_select$`Detailed Land Use` %in% natural) # 189 base_select$alt_vs_nat <- NA sel.alt <- which(base_select$`Detailed Land Use` %in% altered) # 78 base_select$alt_vs_nat[sel.alt] <- "altered" sel.nat <- which(base_select$`Detailed Land Use` %in% natural) # 189 base_select$alt_vs_nat[sel.nat] <- "natural" ## Excluded samples due to ambiguity and wetlands: full <- sort(unique(base_select$`Detailed Land Use`)) sel <- which(full %in% altered | full %in% natural) full[-sel] # [1] "Grazing native vegetation" "Native/exotic pasture mosaic" "Natural feature protection" # [4] "Other conserved area" "Protected landscape" "Reserve" # [7] "Residual native cover" ## Exclude any samples without altered or natural class sel.rm <- which( is.na(base_select$alt_vs_nat) ) # 93 temp <- base_select base_select <- base_select[-sel.rm, ] table(base_select$alt_vs_nat) # altered natural # 78 189 ## Later, after matching to available 16S OTU data, attempt to co-locate altered and natural samples... altered # [1] "Cereals -wheat" "cotton" "irrigated seasonal horticulture" # [4] "Pasture legume/grass mixtures" "Rehabilitation" "Sown grasses" # [7] "sugar" "Tree fruits -apple" natural # [1] "National Park" "Nature conservation" "Strict nature reserves" "Wilderness area" base_select.final <- base_select ## Now where are altered and natural samples base_select.final$Latitude_copy2 <- base_select.final$Latitude_copy base_select.final$Longitude_copy2 <- base_select.final$Longitude_copy coordinates(base_select.final) <- ~ Longitude_copy + Latitude_copy base_select.final@proj4string <- CRS(WGS84) table(base_select.final@data$alt_vs_nat) # altered natural # 78 189 sel.alt <- which(base_select.final@data$alt_vs_nat == "altered") plot(base_select.final[sel.alt,]) sel.nat <- which(base_select.final@data$alt_vs_nat == "natural") points(base_select.final[sel.nat,], col="red") # doesn't show full extent as only layered on top of limited plot area!!! plot(base_select.final[sel.nat, ], col="red") points(base_select.final[sel.alt, ]) ## choose natural sites that are in the geographic vicinity of altered sites!! ## want to find nearest neighbours in geographic space x.alt <- base_select.final@data[sel.alt , c("BPA ID","Latitude_copy2","Longitude_copy2")] dim(x.alt) # 78 3 x.nat <- base_select.final@data[sel.nat , c("BPA ID","Latitude_copy2","Longitude_copy2")] dim(x.nat) # 189 3 # Find nearest neighbour in first point set of all points in second pointset set.seed(123) nn <- knn(data=x.nat[ ,c("Latitude_copy2","Longitude_copy2") ], query=x.alt[ ,c("Latitude_copy2","Longitude_copy2") ], k=189) str(nn) # List of 2 # $ nn.idx : int [1:78, 1:189] 95 95 95 95 95 95 95 95 95 95 ... # $ nn.dists: num [1:78, 1:189] 5.04 5.04 5.04 5.04 5.04 ... length(unique( as.numeric( nn[[1]] ))) # 189 head(nn[[1]]) head(nn[[2]]) hist(as.numeric( nn[[2]] )) ## From the nearest neighbour calculations, keep only distinct indices where the geographic distance is less than 5 degrees sel <- which(nn[[2]][] < 5,arr.ind = TRUE) length(unique(as.numeric( nn[[1]][sel] ))) # 139 idx <- sort( unique( as.numeric( nn[[1]][sel] )) ) plot( x.nat[idx, c("Longitude_copy2","Latitude_copy2")] ) keep.nat <- as.character( x.nat[idx, "BPA ID"] ) keep.alt <- as.character( x.alt[ , "BPA ID"] ) length(c(keep.alt,keep.nat)) # 217 sel <- which(base_select.final@data$`BPA ID` %in% c(keep.nat,keep.alt)) # 217 shp.base.matched <- base_select.final[sel, ] str(shp.base.matched@data) # 'data.frame': 217 obs. of 24 variables: # $ Sample ID : chr "bpa-base-genomics-amplicon-16s-12450_1-ac9e5" "bpa-base-genomics-amplicon-16s-12481_1-ac9e5" "bpa-base-genomics-amplicon-16s-12489_1-ac9e5" "bpa-base-genomics-amplicon-16s-12491_1-ac9e5" ... # $ BPA ID : chr "102.100.100.12450" "102.100.100.12481" "102.100.100.12489" "102.100.100.12491" ... # $ BioSample Accession: chr "SAMN04874142" "SAMN04874166" "SAMN04874174" "SAMN04874176" ... # $ Data Type : chr "MiSeq 16S" "MiSeq 16S" "MiSeq 16S" "MiSeq 16S" ... # $ Collection Site : chr "Narrabri" "Kinchega NP" "Gundabooka NP" "Namadgi NP" ... # $ Date Sampled : POSIXct, format: "2014-05-01" "2014-03-10" "2014-03-13" "2014-05-12" ... # $ Soil Depth (cm) : chr "0" "0" "0" "0" ... # $ Broad Land Use : chr "Production from Dryland Agriculture and Plantations" "Conservation and natural environments" "Conservation and natural environments" "Conservation and natural environments" ... # $ Detailed Land Use : chr "cotton" "National Park" "National Park" "National Park" ... # $ Sand : num 24 77.5 75.3 47.2 61.2 ... # $ Silt : num 27.31 3.92 4.96 26.45 26.61 ... # $ Clay : num 48.7 18.6 19.8 26.4 12.2 ... # $ NH3-N : num 2 1 1 6 5 8 6 12 2 1 ... # $ Colwell P : num 92 19 9 24 26 26 27 34 106 91 ... # $ Colwell K : num 687 198 283 230 220 258 207 262 851 855 ... # $ Sulphur : logi NA NA NA NA NA NA ... # $ Organic Carbon : chr "1.32" "0.17" "0.34" "4.07" ... # $ Conductivity : num 0.123 0.159 0.013 0.054 0.036 0.101 0.038 0.091 0.159 0.132 ... # $ CaCl2pH : num 6.8 8.1 4.7 5.2 5.3 5.1 5 5.1 6.6 6.9 ... # $ H2O pH : num 7.6 9 5.7 5.8 5.8 5.7 5.6 5.6 7.5 7.7 ... # $ sampID : chr "X12450" "X12481" "X12489" "X12491" ... # $ alt_vs_nat : chr "altered" "natural" "natural" "natural" ... # $ Latitude_copy2 : num -30.2 -32.5 -30.4 -35.9 -35.9 ... # $ Longitude_copy2 : num 150 142 143 149 149 ... plot(shp.base.matched[which(shp.base.matched@data$alt_vs_nat=="natural"), ], col="green") points(shp.base.matched[which(shp.base.matched@data$alt_vs_nat=="altered"), ], col="red") writeOGR(obj=shp.base.matched, dsn = datadir,layer="BASE-sites-MATCHED-WGS84-from-2017-12-19-download-vFINAL", driver="ESRI Shapefile") table(shp.base.matched@data$alt_vs_nat) # altered natural # 78 139 table(shp.base.matched@data$`Detailed Land Use`[ which(shp.base.matched@data$alt_vs_nat=="altered") ]) # Cereals -wheat cotton irrigated seasonal horticulture # 23 18 1 # Pasture legume/grass mixtures Rehabilitation Sown grasses # 16 7 2 # sugar Tree fruits -apple # 5 6 table(shp.base.matched@data$`Detailed Land Use`[ which(shp.base.matched@data$alt_vs_nat=="natural") ]) # National Park Nature conservation Strict nature reserves # 135 3 1 # adjust base.sites.16s sel <- which(base.sites.16s %in% shp.base.matched@data$sampID) # 217 base.sites.16s <- base.sites.16s[sel] # #------------------------- #### Now examine BASE OTU data # Prepare BASE microbiome data # Examine differences between "altered" and "natural" samples #------------------------- # subset & convert to matrix base.otu.16s <- as.matrix( base.otu.16s[ , base.sites.16s] ) ## Taxonomy base.tax.16s <- as.data.frame(read_excel(paste0(datadir,"/BASE-Download-2017-12-19/","BASE_16S_taxonomy.xlsx"), sheet=1,col_names = TRUE)) dim(base.tax.16s) # 91928 8 # i.e. 91928 obs(OTUs) of 8 vars(OTUId + 7 taxonomic ranks) names(base.tax.16s) # [1] "OTUId" "kingdom" "phylum" "class" "order" "family" "genus" "species" row.names(base.tax.16s) <- base.tax.16s$OTUId head(row.names(base.tax.16s)) # "16S_OTUa_4" "16S_OTUa_1" "16S_OTUa_3" "16S_OTUa_2" "16S_OTUa_5" "16S_OTUa_362" # now remove 1st column (it is stored as row.name) base.tax.16s <- base.tax.16s[ , -1] # convert to matrix base.tax.16s <- as.matrix(base.tax.16s) ## check are there any OTUs without corresponding Taxonomy? length(row.names(base.otu.16s)) # 91929 length(row.names(base.tax.16s)) # 91928 sel.otus.match <- which( row.names(base.otu.16s) %in% row.names(base.tax.16s) ) base.otu.16s <- base.otu.16s[ sel.otus.match, ] dim(base.otu.16s) # 91928 217 dim(base.tax.16s) # 91928 7 ## Create 'otuTable' # otu_table - Works on any numeric matrix. # You must also specify if the species are rows or columns OTU.16s <- otu_table(base.otu.16s, taxa_are_rows = TRUE) ## Create 'taxonomyTable' # tax_table - Works on any character matrix. # The rownames must match the OTU names (taxa_names) of the otu_table if you plan to combine it with a phyloseq-object. TAX.16s <- tax_table(base.tax.16s) ## Create a phyloseq object, merging OTU & TAX tables base.physeq.16s = phyloseq(OTU.16s, TAX.16s) base.physeq.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 91928 taxa and 217 samples ] # tax_table() Taxonomy Table: [ 91928 taxa by 7 taxonomic ranks ] ## check sample data table, are there any sites without corresponding data? dim(shp.base.matched) # 217 24 base.samp.16s <- shp.base.matched@data names(base.samp.16s) # [1] "Sample ID" "BPA ID" "BioSample Accession" "Data Type" "Collection Site" "Date Sampled" "Soil Depth (cm)" # [8] "Broad Land Use" "Detailed Land Use" "Sand" "Silt" "Clay" "NH3-N" "Colwell P" # [15] "Colwell K" "Sulphur" "Organic Carbon" "Conductivity" "CaCl2pH" "H2O pH" "clim_kpn_maj" # [22] "alt_vs_nat" "sampID" "Latitude_copy2" "Longitude_copy2" sel <- which(base.samp.16s$sampID %in% sample_names(base.physeq.16s)) # qty 217 base.samp.16s <- base.samp.16s[sel, ] row.names(base.samp.16s) <- base.samp.16s$sampID base.samp.16s ### Now Add sample data to phyloseq object # sample_data - Works on any data.frame. The rownames must match the sample names in # the otu_table if you plan to combine them as a phyloseq-object SAMPDATA.16s <- sample_data(base.samp.16s) ### Combine SAMPDATA into phyloseq object base.physeq.16s <- merge_phyloseq(base.physeq.16s, SAMPDATA.16s) base.physeq.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 91928 taxa and 217 samples ] # sample_data() Sample Data: [ 217 samples by 24 sample variables ] # tax_table() Taxonomy Table: [ 91928 taxa by 7 taxonomic ranks ] temp <- base.physeq.16s # remove taxa not assigned as Bacteria keep_taxa <- which(tax_table(base.physeq.16s)[, "kingdom"] == "k__Bacteria") base.physeq.16s <- prune_taxa(base.physeq.16s, taxa = row.names(tax_table(base.physeq.16s)[keep_taxa, ]) ) base.physeq.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 91925 taxa and 217 samples ] # sample_data() Sample Data: [ 217 samples by 24 sample variables ] # tax_table() Taxonomy Table: [ 91925 taxa by 7 taxonomic ranks ] # remove taxa not assigned at the phylum level rem_taxa <- which(tax_table(base.physeq.16s)[, "phylum"] == "unclassified") # qty 710 OTUs base.physeq.16s <- prune_taxa(base.physeq.16s, taxa = row.names(tax_table(base.physeq.16s)[-rem_taxa, ]) ) base.physeq.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 91215 taxa and 217 samples ] # sample_data() Sample Data: [ 217 samples by 24 sample variables ] # tax_table() Taxonomy Table: [ 91215 taxa by 7 taxonomic ranks ] temp <- base.physeq.16s rank_names(base.physeq.16s) # "kingdom" "phylum" "class" "order" "family" "genus" "species" sort( as.character( unique( tax_table(base.physeq.16s)[, "phylum"] ) )) sort( as.character( unique( tax_table(base.physeq.16s)[, "class"] ) )) sort( as.character( unique( tax_table(base.physeq.16s)[, "order"] ) )) sort( as.character( unique( tax_table(base.physeq.16s)[, "family"] ) )) # remove taxa associated with chloroplast, streptophyta, and mitochondria rem_taxa1 <- which(tax_table(base.physeq.16s)[, "class"] == "c__Chloroplast") # qty 308 OTUs rem_taxa2 <- which(tax_table(base.physeq.16s)[, "order"] == "o__Streptophyta") # qty 24 OTUs rem_taxa3 <- which(tax_table(base.physeq.16s)[, "family"] == "f__mitochondria") # qty 1241 OTUs length( c(rem_taxa1,rem_taxa2,rem_taxa3) ) # 1573 length( unique(c(rem_taxa1,rem_taxa2,rem_taxa3)) ) # 1549 base.physeq.16s <- prune_taxa(base.physeq.16s, taxa = row.names(tax_table(base.physeq.16s)[-unique(c(rem_taxa1,rem_taxa2,rem_taxa3)), ]) ) base.physeq.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 89666 taxa and 217 samples ] # sample_data() Sample Data: [ 217 samples by 24 sample variables ] # tax_table() Taxonomy Table: [ 89666 taxa by 7 taxonomic ranks ] ## remove taxa that have < 100 reads across samples base.d16s <- prune_taxa(taxa = taxa_sums(base.physeq.16s) >= 100 , x = base.physeq.16s) base.d16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 12508 taxa and 217 samples ] # sample_data() Sample Data: [ 217 samples by 24 sample variables ] # tax_table() Taxonomy Table: [ 12508 taxa by 7 taxonomic ranks ] ## remove taxa that do not occur in at least two samples base.d16s <- prune_taxa( taxa = apply( otu_table(base.d16s), 1, function(x) {sum(x > 0) >= 2 }), x = base.d16s) base.d16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 12413 taxa and 217 samples ] # sample_data() Sample Data: [ 217 samples by 24 sample variables ] # tax_table() Taxonomy Table: [ 12413 taxa by 7 taxonomic ranks ] ntaxa(base.d16s) # 12413 nsamples(base.d16s) # 217 min(sample_sums(base.d16s)) # 6377 max(sample_sums(base.d16s)) # 170321 hist( sample_sums(base.d16s) ) sort( sample_sums(base.d16s) )[1:10] # X8503 X7861 X8088 X12465 X13264 X12473 X12884 X12438 X12436 X12430 # 6377 9935 12765 13041 13371 15852 16545 17614 17780 17990 # # # # # # # # # # # # ## plot rarefaction curve for BASE data set.seed(123) rarefaction_curve_data.BASE <- calculate_rarefaction_curves(base.d16s, c('Observed'), sort(c(min(sample_sums(base.d16s)), 1, 100, 1000, 5:90 * 2000)) ) # cover max value: 170562 ; and include min value: 6377 str(rarefaction_curve_data.BASE) # 'data.frame': 6260 obs. of 4 variables: # $ Depth : num 1 1 1 1 1 1 1 1 1 1 ... # $ Sample : Factor w/ 217 levels "X12424","X12426",..: 1 2 3 4 5 6 7 8 9 10 ... # $ Measure : Factor w/ 1 level "Observed": 1 1 1 1 1 1 1 1 1 1 ... # $ Alpha_diversity: num 1 1 1 1 1 1 1 1 1 1 ... # need to remove 'X' from start of Sample values, as returned from rarefaction function rarefaction_curve_data.BASE$Sample <- as.character(rarefaction_curve_data.BASE$Sample) rarefaction_curve_data.BASE$Sample # rarefaction_curve_data.BASE$Sample <- gsub(pattern = "X", x=rarefaction_curve_data.BASE$Sample, replacement="") # rarefaction_curve_data.BASE$Sample ## Add sample data rarefaction_curve_data_BASE_summary_verbose <- merge(x=rarefaction_curve_data.BASE, y=data.frame(sample_data(base.d16s)), by.x = 'Sample', by.y = 'row.names') sample_data(base.d16s) cols <- c("altered" = "#F8766D", "natural" = "#00BFC4") p <- ggplot( data = rarefaction_curve_data_BASE_summary_verbose, mapping = aes(x = Depth, y = Alpha_diversity, colour = alt_vs_nat, group = Sample)) + theme_bw() + scale_colour_manual(values = cols, name ="Sample type") + ggtitle("(b) Australia-wide samples ") + labs(x = "OTU sequence depth", y = "Observed OTUs (count)") + geom_line() + geom_vline(xintercept = min(sample_sums(base.d16s)), linetype="dotted") + #geom_vline(xintercept = 10000, linetype="dotted") + ylim(0,5500) p ggsave(plot=p, filename = paste0("finished-plots/","Rarefaction-curve-Aust-wide-samples-vFINAL.tiff"), width = 10, height = 8, units = "cm", dpi = 600, compression = "lzw") # # # # # # # # # # # # table(base.d16s@sam_data$alt_vs_nat) # altered natural # 78 139 ## Use lowest number of reads to create rarefied dataset #set.seed(123) seed <- 123 base.r16s <- rarefy_even_depth(base.d16s, sample.size = min(sample_sums(base.d16s)), rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) sample_sums(base.r16s) # all 6377 ntaxa(base.r16s) # 12413 nsamples(base.r16s) # 217 plot_richness(base.r16s, measures=c("Observed"), color = "alt_vs_nat" ) #rich <- plot_richness(base.r16s, measures=c("Observed") ) plot_richness(base.r16s, measures=c("Shannon"), color = "alt_vs_nat") #shan <- plot_richness(base.r16s, measures=c("Shannon")) ## ordination set.seed(123) ord <- ordinate(base.r16s, "NMDS", "bray") #p7 <- plot_ordination(mbd, mbd_ord, type="samples", color="sample_label") p <- plot_ordination(base.r16s, ord, type="samples", color="alt_vs_nat", shape= "Detailed.Land.Use") p str(p) p$labels$shape <- "Depth" sort(unique(p$data$Detailed.Land.Use)) # [1] "Cereals -wheat" "cotton" "irrigated seasonal horticulture" # [4] "National Park" "Nature conservation" "Pasture legume/grass mixtures" # [7] "Rehabilitation" "Sown grasses" "Strict nature reserves" # [10] "sugar" "Tree fruits -apple" cols <- c("altered" = "#F8766D", "natural" = "#00BFC4") shapes <- c( "Cereals -wheat" =0, "cotton" =1, "irrigated seasonal horticulture" =2, "National Park" = 3, "Nature conservation" =4, "Pasture legume/grass mixtures" =5, "Rehabilitation" =6, "Sown grasses" =7, "Strict nature reserves" =8, "sugar" =9, "Tree fruits -apple" =10 ) pp <- p + theme_bw() + scale_colour_manual(values = cols, name ="Type") + scale_shape_manual(values = shapes, name ="Land use") pp ## Save plot later when show Aust-wide data in context with Mt Bold surface samples... #ggsave(plot=pp, filename = paste0("finished-plots/","Ord-NMDS-Bray-BASE-16S-Altered-vs-Natural.tiff"), width = 14, height = 12, units = "cm", dpi = 600, compression = "lzw") table(base.r16s@sam_data$alt_vs_nat) # altered natural # 78 139 # Calculate bray curtis distance matrix set.seed(123) base.16s.bray <- phyloseq::distance(base.r16s, method = "bray") sampledf <- data.frame(sample_data(base.r16s)) str(sampledf) # Adonis test adonis(base.16s.bray ~ alt_vs_nat, data = sampledf) # Call: # adonis(formula = base.16s.bray ~ alt_vs_nat, data = sampledf) # # Permutation: free # Number of permutations: 999 # # Terms added sequentially (first to last) # # Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) # alt_vs_nat 1 7.188 7.1880 22.656 0.09533 0.001 *** # Residuals 215 68.214 0.3173 0.90467 # Total 216 75.402 1.00000 # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Homogeneity of dispersion test beta <- betadisper(base.16s.bray, sampledf$alt_vs_nat) permutest(beta) # Permutation test for homogeneity of multivariate dispersions # Permutation: free # Number of permutations: 999 # # Response: Distances # Df Sum Sq Mean Sq F N.Perm Pr(>F) # Groups 1 0.03106 0.0310607 3.6609 999 0.062 . # Residuals 215 1.82415 0.0084844 # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # These outputs tell us that our adonis test is significant so we can reject the null hypothesis # that microbiota have the same centroid across 'altered' and 'natural' sample types. # Also, betadisper results are not significant at the 0.05 level, suggesting there is no # significant difference in dispersion between the two groups. ### Also, betadisper results are not significant, meaning we can't reject the null ### hypothesis that our groups have the same dispersions. This brings more confidence to ### the adonis result, as it does not appear to be due to differences in group dispersions. table( base.r16s@sam_data$alt_vs_nat ) # artificial natural # 78 139 ## Differential abundance: DESeq2 ??? ## Following tutorial: http://joey711.github.io/phyloseq-extensions/DESeq2.html # Refs: # McMurdie and Holmes (2014) Waste Not, Want Not: Why Rarefying Microbiome Data is Inadmissible. PLoS Computational Biology in press # McMurdie and Holmes (2013) phyloseq: An R Package for Reproducible Interactive Analysis and Graphics of Microbiome Census Data. PLoS ONE. 8(4):e61217 # Love MI, Huber W, Anders S (2014) Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology 15(12): 550 # * * * * * * * * * * * * * * * * #https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5123278/ # Thorsen et al (2016) found that DESeq2 exhibits low values and low variance of false positive rates, # coupled with high area under the curve (AUC) performance, for differential relative abundance testing # with unbalanced conditions (e.g. in the range 10-25% cases and 90-75% control samples). This suggests # the DESeq2 algorithm is adept to analyse our unbalanced grouping of artificial versus natural samples # for southern temperate Australia. # * * * * * * * * * * * * * * * * base.d16s # NOT RAREFIED - JUST REMOVED taxa with < 100 reads # phyloseq-class experiment-level object # otu_table() OTU Table: [ 12413 taxa and 217 samples ] # sample_data() Sample Data: [ 217 samples by 24 sample variables ] # tax_table() Taxonomy Table: [ 12413 taxa by 7 taxonomic ranks ] # convert to DESeq dataset base.d16s_deseq <- phyloseq_to_deseq2(base.d16s, ~ alt_vs_nat) # Differential expression analysis based on the Negative Binomial (a.k.a. Gamma-Poisson) distribution #http://rstudio-pubs-static.s3.amazonaws.com/218660_7cbe1343ac434dbab59924a80ac2e815.html # Experiments without replicates do not allow for estimation of the dispersion of counts # around the expected value for each group, which is critical for differential expression # analysis. If an experimental design is supplied which does not contain the necessary # degrees of freedom for differential analysis, DESeq will provide a message to the user # and follow the strategy outlined in Anders and Huber (2010) under the section 'Working # without replicates', wherein all the samples are considered as replicates of a single # group for the estimation of dispersion. As noted in the reference above: "Some # overestimation of the variance may be expected, which will make that approach conservative." # Furthermore, "while one may not want to draw strong conclusions from such an analysis, # it may still be useful for exploration and hypothesis generation." base.d16s_deseq <- DESeq(base.d16s_deseq, test="Wald", fitType="parametric") # estimating size factors # estimating dispersions # gene-wise dispersion estimates # mean-dispersion relationship # final dispersion estimates # fitting model and testing # -- replacing outliers and refitting for 5428 genes # -- DESeq argument 'minReplicatesForReplace' = 7 # -- original counts are preserved in counts(dds) # estimating dispersions # fitting model and testing res <- results(base.d16s_deseq) ## From DESeq2 vignette: # threshold on Cook’s distance, such that if one or more samples for a row have a # distance higher, the p-value for the row is set to NA. The default cutoff is the # .99 quantile of the F(p, m-p) distribution, where p is the number of coefficients # being fitted and m is the number of samples. ## run iteratively for these alpha levels: #alpha <- 0.05 #alpha <- 0.01 ## A large number of taxa are observed to be significantly differentially abundant. ## Therefore, use more discriminating significance threshold to highlight key taxa that are significantly differentially abundant alpha <- 0.001 # use DESEq2 suggested threshold for false dicovery rate (FDR) correction # http://bioconductor.org/packages/release/bioc/vignettes/DESeq/inst/doc/DESeq.pdf res_deseq_ALL <- cbind(as(res, "data.frame"), as(tax_table(base.d16s)[rownames(res), ], "matrix")) sigtab <- res[which(res$padj < alpha), ] sigtab <- cbind(as(sigtab, "data.frame"), as(tax_table(base.d16s)[rownames(sigtab), ], "matrix")) head(sigtab) # baseMean log2FoldChange lfcSE stat pvalue padj kingdom phylum # 16S_OTUa_4 1998.998353 1.452201 0.1780199 8.157522 3.419670e-16 1.583943e-14 k__Bacteria p__Proteobacteria # 16S_OTUa_2 1.171176 -3.769975 0.9397391 -4.011725 6.027677e-05 3.055580e-04 k__Bacteria p__Actinobacteria # 16S_OTUa_362 727.145896 -3.583802 0.7622550 -4.701579 2.581574e-06 1.888338e-05 k__Bacteria p__Actinobacteria # 16S_OTUa_154 793.927660 -3.287172 0.4160523 -7.900861 2.769824e-15 1.128191e-13 k__Bacteria p__Firmicutes # 16S_OTUa_23 586.155532 2.022098 0.3833136 5.275310 1.325317e-07 1.328871e-06 k__Bacteria p__Acidobacteria # 16S_OTUa_214 343.312734 1.721106 0.2572076 6.691503 2.208897e-11 4.661841e-10 k__Bacteria p__Proteobacteria # class order family genus species # 16S_OTUa_4 c__Alphaproteobacteria o__Rhizobiales f__Bradyrhizobiaceae g__Bradyrhizobium unclassified # 16S_OTUa_2 c__Actinobacteria o__Actinomycetales f__Pseudonocardiaceae unclassified unclassified # 16S_OTUa_362 c__Rubrobacteria o__Rubrobacterales f__Rubrobacteraceae g__Rubrobacter unclassified # 16S_OTUa_154 c__Bacilli o__Bacillales f__Bacillaceae g__Bacillus s__longiquaesitum # 16S_OTUa_23 c__Acidobacteriia o__Acidobacteriales f__Acidobacteriaceae unclassified unclassified # 16S_OTUa_214 c__Alphaproteobacteria o__Rhizobiales f__Hyphomicrobiaceae g__Rhodoplanes unclassified dim(sigtab) # alpha = 0.001: 2443 13 sigtab$minuslog10_padj <- -log10(sigtab$padj) hist(sigtab$minuslog10_padj) hist(sigtab$log2FoldChange) plot(sigtab$log2FoldChange, sigtab$minuslog10_padj) # https://www.r-graph-gallery.com/275-add-text-labels-with-ggplot2/ # 3/ custom geom_label like any other geom. # ggplot(data, aes(x=wt, y=mpg, fill=cyl)) + # geom_label(label=rownames(data), color="white", size=5) ## Exploratory plots sel <- which(sigtab$genus == "g__DA101") points(sigtab$log2FoldChange[sel], sigtab$minuslog10_padj[sel], col="green") sel <- which(sigtab$genus == "g__Bacillus") points(sigtab$log2FoldChange[sel], sigtab$minuslog10_padj[sel], col="red") temp <- sigtab #sigtab <- temp # add extra info - phylum, unclassified group, OTU sigtab$phylum_label <- sigtab$phylum sigtab$phylum_label <- sub(pattern="p__", x=sigtab$phylum_label, replacement="") sigtab$genus_label <- sigtab$genus sigtab$genus_label <- sub(pattern="g__", x=sigtab$genus_label, replacement="") sigtab$OTU <- NA for (i in 1:length(sigtab$genus_label)) { ranks <- c("genus","family","order","class","phylum","kingdom") idx_rank <- 1 # if "unclassified" if (sigtab$genus_label[i] == "unclassified") { this_taxaname <- "unclassified" while (this_taxaname == "unclassified") { idx_rank <- idx_rank +1 this_rank <- ranks[idx_rank] this_taxaname <- as.character( sigtab[ i , this_rank ] ) } this_fullname <- paste0("unclassified (",this_rank,": ",gsub(pattern=".__",x=this_taxaname,replacement=""),")") sigtab$genus_label[i] <- this_fullname } # add OTUId sigtab$OTU[i] <- paste0( row.names(sigtab)[i]) } sel <- which(sigtab$log2FoldChange < -20) sigtab$genus_label[sel] # [1] "unclassified (family: A4b)" "Kouleothrix" "unclassified (family: Syntrophobacteraceae)" # [4] "unclassified (class: C0119)" "unclassified (order: JG30-KF-CM45)" "Kouleothrix" # [7] "Adhaeribacter" "unclassified (class: C0119)" "unclassified (family: Cytophagaceae)" # [10] "Flavisolibacter" "Agromyces" "unclassified (family: Ellin517)" sel <- which(sigtab$log2FoldChange > 20) sigtab$genus_label[sel] # [1] "unclassified (order: Ellin6513)" ## show highlights top 30 increasing/ decreasing #sigtab_top30inc_top30dec <- sigtab top30_inc <- sigtab[ order(sigtab$log2FoldChange, decreasing = TRUE)[1:30], ] top30_dec <- sigtab[ order(sigtab$log2FoldChange, decreasing = FALSE)[1:30], ] sigtab_top30inc_top30dec <- rbind(top30_inc, top30_dec[order(top30_dec$log2FoldChange, decreasing = TRUE) , ] ) str(sigtab_top30inc_top30dec) sigtab_top30inc_top30dec[, c("log2FoldChange", "padj", "minuslog10_padj", "genus", "phylum_label", "genus_label","OTU" )] # log2FoldChange padj minuslog10_padj genus phylum_label genus_label OTU # 16S_OTUa_974 25.172495 4.435652e-237 236.353043 unclassified Acidobacteria unclassified (order: Ellin6513) 16S_OTUa_974 # 16S_OTUa_1185 9.389936 3.792552e-30 29.421068 g__Rhodoplanes Proteobacteria Rhodoplanes 16S_OTUa_1185 # 16S_OTUa_99 8.032908 2.254966e-26 25.646860 g__Acidocella Proteobacteria Acidocella 16S_OTUa_99 # 16S_OTUa_3518 7.385307 1.552712e-37 36.808909 g__Acidocella Proteobacteria Acidocella 16S_OTUa_3518 # 16S_OTUa_2310 7.333431 1.163347e-21 20.934291 unclassified Acidobacteria unclassified (order: Ellin6513) 16S_OTUa_2310 # 16S_OTUa_970 7.277487 1.311848e-25 24.882116 g__Candidatus_Xiphinematobacter Verrucomicrobia Candidatus_Xiphinematobacter 16S_OTUa_970 # 16S_OTUa_6357 7.268114 1.433949e-34 33.843466 g__Acidocella Proteobacteria Acidocella 16S_OTUa_6357 # 16S_OTUa_767 7.200946 1.861433e-31 30.730153 g__Mycobacterium Actinobacteria Mycobacterium 16S_OTUa_767 # 16S_OTUa_1996 7.139567 2.772435e-16 15.557139 unclassified Acidobacteria unclassified (order: Ellin6513) 16S_OTUa_1996 # 16S_OTUa_1713 7.080815 3.499338e-17 16.456014 g__Rhodoplanes Proteobacteria Rhodoplanes 16S_OTUa_1713 # 16S_OTUa_291 6.869052 2.196779e-26 25.658214 unclassified Acidobacteria unclassified (order: Ellin6513) 16S_OTUa_291 # 16S_OTUa_55 6.752387 7.718579e-63 62.112463 unclassified Acidobacteria unclassified (family: Acidobacteriaceae) 16S_OTUa_55 # 16S_OTUa_614 6.658730 1.063254e-12 11.973363 unclassified Acidobacteria unclassified (family: Koribacteraceae) 16S_OTUa_614 # 16S_OTUa_93 6.576136 3.914791e-23 22.407291 unclassified Proteobacteria unclassified (family: Rhodospirillaceae) 16S_OTUa_93 # 16S_OTUa_313 6.433793 1.593357e-35 34.797687 unclassified Acidobacteria unclassified (order: Ellin6513) 16S_OTUa_313 # 16S_OTUa_569 6.393954 6.990683e-24 23.155480 unclassified Proteobacteria unclassified (family: Rhodospirillaceae) 16S_OTUa_569 # 16S_OTUa_1342 6.370992 3.141012e-18 17.502930 g__Candidatus_Xiphinematobacter Verrucomicrobia Candidatus_Xiphinematobacter 16S_OTUa_1342 # 16S_OTUa_678 6.366299 1.906504e-16 15.719762 g__DA101 Verrucomicrobia DA101 16S_OTUa_678 # 16S_OTUa_1462 6.353754 4.473596e-16 15.349343 unclassified Proteobacteria unclassified (order: Ellin329) 16S_OTUa_1462 # 16S_OTUa_3078 6.320051 3.689599e-16 15.433021 unclassified Verrucomicrobia unclassified (family: auto67_4W) 16S_OTUa_3078 # 16S_OTUa_993 6.319565 2.404543e-19 18.618967 g__Devosia Proteobacteria Devosia 16S_OTUa_993 # 16S_OTUa_198 6.283699 3.370048e-23 22.472364 g__DA101 Verrucomicrobia DA101 16S_OTUa_198 # 16S_OTUa_265 6.169335 5.526841e-40 39.257523 unclassified Planctomycetes unclassified (family: Isosphaeraceae) 16S_OTUa_265 # 16S_OTUa_2140 6.166104 3.461251e-23 22.460767 g__Candidatus_Xiphinematobacter Verrucomicrobia Candidatus_Xiphinematobacter 16S_OTUa_2140 # 16S_OTUa_638 6.108922 3.485164e-24 23.457777 g__Actinomadura Actinobacteria Actinomadura 16S_OTUa_638 # 16S_OTUa_2477 6.096288 6.084463e-16 15.215778 unclassified Acidobacteria unclassified (order: Ellin6513) 16S_OTUa_2477 # 16S_OTUa_332 6.093945 1.312030e-15 14.882056 g__Candidatus_Xiphinematobacter Verrucomicrobia Candidatus_Xiphinematobacter 16S_OTUa_332 # 16S_OTUa_1419 6.000062 2.573931e-17 16.589403 unclassified Acidobacteria unclassified (order: Ellin6513) 16S_OTUa_1419 # 16S_OTUa_1942 5.990485 1.016307e-09 8.992975 g__Alicyclobacillus Firmicutes Alicyclobacillus 16S_OTUa_1942 # 16S_OTUa_3810 5.936344 1.025516e-20 19.989058 unclassified Planctomycetes unclassified (family: Isosphaeraceae) 16S_OTUa_3810 # 16S_OTUa_736 -10.318227 3.064959e-22 21.513575 unclassified Proteobacteria unclassified (family: [Entotheonellaceae]) 16S_OTUa_736 # 16S_OTUa_91612 -10.356257 1.900064e-37 36.721232 unclassified Acidobacteria unclassified (family: Ellin6075) 16S_OTUa_91612 # 16S_OTUa_3542 -10.448068 4.191648e-19 18.377615 g__Adhaeribacter Bacteroidetes Adhaeribacter 16S_OTUa_3542 # 16S_OTUa_1930 -10.518611 1.071835e-18 17.969872 unclassified Chloroflexi unclassified (family: A4b) 16S_OTUa_1930 # 16S_OTUa_935 -10.526970 3.761392e-27 26.424651 unclassified Planctomycetes unclassified (order: WD2101) 16S_OTUa_935 # 16S_OTUa_1598 -10.768462 2.104407e-23 22.676870 unclassified Acidobacteria unclassified (order: iii1-15) 16S_OTUa_1598 # 16S_OTUa_955 -11.345726 8.632769e-28 27.063850 unclassified Bacteroidetes unclassified (family: Chitinophagaceae) 16S_OTUa_955 # 16S_OTUa_1737 -11.483088 6.118672e-26 25.213343 unclassified Proteobacteria unclassified (family: Syntrophobacteraceae) 16S_OTUa_1737 # 16S_OTUa_1412 -11.493639 4.727215e-27 26.325395 unclassified Gemmatimonadetes unclassified (order: Ellin5290) 16S_OTUa_1412 # 16S_OTUa_3777 -11.533906 2.880910e-12 11.540470 g__Pontibacter Bacteroidetes Pontibacter 16S_OTUa_3777 # 16S_OTUa_1476 -11.548065 4.121204e-25 24.384976 unclassified Gemmatimonadetes unclassified (family: Ellin5301) 16S_OTUa_1476 # 16S_OTUa_2064 -11.609436 1.091108e-30 29.962132 unclassified Firmicutes unclassified (family: Peptostreptococcaceae) 16S_OTUa_2064 # 16S_OTUa_659 -11.750807 2.400916e-31 30.619623 unclassified Proteobacteria unclassified (family: Ectothiorhodospiraceae) 16S_OTUa_659 # 16S_OTUa_2658 -11.901604 3.640959e-29 28.438784 g__Pontibacter Bacteroidetes Pontibacter 16S_OTUa_2658 # 16S_OTUa_1264 -11.988669 5.825714e-17 16.234651 g__Aquicella Proteobacteria Aquicella 16S_OTUa_1264 # 16S_OTUa_624 -12.251775 1.150980e-16 15.938932 g__Aquicella Proteobacteria Aquicella 16S_OTUa_624 # 16S_OTUa_1082 -13.209336 1.485403e-21 20.828156 g__Aquicella Proteobacteria Aquicella 16S_OTUa_1082 # 16S_OTUa_612 -13.971261 8.330372e-17 16.079336 g__Salinimicrobium Bacteroidetes Salinimicrobium 16S_OTUa_612 # 16S_OTUa_2924 -25.444858 2.760997e-83 82.558934 unclassified Verrucomicrobia unclassified (family: Ellin517) 16S_OTUa_2924 # 16S_OTUa_2059 -25.954320 1.572349e-99 98.803451 g__Flavisolibacter Bacteroidetes Flavisolibacter 16S_OTUa_2059 # 16S_OTUa_1884 -26.013018 7.599922e-160 159.119191 g__Adhaeribacter Bacteroidetes Adhaeribacter 16S_OTUa_1884 # 16S_OTUa_3145 -26.153938 5.551915e-74 73.255557 unclassified Bacteroidetes unclassified (family: Cytophagaceae) 16S_OTUa_3145 # 16S_OTUa_1267 -26.161968 1.911532e-128 127.718618 unclassified Chloroflexi unclassified (order: JG30-KF-CM45) 16S_OTUa_1267 # 16S_OTUa_1637 -26.181902 1.610248e-79 78.793107 unclassified Chloroflexi unclassified (class: C0119) 16S_OTUa_1637 # 16S_OTUa_1325 -26.201459 8.089671e-87 86.092069 g__Kouleothrix Chloroflexi Kouleothrix 16S_OTUa_1325 # 16S_OTUa_1686 -26.493781 2.759203e-97 96.559216 unclassified Proteobacteria unclassified (family: Syntrophobacteraceae) 16S_OTUa_1686 # 16S_OTUa_3809 -26.556304 6.181901e-110 109.208878 g__Agromyces Actinobacteria Agromyces 16S_OTUa_3809 # 16S_OTUa_1094 -26.698064 3.429010e-81 80.464831 unclassified Chloroflexi unclassified (class: C0119) 16S_OTUa_1094 # 16S_OTUa_576 -27.561335 7.087966e-85 84.149478 g__Kouleothrix Chloroflexi Kouleothrix 16S_OTUa_576 # 16S_OTUa_803 -27.667797 5.650357e-98 97.247924 unclassified Chloroflexi unclassified (family: A4b) 16S_OTUa_803 getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" write.csv(sigtab_top30inc_top30dec, file = "top30-inc-dec-BASE-altered-vs-natural-vFINAL.csv",quote = FALSE, row.names = TRUE) # for summary table of results write.csv(sigtab_top30inc_top30dec[ ,c("OTU","genus_label","phylum_label","log2FoldChange","minuslog10_padj")], file = "top30-inc-dec-BASE-altered-vs-natural-SUMMARY-TABLE-vFINAL.csv",quote = FALSE, row.names = TRUE) write.csv(sigtab,file = "all-sigtab-DESEQ2-results-BASE-altered-vs-natural-alpha-0.001-vFINAL.csv",quote = FALSE, row.names = TRUE ) ## want to look for top X (inc/dec) from Mt Bold - to see how they trend across southern Aust temperate zone ?? ## look for these in sigtab str(sigtab) sigtab$species <- as.character(sigtab$species) sigtab$genus <- as.character(sigtab$genus) sigtab$family <- as.character(sigtab$family) sigtab$order <- as.character(sigtab$order) sigtab$class <- as.character(sigtab$class) sigtab$phylum <- as.character(sigtab$phylum) #### Iterate through list of top 1:150 trending genus from Mt Bold restoration gradient ## Expect this to show declining relationship ## Choose representative number of increasing/decreasing genera to examine fold change between altered > natural Australia-wide groups #inc <- 1:150 # inc was set earlier when top[[]] {list} was populated topXresults <- data.frame(top_no=inc, adj_rsquared=NA) # use sigtab dataframe from above missing_from_topX <- list() for (X in inc) { #X<-1 # store results in data.frame mtbold.vs.sthnAust.16s <- data.frame(genus_or_group=NA, cor=NA, genus_from_DESeq2=NA, species_from_DESeq2=NA, OTU_from_DESeq2=NA, log2FoldChange=NA ) for (g in 1:length(top[[X]]$genus)) { #g<-2 # find how many OTUs from DESeq2 results correspond to that genus from Mt Bold results # note: that "unclassified" genera are already labelled in sigtab from DESEq2 results #sel <- which(sigtab$genus_label == top[[X]]$genus[g]) sel <- which(sigtab$genus_label == sub(pattern="g__",replacement="", x = top[[X]]$genus[g] )) if (length(sel)==0) { temp <- as.data.frame(matrix(nrow = 1, ncol=6) ) names(temp) <- c("genus_or_group", "cor", "genus_from_DESeq2", "species_from_DESeq2", "OTU_from_DESeq2", "log2FoldChange") temp$genus_or_group <- top[[X]]$genus[g] temp$cor <- top[[X]]$cor_0_10_mean[g] temp$genus_from_DESeq2 <- NA temp$species_from_DESeq2 <- NA temp$OTU_from_DESeq2 <- NA temp$log2FoldChange <- NA } else { temp <- as.data.frame(matrix(nrow = length(sel), ncol=6) ) names(temp) <- c("genus_or_group", "cor", "genus_from_DESeq2", "species_from_DESeq2", "OTU_from_DESeq2", "log2FoldChange") temp$genus_or_group <- top[[X]]$genus[g] temp$cor <- top[[X]]$cor_0_10_mean[g] for (i in 1:length(sel)) { #i<-1 temp$genus_from_DESeq2[i] <- sigtab$genus_label[sel[i] ] temp$species_from_DESeq2[i] <- sigtab$species[sel[i] ] temp$OTU_from_DESeq2[i] <- row.names(sigtab)[sel[i] ] temp$log2FoldChange[i] <- sigtab$log2FoldChange[sel[i] ] } } mtbold.vs.sthnAust.16s <- rbind(mtbold.vs.sthnAust.16s, temp) } # remove NA 1st row mtbold.vs.sthnAust.16s <- mtbold.vs.sthnAust.16s[-1, ] ## which genera were missing ok <- complete.cases(mtbold.vs.sthnAust.16s) sel <- which(ok==TRUE) mtbold.vs.sthnAust.16s$genus_or_group[-sel] #"unclassified (family: [Leptospirillaceae])" "g__Edaphobacter" "g__Rummeliibacillus" # keep record of which genera are missing missing_from_topX[[X]] <- mtbold.vs.sthnAust.16s$genus_or_group[-sel] # assess strength of relationship between genera distribution over Mt Bold restoration gradient vs Australia-wide comparison groups lmod <- lm( log2FoldChange ~ cor, data=mtbold.vs.sthnAust.16s[ sel, ]) topXresults$adj_rsquared[X] <- summary(lmod)$adj.r.squared print(paste0("completed ", X, "; correlation: ", cor(mtbold.vs.sthnAust.16s$cor, mtbold.vs.sthnAust.16s$log2FoldChange,use="complete.obs") )) } plot(topXresults$top_no,topXresults$adj_rsquared) y_axis_title <- expression(paste("Adjusted ",R^2 )) X <- which(topXresults$adj_rsquared== max(topXresults$adj_rsquared)) # X # 2 lbl1 <- paste0( "Adj.R^2 == ", round(topXresults$adj_rsquared[X],digits=2)) lbl2 <- paste0( "Adj.R^2 == ", round(topXresults$adj_rsquared[30],digits=2)) lbl3 <- paste0( "Alpha = ", alpha ) p <- ggplot(data=topXresults, aes(top_no, adj_rsquared)) + geom_point(data = topXresults[X, ], col="royalblue1", size=3.5) + # red = "#d73027" geom_segment(x=topXresults$top_no[X], y=0, xend=topXresults$top_no[X], yend=topXresults$adj_rsquared[X], col="royalblue1", linetype="dashed") + # y=0.15 "#fc4e2a" geom_point(data = topXresults[30, ], col="royalblue1", size=4) + # red = "#d73027" geom_segment(x=topXresults$top_no[30], y=0, xend=topXresults$top_no[30], yend=topXresults$adj_rsquared[30], col="royalblue1", linetype="dashed") + # y=0.15 "#fc4e2a" geom_point() + theme_bw() + annotate("text", x= topXresults$top_no[X]+5, y=topXresults$adj_rsquared[X], label=lbl1, parse=TRUE, col="blue", hjust=0 ) + annotate("text", x= topXresults$top_no[30]+15, y=topXresults$adj_rsquared[30], label=lbl2, parse=TRUE, col="blue", hjust=0 ) + ggtitle("(c)") + # (c) 0.001; (f) 0.01; (i) 0.05 #annotate("text", x= 150, y=0.45, label=lbl3, col="purple", hjust=1 ) + labs(x = "Number of top trending taxa (Mt Bold)", y = y_axis_title) p ggsave(plot=p, filename = paste0("finished-plots/","Adj-R-squared-and-no-of-top-trending-taxa-curve-ALPHA-0.001-vFINAL.tiff"), width = 8, height = 10, units = "cm", dpi = 600, compression = "lzw") top[[2]] # genus rel_abun_0_10_mean_perc rel_abun_0_10_min_perc rel_abun_0_10_max_perc cor_0_10_mean # 54 g__DA101 2.22657063 0.2200743 6.0639405 0.9481062 # 40 g__Candidatus_Xiphinematobacter 0.95947955 0.0000000 4.1427509 0.9396292 # 150 g__Rummeliibacillus 0.08866729 0.0000000 0.4193309 -0.8745504 # 24 g__Bacillus 6.99326394 1.0736059 15.4230483 -0.9475292 # cor_0_10_95ci_lower cor_0_10_95ci_upper beta_0_10_mean beta_0_10_95ci_lower beta_0_10_95ci_upper p_ordAOV_0_10_mean # 54 0.9365952 0.9581656 0.0015110385 0.0014285189 1.575318e-03 0.001651 # 40 0.9149739 0.9549467 0.0004506522 0.0004112418 4.864279e-04 0.005062 # 150 -0.9171821 -0.8242394 -0.0001210917 -0.0001409952 -9.594312e-05 0.000675 # 24 -0.9546374 -0.9389884 -0.0061640291 -0.0063195472 -6.013960e-03 0.003168 # p_ordAOV_0_10_95ci_lower p_ordAOV_0_10_95ci_upper missing_in_cleared missing_in_remnants perc_B_with_data B # 54 0.0004000 0.0038625 FALSE FALSE 100 100 # 40 0.0006375 0.0123350 FALSE FALSE 100 100 # 150 0.0000000 0.0033050 FALSE FALSE 100 100 # 24 0.0018475 0.0048000 FALSE FALSE 100 100 # etc ... ## Now for this X = 2 - plot the relationship !!! #X<-2 X # 2 # store results in data.frame mtbold.vs.sthnAust.16s <- data.frame(genus_or_group=NA, cor=NA, genus_from_DESeq2=NA, species_from_DESeq2=NA, OTU_from_DESeq2=NA, log2FoldChange=NA ) for (g in 1:length(top[[X]]$genus)) { # find how many OTUs from DESeq2 results correspond to that genus from Mt Bold results # note: that "unclassified" genera are already labelled in sigtab from DESEq2 results #sel <- which(sigtab$genus_label == top[[X]]$genus[g]) sel <- which(sigtab$genus_label == sub(pattern="g__",replacement="", x = top[[X]]$genus[g] )) if (length(sel)==0) { temp <- as.data.frame(matrix(nrow = 1, ncol=6) ) names(temp) <- c("genus_or_group", "cor", "genus_from_DESeq2", "species_from_DESeq2", "OTU_from_DESeq2", "log2FoldChange") temp$genus_or_group <- top[[X]]$genus[g] temp$cor <- top[[X]]$cor_0_10_mean[g] temp$genus_from_DESeq2 <- NA temp$species_from_DESeq2 <- NA temp$OTU_from_DESeq2 <- NA temp$log2FoldChange <- NA } else { temp <- as.data.frame(matrix(nrow = length(sel), ncol=6) ) names(temp) <- c("genus_or_group", "cor", "genus_from_DESeq2", "species_from_DESeq2", "OTU_from_DESeq2", "log2FoldChange") temp$genus_or_group <- top[[X]]$genus[g] temp$cor <- top[[X]]$cor_0_10_mean[g] for (i in 1:length(sel)) { #i<-1 temp$genus_from_DESeq2[i] <- sigtab$genus_label[sel[i] ] temp$species_from_DESeq2[i] <- sigtab$species[sel[i] ] temp$OTU_from_DESeq2[i] <- row.names(sigtab)[sel[i] ] temp$log2FoldChange[i] <- sigtab$log2FoldChange[sel[i] ] } } mtbold.vs.sthnAust.16s <- rbind(mtbold.vs.sthnAust.16s, temp) } # remove NA 1st row mtbold.vs.sthnAust.16s <- mtbold.vs.sthnAust.16s[-1, ] head(mtbold.vs.sthnAust.16s) tail(mtbold.vs.sthnAust.16s) ## plot pseudo-Cor. of genera/group (x-axis) versus Log2FoldChange of relevant OTUs (y-axis) # Adj. R^2 value topXresults$top_no[X] # 2 topXresults$adj_rsquared[X] # https://stackoverflow.com/questions/9723239/ggplot2-annotation-with-superscripts lbl <- paste0( "Adj.R^2 == ", round(topXresults$adj_rsquared[X],digits=2)) lbl2 <- paste0( "Alpha = ", alpha ) p <- ggplot(data = mtbold.vs.sthnAust.16s[ which(complete.cases(mtbold.vs.sthnAust.16s)) ,] ) + theme_bw() + geom_point(aes(x=cor, y=log2FoldChange, colour=genus_or_group),alpha=0.4 ) + labs(x="Correlation with reveg age (Mt Bold)", y="Log2-fold-change for OTUs (altered vs natural)" ) + #guides(colour = guide_legend(title = "Genus or group")) + guides(colour = FALSE) + geom_vline(xintercept=0, col="dark grey", linetype="dashed") + geom_hline(yintercept=0, col="dark grey", linetype="dashed") + geom_smooth(method = lm, se = TRUE, aes(x=cor, y=log2FoldChange), color = "blue", fill = "lightblue") + geom_text(x=0.2, y=5, label="Natural", hjust=0, vjust=0, size=4.5, col="#00BFC4", fontface = "bold") + geom_text(x=-0.2, y=-5, label="Disturbed", hjust=1, vjust=1, size=4.5, col="#F8766D", fontface = "bold") + annotate("text", x= -0.5, y=2.5, label=lbl, parse=TRUE, col="blue") + ggtitle("(a)") #+ #(a) 0.001 ; (d) 0.01 ; (g) 0.05 p getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" ggsave(plot=p, filename = paste0("finished-plots/","Top",X,"-comparison-MtBold-vs-southern-Australia-Cor-vs-log2FoldChange-ALPHA-0.001-vFINAL.tiff"), width = 8, height = 10, units = "cm", dpi = 600, compression = "lzw") write.csv(x=mtbold.vs.sthnAust.16s[ which(complete.cases(mtbold.vs.sthnAust.16s)) ,], file="comparison-MtBold-vs-southern-Australia-Cor-vs-log2FoldChange-vFINAL.csv", row.names = FALSE) missing_from_topX[[X]] # "g__Rummeliibacillus" ## Compare to X = 30 X <- 30 # store results in data.frame mtbold.vs.sthnAust.16s <- data.frame(genus_or_group=NA, cor=NA, genus_from_DESeq2=NA, species_from_DESeq2=NA, OTU_from_DESeq2=NA, log2FoldChange=NA ) for (g in 1:length(top[[X]]$genus)) { # find how many OTUs from DESeq2 results correspond to that genus from Mt Bold results # note: that "unclassified" genera are already labelled in sigtab from DESEq2 results #sel <- which(sigtab$genus_label == top[[X]]$genus[g]) sel <- which(sigtab$genus_label == sub(pattern="g__",replacement="", x = top[[X]]$genus[g] )) if (length(sel)==0) { temp <- as.data.frame(matrix(nrow = 1, ncol=6) ) names(temp) <- c("genus_or_group", "cor", "genus_from_DESeq2", "species_from_DESeq2", "OTU_from_DESeq2", "log2FoldChange") temp$genus_or_group <- top[[X]]$genus[g] temp$cor <- top[[X]]$cor_0_10_mean[g] temp$genus_from_DESeq2 <- NA temp$species_from_DESeq2 <- NA temp$OTU_from_DESeq2 <- NA temp$log2FoldChange <- NA } else { temp <- as.data.frame(matrix(nrow = length(sel), ncol=6) ) names(temp) <- c("genus_or_group", "cor", "genus_from_DESeq2", "species_from_DESeq2", "OTU_from_DESeq2", "log2FoldChange") temp$genus_or_group <- top[[X]]$genus[g] temp$cor <- top[[X]]$cor_0_10_mean[g] for (i in 1:length(sel)) { #i<-1 temp$genus_from_DESeq2[i] <- sigtab$genus_label[sel[i] ] temp$species_from_DESeq2[i] <- sigtab$species[sel[i] ] temp$OTU_from_DESeq2[i] <- row.names(sigtab)[sel[i] ] temp$log2FoldChange[i] <- sigtab$log2FoldChange[sel[i] ] } } mtbold.vs.sthnAust.16s <- rbind(mtbold.vs.sthnAust.16s, temp) } # remove NA 1st row mtbold.vs.sthnAust.16s <- mtbold.vs.sthnAust.16s[-1, ] # Adj. R^2 value topXresults$top_no[X] # 30 topXresults$adj_rsquared[X] # # https://stackoverflow.com/questions/9723239/ggplot2-annotation-with-superscripts lbl <- paste0( "Adj.R^2 == ", round(topXresults$adj_rsquared[X],digits=2)) lbl2 <- paste0( "Alpha = ", alpha ) p <- ggplot(data = mtbold.vs.sthnAust.16s[ which(complete.cases(mtbold.vs.sthnAust.16s)) ,] ) + theme_bw() + geom_point(aes(x=cor, y=log2FoldChange, colour=genus_or_group),alpha=0.4 ) + labs(x="Correlation with reveg age (Mt Bold)", y="Log2-fold-change for OTUs (altered vs natural)" ) + #guides(colour = guide_legend(title = "Genus or group")) + guides(colour = FALSE) + geom_vline(xintercept=0, col="dark grey", linetype="dashed") + geom_hline(yintercept=0, col="dark grey", linetype="dashed") + geom_smooth(method = lm, se = TRUE, aes(x=cor, y=log2FoldChange), color = "blue", fill = "lightblue") + geom_text(x=0.2, y=15, label="Natural", hjust=0, vjust=0, size=4.5, col="#00BFC4", fontface = "bold") + geom_text(x=-0.2, y=-15, label="Disturbed", hjust=1, vjust=1, size=4.5, col="#F8766D", fontface = "bold") + annotate("text", x= -0.5, y=10, label=lbl, parse=TRUE, col="blue") + ggtitle("(b)") #+ # (b) 0.001; (e) 0.01; (h) 0.05 #annotate("text", x= 1, y=-20, label=lbl2, col="purple", hjust=1 ) # #https://stackoverflow.com/questions/7263849/what-do-hjust-and-vjust-do-when-making-a-plot-using-ggplot p ggsave(plot=p, filename = paste0("finished-plots/","Top",X,"-comparison-MtBold-vs-southern-Australia-Cor-vs-log2FoldChange-ALPHA-0.001-vFINAL.tiff"), width = 8, height = 10, units = "cm", dpi = 600, compression = "lzw") write.csv(x=mtbold.vs.sthnAust.16s, file="comparison-MtBold-vs-Australia-Cor-vs-log2FoldChange-Top30-vFINAL.csv", row.names = FALSE) ### Re-do volcano plot but colour code all OTUs corresponding to top-trending taxa !!!!!!!! # For X=2 X <- 2 plot(sigtab$log2FoldChange, sigtab$minuslog10_padj) top[[X]]$genus # [1] "g__DA101" "g__Candidatus_Xiphinematobacter" "g__Rummeliibacillus" "g__Bacillus" plot_data <- data.frame(log2foldchange=sigtab$log2FoldChange, minuslog10_padj=sigtab$minuslog10_padj, trend_MtBold=NA, genus_label=sigtab$genus_label, phylum_label=sigtab$phylum_label) sel.inc.MtBold <- which(top[[X]]$cor_0_10_mean > 0) sel <- which(sigtab$genus %in% top[[X]]$genus[sel.inc.MtBold]) points(sigtab$log2FoldChange[sel], sigtab$minuslog10_padj[sel], col="green") increasing <- data.frame(log2foldchange=sigtab$log2FoldChange[sel], minuslog10_padj=sigtab$minuslog10_padj[sel], trend_MtBold="Increasing", genus_label=sigtab$genus_label[sel], phylum_label=sigtab$phylum_label[sel]) sel.dec.MtBold <- which(top[[X]]$cor_0_10_mean < 0) sel <- which(sigtab$genus %in% top[[X]]$genus[sel.dec.MtBold]) points(sigtab$log2FoldChange[sel], sigtab$minuslog10_padj[sel], col="red") decreasing <- data.frame(log2foldchange=sigtab$log2FoldChange[sel], minuslog10_padj=sigtab$minuslog10_padj[sel], trend_MtBold="Decreasing", genus_label=sigtab$genus_label[sel], phylum_label=sigtab$phylum_label[sel]) plot_data <- rbind(plot_data,increasing,decreasing) str(plot_data) # 'data.frame': 2528 obs. of 5 variables: # $ log2foldchange : num 1.45 -3.77 -3.58 -3.29 2.02 ... # $ minuslog10_padj: num 13.8 3.51 4.72 12.95 5.88 ... # $ trend_MtBold : chr NA NA NA NA ... # $ genus_label : Factor w/ 395 levels "02d06","A17",..: 33 296 163 30 236 157 364 86 278 157 ... # $ phylum_label : Factor w/ 26 levels "Acidobacteria",..: 19 2 2 13 1 19 14 19 19 19 ... # https://stackoverflow.com/questions/17148679/construct-a-manual-legend-for-a-complicated-plot ## Highlight top trending taxa from Mt Bold in context of Aust-wide DESeq2 results cols <- c("Increasing"="green","Decreasing"="red") p <- ggplot(data=plot_data, aes(log2foldchange, minuslog10_padj)) + theme_bw() + geom_point() + geom_point(data = subset(plot_data, trend_MtBold=="Decreasing" ), aes(colour="Decreasing"), alpha=0.5) + geom_point(data = subset(plot_data, trend_MtBold=="Increasing" ), aes(colour="Increasing"), alpha=0.5) + labs(x = "Log2-fold-change (altered vs natural)", y = "-log10(P-adjusted)") + scale_colour_manual(name="Top trending taxa from\nMt Bold restoration gradient",values=cols) + geom_text_repel(data= plot_data[ which( abs(plot_data$log2foldchange) >20) , ], aes(x= log2foldchange, y=minuslog10_padj), label=plot_data$genus_label[which( abs(plot_data$log2foldchange) >20)], size=2, segment.colour="grey") + theme(legend.position="bottom") p ggsave(plot=p, filename = paste0("finished-plots/","Aust-wide-volcano-plot-with-MtBold-top-trending-genera-OTUs-vFINAL.tiff"), width = 18, height = 12, units = "cm", dpi = 600, compression = "lzw") missing_from_topX[[X]] # "g__Rummeliibacillus" ## Highlight phyla in DESeq2 results p <- ggplot(data=plot_data, aes(log2foldchange, minuslog10_padj)) + theme_bw() + geom_point(aes(x= log2foldchange, y=minuslog10_padj, colour=phylum_label)) + labs(x = "Log2-fold-change (altered vs natural)", y = "-log10(P-adjusted)") + theme(legend.position="bottom") + theme(legend.key.size = unit(0.55, 'lines')) + guides(col = guide_legend(ncol = 4)) + # , byrow = FALSE guides(colour = guide_legend(title = "Phylum")) + geom_text_repel(data= plot_data[ which( abs(plot_data$log2foldchange) >7.5) , ], aes(x= log2foldchange, y=minuslog10_padj), label=plot_data$phylum_label[which( abs(plot_data$log2foldchange) >7.5)], size=2, segment.colour="grey") p ggsave(plot=p, filename = paste0("finished-plots/","Aust-wide-volcano-plot-BY-PHYLUM-vFINAL.tiff"), width = 18, height = 12, units = "cm", dpi = 600, compression = "lzw") # #------------------------- #### What are rel abun differences between "altered" and "natural" - for Mt Bold top 10 inc/dec genera? #------------------------- base.d16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 12413 taxa and 217 samples ] # sample_data() Sample Data: [ 217 samples by 24 sample variables ] # tax_table() Taxonomy Table: [ 12413 taxa by 7 taxonomic ranks ] ### RELATIVE ABUNDANCE relabun.base.d16s <- transform_sample_counts(base.d16s, function(x) x / sum(x) ) relabun.base.d16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 12413 taxa and 217 samples ] # sample_data() Sample Data: [ 217 samples by 24 sample variables ] # tax_table() Taxonomy Table: [ 12413 taxa by 7 taxonomic ranks ] genera.base.d16s <- levels(as.factor( get_taxa_unique(relabun.base.d16s, taxonomic.rank = "genus" ))) genera.base.d16s all_genera <- genera.base.d16s phy_obj <- relabun.base.d16s # store results out <- list() for (g in 1:length(all_genera)) { # this_genus <- all_genera[g] # as.character(all_genera[g]) out[[ this_genus ]] <- list() # this_genus ## for genus = "unclassified" - get next available taxonomic info if (this_genus == "unclassified") { # work through taxonomic table to find all genus = "unclassified" sel_unclass <- which(tax_table(phy_obj)[ ,"genus"] == "unclassified" ) # ranks <- c("family","order","class","phylum","kingdom") ## continue loop until all "unclassified" genera accounted for while (length(sel_unclass) > 0) { # find next available taxonomic classification # start with first taxonomic rank, and work up idx_start <- min(sel_unclass) idx_rank <- 1 this_rank <- ranks[idx_rank] this_taxaname <- as.character( tax_table(phy_obj)[ idx_start , this_rank ] ) while (this_taxaname == "unclassified") { idx_rank <- idx_rank +1 this_rank <- ranks[idx_rank] this_taxaname <- as.character( tax_table(phy_obj)[ idx_start , this_rank ] ) } this_fullname <- paste0("unclassified (",this_rank,": ",gsub(pattern=".__",x=this_taxaname,replacement=""),")") # gsub() to tidy-up label if (this_rank == "family") {sel_done <- which(tax_table(phy_obj)[ sel_unclass, this_rank] == this_taxaname )} if (this_rank == "order") {sel_done <- which(tax_table(phy_obj)[ sel_unclass, this_rank] == this_taxaname & tax_table(phy_obj)[ sel_unclass, "family"] == "unclassified" )} if (this_rank == "class") {sel_done <- which(tax_table(phy_obj)[ sel_unclass, this_rank] == this_taxaname & tax_table(phy_obj)[ sel_unclass, "order" ] == "unclassified" & tax_table(phy_obj)[ sel_unclass, "family"] == "unclassified" )} if (this_rank == "phylum") {sel_done <- which(tax_table(phy_obj)[ sel_unclass, this_rank] == this_taxaname & tax_table(phy_obj)[ sel_unclass, "class"] == "unclassified" & tax_table(phy_obj)[ sel_unclass, "order" ] == "unclassified" & tax_table(phy_obj)[ sel_unclass, "family"] == "unclassified" )} keep_taxa <- row.names( tax_table(phy_obj) )[ sel_unclass[sel_done] ] subsel <- prune_taxa(phy_obj, taxa = keep_taxa ) # # # # routine for "unclassified" genera only # # # # df <- data.frame( sample = sample_names( subsel ), alt_vs_nat = subsel@sam_data$alt_vs_nat, rel_abun = sample_sums( subsel ) ) df$genus <- this_fullname out[[ this_fullname ]] <- list() out[[ this_fullname ]][["df"]] <- df # store OTUs out[[ this_fullname ]][["otus"]] <- taxa_names(subsel) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sel_unclass <- sel_unclass[-c(sel_done)] print(paste0("finished unclassified group: ",this_fullname)) } # END while loop ## now for all classified genera } else { ##g<-2 # TESTING sel_keep_taxa <- which(tax_table(phy_obj)[ ,"genus"] == this_genus ) keep_taxa <- row.names( tax_table(phy_obj) )[sel_keep_taxa] subsel <- prune_taxa(phy_obj, taxa = keep_taxa ) df <- data.frame( sample = sample_names( subsel ), alt_vs_nat = subsel@sam_data$alt_vs_nat, rel_abun = sample_sums( subsel ) ) df$genus <- this_genus out[[ this_genus ]][["df"]] <- df # store OTUs out[[ this_genus ]][["otus"]] <- taxa_names(subsel) print(paste0("finished genus # ",g, " = ",this_genus)) } # END else for classified genera } # END for all_genera[g] names(out) length(names(out)) # 813 length(unique(names(out))) # 813 saveRDS(out, file = "out__rel_abun_data_Aust_wide_samples.RDS") #out <- readRDS("out__rel_abun_data_Aust_wide_samples.RDS") ## Check relationship in Aust-wide natural vs human-altered for top few trending genera at Mt Bold ## increasing genera top10_inc$genus # [1] g__DA101 g__Candidatus_Xiphinematobacter # [3] g__Bradyrhizobium g__Candidatus_Solibacter # [5] g__Candidatus_Koribacter unclassified (family: Rhodospirillaceae) # [7] g__Rhodopila g__Edaphobacter # [9] unclassified (order: Solibacterales) unclassified (family: [Leptospirillaceae]) library(gtools) for (i in 1:length(top10_inc$genus)) { df <- out[[ as.character(top10_inc$genus[i]) ]][["df"]] df$rel_abun <- 100*df$rel_abun filename <- gsub(pattern = "\\(|\\)|\\:", replacement = "_", x = top10_inc$genus[i] ) grDevices::tiff(file=paste0("finished-plots/","Rel-abun-comparison-Top10-Increasing-",i,"-",filename,".tif"), width = 12, height = 12, units = "cm", res = 600, compression = "lzw") boxplot( rel_abun ~ alt_vs_nat, data = df, ylab="Relative abundance (%)") ## one-sided test ## https://stats.stackexchange.com/questions/201125/understanding-the-wilcoxon-rank-sum-one-sided-test wt <- wilcox.test(rel_abun ~ alt_vs_nat, alternative="less", conf.int=FALSE, data=df) mtext( text = sub(pattern = ".__", replacement = "", top10_inc$genus[i]), side = 3, line=1, adj=0 , cex=1.3 ) mtext( text = paste0("Increases with restoration at Mt Bold #",i), side = 3, line=0.1, adj=0 , cex=0.8 ) if (wt$p.value<0.001) { text(1, par("usr")[3] + 0.8*(par("usr")[4]-par("usr")[3]), paste0("Wilcoxon rank-sum\none-sided test\nW = ",wt$statistic,"; P < 0.001"), adj=c(0.5,0) , cex=0.8) } else { text(1, par("usr")[3] + 0.8*(par("usr")[4]-par("usr")[3]), paste0("Wilcoxon rank-sum\none-sided test\nW = ",wt$statistic,"; P = ", round(wt$p.value, digits = 3)), adj=c(0.5,0) , cex=0.8) } dev.off() } ## decreasing genera top10_dec$genus # [1] g__Bacillus g__Rummeliibacillus # [3] unclassified (family: Actinospicaceae) unclassified (order: Ellin5290) # [5] g__Sporosarcina g__Cytophagales # [7] unclassified (family: Ellin5301) g__Ammoniphilus # [9] g__Flavisolibacter unclassified (class: C0119) for (i in 1:length(top10_dec$genus)) { df <- out[[ as.character(top10_dec$genus[i]) ]][["df"]] df$rel_abun <- 100*df$rel_abun filename <- gsub(pattern = "\\(|\\)|\\:", replacement = "_", x = top10_dec$genus[i] ) grDevices::tiff(file=paste0("finished-plots/","Rel-abun-comparison-Top10-Decreasing-",i,"-",filename,".tif"), width = 12, height = 12, units = "cm", res = 600, compression = "lzw") boxplot( rel_abun ~ alt_vs_nat, data = df, ylab="Relative abundance (%)") ## one-sided test ## https://stats.stackexchange.com/questions/201125/understanding-the-wilcoxon-rank-sum-one-sided-test wt <- wilcox.test(rel_abun ~ alt_vs_nat, alternative="greater", conf.int=FALSE, data=df) mtext( text = sub(pattern = ".__", replacement = "", top10_dec$genus[i]), side = 3, line=1, adj=0 , cex=1.3 ) mtext( text = paste0("Decreases with restoration at Mt Bold #",i), side = 3, line=0.1, adj=0 , cex=0.8 ) if (wt$p.value<0.001) { text(2, par("usr")[3] + 0.8*(par("usr")[4]-par("usr")[3]), paste0("Wilcoxon rank-sum\none-sided test\nW = ",wt$statistic,"; P < 0.001"), adj=c(0.5,0) , cex=0.8) } else { text(2, par("usr")[3] + 0.8*(par("usr")[4]-par("usr")[3]), paste0("Wilcoxon rank-sum\none-sided test\nW = ",wt$statistic,"; P = ", round(wt$p.value, digits = 3)), adj=c(0.5,0) , cex=0.8) } dev.off() } ### Examine 2-d index of cumulative % top increasing / decreasing genera for (i in 1:length(c(top10_inc$genus,top10_dec$genus))) { #i<-12 df <- out[[ c(as.character(top10_inc$genus),as.character(top10_dec$genus))[i] ]][["df"]] df$rel_abun <- 100*df$rel_abun df$inc_or_dec <- ifelse(test = i<=10, yes = "increasing with restoration", no = "decreasing with restoration") # collate outputs into single dataframe if (i==1) {temp <- df} if (i>1) {temp <- rbind(temp,df)} print(paste0("completed genus ",i," of ",length(c(top10_inc$genus,top10_dec$genus)))) } ## summarise rel abun by sample of increasing genera vs decreasing genera names(temp) #"sample" "alt_vs_nat" "rel_abun" "genus" "inc_or_dec" length(unique(temp$sample)) # 217 length(unique(temp$genus)) # 20 unique(temp$inc_or_dec) # "increasing with restoration" "decreasing with restoration" # create template to store cumulative rel abun data temp.summary <- temp[1:217, c("sample","alt_vs_nat")] temp.summary$inc_rel_abun <- NA temp.summary$dec_rel_abun <- NA length(unique(temp.summary$sample)) # 217 ## summaring increasing genera sel <- which(temp$inc_or_dec == "increasing with restoration") # qty 2170 for (j in 1:length(temp.summary$sample)) { #j<-1 idx_samps <- which(temp$sample[sel] == temp.summary$sample[j]) temp.summary$inc_rel_abun[j] <- sum(temp$rel_abun[ sel[idx_samps] ]) } ## summaring decreasing genera sel <- which(temp$inc_or_dec == "decreasing with restoration") # qty 2170 for (j in 1:length(temp.summary$sample)) { #j<-1 idx_samps <- which(temp$sample[sel] == temp.summary$sample[j]) temp.summary$dec_rel_abun[j] <- sum(temp$rel_abun[ sel[idx_samps] ]) } plot(temp.summary$dec_rel_abun, temp.summary$inc_rel_abun, col=temp.summary$alt_vs_nat) ## ggplot cols <- c("altered" = "#F8766D", "natural" = "#00BFC4") p <- ggplot(data = temp.summary ) + theme_bw() + geom_point(aes(x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat) ) + #ggtitle("(a)") + labs(x="Relative abundance of top 10 decreasing\nbacteria with Mt Bold restoration (%)", y="Relative abundance of top 10 increasing\nbacteria with Mt Bold restoration (%)" ) + scale_color_manual(labels = c("Human-\naltered", "Natural"), values = c("#F8766D", "#00BFC4")) + guides(colour = guide_legend(title = "Sample type")) p <- ggplot(data = temp.summary ) + theme_bw() + geom_point(aes(x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat) ) + geom_density_2d(data = temp.summary[which(temp.summary$alt_vs_nat=="natural"), ], aes( x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat), alpha=.4) + geom_density_2d(data = temp.summary[which(temp.summary$alt_vs_nat=="altered"), ], aes( x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat), alpha=.4) + labs(x="OTU relative abundance of top 10 decreasing\nbacteria with Mt Bold restoration (%)", y="OTU relative abundance of top 10 increasing\nbacteria with Mt Bold restoration (%)" ) + scale_color_manual(labels = c("Human-altered", "Natural"), values = c("#F8766D", "#00BFC4")) + theme(legend.position="bottom") + guides(colour = guide_legend(title = "Sample type")) p getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" ggsave(plot=p, filename = paste0("finished-plots/","Rel-Abun-top10Inc-vs-top10Dec-ALT-vs-NAT-vFINAL.tiff"), width = 11, height = 11, units = "cm", dpi = 600, compression = "lzw") # Test hypothesis that 2-d cumulative rel abun of top10 inc/dec bacteria vary (with different centroids) by human-altered vs natural groups names(temp.summary) # "sample" "alt_vs_nat" "inc_rel_abun" "dec_rel_abun" # Calculate distance matrix - "euclidean" set.seed(123) test <- vegan::adonis(formula = temp.summary ~ alt_vs_nat, data =temp.summary, method = "euclidean") test$aov.tab # Permutation: free # Number of permutations: 999 # # Terms added sequentially (first to last) # # Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) # alt_vs_nat 1 7132.1 7132.1 119.33 0.35692 0.001 *** # Residuals 215 12850.5 59.8 0.64308 # Total 216 19982.5 1.00000 # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Homogeneity of dispersion test set.seed(123) beta <- betadisper(vegdist(x=temp.summary, method = "euclidean"), temp.summary$alt_vs_nat) permutest(beta) # Permutation test for homogeneity of multivariate dispersions # Permutation: free # Number of permutations: 999 # # Response: Distances # Df Sum Sq Mean Sq F N.Perm Pr(>F) # Groups 1 214.6 214.574 9.1395 999 0.003 ** # Residuals 215 5047.7 23.478 # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # These outputs tell us that our adonis test is significant so we can reject the null hypothesis # that 2-d distribution of cumulative relative abundances have the same centroid between human-altered and natural groups. # However, betadisper results are significant, meaning we can reject the null hypothesis that # human-altered and natural groups have the same dispersions. # Although, homogeneity of dispersions are different, they visually group in different zones of the plot, lending support # to the significant adonis permanova result. # #------------------------- #### What are rel abun differences between "altered" and "natural" - for top 10 inc/dec genera identified at Mt Bold? ## Add Mt Bold surface samples to plot #------------------------- base.d16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 12413 taxa and 217 samples ] # sample_data() Sample Data: [ 217 samples by 24 sample variables ] # tax_table() Taxonomy Table: [ 12413 taxa by 7 taxonomic ranks ] ### RELATIVE ABUNDANCE relabun.base.d16s <- transform_sample_counts(base.d16s, function(x) x / sum(x) ) relabun.base.d16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 12413 taxa and 217 samples ] # sample_data() Sample Data: [ 217 samples by 24 sample variables ] # tax_table() Taxonomy Table: [ 12413 taxa by 7 taxonomic ranks ] genera.base.d16s <- levels(as.factor( get_taxa_unique(relabun.base.d16s, taxonomic.rank = "genus" ))) genera.base.d16s all_genera <- genera.base.d16s phy_obj <- relabun.base.d16s # store results out <- list() for (g in 1:length(all_genera)) { # this_genus <- all_genera[g] # as.character(all_genera[g]) out[[ this_genus ]] <- list() # this_genus ## for genus = "unclassified" - get next available taxonomic info if (this_genus == "unclassified") { # work through taxonomic table to find all genus = "unclassified" sel_unclass <- which(tax_table(phy_obj)[ ,"genus"] == "unclassified" ) # ranks <- c("family","order","class","phylum","kingdom") ## continue loop until all "unclassified" genera accounted for while (length(sel_unclass) > 0) { # find next available taxonomic classification # start with first taxonomic rank, and work up idx_start <- min(sel_unclass) idx_rank <- 1 this_rank <- ranks[idx_rank] this_taxaname <- as.character( tax_table(phy_obj)[ idx_start , this_rank ] ) while (this_taxaname == "unclassified") { idx_rank <- idx_rank +1 this_rank <- ranks[idx_rank] this_taxaname <- as.character( tax_table(phy_obj)[ idx_start , this_rank ] ) } this_fullname <- paste0("unclassified (",this_rank,": ",gsub(pattern=".__",x=this_taxaname,replacement=""),")") # gsub() to tidy-up label if (this_rank == "family") {sel_done <- which(tax_table(phy_obj)[ sel_unclass, this_rank] == this_taxaname )} if (this_rank == "order") {sel_done <- which(tax_table(phy_obj)[ sel_unclass, this_rank] == this_taxaname & tax_table(phy_obj)[ sel_unclass, "family"] == "unclassified" )} if (this_rank == "class") {sel_done <- which(tax_table(phy_obj)[ sel_unclass, this_rank] == this_taxaname & tax_table(phy_obj)[ sel_unclass, "order" ] == "unclassified" & tax_table(phy_obj)[ sel_unclass, "family"] == "unclassified" )} if (this_rank == "phylum") {sel_done <- which(tax_table(phy_obj)[ sel_unclass, this_rank] == this_taxaname & tax_table(phy_obj)[ sel_unclass, "class"] == "unclassified" & tax_table(phy_obj)[ sel_unclass, "order" ] == "unclassified" & tax_table(phy_obj)[ sel_unclass, "family"] == "unclassified" )} keep_taxa <- row.names( tax_table(phy_obj) )[ sel_unclass[sel_done] ] subsel <- prune_taxa(phy_obj, taxa = keep_taxa ) # # # # routine for "unclassified" genera only # # # # df <- data.frame( sample = sample_names( subsel ), alt_vs_nat = subsel@sam_data$alt_vs_nat, rel_abun = sample_sums( subsel ) ) df$genus <- this_fullname out[[ this_fullname ]] <- list() out[[ this_fullname ]][["df"]] <- df # store OTUs out[[ this_fullname ]][["otus"]] <- taxa_names(subsel) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sel_unclass <- sel_unclass[-c(sel_done)] print(paste0("finished unclassified group: ",this_fullname)) } # END while loop ## now for all classified genera } else { ##g<-2 # TESTING sel_keep_taxa <- which(tax_table(phy_obj)[ ,"genus"] == this_genus ) keep_taxa <- row.names( tax_table(phy_obj) )[sel_keep_taxa] subsel <- prune_taxa(phy_obj, taxa = keep_taxa ) df <- data.frame( sample = sample_names( subsel ), alt_vs_nat = subsel@sam_data$alt_vs_nat, rel_abun = sample_sums( subsel ) ) df$genus <- this_genus out[[ this_genus ]][["df"]] <- df # store OTUs out[[ this_genus ]][["otus"]] <- taxa_names(subsel) print(paste0("finished genus # ",g, " = ",this_genus)) } # END else for classified genera } # END for all_genera[g] names(out) length(names(out)) # 813 length(unique(names(out))) # 813 saveRDS(out, file = "out__rel_abun_data_Aust_wide_samples.RDS") #out <- readRDS("out__rel_abun_data_Aust_wide_samples.RDS") ## Check relationship in Aust-wide natural vs human-altered for top few trending genera at Mt Bold ## increasing genera top10_inc$genus # [1] g__DA101 g__Candidatus_Xiphinematobacter # [3] g__Bradyrhizobium g__Candidatus_Solibacter # [5] g__Candidatus_Koribacter unclassified (family: Rhodospirillaceae) # [7] g__Rhodopila g__Edaphobacter # [9] unclassified (order: Solibacterales) unclassified (family: [Leptospirillaceae]) library(gtools) for (i in 1:length(top10_inc$genus)) { df <- out[[ as.character(top10_inc$genus[i]) ]][["df"]] df$rel_abun <- 100*df$rel_abun filename <- gsub(pattern = "\\(|\\)|\\:", replacement = "_", x = top10_inc$genus[i] ) grDevices::tiff(file=paste0("finished-plots/","Rel-abun-comparison-Top10-Increasing-",i,"-",filename,".tif"), width = 12, height = 12, units = "cm", res = 600, compression = "lzw") boxplot( rel_abun ~ alt_vs_nat, data = df, ylab="Relative abundance (%)") ## one-sided test ## https://stats.stackexchange.com/questions/201125/understanding-the-wilcoxon-rank-sum-one-sided-test wt <- wilcox.test(rel_abun ~ alt_vs_nat, alternative="less", conf.int=FALSE, data=df) mtext( text = sub(pattern = ".__", replacement = "", top10_inc$genus[i]), side = 3, line=1, adj=0 , cex=1.3 ) mtext( text = paste0("Increases with restoration at Mt Bold #",i), side = 3, line=0.1, adj=0 , cex=0.8 ) if (wt$p.value<0.001) { text(1, par("usr")[3] + 0.8*(par("usr")[4]-par("usr")[3]), paste0("Wilcoxon rank-sum\none-sided test\nW = ",wt$statistic,"; P < 0.001"), adj=c(0.5,0) , cex=0.8) } else { text(1, par("usr")[3] + 0.8*(par("usr")[4]-par("usr")[3]), paste0("Wilcoxon rank-sum\none-sided test\nW = ",wt$statistic,"; P = ", round(wt$p.value, digits = 3)), adj=c(0.5,0) , cex=0.8) } dev.off() } ## decreasing genera top10_dec$genus # [1] g__Bacillus g__Rummeliibacillus # [3] unclassified (family: Actinospicaceae) unclassified (order: Ellin5290) # [5] g__Sporosarcina g__Cytophagales # [7] unclassified (family: Ellin5301) g__Ammoniphilus # [9] g__Flavisolibacter unclassified (class: C0119) for (i in 1:length(top10_dec$genus)) { df <- out[[ as.character(top10_dec$genus[i]) ]][["df"]] df$rel_abun <- 100*df$rel_abun filename <- gsub(pattern = "\\(|\\)|\\:", replacement = "_", x = top10_dec$genus[i] ) grDevices::tiff(file=paste0("finished-plots/","Rel-abun-comparison-Top10-Decreasing-",i,"-",filename,".tif"), width = 12, height = 12, units = "cm", res = 600, compression = "lzw") boxplot( rel_abun ~ alt_vs_nat, data = df, ylab="Relative abundance (%)") ## one-sided test ## https://stats.stackexchange.com/questions/201125/understanding-the-wilcoxon-rank-sum-one-sided-test wt <- wilcox.test(rel_abun ~ alt_vs_nat, alternative="greater", conf.int=FALSE, data=df) mtext( text = sub(pattern = ".__", replacement = "", top10_dec$genus[i]), side = 3, line=1, adj=0 , cex=1.3 ) mtext( text = paste0("Decreases with restoration at Mt Bold #",i), side = 3, line=0.1, adj=0 , cex=0.8 ) if (wt$p.value<0.001) { text(2, par("usr")[3] + 0.8*(par("usr")[4]-par("usr")[3]), paste0("Wilcoxon rank-sum\none-sided test\nW = ",wt$statistic,"; P < 0.001"), adj=c(0.5,0) , cex=0.8) } else { text(2, par("usr")[3] + 0.8*(par("usr")[4]-par("usr")[3]), paste0("Wilcoxon rank-sum\none-sided test\nW = ",wt$statistic,"; P = ", round(wt$p.value, digits = 3)), adj=c(0.5,0) , cex=0.8) } dev.off() } ### Examine 2-d index of cumulative % top increasing / decreasing genera for (i in 1:length(c(top10_inc$genus,top10_dec$genus))) { #i<-12 df <- out[[ c(as.character(top10_inc$genus),as.character(top10_dec$genus))[i] ]][["df"]] df$rel_abun <- 100*df$rel_abun df$inc_or_dec <- ifelse(test = i<=10, yes = "increasing with restoration", no = "decreasing with restoration") # collate outputs into single dataframe if (i==1) {temp <- df} if (i>1) {temp <- rbind(temp,df)} print(paste0("completed genus ",i," of ",length(c(top10_inc$genus,top10_dec$genus)))) } ## summarise rel abun by sample of increasing genera vs decreasing genera names(temp) #"sample" "alt_vs_nat" "rel_abun" "genus" "inc_or_dec" length(unique(temp$sample)) # 217 length(unique(temp$genus)) # 20 unique(temp$inc_or_dec) # "increasing with restoration" "decreasing with restoration" # create template to store cumulative rel abun data temp.summary <- temp[1:217, c("sample","alt_vs_nat")] temp.summary$inc_rel_abun <- NA temp.summary$dec_rel_abun <- NA length(unique(temp.summary$sample)) # 217 ## summaring increasing genera sel <- which(temp$inc_or_dec == "increasing with restoration") # qty 2170 for (j in 1:length(temp.summary$sample)) { #j<-1 idx_samps <- which(temp$sample[sel] == temp.summary$sample[j]) temp.summary$inc_rel_abun[j] <- sum(temp$rel_abun[ sel[idx_samps] ]) } ## summaring decreasing genera sel <- which(temp$inc_or_dec == "decreasing with restoration") # qty 2170 for (j in 1:length(temp.summary$sample)) { #j<-1 idx_samps <- which(temp$sample[sel] == temp.summary$sample[j]) temp.summary$dec_rel_abun[j] <- sum(temp$rel_abun[ sel[idx_samps] ]) } plot(temp.summary$dec_rel_abun, temp.summary$inc_rel_abun, col=temp.summary$alt_vs_nat) ## ggplot cols <- c("altered" = "#F8766D", "natural" = "#00BFC4") p <- ggplot(data = temp.summary ) + theme_bw() + geom_point(aes(x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat) ) + #ggtitle("(a)") + labs(x="Relative abundance of top 10 decreasing\nbacteria with Mt Bold restoration (%)", y="Relative abundance of top 10 increasing\nbacteria with Mt Bold restoration (%)" ) + scale_color_manual(labels = c("Human-\naltered", "Natural"), values = c("#F8766D", "#00BFC4")) + guides(colour = guide_legend(title = "Sample type")) p p <- ggplot(data = temp.summary ) + theme_bw() + geom_point(aes(x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat) ) + geom_density_2d(data = temp.summary[which(temp.summary$alt_vs_nat=="natural"), ], aes( x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat), alpha=.4) + geom_density_2d(data = temp.summary[which(temp.summary$alt_vs_nat=="altered"), ], aes( x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat), alpha=.4) + labs(x="OTU relative abundance of top 10 decreasing\nbacteria with Mt Bold restoration (%)", y="OTU relative abundance of top 10 increasing\nbacteria with Mt Bold restoration (%)" ) + scale_color_manual(labels = c("Human-altered", "Natural"), values = c("#F8766D", "#00BFC4")) + theme(legend.position="bottom") + guides(colour = guide_legend(title = "Sample type")) p getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" ggsave(plot=p, filename = paste0("finished-plots/","Rel-Abun-top10Inc-vs-top10Dec-ALT-vs-NAT-vFINAL.tiff"), width = 11, height = 11, units = "cm", dpi = 600, compression = "lzw") ### Repeat this plot, but also include Mt Bold surface samples # note surface sample phyloseq object for Mt Bold has been prepared in code below!! mtbold_surf.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 3238 taxa and 24 samples ] # sample_data() Sample Data: [ 24 samples by 69 sample variables ] # tax_table() Taxonomy Table: [ 3238 taxa by 7 taxonomic ranks ] ### RELATIVE ABUNDANCE relabun.mtbold_surf.16s <- transform_sample_counts(mtbold_surf.16s, function(x) x / sum(x) ) relabun.mtbold_surf.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 3238 taxa and 24 samples ] # sample_data() Sample Data: [ 24 samples by 69 sample variables ] # tax_table() Taxonomy Table: [ 3238 taxa by 7 taxonomic ranks ] genera.mtbold_surf.16s <- levels(as.factor( get_taxa_unique(relabun.mtbold_surf.16s, taxonomic.rank = "genus" ))) genera.mtbold_surf.16s all_genera <- genera.mtbold_surf.16s phy_obj <- relabun.mtbold_surf.16s # store results out <- list() for (g in 1:length(all_genera)) { # this_genus <- all_genera[g] # as.character(all_genera[g]) out[[ this_genus ]] <- list() # this_genus ## for genus = "unclassified" - get next available taxonomic info if (this_genus == "unclassified") { # work through taxonomic table to find all genus = "unclassified" sel_unclass <- which(tax_table(phy_obj)[ ,"genus"] == "unclassified" ) # ranks <- c("family","order","class","phylum","kingdom") ## continue loop until all "unclassified" genera accounted for while (length(sel_unclass) > 0) { # find next available taxonomic classification # start with first taxonomic rank, and work up idx_start <- min(sel_unclass) idx_rank <- 1 this_rank <- ranks[idx_rank] this_taxaname <- as.character( tax_table(phy_obj)[ idx_start , this_rank ] ) while (this_taxaname == "unclassified") { idx_rank <- idx_rank +1 this_rank <- ranks[idx_rank] this_taxaname <- as.character( tax_table(phy_obj)[ idx_start , this_rank ] ) } this_fullname <- paste0("unclassified (",this_rank,": ",gsub(pattern=".__",x=this_taxaname,replacement=""),")") # gsub() to tidy-up label if (this_rank == "family") {sel_done <- which(tax_table(phy_obj)[ sel_unclass, this_rank] == this_taxaname )} if (this_rank == "order") {sel_done <- which(tax_table(phy_obj)[ sel_unclass, this_rank] == this_taxaname & tax_table(phy_obj)[ sel_unclass, "family"] == "unclassified" )} if (this_rank == "class") {sel_done <- which(tax_table(phy_obj)[ sel_unclass, this_rank] == this_taxaname & tax_table(phy_obj)[ sel_unclass, "order" ] == "unclassified" & tax_table(phy_obj)[ sel_unclass, "family"] == "unclassified" )} if (this_rank == "phylum") {sel_done <- which(tax_table(phy_obj)[ sel_unclass, this_rank] == this_taxaname & tax_table(phy_obj)[ sel_unclass, "class"] == "unclassified" & tax_table(phy_obj)[ sel_unclass, "order" ] == "unclassified" & tax_table(phy_obj)[ sel_unclass, "family"] == "unclassified" )} keep_taxa <- row.names( tax_table(phy_obj) )[ sel_unclass[sel_done] ] subsel <- prune_taxa(phy_obj, taxa = keep_taxa ) # # # # routine for "unclassified" genera only # # # # df <- data.frame( sample = sample_names( subsel ), #alt_vs_nat = subsel@sam_data$alt_vs_nat, alt_vs_nat = subsel@sam_data$Reveg_age, # modify this to capture reveg age rel_abun = sample_sums( subsel ) ) df$genus <- this_fullname out[[ this_fullname ]] <- list() out[[ this_fullname ]][["df"]] <- df # store OTUs out[[ this_fullname ]][["otus"]] <- taxa_names(subsel) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sel_unclass <- sel_unclass[-c(sel_done)] print(paste0("finished unclassified group: ",this_fullname)) } # END while loop ## now for all classified genera } else { ##g<-2 # TESTING sel_keep_taxa <- which(tax_table(phy_obj)[ ,"genus"] == this_genus ) keep_taxa <- row.names( tax_table(phy_obj) )[sel_keep_taxa] subsel <- prune_taxa(phy_obj, taxa = keep_taxa ) df <- data.frame( sample = sample_names( subsel ), #alt_vs_nat = subsel@sam_data$alt_vs_nat, alt_vs_nat = subsel@sam_data$Reveg_age, rel_abun = sample_sums( subsel ) ) df$genus <- this_genus out[[ this_genus ]][["df"]] <- df # store OTUs out[[ this_genus ]][["otus"]] <- taxa_names(subsel) print(paste0("finished genus # ",g, " = ",this_genus)) } # END else for classified genera } # END for all_genera[g] names(out) length(names(out)) # 400 length(unique(names(out))) # 400 saveRDS(out, file = "out__rel_abun_data_MtBold_surf_samples.RDS") #out <- readRDS("out__rel_abun_data_MtBold_surf_samples.RDS") ### Examine 2-d index of cumulative % top increasing / decreasing genera out.Aust <- readRDS("out__rel_abun_data_Aust_wide_samples.RDS") out.mtBold <- readRDS("out__rel_abun_data_MtBold_surf_samples.RDS") for (i in 1:length(c(top10_inc$genus,top10_dec$genus))) { #i<-12 df <- out.Aust[[ c(as.character(top10_inc$genus),as.character(top10_dec$genus))[i] ]][["df"]] df$rel_abun <- 100*df$rel_abun df$inc_or_dec <- ifelse(test = i<=10, yes = "increasing with restoration", no = "decreasing with restoration") # collate outputs into single dataframe if (i==1) {temp <- df} if (i>1) {temp <- rbind(temp,df)} print(paste0("completed genus ",i," of ",length(c(top10_inc$genus,top10_dec$genus)))) } for (i in 1:length(c(top10_inc$genus,top10_dec$genus))) { #i<-12 df2 <- out.mtBold[[ c(as.character(top10_inc$genus),as.character(top10_dec$genus))[i] ]][["df"]] df2$rel_abun <- 100*df2$rel_abun df2$inc_or_dec <- ifelse(test = i<=10, yes = "increasing with restoration", no = "decreasing with restoration") # collate outputs into single dataframe if (i==1) {temp2 <- df2} if (i>1) {temp2 <- rbind(temp2,df2)} print(paste0("completed genus ",i," of ",length(c(top10_inc$genus,top10_dec$genus)))) } ## summarise rel abun by sample of increasing genera vs decreasing genera names(temp) #"sample" "alt_vs_nat" "rel_abun" "genus" "inc_or_dec" length(unique(temp$sample)) # 217 length(unique(temp$genus)) # 20 unique(temp$inc_or_dec) # "increasing with restoration" "decreasing with restoration" names(temp2) head(temp2) # sample alt_vs_nat rel_abun genus inc_or_dec # 2009.1.10 2009.1.10 6 years 0.3059924 g__DA101 increasing with restoration # 2009.2.10 2009.2.10 6 years 0.2866288 g__DA101 increasing with restoration # 2009.3.10 2009.3.10 6 years 0.2902295 g__DA101 increasing with restoration # neg.1.10 neg.1.10 Cleared 0.4495936 g__DA101 increasing with restoration # neg.2.10 neg.2.10 Cleared 0.2478872 g__DA101 increasing with restoration # neg.3.10 neg.3.10 Cleared 0.5011301 g__DA101 increasing with restoration ## reduce to mean of each site (merge triplicates) temp2$sample <- paste0(temp2$alt_vs_nat,temp2$genus) length(unique(temp2$sample)) # 160 # = 480/3 temp2_sum <- data.frame(sample = unique(temp2$sample), alt_vs_nat = NA, rel_abun = NA, genus = NA, inc_or_dec = NA) for (i in 1:length(temp2_sum$sample)) { # i<-1 sel <- which(temp2$sample == temp2_sum$sample[i]) temp2_sum$alt_vs_nat[i] <- as.character( unique( temp2$alt_vs_nat[sel] )) temp2_sum$rel_abun[i] <- mean( temp2$rel_abun[sel] ) temp2_sum$genus[i] <- as.character( unique( temp2$genus[sel] )) temp2_sum$inc_or_dec[i] <- as.character( unique( temp2$inc_or_dec[sel] )) } temp2_sum$sample <- temp2_sum$alt_vs_nat ## join rel abun data from Aust-wide and Mt Bold surface samples str(temp) # 'data.frame': 4340 obs. of 5 variables: # $ sample : Factor w/ 217 levels "X12424","X12426",..: 1 2 3 4 5 6 7 8 9 10 ... # $ alt_vs_nat: Factor w/ 2 levels "altered","natural": 2 2 2 2 2 2 2 2 2 2 ... # $ rel_abun : num 1.552 1.773 2.086 1.151 0.791 ... # $ genus : chr "g__DA101" "g__DA101" "g__DA101" "g__DA101" ... # $ inc_or_dec: chr "increasing with restoration" "increasing with restoration" "increasing with restoration" "increasing with restoration" ... str(temp2_sum) # 'data.frame': 160 obs. of 5 variables: # $ sample : chr "6 years" "Cleared" "10 years" "7 years" ... # $ alt_vs_nat: chr "6 years" "Cleared" "10 years" "7 years" ... # $ rel_abun : num 0.294 0.4 1.263 0.436 0.601 ... # $ genus : chr "g__DA101" "g__DA101" "g__DA101" "g__DA101" ... # $ inc_or_dec: chr "increasing with restoration" "increasing with restoration" "increasing with restoration" "increasing with restoration" ... temp$sample <- as.character(temp$sample) temp$alt_vs_nat <- as.character(temp$alt_vs_nat) temp2_sum$sample <- as.character(temp2_sum$sample) temp2_sum$alt_vs_nat <- as.character(temp2_sum$alt_vs_nat) temp.both <- rbind(temp,temp2_sum) dim(temp.both) # 4500 5 length(unique(temp.both$sample)) # 225 length(unique(temp.both$genus)) # 20 unique(temp.both$inc_or_dec) # "increasing with restoration" "decreasing with restoration" head(temp.both) tail(temp.both) str(temp.both) # 'data.frame': 4500 obs. of 5 variables: # $ sample : chr "X12424" "X12426" "X12428" "X12430" ... # $ alt_vs_nat: chr "natural" "natural" "natural" "natural" ... # $ rel_abun : num 1.552 1.773 2.086 1.151 0.791 ... # $ genus : chr "g__DA101" "g__DA101" "g__DA101" "g__DA101" ... # $ inc_or_dec: chr "increasing with restoration" "increasing with restoration" "increasing with restoration" "increasing with restoration" ... # create template to store cumulative rel abun data temp.summary <- data.frame(sample = unique(temp.both$sample)) temp.summary$alt_vs_nat <- NA temp.summary$inc_rel_abun <- NA temp.summary$dec_rel_abun <- NA length(unique(temp.summary$sample)) # 225 str(temp.summary) # 'data.frame': 225 obs. of 4 variables: # $ sample : Factor w/ 225 levels "10 years","6 years",..: 9 10 11 12 13 14 15 16 17 18 ... # $ alt_vs_nat : logi NA NA NA NA NA NA ... # $ inc_rel_abun: logi NA NA NA NA NA NA ... # $ dec_rel_abun: logi NA NA NA NA NA NA ... temp.summary$sample <- as.character(temp.summary$sample) ## summaring increasing genera sel <- which(temp.both$inc_or_dec == "increasing with restoration") # qty 2250 for (j in 1:length(temp.summary$sample)) { #j<-218 idx_samps <- which(temp.both$sample[sel] == temp.summary$sample[j]) temp.summary$inc_rel_abun[j] <- sum(temp.both$rel_abun[ sel[idx_samps] ]) temp.summary$alt_vs_nat[j] <- unique(temp.both$alt_vs_nat[ sel[idx_samps] ]) #temp.summary$sample[j] <- unique(temp.both$sample[ sel[idx_samps] ]) } ## summaring decreasing genera sel <- which(temp.both$inc_or_dec == "decreasing with restoration") # qty 2250 for (j in 1:length(temp.summary$sample)) { #j<-1 idx_samps <- which(temp.both$sample[sel] == temp.summary$sample[j]) temp.summary$dec_rel_abun[j] <- sum(temp.both$rel_abun[ sel[idx_samps] ]) temp.summary$alt_vs_nat[j] <- unique(temp.both$alt_vs_nat[ sel[idx_samps] ]) #temp.summary$sample[j] <- unique(temp.both$sample[ sel[idx_samps] ]) } plot(temp.summary$dec_rel_abun, temp.summary$inc_rel_abun, col=factor(temp.summary$alt_vs_nat)) ## ggplot cols1 <- c("altered" = "#F8766D", "natural" = "#00BFC4") cols2 <- c("Cleared" = "#e31a1c", "6 years" = "#addd8e", "7 years" = "#78c679", "8 years" = "#41ab5d", "10 years"= "#238443", "Remnant A" = "#4292c6", "Remnant B" = "#2171b5", "Remnant C" = "#084594" ) cols <- c("altered" = "#F8766D", "natural" = "#00BFC4", "Cleared" = "#e31a1c", "6 years" = "#addd8e", "7 years" = "#78c679", "8 years" = "#41ab5d", "10 years"= "#238443", "Remnant A" = "#4292c6", "Remnant B" = "#2171b5", "Remnant C" = "#084594" ) p <- ggplot(data = temp.summary ) + theme_bw() + #labs(title = "(a)") + geom_point(aes(x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat, shape = alt_vs_nat), data = temp.summary[which(temp.summary$alt_vs_nat %in% names(cols1)), ] ) + geom_point(aes(x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat), size=3, shape=16, data = temp.summary[which(temp.summary$sample %in% names(cols2)), ] , show.legend = FALSE ) + geom_text_repel(aes(x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat), data = temp.summary[which(temp.summary$sample %in% names(cols2)), ], label= temp.summary[which(temp.summary$sample %in% names(cols2)), "alt_vs_nat"], size=2.5, segment.colour="grey") + # red colour="#d73027", geom_density_2d(data = temp.summary[which(temp.summary$alt_vs_nat=="natural"), ], aes( x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat), alpha=.4) + geom_density_2d(data = temp.summary[which(temp.summary$alt_vs_nat=="altered"), ], aes( x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat), alpha=.4) + labs(x="OTU relative abundance of top 10 decreasing\nbacteria with Mt Bold restoration (%)", y="OTU relative abundance of top 10 increasing\nbacteria with Mt Bold restoration (%)" ) + scale_color_manual(values = cols, guide = FALSE) + scale_shape_manual(values = c(0, 3),name ="Australia-wide samples", labels = c("Human-altered", "Natural") ) + theme(legend.position="bottom") p getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" ggsave(plot=p, filename = paste0("finished-plots/","Rel-Abun-top10Inc-vs-top10Dec-ALT-vs-NAT-with-MtBold-vFINAL.tiff"), width = 12, height = 12, units = "cm", dpi = 600, compression = "lzw") ## Modified plot p <- ggplot(data = temp.summary ) + theme_bw() + geom_point(aes(x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat, shape = alt_vs_nat), data = temp.summary[which(temp.summary$alt_vs_nat %in% names(cols1)), ] ) + geom_point(aes(x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat), size=3.5, shape=16, data = temp.summary[which(temp.summary$sample %in% names(cols2)), ] , show.legend = FALSE ) + geom_text_repel(aes(x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat), data = temp.summary[which(temp.summary$sample %in% names(cols2)), ], label= temp.summary[which(temp.summary$sample %in% names(cols2)), "alt_vs_nat"], size=2.75, segment.colour="grey") + # red colour="#d73027", geom_density_2d(data = temp.summary[which(temp.summary$alt_vs_nat=="natural"), ], aes( x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat), alpha=.4) + geom_density_2d(data = temp.summary[which(temp.summary$alt_vs_nat=="altered"), ], aes( x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat), alpha=.4) + labs(x="OTU relative abundance of top 10 decreasing\nbacteria with Mt Bold restoration (%)", y="OTU relative abundance of top 10 increasing\nbacteria with Mt Bold restoration (%)" ) + scale_color_manual(values = cols, guide = FALSE) + # scale_shape_manual(values = c(0, 3),name =" Australia-wide samples", labels = c("Human-altered", "Natural") ) + scale_x_reverse() + theme(legend.position=c(0.2,0.9), legend.background = element_blank(), legend.title = element_text(size=10), #legend.text = element_text(margin = margin(t=-5, b=-5, unit = "pt")), legend.spacing.y = unit(0.25,"line"), legend.key.height=unit(0.8,"line"), panel.grid.major = element_blank(), panel.grid.minor = element_blank() ) p getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" ggsave(plot=p, filename = paste0("finished-plots/","Rel-Abun-top10Inc-vs-top10Dec-ALT-vs-NAT-with-MtBold-vFINAL.tiff"), width = 12, height = 12, units = "cm", dpi = 600, compression = "lzw") ## Plot for graphical abstract p <- ggplot(data = temp.summary ) + theme_bw() + geom_point(aes(x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat, shape = alt_vs_nat), data = temp.summary[which(temp.summary$alt_vs_nat %in% names(cols1)), ] ) + geom_point(aes(x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat), size=3.5, shape=16, data = temp.summary[which(temp.summary$sample %in% names(cols2)), ] , show.legend = FALSE ) + geom_text_repel(aes(x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat), data = temp.summary[which(temp.summary$sample %in% names(cols2)), ], label= temp.summary[which(temp.summary$sample %in% names(cols2)), "alt_vs_nat"], size=2.75, segment.colour="grey") + # red colour="#d73027", geom_density_2d(data = temp.summary[which(temp.summary$alt_vs_nat=="natural"), ], aes( x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat), alpha=.4) + geom_density_2d(data = temp.summary[which(temp.summary$alt_vs_nat=="altered"), ], aes( x= dec_rel_abun, y=inc_rel_abun, colour=alt_vs_nat), alpha=.4) + labs(x="% Opportunistic taxa", y="% Niche-adapted taxa" ) + scale_color_manual(values = cols, guide = FALSE) + # scale_shape_manual(values = c(0, 3),name =" Australia-wide samples", labels = c("Disturbed", "Natural") ) + scale_x_reverse() + theme(legend.position=c(0.25,0.9), legend.background = element_blank(), legend.title = element_text(size=10), #legend.text = element_text(margin = margin(t=-5, b=-5, unit = "pt")), legend.spacing.y = unit(0.25,"line"), legend.key.height=unit(0.8,"line"), axis.title = element_text(size=14), axis.title.x = element_text(color = "#cb181d"), axis.title.y = element_text(color = "#2171b5"), axis.ticks.length=unit(-3, "pt"), axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt")), # , size = rel(0.8), vjust=0.5, axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 5,"pt")), # , size = rel(0.8) panel.grid.major = element_blank(), panel.grid.minor = element_blank() ) p getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" ggsave(plot=p, filename = paste0("finished-plots/","Rel-Abun-top10Inc-vs-top10Dec-ALT-vs-NAT-with-MtBold-vFINAL-Graph-Abs.tiff"), width = 10, height = 10, units = "cm", dpi = 600, compression = "lzw") # # # # # # # # # # # Test hypothesis that 2-d cumulative rel abun of top10 inc/dec bacteria vary (with different centroids) by human-altered vs natural groups names(temp.summary) # "sample" "alt_vs_nat" "inc_rel_abun" "dec_rel_abun" # exclude Mt Bold samples from this analysis temp.summary$sample sel.rm <- which(temp.summary$sample %in% c("6 years", "Cleared", "10 years", "7 years", "8 years", "Remnant A", "Remnant B", "Remnant C")) # Calculate distance matrix - "euclidean" set.seed(123) test <- vegan::adonis(formula = temp.summary[-sel.rm, ] ~ alt_vs_nat, data =temp.summary[-sel.rm, ], method = "euclidean") test$aov.tab # Permutation: free # Number of permutations: 999 # # Terms added sequentially (first to last) # # Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) # alt_vs_nat 1 7132.1 7132.1 119.33 0.35692 0.001 *** # Residuals 215 12850.5 59.8 0.64308 # Total 216 19982.5 1.00000 # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Homogeneity of dispersion test set.seed(123) beta <- betadisper(vegdist(x=temp.summary[-sel.rm, ], method = "euclidean"), temp.summary$alt_vs_nat[-sel.rm]) permutest(beta) # Permutation test for homogeneity of multivariate dispersions # Permutation: free # Number of permutations: 999 # # Response: Distances # Df Sum Sq Mean Sq F N.Perm Pr(>F) # Groups 1 214.6 214.574 9.1395 999 0.003 ** # Residuals 215 5047.7 23.478 # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # These outputs tell us that our adonis test is significant so we can reject the null hypothesis # that 2-d distribution of cumulative relative abundances have the same centroid between human-altered and natural groups. # However, betadisper results are significant, meaning we can reject the null hypothesis that # human-altered and natural groups have the same dispersions. # Although, homogeneity of dispersions are different, they visually group in different zones of the plot, lending support # to the significant adonis permanova result. # #------------------------- #### Examine patterns for human-associated taxa? #------------------------- ## look for this list bact <- c( "Achromobacter", "Acinetobacter","Actinomadura","Actinomyces", "Actinoplanes","Aeromonas", "Akkermansia","Alcaligenes", "Alistipes", "Amycolatopsis", "Bacillus", "Bacteroides", "Bifidobacterium", "Blautia", "Borrelia","Burkholderia","Campylobacter","Chromobacterium","Chryseobacterium", "Chryseomonas", "Clostridium", "Coprococcus","Coxiella","Cytophaga","Enterobacter", "Enterococcus","Escherichia", "Eubacterium", "Faecalibacterium", "Flavimonas","Flavobacterium","Francisella","Herbaspirillum","Janthinobacterium", "Klebsiella","Kluyvera", "Lactobacillus","Legionella","Leptospira","Listeria", "Mycobacterium","Nocardia","Ochrobactrum", "Pantoea", "Prevotella", "Proteus","Pseudomonas","Ralstonia","Rhodococcus","Rickettsia", "Ruminococcus","Salmonella", # Blautia formerly known as Ruminococcus "Serratia","Shigella","Sphingobacterium","Sphingomonas","Staphylococcus","Stenotrophomonas", "Streptococcus","Streptomyces","Thermoactinomyces","Yersinia" ) ### examine all DESeq2 results human <- data.frame(bact=bact, genus=NA, phylum=NA, mean_cor=NA, mean_log2FoldChange=NA ) human$genus <- paste0("g__",human$bact) for (i in 1:length(bact)) { #i<-1 sel <- which(all_bs_summaries$genus == human$genus[i]) if (length(sel)>=1) { human$mean_cor[i] <- mean( all_bs_summaries$r_0_10[sel], na.rm=TRUE ) } sel <- which(res_deseq_ALL$genus == human$genus[i]) if (length(sel)>=1) { human$mean_log2FoldChange[i] <- mean( res_deseq_ALL$log2FoldChange[sel], na.rm=TRUE ) human$phylum[i] <- unique(as.character(res_deseq_ALL$phylum[sel])) human$phylum[i] <- gsub(pattern="p__", replacement="", x=human$phylum[i]) } } # remove NA cases ok <- complete.cases(human) sel <- which(ok==TRUE) human <- human[sel, ] dim(human) # 20 5 human # bact genus phylum mean_cor mean_log2FoldChange # 1 Achromobacter g__Achromobacter Proteobacteria -0.45169312 -3.22878738 # 3 Actinomadura g__Actinomadura Actinobacteria 0.32147310 0.86597865 # 10 Amycolatopsis g__Amycolatopsis Actinobacteria 0.02582253 0.33362689 # 11 Bacillus g__Bacillus Firmicutes -0.94752922 -1.03569467 # 12 Bacteroides g__Bacteroides Bacteroidetes -0.28837581 -2.88117169 # 16 Burkholderia g__Burkholderia Proteobacteria 0.11877657 2.05008585 # 19 Chryseobacterium g__Chryseobacterium Bacteroidetes -0.14278793 -0.92344929 # 21 Clostridium g__Clostridium Firmicutes -0.80668595 -1.81145443 # 22 Coprococcus g__Coprococcus Firmicutes -0.82163780 -1.48996543 # 25 Enterobacter g__Enterobacter Proteobacteria -0.21462042 -0.87595237 # 31 Flavobacterium g__Flavobacterium Bacteroidetes -0.24596935 -0.92338957 # 34 Janthinobacterium g__Janthinobacterium Proteobacteria -0.56040745 0.44141970 # 38 Legionella g__Legionella Proteobacteria -0.47425225 -0.45675638 # 41 Mycobacterium g__Mycobacterium Actinobacteria 0.38789734 1.31342804 # 47 Pseudomonas g__Pseudomonas Proteobacteria -0.16866722 -0.91381716 # 49 Rhodococcus g__Rhodococcus Actinobacteria -0.64747067 -1.20977743 # 55 Sphingobacterium g__Sphingobacterium Bacteroidetes -0.15847743 -1.44425653 # 56 Sphingomonas g__Sphingomonas Proteobacteria 0.56449348 0.09559622 # 58 Stenotrophomonas g__Stenotrophomonas Proteobacteria 0.08635399 -1.94943014 # 60 Streptomyces g__Streptomyces Actinobacteria -0.60905889 -1.18303646 str(human) human$bact <- as.character(human$bact) human$bact <- factor(human$bact) human$phylum <- factor(human$phylum) str(human) # 'data.frame': 20 obs. of 5 variables: # $ bact : Factor w/ 20 levels "Achromobacter",..: 1 2 3 4 5 6 7 8 9 10 ... # $ genus : chr "g__Achromobacter" "g__Actinomadura" "g__Amycolatopsis" "g__Bacillus" ... # $ phylum : Factor w/ 4 levels "Actinobacteria",..: 4 1 1 3 2 4 2 3 3 4 ... # $ mean_cor : num -0.4517 0.3215 0.0258 -0.9475 -0.2884 ... # $ mean_log2FoldChange: num -3.229 0.866 0.334 -1.036 -2.881 ... levels(human$phylum) # "Actinobacteria" "Bacteroidetes" "Firmicutes" "Proteobacteria" set.seed(123) p <- ggplot(data = human ) + theme_bw() + geom_point(aes(x= mean_cor, y=mean_log2FoldChange, colour=phylum) ) + labs(x="Correlation with reveg age (Mt Bold)", y="Log2-fold-change for OTUs (altered vs natural)" ) + geom_vline(xintercept=0, col="dark grey", linetype="dashed") + geom_hline(yintercept=0, col="dark grey", linetype="dashed") + geom_text(x=0.08, y=1.57, label="Natural", hjust=0, vjust=0, size=4.5, col="#00BFC4", fontface = "bold") + #geom_text(x=0.04, y=0.9, label="Natural", hjust=0, vjust=0, size=4.5, col="#00BFC4", fontface = "bold") + geom_text(x=-0.8, y=-2.5, label="Disturbed", hjust=0, vjust=0, size=4.5, col="#F8766D", fontface = "bold") + geom_text_repel(data=human, aes(x= mean_cor, y=mean_log2FoldChange), label=human$bact, size=2, segment.colour="grey") + scale_colour_manual(name = "Phylum", values = c("Actinobacteria" = "#d73027", "Bacteroidetes" = "#66bd63", "Firmicutes" = "#3288bd", "Proteobacteria" = "#bc80bd") ) + theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank() ) p getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" ggsave(plot=p, filename = paste0("finished-plots/","Human-associated-Cor-with-reveg-age-vs-Log2-fold-change-ALT-vs-NAT-vFINAL.tiff"), width = 11, height = 9, units = "cm", dpi = 600, compression = "lzw") write.csv(x=human, file="human-associated-taxa-results-Cor--vs-log2FoldChange-vFINAL.csv", row.names = FALSE) ## identify which appear to be increasing and decreasing with restoration? ## increasing sel <- which(human$mean_cor >0 & human$mean_log2FoldChange >0) human.inc <- human$genus[sel] human.inc # "g__Actinomadura" "g__Amycolatopsis" "g__Burkholderia" "g__Mycobacterium" "g__Sphingomonas" ## decreasing sel <- which(human$mean_cor <0 & human$mean_log2FoldChange <0) human.dec <- human$genus[sel] human.dec # [1] "g__Achromobacter" "g__Bacillus" "g__Bacteroides" "g__Chryseobacterium" "g__Clostridium" # [6] "g__Coprococcus" "g__Enterobacter" "g__Flavobacterium" "g__Legionella" "g__Pseudomonas" # [11] "g__Rhodococcus" "g__Sphingobacterium" "g__Streptomyces" ## repeat comparison relative abundance and one-sided Wilcox rank-sum tests from previous section ## increasing for (i in 1:length(human.inc)) { df <- out[[ as.character(human.inc[i]) ]][["df"]] df$rel_abun <- 100*df$rel_abun filename <- gsub(pattern = "\\(|\\)|\\:", replacement = "_", x = human.inc[i] ) grDevices::tiff(file=paste0("finished-plots/","Rel-abun-comparison-Human-associated-Increasing-",i,"-",filename,".tif"), width = 12, height = 12, units = "cm", res = 600, compression = "lzw") boxplot( rel_abun ~ alt_vs_nat, data = df, ylab="Relative abundance (%)") ## one-sided test ## https://stats.stackexchange.com/questions/201125/understanding-the-wilcoxon-rank-sum-one-sided-test wt <- wilcox.test(rel_abun ~ alt_vs_nat, alternative="less", conf.int=FALSE, data=df) mtext( text = sub(pattern = ".__", replacement = "", human.inc[i]), side = 3, line=1, adj=0 , cex=1.3 ) mtext( text = paste0("Increases with restoration at Mt Bold #",i), side = 3, line=0.1, adj=0 , cex=0.8 ) if (wt$p.value<0.001) { text(1, par("usr")[3] + 0.8*(par("usr")[4]-par("usr")[3]), paste0("Wilcoxon rank-sum\none-sided test\nW = ",wt$statistic,"; P < 0.001"), adj=c(0.5,0) , cex=0.8) } else { text(1, par("usr")[3] + 0.8*(par("usr")[4]-par("usr")[3]), paste0("Wilcoxon rank-sum\none-sided test\nW = ",wt$statistic,"; P = ", round(wt$p.value, digits = 3)), adj=c(0.5,0) , cex=0.8) } dev.off() } ## decreasing for (i in 1:length(human.dec)) { df <- out[[ as.character(human.dec[i]) ]][["df"]] df$rel_abun <- 100*df$rel_abun filename <- gsub(pattern = "\\(|\\)|\\:", replacement = "_", x = human.dec[i] ) grDevices::tiff(file=paste0("finished-plots/","Rel-abun-comparison-Human-associated-Decreasing-",i,"-",filename,".tif"), width = 12, height = 12, units = "cm", res = 600, compression = "lzw") boxplot( rel_abun ~ alt_vs_nat, data = df, ylab="Relative abundance (%)") ## one-sided test ## https://stats.stackexchange.com/questions/201125/understanding-the-wilcoxon-rank-sum-one-sided-test wt <- wilcox.test(rel_abun ~ alt_vs_nat, alternative="greater", conf.int=FALSE, data=df) mtext( text = sub(pattern = ".__", replacement = "", human.dec[i]), side = 3, line=1, adj=0 , cex=1.3 ) mtext( text = paste0("Decreases with restoration at Mt Bold #",i), side = 3, line=0.1, adj=0 , cex=0.8 ) if (wt$p.value<0.001) { text(2, par("usr")[3] + 0.8*(par("usr")[4]-par("usr")[3]), paste0("Wilcoxon rank-sum\none-sided test\nW = ",wt$statistic,"; P < 0.001"), adj=c(0.5,0) , cex=0.8) } else { text(2, par("usr")[3] + 0.8*(par("usr")[4]-par("usr")[3]), paste0("Wilcoxon rank-sum\none-sided test\nW = ",wt$statistic,"; P = ", round(wt$p.value, digits = 3)), adj=c(0.5,0) , cex=0.8) } dev.off() } # #------------------------- #### Summary table # i) Top 10 increasing # ii) Top 10 decreasing # iii) Human-associated increasing # iv) Human-associated decreasing #------------------------- ## run top 10 increasing/decreasing separate to human-associated # from above out <- readRDS("out__rel_abun_data_Aust_wide_samples.RDS") ## prepare res_deseq_ALL data frame with unclassified labels temp <- res_deseq_ALL res_deseq_ALL$genus_label <- res_deseq_ALL$genus res_deseq_ALL$genus_label <- sub(pattern="g__", x=res_deseq_ALL$genus_label, replacement="") res_deseq_ALL$OTU <- NA for (i in 1:length(res_deseq_ALL$genus_label)) { ranks <- c("genus","family","order","class","phylum","kingdom") idx_rank <- 1 # if "unclassified" if (res_deseq_ALL$genus_label[i] == "unclassified") { this_taxaname <- "unclassified" while (this_taxaname == "unclassified") { idx_rank <- idx_rank +1 this_rank <- ranks[idx_rank] this_taxaname <- as.character( res_deseq_ALL[ i , this_rank ] ) } this_fullname <- paste0("unclassified (",this_rank,": ",gsub(pattern=".__",x=this_taxaname,replacement=""),")") res_deseq_ALL$genus_label[i] <- this_fullname } # add OTUId res_deseq_ALL$OTU[i] <- paste0( row.names(res_deseq_ALL)[i]) } # # # # # # # ### i) Top 10 increasing top10_inc$genus # [1] g__DA101 g__Candidatus_Xiphinematobacter # [3] g__Bradyrhizobium g__Candidatus_Solibacter # [5] g__Candidatus_Koribacter unclassified (family: Rhodospirillaceae) # [7] g__Rhodopila g__Edaphobacter # [9] unclassified (order: Solibacterales) unclassified (family: [Leptospirillaceae]) ref <- top10_inc$genus tab.temp <- data.frame(genus_group=as.character(ref), genus_group_label=NA, phylum=NA, phylum_label=NA, cor_mean=NA, cor_lwr95ci=NA, cor_upr95ci=NA, mean_log2FoldChange=NA, relabun_median_Alt=NA, relabun_lwrIQR_Alt=NA, relabun_uprIQR_Alt=NA, relabun_median_Nat=NA, relabun_lwrIQR_Nat=NA, relabun_uprIQR_Nat=NA, expect_trend_restor=NA, Wilcox_W=NA, p_value=NA, n_otus=NA ) tab.temp$genus_group_label <- sub(pattern = "g__",replacement = "",x = tab.temp$genus_group) for (i in 1:length(ref)) { #i<-1 # testing ## get relative abundance info between Aust-wide altered vs natural samples this_genus <- as.character(ref[i]) df <- out[[ this_genus ]][["df"]] df$rel_abun <- 100*df$rel_abun # table(df$alt_vs_nat) # altered natural # 78 139 tab.temp$relabun_median_Alt[i] <- median(df$rel_abun[ which(df$alt_vs_nat=="altered") ]) tab.temp$relabun_lwrIQR_Alt[i] <- quantile(df$rel_abun[ which(df$alt_vs_nat=="altered") ], probs = 0.25 ) # , na.rm = TRUE tab.temp$relabun_uprIQR_Alt[i] <- quantile(df$rel_abun[ which(df$alt_vs_nat=="altered") ], probs = 0.75 ) # , na.rm = TRUE tab.temp$relabun_median_Nat[i] <- median(df$rel_abun[ which(df$alt_vs_nat=="natural") ]) tab.temp$relabun_lwrIQR_Nat[i] <- quantile(df$rel_abun[ which(df$alt_vs_nat=="natural") ], probs = 0.25 ) # , na.rm = TRUE tab.temp$relabun_uprIQR_Nat[i] <- quantile(df$rel_abun[ which(df$alt_vs_nat=="natural") ], probs = 0.75 ) # , na.rm = TRUE tab.temp$expect_trend_restor <- "Increasing" # UPDATE THIS !!!!!!!!!!!!!!!!!!!!!!!! ## one-sided test ## https://stats.stackexchange.com/questions/201125/understanding-the-wilcoxon-rank-sum-one-sided-test wt <- wilcox.test(rel_abun ~ alt_vs_nat, alternative="less", conf.int=FALSE, data=df) # "less" for increasing with restoration / "greater" for decreasing with restoration !!!!!!!!!! UPDATED THIS !!!!!!!!!!!!!! tab.temp$Wilcox_W[i] <- wt$statistic if (wt$p.value<0.001) { tab.temp$p_value[i] <- paste0("< 0.001") } else { tab.temp$p_value[i] <- paste0(round(wt$p.value,digits = 3)) } ## get correlation info from Mt Bold restoration sel <- which(as.character(all_bs_summaries$genus) == this_genus ) if (length(sel)>=1) { tab.temp$cor_mean[i] <- mean( all_bs_summaries$r_0_10[sel], na.rm=TRUE ) tab.temp$cor_lwr95ci[i] <- quantile( all_bs_summaries$r_0_10[sel], probs = .025, na.rm=TRUE ) tab.temp$cor_upr95ci[i] <- quantile( all_bs_summaries$r_0_10[sel], probs = .975, na.rm=TRUE ) tab.temp$n_otus[i] <- length( out[[ this_genus ]][["otus"]] ) } ## get log2FoldChange info between altered > natural if (grepl(pattern = "unclassified",x = this_genus)){ sel <- which( res_deseq_ALL$genus_label == this_genus ) } else { sel <- which( res_deseq_ALL$genus == this_genus ) } if (length(sel)>=1) { tab.temp$mean_log2FoldChange[i] <- mean( res_deseq_ALL$log2FoldChange[sel], na.rm=TRUE ) tab.temp$phylum[i] <- unique(as.character(res_deseq_ALL$phylum[sel])) tab.temp$phylum_label[i] <- sub(pattern="p__", replacement="", x=tab.temp$phylum[i]) } } tab.inc <- tab.temp tab.inc # # # # # # # # # # # # # # # ### ii) Top 10 decreasing top10_dec$genus # [1] g__Bacillus g__Rummeliibacillus # [3] unclassified (family: Actinospicaceae) unclassified (order: Ellin5290) # [5] g__Sporosarcina g__Cytophagales # [7] unclassified (family: Ellin5301) g__Ammoniphilus # [9] g__Flavisolibacter unclassified (class: C0119) ref <- top10_dec$genus tab.temp <- data.frame(genus_group=as.character(ref), genus_group_label=NA, phylum=NA, phylum_label=NA, cor_mean=NA, cor_lwr95ci=NA, cor_upr95ci=NA, mean_log2FoldChange=NA, relabun_median_Alt=NA, relabun_lwrIQR_Alt=NA, relabun_uprIQR_Alt=NA, relabun_median_Nat=NA, relabun_lwrIQR_Nat=NA, relabun_uprIQR_Nat=NA, expect_trend_restor=NA, Wilcox_W=NA, p_value=NA, n_otus=NA ) tab.temp$genus_group_label <- sub(pattern = "g__",replacement = "",x = tab.temp$genus_group) for (i in 1:length(ref)) { #i<-1 # testing ## get relative abundance info between Aust-wide altered vs natural samples this_genus <- as.character(ref[i]) df <- out[[ this_genus ]][["df"]] df$rel_abun <- 100*df$rel_abun # table(df$alt_vs_nat) # altered natural # 78 139 tab.temp$relabun_median_Alt[i] <- median(df$rel_abun[ which(df$alt_vs_nat=="altered") ]) tab.temp$relabun_lwrIQR_Alt[i] <- quantile(df$rel_abun[ which(df$alt_vs_nat=="altered") ], probs = 0.25 ) # , na.rm = TRUE tab.temp$relabun_uprIQR_Alt[i] <- quantile(df$rel_abun[ which(df$alt_vs_nat=="altered") ], probs = 0.75 ) # , na.rm = TRUE tab.temp$relabun_median_Nat[i] <- median(df$rel_abun[ which(df$alt_vs_nat=="natural") ]) tab.temp$relabun_lwrIQR_Nat[i] <- quantile(df$rel_abun[ which(df$alt_vs_nat=="natural") ], probs = 0.25 ) # , na.rm = TRUE tab.temp$relabun_uprIQR_Nat[i] <- quantile(df$rel_abun[ which(df$alt_vs_nat=="natural") ], probs = 0.75 ) # , na.rm = TRUE tab.temp$expect_trend_restor <- "Decreasing" # UPDATE THIS !!!!!!!!!!!!!!!!!!!!!!!! ## one-sided test ## https://stats.stackexchange.com/questions/201125/understanding-the-wilcoxon-rank-sum-one-sided-test wt <- wilcox.test(rel_abun ~ alt_vs_nat, alternative="greater", conf.int=FALSE, data=df) # "less" for increasing with restoration / "greater" for decreasing with restoration !!!!!!!!!! UPDATED THIS !!!!!!!!!!!!!! tab.temp$Wilcox_W[i] <- wt$statistic if (wt$p.value<0.001) { tab.temp$p_value[i] <- paste0("< 0.001") } else { tab.temp$p_value[i] <- paste0(round(wt$p.value,digits = 3)) } ## get correlation info from Mt Bold restoration sel <- which(as.character(all_bs_summaries$genus) == this_genus ) if (length(sel)>=1) { tab.temp$cor_mean[i] <- mean( all_bs_summaries$r_0_10[sel], na.rm=TRUE ) tab.temp$cor_lwr95ci[i] <- quantile( all_bs_summaries$r_0_10[sel], probs = .025, na.rm=TRUE ) tab.temp$cor_upr95ci[i] <- quantile( all_bs_summaries$r_0_10[sel], probs = .975, na.rm=TRUE ) tab.temp$n_otus[i] <- length( out[[ this_genus ]][["otus"]] ) } ## get log2FoldChange info between altered > natural if (grepl(pattern = "unclassified",x = this_genus)){ sel <- which( res_deseq_ALL$genus_label == this_genus ) } else { sel <- which( res_deseq_ALL$genus == this_genus ) } if (length(sel)>=1) { tab.temp$mean_log2FoldChange[i] <- mean( res_deseq_ALL$log2FoldChange[sel], na.rm=TRUE ) tab.temp$phylum[i] <- unique(as.character(res_deseq_ALL$phylum[sel])) tab.temp$phylum_label[i] <- sub(pattern="p__", replacement="", x=tab.temp$phylum[i]) } } tab.dec <- tab.temp tab.dec # # # # # # # # # # # # # # # ### iii) Human-associated increasing human.inc # "g__Actinomadura" "g__Amycolatopsis" "g__Burkholderia" "g__Mycobacterium" "g__Sphingomonas" ref <- human.inc tab.temp <- data.frame(genus_group=as.character(ref), genus_group_label=NA, phylum=NA, phylum_label=NA, cor_mean=NA, cor_lwr95ci=NA, cor_upr95ci=NA, mean_log2FoldChange=NA, relabun_median_Alt=NA, relabun_lwrIQR_Alt=NA, relabun_uprIQR_Alt=NA, relabun_median_Nat=NA, relabun_lwrIQR_Nat=NA, relabun_uprIQR_Nat=NA, expect_trend_restor=NA, Wilcox_W=NA, p_value=NA, n_otus=NA ) tab.temp$genus_group_label <- sub(pattern = "g__",replacement = "",x = tab.temp$genus_group) for (i in 1:length(ref)) { #i<-1 # testing ## get relative abundance info between Aust-wide altered vs natural samples this_genus <- as.character(ref[i]) df <- out[[ this_genus ]][["df"]] df$rel_abun <- 100*df$rel_abun # table(df$alt_vs_nat) # altered natural # 78 139 tab.temp$relabun_median_Alt[i] <- median(df$rel_abun[ which(df$alt_vs_nat=="altered") ]) tab.temp$relabun_lwrIQR_Alt[i] <- quantile(df$rel_abun[ which(df$alt_vs_nat=="altered") ], probs = 0.25 ) # , na.rm = TRUE tab.temp$relabun_uprIQR_Alt[i] <- quantile(df$rel_abun[ which(df$alt_vs_nat=="altered") ], probs = 0.75 ) # , na.rm = TRUE tab.temp$relabun_median_Nat[i] <- median(df$rel_abun[ which(df$alt_vs_nat=="natural") ]) tab.temp$relabun_lwrIQR_Nat[i] <- quantile(df$rel_abun[ which(df$alt_vs_nat=="natural") ], probs = 0.25 ) # , na.rm = TRUE tab.temp$relabun_uprIQR_Nat[i] <- quantile(df$rel_abun[ which(df$alt_vs_nat=="natural") ], probs = 0.75 ) # , na.rm = TRUE tab.temp$expect_trend_restor <- "Increasing" # UPDATE THIS !!!!!!!!!!!!!!!!!!!!!!!! ## one-sided test ## https://stats.stackexchange.com/questions/201125/understanding-the-wilcoxon-rank-sum-one-sided-test wt <- wilcox.test(rel_abun ~ alt_vs_nat, alternative="less", conf.int=FALSE, data=df) # "less" for increasing with restoration / "greater" for decreasing with restoration !!!!!!!!!! UPDATED THIS !!!!!!!!!!!!!! tab.temp$Wilcox_W[i] <- wt$statistic if (wt$p.value<0.001) { tab.temp$p_value[i] <- paste0("< 0.001") } else { tab.temp$p_value[i] <- paste0(round(wt$p.value,digits = 3)) } ## get correlation info from Mt Bold restoration sel <- which(as.character(all_bs_summaries$genus) == this_genus ) if (length(sel)>=1) { tab.temp$cor_mean[i] <- mean( all_bs_summaries$r_0_10[sel], na.rm=TRUE ) tab.temp$cor_lwr95ci[i] <- quantile( all_bs_summaries$r_0_10[sel], probs = .025, na.rm=TRUE ) tab.temp$cor_upr95ci[i] <- quantile( all_bs_summaries$r_0_10[sel], probs = .975, na.rm=TRUE ) tab.temp$n_otus[i] <- length( out[[ this_genus ]][["otus"]] ) } ## get log2FoldChange info between altered > natural if (grepl(pattern = "unclassified",x = this_genus)){ sel <- which( res_deseq_ALL$genus_label == this_genus ) } else { sel <- which( res_deseq_ALL$genus == this_genus ) } if (length(sel)>=1) { tab.temp$mean_log2FoldChange[i] <- mean( res_deseq_ALL$log2FoldChange[sel], na.rm=TRUE ) tab.temp$phylum[i] <- unique(as.character(res_deseq_ALL$phylum[sel])) tab.temp$phylum_label[i] <- sub(pattern="p__", replacement="", x=tab.temp$phylum[i]) } } tab.hum.inc <- tab.temp tab.hum.inc # # # # # # # # # # # # # # # ### iv) Human-associated decreasing human.dec # [1] "g__Achromobacter" "g__Bacillus" "g__Bacteroides" "g__Chryseobacterium" "g__Clostridium" # [6] "g__Coprococcus" "g__Enterobacter" "g__Flavobacterium" "g__Legionella" "g__Pseudomonas" # [11] "g__Rhodococcus" "g__Sphingobacterium" "g__Streptomyces" ref <- human.dec tab.temp <- data.frame(genus_group=as.character(ref), genus_group_label=NA, phylum=NA, phylum_label=NA, cor_mean=NA, cor_lwr95ci=NA, cor_upr95ci=NA, mean_log2FoldChange=NA, relabun_median_Alt=NA, relabun_lwrIQR_Alt=NA, relabun_uprIQR_Alt=NA, relabun_median_Nat=NA, relabun_lwrIQR_Nat=NA, relabun_uprIQR_Nat=NA, expect_trend_restor=NA, Wilcox_W=NA, p_value=NA, n_otus=NA ) tab.temp$genus_group_label <- sub(pattern = "g__",replacement = "",x = tab.temp$genus_group) for (i in 1:length(ref)) { #i<-1 # testing ## get relative abundance info between Aust-wide altered vs natural samples this_genus <- as.character(ref[i]) df <- out[[ this_genus ]][["df"]] df$rel_abun <- 100*df$rel_abun # table(df$alt_vs_nat) # altered natural # 78 139 tab.temp$relabun_median_Alt[i] <- median(df$rel_abun[ which(df$alt_vs_nat=="altered") ]) tab.temp$relabun_lwrIQR_Alt[i] <- quantile(df$rel_abun[ which(df$alt_vs_nat=="altered") ], probs = 0.25 ) # , na.rm = TRUE tab.temp$relabun_uprIQR_Alt[i] <- quantile(df$rel_abun[ which(df$alt_vs_nat=="altered") ], probs = 0.75 ) # , na.rm = TRUE tab.temp$relabun_median_Nat[i] <- median(df$rel_abun[ which(df$alt_vs_nat=="natural") ]) tab.temp$relabun_lwrIQR_Nat[i] <- quantile(df$rel_abun[ which(df$alt_vs_nat=="natural") ], probs = 0.25 ) # , na.rm = TRUE tab.temp$relabun_uprIQR_Nat[i] <- quantile(df$rel_abun[ which(df$alt_vs_nat=="natural") ], probs = 0.75 ) # , na.rm = TRUE tab.temp$expect_trend_restor <- "Decreasing" # UPDATE THIS !!!!!!!!!!!!!!!!!!!!!!!! ## one-sided test ## https://stats.stackexchange.com/questions/201125/understanding-the-wilcoxon-rank-sum-one-sided-test wt <- wilcox.test(rel_abun ~ alt_vs_nat, alternative="greater", conf.int=FALSE, data=df) # "less" for increasing with restoration / "greater" for decreasing with restoration !!!!!!!!!! UPDATED THIS !!!!!!!!!!!!!! tab.temp$Wilcox_W[i] <- wt$statistic if (wt$p.value<0.001) { tab.temp$p_value[i] <- paste0("< 0.001") } else { tab.temp$p_value[i] <- paste0(round(wt$p.value,digits = 3)) } ## get correlation info from Mt Bold restoration sel <- which(as.character(all_bs_summaries$genus) == this_genus ) if (length(sel)>=1) { tab.temp$cor_mean[i] <- mean( all_bs_summaries$r_0_10[sel], na.rm=TRUE ) tab.temp$cor_lwr95ci[i] <- quantile( all_bs_summaries$r_0_10[sel], probs = .025, na.rm=TRUE ) tab.temp$cor_upr95ci[i] <- quantile( all_bs_summaries$r_0_10[sel], probs = .975, na.rm=TRUE ) tab.temp$n_otus[i] <- length( out[[ this_genus ]][["otus"]] ) } ## get log2FoldChange info between altered > natural if (grepl(pattern = "unclassified",x = this_genus)){ sel <- which( res_deseq_ALL$genus_label == this_genus ) } else { sel <- which( res_deseq_ALL$genus == this_genus ) } if (length(sel)>=1) { tab.temp$mean_log2FoldChange[i] <- mean( res_deseq_ALL$log2FoldChange[sel], na.rm=TRUE ) tab.temp$phylum[i] <- unique(as.character(res_deseq_ALL$phylum[sel])) tab.temp$phylum_label[i] <- sub(pattern="p__", replacement="", x=tab.temp$phylum[i]) } } tab.hum.dec <- tab.temp tab.hum.dec ## join all results together tab.inc <- tab.inc[order(tab.inc$cor_mean, decreasing = TRUE), ] tab.dec <- tab.dec[order(tab.dec$cor_mean, decreasing = FALSE), ] tab.hum.inc <- tab.hum.inc[order(tab.hum.inc$cor_mean, decreasing = TRUE), ] tab.hum.dec <- tab.hum.dec[order(tab.hum.dec$cor_mean, decreasing = FALSE), ] tab <- rbind(tab.inc, tab.dec, tab.hum.inc, tab.hum.dec) names(tab) # [1] "genus_group" "genus_group_label" "phylum" "phylum_label" "cor_mean" # [6] "cor_lwr95ci" "cor_upr95ci" "mean_log2FoldChange" "relabun_median_Alt" "relabun_lwrIQR_Alt" # [11] "relabun_uprIQR_Alt" "relabun_median_Nat" "relabun_lwrIQR_Nat" "relabun_uprIQR_Nat" "expect_trend_restor" # [16] "Wilcox_W" "p_value" "n_otus" write.csv(tab,file = "table-of-summary-stats-top10-inc-dec-human-associated-in-dec-vFINAL.csv") tab.summary <- data.frame( Genus_or_group = tab$genus_group_label, Phylum = tab$phylum_label, cor_mean_95CI = paste0(round(tab$cor_mean,2)," (",round(tab$cor_lwr95ci,2),", ",round(tab$cor_upr95ci,2),")"), mean_log2_fold_change = round(tab$mean_log2FoldChange,2), Alt_relabun_med_IQR = paste0(round(tab$relabun_median_Alt,3)," (",round(tab$relabun_lwrIQR_Alt,3),", ",round(tab$relabun_uprIQR_Alt,3),")"), Nat_relabun_med_IQR = paste0(round(tab$relabun_median_Nat,3)," (",round(tab$relabun_lwrIQR_Nat,3),", ",round(tab$relabun_uprIQR_Nat,3),")"), Expect_trend_restor = tab$expect_trend_restor, W = round(tab$Wilcox_W,0), P_value = tab$p_value, No_OTUs = tab$n_otus ) sel <- which(tab.summary$P_value == "< 0.001" | as.numeric(as.character(tab.summary$P_value)) <= 0.05) tab.summary$P_value[sel] tab.summary$Expect_trend_restor <- as.character(tab.summary$Expect_trend_restor) tab.summary$Expect_trend_restor[sel] <- paste0(tab.summary$Expect_trend_restor[sel],"*") write.csv(tab.summary,file = "TIDIED-table-of-summary-stats-top10-inc-dec-human-associated-in-dec-vFINAL.csv") #------------------------- #### Visualisation of microbiota - comparing Mt Bold to Aust-wide #------------------------- ## Mt Bold phyloseq object for 16s data phy.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 3282 taxa and 48 samples ] # sample_data() Sample Data: [ 48 samples by 69 sample variables ] # tax_table() Taxonomy Table: [ 3282 taxa by 7 taxonomic ranks ] min(taxa_sums(phy.16s)) # 100 ## Aust-wide phyloseq object for 16s data (selected human-altered and natural samples) base.d16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 12413 taxa and 217 samples ] # sample_data() Sample Data: [ 217 samples by 24 sample variables ] # tax_table() Taxonomy Table: [ 12413 taxa by 7 taxonomic ranks ] min(taxa_sums(base.d16s)) # 100 mtbold_surf.16s <- prune_samples(phy.16s@sam_data$`Soil Depth (cm)` == 0, x = phy.16s) mtbold_surf.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 3282 taxa and 24 samples ] # sample_data() Sample Data: [ 24 samples by 69 sample variables ] # tax_table() Taxonomy Table: [ 3282 taxa by 7 taxonomic ranks ] min(taxa_sums(mtbold_surf.16s)) # 0 # after removing deeper samples, now there are some taxa that are not represented mtbold_surf.16s <- prune_taxa(taxa = taxa_sums(mtbold_surf.16s) > 0 , x = mtbold_surf.16s) mtbold_surf.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 3238 taxa and 24 samples ] # sample_data() Sample Data: [ 24 samples by 69 sample variables ] # tax_table() Taxonomy Table: [ 3238 taxa by 7 taxonomic ranks ] min(taxa_sums(mtbold_surf.16s)) # 1 # decided not to further prune taxa at this point (as rare taxa had been pruned previously) mtbold_and_austwide.16s <- merge_phyloseq(mtbold_surf.16s,base.d16s) min(taxa_sums(mtbold_and_austwide.16s)) # 1 sample_names(mtbold_and_austwide.16s) mtbold_and_austwide.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 15651 taxa and 241 samples ] # sample_data() Sample Data: [ 241 samples by 73 sample variables ] # tax_table() Taxonomy Table: [ 15651 taxa by 7 taxonomic ranks ] # # # # # # # # # # # # # ## Visualise composition at genus level for Mt Bold surface vs Aust-wide selection taxa_sums(mtbold_and_austwide.16s) # agglomerate taxa of the same type, at the genus level, based on full taxonomic classification # so that unclassified do not group together out of taxonomic context! simp_mtbold_and_austwide.16s <- mtbold_and_austwide.16s rank_names(simp_mtbold_and_austwide.16s) # "kingdom" "phylum" "class" "order" "family" "genus" "species" #table(simp_mtbold_and_austwide.16s@tax_table[ , "genus"]) # very(!) large number of unclassified for (i in 1:dim(simp_mtbold_and_austwide.16s@tax_table)[1]) { #i<-1 simp_mtbold_and_austwide.16s@tax_table[i , "genus"] <- paste( simp_mtbold_and_austwide.16s@tax_table[i , c("kingdom","phylum","class","order","family","genus") ], collapse= " > " ) print(paste0("completed ",i)) } head(simp_mtbold_and_austwide.16s@tax_table[ , "genus"]) # AMD_16S_OTUa_14 "k__Bacteria > p__Proteobacteria > c__Alphaproteobacteria > o__Rhizobiales > f__Bradyrhizobiaceae > g__Bradyrhizobium" # AMD_16S_OTUa_143 "k__Bacteria > p__Firmicutes > c__Bacilli > o__Bacillales > f__Bacillaceae > g__Bacillus" # AMD_16S_OTUa_409 "k__Bacteria > p__Acidobacteria > c__Acidobacteriia > o__Acidobacteriales > f__Koribacteraceae > unclassified" # AMD_16S_OTUa_82 "k__Bacteria > p__Acidobacteria > c__Acidobacteriia > o__Acidobacteriales > f__Koribacteraceae > unclassified" # AMD_16S_OTUa_288 "k__Bacteria > p__Acidobacteria > c__Acidobacteriia > o__Acidobacteriales > f__Koribacteraceae > unclassified" # AMD_16S_OTUa_52 "k__Bacteria > p__Proteobacteria > c__Alphaproteobacteria > o__Rhizobiales > f__Hyphomicrobiaceae > g__Rhodoplanes" # agglomerate using new genus labels simp_mtbold_and_austwide.16s <- tax_glom(physeq = simp_mtbold_and_austwide.16s, taxrank = "genus") # 3 mins length(taxa_sums(simp_mtbold_and_austwide.16s)) # 831 taxa_sums(simp_mtbold_and_austwide.16s) summary( taxa_sums(simp_mtbold_and_austwide.16s) ) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 42 447 1827 17043 8778 601304 min( sample_sums(simp_mtbold_and_austwide.16s) ) # 6377 sort( sample_sums(simp_mtbold_and_austwide.16s))[1:10] # X8503 X7861 X8088 X12465 X13264 X12473 X12884 X12438 X12436 X12430 # 6377 9935 12765 13041 13371 15852 16545 17614 17780 17990 ## ORDINATION PLOT - # # # # # # # # # # # # # # # ### NMDS + Bray-Curtis # rarefy #1 seed <- 123 simp.r1.16s <- rarefy_even_depth(simp_mtbold_and_austwide.16s, sample.size = min(sample_sums(simp_mtbold_and_austwide.16s)), rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) min(taxa_sums(simp.r1.16s)) # 2 sample_sums(simp.r1.16s) # all 6377 ntaxa(simp.r1.16s) # 831 simp.r1.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 831 taxa and 241 samples ] # sample_data() Sample Data: [ 241 samples by 73 sample variables ] # tax_table() Taxonomy Table: [ 831 taxa by 7 taxonomic ranks ] sel <- which(simp.r1.16s@sam_data$Detailed.Land.Use == "Surface water supply") # qty 24 simp.r1.16s@sam_data$Detailed.Land.Use[sel] <- "Mt Bold samples" simp.r1.16s@sam_data$Reveg_age[sel] # [1] 6 years 6 years 6 years Cleared Cleared Cleared 10 years 10 years 10 years # [10] 7 years 7 years 7 years 8 years 8 years 8 years Remnant A Remnant A Remnant A # [19] Remnant B Remnant B Remnant B Remnant C Remnant C Remnant C # Levels: Cleared < 6 years < 7 years < 8 years < 10 years < Remnant A < Remnant B < Remnant C class( simp.r1.16s@sam_data$Reveg_age ) # "ordered" "factor" class( simp.r1.16s@sam_data$alt_vs_nat ) # "character" simp.r1.16s@sam_data$alt_vs_nat[sel] # all NA simp.r1.16s@sam_data$alt_vs_nat[sel] <- as.character( simp.r1.16s@sam_data$Reveg_age[sel] ) simp.r1.16s@sam_data$alt_vs_nat[sel] # [1] "6 years" "6 years" "6 years" "Cleared" "Cleared" "Cleared" "10 years" "10 years" "10 years" # [10] "7 years" "7 years" "7 years" "8 years" "8 years" "8 years" "Remnant A" "Remnant A" "Remnant A" # [19] "Remnant B" "Remnant B" "Remnant B" "Remnant C" "Remnant C" "Remnant C" temp <- simp.r1.16s simp.r1.16s@sam_data$alt_vs_nat <- factor(simp.r1.16s@sam_data$alt_vs_nat, levels = c("altered", "natural", "Cleared", "6 years", "7 years", "8 years", "10 years", "Remnant A", "Remnant B", "Remnant C" ), ordered = TRUE) ## ordination set.seed(123) ord <- ordinate(simp.r1.16s, "NMDS", "bray") ord ord$stress # 0.1440775 p <- plot_ordination(simp.r1.16s, ord, type="samples", color="alt_vs_nat", shape= "Detailed.Land.Use") p str(p) p$data$Detailed.Land.Use <- factor(p$data$Detailed.Land.Use) p$data$alt_vs_nat # already a factor - 10 Levels: altered < natural < Cleared < 6 years < 7 years < 8 years < 10 years < ... < Remnant C sort(unique(p$data$Detailed.Land.Use)) # [1] "Cereals -wheat" "cotton" # [3] "irrigated seasonal horticulture" "Mt Bold samples" # [5] "National Park" "Nature conservation" # [7] "Pasture legume/grass mixtures" "Rehabilitation" # [9] "Sown grasses" "Strict nature reserves" # [11] "sugar" "Tree fruits -apple" cols <- c("altered" = "#F8766D", "natural" = "#00BFC4", "Cleared" = "#e31a1c", "6 years" = "#addd8e", "7 years" = "#78c679", "8 years" = "#41ab5d", "10 years"= "#238443", "Remnant A" = "#4292c6", "Remnant B" = "#2171b5", "Remnant C" = "#084594" ) shapes <- c( "Cereals -wheat" = 0 , "cotton" = 2, "irrigated seasonal horticulture" =6 , "Mt Bold samples" = 19, "National Park" =3 , "Nature conservation" = 4, "Pasture legume/grass mixtures" = 5 , "Rehabilitation" =7, "Sown grasses" = 11 , "Strict nature reserves" =8, "sugar" =9 , "Tree fruits -apple" =14 ) pp <- ggplot(data = p$data, mapping = aes(x = NMDS1, y = NMDS2) ) + theme_bw() + geom_point(aes(color=alt_vs_nat, shape= Detailed.Land.Use, size = alt_vs_nat)) + scale_size_manual(values= c( "altered" = 1.4, "natural" = 1.4, "Cleared" = 3, "6 years" = 3, "7 years" = 3, "8 years" = 3, "10 years"= 3, "Remnant A" = 3, "Remnant B" = 3, "Remnant C" = 3 ), guide = FALSE ) + scale_shape_manual(values = shapes, name ="Land use", labels = c( "Cereals -wheat" = "Cereals - wheat" , "cotton" = "Cotton", "irrigated seasonal horticulture" ="Irrigated seasonal horticulture" , "Mt Bold samples" = "Mt Bold samples", "National Park" ="National Park" , "Nature conservation" = "Nature conservation", "Pasture legume/grass mixtures" = "Pasture legume/grass mixtures" , "Rehabilitation" = "Rehabilitation", "Sown grasses" = "Sown grasses" , "Strict nature reserves" = "Strict nature reserves", "sugar" = "Sugar" , "Tree fruits -apple" = "Tree fruits - apple" ) ) + scale_colour_manual(values = cols, name ="Sample type", labels = c( "altered" = "Human-\naltered", "natural" = "Natural", "Cleared" = "Cleared", "6 years" = "6 years", "7 years" = "7 years", "8 years" = "8 years", "10 years"= "10 years", "Remnant A" = "Remnant A", "Remnant B" = "Remnant B", "Remnant C" = "Remnant C" ) ) + guides(col = guide_legend(ncol = 2, byrow = FALSE, title.position = "top", order = 1)) + guides(shape = guide_legend(ncol = 2, byrow = FALSE, title.position = "top", order = 2)) + theme( legend.position="bottom", legend.title = element_text(size=10), legend.text = element_text(size = 8), axis.title = element_text(size = 10), axis.text = element_text(size = 8), legend.key.size = unit(0.8, 'lines'), panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + annotate(geom="text", x= -1.0, y= -0.5, label = paste0("Stress = ",round(ord$stress,digits=3)), hjust=0, vjust=1) pp ggsave(plot=pp, filename = paste0("finished-plots/","Ord-NMDS-Bray-16s-MtBold-and-Austwide-Altered-vs-Natural-vFINAL.tiff"), width = 16, height = 14, units = "cm", dpi = 600, compression = "lzw") table(simp.r1.16s@sam_data$alt_vs_nat) # altered natural Cleared 6 years 7 years 8 years 10 years Remnant A Remnant B Remnant C # 78 139 3 3 3 3 3 3 3 3 # #------------------------- #### Preparation and testing for Functional analysis # - VSEARCH details for matching Mt Bold and Aust-wide BASE representative fasta sequences to Greengenes (13-5) sequences # - substituting study OTUs for Greengenes (GG) sequences / GG IDs #------------------------- # Using PICRUSt implemented in R themetagenomics package: # https://cran.r-project.org/web/packages/themetagenomics/vignettes/functional_prediction.html # Note that the OTU table has both rownames and column names that correspond to the metadata and # taxonomy table, respectively. The latter is most important for functional prediction, since the # algorithm looks for these names when mapping the OTU table to functional annotations. # For PICRUSt, these names have to be Greengenes OTU IDs, so they'll be long integer codes. # https://picrust.github.io/picrust/tutorials/otu_picking.html ## Need to assign Greengenes OTU IDs to the Mt Bold and BASE Aust-wide OTUs!! ## Wanting to assign Greengenes OTU ID instead of BASE OTU ID ## Compare BASE 97% clustered OTUs and fasta files to Greengenes representative sequences ## Downloaded from - ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_5_otus.tar.gz ## BASE representative sequences are stored in: ## 1) from publicly-available 'BASE_16S_OTU.fasta' sequences (labelled '16S_OTU_...') from web: https://data.bioplatforms.com/dataset/base-otus ## 2) Mt Bold project-specific 'AMD_16S_table_otus.fasta' sequences (labelled 'AMD_16S_OTU...') from the 'AMD_16S_table_otus.fasta.gz' file mtbold_and_austwide.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 15651 taxa and 241 samples ] # sample_data() Sample Data: [ 241 samples by 73 sample variables ] # tax_table() Taxonomy Table: [ 15651 taxa by 7 taxonomic ranks ] ## Treat Aust-wide (base.d16s) and Mt Bold (mtbold_surf.16s) separately ## as their 97% clustered OTUs have been derived separately mtbold_surf.16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 3238 taxa and 24 samples ] # sample_data() Sample Data: [ 24 samples by 69 sample variables ] # tax_table() Taxonomy Table: [ 3238 taxa by 7 taxonomic ranks ] base.d16s # phyloseq-class experiment-level object # otu_table() OTU Table: [ 12413 taxa and 217 samples ] # sample_data() Sample Data: [ 217 samples by 24 sample variables ] # tax_table() Taxonomy Table: [ 12413 taxa by 7 taxonomic ranks ] # read-in fasta files for OTU_Ids and representative sequences #library(seqinr) base_seq <- read.fasta(file = paste0(datadir,"/","BASE_16S_OTU.fasta"), seqtype = "DNA", as.string = TRUE) base_AMD_seq <- read.fasta(file = paste0(datadir,"/","AMD_16S_table_otus.fasta"), seqtype = "DNA", as.string = TRUE) class(base_seq) # "list" head(names(base_seq)) # "16S_OTUa_1" "16S_OTUa_2" "16S_OTUa_3" "16S_OTUa_4" "16S_OTUa_5" "16S_OTUa_6" class(names(base_seq)) # character length(base_seq) # 91928 class(base_AMD_seq) # "list" head(names(base_AMD_seq)) # "AMD_16S_OTUa_1" "AMD_16S_OTUa_2" "AMD_16S_OTUa_3" "AMD_16S_OTUa_4" "AMD_16S_OTUa_5" "AMD_16S_OTUa_6" class(names(base_AMD_seq)) # character length(base_AMD_seq) # 60878 # testing what do elements look like? names(base_seq)[1] # "16S_OTUa_1" base_seq[[1]] # [1] "attgaacgctggcggcaggcttaacacatgcaagtcgagcggtaacatttctagcttgctagaagatgacgagcggcggacgggtgagtaatacttaggaatctacctagtagtgggggatagcacggggaaactcgtattaataccgcatacgacctacgggagaaagggggcagtttactgctctcgctattagatgagcctaagtcggattagctagatggtggggtaaaggcctaccatggcgacgatctgtagctggtctgagaggatgatcagccacaccgggactgagacacggcccggactcctacgggaggcagcagtggggaatattggacaatgggggaaaccctgatccagccatgccgcgtgtgtgaagaaggccttttggttgtaaagcactttaagcagtgaagaagactcaatggttaatacccattgacgatgacattagctgcagaataagcaccggctaactctgt" # attr(,"name") # [1] "16S_OTUa_1" # attr(,"Annot") # [1] ">16S_OTUa_1" # attr(,"class") # [1] "SeqFastadna" base_seq[[1]][1] #[1] "attgaacgctggcggcaggcttaacacatgcaagtcgagcggtaacatttctagcttgctagaagatgacgagcggcggacgggtgagtaatacttaggaatctacctagtagtgggggatagcacggggaaactcgtattaataccgcatacgacctacgggagaaagggggcagtttactgctctcgctattagatgagcctaagtcggattagctagatggtggggtaaaggcctaccatggcgacgatctgtagctggtctgagaggatgatcagccacaccgggactgagacacggcccggactcctacgggaggcagcagtggggaatattggacaatgggggaaaccctgatccagccatgccgcgtgtgtgaagaaggccttttggttgtaaagcactttaagcagtgaagaagactcaatggttaatacccattgacgatgacattagctgcagaataagcaccggctaactctgt" base_seq[["16S_OTUa_1"]][1] #[1] "attgaacgctggcggcaggcttaacacatgcaagtcgagcggtaacatttctagcttgctagaagatgacgagcggcggacgggtgagtaatacttaggaatctacctagtagtgggggatagcacggggaaactcgtattaataccgcatacgacctacgggagaaagggggcagtttactgctctcgctattagatgagcctaagtcggattagctagatggtggggtaaaggcctaccatggcgacgatctgtagctggtctgagaggatgatcagccacaccgggactgagacacggcccggactcctacgggaggcagcagtggggaatattggacaatgggggaaaccctgatccagccatgccgcgtgtgtgaagaaggccttttggttgtaaagcactttaagcagtgaagaagactcaatggttaatacccattgacgatgacattagctgcagaataagcaccggctaactctgt" names(base_AMD_seq)[1] # "AMD_16S_OTUa_7" base_AMD_seq[[1]] # [1] "attgaacgctggcggcaggcttaacacatgcaagtcgagcggggaagagtagcttgctactttacctagcggcggacgggtgagtaatgcttaggaatctgcctattagtgggggacaacatctcgaaagggatgctaataccgcatacgtcctacgggagaaagcaggggaccttcgggccttgcgctaatagatgagcctaagtcggattagctagttggtggggtaaaggcctaccaaggcgacgatctgtagcgggtctgagaggatgatccgccacactgggactgagacacggcccagactcctacgggaggcagcagtggggaatattggacaatggggggaaccctgatccagccatgccgcgtgtgtgaagaaggccttttggttgtaaagcactttaagcgaggaggaggctaccgagattaatactcttggatagtggacgttactcgcagaataagcaccggctaactctgtgc" # attr(,"name") # [1] "AMD_16S_OTUa_7" # attr(,"Annot") # [1] ">AMD_16S_OTUa_7" # attr(,"class") # [1] "SeqFastadna" base_AMD_seq[[1]][1] #[1] "attgaacgctggcggcaggcttaacacatgcaagtcgagcggggaagagtagcttgctactttacctagcggcggacgggtgagtaatgcttaggaatctgcctattagtgggggacaacatctcgaaagggatgctaataccgcatacgtcctacgggagaaagcaggggaccttcgggccttgcgctaatagatgagcctaagtcggattagctagttggtggggtaaaggcctaccaaggcgacgatctgtagcgggtctgagaggatgatccgccacactgggactgagacacggcccagactcctacgggaggcagcagtggggaatattggacaatggggggaaccctgatccagccatgccgcgtgtgtgaagaaggccttttggttgtaaagcactttaagcgaggaggaggctaccgagattaatactcttggatagtggacgttactcgcagaataagcaccggctaactctgtgc" base_AMD_seq[["AMD_16S_OTUa_7"]][1] #[1] "attgaacgctggcggcaggcttaacacatgcaagtcgagcggggaagagtagcttgctactttacctagcggcggacgggtgagtaatgcttaggaatctgcctattagtgggggacaacatctcgaaagggatgctaataccgcatacgtcctacgggagaaagcaggggaccttcgggccttgcgctaatagatgagcctaagtcggattagctagttggtggggtaaaggcctaccaaggcgacgatctgtagcgggtctgagaggatgatccgccacactgggactgagacacggcccagactcctacgggaggcagcagtggggaatattggacaatggggggaaccctgatccagccatgccgcgtgtgtgaagaaggccttttggttgtaaagcactttaagcgaggaggaggctaccgagattaatactcttggatagtggacgttactcgcagaataagcaccggctaactctgtgc" BASE_OTUs_used <- taxa_names(mtbold_and_austwide.16s) identical(BASE_OTUs_used, row.names(mtbold_and_austwide.16s@otu_table)) # TRUE length(BASE_OTUs_used) # 15651 # Confirming that these have different prefix length(grep(pattern = "AMD_",x = BASE_OTUs_used)) # 3238 - which matches taxa_names from Mt Bold surface samples length(taxa_names(mtbold_surf.16s)) # 3238 length(grep(pattern = "^16S",x = BASE_OTUs_used)) # 12413 - which matches taxa_names from Aust-wide selection length(taxa_names(base.d16s)) # 12413 ## Subselect from fasta files, only those used sel1 <- which(names(base_seq) %in% taxa_names(base.d16s)) # qty 12413 base_seq <- base_seq[sel1] length(base_seq) # 12413 base_seq[1:5] getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" write.fasta(base_seq, names = names(base_seq), file.out = "study_selection_base_seq.fasta", open = "w",nbchar = 80) sel2 <- which(names(base_AMD_seq) %in% taxa_names(mtbold_surf.16s)) # qty 3238 base_AMD_seq <- base_AMD_seq[sel2] length(base_AMD_seq) # 3238 base_AMD_seq[1:5] write.fasta(base_AMD_seq, names = names(base_AMD_seq), file.out = "study_selection_base_AMD_seq.fasta", open = "w",nbchar = 80) 3238 + 12413 # 15651 length(BASE_OTUs_used) # 15651 mean(nchar(base_seq)) # 470.9844 summary(nchar(base_seq)) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 401 454 470 471 488 515 mean(nchar(base_AMD_seq)) # 470.4339 summary(nchar(base_AMD_seq)) # 470 # Min. 1st Qu. Median Mean 3rd Qu. Max. # 313.0 454.0 469.0 470.4 487.0 517.0 ## Greengenes fasta reference data # ( Available from: ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_5_otus.tar.gz ) gg_seq <- read.fasta(file = "C:/Workspace/DATA/Greengenes_13_5/gg_13_5.fasta", seqtype = "DNA", as.string = TRUE) head(gg_seq) gg_seq["1111886"] length(gg_seq) # 1262986 names(gg_seq)[1:10] # "1111886" "1111885" "1111883" "1111882" "1111879" "1111876" "1111875" "1111874" "1111869" "1111867" hist(as.integer( names(gg_seq) )) sort(as.integer( names(gg_seq) ))[1:200] #rm(gg_seq) # remove to save memory ### Alignment of study OTU representative sequences to Greengenes 13-5 IDs using VSEARCH ## ## Used VSEARCH (Rognes et al 2016, DOI 10.7717/peerj.2584) to search for closest Greengenes sequence in ## https://github.com/torognes/vsearch/releases/download/v2.8.1/vsearch-2.8.1-win-x86_64.zip ## ## Greengenes 13_5 downloaded from ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_5_otus.tar.gz ## used Greengenes_13_5\gg_13_5.fasta ## ## To find the closest matching Greengenes sequences to the study 97% clustered OTUs, ## the following VSEARCH commands were run in the Windows DOS Command Prompt, ## after first changing directory to the relevant location (C:\Workspace\PROJ\PAPER-Trending-Taxa-Resto\modelling) ## where the study sequence fasta files (study_selection_base_AMD_seq.fasta, study_selection_base_seq.fasta) are stored. ## Used commands to query base_AMD_seq (Mt Bold) and base_seq (Aust-wide) separately ## ## For Mt Bold surface samples: ## C:\vsearch-2.8.1-win-x86_64\vsearch --usearch_global study_selection_base_AMD_seq.fasta --db C:\Workspace\DATA\Greengenes_13_5\gg_13_5.fasta --id 0.6 --userout base_AMD_search_results2.txt --userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+bits ## Full file path - C:\vsearch-2.8.1-win-x86_64\vsearch --usearch_global C:\Workspace\PROJ\PAPER-Trending-Taxa-Resto\modelling\study_selection_base_AMD_seq.fasta --db C:\Workspace\DATA\Greengenes_13_5\gg_13_5.fasta --id 0.6 --userout base_AMD_search_results2.txt --userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+bits ## [Note: VSEARCH found matching query sequences: 3227 of 3238 (99.66%) above a first pass 60% identity threshold.] ## ## For the selected Australia-wide BASE samples: ## C:\vsearch-2.8.1-win-x86_64\vsearch --usearch_global study_selection_base_seq.fasta --db C:\Workspace\DATA\Greengenes_13_5\gg_13_5.fasta --id 0.6 --userout base_search_results2.txt --userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+bits ## Full file path - C:\vsearch-2.8.1-win-x86_64\vsearch --usearch_global C:\Workspace\PROJ\PAPER-Trending-Taxa-Resto\modelling\study_selection_base_seq.fasta --db C:\Workspace\DATA\Greengenes_13_5\gg_13_5.fasta --id 0.6 --userout base_search_results2.txt --userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+bits ## [Note: VSEARCH found matching query sequences: 12406 of 12413 (99.94%) above a first pass 60% identity threshold.] # x <- scan("base_AMD_search_results.txt",what="character",sep="\t") # x[1:24] ## note contained a blank entry in 2nd position ## Mt Bold surface VSEARCH results stored in: "base_AMD_search_results2.txt" vsearch_mtboldsurf<- as.data.frame(matrix(data = scan("base_AMD_search_results2.txt",what="character",sep="\t"), ncol = 13, byrow=TRUE)) vsearch_mtboldsurf <- vsearch_mtboldsurf[ ,-2] # blank column names(vsearch_mtboldsurf) <- c("query","target","id","alnlen","mism","opens","qlo","qhi","tlo","thi","evalue","bits") str(vsearch_mtboldsurf) vsearch_mtboldsurf[ , "id"] <- as.numeric(as.character(vsearch_mtboldsurf[ , "id"])) str(vsearch_mtboldsurf) hist(vsearch_mtboldsurf$id) # only include GG ids with >= 97% identity with the representative fasta sel <- which(vsearch_mtboldsurf$id >= 97) # qty 1577 ## Aust-wide VSEARCH results stored in: "base_search_results2.txt" vsearch_austwide<- as.data.frame(matrix(data = scan("base_search_results2.txt",what="character",sep="\t"), ncol = 13, byrow=TRUE)) vsearch_austwide <- vsearch_austwide[ ,-2] names(vsearch_austwide) <- c("query","target","id","alnlen","mism","opens","qlo","qhi","tlo","thi","evalue","bits") str(vsearch_austwide) vsearch_austwide[ , "id"] <- as.numeric(as.character(vsearch_austwide[ , "id"])) str(vsearch_austwide) hist(vsearch_austwide$id) # only include GG ids with >= 97% identity with the representative fasta sel <- which(vsearch_austwide$id >= 97) # qty 5404 # 97% identity 1577 + 5404 # 6981 100*(1577 + 5404)/15651 # ~44.6% of OTUs covered ## set identity threshold id_thresh <- 97 sel <- which(vsearch_mtboldsurf$id >= id_thresh) # qty 1577 vsearch_mtboldsurf <- vsearch_mtboldsurf[sel, ] dim(vsearch_mtboldsurf) # 1577 12 sel <- which(vsearch_austwide$id >= id_thresh) # qty 5404 vsearch_austwide <- vsearch_austwide[sel, ] dim(vsearch_austwide) # 5404 12 ## for PICRUSt functional predictions, want matching Greengenes OTU ID ## store matching results in a single data frame matching <- data.frame(BASE_97_OTU_Id=BASE_OTUs_used, ref_seq=NA, dataset=NA, GG_Id=NA, percent_id=NA, success=NA) str(matching) # set as character matching$BASE_97_OTU_Id <- as.character(matching$BASE_97_OTU_Id) for (i in 1:dim(matching)[1]) { #i<-1 if (length(grep(pattern = "AMD_",x = matching$BASE_97_OTU_Id[i])) == 1) { matching$ref_seq[i] <- "base_AMD_seq" matching$dataset[i] <- "Mt Bold" # find matching GG Id sel <- which(vsearch_mtboldsurf$query == matching$BASE_97_OTU_Id[i]) if (length(sel)==1) { matching$GG_Id[i] <- as.character( vsearch_mtboldsurf$target[sel] ) matching$percent_id[i] <- as.numeric( vsearch_mtboldsurf$id[sel] ) matching$success[i] <- "yes" } else { matching$success[i] <- "no" } } if (length(grep(pattern = "^16S",x = matching$BASE_97_OTU_Id[i])) == 1) { matching$ref_seq[i] <- "base_seq" matching$dataset[i] <- "Aust-wide" # find matching GG Id sel <- which(vsearch_austwide$query == matching$BASE_97_OTU_Id[i]) if (length(sel)==1) { matching$GG_Id[i] <- as.character( vsearch_austwide$target[sel] ) matching$percent_id[i] <- as.numeric( vsearch_austwide$id[sel] ) matching$success[i] <- "yes" } else { matching$success[i] <- "no" } } print(paste0("completed ",i)) } write.csv(matching, file = paste0("matching-study-OTU-IDs-and-Greengenes-IDs-with-id-threshold-",id_thresh,"-vFINAL.csv"),row.names = FALSE) dim(matching) # 15651 6 ## How many with no match? sel <- which(matching$success == "no") # 8670 # list of OTUs without GG ID no_GG_Id <- matching$BASE_97_OTU_Id[sel] ## How many with match? sel <- which(matching$success == "yes") # 6981 # list of OTUs WITH GG ID yes_have_GG_Id <- matching$BASE_97_OTU_Id[sel] 8670 + 6981 # 15651 100*(6981)/15651 # ~44.6% head(yes_have_GG_Id) # "AMD_16S_OTUa_14" "AMD_16S_OTUa_143" "AMD_16S_OTUa_409" "AMD_16S_OTUa_82" "AMD_16S_OTUa_288" "AMD_16S_OTUa_388" tail(yes_have_GG_Id) # "16S_OTUa_26321" "16S_OTUa_25603" "16S_OTUa_55111" "16S_OTUa_20444" "16S_OTUa_42427" "16S_OTUa_21269" length(matching$percent_id[ which(matching$BASE_97_OTU_Id %in% yes_have_GG_Id) ]) # 6981 summary(matching$percent_id[ which(matching$BASE_97_OTU_Id %in% yes_have_GG_Id) ]) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 97.00 97.70 98.60 98.58 99.50 100.00 length(unique(matching$BASE_97_OTU_Id[ which(matching$BASE_97_OTU_Id %in% yes_have_GG_Id) ])) # 6981 length(unique(matching$GG_Id[ which(matching$BASE_97_OTU_Id %in% yes_have_GG_Id) ])) # 5461 ## verify matches that have been made? head(matching) # BASE_97_OTU_Id ref_seq dataset GG_Id percent_id success # 1 AMD_16S_OTUa_14 base_AMD_seq Mt Bold 1087375 100.0 yes # 2 AMD_16S_OTUa_143 base_AMD_seq Mt Bold 4386062 100.0 yes # 3 AMD_16S_OTUa_409 base_AMD_seq Mt Bold 250258 99.4 yes # 4 AMD_16S_OTUa_82 base_AMD_seq Mt Bold 104834 100.0 yes # 5 AMD_16S_OTUa_288 base_AMD_seq Mt Bold 752270 100.0 yes # 6 AMD_16S_OTUa_52 base_AMD_seq Mt Bold NA no base_AMD_seq["AMD_16S_OTUa_14"] # [1] "agcgaacgctggcggcaggcttaacacatgcaagtcgagcgggcgtagcaatacgtcagcggcagacgggtgagtaacgcgtgggaacgtaccttttggttcggaacaacacagggaaacttgtgctaataccggataagcccttacggggaaagatttatcgccgaaagatcggcccgcgtctgattagctagttggtgaggtaatggctcaccaaggcgacgatcagtagctggtctgagaggatgatcagccacattgggactgagacacggcccaaactcctacgggaggcagcagtggggaatattggacaatgggcgcaagcctgatccagccatgccgcgtgagtgatgaaggccctagggttgtaaagctcttttgtgcgggaagataatgacggtaccgcaagaataagccccggctaacttcgtgc" gg_seq["1087375"] # [1] "agcgaacgctggcggcaggcttaacacatgcaagtcgagcgggcgtagcaatacgtcagcggcagacgggtgagtaacgcgtgggaacgtaccttttggttcggaacaacacagggaaacttgtgctaataccggataagcccttacggggaaagatttatcgccgaaagatcggcccgcgtctgattagctagttggtgaggtaatggctcaccaaggcgacgatcagtagctggtctgagaggatgatcagccacattgggactgagacacggcccaaactcctacgggaggcagcagtggggaatattggacaatgggcgcaagcctgatccagccatgccgcgtgagtgatgaaggccctagggttgtaaagctcttttgtgcgggaagataatgacggtaccgcaagaataagccccggctaacttcgtgccagcagccgcggtaatacgaagggggctagcgttgctcggaatcactgggcgtaaagggtgcgtaggcgggtctttagtcaggggtgaaatcctggagctcaactccagaactgcctttgatactgaagatcttgagttcgggagaggtgagtggaactgcgagtgtagaggtgaaattcgtagatattcgcaagaacaccagtggcgaaggcggctcactggcccgatactgacgctgaggcacgaaagcgtggggagcaaacaggattagataccctggtagtccacgccgtaaacgatgaatgccagccgttagtgggtttactcactagtggcgcagctaacgctttaagcattccgcctggggagtacggtcgcaagattaaaactcaaaggaattgacgggggcccgcacaagcggtggagcatgtggtttaattcgacgcaacgcgcagaaccttaccagcccttgacatcccggtcgcggactccagagacggagttcttcagttcggctggaccggagacaggtgctgcatggctgtcgtcagctcgtgt... base_AMD_seq["AMD_16S_OTUa_143"] # [1] "gacgaacgctggcggcgtgcctaatacatgcaagtcgagcgaatctttgggagcttgctcctaaaggttagcggcggacgggtgagtaacacgtgggcaacctgcctataagactgggataacttcgggaaaccggagctaataccggataattcttttctacacatgtagaaaagctaaaagacggtttacgctgtcacttatagatgggcccgcggcgcattagctagttggtgaggtaacggctcaccaaggccacgatgcgtagccgacctgagagggtgatcggccacactgggactgagacacggcccagactcctacgggaggcagcagtagggaatcttccgcaatggacgaaagtctgacggagcaacgccgcgtgagtgatgaaggttttcggatcgtaaaactctgttgttagggaagaacaagtacgagagtaactgctcgtaccttgacggtacctaaccagaaagccacggctaactacgtgc" gg_seq["4386062"] # [1] "gacgaacgctggcggcgtgcctaatacatgcaagtcgagcgaatctttgggagcttgctcctaaaggttagcggcggacgggtgagtaacacgtgggcaacctgcctataagactgggataacttcgggaaaccggagctaataccggataattcttttctacacatgtagaaaagctaaaagacggtttacgctgtcacttatagatgggcccgcggcgcattagctagttggtgaggtaacggctcaccaaggccacgatgcgtagccgacctgagagggtgatcggccacactgggactgagacacggcccagactcctacgggaggcagcagtagggaatcttccgcaatggacgaaagtctgacggagcaacgccgcgtgagtgatgaaggttttcggatcgtaaaactctgttgttagggaagaacaagtacgagagtaactgctcgtaccttgacggtacctaaccagaaagccacggctaactacgtgccagcagccgcggtaatacgtaggtggcaagcgttgtccggaattattgggcgtaaagcgcgcgcaggcggtcctttaagtctgatgtgaaagcccacggctcaaccgtggagggtcattggaaactgggggacttgagtacaggagagaagagtggaattccacgtgtagcggtgaaatgcgtagagatgtggaggaacaccagtggcgaaggcgactctttggcctgtaactgacgctgaggcgcgaaagcgtggggagcaaacaggattagataccctggtagtccacgccgtaaacgatgagtgctaagtgttagagggtttccgccctttagtgctgcagtaaacgcattaagcactccgcctggggagtacggccgcaaggctgaaactcaaaggaattgacggggacccgcacaagcggtggagcatgtggtttaattcgaagcaacgcggagaaccttaccaggtcttgacatcctctgacaatcctagag... base_AMD_seq["AMD_16S_OTUa_288"] # [1] "aatcaacgctggcggcgtgcctaacacatgcaagtcgaacgagaaagtggagcaatccatgagtaaagtggcgaccgggtgagtaacacgtgactaacctaccttcgagtgggggataacctcgggaaaccggggctaataccgcataatgccttcgggttaaaggagcaattcgcttgaagagggggtcgcggctgattagctagttggcggggtaacggcccaccaaggcgatgatcggtagccggcctgagagggcgcacggccacactggaactgaaacacggtccagactcctacgggaggcagcagtggggaattttgcgcaatgggggaaaccctgacgcagcaacgccgcgtggaggatgaagtcccttgggacgtaaactcctttcgaccgggacgatgatgacggtaccggtggaagaagccccggctaacttcgtgc" gg_seq["752270"] # [1] "acgctggcggcgtgcctaacacatgcaagtcgaacgagaaagtggagcaatccatgagtaaagtggcgaccgggtgagtaacacgtgactaacctaccttcgagtgggggataacctcgggaaaccggggctaataccgcataatgccttcgggttaaaggagcaattcgcttgaagagggggtcgcggctgattagctagttggcggggtaacggcccaccaaggcgatgatcggtagccggcctgagagggcgcacggccacactggaactgaaacacggtccagactcctacgggaggcagcagtggggaattttgcgcaatgggggaaaccctgacgcagcaacgccgcgtggaggatgaagtcccttgggacgtaaactcctttcgaccgggacgatgatgacggtaccggtggaagaagccccggctaacttcgtgccagcagccgcggtaatacgaggggggcaagcgttgttcggaattattgggcgtaaagggcgcgtaggcggtgcggtaagtcacctgtgaaacctctgggcttaacccagagcctgcaggcgaaactgccgtgctggagtgtgggagaggtgcgtggaattcccggtgtagcggtgaaatgcgtagatatcgggaggaacacctgtggcgaaagcggcgcactggaccacaactgacgctgaggcgcgaaagctaggggagcaaacaggattagataccctggtagtcctagccctaaacgatggatgcttggtgtgttgggtacccaaccccaacgtgccgaagctaacgcgataagcatcccgcctggggagtacggtcgcaaggctgaaactcaaaggaattgacgggggcccgcacaagcggtggagcatgtggttcaattcgacgcaacgcgaagaaccttacctgggctcgaagcgcagtggaccggggtagaaatatccctttctcgcaagagactgctgcggaggtgctgcatggctgtcgtcag... # #------------------------- #### Preparation for functional analysis: # - Identify Aust-wide natural and human-altered case study samples # - Establish minimum read depth across Mt Bold and selected Aust-wide samples # (to normalise 'sampling effort'/reads for use in functional comparison) #------------------------- # For the purpose of examining functional diversity and composition/character # compare Mt Bold restoration gradient to selected Aust-wide natural and human-altered samples # where we can normalise sampling effort and perform the same analysis. # i.e. choose Aust-wide sites where there are at least 3 samples/site - to allow consistent use of the merged-sample bootstrap technique. table(base.d16s@sam_data$Collection.Site) # Ayr Blackheath Booderee Buckley Swamp Camp Mountain # 2 6 20 5 1 # Cape Tribulation Cherreninup Chingarrup Chowilla Credo Blackbutt Plot # 3 2 2 1 1 # Credo Gimlet Plot Credo Redgum Plot Danbulla Fitzgerald River NP Freycinet NP # 1 1 1 15 10 # Gundabooka NP Kallora Kinchega NP King Island Longerenong # 1 2 1 12 3 # Mackay Maram Minnipa Mt Edith Mt Lesueur NP # 3 2 2 2 18 # Mt. Field NP Namadgi NP Narrabri Nowanup Overland Corner # 2 58 20 2 2 # Peniup 1 Peniup 2 Peniup 3 Riggs Creek Streaky Bay # 2 2 2 1 1 # Walpeup Whroo conservation area # 6 1 sel <- which(base.d16s@sam_data$Collection.Site %in% c("Blackheath", "Booderee", "Buckley Swamp", "Cape Tribulation", "Fitzgerald River NP", "Freycinet NP", "King Island","Longerenong", "Mackay", "Mt Lesueur NP", "Namadgi NP", "Narrabri", "Walpeup")) base.d16s@sam_data[sel, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] ### consider examples: aust_select <- list() ### human-altered ## i) Blackheath Tree fruits -apple sel <- which(base.d16s@sam_data$Collection.Site == "Blackheath" & base.d16s@sam_data$Detailed.Land.Use=="Tree fruits -apple") set.seed(123) choose3 <- sample(sel,size=3,replace=FALSE) base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] # Collection.Site Detailed.Land.Use Clay Organic.Carbon # X12616 Blackheath Tree fruits -apple 9.65 3.56 # X12620 Blackheath Tree fruits -apple 9.75 3.38 # X12624 Blackheath Tree fruits -apple 14.93 3.78 aust_select[["Blackheath (NSW) apple trees"]] <- base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] ## ii) Buckley Swamp Pasture legume/grass mixtures sel <- which(base.d16s@sam_data$Collection.Site == "Buckley Swamp" & base.d16s@sam_data$Detailed.Land.Use=="Pasture legume/grass mixtures") set.seed(123) choose3 <- sample(sel,size=3,replace=FALSE) base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] # Collection.Site Detailed.Land.Use Clay Organic.Carbon # X8280 Buckley Swamp Pasture legume/grass mixtures NA 4.92 # X8284 Buckley Swamp Pasture legume/grass mixtures NA 4.28 # X8286 Buckley Swamp Pasture legume/grass mixtures NA 5.11 aust_select[["Buckley Swamp (Vic) pasture"]] <- base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] ## iii) King Island Pasture legume/grass mixtures - Tasmania sel <- which(base.d16s@sam_data$Collection.Site == "King Island" & base.d16s@sam_data$Detailed.Land.Use=="Pasture legume/grass mixtures") set.seed(123) choose3 <- sample(sel,size=3,replace=FALSE) base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] # Collection.Site Detailed.Land.Use Clay Organic.Carbon # X13272 King Island Pasture legume/grass mixtures 7.09 5.03 # X13282 King Island Pasture legume/grass mixtures 15.64 4.11 # X13274 King Island Pasture legume/grass mixtures 8.40 4.84 aust_select[["King Island (Tas) pasture"]] <- base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] ## iv) Longerenong Cereals -wheat - Vic sel <- which(base.d16s@sam_data$Collection.Site == "Longerenong" & base.d16s@sam_data$Detailed.Land.Use=="Cereals -wheat") set.seed(123) choose3 <- sample(sel,size=3,replace=FALSE) base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] # Collection.Site Detailed.Land.Use Clay Organic.Carbon # X8270 Longerenong Cereals -wheat NA 1.38 # X8272 Longerenong Cereals -wheat NA 1.44 # X8274 Longerenong Cereals -wheat NA 0.97 aust_select[["Longerenong (Vic) wheat"]] <- base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] ## v) Mackay sugar - Qld sel <- which(base.d16s@sam_data$Collection.Site == "Mackay" & base.d16s@sam_data$Detailed.Land.Use=="sugar") set.seed(123) choose3 <- sample(sel,size=3,replace=FALSE) base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] # Collection.Site Detailed.Land.Use Clay Organic.Carbon # X8192 Mackay sugar 8.36 0.79 # X8198 Mackay sugar 14.32 0.84 # X8220 Mackay sugar 17.59 1.16 aust_select[["Mackay (Qld) sugar"]] <- base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] ## vi) Narrabri cotton - NSW sel <- which(base.d16s@sam_data$Collection.Site == "Narrabri" & base.d16s@sam_data$Detailed.Land.Use=="cotton") set.seed(123) choose3 <- sample(sel,size=3,replace=FALSE) base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] # Collection.Site Detailed.Land.Use Clay Organic.Carbon # X12509 Narrabri cotton 33.61 1.16 # X12525 Narrabri cotton 36.58 1.19 # X12511 Narrabri cotton 36.64 1.40 aust_select[["Narrabri (NSW) cotton"]] <- base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] ## vii) Walpeup Cereals -wheat - Mallee NW Vic sel <- which(base.d16s@sam_data$Collection.Site == "Walpeup" & base.d16s@sam_data$Detailed.Land.Use=="Cereals -wheat") set.seed(123) choose3 <- sample(sel,size=3,replace=FALSE) base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] # Collection.Site Detailed.Land.Use Clay Organic.Carbon # X8182 Walpeup Cereals -wheat NA 1.16 # X8262 Walpeup Cereals -wheat NA 0.87 # X8266 Walpeup Cereals -wheat NA 0.89 aust_select[["Walpeup (Vic) wheat"]] <- base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] ### Natural ## i) Cape Tribulation National Park - Qld sel <- which(base.d16s@sam_data$Collection.Site == "Cape Tribulation" & base.d16s@sam_data$Detailed.Land.Use=="National Park") set.seed(123) choose3 <- sample(sel,size=3,replace=FALSE) base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] # Collection.Site Detailed.Land.Use Clay Organic.Carbon # X12816 Cape Tribulation National Park 24.16 4.93 # X12818 Cape Tribulation National Park 19.42 4.62 # X12819 Cape Tribulation National Park NA 5.36 aust_select[["Cape Tribulation NP (Qld)"]] <- base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] ## ii) Booderee National Park - NSW sel <- which(base.d16s@sam_data$Collection.Site == "Booderee" & base.d16s@sam_data$Detailed.Land.Use=="National Park") set.seed(123) choose3 <- sample(sel,size=3,replace=FALSE) base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] # Collection.Site Detailed.Land.Use Clay Organic.Carbon # X7833 Booderee National Park 7.86 1.51 # X7851 Booderee National Park 48.62 0.59 # X7837 Booderee National Park 3.81 1.23 aust_select[["Booderee NP (NSW)"]] <- base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] ## iii) Fitzgerald River NP National Park - WA sel <- which(base.d16s@sam_data$Collection.Site == "Fitzgerald River NP" & base.d16s@sam_data$Detailed.Land.Use=="National Park") set.seed(123) choose3 <- sample(sel,size=3,replace=FALSE) base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] # Collection.Site Detailed.Land.Use Clay Organic.Carbon # X8116 Fitzgerald River NP National Park 5.88 0.72 # X8130 Fitzgerald River NP National Park 5.80 0.59 # X8118 Fitzgerald River NP National Park 6.99 1.17 aust_select[["Fitzgerald River NP (WA)"]] <- base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] ## iv) Freycinet NP National Park - Tasmania sel <- which(base.d16s@sam_data$Collection.Site == "Freycinet NP" & base.d16s@sam_data$Detailed.Land.Use=="National Park") set.seed(123) choose3 <- sample(sel,size=3,replace=FALSE) base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] # Collection.Site Detailed.Land.Use Clay Organic.Carbon # X12428 Freycinet NP National Park 16.15 4.08 # X12438 Freycinet NP National Park 10.48 4.33 # X12430 Freycinet NP National Park 15.59 5.30 aust_select[["Freycinet NP (Tas)"]] <- base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] ## v) Mt Lesueur NP National Park - WA sel <- which(base.d16s@sam_data$Collection.Site == "Mt Lesueur NP" & base.d16s@sam_data$Detailed.Land.Use=="National Park") set.seed(123) choose3 <- sample(sel,size=3,replace=FALSE) base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] # Collection.Site Detailed.Land.Use Clay Organic.Carbon # X8082 Mt Lesueur NP National Park 3.88 1.01 # X8099 Mt Lesueur NP National Park 3.89 0.19 # X8084 Mt Lesueur NP National Park 10.17 1.37 aust_select[["Mt Lesueur NP (WA)"]] <- base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] ## vi) Namadgi NP National Park - ACT sel <- which(base.d16s@sam_data$Collection.Site == "Namadgi NP" & base.d16s@sam_data$Detailed.Land.Use=="National Park") set.seed(123) choose3 <- sample(sel,size=3,replace=FALSE) base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] # Collection.Site Detailed.Land.Use Clay Organic.Carbon # X12560 Namadgi NP National Park 26.23 4.41 # X9442 Namadgi NP National Park 11.59 5.02 # X12574 Namadgi NP National Park 11.06 3.64 aust_select[["Namadgi NP (ACT)"]] <- base.d16s@sam_data[choose3, c("Collection.Site","Detailed.Land.Use", "Clay", "Organic.Carbon")] ### determine minimum read depth for these Aust-wide selected sites length(aust_select) # 13 row.names(aust_select[[1]]) # "X12616" "X12620" "X12624" lapply(aust_select, FUN=row.names) # $`Blackheath (NSW) apple trees` # [1] "X12616" "X12620" "X12624" # # $`Buckley Swamp (Vic) pasture` # [1] "X8280" "X8284" "X8286" # # $`King Island (Tas) pasture` # [1] "X13272" "X13282" "X13274" # # $`Longerenong (Vic) wheat` # [1] "X8270" "X8272" "X8274" # # $`Mackay (Qld) sugar` # [1] "X8192" "X8198" "X8220" # # $`Narrabri (NSW) cotton` # [1] "X12509" "X12525" "X12511" # # $`Walpeup (Vic) wheat` # [1] "X8182" "X8262" "X8266" # # $`Cape Tribulation NP (Qld)` # [1] "X12816" "X12818" "X12819" # # $`Booderee NP (NSW)` # [1] "X7833" "X7851" "X7837" # # $`Fitzgerald River NP (WA)` # [1] "X8116" "X8130" "X8118" # # $`Freycinet NP (Tas)` # [1] "X12428" "X12438" "X12430" # # $`Mt Lesueur NP (WA)` # [1] "X8082" "X8099" "X8084" # # $`Namadgi NP (ACT)` # [1] "X12560" "X9442" "X12574" sel.samps <- unlist( lapply(aust_select, FUN=row.names) ) class(sel.samps) # "character" length(sel.samps) # 39 sel.samps # Blackheath (NSW) apple trees1 Blackheath (NSW) apple trees2 Blackheath (NSW) apple trees3 Buckley Swamp (Vic) pasture1 # "X12616" "X12620" "X12624" "X8280" # Buckley Swamp (Vic) pasture2 Buckley Swamp (Vic) pasture3 King Island (Tas) pasture1 King Island (Tas) pasture2 # "X8284" "X8286" "X13272" "X13282" # King Island (Tas) pasture3 Longerenong (Vic) wheat1 Longerenong (Vic) wheat2 Longerenong (Vic) wheat3 # "X13274" "X8270" "X8272" "X8274" # Mackay (Qld) sugar1 Mackay (Qld) sugar2 Mackay (Qld) sugar3 Narrabri (NSW) cotton1 # "X8192" "X8198" "X8220" "X12509" # Narrabri (NSW) cotton2 Narrabri (NSW) cotton3 Walpeup (Vic) wheat1 Walpeup (Vic) wheat2 # "X12525" "X12511" "X8182" "X8262" # Walpeup (Vic) wheat3 Cape Tribulation NP (Qld)1 Cape Tribulation NP (Qld)2 Cape Tribulation NP (Qld)3 # "X8266" "X12816" "X12818" "X12819" # Booderee NP (NSW)1 Booderee NP (NSW)2 Booderee NP (NSW)3 Fitzgerald River NP (WA)1 # "X7833" "X7851" "X7837" "X8116" # Fitzgerald River NP (WA)2 Fitzgerald River NP (WA)3 Freycinet NP (Tas)1 Freycinet NP (Tas)2 # "X8130" "X8118" "X12428" "X12438" # Freycinet NP (Tas)3 Mt Lesueur NP (WA)1 Mt Lesueur NP (WA)2 Mt Lesueur NP (WA)3 # "X12430" "X8082" "X8099" "X8084" # Namadgi NP (ACT)1 Namadgi NP (ACT)2 Namadgi NP (ACT)3 # "X12560" "X9442" "X12574" # perform functional analyses so that results can be compared on basis of normalised sampling effort. # Therefore normalise sampling effort between Mt Bold restoration gradient and Aust-wide samples phy.aust_select <- prune_samples(samples = sample_names(base.d16s) %in% sel.samps, x = base.d16s) phy.aust_select # phyloseq-class experiment-level object # otu_table() OTU Table: [ 12413 taxa and 39 samples ] # sample_data() Sample Data: [ 39 samples by 24 sample variables ] # tax_table() Taxonomy Table: [ 12413 taxa by 7 taxonomic ranks ] samp_size.compare <- min( c( min(sample_sums(phy.aust_select)) , min(sample_sums(mtbold_surf.16s)) ) ) samp_size.compare # 17614 # #------------------------- #### Mt Bold restoration gradient: Functional alpha diversity estimation using merged-sample bootstrap # - retain only NSTI <= 0.15 #------------------------- ### Bootstrap resampling ... ## create function to rarefy for initial sample > merge samples by type > rarefy again > calculate shannon's index # # # # # # # # # # # # calc_FUNCTIONAL_AlphaDiv_in_parallel <- function(phy_obj, merge_by, min_merge_no, rarefy_to, matching, otulist_withGG) { # rarefy ###phy_obj <- mtbold_surf.16s ###merge_by="Reveg_age_and_depth" ###rarefy_to=samp_size.compare ###min_merge_no=3 ###matching=matching ###otulist_withGG=yes_have_GG_Id seed <- 123+j r16s <- rarefy_even_depth(phy_obj, sample.size = rarefy_to, # sample.size = min(sample_sums(phy_obj)) rngseed = seed, replace = TRUE, trimOTUs = TRUE, verbose = TRUE) # Merge samples of the same type # to avoid bias in sample contributions, only include groups with three samples levs <- levels(as.factor( eval(parse(text= paste0("r16s@sam_data$",merge_by))) )) count <- numeric(length = length(levs)) for (i in 1:length(levs)) { sel <- which( eval(parse(text= paste0("r16s@sam_data$",merge_by))) == levs[i]) count[i] <- length(sel) } sel.rem <- which(count != min_merge_no) # 3 # remove samples not corresponding to a triplicate if (length(sel.rem)>0) {levs <- levs[-sel.rem]} # determine which samples to leave in for merging - i.e. triplicates only sel <- which( eval(parse(text= paste0("r16s@sam_data$",merge_by))) %in% levs) # subset samples sub.r16s <- subset_samples( samples = sample_names(r16s)[sel] , r16s ) # merge samples merged.r16s <- merge_samples(sub.r16s, group= eval(parse(text= paste0("sub.r16s@sam_data$",merge_by))) ) # note: merging converts $depth to 1 2 1 2 ..., and degrades variable $Reveg.age # now repair these merged.r16s@sam_data$depth[merged.r16s@sam_data$depth==1] <- "0-10 cm" merged.r16s@sam_data$depth[merged.r16s@sam_data$depth==2] <- "20-30 cm" merged.r16s@sam_data$Reveg_age <- NA for (i in 1:nsamples(merged.r16s)) { if (row.names(merged.r16s@sam_data)[i] == "10 years (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "10 years"} if (row.names(merged.r16s@sam_data)[i] == "10 years (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "10 years"} if (row.names(merged.r16s@sam_data)[i] == "6 years (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "6 years"} if (row.names(merged.r16s@sam_data)[i] == "6 years (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "6 years"} if (row.names(merged.r16s@sam_data)[i] == "7 years (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "7 years"} if (row.names(merged.r16s@sam_data)[i] == "7 years (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "7 years"} if (row.names(merged.r16s@sam_data)[i] == "8 years (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "8 years"} if (row.names(merged.r16s@sam_data)[i] == "8 years (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "8 years"} if (row.names(merged.r16s@sam_data)[i] == "Cleared (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Cleared"} if (row.names(merged.r16s@sam_data)[i] == "Cleared (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Cleared"} if (row.names(merged.r16s@sam_data)[i] == "Remnant A (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant A"} if (row.names(merged.r16s@sam_data)[i] == "Remnant A (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant A"} if (row.names(merged.r16s@sam_data)[i] == "Remnant B (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant B"} if (row.names(merged.r16s@sam_data)[i] == "Remnant B (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant B"} if (row.names(merged.r16s@sam_data)[i] == "Remnant C (0-10 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant C"} if (row.names(merged.r16s@sam_data)[i] == "Remnant C (20-30 cm)") {merged.r16s@sam_data$Reveg_age[i] <- "Remnant C"} } # rarefy back to min sample size for this bootstrap sample - i.e. rarefied-merged-rarefied seed <- 234+j rmr16s <- rarefy_even_depth(merged.r16s, sample.size = min(sample_sums(phy_obj)), rngseed = seed, replace = TRUE, trimOTUs = TRUE, verbose = TRUE) # - - - - - - - - - - ## record ntaxa before Picrust; and after Picrust record ntaxa and relabun covered ntaxa_0 <- ntaxa(rmr16s) # 3204 relabun_rmr16s <- transform_sample_counts(rmr16s, function(x) x / sum(x) ) #100*mean(sample_sums(relabun_rmr16s)) # 100 ### Picrust ## prune taxa - leave only those with GG Ids prune.rmr16s.gg <- prune_taxa(taxa = taxa_names(rmr16s) %in% otulist_withGG, x = rmr16s) ntaxa_withGG <- ntaxa(prune.rmr16s.gg) # 2193 relabun_rmr16s_withGG <- prune_taxa(taxa = taxa_names(relabun_rmr16s) %in% taxa_names(prune.rmr16s.gg), x = relabun_rmr16s) #relabun_cover_withGG <- 100*mean( sample_sums( relabun_rmr16s_withGG ) ) # 84% relabun_cover_withGG <- 100*sample_sums( relabun_rmr16s_withGG ) # 84% ## use table matching OTU Id to GG Id to assess/label duplicate GG Ids ## consider only OTUs that have GG Id sel <- which(matching$BASE_97_OTU_Id %in% otulist_withGG) # qty 10647 match <- matching[sel, ] ## now running Mt Bold only !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! sel <- which(match$dataset == "Mt Bold") # 2230 match <- match[sel, ] # BUT in this rarefied run some OTUs may be missing - find out which OTUs and GG Ids are here? sel <- which(match$BASE_97_OTU_Id %in% taxa_names(prune.rmr16s.gg)) # qty 2193 GG_Ids_in_this_run <- match$GG_Id[sel] ## estimate coverage from Picrust algorithm ## uses stored lookup table picrust_lookup <- themetagenomics:::picrust_otu(file_path="C:/Workspace/DATA/KEGG/ko_13_5_precalculated.tab.gz", otu_id_targets=GG_Ids_in_this_run) # Note: use ::: to access hidden function 'picrust_otu()' # file_path:- Path to the precalculated table; otu_id_targets:- Character vector of OTU IDs to predict ###GG_Ids_in_Picrust_this_run <- picrust_lookup$matches # length(picrust_lookup$matches) ## retain only sequences with NSTI <= 0.15 sel <- which(picrust_lookup$pimeta_table_out <= 0.15) GG_Ids_in_Picrust_this_run <- picrust_lookup$matches[sel] # so what is coverage from Picrust in this run? sel <- which(match$GG_Id %in% GG_Ids_in_Picrust_this_run) # otuid_covered_by_Picrust_this_run <- match$BASE_97_OTU_Id[sel] ntaxa_cover_by_Picrust <- length(otuid_covered_by_Picrust_this_run) relabun_rmr16s_cover_by_Picrust <- prune_taxa(taxa = taxa_names(relabun_rmr16s_withGG) %in% otuid_covered_by_Picrust_this_run, x = relabun_rmr16s_withGG) #relabun_cover_by_Picrust <- 100*mean( sample_sums( relabun_rmr16s_cover_by_Picrust ) ) relabun_cover_by_Picrust <- 100*sample_sums( relabun_rmr16s_cover_by_Picrust ) ## identify duplicate GG IDs - these taxa will need to be merged #dim(match) # 2230 6 sel.dup.gg <- which( duplicated(match$GG_Id) == TRUE) dups <- match[ sel.dup.gg , c("BASE_97_OTU_Id", "GG_Id")] # iterate through OTU IDs to rename with GG IDs, merging OTUs where necessary # (as some GG Ids are duplicated when representing different BASE OTU Ids) for (i in 1:dim(match)[1]) { #i<-1766 otuid <- match$BASE_97_OTU_Id[i] this_GG_Id <- match$GG_Id[i] # check this otuid is still in taxa_names() as taxa are being merged and renamed if (otuid %in% taxa_names(prune.rmr16s.gg)) { if (otuid %in% dups$BASE_97_OTU_Id) { # this means GG_Id for this OTU is a duplicate # i.e. an OTU with a smaller index has already been represented by the same GG_Id # this code relies on the order of match and taxa_names() being the same sel.dup <- which(match$GG_Id == this_GG_Id) taxa_rep_by_same_GG_Id <- match$BASE_97_OTU_Id[sel.dup] # get indices of taxa_names that have already or need to be converte dto that GG_Id sel <- which(taxa_names(prune.rmr16s.gg) %in% c(this_GG_Id, taxa_rep_by_same_GG_Id)) # check there are multiple taxa to merge if (length(sel) > 1) { prune.rmr16s.gg <- merge_taxa(prune.rmr16s.gg, taxa_names(prune.rmr16s.gg)[sel],archetype = 1) # because it is duplicate, archetype will need to be this_GG_Id (as 1st otuid will already be overwritten) # merging taxa should also overwrite taxa_name with GG_Id archetype } else { # this was expected to be a duplicate but rarefying has taken out the duplicates # overwrite taxa_name sel <- which(taxa_names(prune.rmr16s.gg)==otuid) taxa_names(prune.rmr16s.gg)[sel] <- this_GG_Id } # if not a duplicate } else { # overwrite taxa_name sel <- which(taxa_names(prune.rmr16s.gg)==otuid) taxa_names(prune.rmr16s.gg)[sel] <- this_GG_Id } print(paste0("completed ",i," of ",length(taxa_names(prune.rmr16s.gg)))) } } mb_gg_otu_table <- as.data.frame( t(prune.rmr16s.gg@otu_table) ) ##remove "." from sample names #names(mb_gg_otu_table) <- gsub(pattern="[.]", replacement="_",x=names(mb_gg_otu_table) ) names(mb_gg_otu_table) <- gsub(pattern="\\(0-10 cm\\)", replacement="10cm",x=names(mb_gg_otu_table) ) names(mb_gg_otu_table) <- gsub(pattern=" ", replacement="_",x=names(mb_gg_otu_table) ) #mb_gg_otu_table[1:5, 1:5] # # WITHOUT Copy Number Normalization # KEGG_fxn <- picrust(mb_gg_otu_table,rows_are_taxa=TRUE, # reference='gg_ko',reference_path="C:/Workspace/DATA/KEGG", # cn_normalize=FALSE,sample_normalize=FALSE, # !! cn_normalize=FALSE !! # drop=TRUE) # WITH Copy Number Normalization sel <- which(row.names(mb_gg_otu_table) %in% GG_Ids_in_Picrust_this_run) KEGG_fxn <- picrust(mb_gg_otu_table[sel, ], rows_are_taxa=TRUE, reference='gg_ko',reference_path="C:/Workspace/DATA/KEGG", cn_normalize=TRUE,sample_normalize=FALSE, # !! cn_normalize=TRUE !! drop=TRUE) # - - - - - - - - - - ### ALPHA DIVERSITY ## calculate Shannon's index #shan.rmr16s <- plot_richness(rmr16s, measures=c("Shannon")) shan <- vegan::diversity(x=KEGG_fxn$fxn_table, index = "shannon", MARGIN = 1, base = exp(1)) #shan eff_no <- exp(shan) #eff_no # also record weighted NSTI NSTI_scores <- t(KEGG_fxn$method_meta) %*% as.matrix(mb_gg_otu_table[row.names(KEGG_fxn$method_meta), ]) weighted_NSTI <- NSTI_scores/base::colSums( as.matrix(mb_gg_otu_table[row.names(KEGG_fxn$method_meta), ]) ) # export data out <- data.frame(sample=names(shan),boot_rep=j, shannon=as.numeric(shan), eff_no=as.numeric(eff_no), Reveg_age=substr(x=names(shan),start=1,stop=nchar(names(shan))-5), depth=substr(x=names(shan),start=nchar(names(shan))-3,stop=nchar(names(shan))), ntaxa_0=ntaxa_0, ntaxa_withGG=ntaxa_withGG, relabun_cover_withGG=relabun_cover_withGG, ntaxa_cover_by_Picrust=ntaxa_cover_by_Picrust, relabun_cover_by_Picrust=relabun_cover_by_Picrust, weighted_NSTI= as.numeric(weighted_NSTI) ) out$calc_type <- "bootstrap" return(out) } # # # # # # # # # # # # # bootstrap outputs will automatically be stored in a list: b_out # number of bootstrap resamples B <- 100 # iterate in parallel cl<-makeCluster( detectCores()-1 ) # detectCores()-1 registerDoParallel(cl) # remember to stopcluster() b_out <- foreach(j=1:B, .packages=c('phyloseq','themetagenomics','vegan')) %dopar% calc_FUNCTIONAL_AlphaDiv_in_parallel(phy_obj=mtbold_surf.16s, merge_by="Reveg_age_and_depth", min_merge_no=3, rarefy_to=samp_size.compare, matching=matching, otulist_withGG=yes_have_GG_Id ) stopCluster(cl) length(b_out) # 100 names(b_out[[1]]) # [1] "sample" "boot_rep" "shannon" "eff_no" # [5] "Reveg_age" "depth" "ntaxa_0" "ntaxa_withGG" # [9] "relabun_cover_withGG" "ntaxa_cover_by_Picrust" "relabun_cover_by_Picrust" "weighted_NSTI" # [13] "calc_type" dim(b_out[[1]]) # 8 13 b_out[[10]] # sample boot_rep shannon eff_no Reveg_age depth ntaxa_0 ntaxa_withGG relabun_cover_withGG # 10 years (0-10 cm) 10_years_10cm 10 7.482337 1776.388 10_years 10cm 3170 1553 73.40074 # 6 years (0-10 cm) 6_years_10cm 10 7.537026 1876.241 6_years 10cm 3170 1553 73.45428 # 7 years (0-10 cm) 7_years_10cm 10 7.543722 1888.847 7_years 10cm 3170 1553 73.48401 # 8 years (0-10 cm) 8_years_10cm 10 7.536250 1874.786 8_years 10cm 3170 1553 75.77695 # Cleared (0-10 cm) Cleared_10cm 10 7.598229 1994.660 Cleared 10cm 3170 1553 79.35465 # Remnant A (0-10 cm) Remnant_A_10cm 10 7.520135 1844.816 Remnant_A 10cm 3170 1553 73.71599 # Remnant B (0-10 cm) Remnant_B_10cm 10 7.494747 1798.569 Remnant_B 10cm 3170 1553 73.98067 # Remnant C (0-10 cm) Remnant_C_10cm 10 7.538628 1879.249 Remnant_C 10cm 3170 1553 75.97621 # ntaxa_cover_by_Picrust relabun_cover_by_Picrust weighted_NSTI calc_type # 10 years (0-10 cm) 520 26.59331 0.08682520 bootstrap # 6 years (0-10 cm) 520 25.32045 0.08492327 bootstrap # 7 years (0-10 cm) 520 27.28327 0.08948583 bootstrap # 8 years (0-10 cm) 520 25.29071 0.08961406 bootstrap # Cleared (0-10 cm) 520 24.33011 0.08354318 bootstrap # Remnant A (0-10 cm) 520 21.79628 0.09843134 bootstrap # Remnant B (0-10 cm) 520 21.98067 0.09891074 bootstrap # Remnant C (0-10 cm) 520 24.57993 0.09057515 bootstrap b_out__alpha_div_functional_mt_bold_NSTI_cutoff <- b_out getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" saveRDS(b_out__alpha_div_functional_mt_bold_NSTI_cutoff, file = "b_out__alpha_div_functional_mt_bold_NSTI_cutoff.RDS") #b_out <- readRDS("b_out__alpha_div_functional_mt_bold_NSTI_cutoff.RDS") rm(b_out__alpha_div_functional_mt_bold_NSTI_cutoff) ### For plotting, 1st calculate diversity from one rarefying step ### then append bootstrap-derived uncertainty # rarefy #1 phy_obj <- mtbold_surf.16s otulist_withGG <- yes_have_GG_Id seed <- 123 r1.16s <- rarefy_even_depth(phy_obj, sample.size = samp_size.compare, rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) # - - - - - - - - - - ## record ntaxa before Picrust; and after Picrust record ntaxa and relabun covered ntaxa_0 <- ntaxa(r1.16s) # 3200 relabun_1r16s <- transform_sample_counts(r1.16s, function(x) x / sum(x) ) #100*mean(sample_sums(relabun_1r16s)) # 100 ### Picrust ## prune taxa - leave only those with GG Ids prune.1r16s.gg <- prune_taxa(taxa = taxa_names(r1.16s) %in% otulist_withGG, x = r1.16s) ntaxa_withGG <- ntaxa(prune.1r16s.gg) # 1565 relabun_1r16s_withGG <- prune_taxa(taxa = taxa_names(relabun_1r16s) %in% taxa_names(prune.1r16s.gg), x = relabun_1r16s) relabun_cover_withGG <- 100*sample_sums( relabun_1r16s_withGG ) # ~70-80% relabun_cover_withGG ## use table matching OTU Id to GG Id to assess/label duplicate GG Ids ## consider only OTUs that have GG Id sel <- which(matching$BASE_97_OTU_Id %in% otulist_withGG) # qty 6981 match <- matching[sel, ] ## now running Mt Bold only !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! sel <- which(match$dataset == "Mt Bold") # 1577 match <- match[sel, ] # BUT in this rarefied run some OTUs may be missing - find out which OTUs and GG Ids are here? sel <- which(match$BASE_97_OTU_Id %in% taxa_names(prune.1r16s.gg)) # qty 1565 GG_Ids_in_this_run <- match$GG_Id[sel] ## estimate coverage from Picrust algorithm ## uses stored lookup table picrust_lookup <- themetagenomics:::picrust_otu(file_path="C:/Workspace/DATA/KEGG/ko_13_5_precalculated.tab.gz", otu_id_targets=GG_Ids_in_this_run) # Note: use ::: to access hidden function 'picrust_otu()' # file_path:- Path to the precalculated table; otu_id_targets:- Character vector of OTU IDs to predict ###GG_Ids_in_Picrust_this_run <- picrust_lookup$matches # length(picrust_lookup$matches) # 1383 ## retain only sequences with NSTI <= 0.15 sel <- which(picrust_lookup$pimeta_table_out <= 0.15) # qty 496 GG_Ids_in_Picrust_this_run <- picrust_lookup$matches[sel] # so what is coverage from Picrust in this run? sel <- which(match$GG_Id %in% GG_Ids_in_Picrust_this_run) # qty 522 (expect some duplication) otuid_covered_by_Picrust_this_run <- match$BASE_97_OTU_Id[sel] ntaxa_cover_by_Picrust <- length(otuid_covered_by_Picrust_this_run) # 522 relabun_1r16s_cover_by_Picrust <- prune_taxa(taxa = taxa_names(relabun_1r16s_withGG) %in% otuid_covered_by_Picrust_this_run, x = relabun_1r16s_withGG) relabun_cover_by_Picrust <- 100*sample_sums( relabun_1r16s_cover_by_Picrust ) relabun_cover_by_Picrust # ~21-27% ## identify duplicate GG IDs - these taxa will need to be merged sel.dup.gg <- which( duplicated(match$GG_Id) == TRUE) # qty 72 dups <- match[ sel.dup.gg , c("BASE_97_OTU_Id", "GG_Id")] # iterate through OTU IDs to rename with GG IDs, merging OTUs where necessary # (as some GG Ids are duplicated to represent different BASE OTU Ids) for (i in 1:dim(match)[1]) { # otuid <- match$BASE_97_OTU_Id[i] this_GG_Id <- match$GG_Id[i] # check this otuid is still in taxa_names() as taxa are being merged and renamed if (otuid %in% taxa_names(prune.1r16s.gg)) { if (otuid %in% dups$BASE_97_OTU_Id) { # this means GG_Id for this OTU is a duplicate # i.e. an OTU with a smaller index has already been represented by the same GG_Id # this code relies on the order of match and taxa_names() being the same sel.dup <- which(match$GG_Id == this_GG_Id) taxa_rep_by_same_GG_Id <- match$BASE_97_OTU_Id[sel.dup] # get indices of taxa_names that have already or need to be converted to that GG_Id sel <- which(taxa_names(prune.1r16s.gg) %in% c(this_GG_Id, taxa_rep_by_same_GG_Id)) # check there are multiple taxa to merge if (length(sel) > 1) { prune.1r16s.gg <- merge_taxa(prune.1r16s.gg, taxa_names(prune.1r16s.gg)[sel],archetype = 1) # because it is duplicate, archetype will need to be this_GG_Id (as 1st otuid will already be overwritten) # merging taxa should also overwrite taxa_name with GG_Id archetype } else { # this was expected to be a duplicate but rarefying has taken out the duplicates # overwrite taxa_name sel <- which(taxa_names(prune.1r16s.gg)==otuid) taxa_names(prune.1r16s.gg)[sel] <- this_GG_Id } # if not a duplicate } else { # overwrite taxa_name sel <- which(taxa_names(prune.1r16s.gg)==otuid) taxa_names(prune.1r16s.gg)[sel] <- this_GG_Id } print(paste0("completed ",i," of ",length(taxa_names(prune.1r16s.gg)))) } } # for one-off rarefying don't need to transpose dataframe - because has rows as taxa mb_gg_otu_table <- as.data.frame( prune.1r16s.gg@otu_table ) ##remove "." from sample names names(mb_gg_otu_table) <- gsub(pattern="[.]", replacement="_",x=names(mb_gg_otu_table) ) ##names(mb_gg_otu_table) <- gsub(pattern="\\(0-10 cm\\)", replacement="10cm",x=names(mb_gg_otu_table) ) ##names(mb_gg_otu_table) <- gsub(pattern=" ", replacement="_",x=names(mb_gg_otu_table) ) #mb_gg_otu_table[1:5, 1:5] # WITH Copy Number Normalization sel <- which(row.names(mb_gg_otu_table) %in% GG_Ids_in_Picrust_this_run) KEGG_fxn <- picrust(mb_gg_otu_table[sel, ], rows_are_taxa=TRUE, reference='gg_ko',reference_path="C:/Workspace/DATA/KEGG", cn_normalize=TRUE,sample_normalize=FALSE, # !! cn_normalize=TRUE !! drop=TRUE) # - - - - - - - - - - ### ALPHA DIVERSITY ## calculate Shannon's index #shan.rmr16s <- plot_richness(rmr16s, measures=c("Shannon")) dim(KEGG_fxn$fxn_table) # [1] 24 4988 shan <- vegan::diversity(x=KEGG_fxn$fxn_table, index = "shannon", MARGIN = 1, base = exp(1)) #shan eff_no <- exp(shan) #eff_no # also record weighted NSTI NSTI_scores <- t(KEGG_fxn$method_meta) %*% as.matrix(mb_gg_otu_table[row.names(KEGG_fxn$method_meta), ]) weighted_NSTI <- NSTI_scores/base::colSums( as.matrix(mb_gg_otu_table[row.names(KEGG_fxn$method_meta), ]) ) # export data out <- data.frame(sample=names(shan),boot_rep=0, shannon=as.numeric(shan), eff_no=as.numeric(eff_no), #Reveg_age=substr(x=names(shan),start=1,stop=nchar(names(shan))-5), Reveg_age=c( "6 years", "6 years", "6 years", "Cleared", "Cleared", "Cleared", "10 years", "10 years", "10 years", "7 years", "7 years", "7 years", "8 years", "8 years", "8 years", "Remnant A", "Remnant A", "Remnant A", "Remnant B", "Remnant B", "Remnant B", "Remnant C", "Remnant C", "Remnant C" ), #depth=substr(x=names(shan),start=nchar(names(shan))-3,stop=nchar(names(shan))), depth="10cm", ntaxa_0=ntaxa_0, ntaxa_withGG=ntaxa_withGG, relabun_cover_withGG=relabun_cover_withGG, ntaxa_cover_by_Picrust=ntaxa_cover_by_Picrust, relabun_cover_by_Picrust=relabun_cover_by_Picrust, weighted_NSTI= as.numeric(weighted_NSTI) ) out$calc_type <- "rarefyx1" str(out) out$Reveg_age <- factor(out$Reveg_age, levels = c("Cleared", "6 years", "7 years", "8 years", "10 years", "Remnant A", "Remnant B", "Remnant C"), ordered = TRUE) str(out) # 'data.frame': 24 obs. of 13 variables: # $ sample : Factor w/ 24 levels "2005_1_10","2005_2_10",..: 10 11 12 22 23 24 1 2 3 7 ... # $ boot_rep : num 0 0 0 0 0 0 0 0 0 0 ... # $ shannon : num 7.53 7.55 7.53 7.57 7.65 ... # $ eff_no : num 1855 1897 1862 1937 2095 ... # $ Reveg_age : Ord.factor w/ 8 levels "Cleared"<"6 years"<..: 2 2 2 1 1 1 5 5 5 3 ... # $ depth : Factor w/ 1 level "10cm": 1 1 1 1 1 1 1 1 1 1 ... # $ ntaxa_0 : int 3200 3200 3200 3200 3200 3200 3200 3200 3200 3200 ... # $ ntaxa_withGG : int 1565 1565 1565 1565 1565 1565 1565 1565 1565 1565 ... # $ relabun_cover_withGG : num 73 76.1 72.4 76.3 85.6 ... # $ ntaxa_cover_by_Picrust : int 522 522 522 522 522 522 522 522 522 522 ... # $ relabun_cover_by_Picrust: num 27 23.8 24 23.2 25.9 ... # $ weighted_NSTI : num 0.0855 0.0848 0.0855 0.0865 0.076 ... # $ calc_type : chr "rarefyx1" "rarefyx1" "rarefyx1" "rarefyx1" ... names(out) # [1] "sample" "boot_rep" "shannon" "eff_no" # [5] "Reveg_age" "depth" "ntaxa_0" "ntaxa_withGG" # [9] "relabun_cover_withGG" "ntaxa_cover_by_Picrust" "relabun_cover_by_Picrust" "weighted_NSTI" # [13] "calc_type" names(b_out[[1]]) # [1] "sample" "boot_rep" "shannon" "eff_no" # [5] "Reveg_age" "depth" "ntaxa_0" "ntaxa_withGG" # [9] "relabun_cover_withGG" "ntaxa_cover_by_Picrust" "relabun_cover_by_Picrust" "weighted_NSTI" # [13] "calc_type" dim(b_out[[1]]) # 8 13 temp <- out row.names(temp) <- NULL for (j in 1:B) { #j<-1 new <- b_out[[j]] row.names(new) <- NULL # minor edit to Reveg Age label new$Reveg_age <- gsub(pattern="_", replacement=" ",x=new$Reveg_age ) temp <- rbind(temp,new) } head(temp) temp[1:50, ] tail(temp) names(temp) # [1] "sample" "boot_rep" "shannon" "eff_no" # [5] "Reveg_age" "depth" "ntaxa_0" "ntaxa_withGG" # [9] "relabun_cover_withGG" "ntaxa_cover_by_Picrust" "relabun_cover_by_Picrust" "weighted_NSTI" # [13] "calc_type" ## quantify weighted NSTI ?? hist(temp$weighted_NSTI) summary(temp$weighted_NSTI) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 0.07605 0.08608 0.08956 0.09037 0.09555 0.10314 # Weighted NSTI median 0.090 (range 0.076-0.103) ##melt.out <- melt(temp,id.vars = c("Reveg_age","depth","calc_type"), measure.vars = "eff_no") melt.out <- melt(temp,id.vars = c("Reveg_age","calc_type"), measure.vars = "eff_no") ## plot ## [finished-plot] # # # # # # # # # # # # # ## apply same standard colours ("cols") cols <- c("Cleared" = "#e31a1c", "6 years" = "#addd8e", "7 years" = "#78c679", "8 years" = "#41ab5d", "10 years"= "#238443", "Remnant A" = "#4292c6", "Remnant B" = "#2171b5", "Remnant C" = "#084594") p <- ggplot(data=melt.out, aes(x=Reveg_age, value)) + ggtitle("(b)") + geom_violin(data = melt.out[ which(melt.out$calc_type == "bootstrap"), ], aes(color = Reveg_age) ) + scale_colour_manual(values = cols) + geom_point(data = melt.out[ which(melt.out$calc_type == "rarefyx1"), ], color="gray20", shape=1 ) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5) ) + theme(axis.title.x = element_blank()) + labs(y = "Effective KEGG functions (count)") + #x = "Reveg age", theme(legend.position="none") + scale_x_discrete(labels=c("Cleared" = "Clear", "6 years" = "6 yr", "7 years" = "7 yr", "8 years" = "8 yr", "10 years"= "10 yr", "Remnant A" = "Rem A", "Remnant B" = "Rem B", "Remnant C" = "Rem C")) p ggsave(plot=p, filename = paste0("finished-plots/","Alpha-Div-Eff-No-KEGG-Functions-MtBold-16S-TIDY-NSTI-cutoff-vFINAL.tiff"), width = 6, height = 8, units = "cm", dpi = 600, compression = "lzw") ## now plot % Reads covered by GG Ids and PICRUSt melt.out <- melt(temp,id.vars = c("Reveg_age","calc_type"), measure.vars = c("relabun_cover_withGG", "relabun_cover_by_Picrust")) melt.out <- melt.out[ -which(melt.out$calc_type == "rarefyx1") , ] melt.out[1:10, ] tail(melt.out) lbl1 <- "% sequences with\nGreengenes Ids" lbl2 <- paste0("% sequences covered in\nPICRUSt (where NSTI ",expression('\u2264'), " 0.15)") p <- ggplot(data=melt.out, aes(x=Reveg_age, value)) + ggtitle("(c)") + #geom_violin(data = melt.out[ which(melt.out$calc_type == "bootstrap"), ], aes(color = Reveg_age) ) + scale_colour_manual(values = cols) + #geom_point(data = melt.out[ which(melt.out$calc_type == "rarefyx1"), ], color="gray20", shape=1 ) + geom_jitter(data = melt.out[ which(melt.out$variable == "relabun_cover_withGG"), ],height=0,width=0.2, aes(color = Reveg_age) , shape=6, alpha=0.2 ) + #scale_colour_manual(values = cols) + geom_jitter(data = melt.out[ which(melt.out$variable == "relabun_cover_by_Picrust"), ],height=0,width=0.2, aes(color = Reveg_age) , shape=0, alpha=0.2 ) + scale_colour_manual(values = cols) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5) ) + theme(axis.title.x = element_blank()) + labs(y = "Sequence coverage (%)") + # x = "Reveg age", theme(legend.position="none") + geom_text(x=1, y=70, label=lbl1, hjust=0, vjust=1, size =3) + geom_text(x=1, y=30, label=lbl2, hjust=0, vjust=0, size =3) + scale_x_discrete(labels=c("Cleared" = "Clear", "6 years" = "6 yr", "7 years" = "7 yr", "8 years" = "8 yr", "10 years"= "10 yr", "Remnant A" = "Rem A", "Remnant B" = "Rem B", "Remnant C" = "Rem C")) p ggsave(plot=p, filename = paste0("finished-plots/","Percent-Sequences-covered-by-GG-Ids-and-Picrust-estimate-NSTI-cutoff-vFINAL.tiff"), width = 6, height = 8, units = "cm", dpi = 600, compression = "lzw") # # # # # # # # # # # # # #------------------------- #### Alpha diversity OTUs - Aust-wide selection #------------------------- ### Bootstrap resampling ... ## create function to rarefy for initial sample > merge samples by type > rarefy again > calculate shannon's index # # # # # # # # # # # # calc_AlphaDiv_in_parallel_AUST_SELECT <- function(phy_obj, merge_by, min_merge_no, rarefy_to ) { # rarefy ###phy_obj <- phy.aust_select ###merge_by="Collection.Site" ###rarefy_to=samp_size.compare ###min_merge_no=3 seed <- 123+j r16s <- rarefy_even_depth(phy_obj, sample.size = rarefy_to, rngseed = seed, replace = TRUE, trimOTUs = TRUE, verbose = TRUE) # Merge samples of the same type # to avoid bias in sample contributions, only include groups with three samples levs <- levels(as.factor( eval(parse(text= paste0("r16s@sam_data$",merge_by))) )) count <- numeric(length = length(levs)) for (i in 1:length(levs)) { sel <- which( eval(parse(text= paste0("r16s@sam_data$",merge_by))) == levs[i]) count[i] <- length(sel) } sel.rem <- which(count != min_merge_no) # 3 # remove samples not corresponding to a triplicate if (length(sel.rem)>0) {levs <- levs[-sel.rem]} # determine which samples to leave in for merging - i.e. triplicates only sel <- which( eval(parse(text= paste0("r16s@sam_data$",merge_by))) %in% levs) # subset samples sub.r16s <- subset_samples( samples = sample_names(r16s)[sel] , r16s ) # merge samples merged.r16s <- merge_samples(sub.r16s, group= eval(parse(text= paste0("sub.r16s@sam_data$",merge_by))) ) # note: merging introduces NAs for many variables # rarefy back to min sample size for this bootstrap sample - i.e. rarefied-merged-rarefied seed <- 234+j rmr16s <- rarefy_even_depth(merged.r16s, sample.size = rarefy_to, rngseed = seed, replace = TRUE, trimOTUs = TRUE, verbose = TRUE) # - - - - - - - - - - ### ALPHA DIVERSITY ## calculate Shannon's index shan.rmr16s <- plot_richness(rmr16s, measures=c("Shannon")) # export data # out <- data.frame(sample=shan.rmr16s$data$samples,shannon=shan.rmr16s$data$value, # Reveg_age=shan.rmr16s$data$Reveg_age, depth=shan.rmr16s$data$depth) out <- data.frame(sample=shan.rmr16s$data$samples,shannon=shan.rmr16s$data$value) out$eff_no <- exp(out$shannon) out$calc_type <- "bootstrap" return(out) } # # # # # # # # # # # # # bootstrap outputs will automatically be stored in a list: b_out # number of bootstrap resamples B <- 100 # iterate in parallel cl<-makeCluster( detectCores()-1 ) # detectCores()-1 registerDoParallel(cl) # remember to stopcluster() b_out <- foreach(j=1:B, .packages=c('phyloseq')) %dopar% calc_AlphaDiv_in_parallel_AUST_SELECT(phy_obj=phy.aust_select, merge_by="Collection.Site", min_merge_no=3, rarefy_to=samp_size.compare) stopCluster(cl) length(b_out) names(b_out[[1]]) # "sample" "shannon" "eff_no" "calc_type" dim(b_out[[1]]) # 13 4 b_out__alpha_div_aust_select <- b_out getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" saveRDS(b_out__alpha_div_aust_select, file = "b_out__alpha_div_aust_select.RDS") #b_out <- readRDS("b_out__alpha_div_aust_select.RDS") #rm(b_out__alpha_div_aust_select) ### For plotting, 1st calculate diversity from one rarefying step ### then append bootstrap-derived uncertainty # rarefy #1 phy_obj <- phy.aust_select seed <- 123 r1.16s <- rarefy_even_depth(phy_obj, sample.size = samp_size.compare, rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) shan.r1.16s <- plot_richness(r1.16s, measures=c("Shannon")) out <- data.frame(sample=shan.r1.16s$data$samples,shannon=shan.r1.16s$data$value) # out$eff_no <- exp(out$shannon) # calculate effective no of species out$calc_type <- "rarefyx1" str(out) # 'data.frame': 39 obs. of 4 variables: # $ sample : Factor w/ 39 levels "X12428","X12430",..: 1 2 3 4 5 6 7 8 9 10 ... # $ shannon : num 6.48 6.41 6.41 6.28 5.94 ... # $ eff_no : num 649 609 606 536 380 ... # $ calc_type: chr "rarefyx1" "rarefyx1" "rarefyx1" "rarefyx1" ... row.names(out) # [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" # [26] "26" "27" "28" "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" out$sample # [1] X12428 X12430 X12438 X12509 X12511 X12525 X12560 X12572 X12616 X12620 X12624 X12816 X12818 X12819 X13272 X13274 X13282 X7833 # [19] X7837 X7851 X8082 X8084 X8099 X8116 X8118 X8130 X8182 X8192 X8198 X8220 X8262 X8266 X8270 X8272 X8274 X8280 # [37] X8284 X8286 X9442 # 39 Levels: X12428 X12430 X12438 X12509 X12511 X12525 X12560 X12572 X12616 X12620 X12624 X12816 X12818 X12819 X13272 ... X9442 str(out) sel.samps # Blackheath (NSW) apple trees1 Blackheath (NSW) apple trees2 Blackheath (NSW) apple trees3 Buckley Swamp (Vic) pasture1 # "X12616" "X12620" "X12624" "X8280" # Buckley Swamp (Vic) pasture2 Buckley Swamp (Vic) pasture3 King Island (Tas) pasture1 King Island (Tas) pasture2 # "X8284" "X8286" "X13272" "X13282" # King Island (Tas) pasture3 Longerenong (Vic) wheat1 Longerenong (Vic) wheat2 Longerenong (Vic) wheat3 # "X13274" "X8270" "X8272" "X8274" # Mackay (Qld) sugar1 Mackay (Qld) sugar2 Mackay (Qld) sugar3 Narrabri (NSW) cotton1 # "X8192" "X8198" "X8220" "X12509" # Narrabri (NSW) cotton2 Narrabri (NSW) cotton3 Walpeup (Vic) wheat1 Walpeup (Vic) wheat2 # "X12525" "X12511" "X8182" "X8262" # Walpeup (Vic) wheat3 Cape Tribulation NP (Qld)1 Cape Tribulation NP (Qld)2 Cape Tribulation NP (Qld)3 # "X8266" "X12816" "X12818" "X12819" # Booderee NP (NSW)1 Booderee NP (NSW)2 Booderee NP (NSW)3 Fitzgerald River NP (WA)1 # "X7833" "X7851" "X7837" "X8116" # Fitzgerald River NP (WA)2 Fitzgerald River NP (WA)3 Freycinet NP (Tas)1 Freycinet NP (Tas)2 # "X8130" "X8118" "X12428" "X12438" # Freycinet NP (Tas)3 Mt Lesueur NP (WA)1 Mt Lesueur NP (WA)2 Mt Lesueur NP (WA)3 # "X12430" "X8082" "X8099" "X8084" # Namadgi NP (ACT)1 Namadgi NP (ACT)2 Namadgi NP (ACT)3 # "X12560" "X9442" "X12574" # use $sample to store Collection Site (as per bootstrap results) # use row.name to preserve sample ID out$sample <- as.character(out$sample) row.names(out) <- out$sample names(aust_select) #[1] "Blackheath (NSW) apple trees" "Buckley Swamp (Vic) pasture" "King Island (Tas) pasture" "Longerenong (Vic) wheat" # [5] "Mackay (Qld) sugar" "Narrabri (NSW) cotton" "Walpeup (Vic) wheat" "Cape Tribulation NP (Qld)" # [9] "Booderee NP (NSW)" "Fitzgerald River NP (WA)" "Freycinet NP (Tas)" "Mt Lesueur NP (WA)" # [13] "Namadgi NP (ACT)" for (i in 1:dim(out)[1]) { #i<-1 if (row.names(out)[i] %in% row.names(aust_select[["Blackheath (NSW) apple trees"]])) {out$sample[i] <- "Blackheath"} if (row.names(out)[i] %in% row.names(aust_select[["Buckley Swamp (Vic) pasture"]])) {out$sample[i] <- "Buckley_Swamp"} if (row.names(out)[i] %in% row.names(aust_select[["King Island (Tas) pasture"]])) {out$sample[i] <- "King_Island"} if (row.names(out)[i] %in% row.names(aust_select[["Longerenong (Vic) wheat"]])) {out$sample[i] <- "Longerenong"} if (row.names(out)[i] %in% row.names(aust_select[["Mackay (Qld) sugar"]])) {out$sample[i] <- "Mackay"} if (row.names(out)[i] %in% row.names(aust_select[["Narrabri (NSW) cotton"]])) {out$sample[i] <- "Narrabri"} if (row.names(out)[i] %in% row.names(aust_select[["Walpeup (Vic) wheat"]])) {out$sample[i] <- "Walpeup"} if (row.names(out)[i] %in% row.names(aust_select[["Cape Tribulation NP (Qld)"]])) {out$sample[i] <- "Cape_Tribulation"} if (row.names(out)[i] %in% row.names(aust_select[["Booderee NP (NSW)"]])) {out$sample[i] <- "Booderee"} if (row.names(out)[i] %in% row.names(aust_select[["Fitzgerald River NP (WA)"]])) {out$sample[i] <- "Fitzgerald_River_NP"} if (row.names(out)[i] %in% row.names(aust_select[["Freycinet NP (Tas)"]])) {out$sample[i] <- "Freycinet_NP"} if (row.names(out)[i] %in% row.names(aust_select[["Mt Lesueur NP (WA)"]])) {out$sample[i] <- "Mt_Lesueur_NP"} if (row.names(out)[i] %in% row.names(aust_select[["Namadgi NP (ACT)"]])) {out$sample[i] <- "Namadgi_NP"} } # check assignment of sample names? - inspect these objects: View(out) sel.samps # check these match ? names(out) # "sample" "shannon" "eff_no" "calc_type" names(b_out[[1]]) # "sample" "shannon" "eff_no" "calc_type" dim(b_out[[1]]) # 13 4 b_out[[1]] temp <- out for (j in 1:B) { #j<-1 new <- b_out[[j]] new$sample <- gsub(pattern = " ", replacement = "_", x = new$sample) temp <- rbind(temp,new) } head(temp) temp[1:50, ] tail(temp) names(temp) # "sample" "shannon" "eff_no" "calc_type" # define ordered factor for Collection site based on order of results dim(temp) # 1339 4 # evaluate mean diversity values - and use this for ordering display results meanDiv <- numeric(length=length(unique(temp$sample))) unique(temp$sample) # [1] "Freycinet_NP" "Narrabri" "Namadgi_NP" "Blackheath" "Cape_Tribulation" # [6] "King_Island" "Booderee" "Mt_Lesueur_NP" "Fitzgerald_River_NP" "Walpeup" # [11] "Mackay" "Longerenong" "Buckley_Swamp" names(meanDiv) <- unique(temp$sample) for (i in 1:length(unique(temp$sample)) ) { #i<-1 sel <- which(temp$sample==names(meanDiv)[i]) meanDiv[i] <- mean(temp$eff_no[sel]) } meanDiv # Freycinet_NP Narrabri Namadgi_NP Blackheath Cape_Tribulation King_Island # 774.4416 529.6657 739.3113 915.9186 272.2211 823.8873 # Booderee Mt_Lesueur_NP Fitzgerald_River_NP Walpeup Mackay Longerenong # 425.7073 752.1421 586.8354 451.8388 701.7904 419.6312 # Buckley_Swamp # 762.7759 meanDiv[ order(meanDiv, decreasing = FALSE) ] # Cape_Tribulation Longerenong Booderee Walpeup Narrabri Fitzgerald_River_NP # 272.2211 419.6312 425.7073 451.8388 529.6657 586.8354 # Mackay Namadgi_NP Mt_Lesueur_NP Buckley_Swamp Freycinet_NP King_Island # 701.7904 739.3113 752.1421 762.7759 774.4416 823.8873 # Blackheath # 915.9186 names(meanDiv[ order(meanDiv, decreasing = FALSE) ]) # [1] "Cape_Tribulation" "Longerenong" "Booderee" "Walpeup" "Narrabri" # [6] "Fitzgerald_River_NP" "Mackay" "Namadgi_NP" "Mt_Lesueur_NP" "Buckley_Swamp" # [11] "Freycinet_NP" "King_Island" "Blackheath" temp$sample <- factor(temp$sample, levels = names(meanDiv[ order(meanDiv, decreasing = FALSE) ]), labels = c( "Cape Tribulation (NP)", "Longerenong (wheat)", "Booderee (NP)", "Walpeup (wheat)", "Narrabri (cotton)", "Fitzgerald River (NP)", "Mackay (sugar)", "Namadgi (NP)", "Mt Lesueur (NP)", "Buckley Swamp (pasture)", "Freycinet (NP)", "King Island (pasture)","Blackheath (apples)" ), ordered = TRUE) temp$alt_vs_nat <- NA sel <- which(temp$sample == "Blackheath (apples)") temp$alt_vs_nat[sel] <- "altered" sel <- which(temp$sample == "Booderee (NP)") temp$alt_vs_nat[sel] <- "natural" sel <- which(temp$sample == "Buckley Swamp (pasture)") temp$alt_vs_nat[sel] <- "altered" sel <- which(temp$sample == "Cape Tribulation (NP)") temp$alt_vs_nat[sel] <- "natural" sel <- which(temp$sample == "Fitzgerald River (NP)") temp$alt_vs_nat[sel] <- "natural" sel <- which(temp$sample == "Freycinet (NP)") temp$alt_vs_nat[sel] <- "natural" sel <- which(temp$sample == "King Island (pasture)") temp$alt_vs_nat[sel] <- "altered" sel <- which(temp$sample == "Longerenong (wheat)") temp$alt_vs_nat[sel] <- "altered" sel <- which(temp$sample == "Mackay (sugar)") temp$alt_vs_nat[sel] <- "altered" sel <- which(temp$sample == "Mt Lesueur (NP)") temp$alt_vs_nat[sel] <- "natural" sel <- which(temp$sample == "Namadgi (NP)") temp$alt_vs_nat[sel] <- "natural" sel <- which(temp$sample == "Narrabri (cotton)") temp$alt_vs_nat[sel] <- "altered" sel <- which(temp$sample == "Walpeup (wheat)") temp$alt_vs_nat[sel] <- "altered" melt.out <- melt(temp,id.vars = c("sample","calc_type", "alt_vs_nat"), measure.vars = "eff_no") ## plot ## [finished-plot] # # # # # # # # # # # # # cols <- c("altered" = "#F8766D", "natural" = "#00BFC4") p <- ggplot(data=melt.out, aes(x=sample, value)) + ggtitle("(a)") + geom_violin(data = melt.out[ which(melt.out$calc_type == "bootstrap"), ], aes(color = alt_vs_nat) ) + scale_colour_manual(values = cols) + geom_point(data = melt.out[ which(melt.out$calc_type == "rarefyx1"), ], color="gray20", shape=1 ) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1) ) + labs(x = NULL, y = "Effective OTUs (count)") + theme(legend.position="none") ## add legend to rhs of next plot p ggsave(plot=p, filename = paste0("finished-plots/","Alpha-Div-Eff-No-OTUs-16S-ALT-vs_NAT-vFINAL.tiff"), width = 8, height = 10, units = "cm", dpi = 600, compression = "lzw") # pp <- p + theme(axis.title.x=element_blank(), # axis.text.x=element_blank()) # pp # # ggsave(plot=pp, filename = paste0("finished-plots/","Alpha-Div-Eff-No-species-16S-ALT-vs_NAT-NO-XLABS.tiff"), width = 8, height = 6, units = "cm", dpi = 600, compression = "lzw") # # # # # # # # # # # # # # #------------------------- #### Alpha diversity of functions - Aust-wide selection #------------------------- ## use essentially the same algorithm as above - however remove redundant features to handle depth and Reveg age #samp_size.compare <- min( c( min(sample_sums(phy.aust_select)) , min(sample_sums(mtbold_surf.16s)) ) ) samp_size.compare # 17614 phy.aust_select@sam_data[ order(phy.aust_select@sam_data$`Collection.Site`) , c("Collection.Site","Detailed.Land.Use", "alt_vs_nat")] # Sample Data: [39 samples by 3 sample variables]: # Collection.Site Detailed.Land.Use alt_vs_nat # X12616 Blackheath Tree fruits -apple altered # X12620 Blackheath Tree fruits -apple altered # X12624 Blackheath Tree fruits -apple altered # X7833 Booderee National Park natural # X7837 Booderee National Park natural # X7851 Booderee National Park natural # X8280 Buckley Swamp Pasture legume/grass mixtures altered # X8284 Buckley Swamp Pasture legume/grass mixtures altered # X8286 Buckley Swamp Pasture legume/grass mixtures altered # X12816 Cape Tribulation National Park natural # X12818 Cape Tribulation National Park natural # X12819 Cape Tribulation National Park natural # X8116 Fitzgerald River NP National Park natural # X8118 Fitzgerald River NP National Park natural # X8130 Fitzgerald River NP National Park natural # X12428 Freycinet NP National Park natural # X12430 Freycinet NP National Park natural # X12438 Freycinet NP National Park natural # X13272 King Island Pasture legume/grass mixtures altered # X13274 King Island Pasture legume/grass mixtures altered # X13282 King Island Pasture legume/grass mixtures altered # X8270 Longerenong Cereals -wheat altered # X8272 Longerenong Cereals -wheat altered # X8274 Longerenong Cereals -wheat altered # X8192 Mackay sugar altered # X8198 Mackay sugar altered # X8220 Mackay sugar altered # X8082 Mt Lesueur NP National Park natural # X8084 Mt Lesueur NP National Park natural # X8099 Mt Lesueur NP National Park natural # X12560 Namadgi NP National Park natural # X12574 Namadgi NP National Park natural # X9442 Namadgi NP National Park natural # X12509 Narrabri cotton altered # X12511 Narrabri cotton altered # X12525 Narrabri cotton altered # X8182 Walpeup Cereals -wheat altered # X8262 Walpeup Cereals -wheat altered # X8266 Walpeup Cereals -wheat altered ### Bootstrap resampling ... ## create function to rarefy for initial sample > merge samples by type > rarefy again > calculate shannon's index # # # # # # # # # # # # calc_FUNCTIONAL_AlphaDiv_in_parallel_AUST_SELECT <- function(phy_obj, merge_by, min_merge_no, rarefy_to, matching, otulist_withGG) { # rarefy ###phy_obj <- phy.aust_select ###merge_by="Collection.Site" ###rarefy_to=samp_size.compare ###min_merge_no=3 ###matching=matching ###otulist_withGG=yes_have_GG_Id seed <- 123+j r16s <- rarefy_even_depth(phy_obj, sample.size = rarefy_to, # sample.size = min(sample_sums(phy_obj)) rngseed = seed, replace = TRUE, trimOTUs = TRUE, verbose = TRUE) # Merge samples of the same type # to avoid bias in sample contributions, only include groups with three samples levs <- levels(as.factor( eval(parse(text= paste0("r16s@sam_data$",merge_by))) )) count <- numeric(length = length(levs)) for (i in 1:length(levs)) { sel <- which( eval(parse(text= paste0("r16s@sam_data$",merge_by))) == levs[i]) count[i] <- length(sel) } sel.rem <- which(count != min_merge_no) # 3 # remove samples not corresponding to a triplicate if (length(sel.rem)>0) {levs <- levs[-sel.rem]} # determine which samples to leave in for merging - i.e. triplicates only sel <- which( eval(parse(text= paste0("r16s@sam_data$",merge_by))) %in% levs) # subset samples sub.r16s <- subset_samples( samples = sample_names(r16s)[sel] , r16s ) # merge samples merged.r16s <- merge_samples(sub.r16s, group= eval(parse(text= paste0("sub.r16s@sam_data$",merge_by))) ) # rarefy back to min sample size for this bootstrap sample - i.e. rarefied-merged-rarefied seed <- 234+j rmr16s <- rarefy_even_depth(merged.r16s, sample.size = rarefy_to, # sample.size = min(sample_sums(phy_obj)) rngseed = seed, replace = TRUE, trimOTUs = TRUE, verbose = TRUE) # - - - - - - - - - - ## record ntaxa before Picrust; and after Picrust record ntaxa and relabun covered ntaxa_0 <- ntaxa(rmr16s) # 10217 relabun_rmr16s <- transform_sample_counts(rmr16s, function(x) x / sum(x) ) #100*mean(sample_sums(relabun_rmr16s)) # 100 ### Picrust ## prune taxa - leave only those with GG Ids prune.rmr16s.gg <- prune_taxa(taxa = taxa_names(rmr16s) %in% otulist_withGG, x = rmr16s) ntaxa_withGG <- ntaxa(prune.rmr16s.gg) # 4885 relabun_rmr16s_withGG <- prune_taxa(taxa = taxa_names(relabun_rmr16s) %in% taxa_names(prune.rmr16s.gg), x = relabun_rmr16s) #relabun_cover_withGG <- 100*mean( sample_sums( relabun_rmr16s_withGG ) ) # 84% relabun_cover_withGG <- 100*sample_sums( relabun_rmr16s_withGG ) # ~70-80% ## use table matching OTU Id to GG Id to assess/label duplicate GG Ids ## consider only OTUs that have GG Id sel <- which(matching$BASE_97_OTU_Id %in% otulist_withGG) # qty 7044 match <- matching[sel, ] ## now running Aust-wide only !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! sel <- which(match$dataset == "Aust-wide") # 5461 match <- match[sel, ] # BUT in this rarefied run some OTUs may be missing - find out which OTUs and GG Ids are here? sel <- which(match$BASE_97_OTU_Id %in% taxa_names(prune.rmr16s.gg)) # qty 4885 GG_Ids_in_this_run <- match$GG_Id[sel] ## estimate coverage from Picrust algorithm ## uses stored lookup table picrust_lookup <- themetagenomics:::picrust_otu(file_path="C:/Workspace/DATA/KEGG/ko_13_5_precalculated.tab.gz", otu_id_targets=GG_Ids_in_this_run) # Note: use ::: to access hidden function 'picrust_otu()' # file_path:- Path to the precalculated table; otu_id_targets:- Character vector of OTU IDs to predict ###GG_Ids_in_Picrust_this_run <- picrust_lookup$matches # length(picrust_lookup$matches) # 4368 ## retain only sequences with NSTI <= 0.15 sel <- which(picrust_lookup$pimeta_table_out <= 0.15) GG_Ids_in_Picrust_this_run <- picrust_lookup$matches[sel] # so what is coverage from Picrust in this run? sel <- which(match$GG_Id %in% GG_Ids_in_Picrust_this_run) # qty 1627 (expect some duplication) otuid_covered_by_Picrust_this_run <- match$BASE_97_OTU_Id[sel] ntaxa_cover_by_Picrust <- length(otuid_covered_by_Picrust_this_run) # 1627 relabun_rmr16s_cover_by_Picrust <- prune_taxa(taxa = taxa_names(relabun_rmr16s_withGG) %in% otuid_covered_by_Picrust_this_run, x = relabun_rmr16s_withGG) #relabun_cover_by_Picrust <- 100*mean( sample_sums( relabun_rmr16s_cover_by_Picrust ) ) # 47% relabun_cover_by_Picrust <- 100*sample_sums( relabun_rmr16s_cover_by_Picrust ) # ~20-30% ## identify duplicate GG IDs - these taxa will need to be merged #dim(match) # 2230 6 sel.dup.gg <- which( duplicated(match$GG_Id) == TRUE) # qty 375 dups <- match[ sel.dup.gg , c("BASE_97_OTU_Id", "GG_Id")] # iterate through OTU IDs to rename with GG IDs, merging OTUs where necessary # (as some GG Ids are duplicated to represent different BASE OTU Ids) for (i in 1:dim(match)[1]) { #i<-1766 otuid <- match$BASE_97_OTU_Id[i] this_GG_Id <- match$GG_Id[i] # check this otuid is still in taxa_names() as taxa are being merged and renamed if (otuid %in% taxa_names(prune.rmr16s.gg)) { if (otuid %in% dups$BASE_97_OTU_Id) { # this means GG_Id for this OTU is a duplicate # i.e. an OTU with a smaller index has already been represented by the same GG_Id # this code relies on the order of match and taxa_names() being the same sel.dup <- which(match$GG_Id == this_GG_Id) taxa_rep_by_same_GG_Id <- match$BASE_97_OTU_Id[sel.dup] # get indices of taxa_names that have already or need to be converte dto that GG_Id sel <- which(taxa_names(prune.rmr16s.gg) %in% c(this_GG_Id, taxa_rep_by_same_GG_Id)) # check there are multiple taxa to merge if (length(sel) > 1) { prune.rmr16s.gg <- merge_taxa(prune.rmr16s.gg, taxa_names(prune.rmr16s.gg)[sel],archetype = 1) # because it is duplicate, archetype will need to be this_GG_Id (as 1st otuid will already be overwritten) # merging taxa should also overwrite taxa_name with GG_Id archetype } else { # this was expected to be a duplicate but rarefying has taken out the duplicates # overwrite taxa_name sel <- which(taxa_names(prune.rmr16s.gg)==otuid) taxa_names(prune.rmr16s.gg)[sel] <- this_GG_Id } # if not a duplicate } else { # overwrite taxa_name sel <- which(taxa_names(prune.rmr16s.gg)==otuid) taxa_names(prune.rmr16s.gg)[sel] <- this_GG_Id } print(paste0("completed ",i," of ",length(taxa_names(prune.rmr16s.gg)))) } } gg_otu_table <- as.data.frame( t(prune.rmr16s.gg@otu_table) ) ##remove " " from sample names names(gg_otu_table) <- gsub(pattern=" ", replacement="_",x=names(gg_otu_table) ) #mb_gg_otu_table[1:5, 1:5] # KEGG_fxn <- picrust(gg_otu_table,rows_are_taxa=TRUE, # reference='gg_ko',reference_path="C:/Workspace/DATA/KEGG", # cn_normalize=FALSE,sample_normalize=FALSE, # drop=TRUE) # WITH Copy Number Normalization sel <- which(row.names(gg_otu_table) %in% GG_Ids_in_Picrust_this_run) KEGG_fxn <- picrust(gg_otu_table[sel, ], rows_are_taxa=TRUE, reference='gg_ko',reference_path="C:/Workspace/DATA/KEGG", cn_normalize=TRUE,sample_normalize=FALSE, # !! cn_normalize=TRUE !! drop=TRUE) # - - - - - - - - - - ### ALPHA DIVERSITY ## calculate Shannon's index #shan.rmr16s <- plot_richness(rmr16s, measures=c("Shannon")) shan <- vegan::diversity(x=KEGG_fxn$fxn_table, index = "shannon", MARGIN = 1, base = exp(1)) #shan eff_no <- exp(shan) #eff_no # also record weighted NSTI NSTI_scores <- t(KEGG_fxn$method_meta) %*% as.matrix(gg_otu_table[row.names(KEGG_fxn$method_meta), ]) weighted_NSTI <- NSTI_scores/base::colSums( as.matrix(gg_otu_table[row.names(KEGG_fxn$method_meta), ]) ) # export data out <- data.frame(sample=names(shan),boot_rep=j, shannon=as.numeric(shan), eff_no=as.numeric(eff_no), #Reveg_age=substr(x=names(shan),start=1,stop=nchar(names(shan))-5), #depth=substr(x=names(shan),start=nchar(names(shan))-3,stop=nchar(names(shan))), ntaxa_0=ntaxa_0, ntaxa_withGG=ntaxa_withGG, relabun_cover_withGG=relabun_cover_withGG, ntaxa_cover_by_Picrust=ntaxa_cover_by_Picrust, relabun_cover_by_Picrust=relabun_cover_by_Picrust, weighted_NSTI= as.numeric(weighted_NSTI)) out$calc_type <- "bootstrap" return(out) } # # # # # # # # # # # # # bootstrap outputs will automatically be stored in a list: b_out # number of bootstrap resamples B <- 100 # iterate in parallel cl<-makeCluster( detectCores()-1 ) # detectCores()-1 registerDoParallel(cl) # remember to stopcluster() b_out <- foreach(j=1:B, .packages=c('phyloseq','themetagenomics','vegan')) %dopar% calc_FUNCTIONAL_AlphaDiv_in_parallel_AUST_SELECT(phy_obj=phy.aust_select, merge_by="Collection.Site", min_merge_no=3, rarefy_to=samp_size.compare, matching=matching, otulist_withGG=yes_have_GG_Id ) stopCluster(cl) b_out__fxn_alpha_aust_select_NSTI_cutoff <- b_out getwd() # "C:/Workspace/PROJ/PAPER-Trending-Taxa-Resto/modelling" saveRDS(b_out__fxn_alpha_aust_select_NSTI_cutoff, file = "b_out__fxn_alpha_aust_select_NSTI_cutoff.RDS") #b_out <- readRDS("b_out__fxn_alpha_aust_select_NSTI_cutoff.RDS") length(b_out) # 100 names(b_out[[1]]) # [1] "sample" "boot_rep" "shannon" "eff_no" # [5] "ntaxa_0" "ntaxa_withGG" "relabun_cover_withGG" "ntaxa_cover_by_Picrust" # [9] "relabun_cover_by_Picrust" "weighted_NSTI" "calc_type" dim(b_out[[1]]) # 13 11 b_out[[10]] ### For plotting, 1st calculate diversity from one rarefying step ### then append bootstrap-derived uncertainty # rarefy #1 phy_obj <- phy.aust_select otulist_withGG <- yes_have_GG_Id seed <- 123 r1.16s <- rarefy_even_depth(phy_obj, sample.size = samp_size.compare, # sample.size = min(sample_sums(phy_obj)) rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) # - - - - - - - - - - ## record ntaxa before Picrust; and after Picrust record ntaxa and relabun covered ntaxa_0 <- ntaxa(r1.16s) # 11565 relabun_1r16s <- transform_sample_counts(r1.16s, function(x) x / sum(x) ) ### Picrust ## prune taxa - leave only those with GG Ids prune.1r16s.gg <- prune_taxa(taxa = taxa_names(r1.16s) %in% otulist_withGG, x = r1.16s) ntaxa_withGG <- ntaxa(prune.1r16s.gg) # 5217 relabun_1r16s_withGG <- prune_taxa(taxa = taxa_names(relabun_1r16s) %in% taxa_names(prune.1r16s.gg), x = relabun_1r16s) relabun_cover_withGG <- 100*sample_sums( relabun_1r16s_withGG ) # ~60-80% relabun_cover_withGG # ~63-85% ## use table matching OTU Id to GG Id to assess/label duplicate GG Ids ## consider only OTUs that have GG Id sel <- which(matching$BASE_97_OTU_Id %in% otulist_withGG) # qty 6981 match <- matching[sel, ] ## now running Aust-wide only !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! sel <- which(match$dataset == "Aust-wide") # 5404 match <- match[sel, ] # BUT in this rarefied run some OTUs may be missing - find out which OTUs and GG Ids are here? sel <- which(match$BASE_97_OTU_Id %in% taxa_names(prune.1r16s.gg)) # qty 5217 GG_Ids_in_this_run <- match$GG_Id[sel] ## estimate coverage from Picrust algorithm ## uses stored lookup table picrust_lookup <- themetagenomics:::picrust_otu(file_path="C:/Workspace/DATA/KEGG/ko_13_5_precalculated.tab.gz", otu_id_targets=GG_Ids_in_this_run) # Note: use ::: to access hidden function 'picrust_otu()' # file_path:- Path to the precalculated table; otu_id_targets:- Character vector of OTU IDs to predict ###GG_Ids_in_Picrust_this_run <- picrust_lookup$matches # length(picrust_lookup$matches) # 4791 ## retain only sequences with NSTI <= 0.15 sel <- which(picrust_lookup$pimeta_table_out <= 0.15) GG_Ids_in_Picrust_this_run <- picrust_lookup$matches[sel] # so what is coverage from Picrust in this run? sel <- which(match$GG_Id %in% GG_Ids_in_Picrust_this_run) # qty 1733 (expect some duplication) otuid_covered_by_Picrust_this_run <- match$BASE_97_OTU_Id[sel] ntaxa_cover_by_Picrust <- length(otuid_covered_by_Picrust_this_run) # 1733 relabun_1r16s_cover_by_Picrust <- prune_taxa(taxa = taxa_names(relabun_1r16s_withGG) %in% otuid_covered_by_Picrust_this_run, x = relabun_1r16s_withGG) relabun_cover_by_Picrust <- 100*sample_sums( relabun_1r16s_cover_by_Picrust ) relabun_cover_by_Picrust # 14-30% ## identify duplicate GG IDs - these taxa will need to be merged sel.dup.gg <- which( duplicated(match$GG_Id) == TRUE) # qty 373 dups <- match[ sel.dup.gg , c("BASE_97_OTU_Id", "GG_Id")] # iterate through OTU IDs to rename with GG IDs, merging OTUs where necessary # (as some GG Ids are duplicated to represent different BASE OTU Ids) for (i in 1:dim(match)[1]) { #i<-1766 otuid <- match$BASE_97_OTU_Id[i] this_GG_Id <- match$GG_Id[i] # check this otuid is still in taxa_names() as taxa are being merged and renamed if (otuid %in% taxa_names(prune.1r16s.gg)) { if (otuid %in% dups$BASE_97_OTU_Id) { # this means GG_Id for this OTU is a duplicate # i.e. an OTU with a smaller index has already been represented by the same GG_Id # this code relies on the order of match and taxa_names() being the same sel.dup <- which(match$GG_Id == this_GG_Id) taxa_rep_by_same_GG_Id <- match$BASE_97_OTU_Id[sel.dup] # get indices of taxa_names that have already or need to be converted to that GG_Id sel <- which(taxa_names(prune.1r16s.gg) %in% c(this_GG_Id, taxa_rep_by_same_GG_Id)) # check there are multiple taxa to merge if (length(sel) > 1) { prune.1r16s.gg <- merge_taxa(prune.1r16s.gg, taxa_names(prune.1r16s.gg)[sel],archetype = 1) # because it is duplicate, archetype will need to be this_GG_Id (as 1st otuid will already be overwritten) # merging taxa should also overwrite taxa_name with GG_Id archetype } else { # this was expected to be a duplicate but rarefying has taken out the duplicates # overwrite taxa_name sel <- which(taxa_names(prune.1r16s.gg)==otuid) taxa_names(prune.1r16s.gg)[sel] <- this_GG_Id } # if not a duplicate } else { # overwrite taxa_name sel <- which(taxa_names(prune.1r16s.gg)==otuid) taxa_names(prune.1r16s.gg)[sel] <- this_GG_Id } print(paste0("completed ",i," of ",length(taxa_names(prune.1r16s.gg)))) } } # for one-off rarefying don't need to transpose dataframe - because has rows as taxa gg_otu_table <- as.data.frame( prune.1r16s.gg@otu_table ) gg_otu_table[1:5, 1:5] # X12428 X12430 X12438 X12509 X12511 # 1087375 1079 847 1543 36 21 # 839198 0 0 0 0 0 # 4432941 41 65 144 0 0 # 86097 0 0 0 0 1 # 136781 1 1 0 877 483 # WITH Copy Number Normalization sel <- which(row.names(gg_otu_table) %in% GG_Ids_in_Picrust_this_run) KEGG_fxn <- picrust(gg_otu_table[sel, ], rows_are_taxa=TRUE, reference='gg_ko',reference_path="C:/Workspace/DATA/KEGG", cn_normalize=TRUE,sample_normalize=FALSE, # !! cn_normalize=TRUE !! drop=TRUE) # - - - - - - - - - - ### ALPHA DIVERSITY ## calculate Shannon's index dim(KEGG_fxn$fxn_table) # [1] 39 5429 shan <- vegan::diversity(x=KEGG_fxn$fxn_table, index = "shannon", MARGIN = 1, base = exp(1)) #shan eff_no <- exp(shan) #eff_no # also record weighted NSTI NSTI_scores <- t(KEGG_fxn$method_meta) %*% as.matrix(gg_otu_table[row.names(KEGG_fxn$method_meta), ]) weighted_NSTI <- NSTI_scores/base::colSums( as.matrix(gg_otu_table[row.names(KEGG_fxn$method_meta), ]) ) # export data out <- data.frame(sample=names(shan),boot_rep=0, shannon=as.numeric(shan), eff_no=as.numeric(eff_no), ntaxa_0=ntaxa_0, ntaxa_withGG=ntaxa_withGG, relabun_cover_withGG=relabun_cover_withGG, ntaxa_cover_by_Picrust=ntaxa_cover_by_Picrust, relabun_cover_by_Picrust=relabun_cover_by_Picrust, weighted_NSTI= as.numeric(weighted_NSTI) ) out$calc_type <- "rarefyx1" row.names(out) # [1] "X12428" "X12430" "X12438" "X12509" "X12511" "X12525" "X12560" "X12574" "X12616" "X12620" "X12624" "X12816" # [13] "X12818" "X12819" "X13272" "X13274" "X13282" "X7833" "X7837" "X7851" "X8082" "X8084" "X8099" "X8116" # [25] "X8118" "X8130" "X8182" "X8192" "X8198" "X8220" "X8262" "X8266" "X8270" "X8272" "X8274" "X8280" # [37] "X8284" "X8286" "X9442" out$sample # [1] X12428 X12430 X12438 X12509 X12511 X12525 X12560 X12574 X12616 X12620 X12624 X12816 X12818 X12819 X13272 X13274 # [17] X13282 X7833 X7837 X7851 X8082 X8084 X8099 X8116 X8118 X8130 X8182 X8192 X8198 X8220 X8262 X8266 # [33] X8270 X8272 X8274 X8280 X8284 X8286 X9442 # 39 Levels: X12428 X12430 X12438 X12509 X12511 X12525 X12560 X12574 X12616 X12620 X12624 X12816 X12818 ... X9442 str(out) sel.samps # Blackheath (NSW) apple trees1 Blackheath (NSW) apple trees2 Blackheath (NSW) apple trees3 # "X12616" "X12620" "X12624" # Buckley Swamp (Vic) pasture1 Buckley Swamp (Vic) pasture2 Buckley Swamp (Vic) pasture3 # "X8280" "X8284" "X8286" # King Island (Tas) pasture1 King Island (Tas) pasture2 King Island (Tas) pasture3 # "X13272" "X13282" "X13274" # Longerenong (Vic) wheat1 Longerenong (Vic) wheat2 Longerenong (Vic) wheat3 # "X8270" "X8272" "X8274" # Mackay (Qld) sugar1 Mackay (Qld) sugar2 Mackay (Qld) sugar3 # "X8192" "X8198" "X8220" # Narrabri (NSW) cotton1 Narrabri (NSW) cotton2 Narrabri (NSW) cotton3 # "X12509" "X12525" "X12511" # Walpeup (Vic) wheat1 Walpeup (Vic) wheat2 Walpeup (Vic) wheat3 # "X8182" "X8262" "X8266" # Cape Tribulation NP (Qld)1 Cape Tribulation NP (Qld)2 Cape Tribulation NP (Qld)3 # "X12816" "X12818" "X12819" # Booderee NP (NSW)1 Booderee NP (NSW)2 Booderee NP (NSW)3 # "X7833" "X7851" "X7837" # Fitzgerald River NP (WA)1 Fitzgerald River NP (WA)2 Fitzgerald River NP (WA)3 # "X8116" "X8130" "X8118" # Freycinet NP (Tas)1 Freycinet NP (Tas)2 Freycinet NP (Tas)3 # "X12428" "X12438" "X12430" # Mt Lesueur NP (WA)1 Mt Lesueur NP (WA)2 Mt Lesueur NP (WA)3 # "X8082" "X8099" "X8084" # Namadgi NP (ACT)1 Namadgi NP (ACT)2 Namadgi NP (ACT)3 # "X12560" "X9442" "X12574" # use $sample to store Collection Site (as per bootstrap results), and row.name will preserve sample ID out$sample <- as.character(out$sample) names(aust_select) # [1] "Blackheath (NSW) apple trees" "Buckley Swamp (Vic) pasture" "King Island (Tas) pasture" # [4] "Longerenong (Vic) wheat" "Mackay (Qld) sugar" "Narrabri (NSW) cotton" # [7] "Walpeup (Vic) wheat" "Cape Tribulation NP (Qld)" "Booderee NP (NSW)" # [10] "Fitzgerald River NP (WA)" "Freycinet NP (Tas)" "Mt Lesueur NP (WA)" # [13] "Namadgi NP (ACT)" for (i in 1:dim(out)[1]) { #i<-1 if (row.names(out)[i] %in% row.names(aust_select[["Blackheath (NSW) apple trees"]])) {out$sample[i] <- "Blackheath"} if (row.names(out)[i] %in% row.names(aust_select[["Buckley Swamp (Vic) pasture"]])) {out$sample[i] <- "Buckley_Swamp"} if (row.names(out)[i] %in% row.names(aust_select[["King Island (Tas) pasture"]])) {out$sample[i] <- "King_Island"} if (row.names(out)[i] %in% row.names(aust_select[["Longerenong (Vic) wheat"]])) {out$sample[i] <- "Longerenong"} if (row.names(out)[i] %in% row.names(aust_select[["Mackay (Qld) sugar"]])) {out$sample[i] <- "Mackay"} if (row.names(out)[i] %in% row.names(aust_select[["Narrabri (NSW) cotton"]])) {out$sample[i] <- "Narrabri"} if (row.names(out)[i] %in% row.names(aust_select[["Walpeup (Vic) wheat"]])) {out$sample[i] <- "Walpeup"} if (row.names(out)[i] %in% row.names(aust_select[["Cape Tribulation NP (Qld)"]])) {out$sample[i] <- "Cape_Tribulation"} if (row.names(out)[i] %in% row.names(aust_select[["Booderee NP (NSW)"]])) {out$sample[i] <- "Booderee"} if (row.names(out)[i] %in% row.names(aust_select[["Fitzgerald River NP (WA)"]])) {out$sample[i] <- "Fitzgerald_River_NP"} if (row.names(out)[i] %in% row.names(aust_select[["Freycinet NP (Tas)"]])) {out$sample[i] <- "Freycinet_NP"} if (row.names(out)[i] %in% row.names(aust_select[["Mt Lesueur NP (WA)"]])) {out$sample[i] <- "Mt_Lesueur_NP"} if (row.names(out)[i] %in% row.names(aust_select[["Namadgi NP (ACT)"]])) {out$sample[i] <- "Namadgi_NP"} } # check assignment of sample names? - inspect these objects: View(out) sel.samps str(out) # 'data.frame': 39 obs. of 11 variables: # $ sample : chr "Freycinet_NP" "Freycinet_NP" "Freycinet_NP" "Narrabri" ... # $ boot_rep : num 0 0 0 0 0 0 0 0 0 0 ... # $ shannon : num 7.6 7.69 7.68 7.65 7.66 ... # $ eff_no : num 2006 2182 2154 2099 2124 ... # $ ntaxa_0 : int 11491 11491 11491 11491 11491 11491 11491 11491 11491 11491 ... # $ ntaxa_withGG : int 5217 5217 5217 5217 5217 5217 5217 5217 5217 5217 ... # $ relabun_cover_withGG : num 83.7 86.1 80.9 77.6 74.5 ... # $ ntaxa_cover_by_Picrust : int 1733 1733 1733 1733 1733 1733 1733 1733 1733 1733 ... # $ relabun_cover_by_Picrust: num 25.9 25.3 23.3 17 14 ... # $ weighted_NSTI : num 0.0741 0.0735 0.0773 0.0834 0.0845 ... # $ calc_type : chr "rarefyx1" "rarefyx1" "rarefyx1" "rarefyx1" ... names(out) # [1] "sample" "boot_rep" "shannon" "eff_no" # [5] "ntaxa_0" "ntaxa_withGG" "relabun_cover_withGG" "ntaxa_cover_by_Picrust" # [9] "relabun_cover_by_Picrust" "weighted_NSTI" "calc_type" names(b_out[[1]]) # [1] "sample" "boot_rep" "shannon" "eff_no" # [5] "ntaxa_0" "ntaxa_withGG" "relabun_cover_withGG" "ntaxa_cover_by_Picrust" # [9] "relabun_cover_by_Picrust" "weighted_NSTI" "calc_type" row.names(b_out[[1]]) # [1] "Blackheath" "Booderee" "Buckley Swamp" "Cape Tribulation" "Fitzgerald River NP" # [6] "Freycinet NP" "King Island" "Longerenong" "Mackay" "Mt Lesueur NP" # [11] "Namadgi NP" "Narrabri" "Walpeup" b_out[[1]]$sample # [1] Blackheath Booderee Buckley_Swamp Cape_Tribulation Fitzgerald_River_NP # [6] Freycinet_NP King_Island Longerenong Mackay Mt_Lesueur_NP # [11] Namadgi_NP Narrabri Walpeup # 13 Levels: Blackheath Booderee Buckley_Swamp Cape_Tribulation Fitzgerald_River_NP Freycinet_NP ... Walpeup dim(b_out[[1]]) # 13 11 temp <- out row.names(temp) <- NULL for (j in 1:B) { #j<-1 new <- b_out[[j]] row.names(new) <- NULL temp <- rbind(temp,new) } head(temp) temp[1:50, ] tail(temp) ## quantify weighted NSTI ?? hist(temp$weighted_NSTI) summary(temp$weighted_NSTI) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 0.06006 0.08086 0.08432 0.08555 0.09238 0.10240 # Weighted NSTI median 0.084 (range 0.060-0.102) names(temp) # [1] "sample" "boot_rep" "shannon" "eff_no" # [5] "ntaxa_0" "ntaxa_withGG" "relabun_cover_withGG" "ntaxa_cover_by_Picrust" # [9] "relabun_cover_by_Picrust" "weighted_NSTI" "calc_type" ## use same ordering as per Alpha diversity of OTUs above names(meanDiv[ order(meanDiv, decreasing = FALSE) ]) # [1] "Cape_Tribulation" "Longerenong" "Booderee" "Walpeup" "Narrabri" # [6] "Fitzgerald_River_NP" "Mackay" "Namadgi_NP" "Mt_Lesueur_NP" "Buckley_Swamp" # [11] "Freycinet_NP" "King_Island" "Blackheath" temp$sample <- factor(temp$sample, levels = names(meanDiv[ order(meanDiv, decreasing = FALSE) ]), labels = c( "Cape Tribulation (NP)", "Longerenong (wheat)", "Booderee (NP)", "Walpeup (wheat)", "Narrabri (cotton)", "Fitzgerald River (NP)", "Mackay (sugar)", "Namadgi (NP)", "Mt Lesueur (NP)", "Buckley Swamp (pasture)", "Freycinet (NP)", "King Island (pasture)", "Blackheath (apples)" ), ordered = TRUE) temp$alt_vs_nat <- NA sel <- which(temp$sample == "Blackheath (apples)") temp$alt_vs_nat[sel] <- "altered" sel <- which(temp$sample == "Booderee (NP)") temp$alt_vs_nat[sel] <- "natural" sel <- which(temp$sample == "Buckley Swamp (pasture)") temp$alt_vs_nat[sel] <- "altered" sel <- which(temp$sample == "Cape Tribulation (NP)") temp$alt_vs_nat[sel] <- "natural" sel <- which(temp$sample == "Fitzgerald River (NP)") temp$alt_vs_nat[sel] <- "natural" sel <- which(temp$sample == "Freycinet (NP)") temp$alt_vs_nat[sel] <- "natural" sel <- which(temp$sample == "King Island (pasture)") temp$alt_vs_nat[sel] <- "altered" sel <- which(temp$sample == "Longerenong (wheat)") temp$alt_vs_nat[sel] <- "altered" sel <- which(temp$sample == "Mackay (sugar)") temp$alt_vs_nat[sel] <- "altered" sel <- which(temp$sample == "Mt Lesueur (NP)") temp$alt_vs_nat[sel] <- "natural" sel <- which(temp$sample == "Namadgi (NP)") temp$alt_vs_nat[sel] <- "natural" sel <- which(temp$sample == "Narrabri (cotton)") temp$alt_vs_nat[sel] <- "altered" sel <- which(temp$sample == "Walpeup (wheat)") temp$alt_vs_nat[sel] <- "altered" melt.out <- melt(temp,id.vars = c("sample","calc_type", "alt_vs_nat"), measure.vars = "eff_no") str(melt.out) ## plot ## [finished-plot] # # # # # # # # # # # # # cols <- c("altered" = "#F8766D", "natural" = "#00BFC4") p <- ggplot(data=melt.out, aes(x=sample, value)) + ggtitle("(b)") + geom_violin(data = melt.out[ which(melt.out$calc_type == "bootstrap"), ], aes(color = alt_vs_nat) ) + scale_colour_manual(values = cols) + geom_point(data = melt.out[ which(melt.out$calc_type == "rarefyx1"), ], color="gray20", shape=1 ) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1) ) + labs(x = NULL, y = "Effective KEGG functions (count)") + theme(legend.position="none") p ggsave(plot=p, filename = paste0("finished-plots/","Alpha-Div-Eff-No-KEGG-Functions-16S-ALT-vs_NAT-NSTI-cutoff-vFINAL.tiff"), width = 8, height = 10, units = "cm", dpi = 600, compression = "lzw") ## now plot coverage by GG Ids and PICRUSt melt.out <- melt(temp,id.vars = c("sample","calc_type", "alt_vs_nat"), measure.vars = c("relabun_cover_withGG", "relabun_cover_by_Picrust")) melt.out <- melt.out[ -which(melt.out$calc_type == "rarefyx1") , ] melt.out[1:10, ] tail(melt.out) lbl1 <- "% sequences with Greengenes Ids" lbl2 <- paste0("% sequences covered in PICRUSt\n(where NSTI ",expression('\u2264'), " 0.15)") p <- ggplot(data=melt.out, aes(x=sample, value)) + ggtitle("(c)") + #geom_violin(data = melt.out[ which(melt.out$calc_type == "bootstrap"), ], aes(color = Reveg_age) ) + scale_colour_manual(values = cols) + #geom_point(data = melt.out[ which(melt.out$calc_type == "rarefyx1"), ], color="gray20", shape=1 ) + geom_jitter(data = melt.out[ which(melt.out$variable == "relabun_cover_withGG"), ],height=0,width=0.2, aes(color = alt_vs_nat) , shape=6, alpha=0.2 ) + #scale_colour_manual(values = cols) + geom_jitter(data = melt.out[ which(melt.out$variable == "relabun_cover_by_Picrust"), ],height=0,width=0.2, aes(color = alt_vs_nat) , shape=0, alpha=0.2 ) + scale_colour_manual(values = cols) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1) ) + labs(x = NULL, y = "Sequence coverage (%)") + theme(legend.position="none") + geom_text(x=1, y=60, label=lbl1, hjust=0, vjust=0.5, size =3) + geom_text(x=1, y=40, label=lbl2, hjust=0, vjust=0.5, size =3) p ggsave(plot=p, filename = paste0("finished-plots/","Percent-Sequences-covered-by-GG-Ids-and-Picrust-estimate-ALT-vs-NAT-NSTI-cutoff-vFINAL.tiff"), width = 8, height = 10, units = "cm", dpi = 600, compression = "lzw") # # # # # # # # # # # # # #------------------------- #### Relative abundance z-scores for KEGG functions #------------------------- ## consider Mt Bold and Aust-wide selected samples phy.aust_select # phyloseq-class experiment-level object # otu_table() OTU Table: [ 12413 taxa and 39 samples ] # sample_data() Sample Data: [ 39 samples by 24 sample variables ] # tax_table() Taxonomy Table: [ 12413 taxa by 7 taxonomic ranks ] ## For Mt Bold, add extra samples for trending taxa with restoration ## - prune_taxa: top 30 increasing in 10 year sample ## - prune_taxa: top 30 decreasing in cleared sample ## Just do 1 x rarefy >> PICRUSt ## then get mean KEGG functions (n=3) in each case ## https://rdrr.io/cran/OTUtable/man/zscore.html ## Normalizes taxa abundances in a table of relative abundance data using the z-score method. # Mean centre and divide by standard deviation # ((Abundance of one OTU in one sample) - (mean abundance for that OTU ))/(standard deviation of that OTU) ### Prepare top30 trending +/- from Mount Bold ## i) Top 30 increasing taxa from "10 year" reveg samples (n=3) # what are top 30 increasing genera (and their OTUs) - from Mt Bold restoration gradient? top30_inc <- summary_data[ order(summary_data$cor_0_10_mean,decreasing = TRUE)[1:30], ] top30_inc$genus # [1] "g__DA101" "g__Candidatus_Xiphinematobacter" # [3] "g__Bradyrhizobium" "g__Candidatus_Solibacter" # [5] "g__Candidatus_Koribacter" "unclassified (family: Rhodospirillaceae)" # [7] "g__Rhodopila" "g__Edaphobacter" # [9] "unclassified (order: Solibacterales)" "unclassified (family: [Leptospirillaceae])" # [11] "unclassified (order: Ellin6513)" "unclassified (family: Gemmataceae)" # [13] "unclassified (order: Acidobacteriales)" "g__Rhodomicrobium" # [15] "g__Singulisphaera" "unclassified (order: Acidimicrobiales)" # [17] "unclassified (family: Acetobacteraceae)" "unclassified (family: Pseudonocardiaceae)" # [19] "unclassified (family: Isosphaeraceae)" "unclassified (order: WD2101)" # [21] "unclassified (family: Koribacteraceae)" "unclassified (order: Phycisphaerales)" # [23] "unclassified (family: Methylocystaceae)" "unclassified (class: EC1113)" # [25] "g__Rhodoplanes" "g__Kibdelosporangium" # [27] "unclassified (order: CV90)" "unclassified (class: P2-11E)" # [29] "unclassified (family: Nitrosomonadaceae)" "unclassified (family: Myxococcaceae)" sel <- which(summary_relabun.16s$genus %in% top30_inc$genus) top30_inc_otus <- paste0(summary_relabun.16s$otus[sel],collapse = ";") top30_inc_otus #[1] "AMD_16S_OTUa_14;AMD_16S_OTUa_183;AMD_16S_OTUa_493;AMD_16S_OTUa_983;AMD_16S_OTUa_666;AMD_16S_OTUa_720;AMD_16S_OTUa_2216;AMD_16S_OTUa_449;AMD_16S_OTUa_1690;AMD_16S_OTUa_1364;AMD_16S_OTUa_15199;AMD_16S_OTUa_26184;AMD_16S_OTUa_1100;AMD_16S_OTUa_1174;AMD_16S_OTUa_2274;AMD_16S_OTUa_5735;AMD_16S_OTUa_3413;AMD_16S_OTUa_3557;AMD_16S_OTUa_2880;AMD_16S_OTUa_5532;AMD_16S_OTUa_3350;AMD_16S_OTUa_5052;AMD_16S_OTUa_2023;AMD_16S_OTUa_7881;AMD_16S_OTUa_2685;AMD_16S_OTUa_46477;AMD_16S_OTUa_10231;AMD_16S_OTUa_7335;AMD_16S_OTUa_60397;AMD_16S_OTUa_138482;AMD_16S_OTUa_3515;AMD_16S_OTUa_3651;AMD_16S_OTUa_84569;AMD_16S_OTUa_20317;AMD_16S_OTUa_56682;AMD_16S_OTUa_21080;AMD_16S_OTUa_11948;AMD_16S_OTUa_4239;AMD_16S_OTUa_9438;AMD_16S_OTUa_11252;AMD_16S_OTUa_21087;AMD_16S_OTUa_127249;AMD_16S_OTUb_2817;AMD_16S_OTUa_61380;AMD_16S_OTUa_662;AMD_16S_OTUa_252;AMD_16S_OTUa_1265;AMD_16S_OTUa_754;AMD_16S_OTUa_810;AMD_16S_OTUa_1132;AMD_16S_OTUa_4378;AMD_16S_OTUa_3123;AMD_16S_OTUa_525;AMD_16S_OTUa_2378;AMD_16S_OTUa_1342;A... nchar(top30_inc_otus) # 15253 top30_inc_otus <- unlist(strsplit(top30_inc_otus, split = ";")) length(top30_inc_otus) # 830 head(top30_inc_otus) #"AMD_16S_OTUa_14" "AMD_16S_OTUa_183" "AMD_16S_OTUa_493" "AMD_16S_OTUa_983" "AMD_16S_OTUa_666" "AMD_16S_OTUa_720" ## prune top 30 increasing taxa from "10 year" site phy.top30inc_in_10year <- prune_samples(samples = mtbold_surf.16s@sam_data$Reveg_age == "10 years" , x = mtbold_surf.16s) length(taxa_names(phy.top30inc_in_10year)) # 3238 head(taxa_names(phy.top30inc_in_10year)) # "AMD_16S_OTUa_14" "AMD_16S_OTUa_143" "AMD_16S_OTUa_409" "AMD_16S_OTUa_82" "AMD_16S_OTUa_288" "AMD_16S_OTUa_52" phy.top30inc_in_10year <- prune_taxa(taxa = taxa_names(phy.top30inc_in_10year) %in% top30_inc_otus, x = phy.top30inc_in_10year) phy.top30inc_in_10year # phyloseq-class experiment-level object # otu_table() OTU Table: [ 824 taxa and 3 samples ] # sample_data() Sample Data: [ 3 samples by 69 sample variables ] # tax_table() Taxonomy Table: [ 824 taxa by 7 taxonomic ranks ] min(sample_sums(phy.top30inc_in_10year)) # 19901 ## ii) Top 30 decreasing taxa in "Cleared" samples (n=3) # what are top 30 decreasing genera (and their OTUs) - from Mt Bold restoration gradient? top30_dec <- summary_data[ order(summary_data$cor_0_10_mean,decreasing = FALSE)[1:30], ] top30_dec$genus # [1] "g__Bacillus" "g__Rummeliibacillus" # [3] "unclassified (family: Actinospicaceae)" "unclassified (order: Ellin5290)" # [5] "g__Sporosarcina" "g__Cytophagales" # [7] "unclassified (family: Ellin5301)" "g__Ammoniphilus" # [9] "g__Flavisolibacter" "unclassified (class: C0119)" # [11] "g__Coprococcus" "g__Clostridium" # [13] "g__SMB53" "unclassified (family: Nocardioidaceae)" # [15] "unclassified (family: Nitrospiraceae)" "g__Caloramator" # [17] "g__Arthrobacter" "g__Pelosinus" # [19] "unclassified (order: Bacillales)" "g__Solibacillus" # [21] "g__Pseudonocardia" "g__Geobacter" # [23] "unclassified (order: JG30-KF-CM45)" "unclassified (order: Sphaerobacterales)" # [25] "unclassified (family: Peptostreptococcaceae)" "g__Pimelobacter" # [27] "unclassified (family: Dolo_23)" "g__Turicibacter" # [29] "g__[Clostridium]" "g__Nocardioides" sel <- which(summary_relabun.16s$genus %in% top30_dec$genus) top30_dec_otus <- paste0(summary_relabun.16s$otus[sel],collapse = ";") top30_dec_otus #[1] "AMD_16S_OTUa_27646;AMD_16S_OTUa_316;AMD_16S_OTUa_277;AMD_16S_OTUa_1391;AMD_16S_OTUa_241;AMD_16S_OTUb_105;AMD_16S_OTUa_2472;AMD_16S_OTUa_143;AMD_16S_OTUa_388;AMD_16S_OTUa_3381;AMD_16S_OTUa_64;AMD_16S_OTUa_506;AMD_16S_OTUa_172;AMD_16S_OTUa_2070;AMD_16S_OTUa_298;AMD_16S_OTUa_1568;AMD_16S_OTUa_211;AMD_16S_OTUa_1288;AMD_16S_OTUa_884;AMD_16S_OTUa_21975;AMD_16S_OTUa_2651;AMD_16S_OTUa_782;AMD_16S_OTUa_7752;AMD_16S_OTUa_3208;AMD_16S_OTUa_7466;AMD_16S_OTUa_2128;AMD_16S_OTUa_15347;AMD_16S_OTUa_19327;AMD_16S_OTUa_26209;AMD_16S_OTUa_12150;AMD_16S_OTUa_7399;AMD_16S_OTUa_16465;AMD_16S_OTUa_18889;AMD_16S_OTUa_11656;AMD_16S_OTUa_37717;AMD_16S_OTUa_2402;AMD_16S_OTUa_36438;AMD_16S_OTUa_26384;AMD_16S_OTUa_11027;AMD_16S_OTUa_1544;AMD_16S_OTUa_25691;AMD_16S_OTUa_4528;AMD_16S_OTUa_15727;AMD_16S_OTUa_37226;AMD_16S_OTUa_10479;AMD_16S_OTUa_30521;AMD_16S_OTUa_14129;AMD_16S_OTUa_8598;AMD_16S_OTUa_5631;AMD_16S_OTUa_29110;AMD_16S_OTUa_10997;AMD_16S_OTUa_3257;AMD_16S_OTUa_6224;AMD_16S_OTUa_6773;AMD_16S_OTUa_211... nchar(top30_dec_otus) # 4426 top30_dec_otus <- unlist(strsplit(top30_dec_otus, split = ";")) length(top30_dec_otus) # 240 head(top30_dec_otus) # "AMD_16S_OTUa_27646" "AMD_16S_OTUa_316" "AMD_16S_OTUa_277" "AMD_16S_OTUa_1391" "AMD_16S_OTUa_241" "AMD_16S_OTUb_105" ## prune top 30 decreasing taxa from "Cleared" site phy.top30dec_in_cleared <- prune_samples(samples = mtbold_surf.16s@sam_data$Reveg_age == "Cleared" , x = mtbold_surf.16s) length(taxa_names(phy.top30dec_in_cleared)) # 3238 head(taxa_names(phy.top30dec_in_cleared)) # "AMD_16S_OTUa_14" "AMD_16S_OTUa_143" "AMD_16S_OTUa_409" "AMD_16S_OTUa_82" "AMD_16S_OTUa_288" "AMD_16S_OTUa_52" phy.top30dec_in_cleared <- prune_taxa(taxa = taxa_names(phy.top30dec_in_cleared) %in% top30_dec_otus, x = phy.top30dec_in_cleared) phy.top30dec_in_cleared # phyloseq-class experiment-level object # otu_table() OTU Table: [ 240 taxa and 3 samples ] # sample_data() Sample Data: [ 3 samples by 69 sample variables ] # tax_table() Taxonomy Table: [ 240 taxa by 7 taxonomic ranks ] min(sample_sums(phy.top30dec_in_cleared)) # 16704 #### For comparison of Mt Bold restoration gradient and Aust-wide selection #### normalise the sampling effort - i.e. rarefy to same minimum sample_sums() samp_size.compare2 <- min( c( min(sample_sums(phy.aust_select)), min(sample_sums(mtbold_surf.16s)), min(sample_sums(phy.top30inc_in_10year)), min(sample_sums(phy.top30dec_in_cleared))) ) samp_size.compare2 # 16704 ### For heatmap visualisation use the mean composition values from 3 samples (per sample type) ### A) Determine Functional composition for Aust-wide selection ## consider only OTUs that have GG Id sel <- which(matching$BASE_97_OTU_Id %in% yes_have_GG_Id) # qty 6981 match <- matching[sel, ] ## divide matching dataframe to treat Mt Bold and Aust-wide separately ## 2a) Aust-wide sel <- which(match$dataset == "Aust-wide") # 5404 match <- match[sel, ] ## copy phy object - rename for GG IDs phy.gg <- phy.aust_select ## rarefy seed <- 123 r1.phy.gg <- rarefy_even_depth(phy.gg, sample.size = samp_size.compare2, rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) min(taxa_sums(r1.phy.gg)) # 1 sample_sums(r1.phy.gg) # all 16704 ntaxa(r1.phy.gg) # 11477 ## identify which taxa will be eliminated, what is % relative abundance excluded? relabun.r1.phy.gg <- transform_sample_counts(r1.phy.gg, function(x) x / sum(x) ) sample_sums( relabun.r1.phy.gg ) # all 1 mean( sample_sums( relabun.r1.phy.gg ) ) # 1 length( taxa_names(relabun.r1.phy.gg) ) # 11477 relabun.r1.phy.gg <- prune_taxa(taxa = taxa_names(relabun.r1.phy.gg) %in% yes_have_GG_Id, x = relabun.r1.phy.gg) length( taxa_names(relabun.r1.phy.gg) ) # 5205 100*(5205/11477) # 45.35% of taxa / OTUs represented mean( sample_sums( relabun.r1.phy.gg ) ) # ~0.78 or 78% of 16S sequences will be represented by GG Ids ## prune taxa - leave only those with GG Ids prune.r1.phy.gg <- prune_taxa(taxa = taxa_names(r1.phy.gg) %in% yes_have_GG_Id, x = r1.phy.gg) ## identify duplicate GG IDs - these taxa will need to be merged dim(match) # 5404 6 sel.dup.gg <- which( duplicated(match$GG_Id) == TRUE) # qty 373 dups <- match[ sel.dup.gg , c("BASE_97_OTU_Id", "GG_Id")] # iterate through OTU IDs to rename with GG IDs, merging OTUs where necessary # (as some GG Ids are duplicated to represent different BASE OTU Ids) for (i in 1:dim(match)[1]) { #i<-1766 otuid <- match$BASE_97_OTU_Id[i] this_GG_Id <- match$GG_Id[i] # check this otuid is still in taxa_names() as taxa are being merged and renamed if (otuid %in% taxa_names(prune.r1.phy.gg)) { if (otuid %in% dups$BASE_97_OTU_Id) { # this means GG_Id for this OTU is a duplicate # i.e. an OTU with a smaller index has already been represented by the same GG_Id # this code relies on the order of match and taxa_names() being the same sel.dup <- which(match$GG_Id == this_GG_Id) taxa_rep_by_same_GG_Id <- match$BASE_97_OTU_Id[sel.dup] #length(taxa_names(prune.r1.phy.gg)) # merge_taxa # get indices of taxa_names that have already or need to be converte dto that GG_Id sel <- which(taxa_names(prune.r1.phy.gg) %in% c(this_GG_Id, taxa_rep_by_same_GG_Id)) #taxa_names(prune.r1.phy.gg)[sel] # check there are multiple taxa to merge if (length(sel) > 1) { prune.r1.phy.gg <- merge_taxa(prune.r1.phy.gg, taxa_names(prune.r1.phy.gg)[sel],archetype = 1) # because it is duplicate, archetype will need to be this_GG_Id (as 1st otuid will already be overwritten) # merging taxa should also overwrite taxa_name with GG_Id archetype } else { # this was expected to be a duplicate but rarefying has taken out the duplicates # overwrite taxa_name sel <- which(taxa_names(prune.r1.phy.gg)==otuid) taxa_names(prune.r1.phy.gg)[sel] <- this_GG_Id } # if not a duplicate } else { # overwrite taxa_name sel <- which(taxa_names(prune.r1.phy.gg)==otuid) taxa_names(prune.r1.phy.gg)[sel] <- this_GG_Id } print(paste0("completed ",i," of ",length(taxa_names(prune.r1.phy.gg)))) } } #taxa_names(prune.r1.phy.gg) length(taxa_names(prune.r1.phy.gg)) # 4841 length(unique(taxa_names(prune.r1.phy.gg))) # 4841 length(unique(match$GG_Id)) # 5031 # are all names as integers (GG Ids)? length(!is.na(as.integer(taxa_names(prune.r1.phy.gg)))) # 4841 # save copy for Aust-wide aus.gg.compare <- prune.r1.phy.gg # for one-off rarefying don't need to transpose dataframe - because has rows as taxa aus_gg_otu_table <- as.data.frame( prune.r1.phy.gg@otu_table ) head( names(aus_gg_otu_table) ) # [1] "X12428" "X12430" "X12438" "X12509" "X12511" "X12525" dim(aus_gg_otu_table)[1] # 4841 aus_gg_otu_table[1:5, 1:5] # X12428 X12430 X12438 X12509 X12511 # 1087375 1025 799 1456 35 16 # 839198 0 0 0 0 0 # 4432941 39 63 134 0 0 # 86097 0 0 0 0 0 # 136781 1 1 0 843 497 GG_Ids_in_this_run <- taxa_names(aus.gg.compare) ## estimate coverage from Picrust algorithm ## uses stored lookup table picrust_lookup <- themetagenomics:::picrust_otu(file_path="C:/Workspace/DATA/KEGG/ko_13_5_precalculated.tab.gz", otu_id_targets=GG_Ids_in_this_run) # Note: use ::: to access hidden function 'picrust_otu()' # file_path:- Path to the precalculated table; otu_id_targets:- Character vector of OTU IDs to predict ## retain only sequences with NSTI <= 0.15 sel <- which(picrust_lookup$pimeta_table_out <= 0.15) # qty 1588 GG_Ids_in_Picrust_this_run <- picrust_lookup$matches[sel] # WITH Copy Number Normalization sel <- which(row.names(aus_gg_otu_table) %in% GG_Ids_in_Picrust_this_run) KEGG_fxn_Aust_compare <- picrust(aus_gg_otu_table[sel, ], rows_are_taxa=TRUE, reference='gg_ko',reference_path="C:/Workspace/DATA/KEGG", cn_normalize=TRUE,sample_normalize=TRUE, # !! cn_normalize=TRUE !! drop=TRUE) # sample_normalize=TRUE - to return relative abundance dim(KEGG_fxn_Aust_compare$fxn_table) # 36 5424 KEGG_fxn_Aust_compare$fxn_table[1:5, 1:5] # K01361 K01360 K01362 K02249 K05841 # X12428 3.229262e-06 2.691052e-07 0.001079919 0 9.149576e-06 # X12430 1.876186e-05 0.000000e+00 0.001290475 0 1.671511e-05 # X12438 1.175330e-06 0.000000e+00 0.001340170 0 9.402639e-06 # X12509 1.328110e-06 0.000000e+00 0.001141732 0 6.729091e-05 # X12511 2.818894e-06 0.000000e+00 0.001254408 0 5.694167e-05 # also record weighted NSTI NSTI_scores <- t(KEGG_fxn_Aust_compare$method_meta) %*% as.matrix(aus_gg_otu_table[row.names(KEGG_fxn_Aust_compare$method_meta), ]) weighted_NSTI <- NSTI_scores/base::colSums( as.matrix(aus_gg_otu_table[row.names(KEGG_fxn_Aust_compare$method_meta), ]) ) weighted_NSTI # X12428 X12430 X12438 X12509 X12511 X12525 X12560 X12574 X12616 X12620 X12624 # metadata_NSTI 0.07444579 0.07369172 0.07741282 0.08279569 0.08476798 0.07582457 0.1021032 0.08915777 0.0888212 0.08980971 0.09719104 # X12816 X12818 X12819 X13272 X13274 X13282 X7833 X7837 X7851 X8082 X8084 # metadata_NSTI 0.09327183 0.09633261 0.09356598 0.08056007 0.08699565 0.07535282 0.0590463 0.08710775 0.1036374 0.08275936 0.09631595 # X8099 X8116 X8118 X8130 X8182 X8192 X8198 X8220 X8262 X8266 # metadata_NSTI 0.08258622 0.08812145 0.06658276 0.08299215 0.07972479 0.07542165 0.09477894 0.09008196 0.08318103 0.08352553 # X8270 X8272 X8274 X8280 X8284 X8286 X9442 # metadata_NSTI 0.08249875 0.0849063 0.08445095 0.09133682 0.09408545 0.09568136 0.09427838 summary(as.numeric(weighted_NSTI)) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 0.05905 0.08153 0.08491 0.08577 0.09342 0.10364 table(aus.gg.compare@sam_data$Collection.Site) # Blackheath Booderee Buckley Swamp Cape Tribulation Fitzgerald River NP # 3 3 3 3 3 # Freycinet NP King Island Longerenong Mackay Mt Lesueur NP # 3 3 3 3 3 # Namadgi NP Narrabri Walpeup # 3 3 3 unique(aus.gg.compare@sam_data$Collection.Site) # [1] "Freycinet NP" "Narrabri" "Namadgi NP" "Blackheath" "Cape Tribulation" # [6] "King Island" "Booderee" "Mt Lesueur NP" "Fitzgerald River NP" "Walpeup" # [11] "Mackay" "Longerenong" "Buckley Swamp" ## "Freycinet NP" sel <- which(aus.gg.compare@sam_data$Collection.Site == "Freycinet NP") samps <- aus.gg.compare@sam_data$sampID[sel] samps # "X12428" "X12430" "X12438" KEGG_fxn_Aust_Freycinet_NP <- KEGG_fxn_Aust_compare$fxn_table[samps, ] mean_KEGG_fxn_Aust_Freycinet_NP <- base::colMeans(KEGG_fxn_Aust_Freycinet_NP) head( mean_KEGG_fxn_Aust_Freycinet_NP ) # K01361 K01360 K01362 K02249 K05841 K05844 # 7.722151e-06 8.970173e-08 1.236855e-03 0.000000e+00 1.175578e-05 1.098862e-04 ## "Narrabri" sel <- which(aus.gg.compare@sam_data$Collection.Site == "Narrabri") samps <- aus.gg.compare@sam_data$sampID[sel] samps # "X12509" "X12511" "X12525" KEGG_fxn_Aust_Narrabri <- KEGG_fxn_Aust_compare$fxn_table[samps, ] mean_KEGG_fxn_Aust_Narrabri <- base::colMeans(KEGG_fxn_Aust_Narrabri) head( mean_KEGG_fxn_Aust_Narrabri ) # K01361 K01360 K01362 K02249 K05841 K05844 # 1.772999e-06 0.000000e+00 1.141065e-03 0.000000e+00 5.977206e-05 1.492556e-04 ## "Namadgi NP" sel <- which(aus.gg.compare@sam_data$Collection.Site == "Namadgi NP") samps <- aus.gg.compare@sam_data$sampID[sel] samps # "X12560" "X12574" "X9442" KEGG_fxn_Aust_Namadgi_NP <- KEGG_fxn_Aust_compare$fxn_table[samps, ] mean_KEGG_fxn_Aust_Namadgi_NP <- base::colMeans(KEGG_fxn_Aust_Namadgi_NP) head( mean_KEGG_fxn_Aust_Namadgi_NP ) # K01361 K01360 K01362 K02249 K05841 K05844 # 8.130649e-07 0.000000e+00 1.463367e-03 1.259762e-07 9.422548e-06 1.153639e-04 ## "Blackheath" sel <- which(aus.gg.compare@sam_data$Collection.Site == "Blackheath") samps <- aus.gg.compare@sam_data$sampID[sel] samps # "X12616" "X12620" "X12624" KEGG_fxn_Aust_Blackheath <- KEGG_fxn_Aust_compare$fxn_table[samps, ] mean_KEGG_fxn_Aust_Blackheath <- base::colMeans(KEGG_fxn_Aust_Blackheath) head( mean_KEGG_fxn_Aust_Blackheath ) # K01361 K01360 K01362 K02249 K05841 K05844 # 2.094340e-07 0.000000e+00 1.509308e-03 0.000000e+00 1.841301e-05 1.667075e-04 ## "Cape Tribulation" sel <- which(aus.gg.compare@sam_data$Collection.Site == "Cape Tribulation") samps <- aus.gg.compare@sam_data$sampID[sel] samps # "X12816" "X12818" "X12819" KEGG_fxn_Aust_Cape_Tribulation <- KEGG_fxn_Aust_compare$fxn_table[samps, ] mean_KEGG_fxn_Aust_Cape_Tribulation <- base::colMeans(KEGG_fxn_Aust_Cape_Tribulation) head( mean_KEGG_fxn_Aust_Cape_Tribulation ) # K01361 K01360 K01362 K02249 K05841 K05844 # 4.417551e-07 0.000000e+00 1.422042e-03 0.000000e+00 6.095295e-06 1.150485e-04 ## "King Island" sel <- which(aus.gg.compare@sam_data$Collection.Site == "King Island") # King_Island samps <- aus.gg.compare@sam_data$sampID[sel] samps # "X13272" "X13274" "X13282" KEGG_fxn_Aust_King_Island <- KEGG_fxn_Aust_compare$fxn_table[samps, ] mean_KEGG_fxn_Aust_King_Island <- base::colMeans(KEGG_fxn_Aust_King_Island) head( mean_KEGG_fxn_Aust_King_Island ) # K01361 K01360 K01362 K02249 K05841 K05844 # 5.104571e-06 0.000000e+00 1.264513e-03 0.000000e+00 6.983408e-06 1.261416e-04 ## "Booderee" sel <- which(aus.gg.compare@sam_data$Collection.Site == "Booderee") samps <- aus.gg.compare@sam_data$sampID[sel] samps # "X7833" "X7837" "X7851" KEGG_fxn_Aust_Booderee <- KEGG_fxn_Aust_compare$fxn_table[samps, ] mean_KEGG_fxn_Aust_Booderee <- base::colMeans(KEGG_fxn_Aust_Booderee) head( mean_KEGG_fxn_Aust_Booderee ) # K01361 K01360 K01362 K02249 K05841 K05844 # 1.221835e-07 0.000000e+00 1.307876e-03 0.000000e+00 1.303608e-06 1.112840e-04 ## "Mt Lesueur NP" sel <- which(aus.gg.compare@sam_data$Collection.Site == "Mt Lesueur NP") # Mt_Lesueur_NP samps <- aus.gg.compare@sam_data$sampID[sel] samps # "X8082" "X8084" "X8099" KEGG_fxn_Aust_Mt_Lesueur_NP <- KEGG_fxn_Aust_compare$fxn_table[samps, ] mean_KEGG_fxn_Aust_Mt_Lesueur_NP <- base::colMeans(KEGG_fxn_Aust_Mt_Lesueur_NP) head( mean_KEGG_fxn_Aust_Mt_Lesueur_NP ) # K01361 K01360 K01362 K02249 K05841 K05844 # 7.400099e-07 0.000000e+00 1.246069e-03 0.000000e+00 7.478469e-06 6.311038e-05 ## "Fitzgerald River NP" sel <- which(aus.gg.compare@sam_data$Collection.Site == "Fitzgerald River NP") # Fitzgerald_River_NP samps <- aus.gg.compare@sam_data$sampID[sel] samps # "X8116" "X8118" "X8130" KEGG_fxn_Aust_Fitzgerald_River_NP <- KEGG_fxn_Aust_compare$fxn_table[samps, ] mean_KEGG_fxn_Aust_Fitzgerald_River_NP <- base::colMeans(KEGG_fxn_Aust_Fitzgerald_River_NP) head( mean_KEGG_fxn_Aust_Fitzgerald_River_NP ) # K01361 K01360 K01362 K02249 K05841 K05844 # 1.307869e-06 0.000000e+00 1.253967e-03 0.000000e+00 3.652503e-06 9.205968e-05 ## "Walpeup" sel <- which(aus.gg.compare@sam_data$Collection.Site == "Walpeup") samps <- aus.gg.compare@sam_data$sampID[sel] samps # "X8182" "X8262" "X8266" KEGG_fxn_Aust_Walpeup <- KEGG_fxn_Aust_compare$fxn_table[samps, ] mean_KEGG_fxn_Aust_Walpeup <- base::colMeans(KEGG_fxn_Aust_Walpeup) head( mean_KEGG_fxn_Aust_Walpeup ) # K01361 K01360 K01362 K02249 K05841 K05844 # 8.809687e-07 6.092360e-07 1.128163e-03 0.000000e+00 1.600153e-05 7.407936e-05 ## "Mackay" sel <- which(aus.gg.compare@sam_data$Collection.Site == "Mackay") samps <- aus.gg.compare@sam_data$sampID[sel] samps # "X8192" "X8198" "X8220" KEGG_fxn_Aust_Mackay <- KEGG_fxn_Aust_compare$fxn_table[samps, ] mean_KEGG_fxn_Aust_Mackay <- base::colMeans(KEGG_fxn_Aust_Mackay) head( mean_KEGG_fxn_Aust_Mackay ) # K01361 K01360 K01362 K02249 K05841 K05844 # 1.007713e-05 0.000000e+00 1.223238e-03 0.000000e+00 1.297577e-05 7.189771e-05 ## "Longerenong" sel <- which(aus.gg.compare@sam_data$Collection.Site == "Longerenong") samps <- aus.gg.compare@sam_data$sampID[sel] samps # "X8270" "X8272" "X8274" KEGG_fxn_Aust_Longerenong <- KEGG_fxn_Aust_compare$fxn_table[samps, ] mean_KEGG_fxn_Aust_Longerenong <- base::colMeans(KEGG_fxn_Aust_Longerenong) head( mean_KEGG_fxn_Aust_Longerenong ) # K01361 K01360 K01362 K02249 K05841 K05844 # 1.215753e-07 7.302432e-07 1.236101e-03 0.000000e+00 4.553141e-05 1.134288e-04 ## "Buckley Swamp" sel <- which(aus.gg.compare@sam_data$Collection.Site == "Buckley Swamp") samps <- aus.gg.compare@sam_data$sampID[sel] samps # "X8280" "X8284" "X8286" KEGG_fxn_Aust_Buckley_Swamp <- KEGG_fxn_Aust_compare$fxn_table[samps, ] mean_KEGG_fxn_Aust_Buckley_Swamp <- base::colMeans(KEGG_fxn_Aust_Buckley_Swamp) head( mean_KEGG_fxn_Aust_Buckley_Swamp ) # K01361 K01360 K01362 K02249 K05841 K05844 # 8.925166e-08 0.000000e+00 1.245398e-03 0.000000e+00 8.814822e-06 1.140104e-04 ### B) Determine Functional composition for Mt Bold samples ## consider only OTUs that have GG Id sel <- which(matching$BASE_97_OTU_Id %in% yes_have_GG_Id) # qty 6981 match <- matching[sel, ] sel <- which(match$dataset == "Mt Bold") # 1577 match <- match[sel, ] ## copy phy object - rename for GG IDs phy.gg <- mtbold_surf.16s ## rarefy seed <- 123 r1.phy.gg <- rarefy_even_depth(phy.gg, sample.size = samp_size.compare2, rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) min(taxa_sums(r1.phy.gg)) # 1 sample_sums(r1.phy.gg) # all 16704 ntaxa(r1.phy.gg) # 3204 ## identify which taxa will be eliminated, what is % relative abundance excluded? relabun.r1.phy.gg <- transform_sample_counts(r1.phy.gg, function(x) x / sum(x) ) sample_sums( relabun.r1.phy.gg ) # all 1 mean( sample_sums( relabun.r1.phy.gg ) ) # 1 length( taxa_names(relabun.r1.phy.gg) ) # 3204 relabun.r1.phy.gg <- prune_taxa(taxa = taxa_names(relabun.r1.phy.gg) %in% yes_have_GG_Id, x = relabun.r1.phy.gg) length( taxa_names(relabun.r1.phy.gg) ) # 1567 100*(1567/3204) # 48.9 % of OTUs mean( sample_sums( relabun.r1.phy.gg ) ) # 74.9 % of 16S sequences will be represented by GG Ids ## prune taxa - leave only those with GG Ids prune.r1.phy.gg <- prune_taxa(taxa = taxa_names(r1.phy.gg) %in% yes_have_GG_Id, x = r1.phy.gg) ## identify duplicate GG IDs - these taxa will need to be merged dim(match) # 1577 6 sel.dup.gg <- which( duplicated(match$GG_Id) == TRUE) # qty 72 dups <- match[ sel.dup.gg , c("BASE_97_OTU_Id", "GG_Id")] # iterate through OTU IDs to rename with GG IDs, merging OTUs where necessary for (i in 1:dim(match)[1]) { #i<-1766 otuid <- match$BASE_97_OTU_Id[i] this_GG_Id <- match$GG_Id[i] # check this otuid is still in taxa_names() as taxa are being merged and renamed if (otuid %in% taxa_names(prune.r1.phy.gg)) { if (otuid %in% dups$BASE_97_OTU_Id) { # this means GG_Id for this OTU is a duplicate # i.e. an OTU with a smaller index has already been represented by the same GG_Id # this code relies on the order of match and taxa_names() being the same sel.dup <- which(match$GG_Id == this_GG_Id) taxa_rep_by_same_GG_Id <- match$BASE_97_OTU_Id[sel.dup] #length(taxa_names(prune.r1.phy.gg)) # merge_taxa # get indices of taxa_names that have already or need to be converte dto that GG_Id sel <- which(taxa_names(prune.r1.phy.gg) %in% c(this_GG_Id, taxa_rep_by_same_GG_Id)) #taxa_names(prune.r1.phy.gg)[sel] # check there are multiple taxa to merge if (length(sel) > 1) { prune.r1.phy.gg <- merge_taxa(prune.r1.phy.gg, taxa_names(prune.r1.phy.gg)[sel],archetype = 1) # because it is duplicate, archetype will need to be this_GG_Id (as 1st otuid will already be overwritten) # merging taxa should also overwrite taxa_name with GG_Id archetype } else { # this was expected to be a duplicate but rarefying has taken out the duplicates # overwrite taxa_name sel <- which(taxa_names(prune.r1.phy.gg)==otuid) taxa_names(prune.r1.phy.gg)[sel] <- this_GG_Id } # if not a duplicate } else { # overwrite taxa_name sel <- which(taxa_names(prune.r1.phy.gg)==otuid) taxa_names(prune.r1.phy.gg)[sel] <- this_GG_Id } print(paste0("completed ",i," of ",length(taxa_names(prune.r1.phy.gg)))) } } taxa_names(prune.r1.phy.gg) length(taxa_names(prune.r1.phy.gg)) # 1495 length(unique(taxa_names(prune.r1.phy.gg))) # 1495 length(unique(match$GG_Id)) # 1505 - higher but remember some OTUs removed during rarefying # are all names as integers (GG Ids)? length(!is.na(as.integer(taxa_names(prune.r1.phy.gg)))) # 1495 # save copy for Mt Bold mb.gg.compare <- prune.r1.phy.gg # for one-off rarefying don't need to transpose dataframe - because has rows as taxa mb_gg_otu_table <- as.data.frame( prune.r1.phy.gg@otu_table ) head( names(mb_gg_otu_table) ) # [1] "2009.1.10" "2009.2.10" "2009.3.10" "neg.1.10" "neg.2.10" "neg.3.10" ##remove "." from sample names for PICRUSt names(mb_gg_otu_table) <- gsub(pattern="[.]", replacement="_",x=names(mb_gg_otu_table) ) mb_gg_otu_table[1:5, 1:5] # 2009_1_10 2009_2_10 2009_3_10 neg_1_10 neg_2_10 # 1087375 820 636 465 452 391 # 4386062 707 1358 1376 945 914 # 250258 206 231 292 131 57 # 104834 180 128 159 248 124 # 752270 0 0 0 1 0 GG_Ids_in_this_run <- row.names(mb_gg_otu_table) ## estimate coverage from Picrust algorithm ## uses stored lookup table picrust_lookup <- themetagenomics:::picrust_otu(file_path="C:/Workspace/DATA/KEGG/ko_13_5_precalculated.tab.gz", otu_id_targets=GG_Ids_in_this_run) # Note: use ::: to access hidden function 'picrust_otu()' # file_path:- Path to the precalculated table; otu_id_targets:- Character vector of OTU IDs to predict ## retain only sequences with NSTI <= 0.15 sel <- which(picrust_lookup$pimeta_table_out <= 0.15) # qty 499 GG_Ids_in_Picrust_this_run <- picrust_lookup$matches[sel] # WITH Copy Number Normalization sel <- which(row.names(mb_gg_otu_table) %in% GG_Ids_in_Picrust_this_run) KEGG_fxn_MtBold_compare <- picrust(mb_gg_otu_table[sel, ], rows_are_taxa=TRUE, reference='gg_ko',reference_path="C:/Workspace/DATA/KEGG", cn_normalize=TRUE,sample_normalize=TRUE, # !! cn_normalize=TRUE !! drop=TRUE) KEGG_fxn_MtBold_compare$fxn_table[1:5, 1:5] # K01361 K01362 K02249 K05841 K05844 # 2009_1_10 2.681954e-06 0.001216575 0.000000e+00 4.332386e-06 5.343277e-05 # 2009_2_10 3.325512e-06 0.001216370 0.000000e+00 6.395215e-06 5.755693e-05 # 2009_3_10 3.984349e-06 0.001196550 0.000000e+00 5.727502e-06 4.756317e-05 # neg_1_10 5.804264e-06 0.001229976 2.638302e-07 5.276604e-06 6.068094e-05 # neg_2_10 2.563261e-06 0.001240333 2.848067e-07 5.126521e-06 4.870195e-05 dim(KEGG_fxn_MtBold_compare$fxn_table) # 24 4989 # also record weighted NSTI NSTI_scores <- t(KEGG_fxn_MtBold_compare$method_meta) %*% as.matrix(mb_gg_otu_table[row.names(KEGG_fxn_MtBold_compare$method_meta), ]) weighted_NSTI <- NSTI_scores/base::colSums( as.matrix(mb_gg_otu_table[row.names(KEGG_fxn_MtBold_compare$method_meta), ]) ) weighted_NSTI # 2009_1_10 2009_2_10 2009_3_10 neg_1_10 neg_2_10 neg_3_10 2005_1_10 2005_2_10 2005_3_10 2008_1_10 # metadata_NSTI 0.08530615 0.08368934 0.08556919 0.08661885 0.07595629 0.08957726 0.08237608 0.09322848 0.08544081 0.08530522 # 2008_2_10 2008_3_10 2007_1_10 2007_2_10 2007_3_10 a_1_10 a_2_10 a_3_10 b_1_10 b_2_10 b_3_10 # metadata_NSTI 0.08794512 0.09441304 0.09085305 0.09350372 0.08616977 0.1032315 0.09729349 0.09488783 0.09979562 0.1006148 0.09615356 # c_1_10 c_2_10 c_4_10 # metadata_NSTI 0.0938315 0.09460768 0.08294747 summary(as.numeric(weighted_NSTI)) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 0.07596 0.08541 0.09022 0.09039 0.09468 0.10323 ## split into sample types levels(mb.gg.compare@sam_data$Reveg_age) # [1] "Cleared" "6 years" "7 years" "8 years" "10 years" "Remnant A" "Remnant B" "Remnant C" head(row.names(mb.gg.compare@sam_data)) # [1] "2009.1.10" "2009.2.10" "2009.3.10" "neg.1.10" "neg.2.10" "neg.3.10" head(row.names(KEGG_fxn_MtBold_compare$fxn_table)) # [1] "2009_1_10" "2009_2_10" "2009_3_10" "neg_1_10" "neg_2_10" "neg_3_10" # convert rownames back to original format row.names(KEGG_fxn_MtBold_compare$fxn_table) <- gsub(pattern="_", replacement=".",x=row.names(KEGG_fxn_MtBold_compare$fxn_table) ) ## "Cleared" sel <- which(mb.gg.compare@sam_data$Reveg_age == "Cleared") samps <- mb.gg.compare@sam_data$Sample.ID3[sel] samps # "neg.1.10" "neg.2.10" "neg.3.10" KEGG_fxn_MtBold_cleared <- KEGG_fxn_MtBold_compare$fxn_table[samps, ] mean_KEGG_fxn_MtBold_cleared <- base::colMeans(KEGG_fxn_MtBold_cleared) head( mean_KEGG_fxn_MtBold_cleared ) # K01361 K01362 K02249 K05841 K05844 K05845 # 4.108349e-06 1.224960e-03 2.653274e-07 6.106057e-06 5.435227e-05 3.911855e-04 ## "6 years" sel <- which(mb.gg.compare@sam_data$Reveg_age == "6 years") samps <- mb.gg.compare@sam_data$Sample.ID3[sel] samps # "2009.1.10" "2009.2.10" "2009.3.10" KEGG_fxn_MtBold_6_years <- KEGG_fxn_MtBold_compare$fxn_table[samps, ] mean_KEGG_fxn_MtBold_6_years <- base::colMeans(KEGG_fxn_MtBold_6_years) head( mean_KEGG_fxn_MtBold_6_years ) # K01361 K01362 K02249 K05841 K05844 K05845 # 3.330605e-06 1.209832e-03 0.000000e+00 5.485034e-06 5.285096e-05 4.244743e-04 ## "7 years" sel <- which(mb.gg.compare@sam_data$Reveg_age == "7 years") samps <- mb.gg.compare@sam_data$Sample.ID3[sel] samps # "2008.1.10" "2008.2.10" "2008.3.10" KEGG_fxn_MtBold_7_years <- KEGG_fxn_MtBold_compare$fxn_table[samps, ] mean_KEGG_fxn_MtBold_7_years <- base::colMeans(KEGG_fxn_MtBold_7_years) head( mean_KEGG_fxn_MtBold_7_years ) # K01361 K01362 K02249 K05841 K05844 K05845 # 1.494232e-06 1.235929e-03 0.000000e+00 8.618295e-06 5.404177e-05 4.139269e-04 ## "8 years" sel <- which(mb.gg.compare@sam_data$Reveg_age == "8 years") samps <- mb.gg.compare@sam_data$Sample.ID3[sel] samps # "2007.1.10" "2007.2.10" "2007.3.10" KEGG_fxn_MtBold_8_years <- KEGG_fxn_MtBold_compare$fxn_table[samps, ] mean_KEGG_fxn_MtBold_8_years <- base::colMeans(KEGG_fxn_MtBold_8_years) head( mean_KEGG_fxn_MtBold_8_years ) # K01361 K01362 K02249 K05841 K05844 K05845 # 1.664535e-06 1.325067e-03 0.000000e+00 3.782021e-06 8.905321e-05 3.857731e-04 ## "10 years" sel <- which(mb.gg.compare@sam_data$Reveg_age == "10 years") samps <- mb.gg.compare@sam_data$Sample.ID3[sel] samps # "2005.1.10" "2005.2.10" "2005.3.10" KEGG_fxn_MtBold_10_years <- KEGG_fxn_MtBold_compare$fxn_table[samps, ] mean_KEGG_fxn_MtBold_10_years <- base::colMeans(KEGG_fxn_MtBold_10_years) head( mean_KEGG_fxn_MtBold_10_years ) # K01361 K01362 K02249 K05841 K05844 K05845 # 3.099091e-06 1.217590e-03 0.000000e+00 4.110228e-06 4.396092e-05 4.440761e-04 ## "Remnant A" sel <- which(mb.gg.compare@sam_data$Reveg_age == "Remnant A") samps <- mb.gg.compare@sam_data$Sample.ID3[sel] samps # "a.1.10" "a.2.10" "a.3.10" KEGG_fxn_MtBold_Remnant_A <- KEGG_fxn_MtBold_compare$fxn_table[samps, ] mean_KEGG_fxn_MtBold_Remnant_A <- base::colMeans(KEGG_fxn_MtBold_Remnant_A) head( mean_KEGG_fxn_MtBold_Remnant_A ) # K01361 K01362 K02249 K05841 K05844 K05845 # 7.170591e-07 1.336995e-03 0.000000e+00 2.413262e-06 7.279772e-05 3.868404e-04 ## "Remnant B" sel <- which(mb.gg.compare@sam_data$Reveg_age == "Remnant B") samps <- mb.gg.compare@sam_data$Sample.ID3[sel] samps # "b.1.10" "b.2.10" "b.3.10" KEGG_fxn_MtBold_Remnant_B <- KEGG_fxn_MtBold_compare$fxn_table[samps, ] mean_KEGG_fxn_MtBold_Remnant_B <- base::colMeans(KEGG_fxn_MtBold_Remnant_B) head( mean_KEGG_fxn_MtBold_Remnant_B ) # K01361 K01362 K02249 K05841 K05844 K05845 # 8.741830e-07 1.310596e-03 0.000000e+00 5.637839e-06 6.405525e-05 4.175408e-04 ## "Remnant C" sel <- which(mb.gg.compare@sam_data$Reveg_age == "Remnant C") samps <- mb.gg.compare@sam_data$Sample.ID3[sel] samps # "c.1.10" "c.2.10" "c.4.10" KEGG_fxn_MtBold_Remnant_C <- KEGG_fxn_MtBold_compare$fxn_table[samps, ] mean_KEGG_fxn_MtBold_Remnant_C <- base::colMeans(KEGG_fxn_MtBold_Remnant_C) head( mean_KEGG_fxn_MtBold_Remnant_C ) # K01361 K01362 K02249 K05841 K05844 K05845 # 4.380765e-07 1.243994e-03 0.000000e+00 4.028910e-06 5.716604e-05 4.027003e-04 ### C) Top 30 increasing taxa from "10 year" reveg samples (n=3) # use phy.top30inc_in_10year - prepared earlier ## consider only OTUs that have GG Id sel <- which(matching$BASE_97_OTU_Id %in% yes_have_GG_Id) # qty 6981 match <- matching[sel, ] sel <- which(match$dataset == "Mt Bold") # 1577 match <- match[sel, ] ## copy phy object - rename for GG IDs phy.gg <- phy.top30inc_in_10year ## rarefy seed <- 123 r1.phy.gg <- rarefy_even_depth(phy.gg, sample.size = samp_size.compare2, rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) min(taxa_sums(r1.phy.gg)) # 1 sample_sums(r1.phy.gg) # all 16704 ntaxa(r1.phy.gg) # 673 ## identify which taxa will be eliminated, what is % relative abundance excluded? relabun.r1.phy.gg <- transform_sample_counts(r1.phy.gg, function(x) x / sum(x) ) sample_sums( relabun.r1.phy.gg ) # all 1 mean( sample_sums( relabun.r1.phy.gg ) ) # 1 length( taxa_names(relabun.r1.phy.gg) ) # 673 relabun.r1.phy.gg <- prune_taxa(taxa = taxa_names(relabun.r1.phy.gg) %in% yes_have_GG_Id, x = relabun.r1.phy.gg) length( taxa_names(relabun.r1.phy.gg) ) # 303 100*(303/673) # 45 % of OTUs mean( sample_sums( relabun.r1.phy.gg ) ) # ~ 74% of 16S reads will be represented ## prune taxa - leave only those with GG Ids prune.r1.phy.gg <- prune_taxa(taxa = taxa_names(r1.phy.gg) %in% yes_have_GG_Id, x = r1.phy.gg) ## identify duplicate GG IDs - these taxa will need to be merged dim(match) # 1577 6 sel.dup.gg <- which( duplicated(match$GG_Id) == TRUE) # qty 72 dups <- match[ sel.dup.gg , c("BASE_97_OTU_Id", "GG_Id")] # iterate through OTU IDs to rename with GG IDs, merging OTUs where necessary for (i in 1:dim(match)[1]) { #i<-1766 otuid <- match$BASE_97_OTU_Id[i] this_GG_Id <- match$GG_Id[i] # check this otuid is still in taxa_names() as taxa are being merged and renamed if (otuid %in% taxa_names(prune.r1.phy.gg)) { if (otuid %in% dups$BASE_97_OTU_Id) { # this means GG_Id for this OTU is a duplicate # i.e. an OTU with a smaller index has already been represented by the same GG_Id # this code relies on the order of match and taxa_names() being the same sel.dup <- which(match$GG_Id == this_GG_Id) taxa_rep_by_same_GG_Id <- match$BASE_97_OTU_Id[sel.dup] #length(taxa_names(prune.r1.phy.gg)) # merge_taxa # get indices of taxa_names that have already or need to be converte dto that GG_Id sel <- which(taxa_names(prune.r1.phy.gg) %in% c(this_GG_Id, taxa_rep_by_same_GG_Id)) #taxa_names(prune.r1.phy.gg)[sel] # check there are multiple taxa to merge if (length(sel) > 1) { prune.r1.phy.gg <- merge_taxa(prune.r1.phy.gg, taxa_names(prune.r1.phy.gg)[sel],archetype = 1) # because it is duplicate, archetype will need to be this_GG_Id (as 1st otuid will already be overwritten) # merging taxa should also overwrite taxa_name with GG_Id archetype } else { # this was expected to be a duplicate but rarefying has taken out the duplicates # overwrite taxa_name sel <- which(taxa_names(prune.r1.phy.gg)==otuid) taxa_names(prune.r1.phy.gg)[sel] <- this_GG_Id } # if not a duplicate } else { # overwrite taxa_name sel <- which(taxa_names(prune.r1.phy.gg)==otuid) taxa_names(prune.r1.phy.gg)[sel] <- this_GG_Id } print(paste0("completed ",i," of ",length(taxa_names(prune.r1.phy.gg)))) } } taxa_names(prune.r1.phy.gg) length(taxa_names(prune.r1.phy.gg)) # 289 length(unique(taxa_names(prune.r1.phy.gg))) # 289 length(unique(match$GG_Id)) # 1505 - higher but remember 16 OTUs removed during rarefying # are all names as integers (GG Ids)? length(!is.na(as.integer(taxa_names(prune.r1.phy.gg)))) # 289 (NAs introduced by coercion) # save copy for Mt Bold mb.gg.top30inc_in_10year <- prune.r1.phy.gg # for one-off rarefying don't need to transpose dataframe - because has rows as taxa mb_gg_top30inc_otu_table <- as.data.frame( prune.r1.phy.gg@otu_table ) head( names(mb_gg_top30inc_otu_table) ) # [1] "2005.1.10" "2005.2.10" "2005.3.10" ##remove "." from sample names for PICRUSt names(mb_gg_top30inc_otu_table) <- gsub(pattern="[.]", replacement="_",x=names(mb_gg_top30inc_otu_table) ) mb_gg_top30inc_otu_table[1:5, 1:3] # 2005_1_10 2005_2_10 2005_3_10 # 1087375 2021 1839 2460 # 250258 811 1574 920 # 104834 1067 562 546 # 752270 257 297 15 # 221708 0 114 54 GG_Ids_in_this_run <- row.names(mb_gg_top30inc_otu_table) ## estimate coverage from Picrust algorithm ## uses stored lookup table picrust_lookup <- themetagenomics:::picrust_otu(file_path="C:/Workspace/DATA/KEGG/ko_13_5_precalculated.tab.gz", otu_id_targets=GG_Ids_in_this_run) # Note: use ::: to access hidden function 'picrust_otu()' # file_path:- Path to the precalculated table; otu_id_targets:- Character vector of OTU IDs to predict ## retain only sequences with NSTI <= 0.15 sel <- which(picrust_lookup$pimeta_table_out <= 0.15) GG_Ids_in_Picrust_this_run <- picrust_lookup$matches[sel] # qty 118 # WITH Copy Number Normalization sel <- which(row.names(mb_gg_top30inc_otu_table) %in% GG_Ids_in_Picrust_this_run) KEGG_fxn_MtBold_top30inc_in_10year <- picrust(mb_gg_top30inc_otu_table[sel, ], rows_are_taxa=TRUE, reference='gg_ko',reference_path="C:/Workspace/DATA/KEGG", cn_normalize=TRUE,sample_normalize=TRUE, # !! cn_normalize=TRUE !! drop=TRUE) # also record weighted NSTI NSTI_scores <- t(KEGG_fxn_MtBold_top30inc_in_10year$method_meta) %*% as.matrix(mb_gg_top30inc_otu_table[row.names(KEGG_fxn_MtBold_top30inc_in_10year$method_meta), ]) weighted_NSTI <- NSTI_scores/base::colSums( as.matrix(mb_gg_top30inc_otu_table[row.names(KEGG_fxn_MtBold_top30inc_in_10year$method_meta), ]) ) weighted_NSTI # 2005_1_10 2005_2_10 2005_3_10 # metadata_NSTI 0.1102142 0.1113648 0.1001335 KEGG_fxn_MtBold_top30inc_in_10year$fxn_table[1:3, 1:5] # K01362 K05841 K05844 K05845 K05846 # 2005_1_10 0.001198935 0.000000e+00 1.288227e-05 0.0005252683 0.0005540639 # 2005_2_10 0.001212532 0.000000e+00 1.106137e-05 0.0005262705 0.0005542091 # 2005_3_10 0.001217098 1.541216e-06 2.219350e-05 0.0005224721 0.0005593071 mean_KEGG_fxn_MtBold_top30inc_in_10year <- base::colMeans(KEGG_fxn_MtBold_top30inc_in_10year$fxn_table) head( mean_KEGG_fxn_MtBold_top30inc_in_10year ) # K01362 K05841 K05844 K05845 K05846 K05847 # 1.209522e-03 5.137385e-07 1.537905e-05 5.246703e-04 5.558600e-04 5.375029e-04 ## D) Top 30 decreasing taxa in "Cleared" samples (n=3) # use phy.top30dec_in_cleared - prepared earlier ## consider only OTUs that have GG Id sel <- which(matching$BASE_97_OTU_Id %in% yes_have_GG_Id) # qty 7015 match <- matching[sel, ] sel <- which(match$dataset == "Mt Bold") # 1577 match <- match[sel, ] ## copy phy object - rename for GG IDs phy.gg <- phy.top30dec_in_cleared ## rarefy seed <- 123 r1.phy.gg <- rarefy_even_depth(phy.gg, sample.size = samp_size.compare2, rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) min(taxa_sums(r1.phy.gg)) # 1 sample_sums(r1.phy.gg) # all 16704 ntaxa(r1.phy.gg) # 216 ## identify which taxa will be eliminated, what is % relative abundance excluded? relabun.r1.phy.gg <- transform_sample_counts(r1.phy.gg, function(x) x / sum(x) ) sample_sums( relabun.r1.phy.gg ) # all 1 mean( sample_sums( relabun.r1.phy.gg ) ) # 1 length( taxa_names(relabun.r1.phy.gg) ) # 216 relabun.r1.phy.gg <- prune_taxa(taxa = taxa_names(relabun.r1.phy.gg) %in% yes_have_GG_Id, x = relabun.r1.phy.gg) length( taxa_names(relabun.r1.phy.gg) ) # 117 100*(117/216) # 54 % of OTUs mean( sample_sums( relabun.r1.phy.gg ) ) # ~ 83.5 % of 16S reads will be represented ## prune taxa - leave only those with GG Ids prune.r1.phy.gg <- prune_taxa(taxa = taxa_names(r1.phy.gg) %in% yes_have_GG_Id, x = r1.phy.gg) ## identify duplicate GG IDs - these taxa will need to be merged dim(match) # 1577 6 sel.dup.gg <- which( duplicated(match$GG_Id) == TRUE) # qty 72 dups <- match[ sel.dup.gg , c("BASE_97_OTU_Id", "GG_Id")] # iterate through OTU IDs to rename with GG IDs, merging OTUs where necessary for (i in 1:dim(match)[1]) { #i<-1766 otuid <- match$BASE_97_OTU_Id[i] this_GG_Id <- match$GG_Id[i] # check this otuid is still in taxa_names() as taxa are being merged and renamed if (otuid %in% taxa_names(prune.r1.phy.gg)) { if (otuid %in% dups$BASE_97_OTU_Id) { # this means GG_Id for this OTU is a duplicate # i.e. an OTU with a smaller index has already been represented by the same GG_Id # this code relies on the order of match and taxa_names() being the same sel.dup <- which(match$GG_Id == this_GG_Id) taxa_rep_by_same_GG_Id <- match$BASE_97_OTU_Id[sel.dup] #length(taxa_names(prune.r1.phy.gg)) # merge_taxa # get indices of taxa_names that have already or need to be converte dto that GG_Id sel <- which(taxa_names(prune.r1.phy.gg) %in% c(this_GG_Id, taxa_rep_by_same_GG_Id)) #taxa_names(prune.r1.phy.gg)[sel] # check there are multiple taxa to merge if (length(sel) > 1) { prune.r1.phy.gg <- merge_taxa(prune.r1.phy.gg, taxa_names(prune.r1.phy.gg)[sel],archetype = 1) # because it is duplicate, archetype will need to be this_GG_Id (as 1st otuid will already be overwritten) # merging taxa should also overwrite taxa_name with GG_Id archetype } else { # this was expected to be a duplicate but rarefying has taken out the duplicates # overwrite taxa_name sel <- which(taxa_names(prune.r1.phy.gg)==otuid) taxa_names(prune.r1.phy.gg)[sel] <- this_GG_Id } # if not a duplicate } else { # overwrite taxa_name sel <- which(taxa_names(prune.r1.phy.gg)==otuid) taxa_names(prune.r1.phy.gg)[sel] <- this_GG_Id } print(paste0("completed ",i," of ",length(taxa_names(prune.r1.phy.gg)))) } } taxa_names(prune.r1.phy.gg) length(taxa_names(prune.r1.phy.gg)) # 112 length(unique(taxa_names(prune.r1.phy.gg))) # 112 length(unique(match$GG_Id)) # are all names as integers (GG Ids)? length(!is.na(as.integer(taxa_names(prune.r1.phy.gg)))) # 112 # save copy for Mt Bold mb.gg.top30dec_in_cleared <- prune.r1.phy.gg # for one-off rarefying don't need to transpose dataframe - because has rows as taxa mb_gg_top30dec_otu_table <- as.data.frame( prune.r1.phy.gg@otu_table ) head( names(mb_gg_top30dec_otu_table) ) # [1] "neg.1.10" "neg.2.10" "neg.3.10" ##remove "." from sample names for PICRUSt names(mb_gg_top30dec_otu_table) <- gsub(pattern="[.]", replacement="_",x=names(mb_gg_top30dec_otu_table) ) mb_gg_top30dec_otu_table[1:5, 1:3] # neg_1_10 neg_2_10 neg_3_10 # 4386062 3211 3744 3912 # 138494 2294 1535 1886 # 4304483 199 114 206 # 985431 647 1216 1033 # 675374 610 692 470 GG_Ids_in_this_run <- row.names(mb_gg_top30dec_otu_table) ## estimate coverage from Picrust algorithm ## uses stored lookup table picrust_lookup <- themetagenomics:::picrust_otu(file_path="C:/Workspace/DATA/KEGG/ko_13_5_precalculated.tab.gz", otu_id_targets=GG_Ids_in_this_run) # Note: use ::: to access hidden function 'picrust_otu()' # file_path:- Path to the precalculated table; otu_id_targets:- Character vector of OTU IDs to predict ## retain only sequences with NSTI <= 0.15 sel <- which(picrust_lookup$pimeta_table_out <= 0.15) GG_Ids_in_Picrust_this_run <- picrust_lookup$matches[sel] # qty 51 # WITH Copy Number Normalization sel <- which(row.names(mb_gg_top30dec_otu_table) %in% GG_Ids_in_Picrust_this_run) KEGG_fxn_MtBold_top30dec_in_cleared <- picrust(mb_gg_top30dec_otu_table[sel, ], rows_are_taxa=TRUE, reference='gg_ko',reference_path="C:/Workspace/DATA/KEGG", cn_normalize=TRUE,sample_normalize=TRUE, # !! cn_normalize=TRUE !! drop=TRUE) # also record weighted NSTI NSTI_scores <- t(KEGG_fxn_MtBold_top30dec_in_cleared$method_meta) %*% as.matrix(mb_gg_top30dec_otu_table[row.names(KEGG_fxn_MtBold_top30dec_in_cleared$method_meta), ]) weighted_NSTI <- NSTI_scores/base::colSums( as.matrix(mb_gg_top30dec_otu_table[row.names(KEGG_fxn_MtBold_top30dec_in_cleared$method_meta), ]) ) weighted_NSTI # neg_1_10 neg_2_10 neg_3_10 # metadata_NSTI 0.05914347 0.06894368 0.0558724 KEGG_fxn_MtBold_top30dec_in_cleared$fxn_table[1:3, 1:5] # K01361 K01362 K02249 K05841 K05844 # neg_1_10 6.887025e-05 0.0007506858 1.967721e-06 2.164494e-05 1.387244e-04 # neg_2_10 2.689877e-05 0.0007151138 1.968203e-06 3.936406e-06 7.544778e-05 # neg_3_10 7.214131e-05 0.0007672172 4.580401e-06 3.435301e-05 1.648944e-04 mean_KEGG_fxn_MtBold_top30dec_in_cleared <- base::colMeans(KEGG_fxn_MtBold_top30dec_in_cleared$fxn_table) head( mean_KEGG_fxn_MtBold_top30dec_in_cleared ) # K01361 K01362 K02249 K05841 K05844 K05845 # 5.597011e-05 7.443389e-04 2.838775e-06 1.997812e-05 1.263555e-04 2.399391e-04 #### Analyse these sample type-mean functional relative abundance data ## determine means for Mt Bold below ## add rows together for means to create a new table ## use only the means to then determine SD # inspect examples of data to be loaded length(mean_KEGG_fxn_Aust_Blackheath) # 5424 head(names(mean_KEGG_fxn_Aust_Blackheath)) # "K01361" "K01360" "K01362" "K02249" "K05841" "K05844" length(mean_KEGG_fxn_MtBold_6_years) # 4989 head( names(mean_KEGG_fxn_MtBold_6_years) ) # "K01361" "K01362" "K02249" "K05841" "K05844" "K05845" # i.e. KEGG function names vary between each set of sample-mean data fxn_names <- sort( unique(c( colnames(KEGG_fxn_Aust_compare$fxn_table), colnames(KEGG_fxn_MtBold_compare$fxn_table), colnames(KEGG_fxn_MtBold_top30inc_in_10year), colnames(KEGG_fxn_MtBold_top30dec_in_cleared)))) ## build data frame to store sample type-mean functional relative abundance data mean_fxn_dat <- as.data.frame(matrix(ncol=length(fxn_names))) names(mean_fxn_dat) <- fxn_names # list of data to be loaded x <- c( "mean_KEGG_fxn_Aust_Blackheath", "mean_KEGG_fxn_Aust_Booderee", "mean_KEGG_fxn_Aust_Buckley_Swamp", "mean_KEGG_fxn_Aust_Cape_Tribulation", "mean_KEGG_fxn_Aust_Fitzgerald_River_NP", "mean_KEGG_fxn_Aust_Freycinet_NP", "mean_KEGG_fxn_Aust_King_Island", "mean_KEGG_fxn_Aust_Longerenong", "mean_KEGG_fxn_Aust_Mackay", "mean_KEGG_fxn_Aust_Mt_Lesueur_NP", "mean_KEGG_fxn_Aust_Namadgi_NP", "mean_KEGG_fxn_Aust_Narrabri", "mean_KEGG_fxn_Aust_Walpeup", "mean_KEGG_fxn_MtBold_10_years", "mean_KEGG_fxn_MtBold_6_years", "mean_KEGG_fxn_MtBold_7_years", "mean_KEGG_fxn_MtBold_8_years", "mean_KEGG_fxn_MtBold_cleared", "mean_KEGG_fxn_MtBold_Remnant_A", "mean_KEGG_fxn_MtBold_Remnant_B", "mean_KEGG_fxn_MtBold_Remnant_C", "mean_KEGG_fxn_MtBold_top30inc_in_10year", "mean_KEGG_fxn_MtBold_top30dec_in_cleared" ) # loop to bind together for (i in 1:length(x)) { #i<-1 new <- eval(parse(text= paste0(x[i]))) mean_fxn_dat <- dplyr::bind_rows(mean_fxn_dat, new) print(paste0("completed ", i)) } # inspect mean_fxn_dat[1:5, 1:5] all(is.na(mean_fxn_dat[1, ])) # TRUE # remove NA 1st row mean_fxn_dat <- mean_fxn_dat[-1, ] mean_fxn_dat[1:5, 1:5] # K00001 K00002 K00003 K00004 K00005 # 2 0.0006320483 3.183938e-07 0.0005588197 1.410484e-05 6.325765e-06 # 3 0.0004562733 1.588845e-06 0.0005786527 4.977093e-06 1.413929e-05 # 4 0.0005447002 9.971469e-07 0.0005917501 6.388719e-06 1.790082e-06 # 5 0.0005933945 0.000000e+00 0.0005940520 4.720961e-06 6.134629e-06 # 6 0.0005102657 4.472973e-07 0.0005855608 2.264397e-05 8.399560e-06 dim(mean_fxn_dat) # 23 5434 23*5434 # 124982 row.names(mean_fxn_dat) <- x row.names(mean_fxn_dat) # [1] "mean_KEGG_fxn_Aust_Blackheath" "mean_KEGG_fxn_Aust_Booderee" "mean_KEGG_fxn_Aust_Buckley_Swamp" # [4] "mean_KEGG_fxn_Aust_Cape_Tribulation" "mean_KEGG_fxn_Aust_Fitzgerald_River_NP" "mean_KEGG_fxn_Aust_Freycinet_NP" # [7] "mean_KEGG_fxn_Aust_King_Island" "mean_KEGG_fxn_Aust_Longerenong" "mean_KEGG_fxn_Aust_Mackay" # [10] "mean_KEGG_fxn_Aust_Mt_Lesueur_NP" "mean_KEGG_fxn_Aust_Namadgi_NP" "mean_KEGG_fxn_Aust_Narrabri" # [13] "mean_KEGG_fxn_Aust_Walpeup" "mean_KEGG_fxn_MtBold_10_years" "mean_KEGG_fxn_MtBold_6_years" # [16] "mean_KEGG_fxn_MtBold_7_years" "mean_KEGG_fxn_MtBold_8_years" "mean_KEGG_fxn_MtBold_cleared" # [19] "mean_KEGG_fxn_MtBold_Remnant_A" "mean_KEGG_fxn_MtBold_Remnant_B" "mean_KEGG_fxn_MtBold_Remnant_C" # [22] "mean_KEGG_fxn_MtBold_top30inc_in_10year" "mean_KEGG_fxn_MtBold_top30dec_in_cleared" ## verify that data merging worked ok? head( mean_KEGG_fxn_MtBold_Remnant_C ) # K01361 K01362 K02249 K05841 K05844 K05845 # 4.380765e-07 1.243994e-03 0.000000e+00 4.028910e-06 5.716604e-05 4.027003e-04 mean_fxn_dat[ "mean_KEGG_fxn_MtBold_Remnant_C", head(names(mean_KEGG_fxn_MtBold_Remnant_C)) ] # K01361 K01362 K02249 K05841 K05844 K05845 # mean_KEGG_fxn_MtBold_Remnant_C 4.380765e-07 0.001243994 0 4.02891e-06 5.716604e-05 0.0004027003 head(mean_KEGG_fxn_Aust_Blackheath) # K01361 K01360 K01362 K02249 K05841 K05844 # 2.094340e-07 0.000000e+00 1.509308e-03 0.000000e+00 1.841301e-05 1.667075e-04 mean_fxn_dat[ "mean_KEGG_fxn_Aust_Blackheath", head(names(mean_KEGG_fxn_Aust_Blackheath)) ] # K01361 K01360 K01362 K02249 K05841 K05844 # mean_KEGG_fxn_Aust_Blackheath 2.09434e-07 0 0.001509308 0 1.841301e-05 0.0001667075 # identify columns (functions) containing NA data? # NA data have resulted from row binding above when some functions are absent for some samples # convert these no data NA values to zero sel <- which(is.na(mean_fxn_dat), arr.ind = TRUE) mean_fxn_dat[sel] <- 0 row.names(mean_fxn_dat) # [1] "mean_KEGG_fxn_Aust_Blackheath" "mean_KEGG_fxn_Aust_Booderee" # [3] "mean_KEGG_fxn_Aust_Buckley_Swamp" "mean_KEGG_fxn_Aust_Cape_Tribulation" # [5] "mean_KEGG_fxn_Aust_Fitzgerald_River_NP" "mean_KEGG_fxn_Aust_Freycinet_NP" # [7] "mean_KEGG_fxn_Aust_King_Island" "mean_KEGG_fxn_Aust_Longerenong" # [9] "mean_KEGG_fxn_Aust_Mackay" "mean_KEGG_fxn_Aust_Mt_Lesueur_NP" # [11] "mean_KEGG_fxn_Aust_Namadgi_NP" "mean_KEGG_fxn_Aust_Narrabri" # [13] "mean_KEGG_fxn_Aust_Walpeup" "mean_KEGG_fxn_MtBold_10_years" # [15] "mean_KEGG_fxn_MtBold_6_years" "mean_KEGG_fxn_MtBold_7_years" # [17] "mean_KEGG_fxn_MtBold_8_years" "mean_KEGG_fxn_MtBold_cleared" # [19] "mean_KEGG_fxn_MtBold_Remnant_A" "mean_KEGG_fxn_MtBold_Remnant_B" # [21] "mean_KEGG_fxn_MtBold_Remnant_C" "mean_KEGG_fxn_MtBold_top30inc_in_10year" # [23] "mean_KEGG_fxn_MtBold_top30dec_in_cleared" row.names(mean_fxn_dat) <- c( "Blackheath (apples)" , "Booderee (NP)" , "Buckley Swamp (pasture)" , "Cape Tribulation (NP)" , "Fitzgerald River (NP)" , "Freycinet (NP)" , "King Island (pasture)" , "Longerenong (wheat)" , "Mackay (sugar)" , "Mt Lesueur (NP)" , "Namadgi (NP)" , "Narrabri (cotton)" , "Walpeup (wheat)" , "10 years (Mt Bold)" , "6 years (Mt Bold)" , "7 years (Mt Bold)" , "8 years (Mt Bold)" , "Cleared (Mt Bold)" , "Remnant A (Mt Bold)" , "Remnant B (Mt Bold)" , "Remnant C (Mt Bold)" , "Top 30 increasing (Mt Bold)" , "Top 30 decreasing (Mt Bold)" ) # Top30 increasing are in 10 year samples # Top30 decreasing are in Cleared samples ## (i) Consider all available function data mean_fxn_dat.zrow <- base::scale(mean_fxn_dat) mean_fxn_dat[1:5,1:5] mean_fxn_dat.zrow[1:5,1:5] mean_fxn_dat.zrow.zcol <- t( base::scale(t(mean_fxn_dat.zrow)) ) mean_fxn_dat.zrow.zcol[1:5, 1:5] mat <- as.matrix(mean_fxn_dat.zrow.zcol) #heatmap.2(mat, trace="none",col=greenred(10), margins = c(5,15)) heatmap.2(mat, trace="none",col=colorRampPalette(c("blue","white","red"))(20), margins = c(5,15)) dim(mat) # 23 5434 row.names(mat) # [1] "Blackheath (apples)" "Booderee (NP)" "Buckley Swamp (pasture)" # [4] "Cape Tribulation (NP)" "Fitzgerald River (NP)" "Freycinet (NP)" # [7] "King Island (pasture)" "Longerenong (wheat)" "Mackay (sugar)" # [10] "Mt Lesueur (NP)" "Namadgi (NP)" "Narrabri (cotton)" # [13] "Walpeup (wheat)" "10 years (Mt Bold)" "6 years (Mt Bold)" # [16] "7 years (Mt Bold)" "8 years (Mt Bold)" "Cleared (Mt Bold)" # [19] "Remnant A (Mt Bold)" "Remnant B (Mt Bold)" "Remnant C (Mt Bold)" # [22] "Top 30 increasing (Mt Bold)" "Top 30 decreasing (Mt Bold)" grDevices::tiff(file=paste0("finished-plots/","heatmap-mean-fxn-dat-zrow-zcol-both-dendograms-vFINAL.tif"), width = 17.3, height = 14, units = "cm", res = 600, compression = "lzw") heatmap.2(mat, trace = "none", dendrogram = "both", col=colorRampPalette(c("red","white","blue"))(20), rowsep=c(7,14,15), margins = c(2,12), cexRow = 1, key.title = "Z-score", keysize = 1.5, xlab = "Function relative abundance (z-score)", labCol = NA, RowSideColors=c( "Blackheath (apples)" = "#F8766D" , "Booderee (NP)" = "#00BFC4" , "Buckley Swamp (pasture)" = "#F8766D", "Cape Tribulation (NP)" = "#00BFC4" , "Fitzgerald River (NP)" = "#00BFC4" , "Freycinet (NP)" = "#00BFC4" , "King Island (pasture)" = "#F8766D" , "Longerenong (wheat)" = "#F8766D" , "Mackay (sugar)" = "#F8766D" , "Mt Lesueur (NP)" = "#00BFC4" , "Namadgi (NP)" = "#00BFC4" , "Narrabri (cotton)" = "#F8766D" , "Walpeup (wheat)" = "#F8766D" , "10 years (Mt Bold)"= "#238443" , "6 years (Mt Bold)" = "#addd8e" , "7 years (Mt Bold)" = "#78c679" , "8 years (Mt Bold)" = "#41ab5d" , "Cleared (Mt Bold)" = "#e31a1c" , "Remnant A (Mt Bold)" = "#4292c6" , "Remnant B (Mt Bold)" = "#2171b5" , "Remnant C (Mt Bold)" = "#084594" , "Top 30 increasing (Mt Bold)" = "#238443" , "Top 30 decreasing (Mt Bold)" = "#e31a1c" ) ) legend(x = 0.81, y = 0.83, xpd = TRUE, # "topright" legend = c( "Human-altered (Aust)", "Natural (Aust)", "Cleared (Mt Bold)", "6 years (Mt Bold)", "7 years (Mt Bold)", "8 years (Mt Bold)", "10 years (Mt Bold)", "Remnant A (Mt Bold)", "Remnant B (Mt Bold)", "Remnant C (Mt Bold)" ), col = c( "Human-altered" = "#F8766D" , "Natural" = "#00BFC4" , "Cleared (Mt Bold)" = "#e31a1c" ,"6 years (Mt Bold" = "#addd8e" , "7 years (Mt Bold)" = "#78c679" , "8 years (Mt Bold)" = "#41ab5d" , "10 years (Mt Bold)"= "#238443" , "Remnant A (Mt Bold)" = "#4292c6" , "Remnant B (Mt Bold)" = "#2171b5" , "Remnant C (Mt Bold)" = "#084594" ), lty= 1, lwd = 5, cex=.6 , xjust = 0, # how the legend is to be justified relative to the legend x location. A value of 0 means left justified, 0.5 means centered and 1 means right justified yjust = 0 # how the legend is to be justified relative to the legend x location. A value of 0 means left justified, 0.5 means centered and 1 means right justified ) dev.off() ### Explore if human disease is listed in KEGG Pathways? ko <- colnames(mat) class(ko) # "character" class(names(KEGG_fxn_Aust_compare$fxn_meta$KEGG_Pathways)) # "character" disease_kegg <- data.frame(ko=ko,disease=NA, colour=NA) class(disease_kegg$ko) # "factor" disease_kegg$ko <- as.character(disease_kegg$ko) class(disease_kegg$disease) disease_kegg$colour <- as.character(disease_kegg$colour) for (i in 1:length(ko)) { # length(ko): 5434 #i<-1 sel <- which(names(KEGG_fxn_Aust_compare$fxn_meta$KEGG_Pathways) == ko[i]) disease <- which(unlist(KEGG_fxn_Aust_compare$fxn_meta$KEGG_Pathways[sel]) == "human diseases") if (length(disease) >= 1) { disease_kegg$disease[i] = TRUE disease_kegg$colour[i] = "#bebada" } else { disease_kegg$disease[i] = FALSE disease_kegg$colour[i] = "#ffffb3" } print(paste0("completed KO#",i)) } length( which(disease_kegg$disease == TRUE) ) # 79 sel <- which(disease_kegg$disease == TRUE) head(sel) # 107 306 307 308 334 336 disease_kegg$ko[sel] # [1] "K00134" "K00411" "K00412" "K00413" "K00461" "K00463" "K00758" "K00873" "K01078" "K01283" "K01312" "K01354" "K01389" # [14] "K01392" "K01428" "K01476" "K01488" "K01580" "K01679" "K02040" "K02137" "K02405" "K03087" "K03092" "K03367" "K03648" # [27] "K03739" "K03740" "K03781" "K03943" "K04043" "K04077" "K04079" "K04496" "K04564" "K04565" "K05692" "K05851" "K06236" # [40] "K07173" "K07326" "K07345" "K07347" "K07389" "K07679" "K07690" "K08303" "K08604" "K08683" "K08720" "K08738" "K09565" # [53] "K10912" "K10913" "K10914" "K10918" "K10924" "K10926" "K10927" "K10938" "K10941" "K10942" "K10943" "K10954" "K11003" # [66] "K11004" "K11041" "K11089" "K11631" "K11632" "K11987" "K12340" "K12973" "K13730" "K13963" "K14048" "K14188" "K14205" # [79] "K14475" length(disease_kegg$ko[-sel]) # 5355 disease_kegg$ko[-sel] head(disease_kegg$ko[-sel]) disease_kegg$colour[1:200] ## Examples KEGG_fxn_Aust_compare$fxn_meta$KEGG_Pathways[[ "K00134" ]] # [[1]] # [1] "metabolism" "carbohydrate metabolism" "glycolysis / gluconeogenesis" # # [[2]] # [1] "human diseases" "neurodegenerative diseases" "alzheimer's disease" KEGG_fxn_Aust_compare$fxn_meta$KEGG_Pathways[[ "K00411" ]] # [[1]] # [1] "organismal systems" "circulatory system" "cardiac muscle contraction" # # [[2]] # [1] "human diseases" "neurodegenerative diseases" "huntington's disease" # # [[3]] # [1] "metabolism" "energy metabolism" "oxidative phosphorylation" # # [[4]] # [1] "human diseases" "neurodegenerative diseases" "parkinson's disease" # # [[5]] # [1] "human diseases" "neurodegenerative diseases" "alzheimer's disease" KEGG_fxn_Aust_compare$fxn_meta$KEGG_Pathways[[ "K00412" ]] # [[1]] # [1] "organismal systems" "circulatory system" "cardiac muscle contraction" # # [[2]] # [1] "human diseases" "neurodegenerative diseases" "huntington's disease" # # [[3]] # [1] "metabolism" "energy metabolism" "oxidative phosphorylation" # # [[4]] # [1] "human diseases" "neurodegenerative diseases" "parkinson's disease" # # [[5]] # [1] "human diseases" "neurodegenerative diseases" "alzheimer's disease" ## re-do heatmap with human disease-associated functions highlighted grDevices::tiff(file=paste0("finished-plots/","heatmap-mean-fxn-dat-zrow-zcol-both-dendograms-with-Disease-vFINAL.tif"), width = 17.3, height = 14, units = "cm", res = 600, compression = "lzw") heatmap.2(mat, trace = "none", dendrogram = "both", col=colorRampPalette(c("red","white","blue"))(20), rowsep=c(7,14,15), margins = c(2,12), cexRow = 1, key.title = "Z-score", keysize = 1.5, xlab = "Function relative abundance (z-score)", labCol = NA, RowSideColors=c( "Blackheath (apples)" = "#F8766D" , "Booderee (NP)" = "#00BFC4" , "Buckley Swamp (pasture)" = "#F8766D", "Cape Tribulation (NP)" = "#00BFC4" , "Fitzgerald River (NP)" = "#00BFC4" , "Freycinet (NP)" = "#00BFC4" , "King Island (pasture)" = "#F8766D" , "Longerenong (wheat)" = "#F8766D" , "Mackay (sugar)" = "#F8766D" , "Mt Lesueur (NP)" = "#00BFC4" , "Namadgi (NP)" = "#00BFC4" , "Narrabri (cotton)" = "#F8766D" , "Walpeup (wheat)" = "#F8766D" , "10 years (Mt Bold)"= "#238443" , "6 years (Mt Bold)" = "#addd8e" , "7 years (Mt Bold)" = "#78c679" , "8 years (Mt Bold)" = "#41ab5d" , "Cleared (Mt Bold)" = "#e31a1c" , "Remnant A (Mt Bold)" = "#4292c6" , "Remnant B (Mt Bold)" = "#2171b5" , "Remnant C (Mt Bold)" = "#084594" , "Top 30 increasing (Mt Bold)" = "#238443" , "Top 30 decreasing (Mt Bold)" = "#e31a1c" ), ColSideColors=c( disease_kegg$colour ) ) legend(x = 0.805, y = 0.785, xpd = TRUE, # "topright" legend = c( "Disease annotation", "No disease annotation", "Human-altered (Aust)", "Natural (Aust)", "Cleared (Mt Bold)", "6 years (Mt Bold)", "7 years (Mt Bold)", "8 years (Mt Bold)", "10 years (Mt Bold)", "Remnant A (Mt Bold)", "Remnant B (Mt Bold)", "Remnant C (Mt Bold)" ), col = c( "Disease annotation" = "#bebada", "No disease annotation" = "#ffffb3", "Human-altered" = "#F8766D" , "Natural" = "#00BFC4" , "Cleared (Mt Bold)" = "#e31a1c" ,"6 years (Mt Bold" = "#addd8e" , "7 years (Mt Bold)" = "#78c679" , "8 years (Mt Bold)" = "#41ab5d" , "10 years (Mt Bold)"= "#238443" , "Remnant A (Mt Bold)" = "#4292c6" , "Remnant B (Mt Bold)" = "#2171b5" , "Remnant C (Mt Bold)" = "#084594" ), lty= 1, lwd = 5, cex=.6 , xjust = 0, # how the legend is to be justified relative to the legend x location. A value of 0 means left justified, 0.5 means centered and 1 means right justified yjust = 0 # how the legend is to be justified relative to the legend x location. A value of 0 means left justified, 0.5 means centered and 1 means right justified ) dev.off() ## Why are these grouped?? ## Examine clay, carbon %, etc for # - Blackheath (apples) # - Cape Tribulation (NP) # - Namadgi (NP) # - Booderee (NP) # - Fitzgerald River (NP) # - Buckley Swamp (pasture) # - King Island (pasture) # - Mackay (sugar) # - Mt Lesueur (NP) # - Freycinet (NP) # - Narrabri (cotton) # - Walpeup (wheat) # - Longerong (wheat) phy.aust_select@sam_data[ , c("Collection.Site", "Clay", "Organic.Carbon", "H2O.pH", "NH3.N", "Colwell.P")] # Sample Data: [39 samples by 6 sample variables]: # Collection.Site Clay Organic.Carbon H2O.pH NH3.N Colwell.P # X12428 Freycinet NP 16.15 4.08 7.6 17 12 # X12430 Freycinet NP 15.59 5.3 7.3 6 12 # X12438 Freycinet NP 10.48 4.33 6.3 7 6 # X12509 Narrabri 33.61 1.1599999999999999 7.7 NA 74 # X12511 Narrabri 36.64 1.4 7.7 NA 64 # X12525 Narrabri 36.58 1.19 7.7 1 74 # X12560 Namadgi NP 26.23 4.41 5.7 6 27 # X12574 Namadgi NP 11.06 3.64 5.8 6 27 # X12616 Blackheath 9.65 3.56 6.0 NA 92 # X12620 Blackheath 9.75 3.38 6.5 5 102 # X12624 Blackheath 14.93 3.78 6.1 5 113 # X12816 Cape Tribulation 24.16 4.93 5.1 17 7 # X12818 Cape Tribulation 19.42 4.62 5.5 16 9 # X12819 Cape Tribulation NA 5.36 5.6 16 8 # X13272 King Island 7.09 5.03 7.0 13 24 # X13274 King Island 8.40 4.84 7.1 5 126 # X13282 King Island 15.64 4.1100000000000003 6.6 3 35 # X7833 Booderee 7.86 1.51 4.5 5 NA # X7837 Booderee 3.81 1.23 4.9 4 NA # X7851 Booderee 48.62 0.59 4.9 7 163 # X8082 Mt Lesueur NP 3.88 1.01 5.7 1 NA # X8084 Mt Lesueur NP 10.17 1.37 5.9 4 3 # X8099 Mt Lesueur NP 3.89 0.19 7.2 1 NA # X8116 Fitzgerald River NP 5.88 0.72 5.7 NA NA # X8118 Fitzgerald River NP 6.99 1.17 5.9 1 NA # X8130 Fitzgerald River NP 5.80 0.59 5.8 NA NA # X8182 Walpeup NA 1.1599999999999999 7.6 2 32 # X8192 Mackay 8.36 0.79 5.0 10 89 # X8198 Mackay 14.32 0.84 4.6 3 189 # X8220 Mackay 17.59 1.1599999999999999 4.5 6 190 # X8262 Walpeup NA 0.87 8.0 2 29 # X8266 Walpeup NA 0.89 8.3 3 22 # X8270 Longerenong NA 1.38 8.2 3 61 # X8272 Longerenong NA 1.44 8.1 2 62 # X8274 Longerenong NA 0.97 8.2 3 65 # X8280 Buckley Swamp NA 4.92 5.5 25 NA # X8284 Buckley Swamp NA 4.28 5.5 6 31 # X8286 Buckley Swamp NA 5.1100000000000003 5.5 6 45 # X9442 Namadgi NP 11.59 5.0199999999999996 5.7 20 71 # #------------------------- ### END